diff options
| -rw-r--r-- | cmd/ioworkload/scenario_retbytes.go | 59 | ||||
| -rw-r--r-- | docs/syscall-tracing-plan.md | 10 | ||||
| -rw-r--r-- | integrationtests/retbytes_test.go | 8 | ||||
| -rw-r--r-- | internal/c/generated_tracepoints.c | 8 | ||||
| -rw-r--r-- | internal/c/generated_tracepoints_result.txt | 4 | ||||
| -rw-r--r-- | internal/generate/classify.go | 2 | ||||
| -rw-r--r-- | internal/generate/classify_test.go | 18 | ||||
| -rw-r--r-- | internal/generate/retclassify_test.go | 17 |
8 files changed, 109 insertions, 17 deletions
diff --git a/cmd/ioworkload/scenario_retbytes.go b/cmd/ioworkload/scenario_retbytes.go index fa0b677..d9b2984 100644 --- a/cmd/ioworkload/scenario_retbytes.go +++ b/cmd/ioworkload/scenario_retbytes.go @@ -12,14 +12,24 @@ import ( const ( sysProcessVMReadv = 310 sysProcessVMWritev = 311 + sysSendmmsg = 307 retbytesPayloadLen = 18 ) +type mmsghdr struct { + hdr syscall.Msghdr + len uint32 + _ uint32 +} + // retbytesPhaseA exercises byte-classified syscalls that use generic ret_event exits. func retbytesPhaseA() error { if err := retbytesSocketIO(); err != nil { return err } + if err := retbytesBatchSocketIO(); err != nil { + return err + } if err := retbytesSendfile(); err != nil { return err } @@ -68,6 +78,44 @@ func retbytesSocketIO() error { return nil } +func retbytesBatchSocketIO() error { + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_DGRAM, 0) + if err != nil { + return fmt.Errorf("batch socketpair: %w", err) + } + defer syscall.Close(fds[0]) + defer syscall.Close(fds[1]) + + payloadA := []byte("batch-one") + payloadB := []byte("batch-two") + sendMsgs := mmsgSlice(payloadA, payloadB) + n, _, errno := syscall.Syscall6(sysSendmmsg, uintptr(fds[0]), uintptr(unsafe.Pointer(&sendMsgs[0])), uintptr(len(sendMsgs)), 0, 0, 0) + if errno != 0 { + return fmt.Errorf("sendmmsg: %w", errno) + } + if n != uintptr(len(sendMsgs)) { + return fmt.Errorf("sendmmsg sent %d messages, want %d", n, len(sendMsgs)) + } + runtime.KeepAlive(payloadA) + runtime.KeepAlive(payloadB) + runtime.KeepAlive(sendMsgs) + + recvA := make([]byte, len(payloadA)) + recvB := make([]byte, len(payloadB)) + recvMsgs := mmsgSlice(recvA, recvB) + n, _, errno = syscall.Syscall6(syscall.SYS_RECVMMSG, uintptr(fds[1]), uintptr(unsafe.Pointer(&recvMsgs[0])), uintptr(len(recvMsgs)), 0, 0, 0) + if errno != 0 { + return fmt.Errorf("recvmmsg: %w", errno) + } + if n != uintptr(len(recvMsgs)) { + return fmt.Errorf("recvmmsg received %d messages, want %d", n, len(recvMsgs)) + } + runtime.KeepAlive(recvA) + runtime.KeepAlive(recvB) + runtime.KeepAlive(recvMsgs) + return nil +} + func retbytesSendfile() error { dir, cleanup, err := makeTempDir("retbytes-sendfile") if err != nil { @@ -202,6 +250,17 @@ func openPayloadFile(path string) (int, error) { return fd, nil } +func mmsgSlice(bufs ...[]byte) []mmsghdr { + msgs := make([]mmsghdr, len(bufs)) + iovs := make([]syscall.Iovec, len(bufs)) + for i := range bufs { + iovs[i] = syscall.Iovec{Base: &bufs[i][0], Len: uint64(len(bufs[i]))} + msgs[i].hdr.Iov = &iovs[i] + msgs[i].hdr.Iovlen = 1 + } + return msgs +} + func processVMReadv(pid int, local, remote []byte) (int, error) { localIov := syscall.Iovec{Base: &local[0], Len: uint64(len(local))} remoteIov := syscall.Iovec{Base: &remote[0], Len: uint64(len(remote))} diff --git a/docs/syscall-tracing-plan.md b/docs/syscall-tracing-plan.md index 18ac033..ecf11b5 100644 --- a/docs/syscall-tracing-plan.md +++ b/docs/syscall-tracing-plan.md @@ -98,15 +98,15 @@ These are conspicuously absent from ior (which already has read/write but not so | `setsockopt` | yes | no | yes (optlen) | extend `KindFd` | level/optname | P2 | | `sendto` | yes | no | **yes** (len) | `KindRet` + `WriteClassified` (already in `retClassifications`!) | sockaddr family | P1 | | `sendmsg` | yes | no | **yes** (iov total) | `KindRet` + `WriteClassified` (already mapped) | flags | P1 | -| `sendmmsg` | yes | no | **yes** (sum of msgs) | `KindRet` + `WriteClassified` (already mapped) | vlen, flags | P1 | +| `sendmmsg` | yes | no | **yes** (sum of msgs) | defer byte classification until payload bytes can be computed from message vectors | vlen, flags | P1 | | `recvfrom` | yes | no | **yes** (len) | `KindRet` + `ReadClassified` (already mapped) | sockaddr family | P1 | | `recvmsg` | yes | no | **yes** (iov total) | `KindRet` + `ReadClassified` (already mapped) | flags | P1 | -| `recvmmsg` | yes | no | **yes** (sum of msgs) | `KindRet` + `ReadClassified` (already mapped) | vlen, flags, timeout | P1 | +| `recvmmsg` | yes | no | **yes** (sum of msgs) | defer byte classification until payload bytes can be computed from message vectors | vlen, flags, timeout | P1 | | `sendfile64` | yes (both in/out fd) | no | **yes** (count) | `KindRet` + `TransferClassified` (already mapped) | both fds | P1 | | `splice` | yes (both fds) | no | **yes** (len) | `KindRet` + `TransferClassified` (already mapped) | both fds, flags | P1 | | `tee` | yes (both fds) | no | **yes** (len) | `KindRet` + `TransferClassified` (already mapped) | both fds, flags | P1 | -> Note: `RetClassification` already lists the recv/send/sendfile/splice/tee/process_vm_* families. The classifier just refuses them today because `shouldIgnore`/`exactIgnores` short-circuits earlier in `classify.go`. **Removing those ignores is the cheapest possible win** — bytes accounting drops in for free. +> Note: `RetClassification` covers single-message recv/send, sendfile/splice/tee, and process_vm_* families. Batched `sendmmsg`/`recvmmsg` are not safe to classify through generic return-value byte accounting because their return value is message count, not payload bytes. ### 3.2 IPC — pipes, eventfd, signalfd, message queues, shared mem, semaphores @@ -355,7 +355,7 @@ Out of the ~230 currently-ignored syscalls, **fd as argument** appears in: - Security: `landlock_add_rule` (ruleset_fd), `landlock_restrict_self`, `kexec_file_load` - Mount: `move_mount` (two), `fsmount` (fsfd) - Perf: `perf_event_open` (group_fd) -- Already-mapped Ret-classified bytes-carrying entries that block on `shouldIgnore` only: all send/recv variants, `sendfile64`, `splice`, `tee`, `vmsplice` (already traced), `process_vm_readv`, `process_vm_writev` +- Already-mapped Ret-classified bytes-carrying entries that block on `shouldIgnore` only: single-message send/recv variants, `sendfile64`, `splice`, `tee`, `vmsplice` (already traced), `process_vm_readv`, `process_vm_writev`. `sendmmsg`/`recvmmsg` need message-vector byte accounting before they can join this set. ### 4.2 Which syscalls return an fd? (Summary) @@ -442,7 +442,7 @@ Tracing `futex`, `clock_gettime`, `epoll_wait`, `nanosleep`, and `read`/`write` A pragmatic, low-risk order of work — each step ships independent value: **Phase A — "free wins"** (no new kind needed, just unblock ignores) -- Network read/write bytes: enable `sendto`/`sendmsg`/`sendmmsg`/`recvfrom`/`recvmsg`/`recvmmsg`, `sendfile64`, `splice`, `tee`, `process_vm_readv`, `process_vm_writev`. These already appear in `retClassifications`; only `shouldIgnore` blocks them. Need a `KindRet` exit handler and minimal enter wiring. +- Network read/write bytes: enable `sendto`/`sendmsg`/`recvfrom`/`recvmsg`, `sendfile64`, `splice`, `tee`, `process_vm_readv`, `process_vm_writev`. These can use `retClassifications` directly because their return values are payload bytes. Defer `sendmmsg`/`recvmmsg` byte totals until enter-state/iovec accounting can compute payload bytes rather than message counts. **Phase B — high-impact families** (new kinds, but small set, very visible payoff) - `socket`/`socketpair`/`accept[4]`/`bind`/`connect`/`listen`/`shutdown` + getsock*/setsock* diff --git a/integrationtests/retbytes_test.go b/integrationtests/retbytes_test.go index 2e2ea1d..c6f06d8 100644 --- a/integrationtests/retbytes_test.go +++ b/integrationtests/retbytes_test.go @@ -10,6 +10,8 @@ func TestRetbytesPhaseA(t *testing.T) { {Tracepoint: "enter_recvfrom", Comm: "ioworkload", MinCount: 1}, {Tracepoint: "enter_sendmsg", Comm: "ioworkload", MinCount: 1}, {Tracepoint: "enter_recvmsg", Comm: "ioworkload", MinCount: 1}, + {Tracepoint: "enter_sendmmsg", Comm: "ioworkload", MinCount: 1}, + {Tracepoint: "enter_recvmmsg", Comm: "ioworkload", MinCount: 1}, {Tracepoint: "enter_sendfile64", Comm: "ioworkload", MinCount: 1}, {Tracepoint: "enter_splice", Comm: "ioworkload", MinCount: 1}, {Tracepoint: "enter_tee", Comm: "ioworkload", MinCount: 1}, @@ -32,4 +34,10 @@ func TestRetbytesPhaseA(t *testing.T) { assertEventBytesAtLeast(t, result, exp, payloadLen) assertEventDurationPositive(t, result, exp) } + + for _, tracepoint := range []string{"enter_sendmmsg", "enter_recvmmsg"} { + exp := ExpectedEvent{Tracepoint: tracepoint, Comm: "ioworkload"} + assertEventBytesEqual(t, result, exp, 0) + assertEventDurationPositive(t, result, exp) + } } diff --git a/internal/c/generated_tracepoints.c b/internal/c/generated_tracepoints.c index be6606d..980f91d 100644 --- a/internal/c/generated_tracepoints.c +++ b/internal/c/generated_tracepoints.c @@ -1431,7 +1431,7 @@ int handle_sys_enter_sendmmsg(struct syscall_trace_enter *ctx) { return 0; } -/// sys_exit_sendmmsg is a struct ret_event (WRITE_CLASSIFIED) +/// sys_exit_sendmmsg is a struct ret_event (UNCLASSIFIED) SEC("tracepoint/syscalls/sys_exit_sendmmsg") int handle_sys_exit_sendmmsg(struct syscall_trace_exit *ctx) { __u32 pid, tid; @@ -1448,7 +1448,7 @@ int handle_sys_exit_sendmmsg(struct syscall_trace_exit *ctx) { ev->tid = tid; ev->time = bpf_ktime_get_boot_ns(); ev->ret = ctx->ret; - ev->ret_type = WRITE_CLASSIFIED; + ev->ret_type = UNCLASSIFIED; bpf_ringbuf_submit(ev, 0); return 0; @@ -1521,7 +1521,7 @@ int handle_sys_enter_recvmmsg(struct syscall_trace_enter *ctx) { return 0; } -/// sys_exit_recvmmsg is a struct ret_event (READ_CLASSIFIED) +/// sys_exit_recvmmsg is a struct ret_event (UNCLASSIFIED) SEC("tracepoint/syscalls/sys_exit_recvmmsg") int handle_sys_exit_recvmmsg(struct syscall_trace_exit *ctx) { __u32 pid, tid; @@ -1538,7 +1538,7 @@ int handle_sys_exit_recvmmsg(struct syscall_trace_exit *ctx) { ev->tid = tid; ev->time = bpf_ktime_get_boot_ns(); ev->ret = ctx->ret; - ev->ret_type = READ_CLASSIFIED; + ev->ret_type = UNCLASSIFIED; bpf_ringbuf_submit(ev, 0); return 0; diff --git a/internal/c/generated_tracepoints_result.txt b/internal/c/generated_tracepoints_result.txt index 2cc1e52..ea4f2d1 100644 --- a/internal/c/generated_tracepoints_result.txt +++ b/internal/c/generated_tracepoints_result.txt @@ -608,7 +608,7 @@ sys_exit_readlinkat is a struct ret_event (READ_CLASSIFIED) sys_exit_readv is a struct ret_event (READ_CLASSIFIED) sys_exit_reboot is a struct ret_event (UNCLASSIFIED) sys_exit_recvfrom is a struct ret_event (READ_CLASSIFIED) -sys_exit_recvmmsg is a struct ret_event (READ_CLASSIFIED) +sys_exit_recvmmsg is a struct ret_event (UNCLASSIFIED) sys_exit_recvmsg is a struct ret_event (READ_CLASSIFIED) sys_exit_remap_file_pages is a struct ret_event (UNCLASSIFIED) sys_exit_removexattr is a struct ret_event (UNCLASSIFIED) @@ -647,7 +647,7 @@ sys_exit_semget is a struct ret_event (UNCLASSIFIED) sys_exit_semop is a struct ret_event (UNCLASSIFIED) sys_exit_semtimedop is a struct ret_event (UNCLASSIFIED) sys_exit_sendfile64 is a struct ret_event (TRANSFER_CLASSIFIED) -sys_exit_sendmmsg is a struct ret_event (WRITE_CLASSIFIED) +sys_exit_sendmmsg is a struct ret_event (UNCLASSIFIED) sys_exit_sendmsg is a struct ret_event (WRITE_CLASSIFIED) sys_exit_sendto is a struct ret_event (WRITE_CLASSIFIED) sys_exit_set_mempolicy is a struct ret_event (UNCLASSIFIED) diff --git a/internal/generate/classify.go b/internal/generate/classify.go index b96ee0d..7768ea7 100644 --- a/internal/generate/classify.go +++ b/internal/generate/classify.go @@ -169,7 +169,6 @@ var retClassifications = map[string]RetClassification{ "readlink": ReadClassified, "readlinkat": ReadClassified, "readv": ReadClassified, - "recvmmsg": ReadClassified, "recvmsg": ReadClassified, "recvfrom": ReadClassified, "syslog": ReadClassified, @@ -184,7 +183,6 @@ var retClassifications = map[string]RetClassification{ "pwrite64": WriteClassified, "pwritev": WriteClassified, "pwritev2": WriteClassified, - "sendmmsg": WriteClassified, "sendmsg": WriteClassified, "sendto": WriteClassified, "write": WriteClassified, diff --git a/internal/generate/classify_test.go b/internal/generate/classify_test.go index 4dd216e..ea7d662 100644 --- a/internal/generate/classify_test.go +++ b/internal/generate/classify_test.go @@ -372,11 +372,27 @@ func TestClassifyPhaseAByteSyscallPairsAccepted(t *testing.T) { } } +func TestBatchMessageSyscallPairsDeferByteClassification(t *testing.T) { + tests := []string{"sendmmsg", "recvmmsg"} + for i, name := range tests { + t.Run(name, func(t *testing.T) { + output := GenerateTracepointsC(phaseAFormats(name, 9100+i*2)) + if strings.Contains(output, "Ignoring") || strings.Contains(output, "Skipping") { + t.Fatalf("syscall %s was not accepted:\n%s", name, output) + } + if !strings.Contains(output, "/// sys_exit_"+name+" is a struct ret_event (UNCLASSIFIED)") { + t.Fatalf("sys_exit_%s should be generated without byte classification:\n%s", name, output) + } + }) + } +} + func phaseAFormats(name string, enterID int) []Format { enterFields := []Field{ {Type: "long", Name: "__syscall_nr"}, } - if name == "sendto" || name == "recvfrom" || name == "sendmsg" || name == "recvmsg" { + if name == "sendto" || name == "recvfrom" || name == "sendmsg" || name == "recvmsg" || + name == "sendmmsg" || name == "recvmmsg" { enterFields = append(enterFields, Field{Type: "int", Name: "fd"}) } diff --git a/internal/generate/retclassify_test.go b/internal/generate/retclassify_test.go index 9a75a15..1c5b2ac 100644 --- a/internal/generate/retclassify_test.go +++ b/internal/generate/retclassify_test.go @@ -7,7 +7,7 @@ func TestClassifyRetRead(t *testing.T) { "fgetxattr", "flistxattr", "getdents", "getdents64", "getxattr", "lgetxattr", "listxattr", "llistxattr", "pread64", "preadv", "preadv2", "process_vm_readv", "read", "readlink", "readlinkat", - "readv", "recvmmsg", "recvmsg", "recvfrom", "syslog", + "readv", "recvmsg", "recvfrom", "syslog", } for _, name := range reads { if got := ClassifyRet("sys_exit_" + name); got != ReadClassified { @@ -19,7 +19,7 @@ func TestClassifyRetRead(t *testing.T) { func TestClassifyRetWrite(t *testing.T) { writes := []string{ "process_vm_writev", "pwrite64", "pwritev", "pwritev2", - "sendmmsg", "sendmsg", "sendto", "write", "writev", + "sendmsg", "sendto", "write", "writev", } for _, name := range writes { if got := ClassifyRet("sys_exit_" + name); got != WriteClassified { @@ -43,7 +43,7 @@ func TestClassifyRetUnclassified(t *testing.T) { unclassified := []string{ "openat", "close", "rename", "unlink", "fcntl", "dup", "dup2", "dup3", "mkdir", "rmdir", "chmod", "chown", "chdir", "stat", "lseek", - "truncate", "fallocate", "mmap", "fsync", "flock", + "truncate", "fallocate", "mmap", "fsync", "flock", "recvmmsg", "sendmmsg", } for _, name := range unclassified { if got := ClassifyRet("sys_exit_" + name); got != Unclassified { @@ -52,6 +52,17 @@ func TestClassifyRetUnclassified(t *testing.T) { } } +func TestBatchMessageSyscallsDeferredFromRetByteClassification(t *testing.T) { + tests := []string{"recvmmsg", "sendmmsg"} + for _, name := range tests { + t.Run(name, func(t *testing.T) { + if got := ClassifyRet("sys_exit_" + name); got != Unclassified { + t.Fatalf("ClassifyRet(sys_exit_%s) = %q, want %q", name, got, Unclassified) + } + }) + } +} + func TestClassifyRetCaseInsensitive(t *testing.T) { if got := ClassifyRet("sys_exit_READ"); got != ReadClassified { t.Errorf("ClassifyRet(sys_exit_READ) = %q, want READ_CLASSIFIED", got) |
