diff options
| author | Paul Buetow <paul@buetow.org> | 2026-05-30 10:13:17 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-05-30 10:13:17 +0300 |
| commit | 768e53d90be2d15242266b898023c9c39dacf47d (patch) | |
| tree | 94bc78a2c02979e868836b127b4d6bbb6beb213c /internal/c | |
| parent | 23579dbdac320884bcdd670e46744b5f4ab45d5d (diff) | |
fix(z10): skip enter-state write for noreturn syscalls
After p10 suppressed the sys_exit_exit/sys_exit_exit_group handlers, the
enter handlers for exit/exit_group still called ior_on_syscall_enter,
which writes a per-tid entry into syscall_enter_state_map. With the exit
handler gone, nothing ever called bpf_map_delete_elem on that entry, so stale
per-tid state accumulated in the bounded (32768) map on hosts churning
many distinct tids and could starve legitimate inserts.
Add ior_on_noreturn_syscall_enter in internal/c/filter.c: it only makes
the sampling decision (ior_should_emit_trace) and deliberately does NOT
record enter-state. The code generator now emits this hook for noreturn
enter handlers (detected via isNoreturnSyscall(syscallName(name))) so the
enter null_event is still emitted while the dead, unreclaimable map write
is skipped. Regenerated generated_tracepoints.c accordingly.
Extend TestGenerateExitNoreturnHandlers with a negative assertion (no
ior_on_syscall_enter for noreturn) and add
TestGenerateReturningSyscallEnterRecordsState as a positive contrast.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Diffstat (limited to 'internal/c')
| -rw-r--r-- | internal/c/filter.c | 15 | ||||
| -rw-r--r-- | internal/c/generated_tracepoints.c | 4 |
2 files changed, 17 insertions, 2 deletions
```diff
diff --git a/internal/c/filter.c b/internal/c/filter.c
index 5585c12..48907d8 100644
--- a/internal/c/filter.c
+++ b/internal/c/filter.c
@@ -78,6 +78,21 @@ static __always_inline int ior_on_syscall_enter(__u32 tid, __u32 enter_trace_id)
     return state.emit_event != 0;
 }
 
+// ior_on_noreturn_syscall_enter is the enter hook for noreturn syscalls
+// (exit, exit_group). Unlike ior_on_syscall_enter it deliberately does NOT
+// write a per-tid entry into syscall_enter_state_map. A noreturn syscall never
+// returns to userspace, so its sys_exit tracepoint never fires and the matching
+// exit handler is suppressed by the generator (see internal/generate/codegen.go
+// isNoreturnSyscall). With no exit handler, nothing would ever look up or
+// bpf_map_delete_elem that enter-state entry, so recording it would only leave
+// stale per-tid entries crowding the bounded (32768) map on hosts churning many
+// distinct tids. We still honor the sampling decision so the enter null_event is
+// emitted (or dropped) exactly as a normal syscall's enter would be, but without
+// the dead, unreclaimable map write.
+static __always_inline int ior_on_noreturn_syscall_enter(__u32 enter_trace_id) {
+    return ior_should_emit_trace(enter_trace_id);
+}
+
 static __always_inline int ior_on_syscall_exit(__u32 tid, __u32 enter_trace_id, __s64 ret) {
     __u64 now;
     __u64 duration = 0;
diff --git a/internal/c/generated_tracepoints.c b/internal/c/generated_tracepoints.c
index 51e30be..e38e1af 100644
--- a/internal/c/generated_tracepoints.c
+++ b/internal/c/generated_tracepoints.c
@@ -18753,7 +18753,7 @@ int handle_sys_enter_exit(struct syscall_trace_enter *ctx) {
     if (filter(&pid, &tid))
         return 0;
 
-    if (!ior_on_syscall_enter(tid, SYS_ENTER_EXIT))
+    if (!ior_on_noreturn_syscall_enter(SYS_ENTER_EXIT))
         return 0;
 
     struct null_event *ev = bpf_ringbuf_reserve(&event_map, sizeof(struct null_event), 0);
@@ -18777,7 +18777,7 @@ int handle_sys_enter_exit_group(struct syscall_trace_enter *ctx) {
     if (filter(&pid, &tid))
         return 0;
 
-    if (!ior_on_syscall_enter(tid, SYS_ENTER_EXIT_GROUP))
+    if (!ior_on_noreturn_syscall_enter(SYS_ENTER_EXIT_GROUP))
         return 0;
 
     struct null_event *ev = bpf_ringbuf_reserve(&event_map, sizeof(struct null_event), 0);
```
