summary refs log tree commit diff
path: root/internal/c
diff options
context:
space:
mode:
author Paul Buetow <paul@buetow.org> 2026-05-30 10:13:17 +0300
committer Paul Buetow <paul@buetow.org> 2026-05-30 10:13:17 +0300
commit 768e53d90be2d15242266b898023c9c39dacf47d (patch)
tree 94bc78a2c02979e868836b127b4d6bbb6beb213c /internal/c
parent 23579dbdac320884bcdd670e46744b5f4ab45d5d (diff)
fix(z10): skip enter-state write for noreturn syscalls
After p10 suppressed the sys_exit_exit/sys_exit_exit_group handlers, the enter handlers for exit/exit_group still called ior_on_syscall_enter, which writes a per-tid entry into syscall_enter_state_map. With the exit handler gone, nothing ever bpf_map_delete_elem'd that entry, so stale per-tid state accumulated in the bounded (32768) map on hosts churning many distinct tids and could starve legitimate inserts.

Add ior_on_noreturn_syscall_enter in internal/c/filter.c: it only makes the sampling decision (ior_should_emit_trace) and deliberately does NOT record enter-state. The code generator now emits this hook for noreturn enter handlers (detected via isNoreturnSyscall(syscallName(name))) so the enter null_event is still emitted while the dead, unreclaimable map write is skipped. Regenerated generated_tracepoints.c accordingly.

Extend TestGenerateExitNoreturnHandlers with a negative assertion (no ior_on_syscall_enter for noreturn) and add TestGenerateReturningSyscallEnterRecordsState as a positive contrast.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Diffstat (limited to 'internal/c')
-rw-r--r-- internal/c/filter.c 15
-rw-r--r-- internal/c/generated_tracepoints.c 4
2 files changed, 17 insertions, 2 deletions
diff --git a/internal/c/filter.c b/internal/c/filter.c
index 5585c12..48907d8 100644
--- a/internal/c/filter.c
+++ b/internal/c/filter.c
@@ -78,6 +78,21 @@ static __always_inline int ior_on_syscall_enter(__u32 tid, __u32 enter_trace_id)
return state.emit_event != 0;
}
+// ior_on_noreturn_syscall_enter is the enter hook for noreturn syscalls
+// (exit, exit_group). Unlike ior_on_syscall_enter it takes no tid and does NOT
+// write a per-tid entry into syscall_enter_state_map: a noreturn syscall never
+// returns to userspace, so its sys_exit tracepoint never fires and the matching
+// exit handler is suppressed by the generator (see internal/generate/codegen.go
+// isNoreturnSyscall). Nothing would ever look up or bpf_map_delete_elem such an
+// entry, so it would only crowd the bounded (32768) map on hosts churning many
+// distinct tids. Only the sampling decision is made here.
+//
+// Returns the result of ior_should_emit_trace(enter_trace_id); the generated
+// enter handlers skip emitting the null_event when this is zero.
+static __always_inline int ior_on_noreturn_syscall_enter(__u32 enter_trace_id) {
+ return ior_should_emit_trace(enter_trace_id);
+}
+
static __always_inline int ior_on_syscall_exit(__u32 tid, __u32 enter_trace_id, __s64 ret) {
__u64 now;
__u64 duration = 0;
diff --git a/internal/c/generated_tracepoints.c b/internal/c/generated_tracepoints.c
index 51e30be..e38e1af 100644
--- a/internal/c/generated_tracepoints.c
+++ b/internal/c/generated_tracepoints.c
@@ -18753,7 +18753,7 @@ int handle_sys_enter_exit(struct syscall_trace_enter *ctx) {
if (filter(&pid, &tid))
return 0;
- if (!ior_on_syscall_enter(tid, SYS_ENTER_EXIT))
+ if (!ior_on_noreturn_syscall_enter(SYS_ENTER_EXIT))
return 0;
struct null_event *ev = bpf_ringbuf_reserve(&event_map, sizeof(struct null_event), 0);
@@ -18777,7 +18777,7 @@ int handle_sys_enter_exit_group(struct syscall_trace_enter *ctx) {
if (filter(&pid, &tid))
return 0;
- if (!ior_on_syscall_enter(tid, SYS_ENTER_EXIT_GROUP))
+ if (!ior_on_noreturn_syscall_enter(SYS_ENTER_EXIT_GROUP))
return 0;
struct null_event *ev = bpf_ringbuf_reserve(&event_map, sizeof(struct null_event), 0);