//+build ignore #define ACCEPT 0 #define FILTER 1 #define IOR_HISTOGRAM_BUCKETS 8 static __always_inline __u32 ior_histogram_bucket_index(__u64 duration_ns) { if (duration_ns < 1000) return 0; if (duration_ns < 10000) return 1; if (duration_ns < 100000) return 2; if (duration_ns < 1000000) return 3; if (duration_ns < 10000000) return 4; if (duration_ns < 100000000) return 5; if (duration_ns < 1000000000) return 6; return 7; } static __always_inline void ior_update_syscall_aggregate(__u32 enter_trace_id, __u64 duration_ns, __s64 ret) { __u32 bucket_idx; struct syscall_aggregate *existing; struct syscall_aggregate fresh = {}; existing = bpf_map_lookup_elem(&syscall_aggregate_map, &enter_trace_id); bucket_idx = ior_histogram_bucket_index(duration_ns); if (bucket_idx >= IOR_HISTOGRAM_BUCKETS) bucket_idx = IOR_HISTOGRAM_BUCKETS - 1; if (existing) { existing->count += 1; existing->total_duration_ns += duration_ns; if (ret < 0) existing->errors += 1; if (existing->count == 1 || duration_ns < existing->min_duration_ns) existing->min_duration_ns = duration_ns; if (duration_ns > existing->max_duration_ns) existing->max_duration_ns = duration_ns; existing->duration_histogram[bucket_idx] += 1; return; } fresh.count = 1; fresh.total_duration_ns = duration_ns; fresh.min_duration_ns = duration_ns; fresh.max_duration_ns = duration_ns; if (ret < 0) fresh.errors = 1; fresh.duration_histogram[bucket_idx] = 1; bpf_map_update_elem(&syscall_aggregate_map, &enter_trace_id, &fresh, BPF_ANY); } static __always_inline int ior_should_emit_trace(__u32 enter_trace_id) { __u32 default_rate = 1; __u32 *configured = bpf_map_lookup_elem(&syscall_sampling_rate_map, &enter_trace_id); __u32 rate = configured ? *configured : default_rate; // A zero rate means aggregate-only mode for this syscall. if (rate == 0) return 0; if (rate == 1) return 1; return (bpf_get_prandom_u32() % rate) == 0; } static __always_inline int ior_on_syscall_enter(__u32 tid, __u32 enter_trace_id) { struct syscall_enter_state state = {}; state.start_ns = bpf_ktime_get_boot_ns(); state.enter_trace_id = enter_trace_id; state.emit_event = ior_should_emit_trace(enter_trace_id) ? 1 : 0; bpf_map_update_elem(&syscall_enter_state_map, &tid, &state, BPF_ANY); return state.emit_event != 0; } // ior_on_noreturn_syscall_enter is the enter hook for noreturn syscalls // (exit, exit_group, rt_sigreturn). Unlike ior_on_syscall_enter it deliberately // does NOT write a per-tid entry into syscall_enter_state_map. A noreturn // syscall never returns to the syscall site (exit/exit_group terminate; // rt_sigreturn restores the pre-signal context), so its sys_exit tracepoint // never fires and the matching // exit handler is suppressed by the generator (see internal/generate/codegen.go // isNoreturnSyscall). With no exit handler, nothing would ever look up or // bpf_map_delete_elem that enter-state entry, so recording it would only leave // stale per-tid entries crowding the bounded (32768) map on hosts churning many // distinct tids. We still honor the sampling decision so the enter null_event is // emitted (or dropped) exactly as a normal syscall's enter would be, but without // the dead, unreclaimable map write. static __always_inline int ior_on_noreturn_syscall_enter(__u32 enter_trace_id) { return ior_should_emit_trace(enter_trace_id); } static __always_inline int ior_on_syscall_exit(__u32 tid, __u32 enter_trace_id, __s64 ret) { __u64 now; __u64 duration = 0; __u8 emit_event = 1; struct syscall_enter_state *state; state = bpf_map_lookup_elem(&syscall_enter_state_map, &tid); if (!state) return 1; now = bpf_ktime_get_boot_ns(); if (now >= state->start_ns) duration = now - state->start_ns; // Pair aggregate stats using the explicit enter_trace_id passed by the // generated exit handler, avoiding any numeric adjacency assumption // between kernel-assigned enter and exit tracepoint IDs. if (state->enter_trace_id == enter_trace_id) ior_update_syscall_aggregate(state->enter_trace_id, duration, ret); emit_event = state->emit_event; bpf_map_delete_elem(&syscall_enter_state_map, &tid); return emit_event != 0; } // filter() decides whether the current task's syscall is in scope. Today this is // a single-TGID gate (PID_FILTER, with -1 meaning trace-all) plus an optional // TID_FILTER. ior does NOT follow forks: a traced process's children run under a // different TGID and are excluded here, which also means their syscalls miss the // aggregate-count path downstream. A planned opt-in process-tree-following mode // would extend this gate to also accept descendant TGIDs from a BPF-maintained // set seeded with the root PID and updated via sched_process_fork/exit — see // docs/follow-forks-plan.md for the full design. static __always_inline int filter(__u32 *pid, __u32 *tid) { u64 pid_tgid = bpf_get_current_pid_tgid(); *pid = pid_tgid >> 32; // Ignore ior userland process itself if (*pid == IOR_PID_FILTER) { return FILTER; } *tid = pid_tgid & 0xFFFFFFFF; if (-1 == PID_FILTER || *pid == PID_FILTER) { if (-1 == TID_FILTER || *tid == TID_FILTER) { return ACCEPT; } } return FILTER; }