// path: root/internal/c/filter.c
// blob: 66c657400523f0ce38121ba12d7ae77846ec638a
//+build ignore

// Verdicts returned by filter(): ACCEPT when the current task is in scope,
// FILTER when it must be skipped.
#define ACCEPT 0
#define FILTER 1
// Upper bound applied to histogram bucket indexes in
// ior_update_syscall_aggregate(); ior_histogram_bucket_index() returns 0..7.
#define IOR_HISTOGRAM_BUCKETS 8

// ior_histogram_bucket_index maps a duration in nanoseconds onto one of eight
// decade-wide buckets:
//   0: [0, 1us)      1: [1us, 10us)    2: [10us, 100us)  3: [100us, 1ms)
//   4: [1ms, 10ms)   5: [10ms, 100ms)  6: [100ms, 1s)    7: [1s, ...)
// The thresholds are checked from the largest down, so the first match is the
// highest bucket whose lower bound the duration reaches.
static __always_inline __u32 ior_histogram_bucket_index(__u64 duration_ns) {
    if (duration_ns >= 1000000000ULL) {
        return 7;
    }
    if (duration_ns >= 100000000ULL) {
        return 6;
    }
    if (duration_ns >= 10000000ULL) {
        return 5;
    }
    if (duration_ns >= 1000000ULL) {
        return 4;
    }
    if (duration_ns >= 100000ULL) {
        return 3;
    }
    if (duration_ns >= 10000ULL) {
        return 2;
    }
    if (duration_ns >= 1000ULL) {
        return 1;
    }
    return 0;
}

// ior_update_syscall_aggregate folds one completed syscall sample into the
// syscall_aggregate_map entry keyed by enter_trace_id.
//
//   enter_trace_id  map key identifying the syscall's enter tracepoint
//   duration_ns     measured duration of this call, in nanoseconds
//   ret             syscall return value; a negative value counts as an error
//
// When the key has no entry yet, a zero-initialized aggregate describing just
// this sample is inserted with BPF_ANY. Otherwise the existing entry is updated
// in place: count, total, error count, min/max and the histogram bucket.
static __always_inline void ior_update_syscall_aggregate(__u32 enter_trace_id, __u64 duration_ns, __s64 ret) {
    struct syscall_aggregate *agg;
    __u32 slot = ior_histogram_bucket_index(duration_ns);

    // Defensive clamp: keeps the array index below IOR_HISTOGRAM_BUCKETS no
    // matter what the bucket helper returned.
    if (slot >= IOR_HISTOGRAM_BUCKETS)
        slot = IOR_HISTOGRAM_BUCKETS - 1;

    agg = bpf_map_lookup_elem(&syscall_aggregate_map, &enter_trace_id);
    if (!agg) {
        // First sample for this key: build the entry on the stack and insert it.
        struct syscall_aggregate first = {};

        first.count = 1;
        first.total_duration_ns = duration_ns;
        first.min_duration_ns = duration_ns;
        first.max_duration_ns = duration_ns;
        if (ret < 0)
            first.errors = 1;
        first.duration_histogram[slot] = 1;
        bpf_map_update_elem(&syscall_aggregate_map, &enter_trace_id, &first, BPF_ANY);
        return;
    }

    agg->count += 1;
    agg->total_duration_ns += duration_ns;
    if (ret < 0)
        agg->errors += 1;
    // count == 1 after the increment means the entry held no samples before,
    // so its stored minimum is not meaningful yet.
    if (agg->count == 1 || duration_ns < agg->min_duration_ns)
        agg->min_duration_ns = duration_ns;
    if (duration_ns > agg->max_duration_ns)
        agg->max_duration_ns = duration_ns;
    agg->duration_histogram[slot] += 1;
}

// ior_should_emit_trace makes the sampling decision for one syscall, based on
// the rate stored in syscall_sampling_rate_map under enter_trace_id.
//
//   no entry or 1  emit every call
//   0              never emit
//   N > 1          emit when a bpf_get_prandom_u32() value is divisible by N
//
// Returns 1 to emit the trace event, 0 to suppress it.
static __always_inline int ior_should_emit_trace(__u32 enter_trace_id) {
    __u32 rate = 1; // used when the map holds no rate for this syscall
    __u32 *entry = bpf_map_lookup_elem(&syscall_sampling_rate_map, &enter_trace_id);

    if (entry)
        rate = *entry;

    switch (rate) {
    case 0:
        // A zero rate means aggregate-only mode for this syscall.
        return 0;
    case 1:
        return 1;
    default:
        // rate is at least 2 here, so the modulo never divides by zero.
        return (bpf_get_prandom_u32() % rate) == 0;
    }
}

// ior_on_syscall_enter records the start of a syscall for thread tid.
//
// It stores the boot-time timestamp, the enter trace id and the sampling
// decision in syscall_enter_state_map under tid (BPF_ANY, so any previous
// entry for that tid is overwritten).
//
// Returns 1 when the enter event should be emitted, 0 otherwise.
static __always_inline int ior_on_syscall_enter(__u32 tid, __u32 enter_trace_id) {
    struct syscall_enter_state entry = {};
    int emit;

    entry.start_ns = bpf_ktime_get_boot_ns();
    entry.enter_trace_id = enter_trace_id;

    // Decide once on enter and store the result in the entry alongside the
    // timestamp.
    emit = ior_should_emit_trace(enter_trace_id) != 0;
    entry.emit_event = emit;

    bpf_map_update_elem(&syscall_enter_state_map, &tid, &entry, BPF_ANY);
    return emit;
}

// ior_on_noreturn_syscall_enter is the enter hook for noreturn syscalls
// (exit, exit_group, rt_sigreturn).
//
// Unlike ior_on_syscall_enter it deliberately does NOT write a per-tid entry
// into syscall_enter_state_map. A noreturn syscall never returns to the syscall
// site (exit/exit_group terminate; rt_sigreturn restores the pre-signal
// context), so its sys_exit tracepoint never fires and the matching exit
// handler is suppressed by the generator (see internal/generate/codegen.go
// isNoreturnSyscall). With no exit handler, nothing would ever look up or
// bpf_map_delete_elem that enter-state entry, so recording it would only leave
// stale per-tid entries crowding the bounded (32768) map on hosts churning many
// distinct tids.
//
// The sampling decision is still honored, so the enter null_event is emitted
// (or dropped) exactly as a normal syscall's enter would be, just without the
// dead, unreclaimable map write.
//
// Returns 1 when the enter event should be emitted, 0 otherwise.
static __always_inline int ior_on_noreturn_syscall_enter(__u32 enter_trace_id) {
    int emit = ior_should_emit_trace(enter_trace_id);

    return emit;
}

// ior_on_syscall_exit completes the syscall that thread tid started in
// ior_on_syscall_enter.
//
//   tid             thread id used as the syscall_enter_state_map key
//   enter_trace_id  enter tracepoint id the generated exit handler pairs with
//   ret             syscall return value, forwarded to the aggregate update
//
// If no enter state exists for tid the function returns 1 (emit) without
// touching any map. Otherwise it computes the elapsed time, updates the
// aggregate when the recorded enter id matches, deletes the enter state and
// returns the sampling decision stored at enter time (1 emit, 0 suppress).
static __always_inline int ior_on_syscall_exit(__u32 tid, __u32 enter_trace_id, __s64 ret) {
    struct syscall_enter_state *entry = bpf_map_lookup_elem(&syscall_enter_state_map, &tid);
    __u64 end_ns;
    __u64 elapsed_ns;
    __u8 emit;

    if (!entry)
        return 1;

    // Report a zero duration rather than wrapping if the clock reads earlier
    // than the recorded start.
    end_ns = bpf_ktime_get_boot_ns();
    elapsed_ns = (end_ns >= entry->start_ns) ? end_ns - entry->start_ns : 0;

    // Pair aggregate stats using the explicit enter_trace_id passed by the
    // generated exit handler, avoiding any numeric adjacency assumption
    // between kernel-assigned enter and exit tracepoint IDs.
    if (entry->enter_trace_id == enter_trace_id)
        ior_update_syscall_aggregate(entry->enter_trace_id, elapsed_ns, ret);

    // Copy the flag out before deleting the entry it lives in.
    emit = entry->emit_event;
    bpf_map_delete_elem(&syscall_enter_state_map, &tid);
    return emit != 0;
}

// filter() decides whether the current task's syscall is in scope.
//
// Today this is a single-TGID gate (PID_FILTER, with -1 meaning trace-all) plus
// an optional TID_FILTER (-1 meaning any thread). ior does NOT follow forks: a
// traced process's children run under a different TGID and are excluded here,
// which also means their syscalls miss the aggregate-count path downstream. A
// planned opt-in process-tree-following mode would extend this gate to also
// accept descendant TGIDs from a BPF-maintained set seeded with the root PID
// and updated via sched_process_fork/exit — see docs/follow-forks-plan.md for
// the full design.
//
//   pid  out: upper 32 bits of bpf_get_current_pid_tgid(); always written
//   tid  out: lower 32 bits; NOT written when the task is ior itself
//
// Returns ACCEPT when the task passes every gate, FILTER otherwise.
static __always_inline int filter(__u32 *pid, __u32 *tid) {
    u64 ids = bpf_get_current_pid_tgid();

    *pid = ids >> 32;

    // Ignore ior userland process itself
    if (*pid == IOR_PID_FILTER)
        return FILTER;

    *tid = ids & 0xFFFFFFFF;

    // Reject on the first configured gate that does not match.
    if (-1 != PID_FILTER && *pid != PID_FILTER)
        return FILTER;
    if (-1 != TID_FILTER && *tid != TID_FILTER)
        return FILTER;

    return ACCEPT;
}