diff options
| author | Paul Buetow <paul@buetow.org> | 2026-05-30 10:49:46 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-05-30 10:49:46 +0300 |
| commit | ee3a7106a724dc193589ab652ef21503ba291562 (patch) | |
| tree | 4d5d0f2b75e085880f7f560de1546337aa2c14ce /internal | |
| parent | 5e334d33b3ddc3e308455262577c461835939bc7 (diff) | |
fix(sleep): record sentinel for TIMER_ABSTIME clock_nanosleep (a20)
clock_nanosleep with the TIMER_ABSTIME flag passes an ABSOLUTE wakeup
time in the request timespec, not a relative duration. The generated BPF
sleep handler computed requested_ns = tv_sec*1e9 + tv_nsec
unconditionally, so absolute sleeps exported a bogus multi-decade
"sleep duration" in CSV/parquet/stream.
generateExtraSleep now carries an optional flags-argument expression per
sleep syscall. For clock_nanosleep the generated handler checks
args[1] & TIMER_ABSTIME (value 1) and only computes the relative
duration when the flag is clear; absolute sleeps keep the existing -1
sentinel (same value used for null/unreadable timespec pointers).
nanosleep is always relative and stays unconditional (no flags arg).
- Regenerated internal/c/generated_tracepoints.c (mage generate idempotent).
- Added codegen tests asserting the TIMER_ABSTIME guard for clock_nanosleep
and its absence for nanosleep.
- Extended the ioworkload sleep scenario to issue an absolute clock_nanosleep
and the sleep parquet integration test to assert it is reported as -1.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Diffstat (limited to 'internal')
| -rw-r--r-- | internal/c/generated_tracepoints.c | 4 | ||||
| -rw-r--r-- | internal/generate/bpfhandler.go | 67 | ||||
| -rw-r--r-- | internal/generate/codegen_test.go | 25 |
3 files changed, 84 insertions, 12 deletions
diff --git a/internal/c/generated_tracepoints.c b/internal/c/generated_tracepoints.c index e38e1af..fbf690c 100644 --- a/internal/c/generated_tracepoints.c +++ b/internal/c/generated_tracepoints.c @@ -14536,7 +14536,9 @@ int handle_sys_enter_clock_nanosleep(struct syscall_trace_enter *ctx) { __s64 tv_nsec; } ts = {}; if (bpf_probe_read_user(&ts, sizeof(ts), (void *)ctx->args[2]) == 0) { - ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec; + if ((ctx->args[1] & 1 /* TIMER_ABSTIME */) == 0) { + ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec; + } } } diff --git a/internal/generate/bpfhandler.go b/internal/generate/bpfhandler.go index 7b8fea0..1dff4d6 100644 --- a/internal/generate/bpfhandler.go +++ b/internal/generate/bpfhandler.go @@ -494,23 +494,68 @@ func memExpr(expr string) string { return expr } -// sleepTimespecPtr maps sleep-family syscall names to the C expression pointing -// at the user-space timespec struct. Syscalls not listed default to "0" (no -// pointer), which makes the generated code skip the probe_read_user call. -// To add a new sleep-like syscall, register its timespec pointer expression here. -var sleepTimespecPtr = map[string]string{ - "sys_enter_nanosleep": "ctx->args[0]", - "sys_enter_clock_nanosleep": "ctx->args[2]", -} +// sleepSpec describes how a sleep-family syscall exposes its requested sleep +// duration. +// +// - ptr is the C expression pointing at the user-space timespec struct. +// - flagsArg is the C expression for the flags argument that may carry +// TIMER_ABSTIME; it is empty for syscalls whose request is always relative. +type sleepSpec struct { + ptr string + flagsArg string +} + +// sleepTimespecPtr maps sleep-family syscall names to their timespec pointer and +// (where applicable) flags-argument expressions. Syscalls not listed default to +// ptr "0" (no pointer), which makes the generated code skip the probe_read_user +// call. To add a new sleep-like syscall, register it here. +// +// nanosleep(const struct timespec *req, struct timespec *rem) is ALWAYS a +// relative sleep, so it has no flagsArg. clock_nanosleep(clockid_t clockid, +// int flags, const struct timespec *request, struct timespec *remain) takes a +// flags argument: when flags & TIMER_ABSTIME is set, *request is an ABSOLUTE +// wakeup time against clockid, not a relative duration — see generateExtraSleep. +var sleepTimespecPtr = map[string]sleepSpec{ + "sys_enter_nanosleep": {ptr: "ctx->args[0]"}, + "sys_enter_clock_nanosleep": {ptr: "ctx->args[2]", flagsArg: "ctx->args[1]"}, +} + +// timerAbstimeFlag is the Linux TIMER_ABSTIME flag value (uapi/linux/time.h). +// When set in clock_nanosleep's flags argument, the request timespec is an +// absolute wakeup time rather than a relative duration. +const timerAbstimeFlag = "1 /* TIMER_ABSTIME */" // generateExtraSleep emits the requested_ns capture body for sleep-family -// syscalls. The timespec pointer expression comes from sleepTimespecPtr. +// syscalls. The timespec pointer (and optional flags) expression come from +// sleepTimespecPtr. +// +// requested_ns defaults to the -1 sentinel (the same value used for a +// null/unreadable timespec pointer). For relative sleeps we overwrite it with +// tv_sec*1e9 + tv_nsec. For an absolute sleep (clock_nanosleep with +// TIMER_ABSTIME set) the request timespec is an absolute clock value, NOT a +// duration; computing tv_sec*1e9 + tv_nsec there would export a bogus +// multi-decade "sleep duration". Deriving the true relative duration would +// require reading the current time of the (variable) clockid in BPF, which is +// racy and clock-dependent. Instead we leave the -1 sentinel so downstream +// consumers (CSV/parquet/stream) report "unknown" rather than a misleading +// value. func generateExtraSleep(name string) string { - ptrExpr := sleepTimespecPtr[name] // empty string if not found + spec := sleepTimespecPtr[name] // zero value (ptr "") if not found + ptrExpr := spec.ptr if ptrExpr == "" { ptrExpr = "0" } - return " ev->requested_ns = -1;\n if (" + ptrExpr + " != 0) {\n struct __ior_timespec {\n __s64 tv_sec;\n __s64 tv_nsec;\n } ts = {};\n if (bpf_probe_read_user(&ts, sizeof(ts), (void *)" + ptrExpr + ") == 0) {\n ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;\n }\n }\n" + + compute := " ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;\n" + if spec.flagsArg != "" { + // Absolute sleeps keep the -1 sentinel; only relative sleeps get a + // computed duration. + compute = " if ((" + spec.flagsArg + " & " + timerAbstimeFlag + ") == 0) {\n" + + " ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;\n" + + " }\n" + } + + return " ev->requested_ns = -1;\n if (" + ptrExpr + " != 0) {\n struct __ior_timespec {\n __s64 tv_sec;\n __s64 tv_nsec;\n } ts = {};\n if (bpf_probe_read_user(&ts, sizeof(ts), (void *)" + ptrExpr + ") == 0) {\n" + compute + " }\n }\n" } // keyctlFieldSpec describes the three fields captured for keyctl-family syscalls. diff --git a/internal/generate/codegen_test.go b/internal/generate/codegen_test.go index 9bd391e..880d67d 100644 --- a/internal/generate/codegen_test.go +++ b/internal/generate/codegen_test.go @@ -1371,6 +1371,10 @@ func TestGenerateSleepHandlerCapturesRequestedTimespec(t *testing.T) { requireContains(t, output, "ev->requested_ns = -1;") requireContains(t, output, "if (ctx->args[0] != 0) {") requireContains(t, output, "ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;") + // nanosleep is ALWAYS a relative sleep (no flags argument), so its handler + // must compute the duration unconditionally — no TIMER_ABSTIME flags check. + requireNotContains(t, output, "TIMER_ABSTIME") + requireNotContains(t, output, "& 1 /* TIMER_ABSTIME */") } func TestGenerateClockNanosleepHandlerCapturesRequestedTimespec(t *testing.T) { @@ -1386,6 +1390,27 @@ func TestGenerateClockNanosleepHandlerCapturesRequestedTimespec(t *testing.T) { requireContains(t, output, "ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;") } +// TestGenerateClockNanosleepHandlerSkipsAbsoluteSleeps locks in the +// TIMER_ABSTIME fix: when flags (args[1]) has TIMER_ABSTIME set, the request +// timespec is an ABSOLUTE wakeup time, not a relative duration, so the handler +// must keep the -1 sentinel instead of exporting a bogus multi-decade +// "sleep duration". The relative-duration computation is therefore guarded by a +// flags check, and only runs when the flag is clear. +func TestGenerateClockNanosleepHandlerSkipsAbsoluteSleeps(t *testing.T) { + output := generateFromPair(t, FormatClockNanosleep, FormatExitClockNanosleep) + + // The flags check on args[1] against TIMER_ABSTIME (value 1) must be present, + // guarding the relative-duration assignment. + requireContains(t, output, "if ((ctx->args[1] & 1 /* TIMER_ABSTIME */) == 0) {") + // The duration is computed inside the guard (relative branch only); the abs + // branch leaves the -1 sentinel set above. + requireContains(t, output, + " if (bpf_probe_read_user(&ts, sizeof(ts), (void *)ctx->args[2]) == 0) {\n"+ + " if ((ctx->args[1] & 1 /* TIMER_ABSTIME */) == 0) {\n"+ + " ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;\n"+ + " }\n }") +} + // TestClockNanosleepExitHandlerIsUnclassifiedRet locks in that the exit side of // clock_nanosleep records a plain ret_event with ret_type UNCLASSIFIED. The // syscall returns 0 on success or a positive errno (and -1 only when invoked |
