summaryrefslogtreecommitdiff
path: root/internal/generate
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2026-05-30 10:49:46 +0300
committerPaul Buetow <paul@buetow.org>2026-05-30 10:49:46 +0300
commitee3a7106a724dc193589ab652ef21503ba291562 (patch)
tree4d5d0f2b75e085880f7f560de1546337aa2c14ce /internal/generate
parent5e334d33b3ddc3e308455262577c461835939bc7 (diff)
fix(sleep): record sentinel for TIMER_ABSTIME clock_nanosleep (a20)
clock_nanosleep with the TIMER_ABSTIME flag passes an ABSOLUTE wakeup time in the request timespec, not a relative duration. The generated BPF sleep handler computed requested_ns = tv_sec*1e9 + tv_nsec unconditionally, so absolute sleeps exported a bogus multi-decade "sleep duration" in CSV/parquet/stream. generateExtraSleep now carries an optional flags-argument expression per sleep syscall. For clock_nanosleep the generated handler checks args[1] & TIMER_ABSTIME (value 1) and only computes the relative duration when the flag is clear; absolute sleeps keep the existing -1 sentinel (same value used for null/unreadable timespec pointers). nanosleep is always relative and stays unconditional (no flags arg). - Regenerated internal/c/generated_tracepoints.c (mage generate idempotent). - Added codegen tests asserting the TIMER_ABSTIME guard for clock_nanosleep and its absence for nanosleep. - Extended the ioworkload sleep scenario to issue an absolute clock_nanosleep and the sleep parquet integration test to assert it is reported as -1. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Diffstat (limited to 'internal/generate')
-rw-r--r--internal/generate/bpfhandler.go67
-rw-r--r--internal/generate/codegen_test.go25
2 files changed, 81 insertions, 11 deletions
diff --git a/internal/generate/bpfhandler.go b/internal/generate/bpfhandler.go
index 7b8fea0..1dff4d6 100644
--- a/internal/generate/bpfhandler.go
+++ b/internal/generate/bpfhandler.go
@@ -494,23 +494,68 @@ func memExpr(expr string) string {
return expr
}
-// sleepTimespecPtr maps sleep-family syscall names to the C expression pointing
-// at the user-space timespec struct. Syscalls not listed default to "0" (no
-// pointer), which makes the generated code skip the probe_read_user call.
-// To add a new sleep-like syscall, register its timespec pointer expression here.
-var sleepTimespecPtr = map[string]string{
- "sys_enter_nanosleep": "ctx->args[0]",
- "sys_enter_clock_nanosleep": "ctx->args[2]",
-}
+// sleepSpec describes how a sleep-family syscall exposes its requested sleep
+// duration.
+//
+// - ptr is the C expression pointing at the user-space timespec struct.
+// - flagsArg is the C expression for the flags argument that may carry
+// TIMER_ABSTIME; it is empty for syscalls whose request is always relative.
+type sleepSpec struct {
+ ptr string
+ flagsArg string
+}
+
+// sleepTimespecPtr maps sleep-family syscall names to their timespec pointer and
+// (where applicable) flags-argument expressions. Syscalls not listed default to
+// ptr "0" (no pointer), which makes the generated code skip the probe_read_user
+// call. To add a new sleep-like syscall, register it here.
+//
+// nanosleep(const struct timespec *req, struct timespec *rem) is ALWAYS a
+// relative sleep, so it has no flagsArg. clock_nanosleep(clockid_t clockid,
+// int flags, const struct timespec *request, struct timespec *remain) takes a
+// flags argument: when flags & TIMER_ABSTIME is set, *request is an ABSOLUTE
+// wakeup time against clockid, not a relative duration — see generateExtraSleep.
+var sleepTimespecPtr = map[string]sleepSpec{
+ "sys_enter_nanosleep": {ptr: "ctx->args[0]"},
+ "sys_enter_clock_nanosleep": {ptr: "ctx->args[2]", flagsArg: "ctx->args[1]"},
+}
+
+// timerAbstimeFlag is the Linux TIMER_ABSTIME flag value (uapi/linux/time.h).
+// When set in clock_nanosleep's flags argument, the request timespec is an
+// absolute wakeup time rather than a relative duration.
+const timerAbstimeFlag = "1 /* TIMER_ABSTIME */"
// generateExtraSleep emits the requested_ns capture body for sleep-family
-// syscalls. The timespec pointer expression comes from sleepTimespecPtr.
+// syscalls. The timespec pointer (and optional flags) expression come from
+// sleepTimespecPtr.
+//
+// requested_ns defaults to the -1 sentinel (the same value used for a
+// null/unreadable timespec pointer). For relative sleeps we overwrite it with
+// tv_sec*1e9 + tv_nsec. For an absolute sleep (clock_nanosleep with
+// TIMER_ABSTIME set) the request timespec is an absolute clock value, NOT a
+// duration; computing tv_sec*1e9 + tv_nsec there would export a bogus
+// multi-decade "sleep duration". Deriving the true relative duration would
+// require reading the current time of the (variable) clockid in BPF, which is
+// racy and clock-dependent. Instead we leave the -1 sentinel so downstream
+// consumers (CSV/parquet/stream) report "unknown" rather than a misleading
+// value.
func generateExtraSleep(name string) string {
- ptrExpr := sleepTimespecPtr[name] // empty string if not found
+ spec := sleepTimespecPtr[name] // zero value (ptr "") if not found
+ ptrExpr := spec.ptr
if ptrExpr == "" {
ptrExpr = "0"
}
- return " ev->requested_ns = -1;\n if (" + ptrExpr + " != 0) {\n struct __ior_timespec {\n __s64 tv_sec;\n __s64 tv_nsec;\n } ts = {};\n if (bpf_probe_read_user(&ts, sizeof(ts), (void *)" + ptrExpr + ") == 0) {\n ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;\n }\n }\n"
+
+ compute := " ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;\n"
+ if spec.flagsArg != "" {
+ // Absolute sleeps keep the -1 sentinel; only relative sleeps get a
+ // computed duration.
+ compute = " if ((" + spec.flagsArg + " & " + timerAbstimeFlag + ") == 0) {\n" +
+ " ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;\n" +
+ " }\n"
+ }
+
+ return " ev->requested_ns = -1;\n if (" + ptrExpr + " != 0) {\n struct __ior_timespec {\n __s64 tv_sec;\n __s64 tv_nsec;\n } ts = {};\n if (bpf_probe_read_user(&ts, sizeof(ts), (void *)" + ptrExpr + ") == 0) {\n" + compute + " }\n }\n"
}
// keyctlFieldSpec describes the three fields captured for keyctl-family syscalls.
diff --git a/internal/generate/codegen_test.go b/internal/generate/codegen_test.go
index 9bd391e..880d67d 100644
--- a/internal/generate/codegen_test.go
+++ b/internal/generate/codegen_test.go
@@ -1371,6 +1371,10 @@ func TestGenerateSleepHandlerCapturesRequestedTimespec(t *testing.T) {
requireContains(t, output, "ev->requested_ns = -1;")
requireContains(t, output, "if (ctx->args[0] != 0) {")
requireContains(t, output, "ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;")
+ // nanosleep is ALWAYS a relative sleep (no flags argument), so its handler
+ // must compute the duration unconditionally — no TIMER_ABSTIME flags check.
+ requireNotContains(t, output, "TIMER_ABSTIME")
+ requireNotContains(t, output, "& 1 /* TIMER_ABSTIME */")
}
func TestGenerateClockNanosleepHandlerCapturesRequestedTimespec(t *testing.T) {
@@ -1386,6 +1390,27 @@ func TestGenerateClockNanosleepHandlerCapturesRequestedTimespec(t *testing.T) {
requireContains(t, output, "ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;")
}
+// TestGenerateClockNanosleepHandlerSkipsAbsoluteSleeps locks in the
+// TIMER_ABSTIME fix: when flags (args[1]) has TIMER_ABSTIME set, the request
+// timespec is an ABSOLUTE wakeup time, not a relative duration, so the handler
+// must keep the -1 sentinel instead of exporting a bogus multi-decade
+// "sleep duration". The relative-duration computation is therefore guarded by a
+// flags check, and only runs when the flag is clear.
+func TestGenerateClockNanosleepHandlerSkipsAbsoluteSleeps(t *testing.T) {
+ output := generateFromPair(t, FormatClockNanosleep, FormatExitClockNanosleep)
+
+ // The flags check on args[1] against TIMER_ABSTIME (value 1) must be present,
+ // guarding the relative-duration assignment.
+ requireContains(t, output, "if ((ctx->args[1] & 1 /* TIMER_ABSTIME */) == 0) {")
+ // The duration is computed inside the guard (relative branch only); the abs
+ // branch leaves the -1 sentinel set above.
+ requireContains(t, output,
+ " if (bpf_probe_read_user(&ts, sizeof(ts), (void *)ctx->args[2]) == 0) {\n"+
+ " if ((ctx->args[1] & 1 /* TIMER_ABSTIME */) == 0) {\n"+
+ " ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;\n"+
+ " }\n }")
+}
+
// TestClockNanosleepExitHandlerIsUnclassifiedRet locks in that the exit side of
// clock_nanosleep records a plain ret_event with ret_type UNCLASSIFIED. The
// syscall returns 0 on success or a positive errno (and -1 only when invoked