summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2026-05-30 10:49:46 +0300
committerPaul Buetow <paul@buetow.org>2026-05-30 10:49:46 +0300
commitee3a7106a724dc193589ab652ef21503ba291562 (patch)
tree4d5d0f2b75e085880f7f560de1546337aa2c14ce
parent5e334d33b3ddc3e308455262577c461835939bc7 (diff)
fix(sleep): record sentinel for TIMER_ABSTIME clock_nanosleep (a20)
clock_nanosleep with the TIMER_ABSTIME flag passes an ABSOLUTE wakeup time in the request timespec, not a relative duration. The generated BPF sleep handler computed requested_ns = tv_sec*1e9 + tv_nsec unconditionally, so absolute sleeps exported a bogus multi-decade "sleep duration" in CSV/parquet/stream. generateExtraSleep now carries an optional flags-argument expression per sleep syscall. For clock_nanosleep the generated handler checks args[1] & TIMER_ABSTIME (value 1) and only computes the relative duration when the flag is clear; absolute sleeps keep the existing -1 sentinel (same value used for null/unreadable timespec pointers). nanosleep is always relative and stays unconditional (no flags arg). - Regenerated internal/c/generated_tracepoints.c (mage generate idempotent). - Added codegen tests asserting the TIMER_ABSTIME guard for clock_nanosleep and its absence for nanosleep. - Extended the ioworkload sleep scenario to issue an absolute clock_nanosleep and the sleep parquet integration test to assert it is reported as -1. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
-rw-r--r--cmd/ioworkload/scenario_sleep.go34
-rw-r--r--integrationtests/sleep_test.go29
-rw-r--r--internal/c/generated_tracepoints.c4
-rw-r--r--internal/generate/bpfhandler.go67
-rw-r--r--internal/generate/codegen_test.go25
5 files changed, 140 insertions, 19 deletions
diff --git a/cmd/ioworkload/scenario_sleep.go b/cmd/ioworkload/scenario_sleep.go
index bb6c5a4..c004c4b 100644
--- a/cmd/ioworkload/scenario_sleep.go
+++ b/cmd/ioworkload/scenario_sleep.go
@@ -21,17 +21,47 @@ func sleepSyscalls() error {
if err := callClockNanosleep(3_000_000); err != nil {
return err
}
+ // Also exercise the TIMER_ABSTIME (absolute) path so the tracer's
+ // absolute-sleep handling is covered: the request timespec here is an
+ // absolute CLOCK_MONOTONIC wakeup time, not a relative duration, so the
+ // tracer must NOT report it as a sleep duration (see task a20).
+ if err := callClockNanosleepAbs(2_000_000); err != nil {
+ return err
+ }
}
return nil
}
+// callClockNanosleep issues a relative clock_nanosleep (flags = 0), sleeping for
+// requestedNs nanoseconds.
func callClockNanosleep(requestedNs int64) error {
req := unix.Timespec{Sec: requestedNs / 1_000_000_000, Nsec: requestedNs % 1_000_000_000}
+ return invokeClockNanosleep(0, &req)
+}
+
+// callClockNanosleepAbs issues an absolute clock_nanosleep (flags =
+// TIMER_ABSTIME). The request timespec is an absolute CLOCK_MONOTONIC wakeup
+// time computed as "now + aheadNs", so the call blocks for roughly aheadNs but
+// the request value itself is a multi-decade absolute timestamp, not a
+// duration.
+func callClockNanosleepAbs(aheadNs int64) error {
+ var now unix.Timespec
+ if err := unix.ClockGettime(unix.CLOCK_MONOTONIC, &now); err != nil {
+ return fmt.Errorf("clock_gettime: %w", err)
+ }
+ target := now.Nano() + aheadNs
+ req := unix.Timespec{Sec: target / 1_000_000_000, Nsec: target % 1_000_000_000}
+ return invokeClockNanosleep(unix.TIMER_ABSTIME, &req)
+}
+
+// invokeClockNanosleep is the shared raw clock_nanosleep syscall wrapper used by
+// both the relative and absolute callers.
+func invokeClockNanosleep(flags uintptr, req *unix.Timespec) error {
_, _, errno := syscall.RawSyscall6(
unix.SYS_CLOCK_NANOSLEEP,
uintptr(unix.CLOCK_MONOTONIC),
- 0,
- uintptr(unsafe.Pointer(&req)),
+ flags,
+ uintptr(unsafe.Pointer(req)),
0,
0,
0,
diff --git a/integrationtests/sleep_test.go b/integrationtests/sleep_test.go
index fbd93a7..d7dbdd5 100644
--- a/integrationtests/sleep_test.go
+++ b/integrationtests/sleep_test.go
@@ -38,8 +38,14 @@ func TestSleepRequestedTimespecInParquet(t *testing.T) {
t.Fatalf("expected parquet rows for workload PID %d", pid)
}
+ // The workload issues, per loop iteration: a relative nanosleep (2ms), a
+ // relative clock_nanosleep (3ms), and an ABSOLUTE clock_nanosleep
+ // (TIMER_ABSTIME) whose request is an absolute CLOCK_MONOTONIC timestamp.
+ // The absolute one must be reported as the -1 sentinel, never as a bogus
+ // multi-decade "sleep duration" (task a20).
var sawNanosleep bool
- var sawClockNanosleep bool
+ var sawClockNanosleepRel bool
+ var sawClockNanosleepAbs bool
for _, row := range rows {
switch row.Syscall {
case "nanosleep":
@@ -50,8 +56,18 @@ func TestSleepRequestedTimespecInParquet(t *testing.T) {
t.Fatalf("nanosleep bytes = %d, want 0", row.Bytes)
}
case "clock_nanosleep":
- if row.RequestedSleepNS == 3_000_000 {
- sawClockNanosleep = true
+ switch row.RequestedSleepNS {
+ case 3_000_000:
+ sawClockNanosleepRel = true
+ case -1:
+ // Absolute (TIMER_ABSTIME) sleep: sentinel, not a duration.
+ sawClockNanosleepAbs = true
+ default:
+ // Any large positive value here means the absolute wakeup
+ // timestamp leaked through as a duration — the bug in a20.
+ if row.RequestedSleepNS > 1_000_000_000 {
+ t.Fatalf("clock_nanosleep RequestedSleepNS = %d looks like an absolute timestamp; TIMER_ABSTIME must record -1", row.RequestedSleepNS)
+ }
}
if row.Bytes != 0 {
t.Fatalf("clock_nanosleep bytes = %d, want 0", row.Bytes)
@@ -62,7 +78,10 @@ func TestSleepRequestedTimespecInParquet(t *testing.T) {
if !sawNanosleep {
t.Fatal("expected nanosleep row with RequestedSleepNS=2000000")
}
- if !sawClockNanosleep {
- t.Fatal("expected clock_nanosleep row with RequestedSleepNS=3000000")
+ if !sawClockNanosleepRel {
+ t.Fatal("expected relative clock_nanosleep row with RequestedSleepNS=3000000")
+ }
+ if !sawClockNanosleepAbs {
+ t.Fatal("expected absolute (TIMER_ABSTIME) clock_nanosleep row with RequestedSleepNS=-1")
}
}
diff --git a/internal/c/generated_tracepoints.c b/internal/c/generated_tracepoints.c
index e38e1af..fbf690c 100644
--- a/internal/c/generated_tracepoints.c
+++ b/internal/c/generated_tracepoints.c
@@ -14536,7 +14536,9 @@ int handle_sys_enter_clock_nanosleep(struct syscall_trace_enter *ctx) {
__s64 tv_nsec;
} ts = {};
if (bpf_probe_read_user(&ts, sizeof(ts), (void *)ctx->args[2]) == 0) {
- ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;
+ if ((ctx->args[1] & 1 /* TIMER_ABSTIME */) == 0) {
+ ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;
+ }
}
}
diff --git a/internal/generate/bpfhandler.go b/internal/generate/bpfhandler.go
index 7b8fea0..1dff4d6 100644
--- a/internal/generate/bpfhandler.go
+++ b/internal/generate/bpfhandler.go
@@ -494,23 +494,68 @@ func memExpr(expr string) string {
return expr
}
-// sleepTimespecPtr maps sleep-family syscall names to the C expression pointing
-// at the user-space timespec struct. Syscalls not listed default to "0" (no
-// pointer), which makes the generated code skip the probe_read_user call.
-// To add a new sleep-like syscall, register its timespec pointer expression here.
-var sleepTimespecPtr = map[string]string{
- "sys_enter_nanosleep": "ctx->args[0]",
- "sys_enter_clock_nanosleep": "ctx->args[2]",
-}
+// sleepSpec describes how a sleep-family syscall exposes its requested sleep
+// duration.
+//
+// - ptr is the C expression pointing at the user-space timespec struct.
+// - flagsArg is the C expression for the flags argument that may carry
+// TIMER_ABSTIME; it is empty for syscalls whose request is always relative.
+type sleepSpec struct {
+ ptr string
+ flagsArg string
+}
+
+// sleepTimespecPtr maps sleep-family syscall names to their timespec pointer and
+// (where applicable) flags-argument expressions. Syscalls not listed default to
+// ptr "0" (no pointer), which makes the generated code skip the probe_read_user
+// call. To add a new sleep-like syscall, register it here.
+//
+// nanosleep(const struct timespec *req, struct timespec *rem) is ALWAYS a
+// relative sleep, so it has no flagsArg. clock_nanosleep(clockid_t clockid,
+// int flags, const struct timespec *request, struct timespec *remain) takes a
+// flags argument: when flags & TIMER_ABSTIME is set, *request is an ABSOLUTE
+// wakeup time against clockid, not a relative duration — see generateExtraSleep.
+var sleepTimespecPtr = map[string]sleepSpec{
+ "sys_enter_nanosleep": {ptr: "ctx->args[0]"},
+ "sys_enter_clock_nanosleep": {ptr: "ctx->args[2]", flagsArg: "ctx->args[1]"},
+}
+
+// timerAbstimeFlag is the Linux TIMER_ABSTIME flag value (uapi/linux/time.h).
+// When set in clock_nanosleep's flags argument, the request timespec is an
+// absolute wakeup time rather than a relative duration.
+const timerAbstimeFlag = "1 /* TIMER_ABSTIME */"
// generateExtraSleep emits the requested_ns capture body for sleep-family
-// syscalls. The timespec pointer expression comes from sleepTimespecPtr.
+// syscalls. The timespec pointer (and optional flags) expression come from
+// sleepTimespecPtr.
+//
+// requested_ns defaults to the -1 sentinel (the same value used for a
+// null/unreadable timespec pointer). For relative sleeps we overwrite it with
+// tv_sec*1e9 + tv_nsec. For an absolute sleep (clock_nanosleep with
+// TIMER_ABSTIME set) the request timespec is an absolute clock value, NOT a
+// duration; computing tv_sec*1e9 + tv_nsec there would export a bogus
+// multi-decade "sleep duration". Deriving the true relative duration would
+// require reading the current time of the (variable) clockid in BPF, which is
+// racy and clock-dependent. Instead we leave the -1 sentinel so downstream
+// consumers (CSV/parquet/stream) report "unknown" rather than a misleading
+// value.
func generateExtraSleep(name string) string {
- ptrExpr := sleepTimespecPtr[name] // empty string if not found
+ spec := sleepTimespecPtr[name] // zero value (ptr "") if not found
+ ptrExpr := spec.ptr
if ptrExpr == "" {
ptrExpr = "0"
}
- return " ev->requested_ns = -1;\n if (" + ptrExpr + " != 0) {\n struct __ior_timespec {\n __s64 tv_sec;\n __s64 tv_nsec;\n } ts = {};\n if (bpf_probe_read_user(&ts, sizeof(ts), (void *)" + ptrExpr + ") == 0) {\n ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;\n }\n }\n"
+
+ compute := " ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;\n"
+ if spec.flagsArg != "" {
+ // Absolute sleeps keep the -1 sentinel; only relative sleeps get a
+ // computed duration.
+ compute = " if ((" + spec.flagsArg + " & " + timerAbstimeFlag + ") == 0) {\n" +
+ " ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;\n" +
+ " }\n"
+ }
+
+ return " ev->requested_ns = -1;\n if (" + ptrExpr + " != 0) {\n struct __ior_timespec {\n __s64 tv_sec;\n __s64 tv_nsec;\n } ts = {};\n if (bpf_probe_read_user(&ts, sizeof(ts), (void *)" + ptrExpr + ") == 0) {\n" + compute + " }\n }\n"
}
// keyctlFieldSpec describes the three fields captured for keyctl-family syscalls.
diff --git a/internal/generate/codegen_test.go b/internal/generate/codegen_test.go
index 9bd391e..880d67d 100644
--- a/internal/generate/codegen_test.go
+++ b/internal/generate/codegen_test.go
@@ -1371,6 +1371,10 @@ func TestGenerateSleepHandlerCapturesRequestedTimespec(t *testing.T) {
requireContains(t, output, "ev->requested_ns = -1;")
requireContains(t, output, "if (ctx->args[0] != 0) {")
requireContains(t, output, "ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;")
+ // nanosleep is ALWAYS a relative sleep (no flags argument), so its handler
+ // must compute the duration unconditionally — no TIMER_ABSTIME flags check.
+ requireNotContains(t, output, "TIMER_ABSTIME")
+ requireNotContains(t, output, "& 1 /* TIMER_ABSTIME */")
}
func TestGenerateClockNanosleepHandlerCapturesRequestedTimespec(t *testing.T) {
@@ -1386,6 +1390,27 @@ func TestGenerateClockNanosleepHandlerCapturesRequestedTimespec(t *testing.T) {
requireContains(t, output, "ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;")
}
+// TestGenerateClockNanosleepHandlerSkipsAbsoluteSleeps locks in the
+// TIMER_ABSTIME fix: when flags (args[1]) has TIMER_ABSTIME set, the request
+// timespec is an ABSOLUTE wakeup time, not a relative duration, so the handler
+// must keep the -1 sentinel instead of exporting a bogus multi-decade
+// "sleep duration". The relative-duration computation is therefore guarded by a
+// flags check, and only runs when the flag is clear.
+func TestGenerateClockNanosleepHandlerSkipsAbsoluteSleeps(t *testing.T) {
+ output := generateFromPair(t, FormatClockNanosleep, FormatExitClockNanosleep)
+
+ // The flags check on args[1] against TIMER_ABSTIME (value 1) must be present,
+ // guarding the relative-duration assignment.
+ requireContains(t, output, "if ((ctx->args[1] & 1 /* TIMER_ABSTIME */) == 0) {")
+ // The duration is computed inside the guard (relative branch only); the abs
+ // branch leaves the -1 sentinel set above.
+ requireContains(t, output,
+ " if (bpf_probe_read_user(&ts, sizeof(ts), (void *)ctx->args[2]) == 0) {\n"+
+ " if ((ctx->args[1] & 1 /* TIMER_ABSTIME */) == 0) {\n"+
+ " ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;\n"+
+ " }\n }")
+}
+
// TestClockNanosleepExitHandlerIsUnclassifiedRet locks in that the exit side of
// clock_nanosleep records a plain ret_event with ret_type UNCLASSIFIED. The
// syscall returns 0 on success or a positive errno (and -1 only when invoked