diff options
| -rw-r--r-- | cmd/ioworkload/scenario_sleep.go | 34 | ||||
| -rw-r--r-- | integrationtests/sleep_test.go | 29 | ||||
| -rw-r--r-- | internal/c/generated_tracepoints.c | 4 | ||||
| -rw-r--r-- | internal/generate/bpfhandler.go | 67 | ||||
| -rw-r--r-- | internal/generate/codegen_test.go | 25 |
5 files changed, 140 insertions, 19 deletions
diff --git a/cmd/ioworkload/scenario_sleep.go b/cmd/ioworkload/scenario_sleep.go index bb6c5a4..c004c4b 100644 --- a/cmd/ioworkload/scenario_sleep.go +++ b/cmd/ioworkload/scenario_sleep.go @@ -21,17 +21,47 @@ func sleepSyscalls() error { if err := callClockNanosleep(3_000_000); err != nil { return err } + // Also exercise the TIMER_ABSTIME (absolute) path so the tracer's + // absolute-sleep handling is covered: the request timespec here is an + // absolute CLOCK_MONOTONIC wakeup time, not a relative duration, so the + // tracer must NOT report it as a sleep duration (see task a20). + if err := callClockNanosleepAbs(2_000_000); err != nil { + return err + } } return nil } +// callClockNanosleep issues a relative clock_nanosleep (flags = 0), sleeping for +// requestedNs nanoseconds. func callClockNanosleep(requestedNs int64) error { req := unix.Timespec{Sec: requestedNs / 1_000_000_000, Nsec: requestedNs % 1_000_000_000} + return invokeClockNanosleep(0, &req) +} + +// callClockNanosleepAbs issues an absolute clock_nanosleep (flags = +// TIMER_ABSTIME). The request timespec is an absolute CLOCK_MONOTONIC wakeup +// time computed as "now + aheadNs", so the call blocks for roughly aheadNs but +// the request value itself is a multi-decade absolute timestamp, not a +// duration. +func callClockNanosleepAbs(aheadNs int64) error { + var now unix.Timespec + if err := unix.ClockGettime(unix.CLOCK_MONOTONIC, &now); err != nil { + return fmt.Errorf("clock_gettime: %w", err) + } + target := now.Nano() + aheadNs + req := unix.Timespec{Sec: target / 1_000_000_000, Nsec: target % 1_000_000_000} + return invokeClockNanosleep(unix.TIMER_ABSTIME, &req) +} + +// invokeClockNanosleep is the shared raw clock_nanosleep syscall wrapper used by +// both the relative and absolute callers. +func invokeClockNanosleep(flags uintptr, req *unix.Timespec) error { _, _, errno := syscall.RawSyscall6( unix.SYS_CLOCK_NANOSLEEP, uintptr(unix.CLOCK_MONOTONIC), - 0, - uintptr(unsafe.Pointer(&req)), + flags, + uintptr(unsafe.Pointer(req)), 0, 0, 0, diff --git a/integrationtests/sleep_test.go b/integrationtests/sleep_test.go index fbd93a7..d7dbdd5 100644 --- a/integrationtests/sleep_test.go +++ b/integrationtests/sleep_test.go @@ -38,8 +38,14 @@ func TestSleepRequestedTimespecInParquet(t *testing.T) { t.Fatalf("expected parquet rows for workload PID %d", pid) } + // The workload issues, per loop iteration: a relative nanosleep (2ms), a + // relative clock_nanosleep (3ms), and an ABSOLUTE clock_nanosleep + // (TIMER_ABSTIME) whose request is an absolute CLOCK_MONOTONIC timestamp. + // The absolute one must be reported as the -1 sentinel, never as a bogus + // multi-decade "sleep duration" (task a20). var sawNanosleep bool - var sawClockNanosleep bool + var sawClockNanosleepRel bool + var sawClockNanosleepAbs bool for _, row := range rows { switch row.Syscall { case "nanosleep": @@ -50,8 +56,18 @@ func TestSleepRequestedTimespecInParquet(t *testing.T) { t.Fatalf("nanosleep bytes = %d, want 0", row.Bytes) } case "clock_nanosleep": - if row.RequestedSleepNS == 3_000_000 { - sawClockNanosleep = true + switch row.RequestedSleepNS { + case 3_000_000: + sawClockNanosleepRel = true + case -1: + // Absolute (TIMER_ABSTIME) sleep: sentinel, not a duration. + sawClockNanosleepAbs = true + default: + // Any large positive value here means the absolute wakeup + // timestamp leaked through as a duration — the bug in a20. + if row.RequestedSleepNS > 1_000_000_000 { + t.Fatalf("clock_nanosleep RequestedSleepNS = %d looks like an absolute timestamp; TIMER_ABSTIME must record -1", row.RequestedSleepNS) + } } if row.Bytes != 0 { t.Fatalf("clock_nanosleep bytes = %d, want 0", row.Bytes) @@ -62,7 +78,10 @@ func TestSleepRequestedTimespecInParquet(t *testing.T) { if !sawNanosleep { t.Fatal("expected nanosleep row with RequestedSleepNS=2000000") } - if !sawClockNanosleep { - t.Fatal("expected clock_nanosleep row with RequestedSleepNS=3000000") + if !sawClockNanosleepRel { + t.Fatal("expected relative clock_nanosleep row with RequestedSleepNS=3000000") + } + if !sawClockNanosleepAbs { + t.Fatal("expected absolute (TIMER_ABSTIME) clock_nanosleep row with RequestedSleepNS=-1") } } diff --git a/internal/c/generated_tracepoints.c b/internal/c/generated_tracepoints.c index e38e1af..fbf690c 100644 --- a/internal/c/generated_tracepoints.c +++ b/internal/c/generated_tracepoints.c @@ -14536,7 +14536,9 @@ int handle_sys_enter_clock_nanosleep(struct syscall_trace_enter *ctx) { __s64 tv_nsec; } ts = {}; if (bpf_probe_read_user(&ts, sizeof(ts), (void *)ctx->args[2]) == 0) { - ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec; + if ((ctx->args[1] & 1 /* TIMER_ABSTIME */) == 0) { + ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec; + } } } diff --git a/internal/generate/bpfhandler.go b/internal/generate/bpfhandler.go index 7b8fea0..1dff4d6 100644 --- a/internal/generate/bpfhandler.go +++ b/internal/generate/bpfhandler.go @@ -494,23 +494,68 @@ func memExpr(expr string) string { return expr } -// sleepTimespecPtr maps sleep-family syscall names to the C expression pointing -// at the user-space timespec struct. Syscalls not listed default to "0" (no -// pointer), which makes the generated code skip the probe_read_user call. -// To add a new sleep-like syscall, register its timespec pointer expression here. -var sleepTimespecPtr = map[string]string{ - "sys_enter_nanosleep": "ctx->args[0]", - "sys_enter_clock_nanosleep": "ctx->args[2]", -} +// sleepSpec describes how a sleep-family syscall exposes its requested sleep +// duration. +// +// - ptr is the C expression pointing at the user-space timespec struct. +// - flagsArg is the C expression for the flags argument that may carry +// TIMER_ABSTIME; it is empty for syscalls whose request is always relative. +type sleepSpec struct { + ptr string + flagsArg string +} + +// sleepTimespecPtr maps sleep-family syscall names to their timespec pointer and +// (where applicable) flags-argument expressions. Syscalls not listed default to +// ptr "0" (no pointer), which makes the generated code skip the probe_read_user +// call. To add a new sleep-like syscall, register it here. +// +// nanosleep(const struct timespec *req, struct timespec *rem) is ALWAYS a +// relative sleep, so it has no flagsArg. clock_nanosleep(clockid_t clockid, +// int flags, const struct timespec *request, struct timespec *remain) takes a +// flags argument: when flags & TIMER_ABSTIME is set, *request is an ABSOLUTE +// wakeup time against clockid, not a relative duration — see generateExtraSleep. +var sleepTimespecPtr = map[string]sleepSpec{ + "sys_enter_nanosleep": {ptr: "ctx->args[0]"}, + "sys_enter_clock_nanosleep": {ptr: "ctx->args[2]", flagsArg: "ctx->args[1]"}, +} + +// timerAbstimeFlag is the Linux TIMER_ABSTIME flag value (uapi/linux/time.h). +// When set in clock_nanosleep's flags argument, the request timespec is an +// absolute wakeup time rather than a relative duration. +const timerAbstimeFlag = "1 /* TIMER_ABSTIME */" // generateExtraSleep emits the requested_ns capture body for sleep-family -// syscalls. The timespec pointer expression comes from sleepTimespecPtr. +// syscalls. The timespec pointer (and optional flags) expression come from +// sleepTimespecPtr. +// +// requested_ns defaults to the -1 sentinel (the same value used for a +// null/unreadable timespec pointer). For relative sleeps we overwrite it with +// tv_sec*1e9 + tv_nsec. For an absolute sleep (clock_nanosleep with +// TIMER_ABSTIME set) the request timespec is an absolute clock value, NOT a +// duration; computing tv_sec*1e9 + tv_nsec there would export a bogus +// multi-decade "sleep duration". Deriving the true relative duration would +// require reading the current time of the (variable) clockid in BPF, which is +// racy and clock-dependent. Instead we leave the -1 sentinel so downstream +// consumers (CSV/parquet/stream) report "unknown" rather than a misleading +// value. func generateExtraSleep(name string) string { - ptrExpr := sleepTimespecPtr[name] // empty string if not found + spec := sleepTimespecPtr[name] // zero value (ptr "") if not found + ptrExpr := spec.ptr if ptrExpr == "" { ptrExpr = "0" } - return " ev->requested_ns = -1;\n if (" + ptrExpr + " != 0) {\n struct __ior_timespec {\n __s64 tv_sec;\n __s64 tv_nsec;\n } ts = {};\n if (bpf_probe_read_user(&ts, sizeof(ts), (void *)" + ptrExpr + ") == 0) {\n ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;\n }\n }\n" + + compute := " ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;\n" + if spec.flagsArg != "" { + // Absolute sleeps keep the -1 sentinel; only relative sleeps get a + // computed duration. + compute = " if ((" + spec.flagsArg + " & " + timerAbstimeFlag + ") == 0) {\n" + + " ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;\n" + + " }\n" + } + + return " ev->requested_ns = -1;\n if (" + ptrExpr + " != 0) {\n struct __ior_timespec {\n __s64 tv_sec;\n __s64 tv_nsec;\n } ts = {};\n if (bpf_probe_read_user(&ts, sizeof(ts), (void *)" + ptrExpr + ") == 0) {\n" + compute + " }\n }\n" } // keyctlFieldSpec describes the three fields captured for keyctl-family syscalls. diff --git a/internal/generate/codegen_test.go b/internal/generate/codegen_test.go index 9bd391e..880d67d 100644 --- a/internal/generate/codegen_test.go +++ b/internal/generate/codegen_test.go @@ -1371,6 +1371,10 @@ func TestGenerateSleepHandlerCapturesRequestedTimespec(t *testing.T) { requireContains(t, output, "ev->requested_ns = -1;") requireContains(t, output, "if (ctx->args[0] != 0) {") requireContains(t, output, "ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;") + // nanosleep is ALWAYS a relative sleep (no flags argument), so its handler + // must compute the duration unconditionally — no TIMER_ABSTIME flags check. + requireNotContains(t, output, "TIMER_ABSTIME") + requireNotContains(t, output, "& 1 /* TIMER_ABSTIME */") } func TestGenerateClockNanosleepHandlerCapturesRequestedTimespec(t *testing.T) { @@ -1386,6 +1390,27 @@ func TestGenerateClockNanosleepHandlerCapturesRequestedTimespec(t *testing.T) { requireContains(t, output, "ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;") } +// TestGenerateClockNanosleepHandlerSkipsAbsoluteSleeps locks in the +// TIMER_ABSTIME fix: when flags (args[1]) has TIMER_ABSTIME set, the request +// timespec is an ABSOLUTE wakeup time, not a relative duration, so the handler +// must keep the -1 sentinel instead of exporting a bogus multi-decade +// "sleep duration". The relative-duration computation is therefore guarded by a +// flags check, and only runs when the flag is clear. +func TestGenerateClockNanosleepHandlerSkipsAbsoluteSleeps(t *testing.T) { + output := generateFromPair(t, FormatClockNanosleep, FormatExitClockNanosleep) + + // The flags check on args[1] against TIMER_ABSTIME (value 1) must be present, + // guarding the relative-duration assignment. + requireContains(t, output, "if ((ctx->args[1] & 1 /* TIMER_ABSTIME */) == 0) {") + // The duration is computed inside the guard (relative branch only); the abs + // branch leaves the -1 sentinel set above. + requireContains(t, output, + " if (bpf_probe_read_user(&ts, sizeof(ts), (void *)ctx->args[2]) == 0) {\n"+ + " if ((ctx->args[1] & 1 /* TIMER_ABSTIME */) == 0) {\n"+ + " ev->requested_ns = ts.tv_sec * 1000000000LL + ts.tv_nsec;\n"+ + " }\n }") +} + // TestClockNanosleepExitHandlerIsUnclassifiedRet locks in that the exit side of // clock_nanosleep records a plain ret_event with ret_type UNCLASSIFIED. The // syscall returns 0 on success or a positive errno (and -1 only when invoked |
