diff options
| author | Paul Buetow <paul@buetow.org> | 2026-02-23 22:58:38 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-02-23 22:58:38 +0200 |
| commit | 4b00bd30a8c1247f5dfee77277fdd4b438750bd0 (patch) | |
| tree | 547dc3a71cecfdd0bf87a54f047b4096aeedb62b /internal/statsengine | |
| parent | 44d0558be543e1e31a5178736181305d5d1ac2ba (diff) | |
task 305: add per-syscall accumulator with reservoir percentiles
Diffstat (limited to 'internal/statsengine')
| -rw-r--r-- | internal/statsengine/syscall.go | 176 | ||||
| -rw-r--r-- | internal/statsengine/syscall_test.go | 132 |
2 files changed, 308 insertions, 0 deletions
diff --git a/internal/statsengine/syscall.go b/internal/statsengine/syscall.go new file mode 100644 index 0000000..b3b8c4c --- /dev/null +++ b/internal/statsengine/syscall.go @@ -0,0 +1,176 @@ +package statsengine + +import ( + "ior/internal/event" + "ior/internal/types" + "math" + "math/rand" + "sort" + "time" +) + +const syscallReservoirSampleCapDefault = 10_000 + +type syscallAccumulator struct { + byID map[types.TraceId]*syscallStats + sampleCap int + rng *rand.Rand +} + +type syscallStats struct { + traceID types.TraceId + name string + + count uint64 + errorCount uint64 + totalBytes uint64 + totalLatency uint64 + minLatency uint64 + maxLatency uint64 + + seenLatencies uint64 + samples []uint64 +} + +func newSyscallAccumulator() *syscallAccumulator { + return newSyscallAccumulatorWithConfig(syscallReservoirSampleCapDefault, rand.New(rand.NewSource(time.Now().UnixNano()))) +} + +func newSyscallAccumulatorWithConfig(sampleCap int, rng *rand.Rand) *syscallAccumulator { + if sampleCap <= 0 { + sampleCap = syscallReservoirSampleCapDefault + } + if rng == nil { + rng = rand.New(rand.NewSource(time.Now().UnixNano())) + } + + return &syscallAccumulator{ + byID: make(map[types.TraceId]*syscallStats), + sampleCap: sampleCap, + rng: rng, + } +} + +func (a *syscallAccumulator) Add(pair *event.Pair) { + if a == nil || pair == nil || pair.EnterEv == nil { + return + } + + traceID := pair.EnterEv.GetTraceId() + stats := a.byID[traceID] + if stats == nil { + stats = &syscallStats{traceID: traceID, name: traceID.Name()} + a.byID[traceID] = stats + } + + stats.count++ + stats.totalBytes += pair.Bytes + stats.totalLatency += pair.Duration + stats.updateMinMax(pair.Duration) + stats.addSample(pair.Duration, a.sampleCap, a.rng) + + if retEv, ok := pair.ExitEv.(*types.RetEvent); ok && retEv.Ret < 0 { + stats.errorCount++ + } +} + +func (a *syscallAccumulator) Snapshot(elapsed time.Duration) []SyscallSnapshot { + if a == nil { + return nil + } + + rateDiv := elapsed.Seconds() + result := make([]SyscallSnapshot, 0, len(a.byID)) + for _, stats := range a.byID { + result = append(result, stats.toSnapshot(rateDiv)) + } + + sort.Slice(result, func(i, j int) bool { + if result[i].Count != result[j].Count { + return result[i].Count > result[j].Count + } + return result[i].Name < result[j].Name + }) + + return result +} + +func (s *syscallStats) updateMinMax(duration uint64) { + if s.count == 1 || duration < s.minLatency { + s.minLatency = duration + } + if duration > s.maxLatency { + s.maxLatency = duration + } +} + +func (s *syscallStats) addSample(duration uint64, cap int, rng *rand.Rand) { + s.seenLatencies++ + if len(s.samples) < cap { + s.samples = append(s.samples, duration) + return + } + + idx := rng.Int63n(int64(s.seenLatencies)) + if idx >= int64(cap) { + return + } + s.samples[idx] = duration +} + +func (s *syscallStats) toSnapshot(rateDiv float64) SyscallSnapshot { + sortedSamples := append([]uint64(nil), s.samples...) + sort.Slice(sortedSamples, func(i, j int) bool { + return sortedSamples[i] < sortedSamples[j] + }) + + return SyscallSnapshot{ + TraceID: s.traceID, + Name: s.name, + Count: s.count, + RatePerSec: safeRate(s.count, rateDiv), + Errors: s.errorCount, + Bytes: s.totalBytes, + LatencyMinNs: s.minLatency, + LatencyMaxNs: s.maxLatency, + LatencyMeanNs: float64(s.totalLatency) / float64(maxU64(s.count, 1)), + LatencyP50Ns: samplePercentile(sortedSamples, 0.50), + LatencyP95Ns: samplePercentile(sortedSamples, 0.95), + LatencyP99Ns: samplePercentile(sortedSamples, 0.99), + } +} + +func samplePercentile(sorted []uint64, p float64) uint64 { + if len(sorted) == 0 { + return 0 + } + if p <= 0 { + return sorted[0] + } + if p >= 1 { + return sorted[len(sorted)-1] + } + + rank := int(math.Ceil(p*float64(len(sorted)))) - 1 + if rank < 0 { + rank = 0 + } + if rank >= len(sorted) { + rank = len(sorted) - 1 + } + return sorted[rank] +} + +func safeRate(count uint64, elapsedSeconds float64) float64 { + if elapsedSeconds <= 0 { + return 0 + } + return float64(count) / elapsedSeconds +} + +func maxU64(a, b uint64) uint64 { + if a > b { + return a + } + return b +} diff --git a/internal/statsengine/syscall_test.go b/internal/statsengine/syscall_test.go new file mode 100644 index 0000000..ad29026 --- /dev/null +++ b/internal/statsengine/syscall_test.go @@ -0,0 +1,132 @@ +package statsengine + +import ( + "ior/internal/event" + "ior/internal/types" + "math" + "math/rand" + "testing" + "time" +) + +func TestSyscallAccumulatorBasicStats(t *testing.T) { + acc := newSyscallAccumulatorWithConfig(10_000, rand.New(rand.NewSource(1))) + traceID := types.SYS_ENTER_READ + + acc.Add(newPair(traceID, 10, 100, 0)) + acc.Add(newPair(traceID, 20, 50, -1)) + acc.Add(newPair(traceID, 30, 25, 5)) + + snap := acc.Snapshot(2 * time.Second) + if len(snap) != 1 { + t.Fatalf("expected 1 syscall snapshot, got %d", len(snap)) + } + got := snap[0] + + if got.TraceID != traceID { + t.Fatalf("wrong trace id: got %v want %v", got.TraceID, traceID) + } + if got.Name != traceID.Name() { + t.Fatalf("wrong name: got %q want %q", got.Name, traceID.Name()) + } + if got.Count != 3 { + t.Fatalf("wrong count: got %d want 3", got.Count) + } + if got.Errors != 1 { + t.Fatalf("wrong errors: got %d want 1", got.Errors) + } + if got.Bytes != 175 { + t.Fatalf("wrong bytes: got %d want 175", got.Bytes) + } + if got.LatencyMinNs != 10 || got.LatencyMaxNs != 30 { + t.Fatalf("wrong min/max: got (%d,%d) want (10,30)", got.LatencyMinNs, got.LatencyMaxNs) + } + if got.LatencyMeanNs != 20 { + t.Fatalf("wrong mean: got %v want 20", got.LatencyMeanNs) + } + if got.LatencyP50Ns != 20 || got.LatencyP95Ns != 30 || got.LatencyP99Ns != 30 { + t.Fatalf("wrong percentiles: got p50=%d p95=%d p99=%d", got.LatencyP50Ns, got.LatencyP95Ns, got.LatencyP99Ns) + } + if math.Abs(got.RatePerSec-1.5) > 1e-9 { + t.Fatalf("wrong rate: got %v want 1.5", got.RatePerSec) + } +} + +func TestSyscallAccumulatorSortsByCountThenName(t *testing.T) { + acc := newSyscallAccumulatorWithConfig(10_000, rand.New(rand.NewSource(2))) + + idA := types.SYS_ENTER_OPENAT + idB := types.SYS_ENTER_READ + + acc.Add(newPair(idA, 10, 0, 0)) + acc.Add(newPair(idA, 11, 0, 0)) + acc.Add(newPair(idB, 12, 0, 0)) + + snap := acc.Snapshot(1 * time.Second) + if len(snap) != 2 { + t.Fatalf("expected 2 syscall snapshots, got %d", len(snap)) + } + if snap[0].TraceID != idA { + t.Fatalf("expected first id %v, got %v", idA, snap[0].TraceID) + } + if snap[1].TraceID != idB { + t.Fatalf("expected second id %v, got %v", idB, snap[1].TraceID) + } +} + +func TestSyscallAccumulatorReservoirPercentilesAccuracy(t *testing.T) { + acc := newSyscallAccumulatorWithConfig(100, rand.New(rand.NewSource(7))) + traceID := types.SYS_ENTER_WRITE + + for d := uint64(1); d <= 10_000; d++ { + acc.Add(newPair(traceID, d, 0, 0)) + } + + snap := acc.Snapshot(1 * time.Second) + if len(snap) != 1 { + t.Fatalf("expected 1 syscall snapshot, got %d", len(snap)) + } + got := snap[0] + + assertNearPercentile(t, "p50", got.LatencyP50Ns, 5_000, 1_500) + assertNearPercentile(t, "p95", got.LatencyP95Ns, 9_500, 800) + assertNearPercentile(t, "p99", got.LatencyP99Ns, 9_900, 500) +} + +func TestSyscallAccumulatorZeroElapsedRate(t *testing.T) { + acc := newSyscallAccumulatorWithConfig(32, rand.New(rand.NewSource(9))) + acc.Add(newPair(types.SYS_ENTER_READ, 9, 0, 0)) + + snap := acc.Snapshot(0) + if len(snap) != 1 { + t.Fatalf("expected 1 syscall snapshot, got %d", len(snap)) + } + if snap[0].RatePerSec != 0 { + t.Fatalf("expected zero rate, got %v", snap[0].RatePerSec) + } +} + +func newPair(traceID types.TraceId, duration uint64, bytes uint64, ret int64) *event.Pair { + return &event.Pair{ + EnterEv: &types.RetEvent{TraceId: traceID}, + ExitEv: &types.RetEvent{TraceId: traceID, Ret: ret}, + Duration: duration, + Bytes: bytes, + } +} + +func assertNearPercentile(t *testing.T, name string, got uint64, want uint64, tolerance uint64) { + t.Helper() + + delta := absDiff(got, want) + if delta > tolerance { + t.Fatalf("%s too far from expected: got %d want %d (delta %d > %d)", name, got, want, delta, tolerance) + } +} + +func absDiff(a, b uint64) uint64 { + if a > b { + return a - b + } + return b - a +} |
