summaryrefslogtreecommitdiff
path: root/internal/statsengine
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2026-02-23 22:58:38 +0200
committerPaul Buetow <paul@buetow.org>2026-02-23 22:58:38 +0200
commit4b00bd30a8c1247f5dfee77277fdd4b438750bd0 (patch)
tree547dc3a71cecfdd0bf87a54f047b4096aeedb62b /internal/statsengine
parent44d0558be543e1e31a5178736181305d5d1ac2ba (diff)
task 305: add per-syscall accumulator with reservoir percentiles
Diffstat (limited to 'internal/statsengine')
-rw-r--r--internal/statsengine/syscall.go176
-rw-r--r--internal/statsengine/syscall_test.go132
2 files changed, 308 insertions, 0 deletions
diff --git a/internal/statsengine/syscall.go b/internal/statsengine/syscall.go
new file mode 100644
index 0000000..b3b8c4c
--- /dev/null
+++ b/internal/statsengine/syscall.go
@@ -0,0 +1,176 @@
+package statsengine
+
+import (
+ "ior/internal/event"
+ "ior/internal/types"
+ "math"
+ "math/rand"
+ "sort"
+ "time"
+)
+
+const syscallReservoirSampleCapDefault = 10_000
+
+type syscallAccumulator struct {
+ byID map[types.TraceId]*syscallStats
+ sampleCap int
+ rng *rand.Rand
+}
+
+type syscallStats struct {
+ traceID types.TraceId
+ name string
+
+ count uint64
+ errorCount uint64
+ totalBytes uint64
+ totalLatency uint64
+ minLatency uint64
+ maxLatency uint64
+
+ seenLatencies uint64
+ samples []uint64
+}
+
+func newSyscallAccumulator() *syscallAccumulator {
+ return newSyscallAccumulatorWithConfig(syscallReservoirSampleCapDefault, rand.New(rand.NewSource(time.Now().UnixNano())))
+}
+
+func newSyscallAccumulatorWithConfig(sampleCap int, rng *rand.Rand) *syscallAccumulator {
+ if sampleCap <= 0 {
+ sampleCap = syscallReservoirSampleCapDefault
+ }
+ if rng == nil {
+ rng = rand.New(rand.NewSource(time.Now().UnixNano()))
+ }
+
+ return &syscallAccumulator{
+ byID: make(map[types.TraceId]*syscallStats),
+ sampleCap: sampleCap,
+ rng: rng,
+ }
+}
+
+func (a *syscallAccumulator) Add(pair *event.Pair) {
+ if a == nil || pair == nil || pair.EnterEv == nil {
+ return
+ }
+
+ traceID := pair.EnterEv.GetTraceId()
+ stats := a.byID[traceID]
+ if stats == nil {
+ stats = &syscallStats{traceID: traceID, name: traceID.Name()}
+ a.byID[traceID] = stats
+ }
+
+ stats.count++
+ stats.totalBytes += pair.Bytes
+ stats.totalLatency += pair.Duration
+ stats.updateMinMax(pair.Duration)
+ stats.addSample(pair.Duration, a.sampleCap, a.rng)
+
+ if retEv, ok := pair.ExitEv.(*types.RetEvent); ok && retEv.Ret < 0 {
+ stats.errorCount++
+ }
+}
+
+func (a *syscallAccumulator) Snapshot(elapsed time.Duration) []SyscallSnapshot {
+ if a == nil {
+ return nil
+ }
+
+ rateDiv := elapsed.Seconds()
+ result := make([]SyscallSnapshot, 0, len(a.byID))
+ for _, stats := range a.byID {
+ result = append(result, stats.toSnapshot(rateDiv))
+ }
+
+ sort.Slice(result, func(i, j int) bool {
+ if result[i].Count != result[j].Count {
+ return result[i].Count > result[j].Count
+ }
+ return result[i].Name < result[j].Name
+ })
+
+ return result
+}
+
+func (s *syscallStats) updateMinMax(duration uint64) {
+ if s.count == 1 || duration < s.minLatency {
+ s.minLatency = duration
+ }
+ if duration > s.maxLatency {
+ s.maxLatency = duration
+ }
+}
+
+func (s *syscallStats) addSample(duration uint64, cap int, rng *rand.Rand) {
+ s.seenLatencies++
+ if len(s.samples) < cap {
+ s.samples = append(s.samples, duration)
+ return
+ }
+
+ idx := rng.Int63n(int64(s.seenLatencies))
+ if idx >= int64(cap) {
+ return
+ }
+ s.samples[idx] = duration
+}
+
+func (s *syscallStats) toSnapshot(rateDiv float64) SyscallSnapshot {
+ sortedSamples := append([]uint64(nil), s.samples...)
+ sort.Slice(sortedSamples, func(i, j int) bool {
+ return sortedSamples[i] < sortedSamples[j]
+ })
+
+ return SyscallSnapshot{
+ TraceID: s.traceID,
+ Name: s.name,
+ Count: s.count,
+ RatePerSec: safeRate(s.count, rateDiv),
+ Errors: s.errorCount,
+ Bytes: s.totalBytes,
+ LatencyMinNs: s.minLatency,
+ LatencyMaxNs: s.maxLatency,
+ LatencyMeanNs: float64(s.totalLatency) / float64(maxU64(s.count, 1)),
+ LatencyP50Ns: samplePercentile(sortedSamples, 0.50),
+ LatencyP95Ns: samplePercentile(sortedSamples, 0.95),
+ LatencyP99Ns: samplePercentile(sortedSamples, 0.99),
+ }
+}
+
+func samplePercentile(sorted []uint64, p float64) uint64 {
+ if len(sorted) == 0 {
+ return 0
+ }
+ if p <= 0 {
+ return sorted[0]
+ }
+ if p >= 1 {
+ return sorted[len(sorted)-1]
+ }
+
+ rank := int(math.Ceil(p*float64(len(sorted)))) - 1
+ if rank < 0 {
+ rank = 0
+ }
+ if rank >= len(sorted) {
+ rank = len(sorted) - 1
+ }
+ return sorted[rank]
+}
+
+func safeRate(count uint64, elapsedSeconds float64) float64 {
+ if elapsedSeconds <= 0 {
+ return 0
+ }
+ return float64(count) / elapsedSeconds
+}
+
+func maxU64(a, b uint64) uint64 {
+ if a > b {
+ return a
+ }
+ return b
+}
diff --git a/internal/statsengine/syscall_test.go b/internal/statsengine/syscall_test.go
new file mode 100644
index 0000000..ad29026
--- /dev/null
+++ b/internal/statsengine/syscall_test.go
@@ -0,0 +1,132 @@
+package statsengine
+
+import (
+ "ior/internal/event"
+ "ior/internal/types"
+ "math"
+ "math/rand"
+ "testing"
+ "time"
+)
+
+func TestSyscallAccumulatorBasicStats(t *testing.T) {
+ acc := newSyscallAccumulatorWithConfig(10_000, rand.New(rand.NewSource(1)))
+ traceID := types.SYS_ENTER_READ
+
+ acc.Add(newPair(traceID, 10, 100, 0))
+ acc.Add(newPair(traceID, 20, 50, -1))
+ acc.Add(newPair(traceID, 30, 25, 5))
+
+ snap := acc.Snapshot(2 * time.Second)
+ if len(snap) != 1 {
+ t.Fatalf("expected 1 syscall snapshot, got %d", len(snap))
+ }
+ got := snap[0]
+
+ if got.TraceID != traceID {
+ t.Fatalf("wrong trace id: got %v want %v", got.TraceID, traceID)
+ }
+ if got.Name != traceID.Name() {
+ t.Fatalf("wrong name: got %q want %q", got.Name, traceID.Name())
+ }
+ if got.Count != 3 {
+ t.Fatalf("wrong count: got %d want 3", got.Count)
+ }
+ if got.Errors != 1 {
+ t.Fatalf("wrong errors: got %d want 1", got.Errors)
+ }
+ if got.Bytes != 175 {
+ t.Fatalf("wrong bytes: got %d want 175", got.Bytes)
+ }
+ if got.LatencyMinNs != 10 || got.LatencyMaxNs != 30 {
+ t.Fatalf("wrong min/max: got (%d,%d) want (10,30)", got.LatencyMinNs, got.LatencyMaxNs)
+ }
+ if got.LatencyMeanNs != 20 {
+ t.Fatalf("wrong mean: got %v want 20", got.LatencyMeanNs)
+ }
+ if got.LatencyP50Ns != 20 || got.LatencyP95Ns != 30 || got.LatencyP99Ns != 30 {
+ t.Fatalf("wrong percentiles: got p50=%d p95=%d p99=%d", got.LatencyP50Ns, got.LatencyP95Ns, got.LatencyP99Ns)
+ }
+ if math.Abs(got.RatePerSec-1.5) > 1e-9 {
+ t.Fatalf("wrong rate: got %v want 1.5", got.RatePerSec)
+ }
+}
+
+func TestSyscallAccumulatorSortsByCountThenName(t *testing.T) {
+ acc := newSyscallAccumulatorWithConfig(10_000, rand.New(rand.NewSource(2)))
+
+ idA := types.SYS_ENTER_OPENAT
+ idB := types.SYS_ENTER_READ
+
+ acc.Add(newPair(idA, 10, 0, 0))
+ acc.Add(newPair(idA, 11, 0, 0))
+ acc.Add(newPair(idB, 12, 0, 0))
+
+ snap := acc.Snapshot(1 * time.Second)
+ if len(snap) != 2 {
+ t.Fatalf("expected 2 syscall snapshots, got %d", len(snap))
+ }
+ if snap[0].TraceID != idA {
+ t.Fatalf("expected first id %v, got %v", idA, snap[0].TraceID)
+ }
+ if snap[1].TraceID != idB {
+ t.Fatalf("expected second id %v, got %v", idB, snap[1].TraceID)
+ }
+}
+
+func TestSyscallAccumulatorReservoirPercentilesAccuracy(t *testing.T) {
+ acc := newSyscallAccumulatorWithConfig(100, rand.New(rand.NewSource(7)))
+ traceID := types.SYS_ENTER_WRITE
+
+ for d := uint64(1); d <= 10_000; d++ {
+ acc.Add(newPair(traceID, d, 0, 0))
+ }
+
+ snap := acc.Snapshot(1 * time.Second)
+ if len(snap) != 1 {
+ t.Fatalf("expected 1 syscall snapshot, got %d", len(snap))
+ }
+ got := snap[0]
+
+ assertNearPercentile(t, "p50", got.LatencyP50Ns, 5_000, 1_500)
+ assertNearPercentile(t, "p95", got.LatencyP95Ns, 9_500, 800)
+ assertNearPercentile(t, "p99", got.LatencyP99Ns, 9_900, 500)
+}
+
+func TestSyscallAccumulatorZeroElapsedRate(t *testing.T) {
+ acc := newSyscallAccumulatorWithConfig(32, rand.New(rand.NewSource(9)))
+ acc.Add(newPair(types.SYS_ENTER_READ, 9, 0, 0))
+
+ snap := acc.Snapshot(0)
+ if len(snap) != 1 {
+ t.Fatalf("expected 1 syscall snapshot, got %d", len(snap))
+ }
+ if snap[0].RatePerSec != 0 {
+ t.Fatalf("expected zero rate, got %v", snap[0].RatePerSec)
+ }
+}
+
+func newPair(traceID types.TraceId, duration uint64, bytes uint64, ret int64) *event.Pair {
+ return &event.Pair{
+ EnterEv: &types.RetEvent{TraceId: traceID},
+ ExitEv: &types.RetEvent{TraceId: traceID, Ret: ret},
+ Duration: duration,
+ Bytes: bytes,
+ }
+}
+
+func assertNearPercentile(t *testing.T, name string, got uint64, want uint64, tolerance uint64) {
+ t.Helper()
+
+ delta := absDiff(got, want)
+ if delta > tolerance {
+ t.Fatalf("%s too far from expected: got %d want %d (delta %d > %d)", name, got, want, delta, tolerance)
+ }
+}
+
+func absDiff(a, b uint64) uint64 {
+ if a > b {
+ return a - b
+ }
+ return b - a
+}