summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2026-06-09 22:18:42 +0300
committerPaul Buetow <paul@buetow.org>2026-06-09 22:18:42 +0300
commit7031211501884555139351bb676fc0592c9df14c (patch)
treedc5bf201bacb706c355f477413ad375a6d8fbe5b
parentbab929022f4f4bba77439c63d130c833595758b6 (diff)
feat(parquet): surface epoll_ctl op/target-fd/events metadata
epoll_ctl's BPF handler already decodes the operation (args[1]), target descriptor (args[2]), and requested event mask (args[3]->events) into an EpollCtlEvent, but the single resolved-epfd `fd` column was the only epoll detail reaching the output schema. Consumers could not see which descriptor was registered nor the operation performed. Surface the metadata as three additive, backward-compatible columns, mirroring the existing dedicated optional-column convention used by requested_sleep_ns and address_space_bytes: - epoll_op (String): ADD/MOD/DEL, or the raw decimal for unknown ops; empty for non-epoll_ctl rows. - epoll_target_fd (Int32): registered descriptor (args[2]); 0 otherwise. - epoll_events (UInt32): requested event mask; 0 otherwise. Data flows EpollCtlEvent -> event.Pair (new EpollCtl/HasEpoll fields, populated in handleEpollCtlExit) -> streamrow.Row -> parquet.Record. The op-to-string mapping lives on event.EpollCtl.OpName. Docs (docs/parquet-querying.md) and the Magefile parquetValidate column list updated in lockstep (also adding the previously-undocumented address_space_bytes/requested_sleep_ns columns). The polling parquet integration test now asserts epoll_ctl rows carry a decoded op and a valid target fd, and that other syscalls leave epoll_op empty. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
-rw-r--r--Magefile.go7
-rw-r--r--docs/parquet-querying.md24
-rw-r--r--integrationtests/polling_test.go19
-rw-r--r--internal/event/pair.go38
-rw-r--r--internal/eventloop_exit.go9
-rw-r--r--internal/parquet/schema.go10
-rw-r--r--internal/streamrow/row.go16
7 files changed, 114 insertions, 9 deletions
diff --git a/Magefile.go b/Magefile.go
index 2a996eb..0396a9e 100644
--- a/Magefile.go
+++ b/Magefile.go
@@ -1164,11 +1164,14 @@ func runClickHouseQuery(dir, file, sql string) (string, error) {
return strings.TrimSpace(out), nil
}
-// expectedParquetColumns lists the 15 column names that the parquet schema must contain.
+// expectedParquetColumns lists the column names that the parquet schema must
+// contain. Keep in lockstep with parquet.Record (internal/parquet/schema.go).
var expectedParquetColumns = []string{
"seq", "time_ns", "gap_ns", "latency_ns", "comm",
"pid", "tid", "syscall", "family", "fd", "ret",
- "bytes", "file", "is_error", "filter_epoch",
+ "bytes", "address_space_bytes", "requested_sleep_ns",
+ "file", "is_error", "filter_epoch",
+ "epoll_op", "epoll_target_fd", "epoll_events",
}
// parquetSchemaCheck verifies that all expectedParquetColumns appear in the
diff --git a/docs/parquet-querying.md b/docs/parquet-querying.md
index 4c31474..2ebf16e 100644
--- a/docs/parquet-querying.md
+++ b/docs/parquet-querying.md
@@ -30,9 +30,14 @@ state, no installation needed beyond Docker.
| `fd` | Int32 | File descriptor |
| `ret` | Int64 | Return value (negative = errno) |
| `bytes` | UInt64 | Bytes transferred (0 if not applicable) |
+| `address_space_bytes` | UInt64 | Memory-region extent for memory syscalls (e.g. `munmap`/`mremap`); 0 otherwise |
+| `requested_sleep_ns` | Int64 | Requested sleep duration for nanosleep-style syscalls; 0 otherwise |
| `file` | String | File path (empty if not resolved) |
| `is_error` | Bool | True when `ret` is a negative errno |
| `filter_epoch` | UInt64 | Filter generation at capture time |
+| `epoll_op` | String | `epoll_ctl` operation (`ADD`/`MOD`/`DEL`, or the raw decimal value for an unrecognized op); empty for other syscalls |
+| `epoll_target_fd` | Int32 | `epoll_ctl` target descriptor being registered, modified, or removed (args[2]); 0 for other syscalls |
+| `epoll_events` | UInt32 | `epoll_ctl` requested event mask (args[3]->events); 0 for other syscalls |
---
@@ -78,12 +83,17 @@ pid UInt32
tid UInt32
syscall String
family String
-fd Int32
-ret Int64
-bytes UInt64
-file String
-is_error Bool
-filter_epoch UInt64
+fd Int32
+ret Int64
+bytes UInt64
+address_space_bytes UInt64
+requested_sleep_ns Int64
+file String
+is_error Bool
+filter_epoch UInt64
+epoll_op String
+epoll_target_fd Int32
+epoll_events UInt32
```
### Row count
@@ -220,6 +230,6 @@ PARQUET_FILE=ior-recording-20260313-170234.parquet env GOTOOLCHAIN=auto mage par
```
It checks:
-1. All 14 expected columns are present
+1. All 20 expected columns are present
2. Row count > 0
3. `seq` is monotonically ordered and `time_ns` is non-zero
diff --git a/integrationtests/polling_test.go b/integrationtests/polling_test.go
index d6b520c..c2f42d0 100644
--- a/integrationtests/polling_test.go
+++ b/integrationtests/polling_test.go
@@ -60,8 +60,19 @@ func TestPollingEpollReadyCountInParquet(t *testing.T) {
}
var sawPwait2 bool
var sawPwait2ReadyCount bool
+ var sawEpollCtlOp bool
for _, row := range rows {
switch row.Syscall {
+ case "epoll_ctl":
+ // The workload registers descriptors via epoll_ctl; at least one
+ // successful row must surface a decoded op, and each such row's target
+ // fd (reported separately from the epfd column) must be non-negative.
+ if row.EpollOp != "" && row.Ret == 0 {
+ sawEpollCtlOp = true
+ if row.EpollTargetFD < 0 {
+ t.Fatalf("epoll_ctl row has op %q but target fd %d < 0", row.EpollOp, row.EpollTargetFD)
+ }
+ }
case "epoll_wait", "epoll_pwait", "poll", "ppoll", "select", "pselect6":
if row.Ret > 0 {
wantReadyCount[row.Syscall] = true
@@ -77,9 +88,17 @@ func TestPollingEpollReadyCountInParquet(t *testing.T) {
if row.Bytes != 0 {
t.Fatalf("%s bytes = %d, want 0 for ready-count events", row.Syscall, row.Bytes)
}
+ // epoll_ctl metadata must stay empty for non-epoll_ctl syscalls.
+ if row.EpollOp != "" {
+ t.Fatalf("%s row has unexpected epoll_op %q", row.Syscall, row.EpollOp)
+ }
}
}
+ if !sawEpollCtlOp {
+ t.Fatalf("expected at least one successful epoll_ctl row with decoded op/target-fd in parquet output")
+ }
+
for syscall, ok := range wantReadyCount {
if !ok {
t.Fatalf("expected %s row with positive ready-count ret in parquet output", syscall)
diff --git a/internal/event/pair.go b/internal/event/pair.go
index 523f961..afc9bed 100644
--- a/internal/event/pair.go
+++ b/internal/event/pair.go
@@ -31,6 +31,44 @@ type Pair struct {
AddressSpaceBytes uint64
// RequestedSleepNs tracks requested sleep duration for nanosleep-style syscalls.
RequestedSleepNs int64
+ // Epoll carries epoll_ctl control metadata (op, target fd, requested event
+ // mask). It is only populated for epoll_ctl pairs; HasEpoll reports whether
+ // it is set. The Pair-level File still resolves to the epoll instance (epfd);
+ // Epoll.TargetFD is the descriptor being registered/modified/removed.
+ Epoll EpollCtl
+ HasEpoll bool
+}
+
+// EpollCtl holds the decoded epoll_ctl arguments surfaced from the BPF
+// EpollCtlEvent: the operation (EPOLL_CTL_ADD/MOD/DEL), the target fd
+// (args[2]), and the requested epoll event mask (args[3]->events).
+type EpollCtl struct {
+ Op int32
+ TargetFD int32
+ Events uint32
+}
+
+// Linux epoll_ctl op values from <sys/epoll.h>.
+const (
+ epollCtlAdd = 1
+ epollCtlDel = 2
+ epollCtlMod = 3
+)
+
+// OpName renders the epoll_ctl operation as a human-readable token
+// (ADD/DEL/MOD). Unknown values fall back to their decimal form so the
+// raw op is never lost.
+func (c EpollCtl) OpName() string {
+ switch c.Op {
+ case epollCtlAdd:
+ return "ADD"
+ case epollCtlDel:
+ return "DEL"
+ case epollCtlMod:
+ return "MOD"
+ default:
+ return strconv.FormatInt(int64(c.Op), 10)
+ }
}
func NewPair(enterEv Event) *Pair {
diff --git a/internal/eventloop_exit.go b/internal/eventloop_exit.go
index ae085c6..105d9ac 100644
--- a/internal/eventloop_exit.go
+++ b/internal/eventloop_exit.go
@@ -385,7 +385,16 @@ func (e *eventLoop) handleEventfdExit(ep *event.Pair, eventfdEv *types.EventfdEv
}
func (e *eventLoop) handleEpollCtlExit(ep *event.Pair, epollCtlEv *types.EpollCtlEvent) bool {
+ // File resolves to the epoll instance (epfd); the decoded op/target-fd/events
+ // are surfaced separately via ep.Epoll so consumers can see which descriptor
+ // was registered and the operation performed.
ep.File = e.fdState().resolve(epollCtlEv.Epfd, epollCtlEv.Pid)
+ ep.Epoll = event.EpollCtl{
+ Op: epollCtlEv.Op,
+ TargetFD: epollCtlEv.Fd,
+ Events: epollCtlEv.Events,
+ }
+ ep.HasEpoll = true
return e.finishPairForTid(ep, epollCtlEv.GetTid())
}
diff --git a/internal/parquet/schema.go b/internal/parquet/schema.go
index f39361c..b7ed381 100644
--- a/internal/parquet/schema.go
+++ b/internal/parquet/schema.go
@@ -30,6 +30,13 @@ type Record struct {
File string `parquet:"file"`
IsError bool `parquet:"is_error"`
FilterEpoch uint64 `parquet:"filter_epoch"`
+ // EpollOp/EpollTargetFD/EpollEvents surface epoll_ctl control metadata: the
+ // operation (ADD/MOD/DEL, or the raw decimal for an unknown op), the target
+ // descriptor (args[2]), and the requested event mask (args[3]->events).
+ // EpollOp is empty and the numeric fields are zero for all non-epoll_ctl rows.
+ EpollOp string `parquet:"epoll_op"`
+ EpollTargetFD int32 `parquet:"epoll_target_fd"`
+ EpollEvents uint32 `parquet:"epoll_events"`
}
// FileMetadata captures constant metadata written once into the parquet file.
@@ -74,6 +81,9 @@ func RecordFromStream(row streamrow.Row, filterEpoch uint64) Record {
File: row.FileName,
IsError: row.IsError,
FilterEpoch: filterEpoch,
+ EpollOp: row.EpollOp,
+ EpollTargetFD: row.EpollTargetFD,
+ EpollEvents: row.EpollEvents,
}
}
diff --git a/internal/streamrow/row.go b/internal/streamrow/row.go
index a6ccdf7..c846346 100644
--- a/internal/streamrow/row.go
+++ b/internal/streamrow/row.go
@@ -30,6 +30,13 @@ type Row struct {
RetVal int64
IsError bool
FD int32
+ // EpollOp is the epoll_ctl operation as a readable token (ADD/MOD/DEL, or
+ // the raw decimal for an unknown op), empty for non-epoll_ctl rows.
+ // EpollTargetFD and EpollEvents hold the target descriptor (args[2]) and
+ // requested event mask (args[3]->events) for epoll_ctl rows; both are zero
+ // when EpollOp is empty.
+ EpollOp string
+ EpollTargetFD int32
+ EpollEvents uint32
}
func (r Row) SyscallValue() string {
@@ -122,6 +129,15 @@ func New(seq uint64, pair *event.Pair) Row {
row.FD = fd
}
+ // Surface epoll_ctl control metadata when present. The Pair's FD/File still
+ // reflect the epoll instance (epfd); these fields expose the target fd and
+ // operation so consumers can see which descriptor was registered.
+ if pair.HasEpoll {
+ row.EpollOp = pair.Epoll.OpName()
+ row.EpollTargetFD = pair.Epoll.TargetFD
+ row.EpollEvents = pair.Epoll.Events
+ }
+
if retEv, ok := pair.ExitEv.(*types.RetEvent); ok {
row.RetVal = retEv.Ret
row.IsError = retEv.Ret < 0