diff options
Diffstat (limited to 'internal')
| -rw-r--r-- | internal/c/generated_tracepoints.c | 10 | ||||
| -rw-r--r-- | internal/c/generated_tracepoints_result.txt | 2 | ||||
| -rw-r--r-- | internal/eventloop_exit.go | 42 | ||||
| -rw-r--r-- | internal/eventloop_state.go | 27 | ||||
| -rw-r--r-- | internal/eventloop_test.go | 128 | ||||
| -rw-r--r-- | internal/generate/classify.go | 11 | ||||
| -rw-r--r-- | internal/generate/classify_test.go | 1 | ||||
| -rw-r--r-- | internal/tracepoints/generated_tracepoints.go | 2 |
8 files changed, 180 insertions, 43 deletions
diff --git a/internal/c/generated_tracepoints.c b/internal/c/generated_tracepoints.c index 8dfd53f..48c1f84 100644 --- a/internal/c/generated_tracepoints.c +++ b/internal/c/generated_tracepoints.c @@ -7549,7 +7549,7 @@ int handle_sys_exit_sysfs(struct syscall_trace_exit *ctx) { return 0; } -/// sys_enter_close_range is a struct fd_event (kind=fd) +/// sys_enter_close_range is a struct two_fd_event (kind=two-fd) SEC("tracepoint/syscalls/sys_enter_close_range") int handle_sys_enter_close_range(struct syscall_trace_enter *ctx) { __u32 pid, tid; @@ -7559,16 +7559,18 @@ int handle_sys_enter_close_range(struct syscall_trace_enter *ctx) { if (!ior_on_syscall_enter(tid, SYS_ENTER_CLOSE_RANGE)) return 0; - struct fd_event *ev = bpf_ringbuf_reserve(&event_map, sizeof(struct fd_event), 0); + struct two_fd_event *ev = bpf_ringbuf_reserve(&event_map, sizeof(struct two_fd_event), 0); if (!ev) return 0; - ev->event_type = ENTER_FD_EVENT; + ev->event_type = ENTER_TWO_FD_EVENT; ev->trace_id = SYS_ENTER_CLOSE_RANGE; ev->pid = pid; ev->tid = tid; ev->time = bpf_ktime_get_boot_ns(); - ev->fd = (__s32)ctx->args[0]; + ev->fd_a = (__s32)ctx->args[0]; + ev->fd_b = (__s32)ctx->args[1]; + ev->extra = (__u64)ctx->args[2]; bpf_ringbuf_submit(ev, 0); return 0; diff --git a/internal/c/generated_tracepoints_result.txt b/internal/c/generated_tracepoints_result.txt index 4d44867..7918006 100644 --- a/internal/c/generated_tracepoints_result.txt +++ b/internal/c/generated_tracepoints_result.txt @@ -24,7 +24,7 @@ sys_enter_clock_settime is a struct null_event (kind=null) sys_enter_clone is a struct null_event (kind=proc) sys_enter_clone3 is a struct null_event (kind=proc) sys_enter_close is a struct fd_event (kind=fd) -sys_enter_close_range is a struct fd_event (kind=fd) +sys_enter_close_range is a struct two_fd_event (kind=two-fd) sys_enter_connect is a struct fd_event (kind=fd) sys_enter_copy_file_range is a struct fd_event (kind=fd) sys_enter_creat is a struct path_event (kind=pathname) diff --git a/internal/eventloop_exit.go b/internal/eventloop_exit.go index cb11074..a5b38d4 100644 --- a/internal/eventloop_exit.go +++ b/internal/eventloop_exit.go @@ -118,8 +118,10 @@ func (e *eventLoop) handlePathExit(ep *event.Pair, pathEv *types.PathEvent) bool } // handleFdExit processes exit events for fd-based syscalls. It resolves the fd -// to a file, applies close/close_range state transitions, filters the pair, and -// handles dup/pidfd_getfd fd-transfer operations before finalising bytes. +// to a file, applies the close state transition, filters the pair, and handles +// dup/pidfd_getfd fd-transfer operations before finalising bytes. close_range is +// not handled here: it carries (first, last, flags) and is routed through +// handleTwoFdExit so the upper bound and flags are honoured. func (e *eventLoop) handleFdExit(ep *event.Pair, fdEv *types.FdEvent) bool { fd := fdEv.Fd ep.File = e.fdState().resolve(fd, fdEv.Pid) @@ -134,21 +136,11 @@ func (e *eventLoop) handleFdExit(ep *event.Pair, fdEv *types.FdEvent) bool { return true } -// applyFdCloseState updates fd-tracking state for close and close_range syscalls. +// applyFdCloseState updates fd-tracking state for the close syscall. func (e *eventLoop) applyFdCloseState(ep *event.Pair, fd int32, pid uint32) { if ep.Is(types.SYS_ENTER_CLOSE) { e.fdState().delete(fd) e.fdState().deleteProcFdCache(fd, pid) - return - } - if ep.Is(types.SYS_ENTER_CLOSE_RANGE) { - // close_range provides (first, last), but fd_event only carries the first - // argument, so we approximate by closing all tracked fds >= first. - retEv, ok := ep.ExitEv.(*types.RetEvent) - if ok && retEv.Ret == 0 { - e.fdState().closeRangeFrom(fd) - e.fdState().deleteProcFdCacheFrom(fd, pid) - } } } @@ -385,9 +377,33 @@ func (e *eventLoop) handlePollExit(ep *event.Pair, pollEv *types.PollEvent) bool func (e *eventLoop) handleTwoFdExit(ep *event.Pair, twoFdEv *types.TwoFdEvent) bool { ep.File = e.fdState().resolve(twoFdEv.FdA, twoFdEv.Pid) + if ep.Is(types.SYS_ENTER_CLOSE_RANGE) { + e.applyCloseRangeState(ep, twoFdEv) + } return e.finishPairForTid(ep, twoFdEv.GetTid()) } +// closeRangeCloexec mirrors CLOSE_RANGE_CLOEXEC from <linux/close_range.h>: when +// set, close_range only marks the descriptors close-on-exec instead of closing +// them, so the fds stay open and must remain tracked. +const closeRangeCloexec = 1 << 2 + +// applyCloseRangeState evicts the fds closed by a successful close_range. The +// enter event carries (first, last, flags) in fd_a/fd_b/extra. fd_b is an __s32 +// view of the unsigned "last" argument, so a negative value (e.g. ~0U meaning +// "close everything from first up") is treated as having no upper bound. +func (e *eventLoop) applyCloseRangeState(ep *event.Pair, ev *types.TwoFdEvent) { + retEv, ok := ep.ExitEv.(*types.RetEvent) + if !ok || retEv.Ret != 0 { + return + } + if ev.Extra&closeRangeCloexec != 0 { + return + } + e.fdState().closeRange(ev.FdA, ev.FdB) + e.fdState().deleteProcFdCacheRange(ev.FdA, ev.FdB, ev.Pid) +} + func (e *eventLoop) handleMemExit(ep *event.Pair, memEv *types.MemEvent) bool { return e.finishPairForTid(ep, memEv.GetTid()) } diff --git a/internal/eventloop_state.go b/internal/eventloop_state.go index a277e31..40e11c2 100644 --- a/internal/eventloop_state.go +++ b/internal/eventloop_state.go @@ -58,11 +58,19 @@ func (t *fdTracker) delete(fd int32) { delete(t.files, fd) } -func (t *fdTracker) closeRangeFrom(first int32) { +// closeRange removes all tracked fds in the inclusive range [first, last], as +// closed by close_range(2). A negative last means "no upper bound": close_range's +// last argument is an unsigned int, so the common close-everything form ~0U +// arrives here as a negative __s32 and must close every tracked fd >= first. +func (t *fdTracker) closeRange(first, last int32) { for fd := range t.files { - if fd >= first { - delete(t.files, fd) + if fd < first { + continue } + if last >= 0 && fd > last { + continue + } + delete(t.files, fd) } } @@ -113,16 +121,23 @@ func (t *fdTracker) deleteProcFdCache(fd int32, pid uint32) { t.deleteCacheKey(procFdCacheKey(pid, fd)) } -func (t *fdTracker) deleteProcFdCacheFrom(first int32, pid uint32) { +// deleteProcFdCacheRange drops cached procfs resolutions for pid's fds in the +// inclusive range [first, last]. A negative last means "no upper bound" (see +// closeRange for why close_range's last argument can arrive negative). +func (t *fdTracker) deleteProcFdCacheRange(first, last int32, pid uint32) { if t.procFdCache == nil { return } for key := range t.procFdCache { cachePid := uint32(key >> 32) cacheFd := int32(uint32(key)) - if cachePid == pid && cacheFd >= first { - t.deleteCacheKey(key) + if cachePid != pid || cacheFd < first { + continue + } + if last >= 0 && cacheFd > last { + continue } + t.deleteCacheKey(key) } } diff --git a/internal/eventloop_test.go b/internal/eventloop_test.go index 473a107..b768fcb 100644 --- a/internal/eventloop_test.go +++ b/internal/eventloop_test.go @@ -186,39 +186,114 @@ func TestHandleFdExitCloseClearsProcFdCache(t *testing.T) { verifyProcFdNotCached(t, el, pid, fd) } -func TestHandleFdExitCloseRangeClearsProcFdCacheRange(t *testing.T) { +func TestHandleTwoFdExitCloseRangeClearsProcFdCacheRange(t *testing.T) { el := mustNewEventLoop(t, eventLoopConfig{}) pid := uint32(2002) - el.fdState().setProcFdCache(10, pid, file.NewFd(10, "keep", syscall.O_RDONLY)) + el.fdState().setProcFdCache(10, pid, file.NewFd(10, "keep-below", syscall.O_RDONLY)) el.fdState().setProcFdCache(20, pid, file.NewFd(20, "drop", syscall.O_RDONLY)) el.fdState().setProcFdCache(30, pid, file.NewFd(30, "drop", syscall.O_RDONLY)) + el.fdState().setProcFdCache(40, pid, file.NewFd(40, "keep-above", syscall.O_RDONLY)) el.fdState().setProcFdCache(20, pid+1, file.NewFd(20, "other-pid", syscall.O_RDONLY)) - enter := &types.FdEvent{ - TraceId: types.SYS_ENTER_CLOSE_RANGE, - Pid: pid, - Tid: pid, - Fd: 20, + // close_range(20, 30, 0): only the inclusive [20,30] window for pid is evicted. + enter := &types.TwoFdEvent{ + EventType: types.ENTER_TWO_FD_EVENT, + TraceId: types.SYS_ENTER_CLOSE_RANGE, + Pid: pid, + Tid: pid, + FdA: 20, + FdB: 30, + Extra: 0, } exit := &types.RetEvent{ - TraceId: types.SYS_EXIT_CLOSE_RANGE, - Pid: pid, - Tid: pid, - Ret: 0, + EventType: types.EXIT_RET_EVENT, + TraceId: types.SYS_EXIT_CLOSE_RANGE, + Pid: pid, + Tid: pid, + Ret: 0, } ep := &event.Pair{EnterEv: enter, ExitEv: exit} - if ok := el.handleFdExit(ep, enter); !ok { - t.Fatal("handleFdExit(close_range) returned false") + if ok := el.handleTwoFdExit(ep, enter); !ok { + t.Fatal("handleTwoFdExit(close_range) returned false") } verifyProcFdCached(t, el, pid, 10) verifyProcFdNotCached(t, el, pid, 20) verifyProcFdNotCached(t, el, pid, 30) + verifyProcFdCached(t, el, pid, 40) verifyProcFdCached(t, el, pid+1, 20) } +func TestHandleTwoFdExitCloseRangeCloexecKeepsFds(t *testing.T) { + el := mustNewEventLoop(t, eventLoopConfig{}) + el.fdState().set(5, file.NewFd(5, "stays-open", syscall.O_RDONLY)) + el.fdState().set(6, file.NewFd(6, "stays-open", syscall.O_RDONLY)) + + // close_range(5, 6, CLOSE_RANGE_CLOEXEC): the kernel only marks the fds + // close-on-exec, so they remain open and must stay tracked. + enter := &types.TwoFdEvent{ + EventType: types.ENTER_TWO_FD_EVENT, + TraceId: types.SYS_ENTER_CLOSE_RANGE, + Pid: 3003, + Tid: 3003, + FdA: 5, + FdB: 6, + Extra: closeRangeCloexec, + } + exit := &types.RetEvent{ + EventType: types.EXIT_RET_EVENT, + TraceId: types.SYS_EXIT_CLOSE_RANGE, + Pid: 3003, + Tid: 3003, + Ret: 0, + } + ep := &event.Pair{EnterEv: enter, ExitEv: exit} + + if ok := el.handleTwoFdExit(ep, enter); !ok { + t.Fatal("handleTwoFdExit(close_range cloexec) returned false") + } + + verifyFileDescriptor(t, el, 5, "stays-open") + verifyFileDescriptor(t, el, 6, "stays-open") +} + +func TestHandleTwoFdExitCloseRangeUnboundedClosesAll(t *testing.T) { + el := mustNewEventLoop(t, eventLoopConfig{}) + el.fdState().set(2, file.NewFd(2, "keep-below", syscall.O_RDONLY)) + el.fdState().set(7, file.NewFd(7, "drop", syscall.O_RDONLY)) + el.fdState().set(900, file.NewFd(900, "drop-high", syscall.O_RDONLY)) + + // close_range(3, ~0U, 0): the unsigned UINT_MAX upper bound arrives as a + // negative __s32, meaning "close everything from fd 3 up". + enter := &types.TwoFdEvent{ + EventType: types.ENTER_TWO_FD_EVENT, + TraceId: types.SYS_ENTER_CLOSE_RANGE, + Pid: 4004, + Tid: 4004, + FdA: 3, + FdB: -1, + Extra: 0, + } + exit := &types.RetEvent{ + EventType: types.EXIT_RET_EVENT, + TraceId: types.SYS_EXIT_CLOSE_RANGE, + Pid: 4004, + Tid: 4004, + Ret: 0, + } + ep := &event.Pair{EnterEv: enter, ExitEv: exit} + + if ok := el.handleTwoFdExit(ep, enter); !ok { + t.Fatal("handleTwoFdExit(close_range unbounded) returned false") + } + + verifyFileDescriptor(t, el, 2, "keep-below") + verifyFdNotTracked(t, el, 7) + verifyFdNotTracked(t, el, 900) +} + func TestFreezePairForEmissionCopiesFdFile(t *testing.T) { el := mustNewEventLoop(t, eventLoopConfig{}) fdFile := file.NewFd(9, "/tmp/x", syscall.O_RDONLY) @@ -423,6 +498,27 @@ func makeExitFdEvent(t *testing.T, time uint64, pid, tid uint32, fd int32, trace return ev, bytes } +// makeEnterTwoFdEvent builds an enter two_fd_event and its wire bytes. For +// close_range the three fields carry (first, last, flags). +func makeEnterTwoFdEvent(t *testing.T, time uint64, pid, tid uint32, fdA, fdB int32, extra uint64, traceId types.TraceId) (types.TwoFdEvent, []byte) { + ev := types.TwoFdEvent{ + EventType: types.ENTER_TWO_FD_EVENT, + TraceId: traceId, + Time: time, + Pid: pid, + Tid: tid, + FdA: fdA, + FdB: fdB, + Extra: extra, + } + + bytes, err := ev.Bytes() + if err != nil { + t.Error(err) + } + return ev, bytes +} + // Helper function to create exit RetEvent func makeExitRetEvent(t *testing.T, time uint64, pid, tid uint32, traceId types.TraceId, ret int64) (types.RetEvent, []byte) { ev := types.RetEvent{ @@ -590,7 +686,8 @@ func makeCloseRangeEventTestData(t *testing.T) (td testData) { openExitBytes3, _ = openExitEv3.Bytes() td.rawTracepoints = append(td.rawTracepoints, openExitBytes3) - enterCloseRange, enterCloseRangeBytes := makeEnterFdEvent(t, defaulTime+600, defaultPid, defaultTid, fd2, types.SYS_ENTER_CLOSE_RANGE) + // close_range(fd2, fd3, 0): closes the inclusive window [fd2, fd3], leaving fd1 tracked. + enterCloseRange, enterCloseRangeBytes := makeEnterTwoFdEvent(t, defaulTime+600, defaultPid, defaultTid, fd2, fd3, 0, types.SYS_ENTER_CLOSE_RANGE) td.rawTracepoints = append(td.rawTracepoints, enterCloseRangeBytes) exitCloseRange, exitCloseRangeBytes := makeExitRetEvent(t, defaulTime+700, defaultPid, defaultTid, types.SYS_EXIT_CLOSE_RANGE, 0) @@ -671,7 +768,8 @@ func makeCloseRangeFailureTestData(t *testing.T) (td testData) { openExitBytes2, _ = openExitEv2.Bytes() td.rawTracepoints = append(td.rawTracepoints, openExitBytes2) - enterCloseRange, enterCloseRangeBytes := makeEnterFdEvent(t, defaulTime+400, defaultPid, defaultTid, fd1, types.SYS_ENTER_CLOSE_RANGE) + // close_range(fd1, fd2, 0) that fails (ret=-1): no fds should be evicted. + enterCloseRange, enterCloseRangeBytes := makeEnterTwoFdEvent(t, defaulTime+400, defaultPid, defaultTid, fd1, fd2, 0, types.SYS_ENTER_CLOSE_RANGE) td.rawTracepoints = append(td.rawTracepoints, enterCloseRangeBytes) exitCloseRange, exitCloseRangeBytes := makeExitRetEvent(t, defaulTime+500, defaultPid, defaultTid, types.SYS_EXIT_CLOSE_RANGE, -1) diff --git a/internal/generate/classify.go b/internal/generate/classify.go index 7dcbf5a..b7e9c0f 100644 --- a/internal/generate/classify.go +++ b/internal/generate/classify.go @@ -241,9 +241,14 @@ var nameOnlyKindsTable = map[string]TracepointKind{ "sys_enter_epoll_ctl": KindEpollCtl, "sys_enter_move_mount": KindTwoFd, - "sys_enter_statmount": KindNull, - "sys_enter_listmount": KindNull, - "sys_enter_listns": KindNull, + // close_range(first, last, flags) needs all three arguments, so it is a + // two_fd_event (fd_a=first, fd_b=last, extra=flags) rather than a single-fd + // fd_event. This lets the runtime honour the upper bound and the + // CLOSE_RANGE_CLOEXEC flag instead of closing every fd >= first. + "sys_enter_close_range": KindTwoFd, + "sys_enter_statmount": KindNull, + "sys_enter_listmount": KindNull, + "sys_enter_listns": KindNull, "sys_enter_poll": KindPoll, "sys_enter_ppoll": KindPoll, diff --git a/internal/generate/classify_test.go b/internal/generate/classify_test.go index 618274a..46947e2 100644 --- a/internal/generate/classify_test.go +++ b/internal/generate/classify_test.go @@ -1401,6 +1401,7 @@ func TestClassifySyscallPairAccepted(t *testing.T) { {"mount", FormatMount, FormatExitMount, KindPathname}, {"umount", FormatUmount, FormatExitUmount, KindPathname}, {"move_mount", FormatMoveMount, FormatExitMoveMount, KindTwoFd}, + {"close_range", syntheticEnter("close_range", 9322), syntheticExit("close_range", 9321), KindTwoFd}, {"kcmp", syntheticEnter("kcmp", 9324), syntheticExit("kcmp", 9323), KindTwoFd}, {"kexec_file_load", syntheticEnter("kexec_file_load", 9326), syntheticExit("kexec_file_load", 9325), KindFd}, {"membarrier", syntheticEnter("membarrier", 9328), syntheticExit("membarrier", 9327), KindNull}, diff --git a/internal/tracepoints/generated_tracepoints.go b/internal/tracepoints/generated_tracepoints.go index fd3ac75..5ff2f5c 100644 --- a/internal/tracepoints/generated_tracepoints.go +++ b/internal/tracepoints/generated_tracepoints.go @@ -1135,7 +1135,7 @@ var syscallKinds = map[string]string{ "clone": "proc", "clone3": "proc", "close": "fd", - "close_range": "fd", + "close_range": "two-fd", "connect": "fd", "copy_file_range": "fd", "creat": "pathname", |
