diff options
| author | Paul Buetow <paul@buetow.org> | 2026-05-13 10:01:40 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-05-13 10:01:40 +0300 |
| commit | 42d7821dc8d81781f2c3cfc269e99c0ee1dbd017 (patch) | |
| tree | b7b07bec653ff26bf70e866370f96b8d63a636e1 /internal/eventloop_comm.go | |
| parent | f7ebc44d8b770132904b64996eac50e26945bc94 (diff) | |
fix: add 1s context timeout to commResolver procfs reads to prevent indefinite blocking
Frozen cgroup entries in /proc could stall a lookup worker goroutine
forever, preventing clean shutdown because shutdown() waits on
workersWG.Wait(). Changed resolveFn signature to accept context.Context
and wrap each call in context.WithTimeout(1s) in both lookupWorker and
seedTrackedPidComm. Added TestCommResolverLookupWorkerRespectsTimeout
to verify the pending entry is cleared after a timeout.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Diffstat (limited to 'internal/eventloop_comm.go')
| -rw-r--r-- | internal/eventloop_comm.go | 32 |
1 files changed, 28 insertions, 4 deletions
diff --git a/internal/eventloop_comm.go b/internal/eventloop_comm.go index 4d49ef2..6e9ed0b 100644 --- a/internal/eventloop_comm.go +++ b/internal/eventloop_comm.go @@ -1,6 +1,7 @@ package internal import ( + "context" "errors" "fmt" "os" @@ -8,8 +9,13 @@ import ( "strconv" "sync" "syscall" + "time" ) +// resolveCommTimeout caps each procfs read so a frozen cgroup cannot stall +// a lookup worker indefinitely and block clean shutdown. +const resolveCommTimeout = time.Second + type commResolver struct { comms map[uint32]string @@ -19,7 +25,7 @@ type commResolver struct { lookupQueue chan uint32 lookupWorkers int - resolveFn func(uint32) (string, error) + resolveFn func(context.Context, uint32) (string, error) warningFn func(string) startWorkersOnce sync.Once workersWG sync.WaitGroup @@ -46,7 +52,16 @@ func (r *commResolver) ensureLookupConfig() { r.lookupQueue = make(chan uint32, defaultCommLookupQueueSize) } if r.resolveFn == nil { - r.resolveFn = resolveCommFromProcWithError + // Default resolver wraps resolveCommFromProcWithError, which does not + // accept a context itself, so we honour cancellation by returning early + // when the context deadline is already exceeded before the call returns. + r.resolveFn = func(ctx context.Context, tid uint32) (string, error) { + comm, err := resolveCommFromProcWithError(tid) + if ctx.Err() != nil { + return "", ctx.Err() + } + return comm, err + } } } @@ -69,7 +84,12 @@ func (r *commResolver) startLookupWorkers() { func (r *commResolver) lookupWorker() { defer r.workersWG.Done() for tid := range r.lookupQueue { - comm, err := r.resolveFn(tid) + // Each procfs read gets an independent timeout so that a frozen cgroup + // or a slow /proc entry cannot block a worker goroutine indefinitely + // and stall shutdown (which waits on workersWG). + ctx, cancel := context.WithTimeout(context.Background(), resolveCommTimeout) + comm, err := r.resolveFn(ctx, tid) + cancel() r.mu.Lock() delete(r.pending, tid) if comm != "" { @@ -95,7 +115,11 @@ func (r *commResolver) seedTrackedPidComm(pidFilter int) { continue } seen[tid] = struct{}{} - comm, err := r.resolveFn(tid) + // Use a short timeout here too; seeding happens at startup and a stall + // would delay the entire event loop initialisation. + ctx, cancel := context.WithTimeout(context.Background(), resolveCommTimeout) + comm, err := r.resolveFn(ctx, tid) + cancel() if comm != "" { r.setCached(tid, comm) continue |
