summaryrefslogtreecommitdiff
path: root/internal/eventloop_comm.go
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2026-05-13 10:01:40 +0300
committerPaul Buetow <paul@buetow.org>2026-05-13 10:01:40 +0300
commit42d7821dc8d81781f2c3cfc269e99c0ee1dbd017 (patch)
treeb7b07bec653ff26bf70e866370f96b8d63a636e1 /internal/eventloop_comm.go
parentf7ebc44d8b770132904b64996eac50e26945bc94 (diff)
fix: add 1s context timeout to commResolver procfs reads to prevent indefinite blocking
Frozen cgroup entries in /proc could stall a lookup worker goroutine forever, preventing clean shutdown because shutdown() waits on workersWG.Wait(). Changed resolveFn signature to accept context.Context and wrap each call in context.WithTimeout(1s) in both lookupWorker and seedTrackedPidComm. Added TestCommResolverLookupWorkerRespectsTimeout to verify the pending entry is cleared after a timeout. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Diffstat (limited to 'internal/eventloop_comm.go')
-rw-r--r--internal/eventloop_comm.go32
1 files changed, 28 insertions, 4 deletions
diff --git a/internal/eventloop_comm.go b/internal/eventloop_comm.go
index 4d49ef2..6e9ed0b 100644
--- a/internal/eventloop_comm.go
+++ b/internal/eventloop_comm.go
@@ -1,6 +1,7 @@
package internal
import (
+ "context"
"errors"
"fmt"
"os"
@@ -8,8 +9,13 @@ import (
"strconv"
"sync"
"syscall"
+ "time"
)
+// resolveCommTimeout caps each procfs read so a frozen cgroup cannot stall
+// a lookup worker indefinitely and block clean shutdown.
+const resolveCommTimeout = time.Second
+
type commResolver struct {
comms map[uint32]string
@@ -19,7 +25,7 @@ type commResolver struct {
lookupQueue chan uint32
lookupWorkers int
- resolveFn func(uint32) (string, error)
+ resolveFn func(context.Context, uint32) (string, error)
warningFn func(string)
startWorkersOnce sync.Once
workersWG sync.WaitGroup
@@ -46,7 +52,16 @@ func (r *commResolver) ensureLookupConfig() {
r.lookupQueue = make(chan uint32, defaultCommLookupQueueSize)
}
if r.resolveFn == nil {
- r.resolveFn = resolveCommFromProcWithError
+ // Default resolver wraps resolveCommFromProcWithError, which does not
+ // accept a context itself, so we honour cancellation by returning early
+ // when the context deadline is already exceeded before the call returns.
+ r.resolveFn = func(ctx context.Context, tid uint32) (string, error) {
+ comm, err := resolveCommFromProcWithError(tid)
+ if ctx.Err() != nil {
+ return "", ctx.Err()
+ }
+ return comm, err
+ }
}
}
@@ -69,7 +84,12 @@ func (r *commResolver) startLookupWorkers() {
func (r *commResolver) lookupWorker() {
defer r.workersWG.Done()
for tid := range r.lookupQueue {
- comm, err := r.resolveFn(tid)
+ // Each procfs read gets an independent timeout so that a frozen cgroup
+ // or a slow /proc entry cannot block a worker goroutine indefinitely
+ // and stall shutdown (which waits on workersWG).
+ ctx, cancel := context.WithTimeout(context.Background(), resolveCommTimeout)
+ comm, err := r.resolveFn(ctx, tid)
+ cancel()
r.mu.Lock()
delete(r.pending, tid)
if comm != "" {
@@ -95,7 +115,11 @@ func (r *commResolver) seedTrackedPidComm(pidFilter int) {
continue
}
seen[tid] = struct{}{}
- comm, err := r.resolveFn(tid)
+ // Use a short timeout here too; seeding happens at startup and a stall
+ // would delay the entire event loop initialisation.
+ ctx, cancel := context.WithTimeout(context.Background(), resolveCommTimeout)
+ comm, err := r.resolveFn(ctx, tid)
+ cancel()
if comm != "" {
r.setCached(tid, comm)
continue