diff options
| author | Paul Buetow <paul@buetow.org> | 2026-05-13 10:01:40 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-05-13 10:01:40 +0300 |
| commit | 42d7821dc8d81781f2c3cfc269e99c0ee1dbd017 (patch) | |
| tree | b7b07bec653ff26bf70e866370f96b8d63a636e1 /internal/eventloop_commresolver_test.go | |
| parent | f7ebc44d8b770132904b64996eac50e26945bc94 (diff) | |
fix: add 1s context timeout to commResolver procfs reads to prevent indefinite blocking
Frozen cgroup entries in /proc could stall a lookup worker goroutine
forever, preventing clean shutdown because shutdown() waits on
workersWG.Wait(). Changed resolveFn signature to accept context.Context
and wrap each call in context.WithTimeout(1s) in both lookupWorker and
seedTrackedPidComm. Added TestCommResolverLookupWorkerRespectsTimeout
to verify the pending entry is cleared after a timeout.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Diffstat (limited to 'internal/eventloop_commresolver_test.go')
| -rw-r--r-- | internal/eventloop_commresolver_test.go | 48 |
1 files changed, 44 insertions, 4 deletions
diff --git a/internal/eventloop_commresolver_test.go b/internal/eventloop_commresolver_test.go index 351db70..04019d4 100644 --- a/internal/eventloop_commresolver_test.go +++ b/internal/eventloop_commresolver_test.go @@ -1,6 +1,7 @@ package internal import ( + "context" "errors" "fmt" "strings" @@ -26,7 +27,7 @@ func TestCommResolverQueueLookupRespectsWorkerLimit(t *testing.T) { defer resolver.shutdown() resolver.lookupWorkers = workers resolver.lookupQueue = make(chan uint32, lookups) - resolver.resolveFn = func(tid uint32) (string, error) { + resolver.resolveFn = func(_ context.Context, tid uint32) (string, error) { current := atomic.AddInt32(&running, 1) setMaxInt32(&maxRunning, current) started <- struct{}{} @@ -86,7 +87,7 @@ func TestCommResolverQueueLookupQueueFullClearsPending(t *testing.T) { defer resolver.shutdown() resolver.lookupWorkers = 1 resolver.lookupQueue = make(chan uint32, 1) - resolver.resolveFn = func(tid uint32) (string, error) { + resolver.resolveFn = func(_ context.Context, tid uint32) (string, error) { select { case started <- struct{}{}: default: @@ -141,7 +142,7 @@ func TestCommResolverShutdownStopsWorkersAndPreventsNewLookups(t *testing.T) { resolver := newCommResolver(nil) resolver.lookupWorkers = 1 resolver.lookupQueue = make(chan uint32, 1) - resolver.resolveFn = func(tid uint32) (string, error) { + resolver.resolveFn = func(_ context.Context, tid uint32) (string, error) { started <- struct{}{} <-release return fmt.Sprintf("comm-%d", tid), nil @@ -193,7 +194,7 @@ func TestCommResolverLookupWarnsOnUnexpectedResolveError(t *testing.T) { resolver.lookupWorkers = 1 resolver.lookupQueue = make(chan uint32, 1) resolver.warningFn = func(message string) { warnings <- message } - resolver.resolveFn = func(uint32) (string, error) { + resolver.resolveFn = func(context.Context, uint32) (string, error) { return "", errors.New("boom") } @@ -226,6 +227,45 @@ func TestResolveCommFromProcWithErrorIgnoresMissingProcess(t *testing.T) { } } +// TestCommResolverLookupWorkerRespectsTimeout verifies that a resolveFn that +// blocks longer than resolveCommTimeout is interrupted and the pending entry +// is cleared so shutdown is not stalled. +func TestCommResolverLookupWorkerRespectsTimeout(t *testing.T) { + const tid uint32 = 401 + + // blockUntilCtxDone blocks until the context passed by the worker expires. + blockUntilCtxDone := make(chan struct{}) + resolver := newCommResolver(nil) + defer resolver.shutdown() + resolver.lookupWorkers = 1 + resolver.lookupQueue = make(chan uint32, 1) + resolver.resolveFn = func(ctx context.Context, _ uint32) (string, error) { + close(blockUntilCtxDone) + <-ctx.Done() + return "", ctx.Err() + } + + resolver.queueLookup(tid) + + // Wait until the resolver fn has started and confirmed it is blocking. + select { + case <-blockUntilCtxDone: + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for resolver fn to start") + } + + // The pending entry must be cleared once the context times out and the + // worker loop continues to the next iteration. + waitForCondition(t, resolveCommTimeout+2*time.Second, + "expected pending entry to be cleared after context timeout", + func() bool { return pendingCount(resolver) == 0 }, + ) + + if _, ok := resolver.cached(tid); ok { + t.Fatalf("did not expect tid %d to be cached after a timed-out resolve", tid) + } +} + func hasPending(r *commResolver, tid uint32) bool { r.mu.RLock() defer r.mu.RUnlock() |
