From 6eccdbfa107d16670e5bd96d31e075493d0ee2d6 Mon Sep 17 00:00:00 2001 From: Paul Buetow Date: Fri, 29 May 2026 17:45:17 +0300 Subject: askcli: prevent stale-lock inode split during contention --- internal/askcli/runlock.go | 31 ++++---------- internal/askcli/runlock_test.go | 89 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 24 deletions(-) diff --git a/internal/askcli/runlock.go b/internal/askcli/runlock.go index d0d7be3..9fbe0cd 100644 --- a/internal/askcli/runlock.go +++ b/internal/askcli/runlock.go @@ -16,8 +16,6 @@ import ( const askRepoLockFile = "hexai-ask.lock" -var errAskLockReopen = errors.New("ask lock: reopen after stale file removal") - func lockProcessLabel() string { if exe, err := os.Executable(); err == nil { if b := filepath.Base(exe); b != "" && b != "." { @@ -70,11 +68,9 @@ func writeLockMetadata(f *os.File, pid int, comm string) error { // waitOrAcquireAskLockFD tries to take an exclusive lock on f, or blocks until ctx ends. // On success it writes lock metadata and returns an unlock function (which closes f). -// errAskLockReopen means the caller should open the lock path again after stale removal. func waitOrAcquireAskLockFD( ctx context.Context, f *os.File, - lockPath string, comm string, retryTimer *time.Timer, ) (func() error, error) { @@ -98,12 +94,10 @@ func waitOrAcquireAskLockFD( } pid := readLockHolderPID(f) + // Keep waiting even if metadata appears stale: removing a contended lock file can + // split ownership across different inodes and break serialization guarantees. if pid > 0 && lockHolderIsStale(pid, comm) { - _ = f.Close() - if rerr := os.Remove(lockPath); rerr != nil && !errors.Is(rerr, os.ErrNotExist) { - return nil, fmt.Errorf("ask lock: remove stale %s: %w", lockPath, rerr) - } - return nil, errAskLockReopen + // Intentional no-op: contention is resolved only by waiting for flock release. } retryTimer.Reset(5 * time.Millisecond) @@ -128,20 +122,9 @@ func acquireAskRepoLock(ctx context.Context, gitRoot string) (func() error, erro retryTimer := time.NewTimer(5 * time.Millisecond) defer retryTimer.Stop() - for removalAttempts := 0; removalAttempts < 16; removalAttempts++ { - f, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0o600) - if err != nil { - return nil, fmt.Errorf("ask lock: open %s: %w", lockPath, err) - } - unlock, err := waitOrAcquireAskLockFD(ctx, f, lockPath, comm, retryTimer) - if err == nil { - return unlock, nil - } - if errors.Is(err, errAskLockReopen) { - continue - } - return nil, err + f, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0o600) + if err != nil { + return nil, fmt.Errorf("ask lock: open %s: %w", lockPath, err) } - - return nil, fmt.Errorf("ask lock: could not acquire %s after stale recovery attempts", lockPath) + return waitOrAcquireAskLockFD(ctx, f, comm, retryTimer) } diff --git a/internal/askcli/runlock_test.go b/internal/askcli/runlock_test.go index f56f214..8ef8f2c 100644 --- a/internal/askcli/runlock_test.go +++ b/internal/askcli/runlock_test.go @@ -8,8 +8,15 @@ import ( "sync/atomic" "testing" "time" + + "codeberg.org/snonux/hexai/internal/filelock" ) +type lockResult struct { + unlock func() error + err error +} + func TestAcquireAskRepoLock_SerializesConcurrentHolders(t *testing.T) { tmp := t.TempDir() if err := os.MkdirAll(filepath.Join(tmp, ".git"), 0o755); err != nil { @@ -44,3 +51,85 @@ func TestAcquireAskRepoLock_SerializesConcurrentHolders(t *testing.T) { t.Fatalf("max concurrent lock holders = %d, want 1", got) } } + +func TestAcquireAskRepoLock_StaleMetadataDoesNotRotateContendedLockFile(t *testing.T) { + tmp := t.TempDir() + holder, lockPath, origInfo := prepareContendedStaleLock(t, tmp) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + resultCh := acquireLockAsync(ctx, tmp) + + select { + case result := <-resultCh: + if result.unlock != nil { + _ = result.unlock() + } + t.Fatalf("lock acquired while holder still held lock: %v", result.err) + case <-time.After(40 * time.Millisecond): + } + + curInfo, err := os.Stat(lockPath) + if err != nil { + t.Fatalf("stat contended lock: %v", err) + } + if !os.SameFile(origInfo, curInfo) { + t.Fatal("contended lock file was replaced while locked") + } + + releaseContendedLock(t, holder) + + result := <-resultCh + if result.err != nil { + t.Fatalf("contender lock: %v", result.err) + } + if result.unlock == nil { + t.Fatal("contender returned nil unlock") + } + if err := result.unlock(); err != nil { + t.Fatalf("contender unlock: %v", err) + } +} + +func prepareContendedStaleLock(t *testing.T, gitRoot string) (*os.File, string, os.FileInfo) { + t.Helper() + lockDir := filepath.Join(gitRoot, ".git") + if err := os.MkdirAll(lockDir, 0o755); err != nil { + t.Fatal(err) + } + lockPath := filepath.Join(lockDir, askRepoLockFile) + holder, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0o600) + if err != nil { + t.Fatal(err) + } + if err := filelock.TryExclusive(holder); err != nil { + t.Fatalf("holder lock: %v", err) + } + if err := writeLockMetadata(holder, 999999, "ask"); err != nil { + t.Fatalf("write stale metadata: %v", err) + } + origInfo, err := os.Stat(lockPath) + if err != nil { + t.Fatalf("stat original lock: %v", err) + } + return holder, lockPath, origInfo +} + +func acquireLockAsync(ctx context.Context, gitRoot string) <-chan lockResult { + resultCh := make(chan lockResult, 1) + go func() { + unlock, err := acquireAskRepoLock(ctx, gitRoot) + resultCh <- lockResult{unlock: unlock, err: err} + }() + return resultCh +} + +func releaseContendedLock(t *testing.T, holder *os.File) { + t.Helper() + if err := filelock.UnlockExclusive(holder); err != nil { + t.Fatalf("release holder lock: %v", err) + } + if err := holder.Close(); err != nil { + t.Fatalf("close holder file: %v", err) + } +} -- cgit v1.2.3