From 90feabaf14b67ef61124881164ef3636b1827398 Mon Sep 17 00:00:00 2001 From: Paul Buetow Date: Mon, 4 May 2026 23:20:21 +0300 Subject: askcli: serialize task alias cache writes to fix concurrent rename race Two concurrent 'ask' invocations could race on the alias cache file: both wrote the JSON to a shared '.tmp' filename and then both called os.Rename, so the loser failed with: replace task alias cache: rename .../task-aliases-v2.json.tmp .../task-aliases-v2.json: no such file or directory The shared tempfile also enabled lost updates because each process loaded the file independently before saving its own version on top. Fix: - Take an exclusive flock on a sentinel file (task-aliases-v2.json.lock) in the cache directory around the full load/modify/save cycle in both ensureTaskAliases and resolveTaskSelectorFromCache, using the existing internal/filelock package. - Switch save() to os.CreateTemp so each writer gets a unique tempfile name; the loser's tempfile is removed cleanly on rename failure. - Refactor resolveTaskSelectorFromCache by extracting finalizeResolvedTaskSelector to keep functions under 50 lines. Adds TestEnsureTaskAliases_ConcurrentCallsDoNotRaceOnTempFile, which reproduces the original error reliably on the unfixed code and now passes with -race. Amp-Thread-ID: https://ampcode.com/threads/T-019df49f-52a5-75b1-98d5-371a163ef100 Co-authored-by: Amp --- internal/askcli/task_alias_cache.go | 100 ++++++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 16 deletions(-) (limited to 'internal/askcli/task_alias_cache.go') diff --git a/internal/askcli/task_alias_cache.go b/internal/askcli/task_alias_cache.go index e89dbb5..8e3fcc4 100644 --- a/internal/askcli/task_alias_cache.go +++ b/internal/askcli/task_alias_cache.go @@ -1,17 +1,24 @@ package askcli import ( + "context" "encoding/json" + "errors" "fmt" "os" "path/filepath" "slices" "time" + "codeberg.org/snonux/hexai/internal/filelock" "codeberg.org/snonux/hexai/internal/stats" ) -const taskAliasCacheTTL = 120 * 24 * time.Hour +const ( + taskAliasCacheTTL = 120 * 24 * time.Hour + taskAliasCacheLockFileName = "task-aliases-v2.json.lock" + taskAliasCacheLockTimeout = 30 * time.Second +) var ( nowTaskAliasCache = time.Now @@ -39,7 +46,20 @@ type taskAliasCacheEntry struct { } func ensureTaskAliases(tasks []TaskExport) (map[string]string, error) { - cache, path, err := loadTaskAliasCache() + path, err := taskAliasCachePath() + if err != nil { + return nil, err + } + // Hold an advisory lock for the full load/modify/save cycle so concurrent + // ask invocations cannot race on the cache file (lost-update or + // rename-against-missing-tempfile). + unlock, err := acquireTaskAliasCacheLock(filepath.Dir(path)) + if err != nil { + return nil, err + } + defer func() { _ = unlock() }() + + cache, err := loadTaskAliasCacheAt(path) if err != nil { return nil, err } @@ -76,17 +96,13 @@ func ensureTaskAliasesForUUIDs(uuids []string) (map[string]string, error) { return ensureTaskAliases(tasks) } -func loadTaskAliasCache() (taskAliasCache, string, error) { - path, err := taskAliasCachePath() - if err != nil { - return taskAliasCache{}, "", err - } +func loadTaskAliasCacheAt(path string) (taskAliasCache, error) { data, err := os.ReadFile(path) if os.IsNotExist(err) { - return taskAliasCache{}, path, nil + return taskAliasCache{}, nil } if err != nil { - return taskAliasCache{}, "", fmt.Errorf("read task alias cache: %w", err) + return taskAliasCache{}, fmt.Errorf("read task alias cache: %w", err) } var cache taskAliasCache @@ -95,15 +111,42 @@ func loadTaskAliasCache() (taskAliasCache, string, error) { // corruption). Discard and start fresh — tasks will get new alias IDs on // the next run, which is preferable to a hard failure. _ = os.Remove(path) - return taskAliasCache{}, path, nil + return taskAliasCache{}, nil } if err := cache.validate(); err != nil { - return taskAliasCache{}, "", fmt.Errorf("validate task alias cache: %w", err) + return taskAliasCache{}, fmt.Errorf("validate task alias cache: %w", err) } // Rebuild lookup maps from the deserialized entries so that all subsequent // operations use O(1) map access rather than a linear scan. cache.rebuildMaps() - return cache, path, nil + return cache, nil +} + +// acquireTaskAliasCacheLock takes an exclusive advisory lock on a sentinel +// file in dir so that concurrent ask invocations serialize their +// load/modify/save of the alias cache. The returned closure releases the lock +// and closes the underlying file. +func acquireTaskAliasCacheLock(dir string) (func() error, error) { + if err := os.MkdirAll(dir, 0o755); err != nil { + return nil, fmt.Errorf("create task alias cache dir: %w", err) + } + lockPath := filepath.Join(dir, taskAliasCacheLockFileName) + f, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0o600) + if err != nil { + return nil, fmt.Errorf("open task alias cache lock: %w", err) + } + ctx, cancel := context.WithTimeout(context.Background(), taskAliasCacheLockTimeout) + release, err := filelock.AcquireExclusive(ctx, f) + cancel() + if err != nil { + _ = f.Close() + return nil, fmt.Errorf("lock task alias cache: %w", err) + } + return func() error { + rerr := release() + cerr := f.Close() + return errors.Join(rerr, cerr) + }, nil } func taskAliasCachePath() (string, error) { @@ -242,7 +285,8 @@ func (c *taskAliasCache) lookupAliasByUUID(uuid string, now time.Time) (string, } func (c *taskAliasCache) save(path string) error { - if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + dir := filepath.Dir(path) + if err := os.MkdirAll(dir, 0o755); err != nil { return fmt.Errorf("create task alias cache dir: %w", err) } data, err := json.MarshalIndent(c, "", " ") @@ -250,16 +294,40 @@ func (c *taskAliasCache) save(path string) error { return fmt.Errorf("marshal task alias cache: %w", err) } - tempPath := path + ".tmp" - if err := os.WriteFile(tempPath, data, 0o600); err != nil { - return fmt.Errorf("write task alias cache: %w", err) + // Use a unique temp filename per invocation so two concurrent saves cannot + // clobber each other's tempfile and produce a "rename: no such file or + // directory" error on the loser. + tempFile, err := os.CreateTemp(dir, filepath.Base(path)+".*.tmp") + if err != nil { + return fmt.Errorf("create task alias cache temp: %w", err) + } + tempPath := tempFile.Name() + if err := writeAndCloseTaskAliasTemp(tempFile, tempPath, data); err != nil { + _ = os.Remove(tempPath) + return err } if err := os.Rename(tempPath, path); err != nil { + _ = os.Remove(tempPath) return fmt.Errorf("replace task alias cache: %w", err) } return nil } +func writeAndCloseTaskAliasTemp(f *os.File, path string, data []byte) error { + if _, err := f.Write(data); err != nil { + _ = f.Close() + return fmt.Errorf("write task alias cache: %w", err) + } + if err := f.Chmod(0o600); err != nil { + _ = f.Close() + return fmt.Errorf("chmod task alias cache: %w", err) + } + if err := f.Close(); err != nil { + return fmt.Errorf("close task alias cache temp %s: %w", path, err) + } + return nil +} + func (c *taskAliasCache) sortEntries() { slices.SortFunc(c.Entries, func(a, b taskAliasCacheEntry) int { switch { -- cgit v1.2.3