diff options
| author | Paul Buetow <paul@buetow.org> | 2026-05-04 23:20:21 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-05-04 23:20:21 +0300 |
| commit | 90feabaf14b67ef61124881164ef3636b1827398 (patch) | |
| tree | 6503a4da2e0ea0cee0c3fd97e64de10dab0d6b38 /internal/askcli/task_alias_cache.go | |
| parent | 266bedf71fe8a54b86af038889522a68bae562a8 (diff) | |
askcli: serialize task alias cache writes to fix concurrent rename race
Two concurrent 'ask' invocations could race on the alias cache file:
both wrote the JSON to a shared '<path>.tmp' filename and then both
called os.Rename, so the loser failed with:
replace task alias cache: rename .../task-aliases-v2.json.tmp
.../task-aliases-v2.json: no such file or directory
The shared tempfile also enabled lost updates because each process
loaded the file independently before saving its own version on top.
Fix:
- Take an exclusive flock on a sentinel file (task-aliases-v2.json.lock)
in the cache directory around the full load/modify/save cycle in both
ensureTaskAliases and resolveTaskSelectorFromCache, using the existing
internal/filelock package.
- Switch save() to os.CreateTemp so each writer gets a unique tempfile
name; the loser's tempfile is removed cleanly on rename failure.
- Refactor resolveTaskSelectorFromCache by extracting
finalizeResolvedTaskSelector to keep functions under 50 lines.
Adds TestEnsureTaskAliases_ConcurrentCallsDoNotRaceOnTempFile, which
reproduces the original error reliably on the unfixed code and now
passes with -race.
Amp-Thread-ID: https://ampcode.com/threads/T-019df49f-52a5-75b1-98d5-371a163ef100
Co-authored-by: Amp <amp@ampcode.com>
Diffstat (limited to 'internal/askcli/task_alias_cache.go')
| -rw-r--r-- | internal/askcli/task_alias_cache.go | 100 |
1 files changed, 84 insertions, 16 deletions
diff --git a/internal/askcli/task_alias_cache.go b/internal/askcli/task_alias_cache.go index e89dbb5..8e3fcc4 100644 --- a/internal/askcli/task_alias_cache.go +++ b/internal/askcli/task_alias_cache.go @@ -1,17 +1,24 @@ package askcli import ( + "context" "encoding/json" + "errors" "fmt" "os" "path/filepath" "slices" "time" + "codeberg.org/snonux/hexai/internal/filelock" "codeberg.org/snonux/hexai/internal/stats" ) -const taskAliasCacheTTL = 120 * 24 * time.Hour +const ( + taskAliasCacheTTL = 120 * 24 * time.Hour + taskAliasCacheLockFileName = "task-aliases-v2.json.lock" + taskAliasCacheLockTimeout = 30 * time.Second +) var ( nowTaskAliasCache = time.Now @@ -39,7 +46,20 @@ type taskAliasCacheEntry struct { } func ensureTaskAliases(tasks []TaskExport) (map[string]string, error) { - cache, path, err := loadTaskAliasCache() + path, err := taskAliasCachePath() + if err != nil { + return nil, err + } + // Hold an advisory lock for the full load/modify/save cycle so concurrent + // ask invocations cannot race on the cache file (lost-update or + // rename-against-missing-tempfile). + unlock, err := acquireTaskAliasCacheLock(filepath.Dir(path)) + if err != nil { + return nil, err + } + defer func() { _ = unlock() }() + + cache, err := loadTaskAliasCacheAt(path) if err != nil { return nil, err } @@ -76,17 +96,13 @@ func ensureTaskAliasesForUUIDs(uuids []string) (map[string]string, error) { return ensureTaskAliases(tasks) } -func loadTaskAliasCache() (taskAliasCache, string, error) { - path, err := taskAliasCachePath() - if err != nil { - return taskAliasCache{}, "", err - } +func loadTaskAliasCacheAt(path string) (taskAliasCache, error) { data, err := os.ReadFile(path) if os.IsNotExist(err) { - return taskAliasCache{}, path, nil + return taskAliasCache{}, nil } if err != nil { - return taskAliasCache{}, "", fmt.Errorf("read task alias cache: %w", err) + return taskAliasCache{}, fmt.Errorf("read task alias cache: %w", err) } var cache taskAliasCache @@ -95,15 +111,42 @@ func loadTaskAliasCache() (taskAliasCache, string, error) { // corruption). Discard and start fresh — tasks will get new alias IDs on // the next run, which is preferable to a hard failure. _ = os.Remove(path) - return taskAliasCache{}, path, nil + return taskAliasCache{}, nil } if err := cache.validate(); err != nil { - return taskAliasCache{}, "", fmt.Errorf("validate task alias cache: %w", err) + return taskAliasCache{}, fmt.Errorf("validate task alias cache: %w", err) } // Rebuild lookup maps from the deserialized entries so that all subsequent // operations use O(1) map access rather than a linear scan. cache.rebuildMaps() - return cache, path, nil + return cache, nil +} + +// acquireTaskAliasCacheLock takes an exclusive advisory lock on a sentinel +// file in dir so that concurrent ask invocations serialize their +// load/modify/save of the alias cache. The returned closure releases the lock +// and closes the underlying file. +func acquireTaskAliasCacheLock(dir string) (func() error, error) { + if err := os.MkdirAll(dir, 0o755); err != nil { + return nil, fmt.Errorf("create task alias cache dir: %w", err) + } + lockPath := filepath.Join(dir, taskAliasCacheLockFileName) + f, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0o600) + if err != nil { + return nil, fmt.Errorf("open task alias cache lock: %w", err) + } + ctx, cancel := context.WithTimeout(context.Background(), taskAliasCacheLockTimeout) + release, err := filelock.AcquireExclusive(ctx, f) + cancel() + if err != nil { + _ = f.Close() + return nil, fmt.Errorf("lock task alias cache: %w", err) + } + return func() error { + rerr := release() + cerr := f.Close() + return errors.Join(rerr, cerr) + }, nil } func taskAliasCachePath() (string, error) { @@ -242,7 +285,8 @@ func (c *taskAliasCache) lookupAliasByUUID(uuid string, now time.Time) (string, } func (c *taskAliasCache) save(path string) error { - if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + dir := filepath.Dir(path) + if err := os.MkdirAll(dir, 0o755); err != nil { return fmt.Errorf("create task alias cache dir: %w", err) } data, err := json.MarshalIndent(c, "", " ") @@ -250,16 +294,40 @@ func (c *taskAliasCache) save(path string) error { return fmt.Errorf("marshal task alias cache: %w", err) } - tempPath := path + ".tmp" - if err := os.WriteFile(tempPath, data, 0o600); err != nil { - return fmt.Errorf("write task alias cache: %w", err) + // Use a unique temp filename per invocation so two concurrent saves cannot + // clobber each other's tempfile and produce a "rename: no such file or + // directory" error on the loser. + tempFile, err := os.CreateTemp(dir, filepath.Base(path)+".*.tmp") + if err != nil { + return fmt.Errorf("create task alias cache temp: %w", err) + } + tempPath := tempFile.Name() + if err := writeAndCloseTaskAliasTemp(tempFile, tempPath, data); err != nil { + _ = os.Remove(tempPath) + return err } if err := os.Rename(tempPath, path); err != nil { + _ = os.Remove(tempPath) return fmt.Errorf("replace task alias cache: %w", err) } return nil } +func writeAndCloseTaskAliasTemp(f *os.File, path string, data []byte) error { + if _, err := f.Write(data); err != nil { + _ = f.Close() + return fmt.Errorf("write task alias cache: %w", err) + } + if err := f.Chmod(0o600); err != nil { + _ = f.Close() + return fmt.Errorf("chmod task alias cache: %w", err) + } + if err := f.Close(); err != nil { + return fmt.Errorf("close task alias cache temp %s: %w", path, err) + } + return nil +} + func (c *taskAliasCache) sortEntries() { slices.SortFunc(c.Entries, func(a, b taskAliasCacheEntry) int { switch { |
