diff options
| author | Paul Buetow <paul@buetow.org> | 2026-01-18 16:38:15 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-01-18 16:38:15 +0200 |
| commit | 6d6ab671dbb600e790853ca1ab02046807f19b9c (patch) | |
| tree | 05d4673fc007f3e07b7306143b1d6d5c411fcfdc /internal | |
| parent | 22589c1b41f8653764b583a87016122ff0131181 (diff) | |
clear resolved prometheus alerts from state
Previously, Prometheus alerts that stopped firing were left in state
and became stale. They are now removed automatically once they no longer
appear in the list of firing alerts returned by Prometheus.
Also fix the Magefile Openbsd target to run build and then deploy sequentially,
instead of using mg.Deps, which runs them in parallel.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Diffstat (limited to 'internal')
| -rw-r--r-- | internal/prometheus.go | 27 |
1 files changed, 26 insertions, 1 deletions
diff --git a/internal/prometheus.go b/internal/prometheus.go index 5bf2638..73c05cc 100644 --- a/internal/prometheus.go +++ b/internal/prometheus.go @@ -7,6 +7,7 @@ import ( "io" "log" "net/http" + "strings" "time" ) @@ -65,7 +66,8 @@ func mergePrometheusAlerts(ctx context.Context, state state, conf config) state state.update(cs) } - // Check if Watchdog alert is firing + // Track currently firing alerts to clear resolved ones later + firingAlerts := make(map[string]bool) watchdogFiring := false for _, alert := range alerts { @@ -75,6 +77,7 @@ func mergePrometheusAlerts(ctx context.Context, state state, conf config) state if alertname == "Watchdog" { if alert.State == "firing" { watchdogFiring = true + firingAlerts["Prometheus: Watchdog"] = true // Watchdog is firing as expected, treat as OK cs := checkResult{ name: "Prometheus: Watchdog", @@ -92,6 +95,7 @@ func mergePrometheusAlerts(ctx context.Context, state state, conf config) state } name := fmt.Sprintf("Prometheus: %s", alertname) + firingAlerts[name] = true severity := alert.Labels["severity"] description := alert.Annotations["summary"] if description == "" { @@ -117,6 +121,7 @@ func mergePrometheusAlerts(ctx context.Context, state state, conf config) state // If Watchdog is not firing, alert as critical if !watchdogFiring { + firingAlerts["Prometheus: Watchdog"] = true cs := checkResult{ name: "Prometheus: Watchdog", output: "CRITICAL [none]: Watchdog alert is not firing, Alertmanager may not be working", @@ -126,9 +131,29 @@ func mergePrometheusAlerts(ctx context.Context, state state, conf config) state state.update(cs) } + // Clear any Prometheus alerts that are no longer firing + clearResolvedPrometheusAlerts(state, firingAlerts) + return state } +// clearResolvedPrometheusAlerts removes Prometheus alerts from state that are +// no longer firing. This prevents stale alerts from accumulating. 
+func clearResolvedPrometheusAlerts(state state, firingAlerts map[string]bool) { + const prometheusPrefix = "Prometheus: " + for name := range state.checks { + // Skip non-Prometheus alerts and the connection status check + if !strings.HasPrefix(name, prometheusPrefix) || name == "Prometheus alerts" { + continue + } + // If this alert is not currently firing, remove it from state + if !firingAlerts[name] { + delete(state.checks, name) + log.Printf("Cleared resolved Prometheus alert: %s", name) + } + } +} + func fetchPrometheusAlerts(ctx context.Context, hosts []string, timeout time.Duration) ([]prometheusAlert, string, error) { var lastErr error |
