summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Paul Buetow <paul@buetow.org>, 2026-01-18 16:38:15 +0200
committer: Paul Buetow <paul@buetow.org>, 2026-01-18 16:38:15 +0200
commit: 6d6ab671dbb600e790853ca1ab02046807f19b9c (patch)
tree: 05d4673fc007f3e07b7306143b1d6d5c411fcfdc
parent: 22589c1b41f8653764b583a87016122ff0131181 (diff)
clear resolved prometheus alerts from state
Previously, when Prometheus alerts stopped firing, they were left in state and became stale. Now they are removed automatically once they no longer appear in the list of firing alerts returned by Prometheus. Also fix the Magefile Openbsd target to run the build and then the deploy sequentially instead of using mg.Deps, which runs them in parallel. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
-rw-r--r-- Magefile.go 7
-rw-r--r-- internal/prometheus.go 27
2 files changed, 31 insertions, 3 deletions
diff --git a/Magefile.go b/Magefile.go
index 03cbd97..0ddb7bc 100644
--- a/Magefile.go
+++ b/Magefile.go
@@ -75,9 +75,12 @@ func Test() error {
}
// Openbsd builds and deploys the gogios binary for OpenBSD.
+// BuildOpenbsd runs first; DeployOpenbsd runs only if the build returned no error, and a build error is returned unchanged.
func Openbsd() error {
- mg.Deps(BuildOpenbsd, DeployOpenbsd)
- return nil
+ if err := BuildOpenbsd(); err != nil {
+ return err
+ }
+ return DeployOpenbsd()
}
// BuildOpenbsd builds the gogios binary for OpenBSD.
diff --git a/internal/prometheus.go b/internal/prometheus.go
index 5bf2638..73c05cc 100644
--- a/internal/prometheus.go
+++ b/internal/prometheus.go
@@ -7,6 +7,7 @@ import (
"io"
"log"
"net/http"
+ "strings"
"time"
)
@@ -65,7 +66,8 @@ func mergePrometheusAlerts(ctx context.Context, state state, conf config) state
state.update(cs)
}
- // Check if Watchdog alert is firing
+ // Track currently firing alerts to clear resolved ones later
+ firingAlerts := make(map[string]bool)
watchdogFiring := false
for _, alert := range alerts {
@@ -75,6 +77,7 @@ func mergePrometheusAlerts(ctx context.Context, state state, conf config) state
if alertname == "Watchdog" {
if alert.State == "firing" {
watchdogFiring = true
+ firingAlerts["Prometheus: Watchdog"] = true
// Watchdog is firing as expected, treat as OK
cs := checkResult{
name: "Prometheus: Watchdog",
@@ -92,6 +95,7 @@ func mergePrometheusAlerts(ctx context.Context, state state, conf config) state
}
name := fmt.Sprintf("Prometheus: %s", alertname)
+ firingAlerts[name] = true
severity := alert.Labels["severity"]
description := alert.Annotations["summary"]
if description == "" {
@@ -117,6 +121,7 @@ func mergePrometheusAlerts(ctx context.Context, state state, conf config) state
// If Watchdog is not firing, alert as critical
if !watchdogFiring {
+ firingAlerts["Prometheus: Watchdog"] = true
cs := checkResult{
name: "Prometheus: Watchdog",
output: "CRITICAL [none]: Watchdog alert is not firing, Alertmanager may not be working",
@@ -126,9 +131,29 @@ func mergePrometheusAlerts(ctx context.Context, state state, conf config) state
state.update(cs)
}
+ // Clear any Prometheus alerts that are no longer firing
+ clearResolvedPrometheusAlerts(state, firingAlerts)
+
return state
}
+// clearResolvedPrometheusAlerts deletes from state.checks every entry whose name starts with the Prometheus: prefix
+// and is not marked true in firingAlerts, logging each removal. All other entries in state.checks are left untouched.
+func clearResolvedPrometheusAlerts(state state, firingAlerts map[string]bool) {
+ const prometheusPrefix = "Prometheus: "
+ for name := range state.checks {
+ // Skip checks whose name lacks prometheusPrefix. The connection status check (named Prometheus alerts, no colon) already lacks it, so the second test is only a safeguard.
+ if !strings.HasPrefix(name, prometheusPrefix) || name == "Prometheus alerts" {
+ continue
+ }
+ // A name missing from firingAlerts reads as false, so it is treated as resolved; Go permits deleting map entries while ranging over the map.
+ if !firingAlerts[name] {
+ delete(state.checks, name)
+ log.Printf("Cleared resolved Prometheus alert: %s", name)
+ }
+ }
+}
+
func fetchPrometheusAlerts(ctx context.Context, hosts []string, timeout time.Duration) ([]prometheusAlert, string, error) {
var lastErr error