diff options
| author | Paul Buetow <paul@buetow.org> | 2026-01-08 22:22:10 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-01-08 22:22:10 +0200 |
| commit | d9671ba9c6ba158cd4516626c4627d38d6478110 (patch) | |
| tree | 715dd6c2f7a4479545ae780bf12fb2cabbea0937 | |
| parent | a10cbd4e27d944464cec88aaf49d8b8c354d26e1 (diff) | |
Add special handling for Prometheus Watchdog alert
- Treat firing Watchdog as OK status to confirm Alertmanager is working
- Treat absent/non-firing Watchdog as CRITICAL to alert on Alertmanager issues
- Add comprehensive tests for both scenarios
| -rw-r--r-- | internal/config.go | 22 | ||||
| -rw-r--r-- | internal/prometheus.go | 36 | ||||
| -rw-r--r-- | internal/prometheus_test.go | 102 | ||||
| -rw-r--r-- | report.txt | 27 |
4 files changed, 174 insertions, 13 deletions
diff --git a/internal/config.go b/internal/config.go index c129ab7..25a9ddd 100644 --- a/internal/config.go +++ b/internal/config.go @@ -9,17 +9,17 @@ import ( ) type config struct { - EmailTo string - EmailFrom string - SMTPServer string `json:"SMTPServer,omitempty"` - SMTPDisable bool `json:"SMTPDisable,omitempty"` // TODO: Document this option - StateDir string `json:"StateDir,omitempty"` - HTMLStatusFile string `json:"HTMLStatusFile,omitempty"` // Path to HTML status file - HTMLDisable bool `json:"HTMLDisable,omitempty"` // Disable HTML status page generation - CheckTimeoutS int - CheckConcurrency int - StaleThreshold int `json:"StaleThreshold,omitempty"` - Federated []string `json:"Federated,omitempty"` // TODO: Document this option + EmailTo string + EmailFrom string + SMTPServer string `json:"SMTPServer,omitempty"` + SMTPDisable bool `json:"SMTPDisable,omitempty"` // TODO: Document this option + StateDir string `json:"StateDir,omitempty"` + HTMLStatusFile string `json:"HTMLStatusFile,omitempty"` // Path to HTML status file + HTMLDisable bool `json:"HTMLDisable,omitempty"` // Disable HTML status page generation + CheckTimeoutS int + CheckConcurrency int + StaleThreshold int `json:"StaleThreshold,omitempty"` + Federated []string `json:"Federated,omitempty"` // TODO: Document this option PrometheusHosts []string `json:"PrometheusHosts,omitempty"` PrometheusTimeoutS int `json:"PrometheusTimeoutS,omitempty"` Checks map[string]check diff --git a/internal/prometheus.go b/internal/prometheus.go index 1c06aaf..c73f1cf 100644 --- a/internal/prometheus.go +++ b/internal/prometheus.go @@ -44,12 +44,33 @@ func mergePrometheusAlerts(ctx context.Context, state state, conf config) state log.Printf("Fetched %d firing alerts from Prometheus host %s", len(alerts), host) + // Check if Watchdog alert is firing + watchdogFiring := false + for _, alert := range alerts { + alertname := alert.Labels["alertname"] + + // Special handling for Prometheus Watchdog alert + if alertname == "Watchdog" { + if alert.State == "firing" { + watchdogFiring = true + // Watchdog is firing as expected, treat as OK + cs := checkResult{ + name: "Prometheus: Watchdog", + output: "OK [none]: Alertmanager is working properly", + epoch: time.Now().Unix(), + status: nagiosOk, + } + state.update(cs) + } + continue + } + if alert.State != "firing" { continue } - name := fmt.Sprintf("Prometheus: %s", alert.Labels["alertname"]) + name := fmt.Sprintf("Prometheus: %s", alertname) severity := alert.Labels["severity"] description := alert.Annotations["summary"] if description == "" { @@ -66,13 +87,24 @@ func mergePrometheusAlerts(ctx context.Context, state state, conf config) state cs := checkResult{ name: name, - output: fmt.Sprintf("%s [%s]: %s", alert.Labels["alertname"], severity, description), + output: fmt.Sprintf("%s [%s]: %s", alertname, severity, description), epoch: time.Now().Unix(), status: status, } state.update(cs) } + // If Watchdog is not firing, alert as critical + if !watchdogFiring { + cs := checkResult{ + name: "Prometheus: Watchdog", + output: "CRITICAL [none]: Watchdog alert is not firing, Alertmanager may not be working", + epoch: time.Now().Unix(), + status: nagiosCritical, + } + state.update(cs) + } + return state } diff --git a/internal/prometheus_test.go b/internal/prometheus_test.go index 8d98e7f..150ba52 100644 --- a/internal/prometheus_test.go +++ b/internal/prometheus_test.go @@ -157,3 +157,105 @@ func TestMergePrometheusAlertsNoHosts(t *testing.T) { t.Errorf("expected no checks, got %d", len(result.checks)) } } + +func TestMergePrometheusAlertsWatchdogFiring(t *testing.T) { + resp := prometheusResponse{ + Status: "success", + Data: struct { + Alerts []prometheusAlert `json:"alerts"` + }{ + Alerts: []prometheusAlert{ + { + Labels: map[string]string{"alertname": "Watchdog", "severity": "none"}, + Annotations: map[string]string{"summary": "An alert that should always be firing to certify that Alertmanager is working properly."}, + State: "firing", + }, + { + Labels: map[string]string{"alertname": "HighCPU", "severity": "critical"}, + Annotations: map[string]string{"summary": "CPU usage is high"}, + State: "firing", + }, + }, + }, + } + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(resp) + })) + defer server.Close() + + conf := config{ + PrometheusHosts: []string{strings.TrimPrefix(server.URL, "http://")}, + PrometheusTimeoutS: 2, + } + s := state{checks: make(map[string]checkState)} + + result := mergePrometheusAlerts(context.Background(), s, conf) + + watchdog, ok := result.checks["Prometheus: Watchdog"] + if !ok { + t.Fatal("Watchdog check not found in state") + } + + if watchdog.Status != nagiosOk { + t.Errorf("expected Watchdog status OK, got %v", watchdog.Status) + } + + if !strings.Contains(watchdog.output, "working properly") { + t.Errorf("expected working properly message, got: %s", watchdog.output) + } + + // Verify other alerts are still processed + cpu, ok := result.checks["Prometheus: HighCPU"] + if !ok { + t.Fatal("HighCPU check not found in state") + } + if cpu.Status != nagiosCritical { + t.Errorf("expected HighCPU status CRITICAL, got %v", cpu.Status) + } +} + +func TestMergePrometheusAlertsWatchdogNotFiring(t *testing.T) { + resp := prometheusResponse{ + Status: "success", + Data: struct { + Alerts []prometheusAlert `json:"alerts"` + }{ + Alerts: []prometheusAlert{ + { + Labels: map[string]string{"alertname": "HighCPU", "severity": "critical"}, + Annotations: map[string]string{"summary": "CPU usage is high"}, + State: "firing", + }, + }, + }, + } + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(resp) + })) + defer server.Close() + + conf := config{ + PrometheusHosts: []string{strings.TrimPrefix(server.URL, "http://")}, + PrometheusTimeoutS: 2, + } + s := state{checks: make(map[string]checkState)} + + result := mergePrometheusAlerts(context.Background(), s, conf) + + watchdog, ok := result.checks["Prometheus: Watchdog"] + if !ok { + t.Fatal("Watchdog check not found in state") + } + + if watchdog.Status != nagiosCritical { + t.Errorf("expected Watchdog status CRITICAL, got %v", watchdog.Status) + } + + if !strings.Contains(watchdog.output, "not firing") { + t.Errorf("expected not firing message, got: %s", watchdog.output) + } +} diff --git a/report.txt b/report.txt new file mode 100644 index 0000000..ea445f0 --- /dev/null +++ b/report.txt @@ -0,0 +1,27 @@ +GOGIOS Report [C:2 W:2 U:0 S:0 OK:1] + +This is the recent Gogios report! + +# Alerts with status changed: + +UNKNOWN->CRITICAL: Prometheus: KubeProxyDown: KubeProxyDown [critical]: Target disappeared from Prometheus target discovery. +UNKNOWN->CRITICAL: Prometheus: KubeSchedulerDown: KubeSchedulerDown [critical]: Target disappeared from Prometheus target discovery. + +UNKNOWN->WARNING: Prometheus: TargetDown: TargetDown [warning]: One or more targets are unreachable. +UNKNOWN->WARNING: Prometheus: Watchdog: Watchdog [none]: An alert that should always be firing to certify that Alertmanager is working properly. + +UNKNOWN->OK: localhost ping: OK: localhost is alive + +# Unhandled alerts: + +CRITICAL: Prometheus: KubeProxyDown: KubeProxyDown [critical]: Target disappeared from Prometheus target discovery. +CRITICAL: Prometheus: KubeSchedulerDown: KubeSchedulerDown [critical]: Target disappeared from Prometheus target discovery. + +WARNING: Prometheus: TargetDown: TargetDown [warning]: One or more targets are unreachable. +WARNING: Prometheus: Watchdog: Watchdog [none]: An alert that should always be firing to certify that Alertmanager is working properly. + +# Stale alerts: + +There are no stale alerts... + +Have a nice day! |
