summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Paul Buetow <paul@buetow.org> 2026-01-08 22:22:10 +0200
committer: Paul Buetow <paul@buetow.org> 2026-01-08 22:22:10 +0200
commit: d9671ba9c6ba158cd4516626c4627d38d6478110 (patch)
tree: 715dd6c2f7a4479545ae780bf12fb2cabbea0937
parent: a10cbd4e27d944464cec88aaf49d8b8c354d26e1 (diff)
Add special handling for Prometheus Watchdog alert
- Treat firing Watchdog as OK status to confirm Alertmanager is working
- Treat absent/non-firing Watchdog as CRITICAL to alert on Alertmanager issues
- Add comprehensive tests for both scenarios
-rw-r--r-- internal/config.go 22
-rw-r--r-- internal/prometheus.go 36
-rw-r--r-- internal/prometheus_test.go 102
-rw-r--r-- report.txt 27
4 files changed, 174 insertions, 13 deletions
diff --git a/internal/config.go b/internal/config.go
index c129ab7..25a9ddd 100644
--- a/internal/config.go
+++ b/internal/config.go
@@ -9,17 +9,17 @@ import (
)
type config struct {
- EmailTo string
- EmailFrom string
- SMTPServer string `json:"SMTPServer,omitempty"`
- SMTPDisable bool `json:"SMTPDisable,omitempty"` // TODO: Document this option
- StateDir string `json:"StateDir,omitempty"`
- HTMLStatusFile string `json:"HTMLStatusFile,omitempty"` // Path to HTML status file
- HTMLDisable bool `json:"HTMLDisable,omitempty"` // Disable HTML status page generation
- CheckTimeoutS int
- CheckConcurrency int
- StaleThreshold int `json:"StaleThreshold,omitempty"`
- Federated []string `json:"Federated,omitempty"` // TODO: Document this option
+ EmailTo string
+ EmailFrom string
+ SMTPServer string `json:"SMTPServer,omitempty"`
+ SMTPDisable bool `json:"SMTPDisable,omitempty"` // TODO: Document this option
+ StateDir string `json:"StateDir,omitempty"`
+ HTMLStatusFile string `json:"HTMLStatusFile,omitempty"` // Path to HTML status file
+ HTMLDisable bool `json:"HTMLDisable,omitempty"` // Disable HTML status page generation
+ CheckTimeoutS int
+ CheckConcurrency int
+ StaleThreshold int `json:"StaleThreshold,omitempty"`
+ Federated []string `json:"Federated,omitempty"` // TODO: Document this option
PrometheusHosts []string `json:"PrometheusHosts,omitempty"`
PrometheusTimeoutS int `json:"PrometheusTimeoutS,omitempty"`
Checks map[string]check
diff --git a/internal/prometheus.go b/internal/prometheus.go
index 1c06aaf..c73f1cf 100644
--- a/internal/prometheus.go
+++ b/internal/prometheus.go
@@ -44,12 +44,33 @@ func mergePrometheusAlerts(ctx context.Context, state state, conf config) state
log.Printf("Fetched %d firing alerts from Prometheus host %s", len(alerts), host)
+ // Check if Watchdog alert is firing
+ watchdogFiring := false
+
for _, alert := range alerts {
+ alertname := alert.Labels["alertname"]
+
+ // Special handling for Prometheus Watchdog alert
+ if alertname == "Watchdog" {
+ if alert.State == "firing" {
+ watchdogFiring = true
+ // Watchdog is firing as expected, treat as OK
+ cs := checkResult{
+ name: "Prometheus: Watchdog",
+ output: "OK [none]: Alertmanager is working properly",
+ epoch: time.Now().Unix(),
+ status: nagiosOk,
+ }
+ state.update(cs)
+ }
+ continue
+ }
+
if alert.State != "firing" {
continue
}
- name := fmt.Sprintf("Prometheus: %s", alert.Labels["alertname"])
+ name := fmt.Sprintf("Prometheus: %s", alertname)
severity := alert.Labels["severity"]
description := alert.Annotations["summary"]
if description == "" {
@@ -66,13 +87,24 @@ func mergePrometheusAlerts(ctx context.Context, state state, conf config) state
cs := checkResult{
name: name,
- output: fmt.Sprintf("%s [%s]: %s", alert.Labels["alertname"], severity, description),
+ output: fmt.Sprintf("%s [%s]: %s", alertname, severity, description),
epoch: time.Now().Unix(),
status: status,
}
state.update(cs)
}
+ // If Watchdog is not firing, alert as critical
+ if !watchdogFiring {
+ cs := checkResult{
+ name: "Prometheus: Watchdog",
+ output: "CRITICAL [none]: Watchdog alert is not firing, Alertmanager may not be working",
+ epoch: time.Now().Unix(),
+ status: nagiosCritical,
+ }
+ state.update(cs)
+ }
+
return state
}
diff --git a/internal/prometheus_test.go b/internal/prometheus_test.go
index 8d98e7f..150ba52 100644
--- a/internal/prometheus_test.go
+++ b/internal/prometheus_test.go
@@ -157,3 +157,105 @@ func TestMergePrometheusAlertsNoHosts(t *testing.T) {
t.Errorf("expected no checks, got %d", len(result.checks))
}
}
+
+func TestMergePrometheusAlertsWatchdogFiring(t *testing.T) {
+ resp := prometheusResponse{
+ Status: "success",
+ Data: struct {
+ Alerts []prometheusAlert `json:"alerts"`
+ }{
+ Alerts: []prometheusAlert{
+ {
+ Labels: map[string]string{"alertname": "Watchdog", "severity": "none"},
+ Annotations: map[string]string{"summary": "An alert that should always be firing to certify that Alertmanager is working properly."},
+ State: "firing",
+ },
+ {
+ Labels: map[string]string{"alertname": "HighCPU", "severity": "critical"},
+ Annotations: map[string]string{"summary": "CPU usage is high"},
+ State: "firing",
+ },
+ },
+ },
+ }
+
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(http.StatusOK)
+ _ = json.NewEncoder(w).Encode(resp)
+ }))
+ defer server.Close()
+
+ conf := config{
+ PrometheusHosts: []string{strings.TrimPrefix(server.URL, "http://")},
+ PrometheusTimeoutS: 2,
+ }
+ s := state{checks: make(map[string]checkState)}
+
+ result := mergePrometheusAlerts(context.Background(), s, conf)
+
+ watchdog, ok := result.checks["Prometheus: Watchdog"]
+ if !ok {
+ t.Fatal("Watchdog check not found in state")
+ }
+
+ if watchdog.Status != nagiosOk {
+ t.Errorf("expected Watchdog status OK, got %v", watchdog.Status)
+ }
+
+ if !strings.Contains(watchdog.output, "working properly") {
+ t.Errorf("expected working properly message, got: %s", watchdog.output)
+ }
+
+ // Verify other alerts are still processed
+ cpu, ok := result.checks["Prometheus: HighCPU"]
+ if !ok {
+ t.Fatal("HighCPU check not found in state")
+ }
+ if cpu.Status != nagiosCritical {
+ t.Errorf("expected HighCPU status CRITICAL, got %v", cpu.Status)
+ }
+}
+
+func TestMergePrometheusAlertsWatchdogNotFiring(t *testing.T) {
+ resp := prometheusResponse{
+ Status: "success",
+ Data: struct {
+ Alerts []prometheusAlert `json:"alerts"`
+ }{
+ Alerts: []prometheusAlert{
+ {
+ Labels: map[string]string{"alertname": "HighCPU", "severity": "critical"},
+ Annotations: map[string]string{"summary": "CPU usage is high"},
+ State: "firing",
+ },
+ },
+ },
+ }
+
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(http.StatusOK)
+ _ = json.NewEncoder(w).Encode(resp)
+ }))
+ defer server.Close()
+
+ conf := config{
+ PrometheusHosts: []string{strings.TrimPrefix(server.URL, "http://")},
+ PrometheusTimeoutS: 2,
+ }
+ s := state{checks: make(map[string]checkState)}
+
+ result := mergePrometheusAlerts(context.Background(), s, conf)
+
+ watchdog, ok := result.checks["Prometheus: Watchdog"]
+ if !ok {
+ t.Fatal("Watchdog check not found in state")
+ }
+
+ if watchdog.Status != nagiosCritical {
+ t.Errorf("expected Watchdog status CRITICAL, got %v", watchdog.Status)
+ }
+
+ if !strings.Contains(watchdog.output, "not firing") {
+ t.Errorf("expected not firing message, got: %s", watchdog.output)
+ }
+}
diff --git a/report.txt b/report.txt
new file mode 100644
index 0000000..ea445f0
--- /dev/null
+++ b/report.txt
@@ -0,0 +1,27 @@
+GOGIOS Report [C:2 W:2 U:0 S:0 OK:1]
+
+This is the recent Gogios report!
+
+# Alerts with status changed:
+
+UNKNOWN->CRITICAL: Prometheus: KubeProxyDown: KubeProxyDown [critical]: Target disappeared from Prometheus target discovery.
+UNKNOWN->CRITICAL: Prometheus: KubeSchedulerDown: KubeSchedulerDown [critical]: Target disappeared from Prometheus target discovery.
+
+UNKNOWN->WARNING: Prometheus: TargetDown: TargetDown [warning]: One or more targets are unreachable.
+UNKNOWN->WARNING: Prometheus: Watchdog: Watchdog [none]: An alert that should always be firing to certify that Alertmanager is working properly.
+
+UNKNOWN->OK: localhost ping: OK: localhost is alive
+
+# Unhandled alerts:
+
+CRITICAL: Prometheus: KubeProxyDown: KubeProxyDown [critical]: Target disappeared from Prometheus target discovery.
+CRITICAL: Prometheus: KubeSchedulerDown: KubeSchedulerDown [critical]: Target disappeared from Prometheus target discovery.
+
+WARNING: Prometheus: TargetDown: TargetDown [warning]: One or more targets are unreachable.
+WARNING: Prometheus: Watchdog: Watchdog [none]: An alert that should always be firing to certify that Alertmanager is working properly.
+
+# Stale alerts:
+
+There are no stale alerts...
+
+Have a nice day!