summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2026-01-08 21:41:29 +0200
committerPaul Buetow <paul@buetow.org>2026-01-08 21:41:29 +0200
commita10cbd4e27d944464cec88aaf49d8b8c354d26e1 (patch)
tree0bdd0a23fbb8939c15544b857c74101cb5721a6c
parentf5cffe240c44045684d4f74981235b060828550e (diff)
Add Prometheus alert scraping with configurable timeout and host failover
-rw-r--r--AGENTS.md73
-rw-r--r--internal/config.go8
-rw-r--r--internal/prometheus.go131
-rw-r--r--internal/prometheus_test.go159
-rw-r--r--internal/run.go1
5 files changed, 371 insertions, 1 deletions
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..ed5e50a
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,73 @@
+# Gogios - Agent Guidelines
+
+## Project Overview
+
+Gogios is a lightweight, minimalistic monitoring tool written in Go. It executes Nagios/Icinga monitoring plugins and sends email notifications on status changes.
+
+## Commands
+
+### Build
+```bash
+mage build # Build the gogios binary
+go build -o gogios cmd/gogios/main.go # Alternative without mage
+```
+
+### Development Build (with race detector)
+```bash
+mage dev # Runs vet + lint, then builds with -race
+```
+
+### Test
+```bash
+mage test # Run all unit tests (clears test cache first)
+go test ./... # Alternative without mage
+```
+
+### Lint & Vet
+```bash
+mage vet # Run go vet
+mage lint # Run golangci-lint
+mage lintInstall # Install golangci-lint
+```
+
+### Cross-compile for OpenBSD
+```bash
+mage openbsd # Build and deploy for OpenBSD
+mage buildOpenbsd # Build only
+```
+
+## Project Structure
+
+```
+cmd/gogios/ # Main entry point
+internal/ # Core implementation
+ check.go # Check execution logic
+ config.go # Configuration parsing
+ dependency.go # Check dependency handling
+ federated.go # Federated monitoring
+ html.go # HTML report generation
+ nagioscode.go # Nagios exit code handling
+ notify.go # Email notification
+ prometheus.go # Prometheus alert scraping
+ run.go # Main run logic
+ runchecks.go # Check orchestration
+ state.go # State persistence
+```
+
+## Code Conventions
+
+- Go 1.24+
+- Use standard Go formatting (`gofmt`)
+- Tests use the standard `testing` package with `*_test.go` suffix
+- Packages under `internal/` cannot be imported from outside this module
+- Module path: `codeberg.org/snonux/gogios`
+
+## Testing
+
+Tests exist in `internal/` with the `*_test.go` naming convention:
+- `federated_test.go`
+- `html_test.go`
+- `prometheus_test.go`
+- `state_test.go`
+
+Run tests before committing changes.
+
+For best practices, also follow `~/git/conf/snippets/go/go-projects/go-projects.md` if it is present.
diff --git a/internal/config.go b/internal/config.go
index 2ade802..c129ab7 100644
--- a/internal/config.go
+++ b/internal/config.go
@@ -20,7 +20,9 @@ type config struct {
CheckConcurrency int
StaleThreshold int `json:"StaleThreshold,omitempty"`
Federated []string `json:"Federated,omitempty"` // TODO: Document this option
- Checks map[string]check
+ PrometheusHosts []string `json:"PrometheusHosts,omitempty"`
+ PrometheusTimeoutS int `json:"PrometheusTimeoutS,omitempty"`
+ Checks map[string]check
}
func newConfig(configFile string) (config, error) {
@@ -60,6 +62,10 @@ func newConfig(configFile string) (config, error) {
conf.StaleThreshold = 3600 // Default to 1 hour
}
+ if conf.PrometheusTimeoutS == 0 {
+ conf.PrometheusTimeoutS = 2 // Default to 2 seconds
+ }
+
if !conf.HTMLDisable && conf.HTMLStatusFile == "" {
conf.HTMLStatusFile = "/var/www/htdocs/buetow.org/self/gogios/index.html"
log.Println("Set HTMLStatusFile to " + conf.HTMLStatusFile)
diff --git a/internal/prometheus.go b/internal/prometheus.go
new file mode 100644
index 0000000..1c06aaf
--- /dev/null
+++ b/internal/prometheus.go
@@ -0,0 +1,131 @@
+package internal
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "log"
+ "net/http"
+ "time"
+)
+
+// prometheusResponse is the JSON envelope decoded from the /api/v1/alerts
+// endpoint. Only the fields this package reads are declared.
+type prometheusResponse struct {
+ // Status is compared against "success" by fetchFromHost.
+ Status string `json:"status"`
+ // Data wraps the list of alerts returned by the endpoint.
+ Data struct {
+ Alerts []prometheusAlert `json:"alerts"`
+ } `json:"data"`
+}
+
+// prometheusAlert is one entry of the "alerts" array in a prometheusResponse.
+type prometheusAlert struct {
+ // Labels supplies "alertname" and "severity" to mergePrometheusAlerts.
+ Labels map[string]string `json:"labels"`
+ // Annotations supplies "summary", falling back to "description", as the
+ // human-readable text of the resulting check.
+ Annotations map[string]string `json:"annotations"`
+ // State is matched against "firing"; alerts in any other state are skipped.
+ State string `json:"state"`
+}
+
+// mergePrometheusAlerts fetches alerts from the first reachable host in
+// conf.PrometheusHosts and records every firing alert in state as a check
+// result named "Prometheus: <alertname>".
+//
+// With no hosts configured, state is returned untouched. When every host
+// fails, a single nagiosCritical result named "Prometheus alerts" carrying the
+// error is recorded instead. An alert labelled severity "critical" is recorded
+// as nagiosCritical; any other (or missing) severity as nagiosWarning.
+func mergePrometheusAlerts(ctx context.Context, state state, conf config) state {
+	if len(conf.PrometheusHosts) == 0 {
+		return state
+	}
+
+	timeout := time.Duration(conf.PrometheusTimeoutS) * time.Second
+	alerts, host, err := fetchPrometheusAlerts(ctx, conf.PrometheusHosts, timeout)
+	if err != nil {
+		log.Printf("Failed to fetch Prometheus alerts from any host: %v", err)
+		cs := checkResult{
+			name:   "Prometheus alerts",
+			output: fmt.Sprintf("CRITICAL: %v", err),
+			epoch:  time.Now().Unix(),
+			status: nagiosCritical,
+		}
+		state.update(cs)
+		return state
+	}
+
+	// Several alert instances can share one alertname and therefore one check
+	// name. Remember which names were already recorded as critical so that a
+	// later warning instance does not overwrite (and hide) the critical one.
+	// NOTE(review): assumes state.update keys results by name — confirm.
+	criticalSeen := make(map[string]bool)
+	firing := 0
+
+	for _, alert := range alerts {
+		if alert.State != "firing" {
+			continue
+		}
+		firing++
+
+		name := fmt.Sprintf("Prometheus: %s", alert.Labels["alertname"])
+		severity := alert.Labels["severity"]
+
+		// Prefer the short summary; fall back to the description annotation.
+		description := alert.Annotations["summary"]
+		if description == "" {
+			description = alert.Annotations["description"]
+		}
+		if description == "" {
+			description = "no description"
+		}
+
+		status := nagiosWarning
+		if severity == "critical" {
+			status = nagiosCritical
+			criticalSeen[name] = true
+		} else if criticalSeen[name] {
+			continue
+		}
+
+		cs := checkResult{
+			name:   name,
+			output: fmt.Sprintf("%s [%s]: %s", alert.Labels["alertname"], severity, description),
+			epoch:  time.Now().Unix(),
+			status: status,
+		}
+		state.update(cs)
+	}
+
+	// The endpoint also returns non-firing alerts, so report both numbers
+	// rather than labelling the raw total as "firing".
+	log.Printf("Fetched %d alerts (%d firing) from Prometheus host %s", len(alerts), firing, host)
+
+	return state
+}
+
+// fetchPrometheusAlerts queries hosts in order and returns the alerts of the
+// first host that answers successfully, together with that host's name.
+// Each attempt is bounded by timeout.
+//
+// It returns an error when hosts is empty, when ctx is already done before an
+// attempt starts, or when every host failed (wrapping the last failure).
+func fetchPrometheusAlerts(ctx context.Context, hosts []string, timeout time.Duration) ([]prometheusAlert, string, error) {
+	// Without this guard the final error would wrap a nil error and render
+	// as "%!w(<nil>)".
+	if len(hosts) == 0 {
+		return nil, "", fmt.Errorf("no Prometheus hosts configured")
+	}
+
+	var lastErr error
+
+	for _, host := range hosts {
+		// Once the parent context is done every further attempt is doomed;
+		// stop instead of logging one failure per remaining host.
+		if err := ctx.Err(); err != nil {
+			return nil, "", fmt.Errorf("fetching Prometheus alerts aborted: %w", err)
+		}
+
+		alerts, err := fetchFromHost(ctx, host, timeout)
+		if err != nil {
+			log.Printf("Failed to fetch from Prometheus host %s: %v", host, err)
+			lastErr = err
+			continue
+		}
+		return alerts, host, nil
+	}
+
+	return nil, "", fmt.Errorf("all Prometheus hosts failed, last error: %w", lastErr)
+}
+
+// fetchFromHost performs a GET on http://<host>/api/v1/alerts, bounded by
+// timeout, and returns the alerts contained in the response.
+//
+// An error is returned when the request cannot be built or completed, the
+// HTTP status is not 200, the body cannot be read or is larger than the
+// internal size cap, the body is not valid JSON, or the decoded "status"
+// field is not "success".
+func fetchFromHost(ctx context.Context, host string, timeout time.Duration) ([]prometheusAlert, error) {
+	// Cap on the number of body bytes read, so that a misbehaving endpoint
+	// cannot make the process buffer an arbitrarily large response.
+	const maxBodyBytes = 16 << 20
+
+	url := fmt.Sprintf("http://%s/api/v1/alerts", host)
+
+	ctx, cancel := context.WithTimeout(ctx, timeout)
+	defer cancel()
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("request failed: %w", err)
+	}
+	// A close error on a body that is only read from carries no useful information.
+	defer func() { _ = resp.Body.Close() }()
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
+	}
+
+	// Read one byte beyond the cap so an oversized body is reported as such
+	// instead of surfacing as a confusing JSON syntax error.
+	body, err := io.ReadAll(io.LimitReader(resp.Body, maxBodyBytes+1))
+	if err != nil {
+		return nil, fmt.Errorf("failed to read response: %w", err)
+	}
+	if len(body) > maxBodyBytes {
+		return nil, fmt.Errorf("response body exceeds %d bytes", maxBodyBytes)
+	}
+
+	var promResp prometheusResponse
+	if err := json.Unmarshal(body, &promResp); err != nil {
+		return nil, fmt.Errorf("failed to parse response: %w", err)
+	}
+
+	if promResp.Status != "success" {
+		return nil, fmt.Errorf("prometheus returned status: %s", promResp.Status)
+	}
+
+	return promResp.Data.Alerts, nil
+}
diff --git a/internal/prometheus_test.go b/internal/prometheus_test.go
new file mode 100644
index 0000000..8d98e7f
--- /dev/null
+++ b/internal/prometheus_test.go
@@ -0,0 +1,159 @@
+package internal
+
+import (
+ "context"
+ "encoding/json"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "testing"
+ "time"
+)
+
+func TestFetchFromHost(t *testing.T) {
+ tests := []struct {
+ name string
+ response prometheusResponse
+ statusCode int
+ wantAlertCount int
+ wantErr bool
+ }{
+ {
+ name: "successful response with firing alerts",
+ response: prometheusResponse{
+ Status: "success",
+ Data: struct {
+ Alerts []prometheusAlert `json:"alerts"`
+ }{
+ Alerts: []prometheusAlert{
+ {
+ Labels: map[string]string{"alertname": "HighCPU", "severity": "critical"},
+ Annotations: map[string]string{"summary": "CPU usage is high"},
+ State: "firing",
+ },
+ {
+ Labels: map[string]string{"alertname": "DiskSpace", "severity": "warning"},
+ Annotations: map[string]string{"summary": "Disk space low"},
+ State: "firing",
+ },
+ },
+ },
+ },
+ statusCode: http.StatusOK,
+ wantAlertCount: 2,
+ wantErr: false,
+ },
+ {
+ name: "empty alerts",
+ response: prometheusResponse{
+ Status: "success",
+ Data: struct {
+ Alerts []prometheusAlert `json:"alerts"`
+ }{
+ Alerts: []prometheusAlert{},
+ },
+ },
+ statusCode: http.StatusOK,
+ wantAlertCount: 0,
+ wantErr: false,
+ },
+ {
+ name: "server error",
+ response: prometheusResponse{},
+ statusCode: http.StatusInternalServerError,
+ wantAlertCount: 0,
+ wantErr: true,
+ },
+ {
+ name: "prometheus error status",
+ response: prometheusResponse{
+ Status: "error",
+ },
+ statusCode: http.StatusOK,
+ wantAlertCount: 0,
+ wantErr: true,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ if r.URL.Path != "/api/v1/alerts" {
+ t.Errorf("unexpected path: %s", r.URL.Path)
+ }
+ w.WriteHeader(tt.statusCode)
+ _ = json.NewEncoder(w).Encode(tt.response)
+ }))
+ defer server.Close()
+
+ host := strings.TrimPrefix(server.URL, "http://")
+ alerts, err := fetchFromHost(context.Background(), host, 2*time.Second)
+
+ if tt.wantErr && err == nil {
+ t.Error("expected error, got nil")
+ }
+ if !tt.wantErr && err != nil {
+ t.Errorf("unexpected error: %v", err)
+ }
+ if len(alerts) != tt.wantAlertCount {
+ t.Errorf("got %d alerts, want %d", len(alerts), tt.wantAlertCount)
+ }
+ })
+ }
+}
+
+func TestFetchPrometheusAlertsFailover(t *testing.T) {
+ callCount := 0
+
+ server1 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ callCount++
+ time.Sleep(3 * time.Second) // exceed timeout
+ }))
+ defer server1.Close()
+
+ server2 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ callCount++
+ resp := prometheusResponse{
+ Status: "success",
+ Data: struct {
+ Alerts []prometheusAlert `json:"alerts"`
+ }{
+ Alerts: []prometheusAlert{
+ {
+ Labels: map[string]string{"alertname": "Test"},
+ State: "firing",
+ },
+ },
+ },
+ }
+ _ = json.NewEncoder(w).Encode(resp)
+ }))
+ defer server2.Close()
+
+ hosts := []string{
+ strings.TrimPrefix(server1.URL, "http://"),
+ strings.TrimPrefix(server2.URL, "http://"),
+ }
+
+ alerts, host, err := fetchPrometheusAlerts(context.Background(), hosts, 2*time.Second)
+ if err != nil {
+ t.Errorf("unexpected error: %v", err)
+ }
+ if len(alerts) != 1 {
+ t.Errorf("got %d alerts, want 1", len(alerts))
+ }
+ if host != hosts[1] {
+ t.Errorf("got host %s, want %s", host, hosts[1])
+ }
+}
+
+// TestMergePrometheusAlertsNoHosts checks that mergePrometheusAlerts adds no
+// checks to the state when the configuration lists no Prometheus hosts.
+func TestMergePrometheusAlertsNoHosts(t *testing.T) {
+	empty := state{checks: make(map[string]checkState)}
+
+	merged := mergePrometheusAlerts(context.Background(), empty, config{PrometheusHosts: nil})
+
+	if n := len(merged.checks); n != 0 {
+		t.Errorf("expected no checks, got %d", n)
+	}
+}
diff --git a/internal/run.go b/internal/run.go
index eed8ad5..f45f937 100644
--- a/internal/run.go
+++ b/internal/run.go
@@ -23,6 +23,7 @@ func Run(ctx context.Context, configFile string, renotify, force bool) {
}
state = runChecks(ctx, state, conf)
+ state = mergePrometheusAlerts(ctx, state, conf)
state = mergeFederated(ctx, state, conf)
if err := state.persist(); err != nil {