summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--f3s/prometheus/additional-scrape-configs.yaml14
-rw-r--r--f3s/prometheus/manifests/additional-scrape-configs-secret.yaml14
-rw-r--r--f3s/prometheus/manifests/argocd-application-alerts.yaml90
3 files changed, 118 insertions, 0 deletions
diff --git a/f3s/prometheus/additional-scrape-configs.yaml b/f3s/prometheus/additional-scrape-configs.yaml
index 0ea8d69..9406c7f 100644
--- a/f3s/prometheus/additional-scrape-configs.yaml
+++ b/f3s/prometheus/additional-scrape-configs.yaml
@@ -17,3 +17,17 @@
static_configs:
- targets:
- 'pushgateway.monitoring.svc.cluster.local:9091'
+
+# Availability probe for the Radicale CalDAV/CardDAV server.
+# Radicale has no Prometheus metrics endpoint, so this job fetches the web UI and only the 'up' series is meaningful.
+# NOTE(review): a non-metrics (HTML) body may fail parsing and report up=0 - verify, or probe via blackbox exporter.
+- job_name: radicale
+ scrape_interval: 30s
+ scrape_timeout: 10s
+ metrics_path: '/.web/'
+ static_configs:
+ - targets:
+ - 'radicale-service.services.svc.cluster.local:80'
+ labels:
+ namespace: services
+ service: radicale
diff --git a/f3s/prometheus/manifests/additional-scrape-configs-secret.yaml b/f3s/prometheus/manifests/additional-scrape-configs-secret.yaml
index a2d9534..4f599e8 100644
--- a/f3s/prometheus/manifests/additional-scrape-configs-secret.yaml
+++ b/f3s/prometheus/manifests/additional-scrape-configs-secret.yaml
@@ -27,3 +27,17 @@ stringData:
static_configs:
- targets:
- 'pushgateway.monitoring.svc.cluster.local:9091'
+
+ # Availability probe for the Radicale CalDAV/CardDAV server.
+ # Radicale has no Prometheus metrics endpoint, so this job fetches the web UI and only the 'up' series is meaningful.
+ # NOTE(review): a non-metrics (HTML) body may fail parsing and report up=0 - verify, or probe via blackbox exporter.
+ - job_name: radicale
+ scrape_interval: 30s
+ scrape_timeout: 10s
+ metrics_path: '/.web/'
+ static_configs:
+ - targets:
+ - 'radicale-service.services.svc.cluster.local:80'
+ labels:
+ namespace: services
+ service: radicale
diff --git a/f3s/prometheus/manifests/argocd-application-alerts.yaml b/f3s/prometheus/manifests/argocd-application-alerts.yaml
new file mode 100644
index 0000000..d843347
--- /dev/null
+++ b/f3s/prometheus/manifests/argocd-application-alerts.yaml
@@ -0,0 +1,90 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: argocd-application-alerts
+ namespace: monitoring
+ labels:
+ release: prometheus  # presumably the label the Prometheus operator's rule selector matches - verify
+spec:
+ groups:
+ # Rule group for ArgoCD application health and sync state: alerting rules plus summary recording rules
+ - name: argocd-applications
+ interval: 30s  # evaluation interval applied to every rule in this group
+ rules:
+ # Alert when an ArgoCD application is unhealthy. Progressing is excluded: ArgoCDApplicationStuckProgressing covers it after 30m, so routine rollouts do not page as critical. NOTE(review): still overlaps ArgoCDApplicationDegraded.
+ - alert: ArgoCDApplicationUnhealthy
+ expr: argocd_app_info{health_status!~"Healthy|Progressing"} == 1
+ for: 5m
+ labels:
+ severity: critical
+ component: argocd
+ annotations:
+ summary: "ArgoCD application {{ $labels.name }} is unhealthy"
+ description: "ArgoCD application {{ $labels.name }} in namespace {{ $labels.dest_namespace }} has health status: {{ $labels.health_status }}. The application has been unhealthy for more than 5 minutes."
+ impact: "Service {{ $labels.name }} may be degraded or unavailable. Users may experience service interruptions."
+ action: "Check application status with: argocd app get {{ $labels.name }} --core\nCheck pods: kubectl get pods -n {{ $labels.dest_namespace }}\nView logs: kubectl logs -n {{ $labels.dest_namespace }} -l app={{ $labels.name }} --tail=100"
+ dashboard: "https://argocd.yourdomain.com/applications/{{ $labels.name }}"
+
+ # Warn when an application's sync status has been anything other than Synced for 10 minutes
+ - alert: ArgoCDApplicationOutOfSync
+ for: 10m
+ expr: argocd_app_info{sync_status!="Synced"} == 1
+ labels:
+ component: argocd
+ severity: warning
+ annotations:
+ summary: "ArgoCD application {{ $labels.name }} is out of sync"
+ impact: "Configuration drift detected. Running configuration may not match Git repository."
+ description: "ArgoCD application {{ $labels.name }} has sync status: {{ $labels.sync_status }}. The application has been out of sync for more than 10 minutes."
+ action: "Check sync status with: argocd app get {{ $labels.name }} --core\nManual sync: argocd app sync {{ $labels.name }} --core\nCheck diff: argocd app diff {{ $labels.name }} --core"
+
+ # Alert when syncs keep failing: counts sync operations that ended in phase Error or Failed, summed per application over the last 30m
+ - alert: ArgoCDApplicationSyncFailing
+ expr: sum without (phase) (increase(argocd_app_sync_total{phase=~"Error|Failed"}[30m])) > 3
+ for: 5m
+ labels:
+ severity: critical
+ component: argocd
+ annotations:
+ summary: "ArgoCD application {{ $labels.name }} sync is failing repeatedly"
+ description: "ArgoCD application {{ $labels.name }} has failed to sync {{ printf \"%.0f\" $value }} times in the last 30 minutes."
+ impact: "Application cannot be updated. New changes are not being deployed."
+ action: "Check sync errors: argocd app get {{ $labels.name }} --core\nView detailed logs: kubectl logs -n cicd -l app.kubernetes.io/name=argocd-application-controller --tail=200 | grep {{ $labels.name }}"
+
+ # Warn when an application has reported health status Progressing continuously for 30 minutes
+ - alert: ArgoCDApplicationStuckProgressing
+ for: 30m
+ expr: argocd_app_info{health_status="Progressing"} == 1
+ labels:
+ component: argocd
+ severity: warning
+ annotations:
+ summary: "ArgoCD application {{ $labels.name }} stuck in Progressing state"
+ impact: "Application deployment is taking longer than expected. New version may not be fully deployed."
+ description: "ArgoCD application {{ $labels.name }} has been in Progressing state for more than 30 minutes."
+ action: "Check application status: argocd app get {{ $labels.name }} --core\nCheck pods: kubectl get pods -n {{ $labels.dest_namespace }} -w\nCheck events: kubectl get events -n {{ $labels.dest_namespace }} --sort-by='.lastTimestamp'"
+
+ # Critical alert when health status stays Degraded, Suspended, Missing or Unknown for 5 minutes
+ - alert: ArgoCDApplicationDegraded
+ for: 5m
+ expr: argocd_app_info{health_status=~"Degraded|Suspended|Missing|Unknown"} == 1
+ labels:
+ component: argocd
+ severity: critical
+ annotations:
+ summary: "ArgoCD application {{ $labels.name }} is degraded"
+ impact: "Application is in a critical state and requires immediate attention."
+ description: "ArgoCD application {{ $labels.name }} has critical health status: {{ $labels.health_status }}."
+ action: "Check application status: argocd app get {{ $labels.name }} --core\nReview ArgoCD logs: kubectl logs -n cicd -l app.kubernetes.io/name=argocd-server --tail=100"
+
+ # Recording rule: count of unhealthy applications; falls back to 0 because count() over no matching series yields no sample
+ - record: argocd:applications:unhealthy:count
+ expr: count(argocd_app_info{health_status!="Healthy"}) or vector(0)
+
+ # Recording rule: count of out-of-sync applications; falls back to 0 because count() over no matching series yields no sample
+ - record: argocd:applications:outof_sync:count
+ expr: count(argocd_app_info{sync_status!="Synced"}) or vector(0)
+
+ # Recording rule: total number of applications, i.e. the number of argocd_app_info series; produces no sample when none exist
+ - record: argocd:applications:total:count
+ expr: count(argocd_app_info)