summaryrefslogtreecommitdiff
path: root/f3s/prometheus/manifests/argocd-application-alerts.yaml
blob: d84334743c3599defa8ac5a4626b3539663df632 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# PrometheusRule custom resource holding the alerting and recording rules
# for ArgoCD applications defined in the "argocd-applications" group below.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: argocd-application-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably matches the ruleSelector of the Prometheus
    # instance so that this object is picked up -- confirm against the
    # Prometheus custom resource.
    release: prometheus
spec:
  groups:
    # ArgoCD application health and sync monitoring
    - name: argocd-applications
      # Every rule in this group is evaluated every 30 seconds.
      interval: 30s
      rules:
        # Alert when an ArgoCD application reports a non-healthy state for
        # more than 5 minutes.
        # "Progressing" is excluded on purpose: it is the expected state during
        # a rollout, and ArgoCDApplicationStuckProgressing already covers
        # applications that remain in it for too long. Without the exclusion,
        # any rollout taking longer than 5 minutes raises a critical alert.
        - alert: ArgoCDApplicationUnhealthy
          expr: argocd_app_info{health_status!~"Healthy|Progressing"} == 1
          for: 5m
          labels:
            severity: critical
            component: argocd
          annotations:
            summary: "ArgoCD application {{ $labels.name }} is unhealthy"
            description: >-
              ArgoCD application {{ $labels.name }} in namespace
              {{ $labels.dest_namespace }} has health status:
              {{ $labels.health_status }}. The application has been unhealthy
              for more than 5 minutes.
            impact: "Service {{ $labels.name }} may be degraded or unavailable. Users may experience service interruptions."
            # Literal block scalar: one troubleshooting command per line.
            action: |-
              Check application status with: argocd app get {{ $labels.name }} --core
              Check pods: kubectl get pods -n {{ $labels.dest_namespace }}
              View logs: kubectl logs -n {{ $labels.dest_namespace }} -l app={{ $labels.name }} --tail=100
            # NOTE(review): "argocd.yourdomain.com" looks like a placeholder
            # host -- confirm and replace with the real ArgoCD URL.
            dashboard: "https://argocd.yourdomain.com/applications/{{ $labels.name }}"

        # Warn when an application's sync status has been anything other than
        # "Synced" for 10 minutes.
        - alert: ArgoCDApplicationOutOfSync
          for: 10m
          expr: 'argocd_app_info{sync_status!="Synced"} == 1'
          labels:
            component: argocd
            severity: warning
          annotations:
            summary: 'ArgoCD application {{ $labels.name }} is out of sync'
            description: >-
              ArgoCD application {{ $labels.name }} has sync status:
              {{ $labels.sync_status }}. The application has been out of sync
              for more than 10 minutes.
            impact: >-
              Configuration drift detected. Running configuration may not
              match Git repository.
            # One remediation command per line.
            action: |-
              Check sync status with: argocd app get {{ $labels.name }} --core
              Manual sync: argocd app sync {{ $labels.name }} --core
              Check diff: argocd app diff {{ $labels.name }} --core

        # Alert when an application's sync operations keep failing: more than
        # 3 unsuccessful syncs within 30 minutes, sustained for 5 minutes.
        # Both terminal failure phases ("Error" and "Failed") are counted, and
        # the per-phase series are summed so a mix of the two still trips the
        # threshold. "sum without (phase)" keeps every other label (name,
        # namespace, ...) available for routing and templating.
        - alert: ArgoCDApplicationSyncFailing
          expr: sum without (phase) (increase(argocd_app_sync_total{phase=~"Error|Failed"}[30m])) > 3
          for: 5m
          labels:
            severity: critical
            component: argocd
          annotations:
            summary: "ArgoCD application {{ $labels.name }} sync is failing repeatedly"
            # increase() extrapolates and can return fractional values, so the
            # count is rendered without decimals.
            description: >-
              ArgoCD application {{ $labels.name }} has failed to sync
              {{ printf "%.0f" $value }} times in the last 30 minutes.
            impact: "Application cannot be updated. New changes are not being deployed."
            action: |-
              Check sync errors: argocd app get {{ $labels.name }} --core
              View detailed logs: kubectl logs -n cicd -l app.kubernetes.io/name=argocd-application-controller --tail=200 | grep {{ $labels.name }}

        # Warn when an application has reported health status "Progressing"
        # continuously for 30 minutes.
        - alert: ArgoCDApplicationStuckProgressing
          for: 30m
          expr: 'argocd_app_info{health_status="Progressing"} == 1'
          labels:
            component: argocd
            severity: warning
          annotations:
            summary: 'ArgoCD application {{ $labels.name }} stuck in Progressing state'
            description: >-
              ArgoCD application {{ $labels.name }} has been in Progressing
              state for more than 30 minutes.
            impact: >-
              Application deployment is taking longer than expected. New
              version may not be fully deployed.
            # One diagnostic command per line.
            action: |-
              Check application status: argocd app get {{ $labels.name }} --core
              Check pods: kubectl get pods -n {{ $labels.dest_namespace }} -w
              Check events: kubectl get events -n {{ $labels.dest_namespace }} --sort-by='.lastTimestamp'

        # Critical alert for applications whose health status is Degraded,
        # Suspended, Missing or Unknown for 5 minutes.
        - alert: ArgoCDApplicationDegraded
          for: 5m
          expr: 'argocd_app_info{health_status=~"Degraded|Suspended|Missing|Unknown"} == 1'
          labels:
            component: argocd
            severity: critical
          annotations:
            summary: 'ArgoCD application {{ $labels.name }} is degraded'
            description: >-
              ArgoCD application {{ $labels.name }} has critical health status:
              {{ $labels.health_status }}.
            impact: >-
              Application is in a critical state and requires immediate
              attention.
            # One diagnostic command per line.
            action: |-
              Check application status: argocd app get {{ $labels.name }} --core
              Review ArgoCD logs: kubectl logs -n cicd -l app.kubernetes.io/name=argocd-server --tail=100

        # Recording rule: count of applications whose health status is not
        # "Healthy".
        # count() over an empty selection yields no sample at all, so
        # "or vector(0)" makes the series report 0 when every application is
        # healthy instead of disappearing.
        # NOTE(review): this also reports 0 when argocd_app_info is not being
        # scraped at all; monitor scrape health separately.
        - record: argocd:applications:unhealthy:count
          expr: count(argocd_app_info{health_status!="Healthy"}) or vector(0)

        # Recording rule: count of applications whose sync status is not
        # "Synced".
        # count() over an empty selection yields no sample at all, so
        # "or vector(0)" makes the series report 0 when every application is
        # in sync instead of disappearing.
        # NOTE(review): this also reports 0 when argocd_app_info is not being
        # scraped at all; monitor scrape health separately.
        - record: argocd:applications:outof_sync:count
          expr: count(argocd_app_info{sync_status!="Synced"}) or vector(0)

        # Recording rule: number of argocd_app_info series, i.e. all
        # applications regardless of health or sync status.
        - record: 'argocd:applications:total:count'
          expr: 'count(argocd_app_info)'