blob: d84334743c3599defa8ac5a4626b3539663df632 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
---
# PrometheusRule (Prometheus Operator CRD) with alerting and recording rules
# for Argo CD applications. All rules are built on two Argo CD metrics:
#   - argocd_app_info        (per-application series carrying health_status /
#                             sync_status labels)
#   - argocd_app_sync_total  (counter of sync operations, labelled by phase)
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: argocd-application-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably this label is what the Prometheus CR's
    # ruleSelector matches on - confirm against the operator installation.
    release: prometheus
spec:
  groups:
    # ArgoCD application health and sync monitoring
    - name: argocd-applications
      # Every rule in this group is evaluated every 30 seconds.
      interval: 30s
      rules:
        # Catch-all alert: the application is neither Healthy nor Progressing.
        # "Progressing" is excluded on purpose: it has its own alert below
        # (ArgoCDApplicationStuckProgressing, warning after 30m); matching it
        # here would page as critical for any rollout longer than 5 minutes.
        # NOTE(review): for Degraded/Suspended/Missing/Unknown this alert and
        # ArgoCDApplicationDegraded both fire; consider an Alertmanager
        # inhibit rule if the duplicate notification is unwanted.
        - alert: ArgoCDApplicationUnhealthy
          expr: argocd_app_info{health_status!~"Healthy|Progressing"} == 1
          for: 5m
          labels:
            severity: critical
            component: argocd
          annotations:
            summary: "ArgoCD application {{ $labels.name }} is unhealthy"
            description: >-
              ArgoCD application {{ $labels.name }} in namespace
              {{ $labels.dest_namespace }} has health status:
              {{ $labels.health_status }}. The application has been unhealthy
              for more than 5 minutes.
            impact: >-
              Service {{ $labels.name }} may be degraded or unavailable.
              Users may experience service interruptions.
            # NOTE(review): the "-l app=<name>" selector assumes workloads
            # carry an "app" label equal to the Argo CD application name.
            action: |-
              Check application status with: argocd app get {{ $labels.name }} --core
              Check pods: kubectl get pods -n {{ $labels.dest_namespace }}
              View logs: kubectl logs -n {{ $labels.dest_namespace }} -l app={{ $labels.name }} --tail=100
            # NOTE(review): placeholder hostname - replace with the real
            # Argo CD URL for this environment.
            dashboard: "https://argocd.yourdomain.com/applications/{{ $labels.name }}"

        # Alert when any ArgoCD application has a sync status other than
        # "Synced" for 10 minutes.
        - alert: ArgoCDApplicationOutOfSync
          expr: argocd_app_info{sync_status!="Synced"} == 1
          for: 10m
          labels:
            severity: warning
            component: argocd
          annotations:
            summary: "ArgoCD application {{ $labels.name }} is out of sync"
            description: >-
              ArgoCD application {{ $labels.name }} has sync status:
              {{ $labels.sync_status }}. The application has been out of sync
              for more than 10 minutes.
            impact: >-
              Configuration drift detected. Running configuration may not
              match Git repository.
            action: |-
              Check sync status with: argocd app get {{ $labels.name }} --core
              Manual sync: argocd app sync {{ $labels.name }} --core
              Check diff: argocd app diff {{ $labels.name }} --core

        # Alert when an application's syncs fail repeatedly: more than 3
        # unsuccessful sync operations within 30 minutes.
        # Both unsuccessful operation phases are counted ("Error" and
        # "Failed"), and they are summed per application so the threshold
        # applies to the combined number rather than to each phase series
        # separately.
        - alert: ArgoCDApplicationSyncFailing
          expr: |-
            sum by (namespace, project, name, dest_namespace) (
              increase(argocd_app_sync_total{phase=~"Error|Failed"}[30m])
            ) > 3
          for: 5m
          labels:
            severity: critical
            component: argocd
          annotations:
            summary: "ArgoCD application {{ $labels.name }} sync is failing repeatedly"
            # increase() extrapolates and yields a non-integer value, so it
            # is rounded for display.
            description: >-
              ArgoCD application {{ $labels.name }} has failed to sync
              {{ printf "%.0f" $value }} times in the last 30 minutes.
            impact: >-
              Application cannot be updated. New changes are not being
              deployed.
            # NOTE(review): assumes Argo CD runs in the "cicd" namespace.
            action: |-
              Check sync errors: argocd app get {{ $labels.name }} --core
              View detailed logs: kubectl logs -n cicd -l app.kubernetes.io/name=argocd-application-controller --tail=200 | grep {{ $labels.name }}

        # Alert when application has been progressing for too long
        # (health status "Progressing" continuously for 30 minutes).
        - alert: ArgoCDApplicationStuckProgressing
          expr: argocd_app_info{health_status="Progressing"} == 1
          for: 30m
          labels:
            severity: warning
            component: argocd
          annotations:
            summary: "ArgoCD application {{ $labels.name }} stuck in Progressing state"
            description: >-
              ArgoCD application {{ $labels.name }} has been in Progressing
              state for more than 30 minutes.
            impact: >-
              Application deployment is taking longer than expected. New
              version may not be fully deployed.
            action: |-
              Check application status: argocd app get {{ $labels.name }} --core
              Check pods: kubectl get pods -n {{ $labels.dest_namespace }} -w
              Check events: kubectl get events -n {{ $labels.dest_namespace }} --sort-by='.lastTimestamp'

        # Alert for critical application failures
        # (Degraded, Suspended, Missing, Unknown)
        - alert: ArgoCDApplicationDegraded
          expr: argocd_app_info{health_status=~"Degraded|Suspended|Missing|Unknown"} == 1
          for: 5m
          labels:
            severity: critical
            component: argocd
          annotations:
            summary: "ArgoCD application {{ $labels.name }} is degraded"
            description: >-
              ArgoCD application {{ $labels.name }} has critical health
              status: {{ $labels.health_status }}.
            impact: >-
              Application is in a critical state and requires immediate
              attention.
            # NOTE(review): assumes Argo CD runs in the "cicd" namespace.
            action: |-
              Check application status: argocd app get {{ $labels.name }} --core
              Review ArgoCD logs: kubectl logs -n cicd -l app.kubernetes.io/name=argocd-server --tail=100

        # Recording rule: count of unhealthy applications (every health
        # status other than "Healthy", including "Progressing").
        # count() over an empty selection returns no sample at all, so
        # "or vector(0)" makes the series report 0 when nothing matches.
        - record: argocd:applications:unhealthy:count
          expr: count(argocd_app_info{health_status!="Healthy"}) or vector(0)

        # Recording rule: count of out-of-sync applications; same
        # "or vector(0)" fallback so an all-synced fleet records 0.
        - record: argocd:applications:outof_sync:count
          expr: count(argocd_app_info{sync_status!="Synced"}) or vector(0)

        # Recording rule: total applications. Deliberately left without a
        # zero fallback: an absent series here indicates that no
        # argocd_app_info samples exist at all.
        - record: argocd:applications:total:count
          expr: count(argocd_app_info)
|