# Argo CD Application "prometheus" (lives in the "cicd" namespace).
# It combines two sources and deploys them into the "monitoring" namespace:
#   1. the upstream kube-prometheus-stack Helm chart, configured inline
#      through helm.valuesObject;
#   2. additional manifests from the internal conf.git repository.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: prometheus
  namespace: cicd
  finalizers:
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  sources:
    # Source 1: Upstream Helm chart from prometheus-community
    - repoURL: https://prometheus-community.github.io/helm-charts
      chart: kube-prometheus-stack
      targetRevision: 55.5.0
      helm:
        releaseName: prometheus
        valuesObject:
          # Control-plane components are scraped via explicit node endpoints.
          kubeEtcd:
            enabled: true
            endpoints:
              - 192.168.2.120
              - 192.168.2.121
              - 192.168.2.122
            service:
              enabled: true
              port: 2381
              targetPort: 2381
          kubeControllerManager:
            enabled: true
            endpoints:
              - 192.168.2.120
              - 192.168.2.121
              - 192.168.2.122
            service:
              enabled: true
              port: 10257
              targetPort: 10257
            serviceMonitor:
              enabled: true
              https: true
              insecureSkipVerify: true
          # k3s embeds kube-proxy and kube-scheduler into the main process
          # Disable these components to prevent false-positive alerts
          kubeProxy:
            enabled: false
          kubeScheduler:
            enabled: false
          # Disable alert rules for components not available in k3s
          defaultRules:
            rules:
              kubeProxy: false
              kubeSchedulerAlerting: false
              kubeSchedulerRecording: false
              kubeScheduler: false
          # Enable the textfile collector in the node_exporter DaemonSet so
          # check-nfs-mount.sh can expose nfs_mount_monitor_consecutive_failures
          # as a Prometheus metric without needing a separate exporter.
          # extraHostVolumeMounts is the prometheus-node-exporter sub-chart key
          # for mounting host paths into the container; extraArgs adds the
          # --collector.textfile.directory flag to enable the textfile scraper.
          prometheus-node-exporter:
            extraArgs:
              - --collector.textfile.directory=/host/textfile_collector
            extraHostVolumeMounts:
              - name: textfile-collector
                hostPath: /var/lib/node_exporter/textfile_collector
                mountPath: /host/textfile_collector
                readOnly: true
                mountPropagation: None
          prometheus:
            prometheusSpec:
              # NOTE(review): web.enable-admin-api turns on Prometheus' TSDB
              # admin endpoints (delete series, snapshots) — confirm that
              # access to the Prometheus service is restricted.
              additionalArgs:
                - name: web.enable-remote-write-receiver
                  value: ""
                - name: web.enable-admin-api
                  value: ""
              enableFeatures:
                - exemplar-storage
                - otlp-write-receiver
              # Inline scrape configs are left empty; extra scrape configs
              # are referenced through additionalScrapeConfigsSecret below.
              additionalScrapeConfigs: []
              tsdb:
                outOfOrderTimeWindow: 744h  # 31 days
              additionalScrapeConfigsSecret:
                enabled: true
                name: additional-scrape-configs
                key: additional-scrape-configs.yaml
              storageSpec:
                volumeClaimTemplate:
                  spec:
                    # Empty storage class plus a label selector; presumably
                    # binds to a pre-created PersistentVolume carrying these
                    # labels — TODO confirm against the manifests in source 2.
                    storageClassName: ""
                    accessModes: ["ReadWriteOnce"]
                    resources:
                      requests:
                        storage: 10Gi
                    selector:
                      matchLabels:
                        type: local
                        app: prometheus
          alertmanager:
            config:
              global:
                resolve_timeout: 5m
              route:
                group_by: ['alertname', 'namespace', 'severity']
                group_wait: 30s
                group_interval: 5m
                repeat_interval: 12h
                receiver: 'default'
                routes:
                  # Watchdog alert is just for testing, suppress it
                  - matchers:
                      - alertname = "Watchdog"
                    receiver: 'null'
                  # ArgoCD application alerts - high priority
                  - matchers:
                      - component = "argocd"
                    receiver: 'argocd-alerts'
                    group_by: ['alertname', 'name', 'severity']
                    group_wait: 10s
                    repeat_interval: 6h
                  # NFS auto-repair alerts from r0/r1/r2 — short group_wait so
                  # operators are notified quickly when the mount breaks.
                  # repeat_interval is short: NFS outages are urgent and
                  # the auto-reboot cycle takes only ~30 s per node.
                  - matchers:
                      - component = "nfs"
                    receiver: 'nfs-alerts'
                    group_by: ['alertname', 'host', 'severity']
                    group_wait: 10s
                    repeat_interval: 30m
                  # Container image CVEs from Trivy Operator
                  # (see trivy-operator ArgoCD app)
                  - matchers:
                      - component = "trivy"
                    receiver: 'trivy-alerts'
                    group_by: ['alertname', 'namespace', 'severity']
                    group_wait: 2m
                    repeat_interval: 24h
              receivers:
                # None of the receivers below define a notification
                # integration, so alerts routed to them are only visible in
                # the Alertmanager UI.
                - name: 'null'
                - name: 'default'
                  # Default receiver - alerts visible in UI only
                - name: 'argocd-alerts'
                  # ArgoCD-specific receiver - alerts visible in UI only
                  # Future: add email/slack/webhook configuration here
                - name: 'nfs-alerts'
                  # NFS auto-repair alerts — visible in Alertmanager UI
                  # Future: add webhook/email for on-call paging here
                - name: 'trivy-alerts'
                  # Trivy Operator CVE alerts - visible in Alertmanager UI;
                  # add webhook/email when desired
              inhibit_rules:
                # Inhibit info alerts if warning or critical is firing
                - source_matchers:
                    - severity = "critical"
                  target_matchers:
                    - severity =~ "warning|info"
                  equal: ['namespace', 'alertname']
                - source_matchers:
                    - severity = "warning"
                  target_matchers:
                    - severity = "info"
                  equal: ['namespace', 'alertname']
          grafana:
            # Disabled: SQLite-on-NFS is unreliable across restarts (lock
            # state cannot be reacquired cleanly), and Loki + Tempo are
            # also disabled, so there's nothing to visualize. Prometheus
            # alone is kept for metrics + alerting.
            enabled: false
            # NOTE(review): the settings below are presumably kept so Grafana
            # can be re-enabled later without rebuilding its configuration —
            # confirm they are still wanted.
            persistence:
              enabled: true
              type: pvc
              existingClaim: "grafana-data-pvc"
            initChownData:
              enabled: false
            podSecurityContext:
              fsGroup: 911
              runAsUser: 911
              runAsGroup: 911
            # Disable sidecar-based datasource provisioning
            sidecar:
              datasources:
                enabled: false
            # Mount datasources ConfigMap directly to provisioning directory
            extraVolumes:
              - name: datasources-volume
                configMap:
                  name: grafana-datasources-all
            extraVolumeMounts:
              - name: datasources-volume
                mountPath: /etc/grafana/provisioning/datasources
                readOnly: true
    # Source 2: Additional manifests from Git repository
    - repoURL: http://git-server.cicd.svc.cluster.local/conf.git
      targetRevision: master
      path: f3s/prometheus/manifests
  destination:
    server: https://kubernetes.default.svc
    namespace: monitoring
  syncPolicy:
    automated:
      prune: true
      selfHeal: true
    syncOptions:
      # The destination namespace is not created by this Application.
      - CreateNamespace=false
      - ServerSideApply=true
    retry:
      limit: 3
      backoff:
        duration: 10s
        factor: 2
        maxDuration: 3m