---
# Argo CD Application "prometheus": deploys the kube-prometheus-stack Helm
# release together with additional manifests kept in Git.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: prometheus
  namespace: cicd
  finalizers:
    # Argo CD resources finalizer: deleting this Application cascades to
    # the resources it manages.
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default

  sources:
    # Source 1: kube-prometheus-stack chart from the prometheus-community
    # Helm repository, pinned to an exact chart version.
    - chart: kube-prometheus-stack
      repoURL: https://prometheus-community.github.io/helm-charts
      targetRevision: '55.5.0'
      helm:
        releaseName: prometheus
        # Inline Helm values passed to the chart.
        valuesObject:
          # etcd metrics are collected from a fixed list of addresses on
          # port 2381.
          # NOTE(review): presumably the control-plane node IPs (the same
          # three addresses are used for kubeControllerManager) - confirm.
          kubeEtcd:
            enabled: true
            service:
              enabled: true
              port: 2381
              targetPort: 2381
            endpoints:
              - 192.168.2.120
              - 192.168.2.121
              - 192.168.2.122

          # kube-controller-manager metrics are collected from a fixed list
          # of addresses on port 10257.
          kubeControllerManager:
            enabled: true
            # Scrape over HTTPS without verifying the serving certificate.
            serviceMonitor:
              enabled: true
              https: true
              insecureSkipVerify: true
            service:
              enabled: true
              port: 10257
              targetPort: 10257
            endpoints:
              - 192.168.2.120
              - 192.168.2.121
              - 192.168.2.122

          # kube-proxy and kube-scheduler run embedded in the k3s main
          # process; both components are switched off here so they do not
          # raise false-positive alerts.
          kubeScheduler:
            enabled: false

          kubeProxy:
            enabled: false

          # Turn off the bundled rule groups for components that are not
          # available in k3s.
          # NOTE(review): kubeScheduler looks like a legacy key next to the
          # kubeSchedulerAlerting/kubeSchedulerRecording pair - confirm the
          # pinned chart version still reads it.
          defaultRules:
            rules:
              kubeScheduler: false
              kubeSchedulerAlerting: false
              kubeSchedulerRecording: false
              kubeProxy: false

          # node_exporter textfile collector: check-nfs-mount.sh publishes
          # nfs_mount_monitor_consecutive_failures through it, so no separate
          # exporter is needed. Both keys belong to the
          # prometheus-node-exporter sub-chart.
          prometheus-node-exporter:
            # Host directory with the collector's input files, mounted
            # read-only into the node_exporter container.
            extraHostVolumeMounts:
              - name: textfile-collector
                mountPath: /host/textfile_collector
                hostPath: /var/lib/node_exporter/textfile_collector
                mountPropagation: None
                readOnly: true
            # Enables the textfile scraper and points it at the mountPath
            # declared above.
            extraArgs:
              - --collector.textfile.directory=/host/textfile_collector

          # Prometheus server settings (rendered into the Prometheus custom
          # resource by the chart).
          prometheus:
            prometheusSpec:
              # Accept samples pushed to the remote-write endpoint. Set via
              # the dedicated operator field instead of additionalArgs, which
              # passes flags through unvalidated and fails reconciliation if
              # the operator manages the same flag.
              enableRemoteWriteReceiver: true
              # Expose the admin API (dedicated field, same reasoning).
              # NOTE(review): the admin API has mutating endpoints - confirm
              # it is still required and that access is restricted.
              enableAdminAPI: true
              enableFeatures:
                - exemplar-storage
                - otlp-write-receiver
              tsdb:
                # Accept out-of-order samples up to 744h (31 days) old.
                outOfOrderTimeWindow: 744h
              # Extra scrape jobs are read from a Secret; the inline list is
              # left empty.
              # NOTE(review): the Secret is not created by these values -
              # presumably it comes from the Git manifests source; verify.
              additionalScrapeConfigs: []
              additionalScrapeConfigsSecret:
                enabled: true
                name: additional-scrape-configs
                key: additional-scrape-configs.yaml
              # 10Gi ReadWriteOnce claim. The empty storageClassName plus the
              # label selector target a pre-created PersistentVolume labelled
              # type=local, app=prometheus.
              storageSpec:
                volumeClaimTemplate:
                  spec:
                    storageClassName: ""
                    accessModes:
                      - ReadWriteOnce
                    resources:
                      requests:
                        storage: 10Gi
                    selector:
                      matchLabels:
                        type: local
                        app: prometheus

          # Alertmanager routing. Every receiver is currently a bare name with
          # no notification integration, so alerts are visible in the
          # Alertmanager UI only.
          alertmanager:
            config:
              global:
                resolve_timeout: 5m
              route:
                receiver: 'default'
                group_by:
                  - alertname
                  - namespace
                  - severity
                group_wait: 30s
                group_interval: 5m
                repeat_interval: 12h
                routes:
                  # Watchdog is only a test alert; send it to the 'null'
                  # receiver to suppress it.
                  - receiver: 'null'
                    matchers:
                      - 'alertname = "Watchdog"'
                  # Argo CD application alerts - high priority.
                  - receiver: 'argocd-alerts'
                    matchers:
                      - 'component = "argocd"'
                    group_by:
                      - alertname
                      - name
                      - severity
                    group_wait: 10s
                    repeat_interval: 6h
                  # NFS auto-repair alerts from r0/r1/r2. group_wait is short
                  # so operators hear quickly when the mount breaks;
                  # repeat_interval is short because NFS outages are urgent
                  # and the auto-reboot cycle takes only ~30 s per node.
                  - receiver: 'nfs-alerts'
                    matchers:
                      - 'component = "nfs"'
                    group_by:
                      - alertname
                      - host
                      - severity
                    group_wait: 10s
                    repeat_interval: 30m
                  # Container image CVEs reported by Trivy Operator (see the
                  # trivy-operator Argo CD app).
                  - receiver: 'trivy-alerts'
                    matchers:
                      - 'component = "trivy"'
                    group_by:
                      - alertname
                      - namespace
                      - severity
                    group_wait: 2m
                    repeat_interval: 24h
              receivers:
                # Sink for suppressed alerts.
                - name: 'null'
                # Default receiver - UI only.
                - name: 'default'
                # Argo CD alerts - UI only.
                # Future: add email/slack/webhook configuration here.
                - name: 'argocd-alerts'
                # NFS auto-repair alerts - UI only.
                # Future: add webhook/email for on-call paging here.
                - name: 'nfs-alerts'
                # Trivy Operator CVE alerts - UI only; add webhook/email when
                # desired.
                - name: 'trivy-alerts'
              inhibit_rules:
                # While a critical alert fires, mute warning and info alerts
                # that share its namespace and alertname.
                - source_matchers:
                    - 'severity = "critical"'
                  target_matchers:
                    - 'severity =~ "warning|info"'
                  equal:
                    - namespace
                    - alertname
                # While a warning alert fires, mute info alerts that share
                # its namespace and alertname.
                - source_matchers:
                    - 'severity = "warning"'
                  target_matchers:
                    - 'severity = "info"'
                  equal:
                    - namespace
                    - alertname

          grafana:
            # Grafana is switched off: SQLite on NFS is unreliable across
            # restarts (lock state cannot be reacquired cleanly), and with
            # Loki and Tempo disabled as well there is nothing to visualize.
            # Prometheus alone stays for metrics and alerting.
            enabled: false

            # The settings below are retained for a future re-enable.
            # NOTE(review): presumably inert while enabled is false - confirm.

            # Run as UID/GID 911 and skip the chown init container.
            # NOTE(review): verify the Grafana sub-chart honors
            # podSecurityContext (its pod-level key may be securityContext)
            # before re-enabling.
            podSecurityContext:
              runAsUser: 911
              runAsGroup: 911
              fsGroup: 911

            initChownData:
              enabled: false

            # Reuse an existing PVC for Grafana data.
            persistence:
              enabled: true
              type: pvc
              existingClaim: "grafana-data-pvc"

            # Datasources are not provisioned through the sidecar...
            sidecar:
              datasources:
                enabled: false

            # ...instead the grafana-datasources-all ConfigMap is mounted
            # read-only straight into the provisioning directory.
            extraVolumeMounts:
              - name: datasources-volume
                mountPath: /etc/grafana/provisioning/datasources
                readOnly: true

            extraVolumes:
              - name: datasources-volume
                configMap:
                  name: grafana-datasources-all

    # Source 2: plain manifests from the in-cluster Git server, tracking the
    # master branch.
    - path: f3s/prometheus/manifests
      repoURL: http://git-server.cicd.svc.cluster.local/conf.git
      targetRevision: master

  # Deploy into the "monitoring" namespace through the in-cluster API
  # server address.
  destination:
    namespace: monitoring
    server: https://kubernetes.default.svc

  syncPolicy:
    # Automated sync with pruning and self-healing enabled.
    automated:
      selfHeal: true
      prune: true
    # Retry a failed sync up to 3 times; the backoff starts at 10s, doubles
    # each attempt, and is capped at 3m.
    retry:
      limit: 3
      backoff:
        duration: 10s
        factor: 2
        maxDuration: 3m
    # The target namespace is not created by Argo CD; resources are applied
    # with server-side apply.
    syncOptions:
      - CreateNamespace=false
      - ServerSideApply=true