nfs-monitor: add Prometheus alerts for NFS auto-repair failures

- check-nfs-mount.sh: write nfs_mount_monitor_consecutive_failures gauge to
  /var/lib/node_exporter/textfile_collector/nfs_mount_monitor.prom on every
  run (via write_textfile_metric helper, called from write_fail_count and
  directly on healthy runs); atomic tmp+mv write prevents partial reads
- Rexfile: create /var/lib/node_exporter/textfile_collector dir on r-nodes
- prometheus.yaml (ArgoCD app): enable textfile_collector in node_exporter
  DaemonSet via extraArgs/extraVolumes/extraVolumeMounts; mount host path
  /var/lib/node_exporter/textfile_collector into container
- persistence-values.yaml: sync node_exporter textfile_collector config
- nfs-mount-monitor-alerts.yaml: PrometheusRule with two alerts:
    NfsMountAutoRepairWarning  (>= 3 consecutive failures, severity: warning)
    NfsMountAutoRepairCritical (>= 5 consecutive failures, severity: critical)
  wired into new 'nfs-alerts' Alertmanager receiver with 30m repeat_interval

Tested: rex deploy succeeded, .prom files present on r0/r1/r2, timer clean.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
author: Paul Buetow <paul@buetow.org> 2026-05-10 10:42:59 +0300
committer: Paul Buetow <paul@buetow.org> 2026-05-10 10:42:59 +0300
commit: f8179f12afd53f3ce7f8a9f13155ecdef7c7382b (patch)
tree: d2284606a2bb73112d33389226b15dd063a0dde6
parent: 965e61016751d132fe83a8f44c6a1bf87d92b1a8 (diff)
5 files changed, 168 insertions, 2 deletions
diff --git a/f3s/argocd-apps/monitoring/prometheus.yaml b/f3s/argocd-apps/monitoring/prometheus.yaml
index 3d88e39..f59be60 100644
--- a/f3s/argocd-apps/monitoring/prometheus.yaml
+++ b/f3s/argocd-apps/monitoring/prometheus.yaml
@@ -58,6 +58,25 @@ spec:
               kubeSchedulerRecording: false
               kubeScheduler: false
 
+          # Enable the textfile collector in the node_exporter DaemonSet so
+          # check-nfs-mount.sh can expose nfs_mount_monitor_consecutive_failures
+          # without a separate exporter.  The hostPath volume maps the node's
+          # /var/lib/node_exporter/textfile_collector to /host/textfile_collector
+          # (read-only) and the extra arg points node_exporter at that directory.
+          # NOTE(review): Helm replaces list values; confirm no chart-default extraArgs are dropped.
+          prometheus-node-exporter:
+            extraArgs:
+              - --collector.textfile.directory=/host/textfile_collector
+            extraVolumes:
+              - name: textfile-collector
+                hostPath:
+                  path: /var/lib/node_exporter/textfile_collector
+                  type: DirectoryOrCreate
+            extraVolumeMounts:
+              - name: textfile-collector
+                mountPath: /host/textfile_collector
+                readOnly: true
+
           prometheus:
             prometheusSpec:
               additionalArgs:
@@ -110,6 +129,16 @@ spec:
                     group_by: ['alertname', 'name', 'severity']
                     group_wait: 10s
                     repeat_interval: 6h
+                  # NFS auto-repair alerts from r0/r1/r2.  The rules label them
+                  # component="nfs"; 'host' is the metric's own label.
+                  # group_wait and repeat_interval are short: NFS outages are
+                  # urgent and the auto-reboot cycle takes only ~30 s per node.
+                  - matchers:
+                      - component = "nfs"
+                    receiver: 'nfs-alerts'
+                    group_by: ['alertname', 'host', 'severity']
+                    group_wait: 10s
+                    repeat_interval: 30m
                   # Container image CVEs from Trivy Operator (see trivy-operator ArgoCD app)
                   - matchers:
                       - component = "trivy"
@@ -124,6 +153,9 @@ spec:
                 - name: 'argocd-alerts'
                   # ArgoCD-specific receiver - alerts visible in UI only
                   # Future: add email/slack/webhook configuration here
+                - name: 'nfs-alerts'
+                  # NFS auto-repair alerts — visible in Alertmanager UI
+                  # Future: add webhook/email for on-call paging here
                 - name: 'trivy-alerts'
                   # Trivy Operator CVE alerts - visible in Alertmanager UI; add webhook/email when desired
               inhibit_rules:
diff --git a/f3s/prometheus/manifests/nfs-mount-monitor-alerts.yaml b/f3s/prometheus/manifests/nfs-mount-monitor-alerts.yaml
new file mode 100644
index 0000000..bcc3b2d
--- /dev/null
+++ b/f3s/prometheus/manifests/nfs-mount-monitor-alerts.yaml
@@ -0,0 +1,68 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: nfs-mount-monitor-alerts
+  namespace: monitoring
+  labels:
+    release: prometheus
+spec:
+  groups:
+    # Prometheus alerting rules for the NFS auto-repair monitor running on
+    # r0/r1/r2 (k3s Rocky Linux VMs).  The metric is produced by
+    # check-nfs-mount.sh writing a textfile_collector .prom file that the
+    # node_exporter DaemonSet reads and exposes to Prometheus.
+    #
+    # Metric: nfs_mount_monitor_consecutive_failures{host="rN"}
+    #   0     = all probes passed, mount healthy
+    #   1-4   = fix_mount was called but repair succeeded or is in progress
+    #   >= 3  = warning: auto-repair is struggling (threshold: 3)
+    #   >= 5  = critical: at or above auto-reboot threshold (NFS_FAIL_THRESHOLD)
+    - name: nfs-mount-monitor
+      interval: 30s
+      rules:
+        # Warning: three or more consecutive fix_mount failures; the
+        # auto-repair is struggling and a human should watch this node.
+        # At NFS_FAIL_THRESHOLD=5 the node auto-reboots (~30 s lead time).
+        # 'action' is a literal block (|) so each command stays on its own line.
+        - alert: NfsMountAutoRepairWarning
+          expr: |
+            nfs_mount_monitor_consecutive_failures >= 3
+          for: 0s
+          labels:
+            severity: warning
+            component: nfs
+          annotations:
+            summary: "NFS auto-repair struggling on {{ $labels.host }}"
+            description: >
+              Host {{ $labels.host }} has {{ $value }} consecutive NFS
+              fix_mount failures. The auto-repair script (check-nfs-mount.sh)
+              is running but has not yet recovered the mount at
+              /data/nfs/k3svolumes. The node will auto-reboot at
+              NFS_FAIL_THRESHOLD=5 failures (~50 s from first failure).
+            action: |
+              Check journal: ssh root@{{ $labels.host }} "journalctl -u nfs-mount-monitor.service -n 30 --no-pager"
+              Check mount:   ssh root@{{ $labels.host }} "mountpoint /data/nfs/k3svolumes && stat /data/nfs/k3svolumes"
+              Check stunnel: ssh root@{{ $labels.host }} "systemctl status stunnel"
+
+        # Critical: at or above NFS_FAIL_THRESHOLD (default 5).
+        # The node is about to be rebooted (or the counter is stuck above
+        # threshold if escalate_reboot itself failed).
+        - alert: NfsMountAutoRepairCritical
+          expr: |
+            nfs_mount_monitor_consecutive_failures >= 5
+          for: 0s
+          labels:
+            severity: critical
+            component: nfs
+          annotations:
+            summary: "NFS auto-repair failed on {{ $labels.host }} — reboot imminent"
+            description: >
+              Host {{ $labels.host }} has {{ $value }} consecutive NFS
+              fix_mount failures (NFS_FAIL_THRESHOLD=5). The node has been
+              cordoned and systemctl reboot issued. If the node is still up,
+              the reboot may have stalled or escalate_reboot failed.
+            action: |
+              Check node status: kubectl get node {{ $labels.host }}.lan.buetow.org
+              Check journal:     ssh root@{{ $labels.host }} "journalctl -u nfs-mount-monitor.service -n 50 --no-pager"
+              Manual recovery:   ssh root@{{ $labels.host }} "systemctl restart stunnel && mount /data/nfs/k3svolumes"
+              Uncordon after fix: kubectl uncordon {{ $labels.host }}.lan.buetow.org
diff --git a/f3s/prometheus/persistence-values.yaml b/f3s/prometheus/persistence-values.yaml
index 732d0a9..e00f6d1 100644
--- a/f3s/prometheus/persistence-values.yaml
+++ b/f3s/prometheus/persistence-values.yaml
@@ -24,6 +24,22 @@ kubeControllerManager:
     https: true
     insecureSkipVerify: true
 
+# Enable node_exporter's textfile collector so check-nfs-mount.sh can expose
+# nfs_mount_monitor_consecutive_failures without needing a separate exporter.
+# Keep in sync with the same block in argocd-apps/monitoring/prometheus.yaml.
+prometheus-node-exporter:
+  extraArgs:
+    - --collector.textfile.directory=/host/textfile_collector
+  extraVolumes:
+    - name: textfile-collector
+      hostPath:
+        path: /var/lib/node_exporter/textfile_collector
+        type: DirectoryOrCreate
+  extraVolumeMounts:
+    - name: textfile-collector
+      mountPath: /host/textfile_collector
+      readOnly: true
+
 prometheus:
   prometheusSpec:
     # Enable remote write receiver for accepting historic data with custom timestamps
diff --git a/f3s/r-nodes/Rexfile b/f3s/r-nodes/Rexfile
index 0dc2aea..fd61326 100644
--- a/f3s/r-nodes/Rexfile
+++ b/f3s/r-nodes/Rexfile
@@ -63,6 +63,23 @@ task 'nfs_mount_monitor',
       group  => 'root',
       mode   => '700';
 
+    # Ensure the node_exporter textfile_collector directory exists.
+    # The check-nfs-mount.sh script writes nfs_mount_monitor.prom here;
+    # node_exporter reads it when --collector.textfile.directory is set.
+    # Mode 755 lets a non-root node_exporter traverse and list both
+    # directories; the .prom file itself must be readable as well.
+    file '/var/lib/node_exporter',
+      ensure => 'directory',
+      owner  => 'root',
+      group  => 'root',
+      mode   => '755';
+
+    file '/var/lib/node_exporter/textfile_collector',
+      ensure => 'directory',
+      owner  => 'root',
+      group  => 'root',
+      mode   => '755';
+
     # Deploy the health-monitor script.
     file '/usr/local/bin/check-nfs-mount.sh',
       source    => catfile( $monitor_dir, 'check-nfs-mount.sh' ),
diff --git a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
index dd71a4d..0a772d8 100644
--- a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
+++ b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
@@ -48,6 +48,12 @@ LOCK_FILE="/var/run/nfs-mount-check.lock"
 STATE_DIR="/var/lib/nfs-mount-monitor"
 FAIL_COUNT_FILE="$STATE_DIR/fail-count"
 
+# Textfile collector output for node_exporter.
+# Written on every run so Prometheus always has a current sample.
+# The DaemonSet mounts /var/lib/node_exporter/textfile_collector from the host.
+TEXTFILE_DIR="/var/lib/node_exporter/textfile_collector"
+TEXTFILE_PROM="$TEXTFILE_DIR/nfs_mount_monitor.prom"
+
 # Load tunable configuration (NFS_FAIL_THRESHOLD) from the EnvironmentFile
 # deployed alongside this script.  Defaults are defined here so the script
 # works even if the file is absent.
@@ -82,6 +88,29 @@ write_fail_count() {
     local count="$1"
     mkdir -p "$STATE_DIR"
     echo "$count" > "$FAIL_COUNT_FILE"
+    # Also export the current count to the node_exporter textfile collector
+    # so Prometheus can alert directly without parsing journal logs.
+    write_textfile_metric "$count"
+}
+
+# write_textfile_metric — write the consecutive-failure gauge to the
+# node_exporter textfile_collector directory.  The metric name follows the
+# node_exporter convention: lowercase, underscores, no units suffix for counts.
+# The host label lets Prometheus distinguish r0/r1/r2 even before
+# relabelling resolves the instance IP to a hostname.
+# We write atomically (tmp + mv) to avoid node_exporter reading a partial file.
+write_textfile_metric() {
+    local count="$1" host tmp_file
+    host=$(hostname -s)
+    mkdir -p "$TEXTFILE_DIR"
+    tmp_file="$(mktemp "$TEXTFILE_DIR/nfs_mount_monitor.prom.XXXXXX")"
+    # Write metric with HELP/TYPE headers for valid exposition format
+    printf '# HELP nfs_mount_monitor_consecutive_failures Consecutive NFS fix_mount failure count\n' > "$tmp_file"
+    printf '# TYPE nfs_mount_monitor_consecutive_failures gauge\n' >> "$tmp_file"
+    printf 'nfs_mount_monitor_consecutive_failures{host="%s"} %s\n' "$host" "$count" >> "$tmp_file"
+    # mktemp creates the file 0600; open it up so a non-root node_exporter can read it.
+    chmod 0644 "$tmp_file"
+    mv "$tmp_file" "$TEXTFILE_PROM"
 }
 
 # kill_pinning_processes — send SIGKILL to any process whose wchan starts
@@ -293,12 +322,16 @@ fi
 
 # If all three probes passed cleanly (no repair attempt needed), reset the
 # consecutive-failure counter so a previous partial failure streak does not
-# lower the effective reboot threshold.  We only write the file when the
-# counter is non-zero to avoid unnecessary writes on every healthy run.
+# lower the effective reboot threshold.  write_fail_count also refreshes the
+# textfile metric so Prometheus always has a current sample.
 if [ "$PROBE_FAILED" -eq 0 ]; then
     if [ "$(read_fail_count)" -ne 0 ]; then
         write_fail_count 0
         echo "All probes passed; consecutive-failure counter reset to 0"
+    else
+        # Counter is already zero; rewrite the textfile metric anyway so
+        # the .prom file's mtime stays current on every healthy run.
+        write_textfile_metric 0
     fi
 fi
author	Paul Buetow <paul@buetow.org>	2026-05-10 10:42:59 +0300
committer	Paul Buetow <paul@buetow.org>	2026-05-10 10:42:59 +0300
commit	f8179f12afd53f3ce7f8a9f13155ecdef7c7382b (patch)
tree	d2284606a2bb73112d33389226b15dd063a0dde6
parent	965e61016751d132fe83a8f44c6a1bf87d92b1a8 (diff)