summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2026-05-10 10:42:59 +0300
committerPaul Buetow <paul@buetow.org>2026-05-10 10:42:59 +0300
commitf8179f12afd53f3ce7f8a9f13155ecdef7c7382b (patch)
treed2284606a2bb73112d33389226b15dd063a0dde6
parent965e61016751d132fe83a8f44c6a1bf87d92b1a8 (diff)
nfs-monitor: add Prometheus alerts for NFS auto-repair failures
- check-nfs-mount.sh: write nfs_mount_monitor_consecutive_failures gauge to /var/lib/node_exporter/textfile_collector/nfs_mount_monitor.prom on every run (via write_textfile_metric helper, called from write_fail_count and directly on healthy runs); atomic tmp+mv write prevents partial reads - Rexfile: create /var/lib/node_exporter/textfile_collector dir on r-nodes - prometheus.yaml (ArgoCD app): enable textfile_collector in node_exporter DaemonSet via extraArgs/extraVolumes/extraVolumeMounts; mount host path /var/lib/node_exporter/textfile_collector into container - persistence-values.yaml: sync node_exporter textfile_collector config - nfs-mount-monitor-alerts.yaml: PrometheusRule with two alerts: NfsMountAutoRepairWarning (>= 3 consecutive failures, severity: warning) NfsMountAutoRepairCritical (>= 5 consecutive failures, severity: critical) wired into new 'nfs-alerts' Alertmanager receiver with 30m repeat_interval Tested: rex deploy succeeded, .prom files present on r0/r1/r2, timer clean. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
-rw-r--r--f3s/argocd-apps/monitoring/prometheus.yaml32
-rw-r--r--f3s/prometheus/manifests/nfs-mount-monitor-alerts.yaml68
-rw-r--r--f3s/prometheus/persistence-values.yaml16
-rw-r--r--f3s/r-nodes/Rexfile17
-rw-r--r--f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh37
5 files changed, 168 insertions, 2 deletions
diff --git a/f3s/argocd-apps/monitoring/prometheus.yaml b/f3s/argocd-apps/monitoring/prometheus.yaml
index 3d88e39..f59be60 100644
--- a/f3s/argocd-apps/monitoring/prometheus.yaml
+++ b/f3s/argocd-apps/monitoring/prometheus.yaml
@@ -58,6 +58,25 @@ spec:
kubeSchedulerRecording: false
kubeScheduler: false
+ # Enable the textfile collector in the node_exporter DaemonSet so
+ # check-nfs-mount.sh can expose nfs_mount_monitor_consecutive_failures
+ # as a Prometheus metric without needing a separate exporter.
+ # The hostPath mount maps /var/lib/node_exporter/textfile_collector
+ # on the node into the container; the extra arg tells node_exporter
+ # to scan that directory for .prom files.
+ prometheus-node-exporter:
+ extraArgs:
+ - --collector.textfile.directory=/host/textfile_collector
+ extraVolumes:
+ - name: textfile-collector
+ hostPath:
+ path: /var/lib/node_exporter/textfile_collector
+ type: DirectoryOrCreate
+ extraVolumeMounts:
+ - name: textfile-collector
+ mountPath: /host/textfile_collector
+ readOnly: true
+
prometheus:
prometheusSpec:
additionalArgs:
@@ -110,6 +129,16 @@ spec:
group_by: ['alertname', 'name', 'severity']
group_wait: 10s
repeat_interval: 6h
+ # NFS auto-repair alerts from r0/r1/r2 — short group_wait so
+ # operators are notified quickly when the mount breaks.
+ # repeat_interval is short: NFS outages are urgent and
+ # the auto-reboot cycle takes only ~30 s per node.
+ - matchers:
+ - component = "nfs"
+ receiver: 'nfs-alerts'
+ group_by: ['alertname', 'host', 'severity']
+ group_wait: 10s
+ repeat_interval: 30m
# Container image CVEs from Trivy Operator (see trivy-operator ArgoCD app)
- matchers:
- component = "trivy"
@@ -124,6 +153,9 @@ spec:
- name: 'argocd-alerts'
# ArgoCD-specific receiver - alerts visible in UI only
# Future: add email/slack/webhook configuration here
+ - name: 'nfs-alerts'
+ # NFS auto-repair alerts — visible in Alertmanager UI
+ # Future: add webhook/email for on-call paging here
- name: 'trivy-alerts'
# Trivy Operator CVE alerts - visible in Alertmanager UI; add webhook/email when desired
inhibit_rules:
diff --git a/f3s/prometheus/manifests/nfs-mount-monitor-alerts.yaml b/f3s/prometheus/manifests/nfs-mount-monitor-alerts.yaml
new file mode 100644
index 0000000..bcc3b2d
--- /dev/null
+++ b/f3s/prometheus/manifests/nfs-mount-monitor-alerts.yaml
@@ -0,0 +1,68 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: nfs-mount-monitor-alerts
+  namespace: monitoring
+  labels:
+    release: prometheus
+spec:
+  groups:
+    # Alerting rules for the NFS auto-repair monitor running on r0/r1/r2
+    # (k3s Rocky Linux VMs). check-nfs-mount.sh writes the metric to a
+    # textfile_collector .prom file that the node_exporter DaemonSet reads.
+    #
+    # Metric: nfs_mount_monitor_consecutive_failures{host="rN"}
+    #   0    = all probes passed, mount healthy
+    #   1-4  = fix_mount was called but repair succeeded or is in progress
+    #   >= 3 = NfsMountAutoRepairWarning fires (auto-repair is struggling)
+    #   >= 5 = NfsMountAutoRepairCritical fires (NFS_FAIL_THRESHOLD reached)
+    # The action annotations use literal style (|): one command per line.
+    - name: nfs-mount-monitor
+      interval: 30s
+      rules:
+        # Warning: three or more consecutive fix_mount failures.
+        # The auto-repair is struggling; a human should watch this node.
+        # At NFS_FAIL_THRESHOLD=5 the node will auto-reboot, so three
+        # failures gives ~30 s lead time to investigate.
+        - alert: NfsMountAutoRepairWarning
+          expr: |
+            nfs_mount_monitor_consecutive_failures >= 3
+          for: 0s
+          labels:
+            severity: warning
+            component: nfs
+          annotations:
+            summary: "NFS auto-repair struggling on {{ $labels.host }}"
+            description: >
+              Host {{ $labels.host }} has {{ $value }} consecutive NFS
+              fix_mount failures. The auto-repair script (check-nfs-mount.sh)
+              is running but has not yet recovered the mount at
+              /data/nfs/k3svolumes. The node will auto-reboot at
+              NFS_FAIL_THRESHOLD=5 failures (~50 s from first failure).
+            action: |
+              Check journal: ssh root@{{ $labels.host }} "journalctl -u nfs-mount-monitor.service -n 30 --no-pager"
+              Check mount: ssh root@{{ $labels.host }} "mountpoint /data/nfs/k3svolumes && stat /data/nfs/k3svolumes"
+              Check stunnel: ssh root@{{ $labels.host }} "systemctl status stunnel"
+
+        # Critical: at or above NFS_FAIL_THRESHOLD (default 5).
+        # The node is about to be rebooted (or the counter is stuck above
+        # threshold if escalate_reboot itself failed).
+        - alert: NfsMountAutoRepairCritical
+          expr: |
+            nfs_mount_monitor_consecutive_failures >= 5
+          for: 0s
+          labels:
+            severity: critical
+            component: nfs
+          annotations:
+            summary: "NFS auto-repair failed on {{ $labels.host }} — reboot imminent"
+            description: >
+              Host {{ $labels.host }} has {{ $value }} consecutive NFS
+              fix_mount failures (NFS_FAIL_THRESHOLD=5). The node has been
+              cordoned and systemctl reboot issued. If the node is still up,
+              the reboot may have stalled or escalate_reboot failed.
+            action: |
+              Check node status: kubectl get node {{ $labels.host }}.lan.buetow.org
+              Check journal: ssh root@{{ $labels.host }} "journalctl -u nfs-mount-monitor.service -n 50 --no-pager"
+              Manual recovery: ssh root@{{ $labels.host }} "systemctl restart stunnel && mount /data/nfs/k3svolumes"
+              Uncordon after fix: kubectl uncordon {{ $labels.host }}.lan.buetow.org
diff --git a/f3s/prometheus/persistence-values.yaml b/f3s/prometheus/persistence-values.yaml
index 732d0a9..e00f6d1 100644
--- a/f3s/prometheus/persistence-values.yaml
+++ b/f3s/prometheus/persistence-values.yaml
@@ -24,6 +24,22 @@ kubeControllerManager:
https: true
insecureSkipVerify: true
+# Enable the textfile collector in the node_exporter DaemonSet so
+# check-nfs-mount.sh can expose nfs_mount_monitor_consecutive_failures
+# as a Prometheus metric without needing a separate exporter.
+prometheus-node-exporter:
+ extraArgs:
+ - --collector.textfile.directory=/host/textfile_collector
+ extraVolumes:
+ - name: textfile-collector
+ hostPath:
+ path: /var/lib/node_exporter/textfile_collector
+ type: DirectoryOrCreate
+ extraVolumeMounts:
+ - name: textfile-collector
+ mountPath: /host/textfile_collector
+ readOnly: true
+
prometheus:
prometheusSpec:
# Enable remote write receiver for accepting historic data with custom timestamps
diff --git a/f3s/r-nodes/Rexfile b/f3s/r-nodes/Rexfile
index 0dc2aea..fd61326 100644
--- a/f3s/r-nodes/Rexfile
+++ b/f3s/r-nodes/Rexfile
@@ -63,6 +63,23 @@ task 'nfs_mount_monitor',
group => 'root',
mode => '700';
+ # Ensure the node_exporter textfile_collector directory exists.
+ # The check-nfs-mount.sh script writes nfs_mount_monitor.prom here;
+ # node_exporter reads it when --collector.textfile.directory is set.
+ # Mode 0755 lets the node_exporter process (root or a dedicated user)
+ # traverse the directory without ACLs; the .prom file must be readable too.
+ file '/var/lib/node_exporter',
+ ensure => 'directory',
+ owner => 'root',
+ group => 'root',
+ mode => '755';
+
+ file '/var/lib/node_exporter/textfile_collector',
+ ensure => 'directory',
+ owner => 'root',
+ group => 'root',
+ mode => '755';
+
# Deploy the health-monitor script.
file '/usr/local/bin/check-nfs-mount.sh',
source => catfile( $monitor_dir, 'check-nfs-mount.sh' ),
diff --git a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
index dd71a4d..0a772d8 100644
--- a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
+++ b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
@@ -48,6 +48,12 @@ LOCK_FILE="/var/run/nfs-mount-check.lock"
STATE_DIR="/var/lib/nfs-mount-monitor"
FAIL_COUNT_FILE="$STATE_DIR/fail-count"
+# Textfile collector output for node_exporter.
+# Written on every run so Prometheus always has a current sample.
+# The DaemonSet mounts /var/lib/node_exporter/textfile_collector from the host.
+TEXTFILE_DIR="/var/lib/node_exporter/textfile_collector"
+TEXTFILE_PROM="$TEXTFILE_DIR/nfs_mount_monitor.prom"
+
# Load tunable configuration (NFS_FAIL_THRESHOLD) from the EnvironmentFile
# deployed alongside this script. Defaults are defined here so the script
# works even if the file is absent.
@@ -82,6 +88,29 @@ write_fail_count() {
local count="$1"
mkdir -p "$STATE_DIR"
echo "$count" > "$FAIL_COUNT_FILE"
+ # Also export the current count to the node_exporter textfile collector
+ # so Prometheus can alert directly without parsing journal logs.
+ write_textfile_metric "$count"
+}
+
+# write_textfile_metric — publish the consecutive-failure gauge ($1) to
+# $TEXTFILE_PROM for the node_exporter textfile collector. The host label
+# (hostname -s) lets Prometheus distinguish r0/r1/r2 even before relabelling
+# resolves the instance IP to a hostname. The file is staged with mktemp in
+# the same directory and renamed into place, so node_exporter never reads a
+# partial file.
+write_textfile_metric() {
+    local count="$1" host tmp_file
+    host=$(hostname -s)
+    mkdir -p "$TEXTFILE_DIR"
+    tmp_file="$(mktemp "$TEXTFILE_DIR/nfs_mount_monitor.prom.XXXXXX")"
+    # mktemp creates the file with mode 0600 and mv keeps that mode; make it
+    # world-readable so a node_exporter running as non-root can still read it.
+    chmod 644 "$tmp_file"
+    printf '# HELP nfs_mount_monitor_consecutive_failures Consecutive NFS fix_mount failure count\n' > "$tmp_file"
+    printf '# TYPE nfs_mount_monitor_consecutive_failures gauge\n' >> "$tmp_file"
+    printf 'nfs_mount_monitor_consecutive_failures{host="%s"} %s\n' "$host" "$count" >> "$tmp_file"
+    mv "$tmp_file" "$TEXTFILE_PROM"
}
# kill_pinning_processes — send SIGKILL to any process whose wchan starts
@@ -293,12 +322,16 @@ fi
# If all three probes passed cleanly (no repair attempt needed), reset the
# consecutive-failure counter so a previous partial failure streak does not
-# lower the effective reboot threshold. We only write the file when the
-# counter is non-zero to avoid unnecessary writes on every healthy run.
+# lower the effective reboot threshold. write_fail_count also refreshes the
+# textfile metric so Prometheus always has a current sample.
if [ "$PROBE_FAILED" -eq 0 ]; then
if [ "$(read_fail_count)" -ne 0 ]; then
write_fail_count 0
echo "All probes passed; consecutive-failure counter reset to 0"
+ else
+ # Counter is already zero; rewrite the textfile anyway so its mtime
+ # stays current and the metric is refreshed on every healthy run.
+ write_textfile_metric 0
fi
fi