diff options
| author | Paul Buetow <paul@buetow.org> | 2026-05-10 10:42:59 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-05-10 10:42:59 +0300 |
| commit | f8179f12afd53f3ce7f8a9f13155ecdef7c7382b (patch) | |
| tree | d2284606a2bb73112d33389226b15dd063a0dde6 | |
| parent | 965e61016751d132fe83a8f44c6a1bf87d92b1a8 (diff) | |
nfs-monitor: add Prometheus alerts for NFS auto-repair failures
- check-nfs-mount.sh: write nfs_mount_monitor_consecutive_failures gauge
to /var/lib/node_exporter/textfile_collector/nfs_mount_monitor.prom on
every run (via write_textfile_metric helper, called from write_fail_count
and directly on healthy runs); atomic tmp+mv write prevents partial reads
- Rexfile: create /var/lib/node_exporter/textfile_collector dir on r-nodes
- prometheus.yaml (ArgoCD app): enable textfile_collector in node_exporter
DaemonSet via extraArgs/extraVolumes/extraVolumeMounts; mount host path
/var/lib/node_exporter/textfile_collector into container
- persistence-values.yaml: sync node_exporter textfile_collector config
- nfs-mount-monitor-alerts.yaml: PrometheusRule with two alerts:
NfsMountAutoRepairWarning (>= 3 consecutive failures, severity: warning)
NfsMountAutoRepairCritical (>= 5 consecutive failures, severity: critical)
wired into new 'nfs-alerts' Alertmanager receiver with 30m repeat_interval
Tested: rex deploy succeeded, .prom files present on r0/r1/r2, timer clean.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
| -rw-r--r-- | f3s/argocd-apps/monitoring/prometheus.yaml | 32 | ||||
| -rw-r--r-- | f3s/prometheus/manifests/nfs-mount-monitor-alerts.yaml | 68 | ||||
| -rw-r--r-- | f3s/prometheus/persistence-values.yaml | 16 | ||||
| -rw-r--r-- | f3s/r-nodes/Rexfile | 17 | ||||
| -rw-r--r-- | f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh | 37 |
5 files changed, 168 insertions, 2 deletions
diff --git a/f3s/argocd-apps/monitoring/prometheus.yaml b/f3s/argocd-apps/monitoring/prometheus.yaml index 3d88e39..f59be60 100644 --- a/f3s/argocd-apps/monitoring/prometheus.yaml +++ b/f3s/argocd-apps/monitoring/prometheus.yaml @@ -58,6 +58,25 @@ spec: kubeSchedulerRecording: false kubeScheduler: false + # Enable the textfile collector in the node_exporter DaemonSet so + # check-nfs-mount.sh can expose nfs_mount_monitor_consecutive_failures + # as a Prometheus metric without needing a separate exporter. + # The hostPath mount maps /var/lib/node_exporter/textfile_collector + # on the node into the container; the extra arg tells node_exporter + # to scan that directory for .prom files. + prometheus-node-exporter: + extraArgs: + - --collector.textfile.directory=/host/textfile_collector + extraVolumes: + - name: textfile-collector + hostPath: + path: /var/lib/node_exporter/textfile_collector + type: DirectoryOrCreate + extraVolumeMounts: + - name: textfile-collector + mountPath: /host/textfile_collector + readOnly: true + prometheus: prometheusSpec: additionalArgs: @@ -110,6 +129,16 @@ spec: group_by: ['alertname', 'name', 'severity'] group_wait: 10s repeat_interval: 6h + # NFS auto-repair alerts from r0/r1/r2 — short group_wait so + # operators are notified quickly when the mount breaks. + # repeat_interval is short: NFS outages are urgent and + # the auto-reboot cycle takes only ~30 s per node. 
+ - matchers: + - component = "nfs" + receiver: 'nfs-alerts' + group_by: ['alertname', 'host', 'severity'] + group_wait: 10s + repeat_interval: 30m # Container image CVEs from Trivy Operator (see trivy-operator ArgoCD app) - matchers: - component = "trivy" @@ -124,6 +153,9 @@ spec: - name: 'argocd-alerts' # ArgoCD-specific receiver - alerts visible in UI only # Future: add email/slack/webhook configuration here + - name: 'nfs-alerts' + # NFS auto-repair alerts — visible in Alertmanager UI + # Future: add webhook/email for on-call paging here - name: 'trivy-alerts' # Trivy Operator CVE alerts - visible in Alertmanager UI; add webhook/email when desired inhibit_rules: diff --git a/f3s/prometheus/manifests/nfs-mount-monitor-alerts.yaml b/f3s/prometheus/manifests/nfs-mount-monitor-alerts.yaml new file mode 100644 index 0000000..bcc3b2d --- /dev/null +++ b/f3s/prometheus/manifests/nfs-mount-monitor-alerts.yaml @@ -0,0 +1,68 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: nfs-mount-monitor-alerts + namespace: monitoring + labels: + release: prometheus +spec: + groups: + # Prometheus alerting rules for the NFS auto-repair monitor running on + # r0/r1/r2 (k3s Rocky Linux VMs). The metric is produced by + # check-nfs-mount.sh writing a textfile_collector .prom file that the + # node_exporter DaemonSet scrapes. + # + # Metric: nfs_mount_monitor_consecutive_failures{host="rN"} + # 0 = all probes passed, mount healthy + # 1-4 = fix_mount was called but repair succeeded or is in progress + # >= 3 = warning: auto-repair is struggling (threshold: 3) + # >= 5 = critical: at or above auto-reboot threshold (NFS_FAIL_THRESHOLD) + - name: nfs-mount-monitor + interval: 30s + rules: + # Warning: three or more consecutive fix_mount failures. + # The auto-repair is struggling; a human should watch this node. + # At NFS_FAIL_THRESHOLD=5 the node will auto-reboot, so three + # failures gives ~30 s lead time to investigate. 
+ - alert: NfsMountAutoRepairWarning + expr: | + nfs_mount_monitor_consecutive_failures >= 3 + for: 0s + labels: + severity: warning + component: nfs + annotations: + summary: "NFS auto-repair struggling on {{ $labels.host }}" + description: > + Host {{ $labels.host }} has {{ $value }} consecutive NFS + fix_mount failures. The auto-repair script (check-nfs-mount.sh) + is running but has not yet recovered the mount at + /data/nfs/k3svolumes. The node will auto-reboot at + NFS_FAIL_THRESHOLD=5 failures (~50 s from first failure). + action: > + Check journal: ssh root@{{ $labels.host }} "journalctl -u nfs-mount-monitor.service -n 30 --no-pager" + Check mount: ssh root@{{ $labels.host }} "mountpoint /data/nfs/k3svolumes && stat /data/nfs/k3svolumes" + Check stunnel: ssh root@{{ $labels.host }} "systemctl status stunnel" + + # Critical: at or above NFS_FAIL_THRESHOLD (default 5). + # The node is about to be rebooted (or the counter is stuck above + # threshold if escalate_reboot itself failed). + - alert: NfsMountAutoRepairCritical + expr: | + nfs_mount_monitor_consecutive_failures >= 5 + for: 0s + labels: + severity: critical + component: nfs + annotations: + summary: "NFS auto-repair failed on {{ $labels.host }} — reboot imminent" + description: > + Host {{ $labels.host }} has {{ $value }} consecutive NFS + fix_mount failures (NFS_FAIL_THRESHOLD=5). The node has been + cordoned and systemctl reboot issued. If the node is still up, + the reboot may have stalled or escalate_reboot failed. 
+ action: > + Check node status: kubectl get node {{ $labels.host }}.lan.buetow.org + Check journal: ssh root@{{ $labels.host }} "journalctl -u nfs-mount-monitor.service -n 50 --no-pager" + Manual recovery: ssh root@{{ $labels.host }} "systemctl restart stunnel && mount /data/nfs/k3svolumes" + Uncordon after fix: kubectl uncordon {{ $labels.host }}.lan.buetow.org diff --git a/f3s/prometheus/persistence-values.yaml b/f3s/prometheus/persistence-values.yaml index 732d0a9..e00f6d1 100644 --- a/f3s/prometheus/persistence-values.yaml +++ b/f3s/prometheus/persistence-values.yaml @@ -24,6 +24,22 @@ kubeControllerManager: https: true insecureSkipVerify: true +# Enable the textfile collector in the node_exporter DaemonSet so +# check-nfs-mount.sh can expose nfs_mount_monitor_consecutive_failures +# as a Prometheus metric without needing a separate exporter. +prometheus-node-exporter: + extraArgs: + - --collector.textfile.directory=/host/textfile_collector + extraVolumes: + - name: textfile-collector + hostPath: + path: /var/lib/node_exporter/textfile_collector + type: DirectoryOrCreate + extraVolumeMounts: + - name: textfile-collector + mountPath: /host/textfile_collector + readOnly: true + prometheus: prometheusSpec: # Enable remote write receiver for accepting historic data with custom timestamps diff --git a/f3s/r-nodes/Rexfile b/f3s/r-nodes/Rexfile index 0dc2aea..fd61326 100644 --- a/f3s/r-nodes/Rexfile +++ b/f3s/r-nodes/Rexfile @@ -63,6 +63,23 @@ task 'nfs_mount_monitor', group => 'root', mode => '700'; + # Ensure the node_exporter textfile_collector directory exists. + # The check-nfs-mount.sh script writes nfs_mount_monitor.prom here; + # node_exporter reads it when --collector.textfile.directory is set. + # world-readable so the node_exporter process (root or dedicated user) + # can pick up the file without special ACLs. 
+ file '/var/lib/node_exporter', + ensure => 'directory', + owner => 'root', + group => 'root', + mode => '755'; + + file '/var/lib/node_exporter/textfile_collector', + ensure => 'directory', + owner => 'root', + group => 'root', + mode => '755'; + # Deploy the health-monitor script. file '/usr/local/bin/check-nfs-mount.sh', source => catfile( $monitor_dir, 'check-nfs-mount.sh' ), diff --git a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh index dd71a4d..0a772d8 100644 --- a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh +++ b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh @@ -48,6 +48,12 @@ LOCK_FILE="/var/run/nfs-mount-check.lock" STATE_DIR="/var/lib/nfs-mount-monitor" FAIL_COUNT_FILE="$STATE_DIR/fail-count" +# Textfile collector output for node_exporter. +# Written on every run so Prometheus always has a current sample. +# The DaemonSet mounts /var/lib/node_exporter/textfile_collector from the host. +TEXTFILE_DIR="/var/lib/node_exporter/textfile_collector" +TEXTFILE_PROM="$TEXTFILE_DIR/nfs_mount_monitor.prom" + # Load tunable configuration (NFS_FAIL_THRESHOLD) from the EnvironmentFile # deployed alongside this script. Defaults are defined here so the script # works even if the file is absent. @@ -82,6 +88,29 @@ write_fail_count() { local count="$1" mkdir -p "$STATE_DIR" echo "$count" > "$FAIL_COUNT_FILE" + # Also export the current count to the node_exporter textfile collector + # so Prometheus can alert directly without parsing journal logs. + write_textfile_metric "$count" +} + +# write_textfile_metric — write the consecutive-failure gauge to the +# node_exporter textfile_collector directory. The metric name follows the +# node_exporter convention: lowercase, underscores, no units suffix for counts. +# The host label lets Prometheus distinguish r0/r1/r2 even before +# relabelling resolves the instance IP to a hostname. +# We write atomically (tmp + mv) to avoid node_exporter reading a partial file. 
+write_textfile_metric() { + local count="$1" + local host + host=$(hostname -s) + mkdir -p "$TEXTFILE_DIR" + local tmp_file + tmp_file="$(mktemp "$TEXTFILE_DIR/nfs_mount_monitor.prom.XXXXXX")" + # Write metric with HELP/TYPE headers for valid exposition format; the file is chmod'ed to 644 before the rename because mktemp creates it 0600 and node_exporter may not run as root + printf '# HELP nfs_mount_monitor_consecutive_failures Consecutive NFS fix_mount failure count\n' > "$tmp_file" + printf '# TYPE nfs_mount_monitor_consecutive_failures gauge\n' >> "$tmp_file" + printf 'nfs_mount_monitor_consecutive_failures{host="%s"} %s\n' "$host" "$count" >> "$tmp_file" + chmod 644 "$tmp_file" && mv "$tmp_file" "$TEXTFILE_PROM" +} # kill_pinning_processes — send SIGKILL to any process whose wchan starts @@ -293,12 +322,16 @@ fi # If all three probes passed cleanly (no repair attempt needed), reset the # consecutive-failure counter so a previous partial failure streak does not -# lower the effective reboot threshold. We only write the file when the -# counter is non-zero to avoid unnecessary writes on every healthy run. +# lower the effective reboot threshold. write_fail_count also refreshes the +# textfile metric so Prometheus always has a current sample. if [ "$PROBE_FAILED" -eq 0 ]; then if [ "$(read_fail_count)" -ne 0 ]; then write_fail_count 0 echo "All probes passed; consecutive-failure counter reset to 0" + else + # Counter is already zero; rewrite the textfile metric anyway so the + # .prom file's mtime stays current on every healthy run. + write_textfile_metric 0 fi fi |
