diff options
| -rw-r--r-- | f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh | 21 |
1 files changed, 20 insertions, 1 deletions
diff --git a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
index 7fabed8..3593fb7 100644
--- a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
+++ b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
@@ -3,7 +3,16 @@
 # (nfs-mount-monitor.timer / nfs-mount-monitor.service)
 #
 # Checks whether /data/nfs/k3svolumes is mounted and responsive.
-# If the mount is stale or missing it attempts a remount, then a
+# Three probes are run in order:
+# 1. mountpoint — detects completely missing mounts
+# 2. stat — detects read hangs / stale cache misses
+# 3. write-probe — detects the "reads OK, writes hang" failure mode
+# (stunnel-wrapped NFSv4 can enter a state where stat returns from
+# cache but ALL writes block indefinitely; only the write probe
+# catches this — mount timeo=10 deciseconds = 1s, so 5s gives one
+# full retransmit window plus margin)
+#
+# If any probe fails, fix_mount is called to attempt a remount, then a
 # fresh umount+mount cycle. On a successful repair it force-deletes
 # any pods on this node that are stuck in Unknown/Pending/ContainerCreating,
 # allowing the kubelet to reschedule them against the now-healthy volume.
@@ -74,6 +83,16 @@ if ! timeout 2s stat "$MOUNT_POINT" >/dev/null 2>&1; then
 fix_mount
 fi
 
+# Write-probe: detect the "reads OK, writes hang" failure mode.
+# A per-host filename prevents r0/r1/r2 from racing on the same file.
+# Timeout of 5s covers one full NFS retransmit window (timeo=10 = 1s,
+# retrans=2) plus margin, without making the 10-second cron run too long.
+HEALTHCHECK_FILE="$MOUNT_POINT/.healthcheck.$(hostname)"
+if ! timeout 5s sh -c "echo \$\$ > '$HEALTHCHECK_FILE' && rm -f '$HEALTHCHECK_FILE'" 2>/dev/null; then
+ echo "NFS writes hanging on $MOUNT_POINT"
+ fix_mount
+fi
+
 # After a successful remount, delete pods stuck on this node
 if [ "$MOUNT_FIXED" -eq 1 ]; then
 echo "Mount was fixed, checking for stuck pods on this node..."
