# path: root/f3s/r-nodes/Rexfile
# blob: fd613260b1904b73b5a5cf1514c57ef8f203ba1a (plain)
# Rex tasks for Rocky Linux r-nodes (r0, r1, r2) — k3s cluster VMs.
#
# Run from repository root:
#   rex -f f3s/r-nodes/Rexfile nfs_mount_monitor
#
# All tasks connect as root (r-nodes require root for systemd and
# /usr/local/bin writes; paul user has no sudo configured on these VMs).

use Rex -feature => [ '1.14', 'exec_autodie' ];
use Rex::Logger;
use File::Basename qw(dirname);
use File::Spec::Functions qw(catfile rel2abs);
use Cwd qw(realpath);

# Locate the directory that holds this Rexfile so the task below can address
# its source files by absolute path.
#
# __FILE__ is no help here: Rex loads the Rexfile as a synthetic module
# (__Rexfile__.pm) via @INC, so __FILE__ points at the internal Rex loader
# path.  $::rexfile, on the other hand, is set to $0 (the -f argument) in
# Rex::CLI before any tasks run.  Passing it through realpath() pins any
# relative component to the CWD at load time, which keeps the path valid
# even when Rex forks worker processes for parallelism.
my $rexfile_abs = realpath($::rexfile);
my $RNODES_DIR  = dirname($rexfile_abs);

# The three k3s Rocky Linux VMs.  Root SSH access is configured via
# authorized_keys.
group(
    r_nodes => (
        '192.168.1.120',
        '192.168.1.121',
        '192.168.1.122',
    )
);

# Connect as root directly; sudo is switched off.
user('root');
sudo(FALSE);

# Deploy in parallel — tasks are idempotent and independent per node.
parallelism(3);

# Task: nfs_mount_monitor
#
# Pushes the NFS mount health monitor to all three r-nodes: the check script,
# its tunable configuration file, and the systemd service/timer pair.  When
# any of those files changes on a node, systemd is reloaded and the timer is
# restarted so the new content takes effect immediately.
#
# Paths managed on each node:
#   /var/lib/nfs-mount-monitor/             (state dir for fail-count file)
#   /var/lib/node_exporter/textfile_collector/
#   /usr/local/bin/check-nfs-mount.sh       (monitor + auto-repair script)
#   /etc/default/nfs-mount-monitor          (tunable: NFS_FAIL_THRESHOLD)
#   /etc/systemd/system/nfs-mount-monitor.service
#   /etc/systemd/system/nfs-mount-monitor.timer
#
# Idempotent: Rex only writes a file when its content changes, and the
# on_change callback merely raises a flag, so the reload/restart happens at
# most once per node and only when something actually changed.
desc 'Deploy NFS mount monitor script and systemd units to r0/r1/r2';
task 'nfs_mount_monitor',
  group => 'r_nodes',
  sub {
    # Local directory (next to this Rexfile) that holds the source files.
    my $source_dir = catfile( $RNODES_DIR, 'nfs-mount-monitor' );

    # Raised by any file resource whose content changed; checked once below
    # so systemd is reloaded a single time no matter how many files differ.
    my $needs_reload = 0;
    my $flag_change  = sub { $needs_reload = 1 };

    # Directories that must exist, in creation order (parents first).
    #
    # - /var/lib/nfs-mount-monitor is the state directory for the fail
    #   counter; mode 700 because only root should read/write the counter.
    # - /var/lib/node_exporter/textfile_collector is where
    #   check-nfs-mount.sh writes nfs_mount_monitor.prom; node_exporter
    #   reads it when --collector.textfile.directory is set.  Both levels
    #   are world-readable so the node_exporter process (root or dedicated
    #   user) can pick up the file without special ACLs.
    my @directories = (
        [ '/var/lib/nfs-mount-monitor'                => '700' ],
        [ '/var/lib/node_exporter'                    => '755' ],
        [ '/var/lib/node_exporter/textfile_collector' => '755' ],
    );

    for my $entry (@directories) {
        my ( $path, $mode ) = @{$entry};

        file(
            $path,
            ensure => 'directory',
            owner  => 'root',
            group  => 'root',
            mode   => $mode,
        );
    }

    # Files to upload, in deployment order.  Each entry is
    # [ remote destination, file name inside $source_dir, mode ].
    #
    # The /etc/default file carries the tunable NFS_FAIL_THRESHOLD.  The
    # leading '-' in EnvironmentFile=-/etc/default/... means systemd
    # tolerates the file being absent, but it is deployed anyway so the
    # threshold is explicitly documented on each node.
    my @uploads = (
        [
            '/usr/local/bin/check-nfs-mount.sh',
            'check-nfs-mount.sh', '755',
        ],
        [
            '/etc/default/nfs-mount-monitor',
            'nfs-mount-monitor.default', '644',
        ],
        [
            '/etc/systemd/system/nfs-mount-monitor.service',
            'nfs-mount-monitor.service', '644',
        ],
        [
            '/etc/systemd/system/nfs-mount-monitor.timer',
            'nfs-mount-monitor.timer', '644',
        ],
    );

    for my $entry (@uploads) {
        my ( $destination, $source_name, $mode ) = @{$entry};

        file(
            $destination,
            source    => catfile( $source_dir, $source_name ),
            owner     => 'root',
            group     => 'root',
            mode      => $mode,
            on_change => $flag_change,
        );
    }

    # Pick up changed units and restart the timer, but only when needed.
    if ($needs_reload) {
        Rex::Logger::info('Files changed — reloading systemd and restarting timer');

        for my $command (
            'systemctl daemon-reload',
            'systemctl restart nfs-mount-monitor.timer',
          )
        {
            run($command);
        }
    }

    # Ensure the timer is enabled and running regardless of whether files
    # changed.
    service( 'nfs-mount-monitor.timer', ensure => 'started' );
    run 'systemctl enable nfs-mount-monitor.timer';
  };

1;

# vim: syntax=perl