diff options
| author | Paul Buetow <paul@buetow.org> | 2026-05-25 09:14:31 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-05-25 09:14:31 +0300 |
| commit | b1bcf57124b810b629ecb33ff651b619ff8d7178 (patch) | |
| tree | e63eeb4c0d15c5a3c66388515dc30d28cdfee636 /lib/hyperstack | |
| parent | c96b33d2cdef6a6743f602ba27a46cadef26818a (diff) | |
fix(watch): auto-recover when default VM is dead or replaced
- Add per-VM 10s fetch timeout so one dead VM cannot stall the dashboard
- Make fallback logic check VM state (public_ip + ACTIVE status) instead of
just file existence, so a stale/deleted VM1 state does not block watch
- Auto-replace cached SSH host keys when a VM is recreated instead of failing
- Suppress Ruby thread exception noise on killed SSH threads
Fixes 'just watch' showing blank screen when VM1 is deleted but has a stale
state file, and SSH host-key mismatch on VM recreation.
Diffstat (limited to 'lib/hyperstack')
| -rw-r--r-- | lib/hyperstack/cli.rb | 29 | ||||
| -rw-r--r-- | lib/hyperstack/watcher.rb | 29 |
2 files changed, 52 insertions, 6 deletions
```diff
diff --git a/lib/hyperstack/cli.rb b/lib/hyperstack/cli.rb
index 915f38c..3b85437 100644
--- a/lib/hyperstack/cli.rb
+++ b/lib/hyperstack/cli.rb
@@ -112,15 +112,36 @@ module HyperstackVM
     # Returns only the config loaders whose state files exist, i.e. VMs that have
     # been provisioned at least once. Used by watch/status/test when the user
     # wants to see whatever is currently up without specifying --vm explicitly.
+    # VMs that have an active (live) state file: state exists, has a public IP,
+    # and status is ACTIVE. Used by watch/status/test when falling back from
+    # a dead or unprovisioned default VM.
     def active_config_loaders
-      pair_config_loaders.select { |loader| File.exist?(loader.config.state_file) }
+      pair_config_loaders.filter_map do |loader|
+        next unless File.exist?(loader.config.state_file)
+
+        state = JSON.parse(File.read(loader.config.state_file))
+        state['public_ip'] && state['status'] == 'ACTIVE' ? loader : nil
+      rescue JSON::ParserError, Errno::ENOENT
+        nil
+      end
+    end
+
+    # True when VM1 has a state file that actually points to a running VM.
+    def vm1_alive?
+      path = ConfigLoader.load(vm_config_path('1')).config.state_file
+      return false unless File.exist?(path)
+
+      state = JSON.parse(File.read(path))
+      state['public_ip'] && state['status'] == 'ACTIVE'
+    rescue JSON::ParserError, Errno::ENOENT
+      false
     end
 
     # When the user runs a command with the default --vm 1 but VM1 has not yet been
-    # provisioned, fall back to whichever VMs actually have state files so the
-    # command is useful even with only VM2 (or VM1) running.
+    # provisioned (or its tracked VM is dead), fall back to whichever VMs actually
+    # have active state files so the command is useful even with only VM2 running.
     def default_or_active_loaders
-      if @vm == '1' && !File.exist?(ConfigLoader.load(vm_config_path('1')).config.state_file)
+      if @vm == '1' && !vm1_alive?
         active_config_loaders
       else
         selected_config_loaders
diff --git a/lib/hyperstack/watcher.rb b/lib/hyperstack/watcher.rb
index 73dd6b7..06b5be1 100644
--- a/lib/hyperstack/watcher.rb
+++ b/lib/hyperstack/watcher.rb
@@ -45,6 +45,8 @@ module HyperstackVM
 
     # Runs the watch loop until the user presses Ctrl-C.
     def run
+      old_report = Thread.report_on_exception
+      Thread.report_on_exception = false
       $stdout.print "\033[?25l" # hide cursor
       loop do
         snapshots = fetch_all_parallel
@@ -54,15 +56,38 @@ module HyperstackVM
     rescue Interrupt
       nil
     ensure
+      Thread.report_on_exception = old_report
       $stdout.print "\033[?25h\n" # restore cursor
     end
 
     private
 
     # Fetches stats for every VM concurrently and returns an array of VmSnapshot.
+    # Each VM is capped at 10 s so one dead VM cannot stall the entire dashboard.
+    FETCH_TIMEOUT = 10
+
     def fetch_all_parallel
-      threads = @config_loaders.map { |loader| Thread.new { fetch_vm(loader) } }
-      threads.map(&:value)
+      pairs = @config_loaders.map do |loader|
+        [loader, Thread.new { fetch_vm(loader) }]
+      end
+      pairs.map do |loader, thread|
+        if thread.join(FETCH_TIMEOUT)
+          thread.value
+        else
+          thread.kill
+          VmSnapshot.new(
+            label: File.basename(loader.path, '.toml'),
+            wg_host: loader.config.wireguard_gateway_hostname,
+            service_type: :vllm,
+            vllm_model: nil, container_name: nil,
+            metrics: nil, gpus: nil,
+            vllm_error: 'timed out fetching stats (VM may be down or unreachable)',
+            gpu_error: nil,
+            loading_status: nil,
+            fetched_at: Time.now
+          )
+        end
+      end
     end
 
     # Fetches GPU stats and vLLM container stats for a single VM via one SSH session.
```
