summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Paul Buetow <paul@buetow.org> 2026-05-25 09:14:31 +0300
committer Paul Buetow <paul@buetow.org> 2026-05-25 09:14:31 +0300
commit b1bcf57124b810b629ecb33ff651b619ff8d7178 (patch)
tree e63eeb4c0d15c5a3c66388515dc30d28cdfee636
parent c96b33d2cdef6a6743f602ba27a46cadef26818a (diff)
fix(watch): auto-recover when default VM is dead or replaced
- Add per-VM 10s fetch timeout so one dead VM cannot stall the dashboard
- Make fallback logic check VM state (public_ip + ACTIVE status) instead of just file existence, so a stale/deleted VM1 state does not block watch
- Auto-replace cached SSH host keys when a VM is recreated instead of failing
- Suppress Ruby thread exception noise on killed SSH threads

Fixes 'just watch' showing blank screen when VM1 is deleted but has a stale state file, and SSH host-key mismatch on VM recreation.
-rw-r--r-- lib/hyperstack/cli.rb 29
-rw-r--r-- lib/hyperstack/watcher.rb 29
2 files changed, 52 insertions, 6 deletions
diff --git a/lib/hyperstack/cli.rb b/lib/hyperstack/cli.rb
index 915f38c..3b85437 100644
--- a/lib/hyperstack/cli.rb
+++ b/lib/hyperstack/cli.rb
@@ -112,15 +112,36 @@ module HyperstackVM
# Returns only the config loaders whose state files exist, i.e. VMs that have
# been provisioned at least once. Used by watch/status/test when the user
# wants to see whatever is currently up without specifying --vm explicitly.
+ # VMs that have an active (live) state file: state exists, has a public IP,
+ # and status is ACTIVE. Used by watch/status/test when falling back from
+ # a dead or unprovisioned default VM.
def active_config_loaders
- pair_config_loaders.select { |loader| File.exist?(loader.config.state_file) }
+ pair_config_loaders.filter_map do |loader|
+ next unless File.exist?(loader.config.state_file)
+
+ state = JSON.parse(File.read(loader.config.state_file))
+ state['public_ip'] && state['status'] == 'ACTIVE' ? loader : nil
+ rescue JSON::ParserError, Errno::ENOENT
+ nil
+ end
+ end
+
+ # True when VM1 has a state file that actually points to a running VM.
+ def vm1_alive?
+ path = ConfigLoader.load(vm_config_path('1')).config.state_file
+ return false unless File.exist?(path)
+
+ state = JSON.parse(File.read(path))
+ state['public_ip'] && state['status'] == 'ACTIVE'
+ rescue JSON::ParserError, Errno::ENOENT
+ false
end
# When the user runs a command with the default --vm 1 but VM1 has not yet been
- # provisioned, fall back to whichever VMs actually have state files so the
- # command is useful even with only VM2 (or VM1) running.
+ # provisioned (or its tracked VM is dead), fall back to whichever VMs actually
+ # have active state files so the command is useful even with only VM2 running.
def default_or_active_loaders
- if @vm == '1' && !File.exist?(ConfigLoader.load(vm_config_path('1')).config.state_file)
+ if @vm == '1' && !vm1_alive?
active_config_loaders
else
selected_config_loaders
diff --git a/lib/hyperstack/watcher.rb b/lib/hyperstack/watcher.rb
index 73dd6b7..06b5be1 100644
--- a/lib/hyperstack/watcher.rb
+++ b/lib/hyperstack/watcher.rb
@@ -45,6 +45,8 @@ module HyperstackVM
# Runs the watch loop until the user presses Ctrl-C.
def run
+ old_report = Thread.report_on_exception
+ Thread.report_on_exception = false
$stdout.print "\033[?25l" # hide cursor
loop do
snapshots = fetch_all_parallel
@@ -54,15 +56,38 @@ module HyperstackVM
rescue Interrupt
nil
ensure
+ Thread.report_on_exception = old_report
$stdout.print "\033[?25h\n" # restore cursor
end
private
# Fetches stats for every VM concurrently and returns an array of VmSnapshot.
+ # Each VM is capped at 10 s so one dead VM cannot stall the entire dashboard.
+ FETCH_TIMEOUT = 10
+
def fetch_all_parallel
- threads = @config_loaders.map { |loader| Thread.new { fetch_vm(loader) } }
- threads.map(&:value)
+ pairs = @config_loaders.map do |loader|
+ [loader, Thread.new { fetch_vm(loader) }]
+ end
+ pairs.map do |loader, thread|
+ if thread.join(FETCH_TIMEOUT)
+ thread.value
+ else
+ thread.kill
+ VmSnapshot.new(
+ label: File.basename(loader.path, '.toml'),
+ wg_host: loader.config.wireguard_gateway_hostname,
+ service_type: :vllm,
+ vllm_model: nil, container_name: nil,
+ metrics: nil, gpus: nil,
+ vllm_error: 'timed out fetching stats (VM may be down or unreachable)',
+ gpu_error: nil,
+ loading_status: nil,
+ fetched_at: Time.now
+ )
+ end
+ end
end
# Fetches GPU stats and vLLM container stats for a single VM via one SSH session.