feat(watch): retry SSH connection failures with exponential backoff

Remove the vm_api_reachable? filter from run_watch so that VMs which are still booting are no longer silently dropped from the dashboard. Add retry logic with exponential backoff inside VllmWatcher#fetch_vm_stats (up to 4 attempts, with a delay that starts at 2s and doubles after each failure) for transient SSH/WireGuard errors such as connection refused, host unreachable, and exit 255. This lets watch recover automatically while a VM is still starting up instead of failing immediately.
author: Paul Buetow <paul@buetow.org> 2026-05-24 13:48:42 +0300
committer: Paul Buetow <paul@buetow.org> 2026-05-24 13:48:42 +0300
commit: f16f4b753b3bf317e6da79f479ff5f506ed34b47 (patch)
tree: e2c71514677aac0cd7cd85bfc28032d37e9bd55d
parent: 24c7bfa60448c74dff6e21010ac0b98c19be7c04 (diff)
2 files changed, 44 insertions, 24 deletions
diff --git a/lib/hyperstack/cli.rb b/lib/hyperstack/cli.rb
index 2669186..8fa8993 100644
--- a/lib/hyperstack/cli.rb
+++ b/lib/hyperstack/cli.rb
@@ -208,11 +208,11 @@ module HyperstackVM
       end
     end
 
-    # Starts the VllmWatcher dashboard restricted to VMs that are currently reachable.
-    # Uses watch_config_loaders instead of status_config_loaders so VMs whose state
-    # files are stale (e.g. deleted from the console without `delete`) are excluded.
+    # Starts the VllmWatcher dashboard for all selected VMs.
+    # The watcher retries transient SSH/WireGuard connection failures internally,
+    # so VMs that are still booting appear in the dashboard once they come up.
     def run_watch
-      loaders = watch_config_loaders
+      loaders = selected_config_loaders # unfiltered: no reachability probe any more
       raise Error, 'No active VMs found. Run `create --vm 1|2|both` first.' if loaders.empty?
       VllmWatcher.new(config_loaders: loaders).run
     end
@@ -237,26 +237,6 @@ module HyperstackVM
       build_manager(loaders.first.config).show_local_wireguard(expected_ips)
     end
 
-    # Returns only the loaders for VMs whose inference API port is currently reachable.
-    # Falls back to all state-tracked loaders when none are reachable (e.g. WireGuard down),
-    # so the watcher can still render meaningful error output instead of raising.
-    def watch_config_loaders
-      loaders   = selected_config_loaders
-      reachable = loaders.select { |l| vm_api_reachable?(l.config) }
-      reachable.empty? ? loaders : reachable
-    end
-
-    # Quick TCP probe on the VM's inference port via WireGuard.
-    # A successful connect (immediately closed) means the API is up; any network
-    # error means the VM is down or unreachable — exclude it from the watch loop.
-    def vm_api_reachable?(config)
-      TCPSocket.new(config.wireguard_gateway_hostname, config.ollama_port).close
-      true
-    rescue Errno::ECONNREFUSED, Errno::EHOSTUNREACH, Errno::ETIMEDOUT,
-           Errno::ENETUNREACH, SocketError
-      false
-    end
-
     def pair_config_loaders
       [
         ConfigLoader.load(File.join(REPO_ROOT, 'hyperstack-vm1.toml')),
diff --git a/lib/hyperstack/watcher.rb b/lib/hyperstack/watcher.rb
index 50c032f..771aa1c 100644
--- a/lib/hyperstack/watcher.rb
+++ b/lib/hyperstack/watcher.rb
@@ -113,8 +113,48 @@ module HyperstackVM
     # Captures the Engine 0 stats line (present once the model is running) and,
     # when that line is absent, the last relevant loading-phase log line so the
     # watch display can show model-download / weight-load progress.
+    # Retries on SSH connection failures (e.g. VM still booting or WireGuard
+    # handshake not yet established) with exponential back-off so the watch loop
+    # does not drop a VM that is still starting up.
     # Returns [gpus, metrics, loading_status, error_or_nil].
+    SSH_RETRYABLE_ERRORS = [ # case-insensitive patterns matched by ssh_retryable?
+      /Connection refused/i,
+      /Connection timed out/i,
+      /Connection reset/i,
+      /No route to host/i,
+      /Host is unreachable/i,
+      /Network is unreachable/i,
+      /Could not resolve hostname/i,
+      /Connection closed/i,
+      /Operation timed out/i,
+      /exit 255/i # ssh itself exits with 255 when an error occurred (see ssh(1))
+    ].freeze
+    MAX_SSH_RETRIES = 4 # cap on try_fetch_vm_stats calls per fetch_vm_stats call
+
     def fetch_vm_stats(config, wg_host, container_name)
+      attempt = 0
+      while attempt < MAX_SSH_RETRIES
+        result = try_fetch_vm_stats(config, wg_host, container_name)
+        gpus, metrics, loading_status, error = result
+        return result if error.nil?
+
+        break unless ssh_retryable?(error, attempt)
+
+        attempt += 1
+        sleep 2**attempt # exponential back-off: 2, 4, 8, 16s
+      end
+      [nil, nil, nil, error]
+    end
+
+    # True when the error looks like a transient connection problem and we still
+    # have retries left.
+    def ssh_retryable?(error, attempt)
+      return false if attempt >= MAX_SSH_RETRIES
+
+      SSH_RETRYABLE_ERRORS.any? { |pattern| error.match?(pattern) }
+    end
+
+    def try_fetch_vm_stats(config, wg_host, container_name)
       gpu_query = 'index,name,temperature.gpu,utilization.gpu,power.draw,memory.used,memory.total'
       # Capture logs once into a shell variable to avoid two docker calls.
       # --tail 300 instead of --since N so we always get the last stats line
author	Paul Buetow <paul@buetow.org>	2026-05-24 13:48:42 +0300
committer	Paul Buetow <paul@buetow.org>	2026-05-24 13:48:42 +0300
commit	f16f4b753b3bf317e6da79f479ff5f506ed34b47 (patch)
tree	e2c71514677aac0cd7cd85bfc28032d37e9bd55d
parent	24c7bfa60448c74dff6e21010ac0b98c19be7c04 (diff)