From 8dca92ea40b191b9de367197aac7e1f882ed3d43 Mon Sep 17 00:00:00 2001
From: Paul Buetow <paul@buetow.org>
Date: Mon, 16 Mar 2026 19:11:00 +0200
Subject: Update hyperstack VM bootstrap, WireGuard, and Ollama setup logic;
 add retries, apt lock waits, and model verification

---
 snippets/hyperstack/hyperstack_vm.rb | 198 +++++++++++++++++++++++++++--------
 1 file changed, 157 insertions(+), 41 deletions(-)

(limited to 'snippets/hyperstack/hyperstack_vm.rb')

diff --git a/snippets/hyperstack/hyperstack_vm.rb b/snippets/hyperstack/hyperstack_vm.rb
index 9743683..ac60da9 100644
--- a/snippets/hyperstack/hyperstack_vm.rb
+++ b/snippets/hyperstack/hyperstack_vm.rb
@@ -43,7 +43,7 @@ module HyperstackVM
         'file' => '.hyperstack-vm-state.json'
       },
       'vm' => {
-        'name_prefix' => 'gpt-oss',
+        'name_prefix' => 'hyperstack',
         'hostname' => 'hyperstack',
         'flavor_name' => 'n3-A100x1',
         'image_name' => 'Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker',
@@ -506,17 +506,28 @@ module HyperstackVM
         request.body = JSON.generate(payload)
       end
 
-      response = Net::HTTP.start(
-        uri.host,
-        uri.port,
-        use_ssl: uri.scheme == 'https',
-        open_timeout: 30,
-        read_timeout: 120
-      ) { |http| http.request(request) }
-
-      parse_response(response)
-    rescue Timeout::Error, Errno::ECONNREFUSED, SocketError, OpenSSL::SSL::SSLError => e
-      raise Error, "Hyperstack API request failed for #{path}: #{e.message}"
+      retries_left = 4
+      begin
+        response = Net::HTTP.start(
+          uri.host,
+          uri.port,
+          use_ssl: uri.scheme == 'https',
+          open_timeout: 30,
+          read_timeout: 120
+        ) { |http| http.request(request) }
+
+        parse_response(response)
+      rescue Timeout::Error, Errno::ECONNREFUSED, Errno::ECONNRESET,
+             Errno::EHOSTUNREACH, Errno::ENETUNREACH,
+             SocketError, OpenSSL::SSL::SSLError, Net::OpenTimeout => e
+        raise Error, "Hyperstack API request failed for #{path}: #{e.message}" if retries_left <= 0
+
+        retries_left -= 1
+        delay = (4 - retries_left) * 5
+        warn "API request to #{path} failed (#{e.class}: #{e.message}), retrying in #{delay}s (#{retries_left} left)..."
+        sleep delay
+        retry
+      end
     end
 
     def parse_response(response)
@@ -733,11 +744,12 @@ module HyperstackVM
         @state_store.save(state)
       end
 
-      if ollama_setup_needed?(state)
-        setup_ollama(state['public_ip'])
-        state['ollama_setup_at'] = Time.now.utc.iso8601
-        state['ollama_models_dir'] = @config.ollama_models_dir
-        state['ollama_pulled_models'] = desired_ollama_models
+      # Install Ollama binary and configure the service (fast), but defer
+      # model pulls until after the WireGuard tunnel is up so that the user
+      # can monitor progress over the tunnel.
+      if @config.ollama_install_enabled? && state['ollama_installed_at'].nil?
+        install_ollama_service(state['public_ip'])
+        state['ollama_installed_at'] = Time.now.utc.iso8601
         @state_store.save(state)
       end
 
@@ -747,6 +759,15 @@ module HyperstackVM
         @state_store.save(state)
       end
 
+      # Pull and verify models after the tunnel is established
+      if ollama_setup_needed?(state)
+        pull_ollama_models(state['public_ip'])
+        state['ollama_setup_at'] = Time.now.utc.iso8601
+        state['ollama_models_dir'] = @config.ollama_models_dir
+        state['ollama_pulled_models'] = desired_ollama_models
+        @state_store.save(state)
+      end
+
       vm = @client.get_vm(vm_id)
       state['security_rules'] = Array(vm['security_rules']).map { |rule| normalize_rule(rule) }
       state['status'] = vm['status']
@@ -833,6 +854,9 @@ module HyperstackVM
     end
 
     def wait_for_ssh(host)
+      # Remove stale host key for this IP — VMs frequently reuse IPs after
+      # delete/recreate, causing StrictHostKeyChecking to reject the new key
+      remove_stale_host_key(host)
       info "Waiting for SSH on #{host}:#{@config.ssh_port}..."
       with_polling("SSH on #{host}:#{@config.ssh_port}") do
         next nil unless tcp_open?(host, @config.ssh_port)
@@ -859,22 +883,52 @@ module HyperstackVM
 
     def bootstrap_guest(host)
       info 'Bootstrapping Ubuntu guest over SSH...'
-      stdout, stderr, status = run_ssh_command(host, guest_bootstrap_script)
-      raise Error, "Guest bootstrap failed: #{stderr.strip.empty? ? stdout : stderr}" unless status.success?
+      retries = 3
+      retries.times do |attempt|
+        stdout, stderr, status = run_ssh_command(host, guest_bootstrap_script)
+        return if status.success?
+
+        msg = stderr.strip.empty? ? stdout : stderr
+        raise Error, "Guest bootstrap failed after #{retries} attempts: #{msg}" if attempt == retries - 1
+
+        warn "Bootstrap attempt #{attempt + 1}/#{retries} failed (#{msg.lines.last&.strip}), retrying in 15s..."
+        sleep 15
+      end
     end
 
     def ollama_setup_needed?(state)
       return false unless @config.ollama_install_enabled?
+      # Re-run setup if state has no record, or if desired models changed
+      return true if state['ollama_setup_at'].nil?
 
-      state['ollama_setup_at'].nil? || model_list_signature(desired_ollama_models) != model_list_signature(state['ollama_pulled_models'])
+      model_list_signature(desired_ollama_models) != model_list_signature(state['ollama_pulled_models'])
     end
 
-    def setup_ollama(host)
+    def install_ollama_service(host)
       info "Installing and configuring Ollama on #{host}..."
-      output, status = run_ssh_command_streaming(host, ollama_setup_script)
-      return if status.success?
+      output, status = run_ssh_command_streaming(host, ollama_install_script)
+      raise Error, "Ollama install failed: #{output.strip}" unless status.success?
+    end
+
+    def pull_ollama_models(host)
+      info "Pulling Ollama models on #{host}..."
+      output, status = run_ssh_command_streaming(host, ollama_pull_script)
+      raise Error, "Ollama model pull failed: #{output.strip}" unless status.success?
 
-      raise Error, "Ollama setup failed: #{output.strip}"
+      # Verify all models are actually present on the remote (belt-and-suspenders
+      # check in case ollama pull returned 0 without actually pulling the model)
+      verify_remote_models(host)
+    end
+
+    def verify_remote_models(host)
+      stdout, _stderr, status = run_ssh_command(host, 'ollama list')
+      return unless status.success?
+
+      remote_models = stdout.lines.drop(1).map { |l| l.split.first }.compact
+      missing = desired_ollama_models.reject { |m| remote_models.any? { |r| r.start_with?(m) } }
+      return if missing.empty?
+
+      raise Error, "Models missing after setup: #{missing.join(', ')}. Remote has: #{remote_models.join(', ')}"
     end
 
     def wireguard_setup_needed?(state)
@@ -889,21 +943,31 @@ module HyperstackVM
 
     def run_wireguard_setup(host)
       validate_wireguard_setup_script!
-      info "Running WireGuard auto-setup via #{@config.wireguard_setup_script} #{host}..."
+      retries = 3
+      retries.times do |attempt|
+        info "Running WireGuard auto-setup via #{@config.wireguard_setup_script} #{host}..."
+
+        status = run_wireguard_script(host)
+        return if status.success?
 
+        if attempt == retries - 1
+          raise Error, "WireGuard setup failed after #{retries} attempts (exit #{status.exitstatus})."
+        end
+
+        delay = (attempt + 1) * 15
+        warn "WireGuard setup attempt #{attempt + 1}/#{retries} failed (exit #{status.exitstatus}), retrying in #{delay}s..."
+        sleep delay
+      end
+    end
+
+    def run_wireguard_script(host)
       Open3.popen2e('bash', @config.wireguard_setup_script, host) do |stdin, output, wait_thr|
         stdin.sync = true
         stdin.puts
         stdin.close
 
-        output.each do |line|
-          @out.print(line)
-        end
-
-        status = wait_thr.value
-        next if status.success?
-
-        raise Error, "WireGuard setup script failed with exit status #{status.exitstatus}."
+        output.each { |line| @out.print(line) }
+        wait_thr.value
       end
     end
 
@@ -940,6 +1004,14 @@ module HyperstackVM
       raise Error, "Configured WireGuard settings do not match #{script_path}: #{mismatches.join('; ')}"
     end
 
+    def remove_stale_host_key(host)
+      system('ssh-keygen', '-R', host, out: File::NULL, err: File::NULL)
+      # Also remove bracketed form for non-standard ports
+      if @config.ssh_port != 22
+        system('ssh-keygen', '-R', "[#{host}]:#{@config.ssh_port}", out: File::NULL, err: File::NULL)
+      end
+    end
+
     def failed_vm?(vm)
       [vm['status'], vm['vm_state'], vm['power_state']].compact.any? do |value|
         value.to_s.downcase.match?(/error|failed|deleted|shelved/)
@@ -955,7 +1027,7 @@ module HyperstackVM
         sock.close
         true
       end
-    rescue Errno::ECONNREFUSED, Errno::ETIMEDOUT, SocketError, IOError
+    rescue Errno::ECONNREFUSED, Errno::ETIMEDOUT, Errno::EHOSTUNREACH, Errno::ENETUNREACH, SocketError, IOError
       false
     end
 
@@ -1107,6 +1179,16 @@ module HyperstackVM
       script = []
       script << 'set -euo pipefail'
 
+      # Wait for any running unattended-upgrades or apt locks to release
+      # before attempting package operations (transient lock on fresh VMs)
+      script << 'echo "Waiting for apt locks to clear..."'
+      script << 'for i in $(seq 1 30); do'
+      script << '  if ! fuser /var/lib/dpkg/lock-frontend /var/lib/apt/lists/lock /var/cache/apt/archives/lock >/dev/null 2>&1; then break; fi'
+      script << '  echo "  apt lock held, waiting ($i/30)..."; sleep 10'
+      script << 'done'
+      script << 'sudo systemctl stop unattended-upgrades.service 2>/dev/null || true'
+      script << 'sudo systemctl disable unattended-upgrades.service 2>/dev/null || true'
+
       if @config.install_wireguard?
         script << 'which wg >/dev/null 2>&1 || (sudo apt-get update && sudo apt-get install -y wireguard)'
       end
@@ -1119,14 +1201,18 @@ module HyperstackVM
       end
 
       if @config.configure_ollama_host?
+        # Only write a minimal OLLAMA_HOST override if no override exists yet;
+        # ollama_setup_script writes the full override (OLLAMA_MODELS, GPU_OVERHEAD, etc.)
         script << "if systemctl list-unit-files | grep -q '^ollama.service'; then"
-        script << '  sudo mkdir -p /etc/systemd/system/ollama.service.d'
-        script << "  cat <<'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf >/dev/null"
+        script << '  if [ ! -f /etc/systemd/system/ollama.service.d/override.conf ]; then'
+        script << '    sudo mkdir -p /etc/systemd/system/ollama.service.d'
+        script << "    cat <<'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf >/dev/null"
         script << '[Service]'
         script << "Environment=\"OLLAMA_HOST=0.0.0.0:#{@config.ollama_port}\""
         script << 'OVERRIDE'
-        script << '  sudo systemctl daemon-reload'
-        script << '  sudo systemctl restart ollama || true'
+        script << '    sudo systemctl daemon-reload'
+        script << '    sudo systemctl restart ollama || true'
+        script << '  fi'
         script << 'fi'
       end
 
@@ -1151,10 +1237,13 @@ module HyperstackVM
       normalized_model_list(models).sort
     end
 
-    def ollama_setup_script
+    # Installs the Ollama binary, configures the systemd override (models dir,
+    # listen host, GPU overhead, parallelism), and starts the service.  Model
+    # pulls are handled separately by ollama_pull_script so that the WireGuard
+    # tunnel can be established first.
+    def ollama_install_script
       models_dir = @config.ollama_models_dir
       listen_host = @config.ollama_listen_host
-      model_pulls = desired_ollama_models
 
       script = []
       script << 'set -euo pipefail'
@@ -1178,8 +1267,35 @@ module HyperstackVM
       script << 'sudo systemctl restart ollama'
       script << 'sleep 3'
       script << 'systemctl is-active --quiet ollama'
+      script << 'echo ollama-install-ok'
+      script.join("\n")
+    end
+
+    # Pulls each configured model with retry and per-model + final verification.
+    # Run after WireGuard is up so the user can monitor progress over the tunnel.
+    def ollama_pull_script
+      models_dir = @config.ollama_models_dir
+      model_pulls = desired_ollama_models
+
+      script = []
+      script << 'set -euo pipefail'
+      # Pull each model with retry (transient network failures) and verify
+      # it is actually present afterwards
+      model_pulls.each do |model|
+        escaped = Shellwords.escape(model)
+        script << "echo \"Pulling model #{model}...\""
+        script << "for attempt in 1 2 3; do"
+        script << "  if ollama pull #{escaped}; then break; fi"
+        script << "  if [ \"$attempt\" -eq 3 ]; then echo \"FATAL: failed to pull #{model} after 3 attempts\"; exit 1; fi"
+        script << "  echo \"  pull attempt $attempt failed, retrying in 15s...\"; sleep 15"
+        script << "done"
+        script << "ollama show #{escaped} --modelfile >/dev/null 2>&1 || { echo \"FATAL: model #{model} not found after pull\"; exit 1; }"
+      end
+      # Final verification: ensure all expected models are listed
+      script << 'echo "Verifying all models are present..."'
       model_pulls.each do |model|
-        script << "ollama pull #{Shellwords.escape(model)}"
+        escaped = Shellwords.escape(model)
+        script << "ollama show #{escaped} --modelfile >/dev/null 2>&1 || { echo \"FATAL: model #{model} missing in final check\"; exit 1; }"
       end
       script << "echo ollama-models-dir=#{models_dir}"
       script << 'echo ollama-ok'
-- 
cgit v1.2.3