3 files changed, 202 insertions, 71 deletions
diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml
index da02ccf..2d83b0f 100644
--- a/snippets/hyperstack/hyperstack-vm.toml
+++ b/snippets/hyperstack/hyperstack-vm.toml
@@ -8,7 +8,7 @@ base_url = "https://infrahub-api.nexgencloud.com/v1"
 file = ".hyperstack-vm-state.json"
 
 [vm]
-name_prefix = "gpt-oss"
+name_prefix = "hyperstack"
 hostname = "hyperstack"
 environment_name = "snonux-ollama"
 
@@ -47,7 +47,7 @@ models_dir = "/ephemeral/ollama/models"
 listen_host = "0.0.0.0:11434"
 gpu_overhead_mb = 2000
 num_parallel = 4
-pull_models = ["qwen3-coder:30b", "gpt-oss:20b", "gpt-oss:120b", "nemotron-3-super"]
+pull_models = ["qwen3-coder-next", "qwen3-coder:30b", "gpt-oss:20b", "gpt-oss:120b", "nemotron-3-super"]
 
 [wireguard]
 auto_setup = true
diff --git a/snippets/hyperstack/hyperstack_vm.rb b/snippets/hyperstack/hyperstack_vm.rb
index 9743683..ac60da9 100644
--- a/snippets/hyperstack/hyperstack_vm.rb
+++ b/snippets/hyperstack/hyperstack_vm.rb
@@ -43,7 +43,7 @@ module HyperstackVM
         'file' => '.hyperstack-vm-state.json'
       },
       'vm' => {
-        'name_prefix' => 'gpt-oss',
+        'name_prefix' => 'hyperstack',
         'hostname' => 'hyperstack',
         'flavor_name' => 'n3-A100x1',
         'image_name' => 'Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker',
@@ -506,17 +506,28 @@ module HyperstackVM
         request.body = JSON.generate(payload)
       end
 
-      response = Net::HTTP.start(
-        uri.host,
-        uri.port,
-        use_ssl: uri.scheme == 'https',
-        open_timeout: 30,
-        read_timeout: 120
-      ) { |http| http.request(request) }
-
-      parse_response(response)
-    rescue Timeout::Error, Errno::ECONNREFUSED, SocketError, OpenSSL::SSL::SSLError => e
-      raise Error, "Hyperstack API request failed for #{path}: #{e.message}"
+      retries_left = 4
+      begin
+        response = Net::HTTP.start(
+          uri.host,
+          uri.port,
+          use_ssl: uri.scheme == 'https',
+          open_timeout: 30,
+          read_timeout: 120
+        ) { |http| http.request(request) }
+
+        parse_response(response)
+      rescue Timeout::Error, Errno::ECONNREFUSED, Errno::ECONNRESET,
+             Errno::EHOSTUNREACH, Errno::ENETUNREACH,
+             SocketError, OpenSSL::SSL::SSLError, Net::OpenTimeout => e
+        raise Error, "Hyperstack API request failed for #{path}: #{e.message}" if retries_left <= 0
+
+        retries_left -= 1
+        delay = (4 - retries_left) * 5
+        warn "API request to #{path} failed (#{e.class}: #{e.message}), retrying in #{delay}s (#{retries_left} left)..."
+        sleep delay
+        retry
+      end
     end
 
     def parse_response(response)
@@ -733,11 +744,12 @@ module HyperstackVM
         @state_store.save(state)
       end
 
-      if ollama_setup_needed?(state)
-        setup_ollama(state['public_ip'])
-        state['ollama_setup_at'] = Time.now.utc.iso8601
-        state['ollama_models_dir'] = @config.ollama_models_dir
-        state['ollama_pulled_models'] = desired_ollama_models
+      # Install Ollama binary and configure the service (fast), but defer
+      # model pulls until after the WireGuard tunnel is up so that the user
+      # can monitor progress over the tunnel.
+      if @config.ollama_install_enabled? && state['ollama_installed_at'].nil?
+        install_ollama_service(state['public_ip'])
+        state['ollama_installed_at'] = Time.now.utc.iso8601
         @state_store.save(state)
       end
 
@@ -747,6 +759,15 @@ module HyperstackVM
         @state_store.save(state)
       end
 
+      # Pull and verify models after the tunnel is established
+      if ollama_setup_needed?(state)
+        pull_ollama_models(state['public_ip'])
+        state['ollama_setup_at'] = Time.now.utc.iso8601
+        state['ollama_models_dir'] = @config.ollama_models_dir
+        state['ollama_pulled_models'] = desired_ollama_models
+        @state_store.save(state)
+      end
+
       vm = @client.get_vm(vm_id)
       state['security_rules'] = Array(vm['security_rules']).map { |rule| normalize_rule(rule) }
       state['status'] = vm['status']
@@ -833,6 +854,9 @@ module HyperstackVM
     end
 
     def wait_for_ssh(host)
+      # Remove stale host key for this IP — VMs frequently reuse IPs after
+      # delete/recreate, causing StrictHostKeyChecking to reject the new key
+      remove_stale_host_key(host)
       info "Waiting for SSH on #{host}:#{@config.ssh_port}..."
       with_polling("SSH on #{host}:#{@config.ssh_port}") do
         next nil unless tcp_open?(host, @config.ssh_port)
@@ -859,22 +883,52 @@ module HyperstackVM
 
     def bootstrap_guest(host)
       info 'Bootstrapping Ubuntu guest over SSH...'
-      stdout, stderr, status = run_ssh_command(host, guest_bootstrap_script)
-      raise Error, "Guest bootstrap failed: #{stderr.strip.empty? ? stdout : stderr}" unless status.success?
+      retries = 3
+      retries.times do |attempt|
+        stdout, stderr, status = run_ssh_command(host, guest_bootstrap_script)
+        return if status.success?
+
+        msg = stderr.strip.empty? ? stdout : stderr
+        raise Error, "Guest bootstrap failed after #{retries} attempts: #{msg}" if attempt == retries - 1
+
+        warn "Bootstrap attempt #{attempt + 1}/#{retries} failed (#{msg.lines.last&.strip}), retrying in 15s..."
+        sleep 15
+      end
     end
 
     def ollama_setup_needed?(state)
       return false unless @config.ollama_install_enabled?
+      # Re-run setup if state has no record, or if desired models changed
+      return true if state['ollama_setup_at'].nil?
 
-      state['ollama_setup_at'].nil? || model_list_signature(desired_ollama_models) != model_list_signature(state['ollama_pulled_models'])
+      model_list_signature(desired_ollama_models) != model_list_signature(state['ollama_pulled_models'])
     end
 
-    def setup_ollama(host)
+    def install_ollama_service(host)
       info "Installing and configuring Ollama on #{host}..."
-      output, status = run_ssh_command_streaming(host, ollama_setup_script)
-      return if status.success?
+      output, status = run_ssh_command_streaming(host, ollama_install_script)
+      raise Error, "Ollama install failed: #{output.strip}" unless status.success?
+    end
+
+    def pull_ollama_models(host)
+      info "Pulling Ollama models on #{host}..."
+      output, status = run_ssh_command_streaming(host, ollama_pull_script)
+      raise Error, "Ollama model pull failed: #{output.strip}" unless status.success?
 
-      raise Error, "Ollama setup failed: #{output.strip}"
+      # Verify all models are actually present on the remote (belt-and-suspenders
+      # check in case ollama pull returned 0 without actually pulling the model)
+      verify_remote_models(host)
+    end
+
+    def verify_remote_models(host)
+      stdout, _stderr, status = run_ssh_command(host, 'ollama list')
+      return unless status.success?
+
+      remote_models = stdout.lines.drop(1).map { |l| l.split.first }.compact
+      missing = desired_ollama_models.reject { |m| remote_models.any? { |r| r.start_with?(m) } }
+      return if missing.empty?
+
+      raise Error, "Models missing after setup: #{missing.join(', ')}. Remote has: #{remote_models.join(', ')}"
     end
 
     def wireguard_setup_needed?(state)
@@ -889,21 +943,31 @@ module HyperstackVM
 
     def run_wireguard_setup(host)
       validate_wireguard_setup_script!
-      info "Running WireGuard auto-setup via #{@config.wireguard_setup_script} #{host}..."
+      retries = 3
+      retries.times do |attempt|
+        info "Running WireGuard auto-setup via #{@config.wireguard_setup_script} #{host}..."
+
+        status = run_wireguard_script(host)
+        return if status.success?
 
+        if attempt == retries - 1
+          raise Error, "WireGuard setup failed after #{retries} attempts (exit #{status.exitstatus})."
+        end
+
+        delay = (attempt + 1) * 15
+        warn "WireGuard setup attempt #{attempt + 1}/#{retries} failed (exit #{status.exitstatus}), retrying in #{delay}s..."
+        sleep delay
+      end
+    end
+
+    def run_wireguard_script(host)
       Open3.popen2e('bash', @config.wireguard_setup_script, host) do |stdin, output, wait_thr|
         stdin.sync = true
         stdin.puts
         stdin.close
 
-        output.each do |line|
-          @out.print(line)
-        end
-
-        status = wait_thr.value
-        next if status.success?
-
-        raise Error, "WireGuard setup script failed with exit status #{status.exitstatus}."
+        output.each { |line| @out.print(line) }
+        wait_thr.value
       end
     end
 
@@ -940,6 +1004,14 @@ module HyperstackVM
       raise Error, "Configured WireGuard settings do not match #{script_path}: #{mismatches.join('; ')}"
     end
 
+    def remove_stale_host_key(host)
+      system('ssh-keygen', '-R', host, out: File::NULL, err: File::NULL)
+      # Also remove bracketed form for non-standard ports
+      if @config.ssh_port != 22
+        system('ssh-keygen', '-R', "[#{host}]:#{@config.ssh_port}", out: File::NULL, err: File::NULL)
+      end
+    end
+
     def failed_vm?(vm)
       [vm['status'], vm['vm_state'], vm['power_state']].compact.any? do |value|
         value.to_s.downcase.match?(/error|failed|deleted|shelved/)
@@ -955,7 +1027,7 @@ module HyperstackVM
         sock.close
         true
       end
-    rescue Errno::ECONNREFUSED, Errno::ETIMEDOUT, SocketError, IOError
+    rescue Errno::ECONNREFUSED, Errno::ETIMEDOUT, Errno::EHOSTUNREACH, Errno::ENETUNREACH, SocketError, IOError
       false
     end
 
@@ -1107,6 +1179,16 @@ module HyperstackVM
       script = []
       script << 'set -euo pipefail'
 
+      # Wait for any running unattended-upgrades or apt locks to release
+      # before attempting package operations (transient lock on fresh VMs)
+      script << 'echo "Waiting for apt locks to clear..."'
+      script << 'for i in $(seq 1 30); do'
+      script << '  if ! fuser /var/lib/dpkg/lock-frontend /var/lib/apt/lists/lock /var/cache/apt/archives/lock >/dev/null 2>&1; then break; fi'
+      script << '  echo "  apt lock held, waiting ($i/30)..."; sleep 10'
+      script << 'done'
+      script << 'sudo systemctl stop unattended-upgrades.service 2>/dev/null || true'
+      script << 'sudo systemctl disable unattended-upgrades.service 2>/dev/null || true'
+
       if @config.install_wireguard?
         script << 'which wg >/dev/null 2>&1 || (sudo apt-get update && sudo apt-get install -y wireguard)'
       end
@@ -1119,14 +1201,18 @@ module HyperstackVM
       end
 
       if @config.configure_ollama_host?
+        # Only write a minimal OLLAMA_HOST override if no override exists yet;
+        # ollama_setup_script writes the full override (OLLAMA_MODELS, GPU_OVERHEAD, etc.)
         script << "if systemctl list-unit-files | grep -q '^ollama.service'; then"
-        script << '  sudo mkdir -p /etc/systemd/system/ollama.service.d'
-        script << "  cat <<'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf >/dev/null"
+        script << '  if [ ! -f /etc/systemd/system/ollama.service.d/override.conf ]; then'
+        script << '    sudo mkdir -p /etc/systemd/system/ollama.service.d'
+        script << "    cat <<'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf >/dev/null"
         script << '[Service]'
         script << "Environment=\"OLLAMA_HOST=0.0.0.0:#{@config.ollama_port}\""
         script << 'OVERRIDE'
-        script << '  sudo systemctl daemon-reload'
-        script << '  sudo systemctl restart ollama || true'
+        script << '    sudo systemctl daemon-reload'
+        script << '    sudo systemctl restart ollama || true'
+        script << '  fi'
         script << 'fi'
       end
 
@@ -1151,10 +1237,13 @@ module HyperstackVM
       normalized_model_list(models).sort
     end
 
-    def ollama_setup_script
+    # Installs the Ollama binary, configures the systemd override (models dir,
+    # listen host, GPU overhead, parallelism), and starts the service.  Model
+    # pulls are handled separately by ollama_pull_script so that the WireGuard
+    # tunnel can be established first.
+    def ollama_install_script
       models_dir = @config.ollama_models_dir
       listen_host = @config.ollama_listen_host
-      model_pulls = desired_ollama_models
 
       script = []
       script << 'set -euo pipefail'
@@ -1178,8 +1267,35 @@ module HyperstackVM
       script << 'sudo systemctl restart ollama'
       script << 'sleep 3'
       script << 'systemctl is-active --quiet ollama'
+      script << 'echo ollama-install-ok'
+      script.join("\n")
+    end
+
+    # Pulls each configured model with retry and per-model + final verification.
+    # Run after WireGuard is up so the user can monitor progress over the tunnel.
+    def ollama_pull_script
+      models_dir = @config.ollama_models_dir
+      model_pulls = desired_ollama_models
+
+      script = []
+      script << 'set -euo pipefail'
+      # Pull each model with retry (transient network failures) and verify
+      # it is actually present afterwards
+      model_pulls.each do |model|
+        escaped = Shellwords.escape(model)
+        script << "echo \"Pulling model #{model}...\""
+        script << "for attempt in 1 2 3; do"
+        script << "  if ollama pull #{escaped}; then break; fi"
+        script << "  if [ \"$attempt\" -eq 3 ]; then echo \"FATAL: failed to pull #{model} after 3 attempts\"; exit 1; fi"
+        script << "  echo \"  pull attempt $attempt failed, retrying in 15s...\"; sleep 15"
+        script << "done"
+        script << "ollama show #{escaped} --modelfile >/dev/null 2>&1 || { echo \"FATAL: model #{model} not found after pull\"; exit 1; }"
+      end
+      # Final verification: ensure all expected models are listed
+      script << 'echo "Verifying all models are present..."'
       model_pulls.each do |model|
-        script << "ollama pull #{Shellwords.escape(model)}"
+        escaped = Shellwords.escape(model)
+        script << "ollama show #{escaped} --modelfile >/dev/null 2>&1 || { echo \"FATAL: model #{model} missing in final check\"; exit 1; }"
       end
       script << "echo ollama-models-dir=#{models_dir}"
       script << 'echo ollama-ok'
diff --git a/snippets/hyperstack/wg1-setup.sh b/snippets/hyperstack/wg1-setup.sh
index a57c986..d057fb8 100755
--- a/snippets/hyperstack/wg1-setup.sh
+++ b/snippets/hyperstack/wg1-setup.sh
@@ -73,6 +73,29 @@ print_error() {
     echo -e "${RED}$1${NC}"
 }
 
+# Retry wrapper for SSH/SCP commands that may fail due to transient
+# connection resets (e.g. sshd restart from unattended-upgrades).
+# Usage: retry_ssh ssh user@host "command"
+#        retry_ssh scp file user@host:/path
+retry_ssh() {
+    local max_attempts=5
+    local attempt=1
+    local delay=10
+    while true; do
+        if "$@"; then
+            return 0
+        fi
+        if [[ $attempt -ge $max_attempts ]]; then
+            print_error "Command failed after ${max_attempts} attempts: $*"
+            return 1
+        fi
+        echo "  SSH attempt ${attempt}/${max_attempts} failed, retrying in ${delay}s..."
+        sleep "$delay"
+        attempt=$((attempt + 1))
+        delay=$((delay + 5))
+    done
+}
+
 # Validate arguments
 if [[ $# -ne 1 ]]; then
     echo "Usage: $0 <VM_PUBLIC_IP>"
@@ -159,64 +182,56 @@ print_success "Client config created"
 echo ""
 echo "=== Setting up hyperstack VM (${VM_IP}) ==="
 
-# Check SSH connectivity
+# Wait for SSH to become available (handles transient connection resets
+# from sshd restarts due to unattended-upgrades or package installs)
 echo "Testing SSH connection..."
-if ! ssh -o ConnectTimeout=10 -o BatchMode=yes "${SSH_USER}@${VM_IP}" "echo 'SSH OK'" 2>/dev/null; then
-    print_error "Error: Cannot connect to ${SSH_USER}@${VM_IP}"
-    print_error "Please ensure SSH access is configured."
-    exit 1
-fi
+retry_ssh ssh -o ConnectTimeout=10 -o BatchMode=yes "${SSH_USER}@${VM_IP}" "echo 'SSH OK'"
 print_success "SSH connection OK"
 
 # Install WireGuard on server if not present
 echo "Installing WireGuard on hyperstack..."
-ssh "${SSH_USER}@${VM_IP}" "which wg >/dev/null 2>&1 || (sudo apt update && sudo apt install -y wireguard)"
+retry_ssh ssh "${SSH_USER}@${VM_IP}" "which wg >/dev/null 2>&1 || (sudo apt update && sudo apt install -y wireguard)"
 print_success "WireGuard installed"
 
 # Copy server config to hyperstack
 echo "Copying wg1.conf to hyperstack..."
-scp "$TMPDIR/server-wg1.conf" "${SSH_USER}@${VM_IP}:/tmp/wg1.conf"
-ssh "${SSH_USER}@${VM_IP}" "sudo mv /tmp/wg1.conf /etc/wireguard/wg1.conf && sudo chmod 600 /etc/wireguard/wg1.conf"
+retry_ssh scp "$TMPDIR/server-wg1.conf" "${SSH_USER}@${VM_IP}:/tmp/wg1.conf"
+retry_ssh ssh "${SSH_USER}@${VM_IP}" "sudo mv /tmp/wg1.conf /etc/wireguard/wg1.conf && sudo chmod 600 /etc/wireguard/wg1.conf"
 print_success "Server config installed"
 
 # Configure firewall on hyperstack
 echo "Configuring firewall (ufw) on hyperstack..."
-ssh "${SSH_USER}@${VM_IP}" << 'REMOTE_SCRIPT'
-# Ensure ufw is enabled
+retry_ssh ssh "${SSH_USER}@${VM_IP}" bash -s << 'REMOTE_SCRIPT'
 sudo ufw allow ssh comment 'Allow SSH' 2>/dev/null || true
 sudo ufw --force enable >/dev/null 2>&1 || true
-
-# Allow WireGuard port
 sudo ufw allow 56710/udp comment 'WireGuard wg1' 2>/dev/null || true
-
-# Allow Ollama access from wg1 subnet
 sudo ufw allow from 192.168.3.0/24 to any port 11434 proto tcp comment 'Ollama via wg1' 2>/dev/null || true
-
 echo "Firewall rules added"
 REMOTE_SCRIPT
 print_success "Firewall configured"
 
-# Configure Ollama to listen on all interfaces
+# Ensure Ollama listens on all interfaces (only if override not already set
+# by ollama_setup_script, which also configures OLLAMA_MODELS and other env vars)
 echo "Configuring Ollama to listen on 0.0.0.0..."
-ssh "${SSH_USER}@${VM_IP}" << 'REMOTE_SCRIPT'
-# Create override directory if it doesn't exist
-sudo mkdir -p /etc/systemd/system/ollama.service.d
-
-# Create or update override.conf to bind Ollama to all interfaces
-cat << 'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf > /dev/null
+retry_ssh ssh "${SSH_USER}@${VM_IP}" bash -s << 'REMOTE_SCRIPT'
+if [ -f /etc/systemd/system/ollama.service.d/override.conf ] && \
+   grep -q 'OLLAMA_HOST' /etc/systemd/system/ollama.service.d/override.conf; then
+  echo "Ollama override already configured, skipping"
+else
+  sudo mkdir -p /etc/systemd/system/ollama.service.d
+  cat << 'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf > /dev/null
 [Service]
 Environment="OLLAMA_HOST=0.0.0.0:11434"
 OVERRIDE
-
-# Reload systemd and restart Ollama
-sudo systemctl daemon-reload
-sudo systemctl restart ollama 2>/dev/null || echo "Note: Ollama service not running or not installed"
+  sudo systemctl daemon-reload
+  sudo systemctl restart ollama 2>/dev/null || echo "Note: Ollama service not running or not installed"
+fi
 REMOTE_SCRIPT
 print_success "Ollama configured"
 
 # Start wg1 on hyperstack
 echo "Starting wg1 on hyperstack..."
-ssh "${SSH_USER}@${VM_IP}" "sudo systemctl start wg-quick@wg1 2>/dev/null || sudo wg-quick up wg1"
+retry_ssh ssh "${SSH_USER}@${VM_IP}" "sudo systemctl start wg-quick@wg1 2>/dev/null || sudo wg-quick up wg1"
 print_success "wg1 started on hyperstack"
 
 echo ""