diff options
Diffstat (limited to 'snippets/hyperstack')
| -rw-r--r-- | snippets/hyperstack/hyperstack-vm.toml | 4 | ||||
| -rw-r--r-- | snippets/hyperstack/hyperstack_vm.rb | 198 | ||||
| -rwxr-xr-x | snippets/hyperstack/wg1-setup.sh | 71 |
3 files changed, 202 insertions, 71 deletions
diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml index da02ccf..2d83b0f 100644 --- a/snippets/hyperstack/hyperstack-vm.toml +++ b/snippets/hyperstack/hyperstack-vm.toml @@ -8,7 +8,7 @@ base_url = "https://infrahub-api.nexgencloud.com/v1" file = ".hyperstack-vm-state.json" [vm] -name_prefix = "gpt-oss" +name_prefix = "hyperstack" hostname = "hyperstack" environment_name = "snonux-ollama" @@ -47,7 +47,7 @@ models_dir = "/ephemeral/ollama/models" listen_host = "0.0.0.0:11434" gpu_overhead_mb = 2000 num_parallel = 4 -pull_models = ["qwen3-coder:30b", "gpt-oss:20b", "gpt-oss:120b", "nemotron-3-super"] +pull_models = ["qwen3-coder-next", "qwen3-coder:30b", "gpt-oss:20b", "gpt-oss:120b", "nemotron-3-super"] [wireguard] auto_setup = true diff --git a/snippets/hyperstack/hyperstack_vm.rb b/snippets/hyperstack/hyperstack_vm.rb index 9743683..ac60da9 100644 --- a/snippets/hyperstack/hyperstack_vm.rb +++ b/snippets/hyperstack/hyperstack_vm.rb @@ -43,7 +43,7 @@ module HyperstackVM 'file' => '.hyperstack-vm-state.json' }, 'vm' => { - 'name_prefix' => 'gpt-oss', + 'name_prefix' => 'hyperstack', 'hostname' => 'hyperstack', 'flavor_name' => 'n3-A100x1', 'image_name' => 'Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker', @@ -506,17 +506,28 @@ module HyperstackVM request.body = JSON.generate(payload) end - response = Net::HTTP.start( - uri.host, - uri.port, - use_ssl: uri.scheme == 'https', - open_timeout: 30, - read_timeout: 120 - ) { |http| http.request(request) } - - parse_response(response) - rescue Timeout::Error, Errno::ECONNREFUSED, SocketError, OpenSSL::SSL::SSLError => e - raise Error, "Hyperstack API request failed for #{path}: #{e.message}" + retries_left = 4 + begin + response = Net::HTTP.start( + uri.host, + uri.port, + use_ssl: uri.scheme == 'https', + open_timeout: 30, + read_timeout: 120 + ) { |http| http.request(request) } + + parse_response(response) + rescue Timeout::Error, Errno::ECONNREFUSED, Errno::ECONNRESET, + Errno::EHOSTUNREACH, Errno::ENETUNREACH, + SocketError, OpenSSL::SSL::SSLError, Net::OpenTimeout => e + raise Error, "Hyperstack API request failed for #{path}: #{e.message}" if retries_left <= 0 + + retries_left -= 1 + delay = (4 - retries_left) * 5 + warn "API request to #{path} failed (#{e.class}: #{e.message}), retrying in #{delay}s (#{retries_left} left)..." + sleep delay + retry + end end def parse_response(response) @@ -733,11 +744,12 @@ module HyperstackVM @state_store.save(state) end - if ollama_setup_needed?(state) - setup_ollama(state['public_ip']) - state['ollama_setup_at'] = Time.now.utc.iso8601 - state['ollama_models_dir'] = @config.ollama_models_dir - state['ollama_pulled_models'] = desired_ollama_models + # Install Ollama binary and configure the service (fast), but defer + # model pulls until after the WireGuard tunnel is up so that the user + # can monitor progress over the tunnel. + if @config.ollama_install_enabled? && state['ollama_installed_at'].nil? + install_ollama_service(state['public_ip']) + state['ollama_installed_at'] = Time.now.utc.iso8601 @state_store.save(state) end @@ -747,6 +759,15 @@ module HyperstackVM @state_store.save(state) end + # Pull and verify models after the tunnel is established + if ollama_setup_needed?(state) + pull_ollama_models(state['public_ip']) + state['ollama_setup_at'] = Time.now.utc.iso8601 + state['ollama_models_dir'] = @config.ollama_models_dir + state['ollama_pulled_models'] = desired_ollama_models + @state_store.save(state) + end + vm = @client.get_vm(vm_id) state['security_rules'] = Array(vm['security_rules']).map { |rule| normalize_rule(rule) } state['status'] = vm['status'] @@ -833,6 +854,9 @@ module HyperstackVM end def wait_for_ssh(host) + # Remove stale host key for this IP — VMs frequently reuse IPs after + # delete/recreate, causing StrictHostKeyChecking to reject the new key + remove_stale_host_key(host) info "Waiting for SSH on #{host}:#{@config.ssh_port}..." with_polling("SSH on #{host}:#{@config.ssh_port}") do next nil unless tcp_open?(host, @config.ssh_port) @@ -859,22 +883,52 @@ module HyperstackVM def bootstrap_guest(host) info 'Bootstrapping Ubuntu guest over SSH...' - stdout, stderr, status = run_ssh_command(host, guest_bootstrap_script) - raise Error, "Guest bootstrap failed: #{stderr.strip.empty? ? stdout : stderr}" unless status.success? + retries = 3 + retries.times do |attempt| + stdout, stderr, status = run_ssh_command(host, guest_bootstrap_script) + return if status.success? + + msg = stderr.strip.empty? ? stdout : stderr + raise Error, "Guest bootstrap failed after #{retries} attempts: #{msg}" if attempt == retries - 1 + + warn "Bootstrap attempt #{attempt + 1}/#{retries} failed (#{msg.lines.last&.strip}), retrying in 15s..." + sleep 15 + end end def ollama_setup_needed?(state) return false unless @config.ollama_install_enabled? + # Re-run setup if state has no record, or if desired models changed + return true if state['ollama_setup_at'].nil? - state['ollama_setup_at'].nil? || model_list_signature(desired_ollama_models) != model_list_signature(state['ollama_pulled_models']) + model_list_signature(desired_ollama_models) != model_list_signature(state['ollama_pulled_models']) end - def setup_ollama(host) + def install_ollama_service(host) info "Installing and configuring Ollama on #{host}..." - output, status = run_ssh_command_streaming(host, ollama_setup_script) - return if status.success? + output, status = run_ssh_command_streaming(host, ollama_install_script) + raise Error, "Ollama install failed: #{output.strip}" unless status.success? + end + + def pull_ollama_models(host) + info "Pulling Ollama models on #{host}..." + output, status = run_ssh_command_streaming(host, ollama_pull_script) + raise Error, "Ollama model pull failed: #{output.strip}" unless status.success? - raise Error, "Ollama setup failed: #{output.strip}" + # Verify all models are actually present on the remote (belt-and-suspenders + # check in case ollama pull returned 0 without actually pulling the model) + verify_remote_models(host) + end + + def verify_remote_models(host) + stdout, _stderr, status = run_ssh_command(host, 'ollama list') + return unless status.success? + + remote_models = stdout.lines.drop(1).map { |l| l.split.first }.compact + missing = desired_ollama_models.reject { |m| remote_models.any? { |r| r.start_with?(m) } } + return if missing.empty? + + raise Error, "Models missing after setup: #{missing.join(', ')}. Remote has: #{remote_models.join(', ')}" end def wireguard_setup_needed?(state) @@ -889,21 +943,31 @@ module HyperstackVM def run_wireguard_setup(host) validate_wireguard_setup_script! - info "Running WireGuard auto-setup via #{@config.wireguard_setup_script} #{host}..." + retries = 3 + retries.times do |attempt| + info "Running WireGuard auto-setup via #{@config.wireguard_setup_script} #{host}..." + + status = run_wireguard_script(host) + return if status.success? + if attempt == retries - 1 + raise Error, "WireGuard setup failed after #{retries} attempts (exit #{status.exitstatus})." + end + + delay = (attempt + 1) * 15 + warn "WireGuard setup attempt #{attempt + 1}/#{retries} failed (exit #{status.exitstatus}), retrying in #{delay}s..." + sleep delay + end + end + + def run_wireguard_script(host) Open3.popen2e('bash', @config.wireguard_setup_script, host) do |stdin, output, wait_thr| stdin.sync = true stdin.puts stdin.close - output.each do |line| - @out.print(line) - end - - status = wait_thr.value - next if status.success? - - raise Error, "WireGuard setup script failed with exit status #{status.exitstatus}." + output.each { |line| @out.print(line) } + wait_thr.value end end @@ -940,6 +1004,14 @@ module HyperstackVM raise Error, "Configured WireGuard settings do not match #{script_path}: #{mismatches.join('; ')}" end + def remove_stale_host_key(host) + system('ssh-keygen', '-R', host, out: File::NULL, err: File::NULL) + # Also remove bracketed form for non-standard ports + if @config.ssh_port != 22 + system('ssh-keygen', '-R', "[#{host}]:#{@config.ssh_port}", out: File::NULL, err: File::NULL) + end + end + def failed_vm?(vm) [vm['status'], vm['vm_state'], vm['power_state']].compact.any? do |value| value.to_s.downcase.match?(/error|failed|deleted|shelved/) @@ -955,7 +1027,7 @@ module HyperstackVM sock.close true end - rescue Errno::ECONNREFUSED, Errno::ETIMEDOUT, SocketError, IOError + rescue Errno::ECONNREFUSED, Errno::ETIMEDOUT, Errno::EHOSTUNREACH, Errno::ENETUNREACH, SocketError, IOError false end @@ -1107,6 +1179,16 @@ module HyperstackVM script = [] script << 'set -euo pipefail' + # Wait for any running unattended-upgrades or apt locks to release + # before attempting package operations (transient lock on fresh VMs) + script << 'echo "Waiting for apt locks to clear..."' + script << 'for i in $(seq 1 30); do' + script << ' if ! fuser /var/lib/dpkg/lock-frontend /var/lib/apt/lists/lock /var/cache/apt/archives/lock >/dev/null 2>&1; then break; fi' + script << ' echo " apt lock held, waiting ($i/30)..."; sleep 10' + script << 'done' + script << 'sudo systemctl stop unattended-upgrades.service 2>/dev/null || true' + script << 'sudo systemctl disable unattended-upgrades.service 2>/dev/null || true' + if @config.install_wireguard? script << 'which wg >/dev/null 2>&1 || (sudo apt-get update && sudo apt-get install -y wireguard)' end @@ -1119,14 +1201,18 @@ module HyperstackVM end if @config.configure_ollama_host? + # Only write a minimal OLLAMA_HOST override if no override exists yet; + # ollama_setup_script writes the full override (OLLAMA_MODELS, GPU_OVERHEAD, etc.) script << "if systemctl list-unit-files | grep -q '^ollama.service'; then" - script << ' sudo mkdir -p /etc/systemd/system/ollama.service.d' - script << " cat <<'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf >/dev/null" + script << ' if [ ! -f /etc/systemd/system/ollama.service.d/override.conf ]; then' + script << ' sudo mkdir -p /etc/systemd/system/ollama.service.d' + script << " cat <<'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf >/dev/null" script << '[Service]' script << "Environment=\"OLLAMA_HOST=0.0.0.0:#{@config.ollama_port}\"" script << 'OVERRIDE' - script << ' sudo systemctl daemon-reload' - script << ' sudo systemctl restart ollama || true' + script << ' sudo systemctl daemon-reload' + script << ' sudo systemctl restart ollama || true' + script << ' fi' script << 'fi' end @@ -1151,10 +1237,13 @@ module HyperstackVM normalized_model_list(models).sort end - def ollama_setup_script + # Installs the Ollama binary, configures the systemd override (models dir, + # listen host, GPU overhead, parallelism), and starts the service. Model + # pulls are handled separately by ollama_pull_script so that the WireGuard + # tunnel can be established first. + def ollama_install_script models_dir = @config.ollama_models_dir listen_host = @config.ollama_listen_host - model_pulls = desired_ollama_models script = [] script << 'set -euo pipefail' @@ -1178,8 +1267,35 @@ module HyperstackVM script << 'sudo systemctl restart ollama' script << 'sleep 3' script << 'systemctl is-active --quiet ollama' + script << 'echo ollama-install-ok' + script.join("\n") + end + + # Pulls each configured model with retry and per-model + final verification. + # Run after WireGuard is up so the user can monitor progress over the tunnel. + def ollama_pull_script + models_dir = @config.ollama_models_dir + model_pulls = desired_ollama_models + + script = [] + script << 'set -euo pipefail' + # Pull each model with retry (transient network failures) and verify + # it is actually present afterwards + model_pulls.each do |model| + escaped = Shellwords.escape(model) + script << "echo \"Pulling model #{model}...\"" + script << "for attempt in 1 2 3; do" + script << " if ollama pull #{escaped}; then break; fi" + script << " if [ \"$attempt\" -eq 3 ]; then echo \"FATAL: failed to pull #{model} after 3 attempts\"; exit 1; fi" + script << " echo \" pull attempt $attempt failed, retrying in 15s...\"; sleep 15" + script << "done" + script << "ollama show #{escaped} --modelfile >/dev/null 2>&1 || { echo \"FATAL: model #{model} not found after pull\"; exit 1; }" + end + # Final verification: ensure all expected models are listed + script << 'echo "Verifying all models are present..."' model_pulls.each do |model| - script << "ollama pull #{Shellwords.escape(model)}" + escaped = Shellwords.escape(model) + script << "ollama show #{escaped} --modelfile >/dev/null 2>&1 || { echo \"FATAL: model #{model} missing in final check\"; exit 1; }" end script << "echo ollama-models-dir=#{models_dir}" script << 'echo ollama-ok' diff --git a/snippets/hyperstack/wg1-setup.sh b/snippets/hyperstack/wg1-setup.sh index a57c986..d057fb8 100755 --- a/snippets/hyperstack/wg1-setup.sh +++ b/snippets/hyperstack/wg1-setup.sh @@ -73,6 +73,29 @@ print_error() { echo -e "${RED}$1${NC}" } +# Retry wrapper for SSH/SCP commands that may fail due to transient +# connection resets (e.g. sshd restart from unattended-upgrades). +# Usage: retry_ssh ssh user@host "command" +# retry_ssh scp file user@host:/path +retry_ssh() { + local max_attempts=5 + local attempt=1 + local delay=10 + while true; do + if "$@"; then + return 0 + fi + if [[ $attempt -ge $max_attempts ]]; then + print_error "Command failed after ${max_attempts} attempts: $*" + return 1 + fi + echo " SSH attempt ${attempt}/${max_attempts} failed, retrying in ${delay}s..." + sleep "$delay" + attempt=$((attempt + 1)) + delay=$((delay + 5)) + done +} + # Validate arguments if [[ $# -ne 1 ]]; then echo "Usage: $0 <VM_PUBLIC_IP>" @@ -159,64 +182,56 @@ print_success "Client config created" echo "" echo "=== Setting up hyperstack VM (${VM_IP}) ===" -# Check SSH connectivity +# Wait for SSH to become available (handles transient connection resets +# from sshd restarts due to unattended-upgrades or package installs) echo "Testing SSH connection..." -if ! ssh -o ConnectTimeout=10 -o BatchMode=yes "${SSH_USER}@${VM_IP}" "echo 'SSH OK'" 2>/dev/null; then - print_error "Error: Cannot connect to ${SSH_USER}@${VM_IP}" - print_error "Please ensure SSH access is configured." - exit 1 -fi +retry_ssh ssh -o ConnectTimeout=10 -o BatchMode=yes "${SSH_USER}@${VM_IP}" "echo 'SSH OK'" print_success "SSH connection OK" # Install WireGuard on server if not present echo "Installing WireGuard on hyperstack..." -ssh "${SSH_USER}@${VM_IP}" "which wg >/dev/null 2>&1 || (sudo apt update && sudo apt install -y wireguard)" +retry_ssh ssh "${SSH_USER}@${VM_IP}" "which wg >/dev/null 2>&1 || (sudo apt update && sudo apt install -y wireguard)" print_success "WireGuard installed" # Copy server config to hyperstack echo "Copying wg1.conf to hyperstack..." -scp "$TMPDIR/server-wg1.conf" "${SSH_USER}@${VM_IP}:/tmp/wg1.conf" -ssh "${SSH_USER}@${VM_IP}" "sudo mv /tmp/wg1.conf /etc/wireguard/wg1.conf && sudo chmod 600 /etc/wireguard/wg1.conf" +retry_ssh scp "$TMPDIR/server-wg1.conf" "${SSH_USER}@${VM_IP}:/tmp/wg1.conf" +retry_ssh ssh "${SSH_USER}@${VM_IP}" "sudo mv /tmp/wg1.conf /etc/wireguard/wg1.conf && sudo chmod 600 /etc/wireguard/wg1.conf" print_success "Server config installed" # Configure firewall on hyperstack echo "Configuring firewall (ufw) on hyperstack..." -ssh "${SSH_USER}@${VM_IP}" << 'REMOTE_SCRIPT' -# Ensure ufw is enabled +retry_ssh ssh "${SSH_USER}@${VM_IP}" bash -s << 'REMOTE_SCRIPT' sudo ufw allow ssh comment 'Allow SSH' 2>/dev/null || true sudo ufw --force enable >/dev/null 2>&1 || true - -# Allow WireGuard port sudo ufw allow 56710/udp comment 'WireGuard wg1' 2>/dev/null || true - -# Allow Ollama access from wg1 subnet sudo ufw allow from 192.168.3.0/24 to any port 11434 proto tcp comment 'Ollama via wg1' 2>/dev/null || true - echo "Firewall rules added" REMOTE_SCRIPT print_success "Firewall configured" -# Configure Ollama to listen on all interfaces +# Ensure Ollama listens on all interfaces (only if override not already set +# by ollama_setup_script, which also configures OLLAMA_MODELS and other env vars) echo "Configuring Ollama to listen on 0.0.0.0..." -ssh "${SSH_USER}@${VM_IP}" << 'REMOTE_SCRIPT' -# Create override directory if it doesn't exist -sudo mkdir -p /etc/systemd/system/ollama.service.d - -# Create or update override.conf to bind Ollama to all interfaces -cat << 'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf > /dev/null +retry_ssh ssh "${SSH_USER}@${VM_IP}" bash -s << 'REMOTE_SCRIPT' +if [ -f /etc/systemd/system/ollama.service.d/override.conf ] && \ + grep -q 'OLLAMA_HOST' /etc/systemd/system/ollama.service.d/override.conf; then + echo "Ollama override already configured, skipping" +else + sudo mkdir -p /etc/systemd/system/ollama.service.d + cat << 'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf > /dev/null [Service] Environment="OLLAMA_HOST=0.0.0.0:11434" OVERRIDE - -# Reload systemd and restart Ollama -sudo systemctl daemon-reload -sudo systemctl restart ollama 2>/dev/null || echo "Note: Ollama service not running or not installed" + sudo systemctl daemon-reload + sudo systemctl restart ollama 2>/dev/null || echo "Note: Ollama service not running or not installed" +fi REMOTE_SCRIPT print_success "Ollama configured" # Start wg1 on hyperstack echo "Starting wg1 on hyperstack..." -ssh "${SSH_USER}@${VM_IP}" "sudo systemctl start wg-quick@wg1 2>/dev/null || sudo wg-quick up wg1" +retry_ssh ssh "${SSH_USER}@${VM_IP}" "sudo systemctl start wg-quick@wg1 2>/dev/null || sudo wg-quick up wg1" print_success "wg1 started on hyperstack" echo "" |
