diff options
| author | Paul Buetow <paul@buetow.org> | 2026-03-20 12:50:32 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-03-20 12:50:32 +0200 |
| commit | 71a8bf9dc0dbcfc9cebc051ae836c45906c42164 (patch) | |
| tree | 58e778a8c6c254a8c3082beaa656b3d79e73277c | |
| parent | 678fd437301c715b662aaf3035c87f940f8501b5 (diff) | |
task 301: extract provisioning collaborators
| -rwxr-xr-x | snippets/hyperstack/hyperstack.rb | 803 |
1 files changed, 398 insertions, 405 deletions
diff --git a/snippets/hyperstack/hyperstack.rb b/snippets/hyperstack/hyperstack.rb index c164f6e..cbf5fe1 100755 --- a/snippets/hyperstack/hyperstack.rb +++ b/snippets/hyperstack/hyperstack.rb @@ -1052,6 +1052,384 @@ module HyperstackVM end end + class ProvisioningScripts + def initialize(config:) + @config = config + end + + def guest_bootstrap_script + script = [] + script << 'set -euo pipefail' + + # Wait for any running unattended-upgrades or apt locks to release + # before attempting package operations (transient lock on fresh VMs) + script << 'echo "Waiting for apt locks to clear..."' + script << 'for i in $(seq 1 30); do' + script << ' if ! fuser /var/lib/dpkg/lock-frontend /var/lib/apt/lists/lock /var/cache/apt/archives/lock >/dev/null 2>&1; then break; fi' + script << ' echo " apt lock held, waiting ($i/30)..."; sleep 10' + script << 'done' + script << 'sudo systemctl stop unattended-upgrades.service 2>/dev/null || true' + script << 'sudo systemctl disable unattended-upgrades.service 2>/dev/null || true' + + if @config.install_wireguard? + script << 'which wg >/dev/null 2>&1 || (sudo apt-get update && sudo apt-get install -y wireguard)' + end + + if @config.configure_ufw? + script << "sudo ufw allow #{@config.ssh_port}/tcp comment 'Allow SSH' >/dev/null 2>&1 || true" + script << 'sudo ufw --force enable >/dev/null 2>&1 || true' + script << "sudo ufw allow #{@config.wireguard_udp_port}/udp comment 'WireGuard #{@config.local_interface_name}' >/dev/null 2>&1 || true" + # Port 11434 is shared by Ollama and vLLM; open for both regardless of which is installed. + script << "sudo ufw allow from #{Shellwords.escape(@config.wireguard_subnet)} to any port #{@config.ollama_port} proto tcp comment 'Inference API (Ollama/vLLM) via #{@config.local_interface_name}' >/dev/null 2>&1 || true" + # Port 4000: LiteLLM proxy (Anthropic API -> vLLM); open alongside the inference port. 
+ script << "sudo ufw allow from #{Shellwords.escape(@config.wireguard_subnet)} to any port #{@config.litellm_port} proto tcp comment 'LiteLLM proxy via #{@config.local_interface_name}' >/dev/null 2>&1 || true" + end + + if @config.configure_ollama_host? + # Only write a minimal OLLAMA_HOST override if no override exists yet; + # ollama_setup_script writes the full override (OLLAMA_MODELS, GPU_OVERHEAD, etc.) + script << "if systemctl list-unit-files | grep -q '^ollama.service'; then" + script << ' if [ ! -f /etc/systemd/system/ollama.service.d/override.conf ]; then' + script << ' sudo mkdir -p /etc/systemd/system/ollama.service.d' + script << " cat <<'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf >/dev/null" + script << '[Service]' + script << "Environment=\"OLLAMA_HOST=0.0.0.0:#{@config.ollama_port}\"" + script << 'OVERRIDE' + script << ' sudo systemctl daemon-reload' + script << ' sudo systemctl restart ollama || true' + script << ' fi' + script << 'fi' + end + + script << 'echo bootstrap-ok' + script.join("\n") + end + + def desired_ollama_models + normalized_model_list(@config.ollama_pull_models) + end + + def model_list_signature(models) + normalized_model_list(models).sort + end + + def ollama_install_script + models_dir = @config.ollama_models_dir + listen_host = @config.ollama_listen_host + + script = [] + script << 'set -euo pipefail' + script << 'sudo pkill -f unattended-upgrade >/dev/null 2>&1 || true' + script << 'if ! 
command -v ollama >/dev/null 2>&1; then curl -fsSL https://ollama.ai/install.sh | sh; fi' + if models_dir.start_with?('/ephemeral') + script << "mountpoint -q /ephemeral || { echo 'Expected /ephemeral mount is missing'; exit 1; }" + end + script << "sudo mkdir -p #{Shellwords.escape(models_dir)}" + script << "sudo chown -R ollama:ollama #{Shellwords.escape(File.dirname(models_dir))}" + script << 'sudo mkdir -p /etc/systemd/system/ollama.service.d' + script << "cat <<'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf >/dev/null" + script << '[Service]' + script << "Environment=\"OLLAMA_MODELS=#{models_dir}\"" + script << "Environment=\"OLLAMA_GPU_OVERHEAD=#{@config.ollama_gpu_overhead_mb}\"" + script << "Environment=\"OLLAMA_NUM_PARALLEL=#{@config.ollama_num_parallel}\"" + script << "Environment=\"OLLAMA_CONTEXT_LENGTH=#{@config.ollama_context_length}\"" + script << "Environment=\"OLLAMA_HOST=#{listen_host}\"" + script << 'OVERRIDE' + script << 'sudo systemctl daemon-reload' + script << 'sudo systemctl enable --now ollama' + script << 'sudo systemctl restart ollama' + script << 'sleep 3' + script << 'systemctl is-active --quiet ollama' + script << 'echo ollama-install-ok' + script.join("\n") + end + + def ollama_pull_script(models: desired_ollama_models) + models_dir = @config.ollama_models_dir + + script = [] + script << 'set -euo pipefail' + # Pull each model with retry (transient network failures) and verify + # it is actually present afterwards + models.each do |model| + escaped = Shellwords.escape(model) + script << "echo \"Pulling model #{model}...\"" + script << 'for attempt in 1 2 3; do' + script << " if ollama pull #{escaped}; then break; fi" + script << " if [ \"$attempt\" -eq 3 ]; then echo \"FATAL: failed to pull #{model} after 3 attempts\"; exit 1; fi" + script << ' echo " pull attempt $attempt failed, retrying in 15s..."; sleep 15' + script << 'done' + script << "ollama show #{escaped} --modelfile >/dev/null 2>&1 || { echo 
\"FATAL: model #{model} not found after pull\"; exit 1; }" + end + # Final verification: ensure all expected models are listed + script << 'echo "Verifying all models are present..."' + models.each do |model| + escaped = Shellwords.escape(model) + script << "ollama show #{escaped} --modelfile >/dev/null 2>&1 || { echo \"FATAL: model #{model} missing in final check\"; exit 1; }" + end + script << "echo ollama-models-dir=#{models_dir}" + script << 'echo ollama-ok' + script.join("\n") + end + + def vllm_stop_script(container_name) + script = [] + script << 'set -euo pipefail' + script << "docker stop #{Shellwords.escape(container_name)} 2>/dev/null || true" + script << "docker rm #{Shellwords.escape(container_name)} 2>/dev/null || true" + script << 'echo vllm-stopped' + script.join("\n") + end + + def vllm_install_script(preset_config: nil, pull_image: true) + cfg = preset_config || {} + model = cfg['model'] || @config.vllm_model + cache_dir = @config.vllm_hug_cache_dir + compile_cache = @config.vllm_compile_cache_dir + container = cfg['container_name'] || @config.vllm_container_name + max_len = Integer(cfg['max_model_len'] || @config.vllm_max_model_len) + gpu_util = Float(cfg['gpu_memory_utilization'] || @config.vllm_gpu_memory_utilization) + tp_size = Integer(cfg['tensor_parallel_size'] || @config.vllm_tensor_parallel_size) + parser = cfg['tool_call_parser'] + # parser is nil only when preset explicitly omits the key and config has no default; + # empty string means "disable tool calling" (e.g. gpt-oss reasoning models). + parser = @config.vllm_tool_call_parser if parser.nil? + # Fall back to the top-level [vllm] config values when no preset is in use. + # This allows setting trust_remote_code / extra_vllm_args in the default [vllm] block + # without requiring a --model preset flag at create time. + trust_remote = cfg.key?('trust_remote_code') ? 
cfg['trust_remote_code'] : @config.vllm_trust_remote_code + port = @config.ollama_port + + docker_args = [ + 'docker run -d', + '--gpus all', '--ipc=host', '--network host', + "--name #{Shellwords.escape(container)}", + '--restart always', + "-v #{Shellwords.escape(cache_dir)}:/root/.cache/huggingface", + # Mount torch.compile cache so CUDA kernel compilation is skipped on warm restarts. + # Without this, every container restart recompiles (~30-60 s extra). + "-v #{Shellwords.escape(compile_cache)}:/root/.cache/vllm", + 'vllm/vllm-openai:latest', + "--model #{Shellwords.escape(model)}", + "--tensor-parallel-size #{tp_size}", + '--enable-prefix-caching', + "--gpu-memory-utilization #{gpu_util}", + "--max-model-len #{max_len}", + '--host 0.0.0.0', + "--port #{port}" + ] + # Tool calling is optional: empty/nil parser disables it. + unless parser.nil? || parser.empty? + docker_args << '--enable-auto-tool-choice' + docker_args << "--tool-call-parser #{Shellwords.escape(parser)}" + end + docker_args << '--trust-remote-code' if trust_remote + extra_args = cfg.key?('extra_vllm_args') ? 
Array(cfg['extra_vllm_args']) : @config.vllm_extra_args + extra_args.each { |arg| docker_args << arg } + docker_run = docker_args.join(' ') + + script = [] + script << 'set -euo pipefail' + script << "sudo mkdir -p #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}" + script << "sudo chmod -R 0777 #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}" + script << "docker stop #{Shellwords.escape(container)} 2>/dev/null || true" + script << "docker rm #{Shellwords.escape(container)} 2>/dev/null || true" + script << 'docker pull vllm/vllm-openai:latest' if pull_image + script << docker_run + script << 'echo "Waiting for vLLM to become ready (up to 10 min for first model download)..."' + script << 'for i in $(seq 1 120); do' + script << " if curl -sf http://localhost:#{port}/v1/models >/dev/null 2>&1; then echo vllm-ready; break; fi" + script << " state=$(docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null || echo unknown)" + script << ' echo " vLLM not ready yet ($i/120, container=$state)..."' + script << ' sleep 5' + script << 'done' + script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 10 minutes'; exit 1; }" + script << 'echo vllm-install-ok' + script.join("\n") + end + + def litellm_install_script(model_override: nil) + port = @config.litellm_port + model = model_override || @config.vllm_model + + script = [] + script << 'set -euo pipefail' + script << 'sudo apt-get install -y python3.12-venv' + script << 'sudo mkdir -p /ephemeral/litellm-env' + script << 'sudo chown ubuntu:ubuntu /ephemeral/litellm-env' + script << 'python3 -m venv /ephemeral/litellm-env' + script << '/ephemeral/litellm-env/bin/pip install --quiet "litellm[proxy]"' + script << "sudo tee /ephemeral/litellm-config.yaml > /dev/null << 'LITELLM_YAML'" + script << 'model_list:' + script.concat(litellm_model_entries(model)) + script << '' + script << 'litellm_settings:' + 
script << ' drop_params: true' + script << '' + script << 'general_settings:' + script << " master_key: \"#{@config.litellm_master_key}\"" + script << 'LITELLM_YAML' + script << "sudo tee /etc/systemd/system/litellm.service > /dev/null << 'LITELLM_UNIT'" + script << '[Unit]' + script << 'Description=LiteLLM Proxy' + script << 'After=network.target docker.service' + script << 'Requires=docker.service' + script << '' + script << '[Service]' + script << 'Type=simple' + script << 'User=ubuntu' + script << "ExecStart=/ephemeral/litellm-env/bin/litellm --config /ephemeral/litellm-config.yaml --host 0.0.0.0 --port #{port}" + script << 'Restart=always' + script << 'RestartSec=5' + script << '' + script << '[Install]' + script << 'WantedBy=multi-user.target' + script << 'LITELLM_UNIT' + script << 'sudo systemctl daemon-reload' + script << 'sudo systemctl enable --now litellm' + script << 'sleep 5' + script << 'systemctl is-active --quiet litellm' + script << 'echo litellm-install-ok' + script.join("\n") + end + + def litellm_reload_script(model) + script = [] + script << 'set -euo pipefail' + script << "sudo tee /ephemeral/litellm-config.yaml > /dev/null << 'LITELLM_YAML'" + script << 'model_list:' + script.concat(litellm_model_entries(model)) + script << '' + script << 'litellm_settings:' + script << ' drop_params: true' + script << '' + script << 'general_settings:' + script << " master_key: \"#{@config.litellm_master_key}\"" + script << 'LITELLM_YAML' + script << 'sudo systemctl restart litellm' + script << 'sleep 3' + script << 'systemctl is-active --quiet litellm' + script << 'echo litellm-reload-ok' + script.join("\n") + end + + private + + def normalized_model_list(models) + Array(models).each_with_object([]) do |model, ordered| + normalized = model.to_s.strip + next if normalized.empty? 
|| ordered.include?(normalized) + + ordered << normalized + end + end + + def litellm_model_entries(model) + vllm_port = @config.ollama_port + + @config.litellm_claude_model_names.flat_map do |name| + [ + " - model_name: \"#{name}\"", + ' litellm_params:', + " model: \"hosted_vllm/#{model}\"", + " api_base: \"http://localhost:#{vllm_port}/v1\"", + ' api_key: "EMPTY"' + ] + end + end + end + + class RemoteProvisioner + def initialize(config:, scripts:, out:, ssh_command_runner:, ssh_stream_runner:) + @config = config + @scripts = scripts + @out = out + @ssh_command_runner = ssh_command_runner + @ssh_stream_runner = ssh_stream_runner + end + + def bootstrap_guest(host) + info 'Bootstrapping Ubuntu guest over SSH...' + retries = 3 + retries.times do |attempt| + stdout, stderr, status = @ssh_command_runner.call(host, @scripts.guest_bootstrap_script) + return if status.success? + + msg = stderr.strip.empty? ? stdout : stderr + raise Error, "Guest bootstrap failed after #{retries} attempts: #{msg}" if attempt == retries - 1 + + warn "Bootstrap attempt #{attempt + 1}/#{retries} failed (#{msg.lines.last&.strip}), retrying in 15s..." + sleep 15 + end + end + + def install_ollama_service(host) + info "Installing and configuring Ollama on #{host}..." + output, status = @ssh_stream_runner.call(host, @scripts.ollama_install_script) + raise Error, "Ollama install failed: #{output.strip}" unless status.success? + end + + def pull_ollama_models(host) + info "Pulling Ollama models on #{host}..." + output, status = @ssh_stream_runner.call(host, @scripts.ollama_pull_script) + raise Error, "Ollama model pull failed: #{output.strip}" unless status.success? + + verify_remote_models(host) + end + + def stop_vllm_container(host, container_name) + info "Stopping old vLLM container #{container_name}..." 
+ output, status = @ssh_stream_runner.call(host, @scripts.vllm_stop_script(container_name)) + raise Error, "Failed to stop container #{container_name}: #{output.strip}" unless status.success? + end + + def install_vllm(host, preset_config: nil, pull_image: true) + info "Setting up vLLM Docker container on #{host}..." + output, status = @ssh_stream_runner.call(host, @scripts.vllm_install_script(preset_config: preset_config, + pull_image: pull_image)) + raise Error, "vLLM install failed: #{output.strip}" unless status.success? + end + + def install_litellm(host, model:) + info "Setting up LiteLLM Anthropic-API proxy on #{host}..." + output, status = @ssh_stream_runner.call(host, @scripts.litellm_install_script(model_override: model)) + raise Error, "LiteLLM install failed: #{output.strip}" unless status.success? + end + + def reload_litellm(host, model) + info "Reloading LiteLLM proxy config for #{model}..." + output, status = @ssh_stream_runner.call(host, @scripts.litellm_reload_script(model)) + raise Error, "LiteLLM reload failed: #{output.strip}" unless status.success? + end + + def setup_vllm_stack(host, preset_config: nil) + install_vllm(host, preset_config: preset_config) + model = preset_config&.dig('model') || @config.vllm_model + install_litellm(host, model: model) + end + + private + + def verify_remote_models(host) + stdout, _stderr, status = @ssh_command_runner.call(host, 'ollama list') + return unless status.success? + + remote_models = stdout.lines.drop(1).map { |line| line.split.first }.compact + missing = @scripts.desired_ollama_models.reject { |model| remote_models.any? { |remote| remote.start_with?(model) } } + return if missing.empty? + + raise Error, "Models missing after setup: #{missing.join(', ')}. 
Remote has: #{remote_models.join(', ')}" + end + + def info(message) + @out.puts(message) + end + + def warn(message) + @out.puts("WARNING: #{message}") + end + end + class Manager # wg_setup_pre: optional Proc called just before this VM's WireGuard setup step runs. # Used by create-both to block VM2 until VM1 has written the base wg1.conf. @@ -1064,6 +1442,10 @@ module HyperstackVM @state_store = state_store @local_wireguard = local_wireguard @out = out + @scripts = ProvisioningScripts.new(config: config) + @provisioner = RemoteProvisioner.new(config: config, scripts: @scripts, out: out, + ssh_command_runner: method(:run_ssh_command), + ssh_stream_runner: method(:run_ssh_command_streaming)) @wg_setup_pre = wg_setup_pre @wg_setup_post = wg_setup_post end @@ -1243,22 +1625,17 @@ module HyperstackVM # Stop the old container only when it has a different name from the new one. if old_container != new_container - info "Stopping old vLLM container #{old_container}..." - output, status = run_ssh_command_streaming(host, vllm_stop_script(old_container)) - raise Error, "Failed to stop container #{old_container}: #{output.strip}" unless status.success? + @provisioner.stop_vllm_container(host, old_container) end info "Starting vLLM with preset '#{preset_name}' (#{preset['model']})..." # Skip docker pull: image is already present; pulling on every switch risks a # surprise multi-GB download if the upstream image was updated. - output, status = run_ssh_command_streaming(host, vllm_install_script(preset_config: preset, pull_image: false)) - raise Error, "vLLM install failed: #{output.strip}" unless status.success? + @provisioner.install_vllm(host, preset_config: preset, pull_image: false) # Hot-reload LiteLLM: rewrite config for the new model and restart the service. # Skips venv/apt install since those are already in place. - info "Reloading LiteLLM proxy config for #{preset['model']}..." 
- output, status = run_ssh_command_streaming(host, litellm_reload_script(preset['model'])) - raise Error, "LiteLLM reload failed: #{output.strip}" unless status.success? + @provisioner.reload_litellm(host, preset['model']) state['vllm_model'] = preset['model'] state['vllm_container_name'] = new_container @@ -1318,7 +1695,7 @@ module HyperstackVM wait_for_ssh(state['public_ip']) if @config.guest_bootstrap_enabled? && state['bootstrapped_at'].nil? - bootstrap_guest(state['public_ip']) + @provisioner.bootstrap_guest(state['public_ip']) state['bootstrapped_at'] = Time.now.utc.iso8601 @state_store.save(state) end @@ -1327,7 +1704,7 @@ module HyperstackVM # model pulls until after the WireGuard tunnel is up so that the user # can monitor progress over the tunnel. if effective_ollama? && state['ollama_installed_at'].nil? - install_ollama_service(state['public_ip']) + @provisioner.install_ollama_service(state['public_ip']) state['ollama_installed_at'] = Time.now.utc.iso8601 @state_store.save(state) end @@ -1347,10 +1724,10 @@ module HyperstackVM # Pull and verify Ollama models after the tunnel is established. if ollama_setup_needed?(state) - pull_ollama_models(state['public_ip']) + @provisioner.pull_ollama_models(state['public_ip']) state['ollama_setup_at'] = Time.now.utc.iso8601 state['ollama_models_dir'] = @config.ollama_models_dir - state['ollama_pulled_models'] = desired_ollama_models + state['ollama_pulled_models'] = @scripts.desired_ollama_models @state_store.save(state) end @@ -1358,7 +1735,7 @@ module HyperstackVM # the tunnel is up so that model-download progress is visible locally. 
if vllm_setup_needed?(state) preset_cfg = effective_vllm_preset_config - setup_vllm_stack(state['public_ip'], preset_config: preset_cfg) + @provisioner.setup_vllm_stack(state['public_ip'], preset_config: preset_cfg) state['vllm_setup_at'] = Time.now.utc.iso8601 state['vllm_model'] = preset_cfg&.dig('model') || @config.vllm_model state['vllm_container_name'] = preset_cfg&.dig('container_name') || @config.vllm_container_name @@ -1483,54 +1860,13 @@ module HyperstackVM end end - def bootstrap_guest(host) - info 'Bootstrapping Ubuntu guest over SSH...' - retries = 3 - retries.times do |attempt| - stdout, stderr, status = run_ssh_command(host, guest_bootstrap_script) - return if status.success? - - msg = stderr.strip.empty? ? stdout : stderr - raise Error, "Guest bootstrap failed after #{retries} attempts: #{msg}" if attempt == retries - 1 - - warn "Bootstrap attempt #{attempt + 1}/#{retries} failed (#{msg.lines.last&.strip}), retrying in 15s..." - sleep 15 - end - end - def ollama_setup_needed?(state) return false unless effective_ollama? # Re-run setup if state has no record, or if desired models changed return true if state['ollama_setup_at'].nil? - model_list_signature(desired_ollama_models) != model_list_signature(state['ollama_pulled_models']) - end - - def install_ollama_service(host) - info "Installing and configuring Ollama on #{host}..." - output, status = run_ssh_command_streaming(host, ollama_install_script) - raise Error, "Ollama install failed: #{output.strip}" unless status.success? - end - - def pull_ollama_models(host) - info "Pulling Ollama models on #{host}..." - output, status = run_ssh_command_streaming(host, ollama_pull_script) - raise Error, "Ollama model pull failed: #{output.strip}" unless status.success? 
- - # Verify all models are actually present on the remote (belt-and-suspenders - # check in case ollama pull returned 0 without actually pulling the model) - verify_remote_models(host) - end - - def verify_remote_models(host) - stdout, _stderr, status = run_ssh_command(host, 'ollama list') - return unless status.success? - - remote_models = stdout.lines.drop(1).map { |l| l.split.first }.compact - missing = desired_ollama_models.reject { |m| remote_models.any? { |r| r.start_with?(m) } } - return if missing.empty? - - raise Error, "Models missing after setup: #{missing.join(', ')}. Remote has: #{remote_models.join(', ')}" + @scripts.model_list_signature(@scripts.desired_ollama_models) != + @scripts.model_list_signature(state['ollama_pulled_models']) end def wireguard_setup_needed?(state) @@ -1844,13 +2180,14 @@ module HyperstackVM @out.puts(JSON.pretty_generate(payload)) if @config.guest_bootstrap_enabled? info 'Guest bootstrap script:' - @out.puts(guest_bootstrap_script) + @out.puts(@scripts.guest_bootstrap_script) else info 'Guest bootstrap is disabled in config.' end if effective_ollama? info "Ollama will be installed with models stored under #{@config.ollama_models_dir}" - info "Ollama models to pre-pull: #{desired_ollama_models.join(', ')}" unless desired_ollama_models.empty? + models = @scripts.desired_ollama_models + info "Ollama models to pre-pull: #{models.join(', ')}" unless models.empty? end if effective_vllm? preset_cfg = effective_vllm_preset_config @@ -1880,11 +2217,12 @@ module HyperstackVM end if @config.guest_bootstrap_enabled? info 'Guest bootstrap script:' - @out.puts(guest_bootstrap_script) + @out.puts(@scripts.guest_bootstrap_script) end if ollama_setup_needed?(state) info "Ollama would be installed with models stored under #{@config.ollama_models_dir}" - info "Ollama models to pre-pull: #{desired_ollama_models.join(', ')}" unless desired_ollama_models.empty? 
+ models = @scripts.desired_ollama_models + info "Ollama models to pre-pull: #{models.join(', ')}" unless models.empty? end if vllm_setup_needed?(state) info "vLLM would be installed: #{@config.vllm_model}" @@ -1929,137 +2267,6 @@ module HyperstackVM ].join(', ') end - def guest_bootstrap_script - script = [] - script << 'set -euo pipefail' - - # Wait for any running unattended-upgrades or apt locks to release - # before attempting package operations (transient lock on fresh VMs) - script << 'echo "Waiting for apt locks to clear..."' - script << 'for i in $(seq 1 30); do' - script << ' if ! fuser /var/lib/dpkg/lock-frontend /var/lib/apt/lists/lock /var/cache/apt/archives/lock >/dev/null 2>&1; then break; fi' - script << ' echo " apt lock held, waiting ($i/30)..."; sleep 10' - script << 'done' - script << 'sudo systemctl stop unattended-upgrades.service 2>/dev/null || true' - script << 'sudo systemctl disable unattended-upgrades.service 2>/dev/null || true' - - if @config.install_wireguard? - script << 'which wg >/dev/null 2>&1 || (sudo apt-get update && sudo apt-get install -y wireguard)' - end - - if @config.configure_ufw? - script << "sudo ufw allow #{@config.ssh_port}/tcp comment 'Allow SSH' >/dev/null 2>&1 || true" - script << 'sudo ufw --force enable >/dev/null 2>&1 || true' - script << "sudo ufw allow #{@config.wireguard_udp_port}/udp comment 'WireGuard #{@config.local_interface_name}' >/dev/null 2>&1 || true" - # Port 11434 is shared by Ollama and vLLM; open for both regardless of which is installed. - script << "sudo ufw allow from #{Shellwords.escape(@config.wireguard_subnet)} to any port #{@config.ollama_port} proto tcp comment 'Inference API (Ollama/vLLM) via #{@config.local_interface_name}' >/dev/null 2>&1 || true" - # Port 4000: LiteLLM proxy (Anthropic API → vLLM); open alongside the inference port. 
- script << "sudo ufw allow from #{Shellwords.escape(@config.wireguard_subnet)} to any port #{@config.litellm_port} proto tcp comment 'LiteLLM proxy via #{@config.local_interface_name}' >/dev/null 2>&1 || true" - end - - if @config.configure_ollama_host? - # Only write a minimal OLLAMA_HOST override if no override exists yet; - # ollama_setup_script writes the full override (OLLAMA_MODELS, GPU_OVERHEAD, etc.) - script << "if systemctl list-unit-files | grep -q '^ollama.service'; then" - script << ' if [ ! -f /etc/systemd/system/ollama.service.d/override.conf ]; then' - script << ' sudo mkdir -p /etc/systemd/system/ollama.service.d' - script << " cat <<'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf >/dev/null" - script << '[Service]' - script << "Environment=\"OLLAMA_HOST=0.0.0.0:#{@config.ollama_port}\"" - script << 'OVERRIDE' - script << ' sudo systemctl daemon-reload' - script << ' sudo systemctl restart ollama || true' - script << ' fi' - script << 'fi' - end - - script << 'echo bootstrap-ok' - script.join("\n") - end - - def desired_ollama_models - normalized_model_list(@config.ollama_pull_models) - end - - def normalized_model_list(models) - Array(models).each_with_object([]) do |model, ordered| - normalized = model.to_s.strip - next if normalized.empty? || ordered.include?(normalized) - - ordered << normalized - end - end - - def model_list_signature(models) - normalized_model_list(models).sort - end - - # Installs the Ollama binary, configures the systemd override (models dir, - # listen host, GPU overhead, parallelism), and starts the service. Model - # pulls are handled separately by ollama_pull_script so that the WireGuard - # tunnel can be established first. - def ollama_install_script - models_dir = @config.ollama_models_dir - listen_host = @config.ollama_listen_host - - script = [] - script << 'set -euo pipefail' - script << 'sudo pkill -f unattended-upgrade >/dev/null 2>&1 || true' - script << 'if ! 
command -v ollama >/dev/null 2>&1; then curl -fsSL https://ollama.ai/install.sh | sh; fi' - if models_dir.start_with?('/ephemeral') - script << "mountpoint -q /ephemeral || { echo 'Expected /ephemeral mount is missing'; exit 1; }" - end - script << "sudo mkdir -p #{Shellwords.escape(models_dir)}" - script << "sudo chown -R ollama:ollama #{Shellwords.escape(File.dirname(models_dir))}" - script << 'sudo mkdir -p /etc/systemd/system/ollama.service.d' - script << "cat <<'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf >/dev/null" - script << '[Service]' - script << "Environment=\"OLLAMA_MODELS=#{models_dir}\"" - script << "Environment=\"OLLAMA_GPU_OVERHEAD=#{@config.ollama_gpu_overhead_mb}\"" - script << "Environment=\"OLLAMA_NUM_PARALLEL=#{@config.ollama_num_parallel}\"" - script << "Environment=\"OLLAMA_CONTEXT_LENGTH=#{@config.ollama_context_length}\"" - script << "Environment=\"OLLAMA_HOST=#{listen_host}\"" - script << 'OVERRIDE' - script << 'sudo systemctl daemon-reload' - script << 'sudo systemctl enable --now ollama' - script << 'sudo systemctl restart ollama' - script << 'sleep 3' - script << 'systemctl is-active --quiet ollama' - script << 'echo ollama-install-ok' - script.join("\n") - end - - # Pulls each configured model with retry and per-model + final verification. - # Run after WireGuard is up so the user can monitor progress over the tunnel. 
- def ollama_pull_script - models_dir = @config.ollama_models_dir - model_pulls = desired_ollama_models - - script = [] - script << 'set -euo pipefail' - # Pull each model with retry (transient network failures) and verify - # it is actually present afterwards - model_pulls.each do |model| - escaped = Shellwords.escape(model) - script << "echo \"Pulling model #{model}...\"" - script << 'for attempt in 1 2 3; do' - script << " if ollama pull #{escaped}; then break; fi" - script << " if [ \"$attempt\" -eq 3 ]; then echo \"FATAL: failed to pull #{model} after 3 attempts\"; exit 1; fi" - script << ' echo " pull attempt $attempt failed, retrying in 15s..."; sleep 15' - script << 'done' - script << "ollama show #{escaped} --modelfile >/dev/null 2>&1 || { echo \"FATAL: model #{model} not found after pull\"; exit 1; }" - end - # Final verification: ensure all expected models are listed - script << 'echo "Verifying all models are present..."' - model_pulls.each do |model| - escaped = Shellwords.escape(model) - script << "ollama show #{escaped} --modelfile >/dev/null 2>&1 || { echo \"FATAL: model #{model} missing in final check\"; exit 1; }" - end - script << "echo ollama-models-dir=#{models_dir}" - script << 'echo ollama-ok' - script.join("\n") - end - # Returns the effective Ollama flag: CLI override if set, else config default. def effective_ollama? defined?(@effective_ollama) ? @effective_ollama : @config.ollama_install_enabled? @@ -2088,220 +2295,6 @@ module HyperstackVM state['vllm_model'] != desired end - # Generates a script that stops and removes a named Docker container. - # Used when switching to a preset whose container_name differs from the current one. 
- def vllm_stop_script(container_name) - script = [] - script << 'set -euo pipefail' - script << "docker stop #{Shellwords.escape(container_name)} 2>/dev/null || true" - script << "docker rm #{Shellwords.escape(container_name)} 2>/dev/null || true" - script << 'echo vllm-stopped' - script.join("\n") - end - - def setup_vllm_stack(host, preset_config: nil) - info "Setting up vLLM Docker container on #{host}..." - output, status = run_ssh_command_streaming(host, vllm_install_script(preset_config: preset_config)) - raise Error, "vLLM install failed: #{output.strip}" unless status.success? - - model = preset_config&.dig('model') || @config.vllm_model - info "Setting up LiteLLM Anthropic-API proxy on #{host}..." - output, status = run_ssh_command_streaming(host, litellm_install_script(model_override: model)) - raise Error, "LiteLLM install failed: #{output.strip}" unless status.success? - end - - # Generates the remote shell script that (optionally) pulls the vLLM Docker image, - # starts the container, and polls until the model is fully loaded (up to 10 minutes - # to cover the first-run ~45 GB model download). - # preset_config overrides individual fields; unset fields fall back to [vllm] defaults. - # pull_image: true on initial install; false on model switch (image already present, - # and pulling every switch can trigger a multi-GB download if the image was updated). 
# @param preset_config [Hash, nil] string-keyed overrides ('model', 'container_name',
#   'max_model_len', 'gpu_memory_utilization', 'tensor_parallel_size',
#   'tool_call_parser', 'trust_remote_code', 'extra_vllm_args')
# @param pull_image [Boolean] emit a `docker pull` line before `docker run`
# @return [String] newline-joined shell script ending in `echo vllm-install-ok`
# @raise [ArgumentError, TypeError] when a numeric setting cannot be coerced
def vllm_install_script(preset_config: nil, pull_image: true)
  cfg = preset_config || {}
  image = 'vllm/vllm-openai:latest' # single source for both `docker pull` and `docker run`
  model = cfg['model'] || @config.vllm_model
  cache_dir = @config.vllm_hug_cache_dir # always use main config for shared HF cache
  compile_cache = @config.vllm_compile_cache_dir # persisted torch.compile artifacts
  container = cfg['container_name'] || @config.vllm_container_name
  max_len = Integer(cfg['max_model_len'] || @config.vllm_max_model_len)
  gpu_util = Float(cfg['gpu_memory_utilization'] || @config.vllm_gpu_memory_utilization)
  tp_size = Integer(cfg['tensor_parallel_size'] || @config.vllm_tensor_parallel_size)
  parser = cfg['tool_call_parser']
  # parser is nil only when preset explicitly omits the key and config has no default;
  # empty string means "disable tool calling" (e.g. gpt-oss reasoning models).
  parser = @config.vllm_tool_call_parser if parser.nil?
  # Fall back to the top-level [vllm] config values when no preset is in use.
  # This allows setting trust_remote_code / extra_vllm_args in the default [vllm] block
  # (e.g. for nemotron on VM1) without requiring a --model preset flag at create time.
  trust_remote = cfg.key?('trust_remote_code') ? cfg['trust_remote_code'] : @config.vllm_trust_remote_code
  port = @config.ollama_port # vLLM reuses the Ollama port for firewall compat

  escaped_container = Shellwords.escape(container)
  escaped_dirs = "#{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}"

  docker_args = [
    'docker run -d',
    '--gpus all', '--ipc=host', '--network host',
    "--name #{escaped_container}",
    '--restart always',
    "-v #{Shellwords.escape(cache_dir)}:/root/.cache/huggingface",
    # Mount torch.compile cache so CUDA kernel compilation is skipped on warm restarts.
    # Without this, every container restart recompiles (~30-60 s extra).
    "-v #{Shellwords.escape(compile_cache)}:/root/.cache/vllm",
    image,
    "--model #{Shellwords.escape(model)}",
    "--tensor-parallel-size #{tp_size}",
    '--enable-prefix-caching',
    "--gpu-memory-utilization #{gpu_util}",
    "--max-model-len #{max_len}",
    '--host 0.0.0.0',
    "--port #{port}"
  ]
  # Tool calling is optional: empty/nil parser disables it (e.g. gpt-oss reasoning models
  # crash vLLM's llama3_json parser due to the extra token_ids field in responses).
  unless parser.nil? || parser.empty?
    docker_args << '--enable-auto-tool-choice'
    docker_args << "--tool-call-parser #{Shellwords.escape(parser)}"
  end
  docker_args << '--trust-remote-code' if trust_remote
  # Append any extra flags verbatim (e.g. Mistral loader flags, reasoning parser).
  # Preset extra_vllm_args take precedence; fall back to top-level [vllm].extra_vllm_args.
  # Array() on both branches so an unset (nil) top-level value means "no extra flags".
  extra_args = Array(cfg.key?('extra_vllm_args') ? cfg['extra_vllm_args'] : @config.vllm_extra_args)
  docker_args.concat(extra_args)

  script = []
  script << 'set -euo pipefail'
  script << "sudo mkdir -p #{escaped_dirs}"
  script << "sudo chmod -R 0777 #{escaped_dirs}"
  # Stop and remove any existing container so re-runs are idempotent.
  script << "docker stop #{escaped_container} 2>/dev/null || true"
  script << "docker rm #{escaped_container} 2>/dev/null || true"
  # Pull the image only when explicitly requested (initial install / forced update).
  # Skipping pull on model switches avoids surprise multi-GB downloads when the image
  # has been updated upstream, which would otherwise add several minutes to a switch.
  script << "docker pull #{image}" if pull_image
  script << docker_args.join(' ')
  # Poll until the model is loaded:
  # first run: ~45 GB download (~2.5 min) + model load (~65 s) + CUDA graphs (~35 s) ≈ 4-5 min
  # warm restart: model load (~65 s) + CUDA graphs (~35 s, skipped if compile cache warm) ≈ 65-100 s
  # Timeout: 120 × 5 s = 10 minutes
  script << 'echo "Waiting for vLLM to become ready (up to 10 min for first model download)..."'
  script << 'for i in $(seq 1 120); do'
  script << " if curl -sf http://localhost:#{port}/v1/models >/dev/null 2>&1; then echo vllm-ready; break; fi"
  script << " state=$(docker inspect --format='{{.State.Status}}' #{escaped_container} 2>/dev/null || echo unknown)"
  script << ' echo " vLLM not ready yet ($i/120, container=$state)..."'
  script << ' sleep 5'
  script << 'done'
  # Final probe turns a timed-out loop into a non-zero exit for the caller.
  script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 10 minutes'; exit 1; }"
  script << 'echo vllm-install-ok'
  script.join("\n")
end

# Builds the shell lines that (re)write /ephemeral/litellm-config.yaml through a
# quoted heredoc. Shared by litellm_install_script and litellm_reload_script so
# both always emit the same configuration.
#
# @param model [String] HuggingFace model name served by vLLM
# @return [Array<String>] script lines from the `sudo tee` line through the
#   closing LITELLM_YAML marker
private def litellm_config_heredoc_lines(model)
  vllm_port = @config.ollama_port
  # YAML double-quoted scalar: backslash and double quote must be escaped,
  # otherwise a key or model name containing either yields an unparsable file.
  quoted = lambda do |value|
    escaped = value.to_s.gsub(/["\\]/) { |ch| "\\#{ch}" }
    %("#{escaped}")
  end
  backend = "hosted_vllm/#{model}"

  # Build model_list YAML entries; each Claude alias maps to the vLLM model.
  # "hosted_vllm/" prefix forces LiteLLM to use /v1/chat/completions (not /v1/responses).
  # Indentation is significant: litellm_params nests under the list item and
  # its keys nest under litellm_params.
  model_entries = @config.litellm_claude_model_names.flat_map do |name|
    [
      "  - model_name: #{quoted.call(name)}",
      '    litellm_params:',
      "      model: #{quoted.call(backend)}",
      "      api_base: \"http://localhost:#{vllm_port}/v1\"",
      '      api_key: "EMPTY"'
    ]
  end

  # Write litellm-config.yaml via heredoc; drop_params silently discards
  # Claude-specific params (e.g. context_management) that vLLM ignores.
  [
    "sudo tee /ephemeral/litellm-config.yaml > /dev/null << 'LITELLM_YAML'",
    'model_list:',
    *model_entries,
    '',
    'litellm_settings:',
    '  drop_params: true',
    '',
    'general_settings:',
    "  master_key: #{quoted.call(@config.litellm_master_key)}",
    'LITELLM_YAML'
  ]
end

# Generates the remote shell script that installs LiteLLM in a Python venv,
# writes a config mapping Claude model aliases to the vLLM endpoint, and
# starts the proxy as a systemd service on litellm_port.
# model_override replaces the HuggingFace model name in the generated YAML.
#
# @param model_override [String, nil] model name; defaults to @config.vllm_model
# @return [String] newline-joined shell script ending in `echo litellm-install-ok`
def litellm_install_script(model_override: nil)
  port = @config.litellm_port
  model = model_override || @config.vllm_model

  script = [
    'set -euo pipefail',
    'sudo apt-get install -y python3.12-venv',
    'sudo mkdir -p /ephemeral/litellm-env',
    'sudo chown ubuntu:ubuntu /ephemeral/litellm-env',
    'python3 -m venv /ephemeral/litellm-env',
    '/ephemeral/litellm-env/bin/pip install --quiet "litellm[proxy]"'
  ]
  script.concat(litellm_config_heredoc_lines(model))

  # Write systemd unit via heredoc; restart on failure so transient crashes self-heal.
  script.concat(
    [
      "sudo tee /etc/systemd/system/litellm.service > /dev/null << 'LITELLM_UNIT'",
      '[Unit]',
      'Description=LiteLLM Proxy',
      'After=network.target docker.service',
      'Requires=docker.service',
      '',
      '[Service]',
      'Type=simple',
      'User=ubuntu',
      "ExecStart=/ephemeral/litellm-env/bin/litellm --config /ephemeral/litellm-config.yaml --host 0.0.0.0 --port #{port}",
      'Restart=always',
      'RestartSec=5',
      '',
      '[Install]',
      'WantedBy=multi-user.target',
      'LITELLM_UNIT'
    ]
  )

  script << 'sudo systemctl daemon-reload'
  script << 'sudo systemctl enable --now litellm'
  script << 'sleep 5'
  # With `set -e`, a non-active service makes the whole script exit non-zero.
  script << 'systemctl is-active --quiet litellm'
  script << 'echo litellm-install-ok'
  script.join("\n")
end

# Rewrites /ephemeral/litellm-config.yaml for a different model and restarts
# the service in place — faster than litellm_install_script because it skips
# the venv creation and apt-get steps that are already in place.
#
# @param model [String] HuggingFace model name to route the Claude aliases to
# @return [String] newline-joined shell script ending in `echo litellm-reload-ok`
def litellm_reload_script(model)
  script = ['set -euo pipefail']
  script.concat(litellm_config_heredoc_lines(model))
  script << 'sudo systemctl restart litellm'
  script << 'sleep 3'
  script << 'systemctl is-active --quiet litellm'
  script << 'echo litellm-reload-ok'
  script.join("\n")
end

# Tests the vLLM OpenAI-compatible API: lists loaded models and runs a
# short inference request to confirm the model accepts requests.
# (next definition: def test_vllm(wg_ip) — its body is not part of this excerpt)
