summaryrefslogtreecommitdiff
path: root/snippets
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2026-03-18 12:06:07 +0200
committerPaul Buetow <paul@buetow.org>2026-03-18 12:06:07 +0200
commit2a2704fa4cac96a6754d4fea1bc341a27c5bb6c8 (patch)
tree6bb555b988c8bef2b738c36a21905327567f27eb /snippets
parentb49cb03bb629a20dc459b8146ad8e735578d925d (diff)
Add vLLM model presets and live model switching
- New [vllm.presets.*] TOML section with two presets: qwen3-coder-next bullpoint/Qwen3-Coder-Next-AWQ-4bit (256k ctx, coding) nemotron-super solidrust/Llama-3.3-Nemotron-Super-49B-v1-AWQ (131k ctx, analysis) - New CLI subcommand: `model list` — show presets, mark the active one - New CLI subcommand: `model switch PRESET [--dry-run]` — switch the running VM to a different preset without redeploying: 1. stops old Docker container (if container_name differs) 2. starts new container and waits for model readiness 3. hot-reloads LiteLLM config via litellm_reload_script (no venv reinstall) 4. updates state file with new vllm_model / vllm_container_name / vllm_preset - New `create --model PRESET` flag — deploy with a non-default preset - vllm_install_script and litellm_install_script now accept preset_config:/ model_override: so callers can override individual fields without duplicating the full config - State file now tracks vllm_container_name and vllm_preset for clean container lifecycle management across switches Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
Diffstat (limited to 'snippets')
-rw-r--r--snippets/hyperstack/hyperstack-vm.toml26
-rw-r--r--snippets/hyperstack/hyperstack.rb241
2 files changed, 243 insertions, 24 deletions
diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml
index 0ea3cfc..c19c8d5 100644
--- a/snippets/hyperstack/hyperstack-vm.toml
+++ b/snippets/hyperstack/hyperstack-vm.toml
@@ -75,6 +75,32 @@ litellm_claude_model_names = [
"claude-haiku-3-5-20241022"
]
+# Named model presets for 'ruby hyperstack.rb model switch <name>'.
+# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.
+# Switch examples:
+# ruby hyperstack.rb model switch qwen3-coder-next # fast coding, 256k context
+# ruby hyperstack.rb model switch nemotron-super # extended analysis, 131k context
+
+[vllm.presets.qwen3-coder-next]
+model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
+container_name = "vllm_qwen3"
+max_model_len = 262144
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "qwen3_coder"
+
+# Nemotron-Super 49B AWQ — deep reasoning / extended code analysis.
+# ~25 GB weights + KV cache fits comfortably on A100 80GB.
+# Verify the exact HuggingFace AWQ model ID before first use:
+# curl -s http://192.168.3.1:11434/v1/models | python3 -m json.tool
+[vllm.presets.nemotron-super]
+model = "solidrust/Llama-3.3-Nemotron-Super-49B-v1-AWQ"
+container_name = "vllm_nemotron"
+max_model_len = 131072
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "llama3_json"
+
[wireguard]
auto_setup = true
setup_script = "./wg1-setup.sh"
diff --git a/snippets/hyperstack/hyperstack.rb b/snippets/hyperstack/hyperstack.rb
index c84d013..139129e 100644
--- a/snippets/hyperstack/hyperstack.rb
+++ b/snippets/hyperstack/hyperstack.rb
@@ -347,6 +347,34 @@ module HyperstackVM
fetch('vllm', 'litellm_master_key')
end
+ # Returns the hash of named presets from [vllm.presets.*].
+ # Each preset may override any subset of the top-level [vllm] fields.
+ def vllm_presets
+ Hash(dig('vllm', 'presets')).transform_keys(&:to_s)
+ end
+
+ def vllm_preset_names
+ vllm_presets.keys
+ end
+
+ # Resolves a named preset, merging its values over the [vllm] defaults
+ # so callers always get a complete set of parameters.
+ def vllm_preset(name)
+ raw = vllm_presets[name.to_s]
+ unless raw
+ available = vllm_preset_names.empty? ? 'none configured' : vllm_preset_names.join(', ')
+ raise Error, "Unknown vLLM preset #{name.inspect}. Available: #{available}"
+ end
+ {
+ 'model' => raw['model'] || vllm_model,
+ 'container_name' => raw['container_name'] || vllm_container_name,
+ 'max_model_len' => Integer(raw['max_model_len'] || vllm_max_model_len),
+ 'gpu_memory_utilization' => Float(raw['gpu_memory_utilization'] || vllm_gpu_memory_utilization),
+ 'tensor_parallel_size' => Integer(raw['tensor_parallel_size'] || vllm_tensor_parallel_size),
+ 'tool_call_parser' => raw['tool_call_parser'] || vllm_tool_call_parser
+ }
+ end
+
def local_client_checks_enabled?
truthy?(fetch('local_client', 'check_wg1_service'))
end
@@ -705,10 +733,13 @@ module HyperstackVM
@out = out
end
- def create(replace: false, dry_run: false, install_vllm: nil, install_ollama: nil)
+ def create(replace: false, dry_run: false, install_vllm: nil, install_ollama: nil, vllm_preset: nil)
# CLI flags override config; nil means "use config default".
@effective_vllm = install_vllm.nil? ? @config.vllm_install_enabled? : install_vllm
@effective_ollama = install_ollama.nil? ? @config.ollama_install_enabled? : install_ollama
+ # Validate preset name early so we fail before touching any remote state.
+ @effective_vllm_preset = vllm_preset
+ @config.vllm_preset(vllm_preset) if vllm_preset
existing_state = @state_store.load
if existing_state && existing_state['vm_id']
if replace
@@ -810,6 +841,77 @@ module HyperstackVM
print_local_wireguard_summary(state&.dig('public_ip'))
end
+ # Lists configured model presets and marks the one currently running on the VM.
+ def list_models
+ presets = @config.vllm_preset_names
+ state = @state_store.load
+ current = state&.dig('vllm_model')
+
+ if presets.empty?
+ info "No presets configured in [vllm.presets.*]."
+ info "Active model: #{current || @config.vllm_model}"
+ return
+ end
+
+ info 'Configured vLLM model presets:'
+ presets.each do |name|
+ p = @config.vllm_preset(name)
+ active = p['model'] == current
+ info " #{active ? '*' : ' '} #{name.ljust(24)} #{p['model']}"
+ end
+ info ''
+ info " (* = currently loaded on VM)" if current
+ end
+
+ # Switches the running VM to a different named model preset.
+ # Stops the old container, starts the new one, and hot-reloads LiteLLM config.
+ def switch_model(preset_name:, dry_run: false)
+ preset = @config.vllm_preset(preset_name) # raises if unknown
+ state = @state_store.load
+
+ old_container = state&.dig('vllm_container_name') || @config.vllm_container_name
+ new_container = preset['container_name']
+ current_model = state&.dig('vllm_model')
+
+ if dry_run
+ info "DRY RUN: model switch to preset '#{preset_name}'"
+ info " #{current_model || 'none'} → #{preset['model']}"
+ info " container: #{old_container} → #{new_container}"
+ info " max_model_len: #{preset['max_model_len']}, tool_call_parser: #{preset['tool_call_parser']}"
+ return
+ end
+
+ raise Error, "No tracked VM. Run 'create' first." unless state&.dig('vm_id')
+ host = state['public_ip']
+ raise Error, "No public IP in state file." if host.nil? || host.empty?
+
+ # Stop the old container only when it has a different name from the new one.
+ if old_container != new_container
+ info "Stopping old vLLM container #{old_container}..."
+ output, status = run_ssh_command_streaming(host, vllm_stop_script(old_container))
+ raise Error, "Failed to stop container #{old_container}: #{output.strip}" unless status.success?
+ end
+
+ info "Starting vLLM with preset '#{preset_name}' (#{preset['model']})..."
+ output, status = run_ssh_command_streaming(host, vllm_install_script(preset_config: preset))
+ raise Error, "vLLM install failed: #{output.strip}" unless status.success?
+
+ # Hot-reload LiteLLM: rewrite config for the new model and restart the service.
+ # Skips venv/apt install since those are already in place.
+ info "Reloading LiteLLM proxy config for #{preset['model']}..."
+ output, status = run_ssh_command_streaming(host, litellm_reload_script(preset['model']))
+ raise Error, "LiteLLM reload failed: #{output.strip}" unless status.success?
+
+ state['vllm_model'] = preset['model']
+ state['vllm_container_name'] = new_container
+ state['vllm_preset'] = preset_name
+ state['vllm_setup_at'] = Time.now.utc.iso8601
+ @state_store.save(state)
+
+ info "Model switched to '#{preset_name}' (#{preset['model']})."
+ info "Run 'ruby hyperstack.rb test' to verify."
+ end
+
# Runs end-to-end inference tests against vLLM and LiteLLM over WireGuard.
# Requires wg1 to be active and the VM to be fully provisioned.
def test
@@ -886,9 +988,12 @@ module HyperstackVM
# Set up vLLM (Docker container) + LiteLLM (Anthropic-API proxy) after
# the tunnel is up so that model-download progress is visible locally.
if vllm_setup_needed?(state)
- setup_vllm_stack(state['public_ip'])
- state['vllm_setup_at'] = Time.now.utc.iso8601
- state['vllm_model'] = @config.vllm_model
+ preset_cfg = effective_vllm_preset_config
+ setup_vllm_stack(state['public_ip'], preset_config: preset_cfg)
+ state['vllm_setup_at'] = Time.now.utc.iso8601
+ state['vllm_model'] = preset_cfg&.dig('model') || @config.vllm_model
+ state['vllm_container_name'] = preset_cfg&.dig('container_name') || @config.vllm_container_name
+ state['vllm_preset'] = @effective_vllm_preset
@state_store.save(state)
end
@@ -1245,8 +1350,13 @@ module HyperstackVM
end
end
if effective_vllm?
- info "vLLM will be installed: #{@config.vllm_model}"
- info " Container: #{@config.vllm_container_name}, port #{@config.ollama_port}, max_model_len #{@config.vllm_max_model_len}"
+ preset_cfg = effective_vllm_preset_config
+ vllm_m = preset_cfg&.dig('model') || @config.vllm_model
+ vllm_cname = preset_cfg&.dig('container_name') || @config.vllm_container_name
+ vllm_maxlen = preset_cfg&.dig('max_model_len') || @config.vllm_max_model_len
+ preset_note = @effective_vllm_preset ? " (preset: #{@effective_vllm_preset})" : ''
+ info "vLLM will be installed: #{vllm_m}#{preset_note}"
+ info " Container: #{vllm_cname}, port #{@config.ollama_port}, max_model_len #{vllm_maxlen}"
info "LiteLLM proxy will be installed on port #{@config.litellm_port}"
info " Claude model aliases: #{@config.litellm_claude_model_names.join(', ')}"
end
@@ -1456,35 +1566,59 @@ module HyperstackVM
defined?(@effective_vllm) ? @effective_vllm : @config.vllm_install_enabled?
end
+ # Returns the resolved preset config hash when a preset was selected via
+ # --model, or nil when using the top-level [vllm] defaults directly.
+ def effective_vllm_preset_config
+ name = defined?(@effective_vllm_preset) ? @effective_vllm_preset : nil
+ return nil unless name
+
+ @config.vllm_preset(name)
+ end
+
def vllm_setup_needed?(state)
return false unless effective_vllm?
- # Re-run if never set up, or if the configured model changed since last setup.
return true if state['vllm_setup_at'].nil?
- state['vllm_model'] != @config.vllm_model
+ # Re-run if the active model changed (direct config edit or --model preset flag).
+ desired = effective_vllm_preset_config&.dig('model') || @config.vllm_model
+ state['vllm_model'] != desired
+ end
+
+ # Generates a script that stops and removes a named Docker container.
+ # Used when switching to a preset whose container_name differs from the current one.
+ def vllm_stop_script(container_name)
+ script = []
+ script << 'set -euo pipefail'
+ script << "docker stop #{Shellwords.escape(container_name)} 2>/dev/null || true"
+ script << "docker rm #{Shellwords.escape(container_name)} 2>/dev/null || true"
+ script << 'echo vllm-stopped'
+ script.join("\n")
end
- def setup_vllm_stack(host)
+ def setup_vllm_stack(host, preset_config: nil)
info "Setting up vLLM Docker container on #{host}..."
- output, status = run_ssh_command_streaming(host, vllm_install_script)
+ output, status = run_ssh_command_streaming(host, vllm_install_script(preset_config: preset_config))
raise Error, "vLLM install failed: #{output.strip}" unless status.success?
+ model = preset_config&.dig('model') || @config.vllm_model
info "Setting up LiteLLM Anthropic-API proxy on #{host}..."
- output, status = run_ssh_command_streaming(host, litellm_install_script)
+ output, status = run_ssh_command_streaming(host, litellm_install_script(model_override: model))
raise Error, "LiteLLM install failed: #{output.strip}" unless status.success?
end
# Generates the remote shell script that pulls the vLLM Docker image, starts
# the container, and polls until the model is fully loaded (up to 10 minutes
# to cover the first-run ~45 GB model download).
- def vllm_install_script
- model = @config.vllm_model
- cache_dir = @config.vllm_hug_cache_dir
- container = @config.vllm_container_name
- max_len = @config.vllm_max_model_len
- gpu_util = @config.vllm_gpu_memory_utilization
- tp_size = @config.vllm_tensor_parallel_size
- parser = @config.vllm_tool_call_parser
+ # preset_config overrides individual fields; unset fields fall back to [vllm] defaults.
+ def vllm_install_script(preset_config: nil)
+ cfg = preset_config || {}
+ model = cfg['model'] || @config.vllm_model
+ cache_dir = @config.vllm_hug_cache_dir # always use main config for shared cache
+ container = cfg['container_name'] || @config.vllm_container_name
+ max_len = Integer(cfg['max_model_len'] || @config.vllm_max_model_len)
+ gpu_util = Float(cfg['gpu_memory_utilization'] || @config.vllm_gpu_memory_utilization)
+ tp_size = Integer(cfg['tensor_parallel_size'] || @config.vllm_tensor_parallel_size)
+ parser = cfg['tool_call_parser'] || @config.vllm_tool_call_parser
port = @config.ollama_port # vLLM reuses the Ollama port for firewall compat
docker_run = [
@@ -1533,10 +1667,11 @@ module HyperstackVM
# Generates the remote shell script that installs LiteLLM in a Python venv,
# writes a config mapping Claude model aliases to the vLLM endpoint, and
# starts the proxy as a systemd service on litellm_port.
- def litellm_install_script
+ # model_override replaces the HuggingFace model name in the generated YAML.
+ def litellm_install_script(model_override: nil)
port = @config.litellm_port
vllm_port = @config.ollama_port
- model = @config.vllm_model
+ model = model_override || @config.vllm_model
claude_names = @config.litellm_claude_model_names
master_key = @config.litellm_master_key
@@ -1599,6 +1734,44 @@ module HyperstackVM
script.join("\n")
end
+ # Rewrites /ephemeral/litellm-config.yaml for a different model and restarts
+ # the service in place — faster than litellm_install_script because it skips
+ # the venv creation and apt-get steps that are already in place.
+ def litellm_reload_script(model)
+ port = @config.litellm_port
+ vllm_port = @config.ollama_port
+ claude_names = @config.litellm_claude_model_names
+ master_key = @config.litellm_master_key
+
+ model_entries = claude_names.flat_map do |name|
+ [
+ " - model_name: \"#{name}\"",
+ ' litellm_params:',
+ " model: \"hosted_vllm/#{model}\"",
+ " api_base: \"http://localhost:#{vllm_port}/v1\"",
+ ' api_key: "EMPTY"'
+ ]
+ end
+
+ script = []
+ script << 'set -euo pipefail'
+ script << "sudo tee /ephemeral/litellm-config.yaml > /dev/null << 'LITELLM_YAML'"
+ script << 'model_list:'
+ script.concat(model_entries)
+ script << ''
+ script << 'litellm_settings:'
+ script << ' drop_params: true'
+ script << ''
+ script << 'general_settings:'
+ script << " master_key: \"#{master_key}\""
+ script << 'LITELLM_YAML'
+ script << 'sudo systemctl restart litellm'
+ script << 'sleep 3'
+ script << 'systemctl is-active --quiet litellm'
+ script << 'echo litellm-reload-ok'
+ script.join("\n")
+ end
+
# Tests the vLLM OpenAI-compatible API: lists loaded models and runs a
# short inference request to confirm the model accepts requests.
def test_vllm(wg_ip)
@@ -1718,10 +1891,12 @@ module HyperstackVM
puts opts
puts
puts 'Commands:'
- puts ' create [--replace] [--dry-run] [--vllm|--no-vllm] [--ollama|--no-ollama]'
+ puts ' create [--replace] [--dry-run] [--vllm|--no-vllm] [--ollama|--no-ollama] [--model PRESET]'
puts ' delete [--vm-id ID] [--dry-run]'
puts ' status'
puts ' test'
+ puts ' model list'
+ puts ' model switch PRESET [--dry-run]'
exit 0
end
end
@@ -1750,6 +1925,7 @@ module HyperstackVM
dry_run = false
install_vllm = nil
install_ollama = nil
+ vllm_preset = nil
parser = OptionParser.new do |opts|
opts.on('--replace', 'Delete the tracked VM before creating a new one') { replace = true }
opts.on('--dry-run', 'Resolve config and print the create plan without creating a VM') { dry_run = true }
@@ -1757,9 +1933,11 @@ module HyperstackVM
opts.on('--no-vllm', 'Disable vLLM+LiteLLM setup (overrides config)') { install_vllm = false }
opts.on('--ollama', 'Enable Ollama setup (overrides config)') { install_ollama = true }
opts.on('--no-ollama', 'Disable Ollama setup (overrides config)') { install_ollama = false }
+ opts.on('--model PRESET', 'Use a named vLLM model preset at create time') { |v| vllm_preset = v }
end
parser.parse!(@argv)
- manager.create(replace: replace, dry_run: dry_run, install_vllm: install_vllm, install_ollama: install_ollama)
+ manager.create(replace: replace, dry_run: dry_run, install_vllm: install_vllm,
+ install_ollama: install_ollama, vllm_preset: vllm_preset)
when 'delete'
vm_id = nil
dry_run = false
@@ -1775,8 +1953,23 @@ module HyperstackVM
manager.status
when 'test'
manager.test
+ when 'model'
+ sub = @argv.shift
+ raise Error, "Missing model subcommand. Use: model list | model switch PRESET [--dry-run]" if sub.nil?
+ case sub
+ when 'list'
+ manager.list_models
+ when 'switch'
+ preset = @argv.shift
+ raise Error, "Missing preset name. Usage: model switch PRESET [--dry-run]" if preset.nil?
+ dry_run = false
+ OptionParser.new { |o| o.on('--dry-run') { dry_run = true } }.parse!(@argv)
+ manager.switch_model(preset_name: preset, dry_run: dry_run)
+ else
+ raise Error, "Unknown model subcommand #{sub.inspect}. Use list or switch."
+ end
else
- raise Error, "Unknown command #{command.inspect}. Use create, delete, status, or test."
+ raise Error, "Unknown command #{command.inspect}. Use create, delete, status, test, or model."
end
end
end