From 2a2704fa4cac96a6754d4fea1bc341a27c5bb6c8 Mon Sep 17 00:00:00 2001
From: Paul Buetow
Date: Wed, 18 Mar 2026 12:06:07 +0200
Subject: Add vLLM model presets and live model switching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- New [vllm.presets.*] TOML section with two presets:
    qwen3-coder-next  bullpoint/Qwen3-Coder-Next-AWQ-4bit (256k ctx, coding)
    nemotron-super    solidrust/Llama-3.3-Nemotron-Super-49B-v1-AWQ (131k ctx, analysis)
- New CLI subcommand: `model list` — show presets, mark the active one
- New CLI subcommand: `model switch PRESET [--dry-run]` — switch the running
  VM to a different preset without redeploying:
    1. stops old Docker container (if container_name differs)
    2. starts new container and waits for model readiness
    3. hot-reloads LiteLLM config via litellm_reload_script (no venv reinstall)
    4. updates state file with new vllm_model / vllm_container_name / vllm_preset
- New `create --model PRESET` flag — deploy with a non-default preset
- vllm_install_script and litellm_install_script now accept preset_config:/
  model_override: so callers can override individual fields without
  duplicating the full config
- State file now tracks vllm_container_name and vllm_preset for clean
  container lifecycle management across switches

Co-Authored-By: Claude Sonnet 4.6 (1M context)
---
 snippets/hyperstack/hyperstack-vm.toml | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'snippets/hyperstack/hyperstack-vm.toml')

diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml
index 0ea3cfc..c19c8d5 100644
--- a/snippets/hyperstack/hyperstack-vm.toml
+++ b/snippets/hyperstack/hyperstack-vm.toml
@@ -75,6 +75,32 @@ litellm_claude_model_names = [
   "claude-haiku-3-5-20241022"
 ]
 
+# Named model presets for 'ruby hyperstack.rb model switch PRESET'.
+# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.
+# Switch examples:
+#   ruby hyperstack.rb model switch qwen3-coder-next   # fast coding, 256k context
+#   ruby hyperstack.rb model switch nemotron-super     # extended analysis, 131k context
+
+[vllm.presets.qwen3-coder-next]
+model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
+container_name = "vllm_qwen3"
+max_model_len = 262144  # 256 * 1024 tokens (the "256k" context)
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "qwen3_coder"
+
+# Nemotron-Super 49B AWQ — deep reasoning / extended code analysis.
+# ~25 GB weights + KV cache fits comfortably on A100 80GB.
+# Verify the exact HuggingFace AWQ model ID before first use:
+#   curl -s http://192.168.3.1:11434/v1/models | python3 -m json.tool
+[vllm.presets.nemotron-super]
+model = "solidrust/Llama-3.3-Nemotron-Super-49B-v1-AWQ"
+container_name = "vllm_nemotron"
+max_model_len = 131072  # 128 * 1024 tokens (the "131k" context)
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "llama3_json"
+
 [wireguard]
 auto_setup = true
 setup_script = "./wg1-setup.sh"
-- 
cgit v1.2.3