From 2a2704fa4cac96a6754d4fea1bc341a27c5bb6c8 Mon Sep 17 00:00:00 2001
From: Paul Buetow <paul@buetow.org>
Date: Wed, 18 Mar 2026 12:06:07 +0200
Subject: Add vLLM model presets and live model switching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- New [vllm.presets.*] TOML section with two presets:
    qwen3-coder-next  bullpoint/Qwen3-Coder-Next-AWQ-4bit (256k ctx, coding)
    nemotron-super    solidrust/Llama-3.3-Nemotron-Super-49B-v1-AWQ (128k ctx, analysis)
- New CLI subcommand: `model list` — show presets, mark the active one
- New CLI subcommand: `model switch PRESET [--dry-run]` — switch the
  running VM to a different preset without redeploying:
    1. stops old Docker container (if container_name differs)
    2. starts new container and waits for model readiness
    3. hot-reloads LiteLLM config via litellm_reload_script (no venv reinstall)
    4. updates state file with new vllm_model / vllm_container_name / vllm_preset
- New `create --model PRESET` flag — deploy with a non-default preset
- vllm_install_script and litellm_install_script now accept preset_config:/
  model_override: so callers can override individual fields without
  duplicating the full config
- State file now tracks vllm_container_name and vllm_preset for clean
  container lifecycle management across switches

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 snippets/hyperstack/hyperstack-vm.toml | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'snippets/hyperstack/hyperstack-vm.toml')
diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml
index 0ea3cfc..c19c8d5 100644
--- a/snippets/hyperstack/hyperstack-vm.toml
+++ b/snippets/hyperstack/hyperstack-vm.toml
@@ -75,6 +75,32 @@ litellm_claude_model_names = [
   "claude-haiku-3-5-20241022"
 ]
 
+# Named model presets for 'ruby hyperstack.rb model switch <name>'.
+# Keys set in a preset override the matching [vllm] fields; unset keys fall back to the [vllm] defaults.
+# Switch examples:
+#   ruby hyperstack.rb model switch qwen3-coder-next  # fast coding, 256k context
+#   ruby hyperstack.rb model switch nemotron-super     # extended analysis, 128k context
+
+[vllm.presets.qwen3-coder-next]
+model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"  # model ID served by this preset
+container_name = "vllm_qwen3"  # 'model switch' stops the old container only if this name differs
+max_model_len = 262144  # 256 * 1024 tokens (the "256k context")
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "qwen3_coder"
+
+# Nemotron-Super 49B AWQ — deep reasoning / extended code analysis.
+# ~25 GB weights + KV cache fits comfortably on A100 80GB.
+# Verify the exact HuggingFace AWQ model ID before first use; once running, check the served ID:
+#   curl -s http://192.168.3.1:11434/v1/models | python3 -m json.tool
+[vllm.presets.nemotron-super]
+model = "solidrust/Llama-3.3-Nemotron-Super-49B-v1-AWQ"  # model ID served by this preset
+container_name = "vllm_nemotron"  # 'model switch' stops the old container only if this name differs
+max_model_len = 131072  # 128 * 1024 tokens
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "llama3_json"
+
 [wireguard]
 auto_setup = true
 setup_script = "./wg1-setup.sh"
-- 
cgit v1.2.3