Add vLLM + LiteLLM support; rename script; add README

- Replace Ollama (disabled by default) with vLLM Docker container + LiteLLM Anthropic-API proxy as the default inference backend
- vLLM setup: pulls vllm/vllm-openai, starts container on port 11434, polls until model is loaded (up to 10 min for first 45 GB download)
- LiteLLM setup: installs in Python venv, writes config mapping Claude model aliases to the vLLM model, runs as a systemd service on port 4000
- New CLI flags on `create`: --vllm/--no-vllm, --ollama/--no-ollama to override config at runtime
- New `test` command: end-to-end inference test over WireGuard against vLLM (/v1/models + /v1/chat/completions) and LiteLLM (/v1/messages)
- UFW rules now open both port 11434 (inference) and 4000 (LiteLLM) from the WireGuard subnet
- Rename hyperstack_vm.rb → hyperstack.rb
- Add README.md with quickstart, Claude Code / OpenCode usage, CLI reference, monitoring commands, and VRAM sizing notes
- Add vllm-setup.txt: detailed manual setup notes and architecture docs

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
author: Paul Buetow <paul@buetow.org> 2026-03-18 09:10:14 +0200
committer: Paul Buetow <paul@buetow.org> 2026-03-18 09:10:14 +0200
commit: d8575832ae0022f94cd786b15f8b88de0bf18672 (patch)
tree: 75872514846cfddb1434281a59b6673344023ff7 /snippets/hyperstack/hyperstack-vm.toml
parent: 8dca92ea40b191b9de367197aac7e1f882ed3d43 (diff)
1 files changed, 28 insertions, 2 deletions
diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml
index 2d83b0f..0ea3cfc 100644
--- a/snippets/hyperstack/hyperstack-vm.toml
+++ b/snippets/hyperstack/hyperstack-vm.toml
@@ -31,7 +31,10 @@ connect_timeout_sec = 10
 [network]
 wireguard_udp_port = 56710
 wireguard_subnet = "192.168.3.0/24"
+# ollama_port (11434) is also the port vLLM listens on, so the same firewall rule covers either backend.
 ollama_port = 11434
+# Port 4000: LiteLLM Anthropic-API proxy (used with vLLM).
+litellm_port = 4000
 allowed_ssh_cidrs = ["0.0.0.0/0"]
 allowed_wireguard_cidrs = ["0.0.0.0/0"]
 
@@ -42,13 +45,36 @@ configure_ufw = true
 configure_ollama_host = false
 
 [ollama]
-install = true
+# Disabled in favour of vLLM; set install = true to switch back to Ollama (presumably with [vllm] install = false, since both use port 11434).
+install = false
 models_dir = "/ephemeral/ollama/models"
 listen_host = "0.0.0.0:11434"
 gpu_overhead_mb = 2000
-num_parallel = 4
+num_parallel = 1
+context_length = 32768
 pull_models = ["qwen3-coder-next", "qwen3-coder:30b", "gpt-oss:20b", "gpt-oss:120b", "nemotron-3-super"]
 
+# vLLM serves one model via Docker; LiteLLM translates Anthropic API → OpenAI.
+# Use --vllm / --no-vllm CLI flags to override install at runtime.
+[vllm]
+install = true
+model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
+# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
+hug_cache_dir = "/ephemeral/hug"
+container_name = "vllm_qwen3"
+max_model_len = 262144
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "qwen3_coder"
+# LiteLLM settings. Each litellm_claude_model_names entry is mapped to the vLLM model; add new Anthropic model IDs to that list.
+litellm_master_key = "sk-litellm-master"
+litellm_claude_model_names = [
+  "claude-sonnet-4-20250514",
+  "claude-opus-4-20250514",
+  "claude-opus-4-6-20260604",
+  "claude-haiku-3-5-20241022"
+]
+
 [wireguard]
 auto_setup = true
 setup_script = "./wg1-setup.sh"
author	Paul Buetow <paul@buetow.org>	2026-03-18 09:10:14 +0200
committer	Paul Buetow <paul@buetow.org>	2026-03-18 09:10:14 +0200
commit	d8575832ae0022f94cd786b15f8b88de0bf18672 (patch)
tree	75872514846cfddb1434281a59b6673344023ff7 /snippets/hyperstack/hyperstack-vm.toml
parent	8dca92ea40b191b9de367197aac7e1f882ed3d43 (diff)