diff options
| author | Paul Buetow <paul@buetow.org> | 2026-03-21 09:46:58 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-03-21 09:46:58 +0200 |
| commit | c693f37a6115f3567cd4fcff4c256a6d20dd6fac (patch) | |
| tree | 04e18f502616535013bab0c7c513a1aabdb9c2f2 | |
| parent | 3f6ef419f52c3361c8914a27c7949c2c8f2be1c8 (diff) | |
moved
| -rw-r--r-- | snippets/hyperstack/.crush/logs/crush.log | 4 | ||||
| -rw-r--r-- | snippets/hyperstack/.gitignore | 4 | ||||
| -rw-r--r-- | snippets/hyperstack/.pi/settings.json | 8 | ||||
| -rw-r--r-- | snippets/hyperstack/Gemfile | 3 | ||||
| -rw-r--r-- | snippets/hyperstack/Gemfile.lock | 16 | ||||
| -rw-r--r-- | snippets/hyperstack/README.md | 186 | ||||
| -rw-r--r-- | snippets/hyperstack/hyperstack-vm.toml | 204 | ||||
| -rw-r--r-- | snippets/hyperstack/hyperstack-vm1.toml | 185 | ||||
| -rw-r--r-- | snippets/hyperstack/hyperstack-vm2.toml | 182 | ||||
| -rwxr-xr-x | snippets/hyperstack/hyperstack.rb | 2731 | ||||
| -rwxr-xr-x | snippets/hyperstack/pi-vm1 | 7 | ||||
| -rwxr-xr-x | snippets/hyperstack/pi-vm2 | 7 | ||||
| -rw-r--r-- | snippets/hyperstack/vllm-setup.txt | 487 | ||||
| -rwxr-xr-x | snippets/hyperstack/wg1-setup.sh | 414 |
14 files changed, 0 insertions, 4438 deletions
diff --git a/snippets/hyperstack/.crush/logs/crush.log b/snippets/hyperstack/.crush/logs/crush.log deleted file mode 100644 index 7745db8..0000000 --- a/snippets/hyperstack/.crush/logs/crush.log +++ /dev/null @@ -1,4 +0,0 @@ -{"time":"2026-01-29T21:33:05.561515639+02:00","level":"INFO","source":{"function":"github.com/charmbracelet/crush/internal/config.(*catwalkSync).Get.func1","file":"/home/paul/go/pkg/mod/github.com/charmbracelet/crush@v0.36.0/internal/config/catwalk.go","line":55},"msg":"Fetching providers from Catwalk"} -{"time":"2026-01-29T21:33:05.920268417+02:00","level":"INFO","source":{"function":"github.com/charmbracelet/crush/internal/config.cache[...].Store","file":"/home/paul/go/pkg/mod/github.com/charmbracelet/crush@v0.36.0/internal/config/provider.go","line":213},"msg":"Saving provider data to disk","path":"/home/paul/.local/share/crush/providers.json"} -{"time":"2026-01-29T21:33:05.923610816+02:00","level":"WARN","source":{"function":"github.com/charmbracelet/crush/internal/config.(*Config).configureProviders-range1","file":"/home/paul/go/pkg/mod/github.com/charmbracelet/crush@v0.36.0/internal/config/load.go","line":295},"msg":"Provider is missing API key, this might be OK for local providers","provider":"ollama"} -{"time":"2026-01-29T21:33:05.923686216+02:00","level":"WARN","source":{"function":"github.com/charmbracelet/crush/internal/config.(*Config).configureProviders-range1","file":"/home/paul/go/pkg/mod/github.com/charmbracelet/crush@v0.36.0/internal/config/load.go","line":309},"msg":"Provider is missing API key, this might be OK for local providers","provider":"ollama"} diff --git a/snippets/hyperstack/.gitignore b/snippets/hyperstack/.gitignore deleted file mode 100644 index 132d791..0000000 --- a/snippets/hyperstack/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -.bundle/ -vendor/bundle/ -.hyperstack-vm-state.json -.hyperstack-vm*-state.json* diff --git a/snippets/hyperstack/.pi/settings.json b/snippets/hyperstack/.pi/settings.json deleted file 
mode 100644 index 23f5df6..0000000 --- a/snippets/hyperstack/.pi/settings.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "defaultProvider": "hyperstack1", - "defaultModel": "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit", - "enabledModels": [ - "hyperstack1/cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit", - "hyperstack2/bullpoint/Qwen3-Coder-Next-AWQ-4bit" - ] -} diff --git a/snippets/hyperstack/Gemfile b/snippets/hyperstack/Gemfile deleted file mode 100644 index a1bbd94..0000000 --- a/snippets/hyperstack/Gemfile +++ /dev/null @@ -1,3 +0,0 @@ -source "https://rubygems.org" - -gem "toml-rb", "~> 2.2" diff --git a/snippets/hyperstack/Gemfile.lock b/snippets/hyperstack/Gemfile.lock deleted file mode 100644 index 80e05d4..0000000 --- a/snippets/hyperstack/Gemfile.lock +++ /dev/null @@ -1,16 +0,0 @@ -GEM - remote: https://rubygems.org/ - specs: - citrus (3.0.2) - toml-rb (2.2.0) - citrus (~> 3.0, > 3.0) - -PLATFORMS - ruby - x86_64-linux - -DEPENDENCIES - toml-rb (~> 2.2) - -BUNDLED WITH - 2.6.9 diff --git a/snippets/hyperstack/README.md b/snippets/hyperstack/README.md deleted file mode 100644 index 730b310..0000000 --- a/snippets/hyperstack/README.md +++ /dev/null @@ -1,186 +0,0 @@ -# hyperstack - -Automates Hyperstack GPU VM lifecycle: create, bootstrap, WireGuard tunnel, vLLM inference, LiteLLM proxy. - -## Architecture - -``` -Claude Code (local) Hyperstack VM (A100 80GB) -┌─────────────────┐ ┌──────────────────────────────────┐ -│ claude CLI │── Anthropic API ─▶│ LiteLLM proxy (:4000) │ -│ │ /v1/messages │ Anthropic → OpenAI translation │ -│ │ via WireGuard │ │ │ -└─────────────────┘ │ ▼ │ - │ vLLM engine (:11434) │ -OpenCode (local) │ bullpoint/Qwen3-Coder-Next- │ -┌─────────────────┐ │ AWQ-4bit (45 GB, MoE 80B) │ -│ opencode │── OpenAI API ────▶│ FlashAttention v2 │ -│ │ /v1/chat/... │ prefix caching │ -└─────────────────┘ └──────────────────────────────────┘ -``` - -Both local clients connect over a WireGuard tunnel (`wg1`, subnet `192.168.3.0/24`). 
-The VM gets `192.168.3.1`; your local machine gets `192.168.3.2`. - -## Prerequisites - -- Hyperstack account with API key in `~/.hyperstack` -- SSH key registered in Hyperstack as `earth` (or change `ssh.hyperstack_key_name` in the TOML) -- Review `[network].allowed_ssh_cidrs` and `[network].allowed_wireguard_cidrs` in your TOML. - The secure default is `["auto"]`, which resolves your current public egress IP to `/32`. - Set explicit CIDRs or `HYPERSTACK_OPERATOR_CIDR` if you deploy from a different network. -- WireGuard setup script: `wg1-setup.sh` (present in this directory) -- Ruby with `toml-rb` gem: `bundle install` - -## Quickstart - -```bash -# Deploy VM, set up WireGuard + vLLM + LiteLLM (~10 min on first run) -ruby hyperstack.rb create - -# Verify everything is working -ruby hyperstack.rb test - -# Use Claude Code against the local vLLM -ANTHROPIC_BASE_URL=http://hyperstack.wg1:4000 \ -ANTHROPIC_API_KEY=sk-litellm-master \ -claude --model claude-opus-4-6-20260604 --dangerously-skip-permissions - -# Tear down -# Also removes the tracked local wg1 peer, hostname alias, and pinned SSH host key. -ruby hyperstack.rb delete -``` - -## Using Pi - -Bring both VMs up first: - -```bash -ruby hyperstack.rb create-both -``` - -Then start one Pi session per terminal: - -```bash -./pi-vm1 -./pi-vm2 -``` - -These wrappers `cd` into this repo before launching Pi, so the project-local -settings in `.pi/settings.json` still apply. - -## Using Claude Code with vLLM - -WireGuard (`wg1`) must be active before connecting. 
- -```bash -ANTHROPIC_BASE_URL=http://hyperstack.wg1:4000 \ -ANTHROPIC_API_KEY=sk-litellm-master \ -claude --model claude-opus-4-6-20260604 --dangerously-skip-permissions -``` - -If you see an **"Auth conflict"** warning, clear the saved claude.ai session first: - -```bash -claude /logout -``` - -**Fish shell alias** (add to `~/.config/fish/config.fish`): - -```fish -alias claude-local='ANTHROPIC_BASE_URL=http://hyperstack.wg1:4000 \ - ANTHROPIC_API_KEY=sk-litellm-master \ - claude --model claude-opus-4-6-20260604 --dangerously-skip-permissions' -``` - -**Available model aliases** — all map to the same vLLM model: - -| Alias | Use case | -|-------|----------| -| `claude-opus-4-6-20260604` | Recommended (most future-proof) | -| `claude-opus-4-20250514` | | -| `claude-sonnet-4-20250514` | | -| `claude-haiku-3-5-20241022` | | - -Add new Anthropic model IDs to `vllm.litellm_claude_model_names` in `hyperstack-vm.toml` as they are released. - -## Using OpenCode with vLLM - -OpenCode speaks OpenAI natively — connect directly to vLLM, no LiteLLM needed: - -```bash -OPENAI_BASE_URL=http://hyperstack.wg1:11434/v1 \ -OPENAI_API_KEY=EMPTY \ -opencode -``` - -Set the model name to `bullpoint/Qwen3-Coder-Next-AWQ-4bit` in your OpenCode config. - -## CLI reference - -``` -ruby hyperstack.rb [--config path] <command> [options] - -Commands: - create Deploy a new VM and run full provisioning - delete Destroy the tracked VM - status Show VM and WireGuard status - test Run end-to-end inference tests (vLLM + LiteLLM) - -create options: - --replace Delete existing tracked VM before creating - --dry-run Print the plan without making changes - --vllm / --no-vllm Override config: enable/disable vLLM+LiteLLM setup - --ollama / --no-ollama Override config: enable/disable Ollama setup -``` - -## Configuration - -Edit `hyperstack-vm.toml` to change defaults. 
Key sections: - -| Section | Purpose | -|---------|---------| -| `[vm]` | Flavor, image, environment name | -| `[vllm]` | Model, container settings, LiteLLM key and Claude aliases | -| `[ollama]` | Ollama settings (disabled by default; set `install = true` to use instead) | -| `[network]` | Ports, WireGuard subnet, allowed CIDRs | -| `[wireguard]` | Auto-setup script path | - -`allowed_ssh_cidrs` and `allowed_wireguard_cidrs` accept either explicit CIDRs such as -`["203.0.113.4/32"]` or `["auto"]`. `auto` resolves the current public operator IP at runtime; -set `HYPERSTACK_OPERATOR_CIDR` to override that detection when needed. - -SSH host keys are pinned per state file in `<state>.known_hosts`. `delete` and `--replace` -clear that trust file for intentional reprovisioning; unexpected host key changes now fail closed. - -## Monitoring vLLM - -```bash -# Live engine stats (throughput, KV cache, prefix cache hit rate) -ssh ubuntu@<vm-ip> 'docker logs -f vllm_qwen3 2>&1 | grep "Engine 000"' - -# Last 1 minute of stats -ssh ubuntu@<vm-ip> 'docker logs --since 1m vllm_qwen3 2>&1 | grep "Engine 000"' - -# GPU stats (every 5 s) -ssh ubuntu@<vm-ip> 'nvidia-smi --query-gpu=temperature.gpu,utilization.gpu,power.draw,memory.used --format=csv -l 5' - -# LiteLLM proxy log -ssh ubuntu@<vm-ip> 'sudo journalctl -fu litellm' -``` - -Healthy baseline (A100 80GB PCIe, qwen3-coder-next AWQ 4-bit): - -| Metric | Expected | -|--------|----------| -| Prefill throughput | 5,000–11,000 tok/s | -| Decode throughput | 40–99 tok/s | -| KV cache usage | 2–5% for typical sessions | -| Prefix cache hit (Claude Code) | 0% (expected — prompt prefix mutates each turn) | -| Prefix cache hit (OpenCode) | >50% after warm-up | - -## Switching models - -Stop the current container, start a new one with a different `--model`, then update `vllm.model` in `hyperstack-vm.toml` and re-run `ruby hyperstack.rb create` to reinstall LiteLLM with the updated config. 
- -See `vllm-setup.txt` for detailed vLLM and LiteLLM setup notes, VRAM sizing guide, and troubleshooting. diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml deleted file mode 100644 index e82c97f..0000000 --- a/snippets/hyperstack/hyperstack-vm.toml +++ /dev/null @@ -1,204 +0,0 @@ -[auth] -api_key_file = "~/.hyperstack" - -[hyperstack] -base_url = "https://infrahub-api.nexgencloud.com/v1" - -[state] -file = ".hyperstack-vm-state.json" - -[vm] -name_prefix = "hyperstack" -hostname = "hyperstack" -environment_name = "snonux-ollama" - -# A100-80GB is the cost-first default for gpt-oss-120b inference. -# Switch this to n3-H100x1 if you want safer throughput and compatibility headroom. -flavor_name = "n3-A100x1" -image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker" -assign_floating_ip = true -create_bootable_volume = false -enable_port_randomization = false -labels = ["gpt-oss-120b", "wireguard"] - -[ssh] -username = "ubuntu" -private_key_path = "~/.ssh/id_rsa" -hyperstack_key_name = "earth" -port = 22 -connect_timeout_sec = 10 - -[network] -wireguard_udp_port = 56710 -wireguard_subnet = "192.168.3.0/24" -# Secure default: "auto" resolves your current public egress IP to /32 at runtime. -# Override with explicit CIDRs if you deploy from multiple networks or want broader access. -allowed_ssh_cidrs = ["auto"] -allowed_wireguard_cidrs = ["auto"] -# Port 11434 is shared by both Ollama and vLLM for firewall compatibility. -ollama_port = 11434 -# Port 4000: LiteLLM Anthropic-API proxy (used with vLLM). -litellm_port = 4000 - -[bootstrap] -enable_guest_bootstrap = true -install_wireguard = true -configure_ufw = true -configure_ollama_host = false - -[ollama] -# Disabled in favour of vLLM; set install = true to switch back to Ollama. 
-install = false -models_dir = "/ephemeral/ollama/models" -listen_host = "0.0.0.0:11434" -gpu_overhead_mb = 2000 -num_parallel = 1 -context_length = 32768 -pull_models = ["qwen3-coder-next", "qwen3-coder:30b", "gpt-oss:20b", "gpt-oss:120b", "nemotron-3-super"] - -# vLLM serves one model via Docker; LiteLLM translates Anthropic API → OpenAI. -# Use --vllm / --no-vllm CLI flags to override install at runtime. -[vllm] -install = true -model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit" -# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers). -hug_cache_dir = "/ephemeral/hug" -container_name = "vllm_qwen3" -max_model_len = 262144 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_coder" -# LiteLLM maps each entry to the vLLM model; add new Anthropic model IDs here. -litellm_master_key = "sk-litellm-master" -litellm_claude_model_names = [ - "claude-sonnet-4-20250514", - "claude-opus-4-20250514", - "claude-opus-4-6-20260604", - "claude-haiku-3-5-20241022" -] - -# Named model presets for 'ruby hyperstack.rb model switch <name>'. -# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults. -# Switch examples: -# ruby hyperstack.rb model switch qwen3-coder-next # fast coding, 256k context -# ruby hyperstack.rb model switch nemotron-super # extended analysis, 131k context - -[vllm.presets.qwen3-coder-next] -model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit" -container_name = "vllm_qwen3" -max_model_len = 262144 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_coder" - -# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total). -# ~60 GB weights on A100 80GB. Uses NoPE (no positional embeddings) so context can be set to -# 1M by just raising max_model_len; no YaRN needed. May OOM above 256K on A100 80GB. -# Requires trust_remote_code=true for the nemotron_h architecture. 
-# Note: cyankiwi AWQ has model_type="nemotron_nas" (underscore); vLLM keys on "nemotron-nas" -# (hyphen), so vLLM may not recognise it without trust_remote_code and latest vLLM. -# NVIDIA Nemotron-3-Super uses the same XML tool call format as Qwen3 XML: -# <tool_call><function=name><parameter=p>value</parameter></function></tool_call> -# qwen3_xml handles this format and is compatible with Nemotron's chat template. -[vllm.presets.nemotron-super] -model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit" -container_name = "vllm_nemotron_super" -max_model_len = 262144 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_xml" -trust_remote_code = true -# nemotron_v3 reasoning parser exposes <think> tokens as reasoning_content in the API. -extra_vllm_args = ["--reasoning-parser", "nemotron_v3"] - -# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100. -# Native MXFP4 quantization; vLLM auto-detects it (no --quantization flag needed). -# With only 14 GB weights, most of the 80 GB is available for KV cache (64K+ context). -# tool_call_parser = "" disables --enable-auto-tool-choice: the llama3_json parser crashes -# on gpt-oss responses (vLLM 0.17.1 adds token_ids to responses, breaking the parser API). -[vllm.presets.gpt-oss-20b] -model = "openai/gpt-oss-20b" -container_name = "vllm_gpt_oss_20b" -max_model_len = 65536 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" - -# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100. -# Hard architecture limit: max_position_embeddings=131072 in model config.json. -# 131072 is the absolute ceiling — exceeding it causes NaN or CUDA OOB errors. -# For sessions approaching this limit, start a fresh opencode conversation. -# tool_call_parser = "" disables --enable-auto-tool-choice (same reason as gpt-oss-20b). 
-[vllm.presets.gpt-oss-120b] -model = "openai/gpt-oss-120b" -container_name = "vllm_gpt_oss_120b" -max_model_len = 131072 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" - -# Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100. -# Official Qwen AWQ release; max_position_embeddings=32768 per model config.json. -[vllm.presets.qwen25-coder-32b] -model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ" -container_name = "vllm_qwen25_coder32b" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "hermes" - -# Qwen3-Coder-30B-A3B AWQ — Qwen3 generation coding MoE (3B active / 30B total), ~18 GB. -# Note: model card warns of significant quality loss at 4-bit for this MoE architecture. -[vllm.presets.qwen3-coder-30b] -model = "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ" -container_name = "vllm_qwen3_coder30b" -max_model_len = 65536 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_coder" - -# DeepSeek-R1-Distill-Qwen-32B AWQ — R1 reasoning distillation of Qwen 32B, ~18 GB on A100. -# Generates <think> reasoning tokens; --reasoning-parser deepseek_r1 exposes them in the API. -# tool_call_parser="" disables tool calling (reasoning models don't support it reliably). -[vllm.presets.deepseek-r1-32b] -model = "casperhansen/deepseek-r1-distill-qwen-32b-awq" -container_name = "vllm_deepseek_r1_32b" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" -extra_vllm_args = ["--reasoning-parser", "deepseek_r1"] - -# Qwen3-32B AWQ — dense 32B reasoning model with extended context, ~18 GB on A100. -# Native thinking mode; --reasoning-parser deepseek_r1 is compatible with Qwen3 thinking format. -# tool_call_parser="" disables tool calling (reasoning models don't support it reliably). 
-[vllm.presets.qwen3-32b] -model = "Qwen/Qwen3-32B-AWQ" -container_name = "vllm_qwen3_32b" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" -extra_vllm_args = ["--reasoning-parser", "deepseek_r1"] - -# Devstral-Small-2507 AWQ — Mistral's coding agent model (~15 GB on A100). -# Uses HF safetensors weights but Mistral tokenizer (tekken.json) and config (params.json). -# --load_format mistral is NOT used: AWQ weights are in standard HF safetensors format. -# --tokenizer_mode mistral and --config_format mistral handle the Mistral-native files. -[vllm.presets.devstral] -model = "cyankiwi/Devstral-Small-2507-AWQ-4bit" -container_name = "vllm_devstral" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "mistral" -extra_vllm_args = ["--tokenizer_mode", "mistral", "--config_format", "mistral"] - -[wireguard] -auto_setup = true -setup_script = "./wg1-setup.sh" - -[local_client] -check_wg1_service = true -interface_name = "wg1" -config_path = "/etc/wireguard/wg1.conf" diff --git a/snippets/hyperstack/hyperstack-vm1.toml b/snippets/hyperstack/hyperstack-vm1.toml deleted file mode 100644 index 1b116bd..0000000 --- a/snippets/hyperstack/hyperstack-vm1.toml +++ /dev/null @@ -1,185 +0,0 @@ -[auth] -api_key_file = "~/.hyperstack" - -[hyperstack] -base_url = "https://infrahub-api.nexgencloud.com/v1" - -[state] -# Separate state file for VM1 so vm1 and vm2 can be managed independently. -file = ".hyperstack-vm1-state.json" - -[vm] -name_prefix = "hyperstack1" -hostname = "hyperstack1" -environment_name = "snonux-ollama" - -# A100-80GB is the cost-first default for nemotron-3-super inference. -# Switch this to n3-H100x1 if you want safer throughput and compatibility headroom. 
-flavor_name = "n3-A100x1" -image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker" -assign_floating_ip = true -create_bootable_volume = false -enable_port_randomization = false -labels = ["nemotron-3-super", "wireguard"] - -[ssh] -username = "ubuntu" -private_key_path = "~/.ssh/id_rsa" -hyperstack_key_name = "earth" -port = 22 -connect_timeout_sec = 10 - -[network] -wireguard_udp_port = 56710 -wireguard_subnet = "192.168.3.0/24" -# VM1 gets the first server-side WireGuard IP (gateway address + 0). -# earth (client) is 192.168.3.2; VM1 is 192.168.3.1; VM2 is 192.168.3.3. -wireguard_server_ip = "192.168.3.1" -# Secure default: "auto" resolves your current public egress IP to /32 at runtime. -# Override with explicit CIDRs if you deploy from multiple networks or want broader access. -allowed_ssh_cidrs = ["auto"] -allowed_wireguard_cidrs = ["auto"] -# Port 11434 is shared by both Ollama and vLLM for firewall compatibility. -ollama_port = 11434 -# Port 4000: LiteLLM Anthropic-API proxy (used with vLLM). -litellm_port = 4000 - -[bootstrap] -enable_guest_bootstrap = true -install_wireguard = true -configure_ufw = true -configure_ollama_host = false - -[ollama] -# Disabled in favour of vLLM; set install = true to switch back to Ollama. -install = false -models_dir = "/ephemeral/ollama/models" -listen_host = "0.0.0.0:11434" -gpu_overhead_mb = 2000 -num_parallel = 1 -context_length = 32768 -pull_models = ["nemotron-3-super"] - -# vLLM serves one model via Docker; LiteLLM translates Anthropic API → OpenAI. -# VM1 defaults to nemotron-3-super; use 'model switch' to load any other preset. -[vllm] -install = true -model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit" -# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers). 
-hug_cache_dir = "/ephemeral/hug" -container_name = "vllm_nemotron_super" -max_model_len = 262144 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -# NVIDIA Nemotron-3-Super uses the same XML tool call format as Qwen3 XML. -tool_call_parser = "qwen3_xml" -trust_remote_code = true -extra_vllm_args = ["--reasoning-parser", "nemotron_v3"] -# LiteLLM maps each entry to the vLLM model; add new Anthropic model IDs here. -litellm_master_key = "sk-litellm-master" -litellm_claude_model_names = [ - "claude-sonnet-4-20250514", - "claude-opus-4-20250514", - "claude-opus-4-6-20260604", - "claude-haiku-3-5-20241022" -] - -# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm1.toml model switch <name>'. -# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults. - -[vllm.presets.qwen3-coder-next] -model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit" -container_name = "vllm_qwen3" -max_model_len = 262144 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_coder" - -# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total). -# ~60 GB weights on A100 80GB. Uses NoPE so context can be set to 1M; no YaRN needed. -# Requires trust_remote_code=true for the nemotron_h architecture. -[vllm.presets.nemotron-super] -model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit" -container_name = "vllm_nemotron_super" -max_model_len = 262144 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_xml" -trust_remote_code = true -extra_vllm_args = ["--reasoning-parser", "nemotron_v3"] - -# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100. 
-[vllm.presets.gpt-oss-20b] -model = "openai/gpt-oss-20b" -container_name = "vllm_gpt_oss_20b" -max_model_len = 65536 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" - -# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100. -# Hard architecture limit: max_position_embeddings=131072 in model config.json. -[vllm.presets.gpt-oss-120b] -model = "openai/gpt-oss-120b" -container_name = "vllm_gpt_oss_120b" -max_model_len = 131072 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" - -# Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100. -[vllm.presets.qwen25-coder-32b] -model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ" -container_name = "vllm_qwen25_coder32b" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "hermes" - -# Qwen3-Coder-30B-A3B AWQ — Qwen3 generation coding MoE (3B active / 30B total), ~18 GB. -[vllm.presets.qwen3-coder-30b] -model = "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ" -container_name = "vllm_qwen3_coder30b" -max_model_len = 65536 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_coder" - -# DeepSeek-R1-Distill-Qwen-32B AWQ — R1 reasoning distillation of Qwen 32B, ~18 GB on A100. -[vllm.presets.deepseek-r1-32b] -model = "casperhansen/deepseek-r1-distill-qwen-32b-awq" -container_name = "vllm_deepseek_r1_32b" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" -extra_vllm_args = ["--reasoning-parser", "deepseek_r1"] - -# Qwen3-32B AWQ — dense 32B reasoning model with extended context, ~18 GB on A100. 
-[vllm.presets.qwen3-32b] -model = "Qwen/Qwen3-32B-AWQ" -container_name = "vllm_qwen3_32b" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" -extra_vllm_args = ["--reasoning-parser", "deepseek_r1"] - -# Devstral-Small-2507 AWQ — Mistral's coding agent model (~15 GB on A100). -[vllm.presets.devstral] -model = "cyankiwi/Devstral-Small-2507-AWQ-4bit" -container_name = "vllm_devstral" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "mistral" -extra_vllm_args = ["--tokenizer_mode", "mistral", "--config_format", "mistral"] - -[wireguard] -auto_setup = true -setup_script = "./wg1-setup.sh" - -[local_client] -check_wg1_service = true -interface_name = "wg1" -config_path = "/etc/wireguard/wg1.conf" diff --git a/snippets/hyperstack/hyperstack-vm2.toml b/snippets/hyperstack/hyperstack-vm2.toml deleted file mode 100644 index e8e9b00..0000000 --- a/snippets/hyperstack/hyperstack-vm2.toml +++ /dev/null @@ -1,182 +0,0 @@ -[auth] -api_key_file = "~/.hyperstack" - -[hyperstack] -base_url = "https://infrahub-api.nexgencloud.com/v1" - -[state] -# Separate state file for VM2 so vm1 and vm2 can be managed independently. -file = ".hyperstack-vm2-state.json" - -[vm] -name_prefix = "hyperstack2" -hostname = "hyperstack2" -environment_name = "snonux-ollama" - -# A100-80GB is the cost-first default for qwen3-coder-next inference. -# Switch this to n3-H100x1 if you want safer throughput and compatibility headroom. 
-flavor_name = "n3-A100x1" -image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker" -assign_floating_ip = true -create_bootable_volume = false -enable_port_randomization = false -labels = ["qwen3-coder-next", "wireguard"] - -[ssh] -username = "ubuntu" -private_key_path = "~/.ssh/id_rsa" -hyperstack_key_name = "earth" -port = 22 -connect_timeout_sec = 10 - -[network] -wireguard_udp_port = 56710 -wireguard_subnet = "192.168.3.0/24" -# VM2 gets the third server-side WireGuard IP (skipping .2 which is the earth client). -# earth (client) is 192.168.3.2; VM1 is 192.168.3.1; VM2 is 192.168.3.3. -wireguard_server_ip = "192.168.3.3" -# Secure default: "auto" resolves your current public egress IP to /32 at runtime. -# Override with explicit CIDRs if you deploy from multiple networks or want broader access. -allowed_ssh_cidrs = ["auto"] -allowed_wireguard_cidrs = ["auto"] -# Port 11434 is shared by both Ollama and vLLM for firewall compatibility. -ollama_port = 11434 -# Port 4000: LiteLLM Anthropic-API proxy (used with vLLM). -litellm_port = 4000 - -[bootstrap] -enable_guest_bootstrap = true -install_wireguard = true -configure_ufw = true -configure_ollama_host = false - -[ollama] -# Disabled in favour of vLLM; set install = true to switch back to Ollama. -install = false -models_dir = "/ephemeral/ollama/models" -listen_host = "0.0.0.0:11434" -gpu_overhead_mb = 2000 -num_parallel = 1 -context_length = 32768 -pull_models = ["qwen3-coder-next"] - -# vLLM serves one model via Docker; LiteLLM translates Anthropic API → OpenAI. -# VM2 defaults to qwen3-coder-next; use 'model switch' to load any other preset. -[vllm] -install = true -model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit" -# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers). 
-hug_cache_dir = "/ephemeral/hug" -container_name = "vllm_qwen3" -max_model_len = 262144 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_coder" -# LiteLLM maps each entry to the vLLM model; add new Anthropic model IDs here. -litellm_master_key = "sk-litellm-master" -litellm_claude_model_names = [ - "claude-sonnet-4-20250514", - "claude-opus-4-20250514", - "claude-opus-4-6-20260604", - "claude-haiku-3-5-20241022" -] - -# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm2.toml model switch <name>'. -# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults. - -[vllm.presets.qwen3-coder-next] -model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit" -container_name = "vllm_qwen3" -max_model_len = 262144 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_coder" - -# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total). -# ~60 GB weights on A100 80GB. Uses NoPE so context can be set to 1M; no YaRN needed. -# Requires trust_remote_code=true for the nemotron_h architecture. -[vllm.presets.nemotron-super] -model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit" -container_name = "vllm_nemotron_super" -max_model_len = 262144 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_xml" -trust_remote_code = true -extra_vllm_args = ["--reasoning-parser", "nemotron_v3"] - -# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100. -[vllm.presets.gpt-oss-20b] -model = "openai/gpt-oss-20b" -container_name = "vllm_gpt_oss_20b" -max_model_len = 65536 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" - -# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100. -# Hard architecture limit: max_position_embeddings=131072 in model config.json. 
-[vllm.presets.gpt-oss-120b] -model = "openai/gpt-oss-120b" -container_name = "vllm_gpt_oss_120b" -max_model_len = 131072 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" - -# Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100. -[vllm.presets.qwen25-coder-32b] -model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ" -container_name = "vllm_qwen25_coder32b" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "hermes" - -# Qwen3-Coder-30B-A3B AWQ — Qwen3 generation coding MoE (3B active / 30B total), ~18 GB. -[vllm.presets.qwen3-coder-30b] -model = "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ" -container_name = "vllm_qwen3_coder30b" -max_model_len = 65536 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_coder" - -# DeepSeek-R1-Distill-Qwen-32B AWQ — R1 reasoning distillation of Qwen 32B, ~18 GB on A100. -[vllm.presets.deepseek-r1-32b] -model = "casperhansen/deepseek-r1-distill-qwen-32b-awq" -container_name = "vllm_deepseek_r1_32b" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" -extra_vllm_args = ["--reasoning-parser", "deepseek_r1"] - -# Qwen3-32B AWQ — dense 32B reasoning model with extended context, ~18 GB on A100. -[vllm.presets.qwen3-32b] -model = "Qwen/Qwen3-32B-AWQ" -container_name = "vllm_qwen3_32b" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" -extra_vllm_args = ["--reasoning-parser", "deepseek_r1"] - -# Devstral-Small-2507 AWQ — Mistral's coding agent model (~15 GB on A100). 
-[vllm.presets.devstral] -model = "cyankiwi/Devstral-Small-2507-AWQ-4bit" -container_name = "vllm_devstral" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "mistral" -extra_vllm_args = ["--tokenizer_mode", "mistral", "--config_format", "mistral"] - -[wireguard] -auto_setup = true -setup_script = "./wg1-setup.sh" - -[local_client] -check_wg1_service = true -interface_name = "wg1" -config_path = "/etc/wireguard/wg1.conf" diff --git a/snippets/hyperstack/hyperstack.rb b/snippets/hyperstack/hyperstack.rb deleted file mode 100755 index 7cd817d..0000000 --- a/snippets/hyperstack/hyperstack.rb +++ /dev/null @@ -1,2731 +0,0 @@ -#!/usr/bin/env ruby -# frozen_string_literal: true - -begin - require 'bundler/setup' -rescue LoadError, Gem::GemNotFoundException, Gem::LoadError, Errno::ENOENT - nil -end - -require 'json' -require 'fileutils' -require 'net/http' -require 'open3' -require 'optparse' -require 'ipaddr' -require 'shellwords' -require 'socket' -require 'time' -require 'timeout' - -begin - require 'toml-rb' -rescue LoadError - warn "Missing dependency: toml-rb. Run `bundle install` in #{__dir__} first." - exit 2 -end - -module HyperstackVM - class Error < StandardError; end - - class ConfigLoader - attr_reader :path - - def self.load(path) - expanded = File.expand_path(path) - raise Error, "Config file not found: #{expanded}" unless File.exist?(expanded) - - raw = TomlRB.load_file(expanded) - new(raw, expanded) - rescue TomlRB::ParseError => e - raise Error, "Failed to parse TOML config #{expanded}: #{e.message}" - end - - def initialize(raw, path) - @path = path - @data = deep_merge(DEFAULTS, raw || {}) - validate! 
- end - - def config - Config.new(@data, @path) - end - - private - - DEFAULTS = { - 'auth' => { - 'api_key_file' => '~/.hyperstack' - }, - 'hyperstack' => { - 'base_url' => 'https://infrahub-api.nexgencloud.com/v1' - }, - 'state' => { - 'file' => '.hyperstack-vm-state.json' - }, - 'vm' => { - 'name_prefix' => 'hyperstack', - 'hostname' => 'hyperstack', - 'flavor_name' => 'n3-A100x1', - 'image_name' => 'Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker', - 'assign_floating_ip' => true, - 'create_bootable_volume' => false, - 'enable_port_randomization' => false, - 'labels' => %w[gpt-oss-120b wireguard] - }, - 'ssh' => { - 'username' => 'ubuntu', - 'private_key_path' => '~/.ssh/id_rsa', - 'hyperstack_key_name' => 'earth', - 'port' => 22, - 'connect_timeout_sec' => 10 - }, - 'network' => { - 'wireguard_udp_port' => 56_710, - 'wireguard_subnet' => '192.168.3.0/24', - # Optional: explicit server-side WireGuard IP. When nil, derived as subnet + 1 (i.e. .1). - # Set to a different address (e.g. 192.168.3.3) for a second VM sharing the same wg1 tunnel. 
- 'wireguard_server_ip' => nil, - 'ollama_port' => 11_434, - 'litellm_port' => 4_000, - 'allowed_ssh_cidrs' => ['auto'], - 'allowed_wireguard_cidrs' => ['auto'] - }, - 'bootstrap' => { - 'enable_guest_bootstrap' => true, - 'install_wireguard' => true, - 'configure_ufw' => true, - 'configure_ollama_host' => false - }, - 'ollama' => { - 'install' => false, - 'models_dir' => '/ephemeral/ollama/models', - 'listen_host' => '0.0.0.0:11434', - 'gpu_overhead_mb' => 2000, - 'num_parallel' => 1, - 'context_length' => 32_768, - 'pull_models' => ['qwen3-coder:30b', 'gpt-oss:20b', 'gpt-oss:120b', 'nemotron-3-super'] - }, - 'vllm' => { - 'install' => true, - 'model' => 'bullpoint/Qwen3-Coder-Next-AWQ-4bit', - 'hug_cache_dir' => '/ephemeral/hug', - 'container_name' => 'vllm_qwen3', - 'max_model_len' => 262_144, - 'gpu_memory_utilization' => 0.92, - 'tensor_parallel_size' => 1, - 'tool_call_parser' => 'qwen3_coder', - 'litellm_claude_model_names' => %w[ - claude-sonnet-4-20250514 - claude-opus-4-20250514 - claude-opus-4-6-20260604 - claude-haiku-3-5-20241022 - ], - 'litellm_master_key' => 'sk-litellm-master' - }, - 'wireguard' => { - 'auto_setup' => true, - 'setup_script' => './wg1-setup.sh' - }, - 'local_client' => { - 'check_wg1_service' => true, - 'interface_name' => 'wg1', - 'config_path' => '/etc/wireguard/wg1.conf' - } - }.freeze - - def validate! - %w[auth hyperstack state vm ssh network bootstrap ollama vllm wireguard local_client].each do |section| - raise Error, "Missing config section [#{section}]" unless @data.key?(section) - end - - %w[environment_name flavor_name image_name].each do |key| - raise Error, "Missing [vm].#{key} in config #{path}" if blank?(dig('vm', key)) - end - - if fetch('vm', 'hostname') && fetch('vm', 'hostname') !~ /\A[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\z/ - raise Error, - "Invalid [vm].hostname #{fetch('vm', - 'hostname').inspect}; use lowercase letters, digits, and hyphens only." 
- end - - %w[username hyperstack_key_name].each do |key| - raise Error, "Missing [ssh].#{key} in config #{path}" if blank?(dig('ssh', key)) - end - - ssh_cidrs = normalized_cidrs(fetch('network', 'allowed_ssh_cidrs')) - wireguard_cidrs = normalized_cidrs(fetch('network', 'allowed_wireguard_cidrs')) - - raise Error, missing_cidr_message('allowed_ssh_cidrs') if ssh_cidrs.empty? - raise Error, missing_cidr_message('allowed_wireguard_cidrs') if wireguard_cidrs.empty? - - [fetch('network', 'wireguard_subnet'), *ssh_cidrs, *wireguard_cidrs].each do |cidr| - next if cidr == 'auto' - - IPAddr.new(cidr) - rescue IPAddr::InvalidAddressError => e - raise Error, "Invalid CIDR #{cidr.inspect}: #{e.message}" - end - - server_ip = fetch('network', 'wireguard_server_ip') - if server_ip - # Validate that the explicit server WireGuard IP is within the configured subnet. - begin - subnet = IPAddr.new(fetch('network', 'wireguard_subnet')) - unless subnet.include?(IPAddr.new(server_ip)) - raise Error, - "wireguard_server_ip #{server_ip.inspect} is not in wireguard_subnet #{fetch('network', 'wireguard_subnet')}" - end - rescue IPAddr::InvalidAddressError => e - raise Error, "Invalid wireguard_server_ip #{server_ip.inspect}: #{e.message}" - end - end - end - - def fetch(section, key) - dig(section, key) - end - - def dig(*keys) - keys.reduce(@data) do |memo, key| - memo.is_a?(Hash) ? memo[key] : nil - end - end - - def blank?(value) - value.nil? || value.to_s.strip.empty? - end - - def truthy?(value) - value == true - end - - def normalized_cidrs(values) - Array(values).map { |value| value.to_s.strip }.reject(&:empty?) - end - - def missing_cidr_message(key) - "Missing [network].#{key} in config #{path}; set it to one or more CIDRs, or ['auto'] to restrict access to the current public operator IP." 
- end - - def deep_merge(left, right) - left.merge(right) do |_key, old_value, new_value| - if old_value.is_a?(Hash) && new_value.is_a?(Hash) - deep_merge(old_value, new_value) - else - new_value - end - end - end - end - - class Config - attr_reader :path - - def initialize(data, path = nil) - @data = data - @path = path - end - - def api_key - key_path = expand_path(fetch('auth', 'api_key_file')) - raise Error, "API key file not found: #{key_path}" unless File.exist?(key_path) - - token = File.readlines(key_path, chomp: true).find { |line| !line.strip.empty? }&.strip - raise Error, "API key file is empty: #{key_path}" if token.nil? || token.empty? - - token - rescue Errno::EACCES => e - raise Error, "Cannot read API key file #{key_path}: #{e.message}" - end - - def api_base_url - fetch('hyperstack', 'base_url') - end - - def state_file - expand_path(fetch('state', 'file')) - end - - def environment_name - fetch('vm', 'environment_name') - end - - def flavor_name - fetch('vm', 'flavor_name') - end - - def image_name - fetch('vm', 'image_name') - end - - def vm_name_prefix - fetch('vm', 'name_prefix') - end - - def generated_vm_name - "#{vm_name_prefix}-#{Time.now.utc.strftime('%Y%m%d%H%M%S')}" - end - - def vm_hostname - value = fetch('vm', 'hostname') - return nil if blank?(value) - - value.to_s.downcase - end - - def assign_floating_ip? - truthy?(fetch('vm', 'assign_floating_ip')) - end - - def create_bootable_volume? - truthy?(fetch('vm', 'create_bootable_volume')) - end - - def enable_port_randomization? - truthy?(fetch('vm', 'enable_port_randomization')) - end - - def labels - Array(fetch('vm', 'labels')).map(&:to_s) - end - - def user_data - custom = custom_user_data - return custom unless custom.nil? || custom.empty? - return nil if vm_hostname.nil? 
- - default_hostname_cloud_init - rescue Errno::ENOENT => e - raise Error, "User data file not found: #{e.message}" - rescue Errno::EACCES => e - raise Error, "Cannot read user data file: #{e.message}" - end - - def ssh_username - fetch('ssh', 'username') - end - - def ssh_private_key_path - expand_path(fetch('ssh', 'private_key_path')) - end - - def ssh_known_hosts_path - "#{state_file}.known_hosts" - end - - def ssh_key_name - fetch('ssh', 'hyperstack_key_name') - end - - def ssh_port - Integer(fetch('ssh', 'port')) - end - - def ssh_connect_timeout - Integer(fetch('ssh', 'connect_timeout_sec')) - end - - def wireguard_udp_port - Integer(fetch('network', 'wireguard_udp_port')) - end - - def wireguard_subnet - fetch('network', 'wireguard_subnet') - end - - def ollama_port - Integer(fetch('network', 'ollama_port')) - end - - def litellm_port - Integer(fetch('network', 'litellm_port')) - end - - # Returns the server-side WireGuard IP for this VM. - # Uses the explicitly configured address when set; otherwise derives it as subnet_base + 1. - # Example: 192.168.3.0/24 → 192.168.3.1 (default VM1); VM2 sets wireguard_server_ip=192.168.3.3. - def wireguard_gateway_ip - configured = fetch('network', 'wireguard_server_ip') - return configured.to_s if configured && !configured.to_s.strip.empty? - - # Fall back to first usable address in the subnet. - base = IPAddr.new(wireguard_subnet).to_s - parts = base.split('.').map(&:to_i) - parts[-1] += 1 - parts.join('.') - end - - # Returns the WireGuard hostname for this VM: e.g. hyperstack1.wg1 or hyperstack2.wg1. - # Used as the DNS name to reach the VM over the tunnel (must be in /etc/hosts on the client). - def wireguard_gateway_hostname - host = vm_hostname || 'hyperstack' - "#{host}.#{local_interface_name}" - end - - def allowed_ssh_cidrs - resolved_allowed_cidrs('allowed_ssh_cidrs') - end - - def allowed_wireguard_cidrs - resolved_allowed_cidrs('allowed_wireguard_cidrs') - end - - def guest_bootstrap_enabled? 
- truthy?(fetch('bootstrap', 'enable_guest_bootstrap')) - end - - def install_wireguard? - truthy?(fetch('bootstrap', 'install_wireguard')) - end - - def configure_ufw? - truthy?(fetch('bootstrap', 'configure_ufw')) - end - - def configure_ollama_host? - truthy?(fetch('bootstrap', 'configure_ollama_host')) - end - - def ollama_install_enabled? - truthy?(fetch('ollama', 'install')) - end - - def ollama_models_dir - fetch('ollama', 'models_dir') - end - - def ollama_listen_host - fetch('ollama', 'listen_host') - end - - def ollama_gpu_overhead_mb - Integer(fetch('ollama', 'gpu_overhead_mb')) - end - - def ollama_num_parallel - Integer(fetch('ollama', 'num_parallel')) - end - - def ollama_context_length - Integer(fetch('ollama', 'context_length')) - end - - def ollama_pull_models - Array(fetch('ollama', 'pull_models')).map(&:to_s) - end - - def vllm_install_enabled? - truthy?(fetch('vllm', 'install')) - end - - def vllm_model - fetch('vllm', 'model') - end - - def vllm_hug_cache_dir - fetch('vllm', 'hug_cache_dir') - end - - # Derived from hug_cache_dir: sibling directory for torch.compile artifacts. - # Persisted across container restarts so recompilation is skipped on warm switches. - def vllm_compile_cache_dir - File.join(File.dirname(fetch('vllm', 'hug_cache_dir')), 'vllm_cache') - end - - def vllm_container_name - fetch('vllm', 'container_name') - end - - def vllm_max_model_len - Integer(fetch('vllm', 'max_model_len')) - end - - def vllm_gpu_memory_utilization - Float(fetch('vllm', 'gpu_memory_utilization')) - end - - def vllm_tensor_parallel_size - Integer(fetch('vllm', 'tensor_parallel_size')) - end - - def vllm_tool_call_parser - fetch('vllm', 'tool_call_parser') - end - - def litellm_claude_model_names - Array(fetch('vllm', 'litellm_claude_model_names')).map(&:to_s) - end - - def litellm_master_key - fetch('vllm', 'litellm_master_key') - end - - # Whether to pass --trust-remote-code to vLLM for the default model. 
- # Required for architectures not yet in the vLLM upstream registry (e.g. nemotron_h). - def vllm_trust_remote_code - truthy?(fetch('vllm', 'trust_remote_code')) - end - - # Extra vLLM CLI flags for the default model (e.g. reasoning-parser args). - def vllm_extra_args - Array(fetch('vllm', 'extra_vllm_args')).map(&:to_s) - end - - def vllm_presets - Hash(dig('vllm', 'presets')).transform_keys(&:to_s) - end - - def vllm_preset_names - vllm_presets.keys - end - - def vllm_preset(name) - raw = vllm_presets[name.to_s] - unless raw - available = vllm_preset_names.empty? ? 'none configured' : vllm_preset_names.join(', ') - raise Error, "Unknown vLLM preset #{name.inspect}. Available: #{available}" - end - { - 'model' => raw['model'] || vllm_model, - 'container_name' => raw['container_name'] || vllm_container_name, - 'max_model_len' => Integer(raw['max_model_len'] || vllm_max_model_len), - 'gpu_memory_utilization' => Float(raw['gpu_memory_utilization'] || vllm_gpu_memory_utilization), - 'tensor_parallel_size' => Integer(raw['tensor_parallel_size'] || vllm_tensor_parallel_size), - 'tool_call_parser' => raw.key?('tool_call_parser') ? raw['tool_call_parser'] : vllm_tool_call_parser, - 'trust_remote_code' => raw.key?('trust_remote_code') ? raw['trust_remote_code'] : false, - 'extra_vllm_args' => raw.key?('extra_vllm_args') ? Array(raw['extra_vllm_args']) : [] - } - end - - def local_client_checks_enabled? - truthy?(fetch('local_client', 'check_wg1_service')) - end - - def local_interface_name - fetch('local_client', 'interface_name') - end - - def local_wg_config_path - fetch('local_client', 'config_path') - end - - def wireguard_auto_setup? - truthy?(fetch('wireguard', 'auto_setup')) - end - - def wireguard_setup_script - expand_path(fetch('wireguard', 'setup_script')) - end - - def desired_security_rules(include_ollama: ollama_install_enabled?, include_vllm: vllm_install_enabled?) 
- rules = [] - - allowed_ssh_cidrs.each do |cidr| - rules << firewall_rule('tcp', ssh_port, cidr) - end - - allowed_wireguard_cidrs.each do |cidr| - rules << firewall_rule('udp', wireguard_udp_port, cidr) - end - - rules << firewall_rule('tcp', ollama_port, wireguard_subnet) if include_ollama || include_vllm - rules << firewall_rule('tcp', litellm_port, wireguard_subnet) if include_vllm - rules.uniq - end - - private - - def fetch(section, key) - dig(section, key) - end - - def dig(*keys) - keys.reduce(@data) do |memo, key| - memo.is_a?(Hash) ? memo[key] : nil - end - end - - def blank?(value) - value.nil? || value.to_s.strip.empty? - end - - def truthy?(value) - value == true - end - - def resolved_allowed_cidrs(key) - values = Array(fetch('network', key)).map { |value| value.to_s.strip }.reject(&:empty?) - values.flat_map { |value| value == 'auto' ? [detected_operator_cidr] : [value] }.uniq - end - - def detected_operator_cidr - return @detected_operator_cidr if defined?(@detected_operator_cidr) - - configured = ENV['HYPERSTACK_OPERATOR_CIDR'].to_s.strip - @detected_operator_cidr = normalize_operator_cidr(configured) unless configured.empty? - return @detected_operator_cidr if defined?(@detected_operator_cidr) - - @detected_operator_cidr = detect_public_operator_cidr - end - - def normalize_operator_cidr(value) - ip = IPAddr.new(value) - suffix = ip.ipv4? ? 32 : 128 - value.include?('/') ? value : "#{ip}/#{suffix}" - rescue IPAddr::InvalidAddressError => e - raise Error, "Invalid HYPERSTACK_OPERATOR_CIDR #{value.inspect}: #{e.message}" - end - - def detect_public_operator_cidr - [ - 'https://api.ipify.org', - 'https://ifconfig.me/ip', - 'https://ipv4.icanhazip.com' - ].each do |url| - cidr = fetch_public_cidr(url) - return cidr if cidr - end - - source = path || 'the active config' - raise Error, - "Unable to detect the current public operator IP for [network].allowed_*_cidrs = ['auto']. 
Set HYPERSTACK_OPERATOR_CIDR or replace 'auto' with explicit CIDRs in #{source}." - end - - def fetch_public_cidr(url) - uri = URI(url) - response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https', open_timeout: 5, read_timeout: 5) do |http| - http.request(Net::HTTP::Get.new(uri)) - end - return nil unless response.is_a?(Net::HTTPSuccess) - - body = response.body.to_s.strip - return nil if body.empty? - - ip = IPAddr.new(body) - suffix = ip.ipv4? ? 32 : 128 - "#{ip}/#{suffix}" - rescue IPAddr::InvalidAddressError, SocketError, SystemCallError, Timeout::Error, Net::OpenTimeout, - Net::ReadTimeout, OpenSSL::SSL::SSLError - nil - end - - def custom_user_data - inline = dig('vm', 'user_data') - return inline unless inline.nil? || inline.empty? - - file = dig('vm', 'user_data_file') - return nil if file.nil? || file.empty? - - File.read(expand_path(file)) - end - - def default_hostname_cloud_init - <<~CLOUD_INIT - #cloud-config - preserve_hostname: false - hostname: #{vm_hostname} - CLOUD_INIT - end - - def expand_path(value) - return nil if value.nil? - - string = value.to_s - return File.expand_path(string) if string.start_with?('~') - return string if string.start_with?('/') - - File.expand_path(string, File.dirname(@path)) if @path - end - - def firewall_rule(protocol, port, cidr) - ip = IPAddr.new(cidr) - { - 'direction' => 'ingress', - 'ethertype' => ip.ipv4? ? 
'IPv4' : 'IPv6', - 'protocol' => protocol, - 'port_range_min' => port, - 'port_range_max' => port, - 'remote_ip_prefix' => cidr - } - end - end - - class StateStore - def initialize(path) - @path = path - end - - attr_reader :path - - def load - return nil unless File.exist?(@path) - - JSON.parse(File.read(@path)) - rescue JSON::ParserError => e - raise Error, "Failed to parse state file #{@path}: #{e.message}" - end - - def save(payload) - temp_path = "#{@path}.tmp" - File.write(temp_path, JSON.pretty_generate(payload)) - File.rename(temp_path, @path) - end - - def delete - File.delete(@path) if File.exist?(@path) - end - end - - class HyperstackClient - def initialize(base_url:, api_key:) - @base_uri = URI(base_url) - @api_key = api_key - end - - def list_environments - response = request(:get, '/core/environments') - response.fetch('environments', []) - end - - def list_keypairs - response = request(:get, '/core/keypairs') - response.fetch('keypairs', []) - end - - def list_flavors - response = request(:get, '/core/flavors') - Array(response['data']).flat_map do |entry| - Array(entry['flavors']).map do |flavor| - flavor.merge( - 'region_name' => flavor['region_name'] || entry['region_name'], - 'gpu' => flavor['gpu'] || entry['gpu'] - ) - end - end - end - - def list_images - response = request(:get, '/core/images') - Array(response['images']).flat_map do |entry| - Array(entry['images']).map do |image| - image.merge( - 'region_name' => image['region_name'] || entry['region_name'], - 'type' => image['type'] || entry['type'] - ) - end - end - end - - def list_vms - response = request(:get, '/core/virtual-machines') - response.fetch('instances', []) - end - - def get_vm(vm_id) - response = request(:get, "/core/virtual-machines/#{vm_id}") - response.fetch('instance', nil) - end - - def create_vm(payload) - request(:post, '/core/virtual-machines', payload) - end - - def delete_vm(vm_id) - request(:delete, "/core/virtual-machines/#{vm_id}") - end - - def 
create_vm_rule(vm_id, payload) - request(:post, "/core/virtual-machines/#{vm_id}/sg-rules", payload) - end - - private - - def request(method, path, payload = nil) - uri = @base_uri.dup - uri.path = "#{@base_uri.path}#{path}" - - request = case method - when :get - Net::HTTP::Get.new(uri) - when :post - Net::HTTP::Post.new(uri) - when :delete - Net::HTTP::Delete.new(uri) - else - raise Error, "Unsupported HTTP method: #{method}" - end - - request['accept'] = 'application/json' - request['api_key'] = @api_key - if payload - request['content-type'] = 'application/json' - request.body = JSON.generate(payload) - end - - retries_left = 4 - begin - response = Net::HTTP.start( - uri.host, - uri.port, - use_ssl: uri.scheme == 'https', - open_timeout: 30, - read_timeout: 120 - ) { |http| http.request(request) } - - parse_response(response) - rescue Timeout::Error, Errno::ECONNREFUSED, Errno::ECONNRESET, - Errno::EHOSTUNREACH, Errno::ENETUNREACH, - SocketError, OpenSSL::SSL::SSLError, Net::OpenTimeout => e - raise Error, "Hyperstack API request failed for #{path}: #{e.message}" if retries_left <= 0 - - retries_left -= 1 - delay = (4 - retries_left) * 5 - warn "API request to #{path} failed (#{e.class}: #{e.message}), retrying in #{delay}s (#{retries_left} left)..." - sleep delay - retry - end - end - - def parse_response(response) - body = response.body.to_s - payload = body.empty? ? 
{} : JSON.parse(body) - - if response.code.to_i >= 400 || payload['status'] == false - message = payload['message'] || payload['error_reason'] || response.message - raise Error, "Hyperstack API error (HTTP #{response.code}): #{message}" - end - - payload - rescue JSON::ParserError => e - raise Error, "Failed to parse Hyperstack API response: #{e.message}" - end - end - - class LocalWireGuard - def initialize(interface_name:, config_path:) - @interface_name = interface_name - @config_path = config_path - end - - def status - endpoints = configured_endpoints - { - 'service_state' => service_state, - 'config_path' => @config_path, - 'endpoint' => endpoints.last, - 'endpoints' => endpoints, - 'config_readable' => !config_contents.nil? - } - end - - def remove_peers_by_allowed_ips(allowed_ips, dry_run: false) - targets = Array(allowed_ips).map(&:to_s).map(&:strip).reject(&:empty?).uniq - return [] if targets.empty? - - content = config_contents - raise Error, "Unable to read #{@config_path} for peer cleanup." if content.nil? - - updated, removed = prune_peer_blocks(content, targets) - return [] if removed.empty? - return removed if dry_run - - write_config(updated) - restart_service_if_active - @config_contents = updated - removed - end - - def remove_hostnames(hostnames, dry_run: false) - targets = Array(hostnames).map(&:to_s).map(&:strip).reject(&:empty?).uniq - return [] if targets.empty? - - content = hosts_contents - raise Error, 'Unable to read /etc/hosts for hostname cleanup.' if content.nil? - - updated, removed = prune_hosts_entries(content, targets) - return [] if removed.empty? - return removed if dry_run - - write_hosts(updated) - @hosts_contents = updated - removed - end - - private - - def service_state - stdout, _stderr, status = Open3.capture3('systemctl', 'is-active', "wg-quick@#{@interface_name}") - value = stdout.to_s.strip - return value unless value.empty? - return 'active' if status.success? 
- - 'unknown' - end - - def configured_endpoint - configured_endpoints.last - end - - def configured_endpoints - content = config_contents - return [] if content.nil? - - parse_wireguard_peers(content).filter_map { |peer| peer['Endpoint'] }.uniq - end - - def parse_wireguard_peers(content) - current_section = nil - current_peer = nil - peers = [] - - content.each_line do |line| - stripped = line.strip - next if stripped.empty? || stripped.start_with?('#') - - if stripped.start_with?('[') && stripped.end_with?(']') - peers << current_peer if current_section == 'Peer' && current_peer && !current_peer.empty? - current_section = stripped[1..-2] - current_peer = current_section == 'Peer' ? {} : nil - next - end - - key, value = stripped.split('=', 2).map { |part| part&.strip } - next unless current_section == 'Peer' && key && value - - current_peer[key] = value - end - - peers << current_peer if current_section == 'Peer' && current_peer && !current_peer.empty? - peers - end - - def prune_peer_blocks(content, allowed_ips) - kept = [] - removed = [] - - parse_wireguard_blocks(content).each do |block| - if block[:section] == 'Peer' && allowed_ips.include?(block[:values]['AllowedIPs'].to_s.strip) - removed << block[:values] - else - kept << block[:lines].join - end - end - - [kept.join, removed] - end - - def parse_wireguard_blocks(content) - blocks = [] - current_section = nil - current_lines = [] - - content.each_line do |line| - stripped = line.strip - if stripped.start_with?('[') && stripped.end_with?(']') - blocks << wireguard_block(current_section, current_lines) unless current_lines.empty? - current_section = stripped[1..-2] - current_lines = [line] - else - current_lines << line - end - end - - blocks << wireguard_block(current_section, current_lines) unless current_lines.empty? 
- blocks - end - - def wireguard_block(section, lines) - { - section: section, - lines: lines.dup, - values: parse_wireguard_section_values(section, lines) - } - end - - def parse_wireguard_section_values(section, lines) - return {} unless section == 'Peer' - - lines.each_with_object({}) do |line, values| - stripped = line.strip - next if stripped.empty? || stripped.start_with?('#') || stripped.start_with?('[') - - key, value = stripped.split('=', 2).map { |part| part&.strip } - values[key] = value if key && value - end - end - - def write_config(content) - File.write(@config_path, content) - rescue Errno::EACCES - _stdout, stderr, status = Open3.capture3('sudo', '-n', 'tee', @config_path, stdin_data: content) - raise Error, "Failed to update #{@config_path}: #{stderr.to_s.strip}" unless status.success? - - _stdout, stderr, status = Open3.capture3('sudo', '-n', 'chmod', '600', @config_path) - raise Error, "Failed to chmod #{@config_path}: #{stderr.to_s.strip}" unless status.success? - end - - def restart_service_if_active - return unless service_state == 'active' - - _stdout, stderr, status = Open3.capture3('sudo', '-n', 'systemctl', 'restart', "wg-quick@#{@interface_name}") - raise Error, "Failed to restart wg-quick@#{@interface_name}: #{stderr.to_s.strip}" unless status.success? - end - - def config_contents - return @config_contents if defined?(@config_contents) - - @config_contents = File.read(@config_path) - rescue Errno::EACCES, Errno::ENOENT - stdout, _stderr, status = Open3.capture3('sudo', '-n', 'cat', @config_path) - @config_contents = status.success? ? stdout : nil - end - - def hosts_contents - return @hosts_contents if defined?(@hosts_contents) - - @hosts_contents = File.read('/etc/hosts') - rescue Errno::EACCES, Errno::ENOENT - stdout, _stderr, status = Open3.capture3('sudo', '-n', 'cat', '/etc/hosts') - @hosts_contents = status.success? ? 
stdout : nil - end - - def prune_hosts_entries(content, hostnames) - removed = [] - updated = content.each_line.filter_map do |line| - rewritten, line_removed = prune_host_line(line, hostnames) - removed.concat(line_removed) - rewritten - end - [updated.join, removed.uniq] - end - - def prune_host_line(line, hostnames) - stripped = line.strip - return [line, []] if stripped.empty? || stripped.start_with?('#') - - body, comment = line.split('#', 2) - tokens = body.split(/\s+/) - return [line, []] if tokens.empty? - - ip = tokens.shift - removed = tokens & hostnames - return [line, []] if removed.empty? - - remaining = tokens - hostnames - return [nil, removed] if remaining.empty? - - rewritten = ([ip] + remaining).join("\t") - rewritten = "#{rewritten} # #{comment.strip}" if comment && !comment.strip.empty? - ["#{rewritten}\n", removed] - end - - def write_hosts(content) - File.write('/etc/hosts', content) - rescue Errno::EACCES - _stdout, stderr, status = Open3.capture3('sudo', '-n', 'tee', '/etc/hosts', stdin_data: content) - raise Error, "Failed to update /etc/hosts: #{stderr.to_s.strip}" unless status.success? - end - end - - # Thread-safe output wrapper that prepends a fixed prefix to each line. - # Used by create-both so interleaved output from VM1 and VM2 threads is distinguishable. - # #print buffers partial lines until a newline is received, then flushes with the prefix. 
- class PrefixedOutput - def initialize(prefix, delegate, mutex) - @prefix = prefix - @delegate = delegate - @mutex = mutex - @buffer = +'' - end - - def puts(msg = '') - @mutex.synchronize { @delegate.puts("#{@prefix}#{msg}") } - end - - def print(msg) - @buffer << msg.to_s - while (idx = @buffer.index("\n")) - line = @buffer.slice!(0, idx + 1) - @mutex.synchronize { @delegate.print("#{@prefix}#{line}") } - end - end - end - - class ProvisioningScripts - def initialize(config:) - @config = config - end - - def guest_bootstrap_script - script = [] - script << 'set -euo pipefail' - - # Wait for any running unattended-upgrades or apt locks to release - # before attempting package operations (transient lock on fresh VMs) - script << 'echo "Waiting for apt locks to clear..."' - script << 'for i in $(seq 1 30); do' - script << ' if ! fuser /var/lib/dpkg/lock-frontend /var/lib/apt/lists/lock /var/cache/apt/archives/lock >/dev/null 2>&1; then break; fi' - script << ' echo " apt lock held, waiting ($i/30)..."; sleep 10' - script << 'done' - script << 'sudo systemctl stop unattended-upgrades.service 2>/dev/null || true' - script << 'sudo systemctl disable unattended-upgrades.service 2>/dev/null || true' - - if @config.install_wireguard? - script << 'which wg >/dev/null 2>&1 || (sudo apt-get update && sudo apt-get install -y wireguard)' - end - - if @config.configure_ufw? - script << "sudo ufw allow #{@config.ssh_port}/tcp comment 'Allow SSH' >/dev/null 2>&1 || true" - script << 'sudo ufw --force enable >/dev/null 2>&1 || true' - script << "sudo ufw allow #{@config.wireguard_udp_port}/udp comment 'WireGuard #{@config.local_interface_name}' >/dev/null 2>&1 || true" - # Port 11434 is shared by Ollama and vLLM; open for both regardless of which is installed. 
- script << "sudo ufw allow from #{Shellwords.escape(@config.wireguard_subnet)} to any port #{@config.ollama_port} proto tcp comment 'Inference API (Ollama/vLLM) via #{@config.local_interface_name}' >/dev/null 2>&1 || true" - # Port 4000: LiteLLM proxy (Anthropic API -> vLLM); open alongside the inference port. - script << "sudo ufw allow from #{Shellwords.escape(@config.wireguard_subnet)} to any port #{@config.litellm_port} proto tcp comment 'LiteLLM proxy via #{@config.local_interface_name}' >/dev/null 2>&1 || true" - end - - if @config.configure_ollama_host? - # Only write a minimal OLLAMA_HOST override if no override exists yet; - # ollama_setup_script writes the full override (OLLAMA_MODELS, GPU_OVERHEAD, etc.) - script << "if systemctl list-unit-files | grep -q '^ollama.service'; then" - script << ' if [ ! -f /etc/systemd/system/ollama.service.d/override.conf ]; then' - script << ' sudo mkdir -p /etc/systemd/system/ollama.service.d' - script << " cat <<'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf >/dev/null" - script << '[Service]' - script << "Environment=\"OLLAMA_HOST=0.0.0.0:#{@config.ollama_port}\"" - script << 'OVERRIDE' - script << ' sudo systemctl daemon-reload' - script << ' sudo systemctl restart ollama || true' - script << ' fi' - script << 'fi' - end - - script << 'echo bootstrap-ok' - script.join("\n") - end - - def desired_ollama_models - normalized_model_list(@config.ollama_pull_models) - end - - def model_list_signature(models) - normalized_model_list(models).sort - end - - def ollama_install_script - models_dir = @config.ollama_models_dir - listen_host = @config.ollama_listen_host - - script = [] - script << 'set -euo pipefail' - script << 'sudo pkill -f unattended-upgrade >/dev/null 2>&1 || true' - script << 'if ! 
command -v ollama >/dev/null 2>&1; then curl -fsSL https://ollama.ai/install.sh | sh; fi' - if models_dir.start_with?('/ephemeral') - script << "mountpoint -q /ephemeral || { echo 'Expected /ephemeral mount is missing'; exit 1; }" - end - script << "sudo mkdir -p #{Shellwords.escape(models_dir)}" - script << "sudo chown -R ollama:ollama #{Shellwords.escape(File.dirname(models_dir))}" - script << 'sudo mkdir -p /etc/systemd/system/ollama.service.d' - script << "cat <<'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf >/dev/null" - script << '[Service]' - script << "Environment=\"OLLAMA_MODELS=#{models_dir}\"" - script << "Environment=\"OLLAMA_GPU_OVERHEAD=#{@config.ollama_gpu_overhead_mb}\"" - script << "Environment=\"OLLAMA_NUM_PARALLEL=#{@config.ollama_num_parallel}\"" - script << "Environment=\"OLLAMA_CONTEXT_LENGTH=#{@config.ollama_context_length}\"" - script << "Environment=\"OLLAMA_HOST=#{listen_host}\"" - script << 'OVERRIDE' - script << 'sudo systemctl daemon-reload' - script << 'sudo systemctl enable --now ollama' - script << 'sudo systemctl restart ollama' - script << 'sleep 3' - script << 'systemctl is-active --quiet ollama' - script << 'echo ollama-install-ok' - script.join("\n") - end - - def ollama_pull_script(models: desired_ollama_models) - models_dir = @config.ollama_models_dir - - script = [] - script << 'set -euo pipefail' - # Pull each model with retry (transient network failures) and verify - # it is actually present afterwards - models.each do |model| - escaped = Shellwords.escape(model) - script << "echo \"Pulling model #{model}...\"" - script << 'for attempt in 1 2 3; do' - script << " if ollama pull #{escaped}; then break; fi" - script << " if [ \"$attempt\" -eq 3 ]; then echo \"FATAL: failed to pull #{model} after 3 attempts\"; exit 1; fi" - script << ' echo " pull attempt $attempt failed, retrying in 15s..."; sleep 15' - script << 'done' - script << "ollama show #{escaped} --modelfile >/dev/null 2>&1 || { echo 
\"FATAL: model #{model} not found after pull\"; exit 1; }" - end - # Final verification: ensure all expected models are listed - script << 'echo "Verifying all models are present..."' - models.each do |model| - escaped = Shellwords.escape(model) - script << "ollama show #{escaped} --modelfile >/dev/null 2>&1 || { echo \"FATAL: model #{model} missing in final check\"; exit 1; }" - end - script << "echo ollama-models-dir=#{models_dir}" - script << 'echo ollama-ok' - script.join("\n") - end - - def vllm_stop_script(container_name) - script = [] - script << 'set -euo pipefail' - script << "docker stop #{Shellwords.escape(container_name)} 2>/dev/null || true" - script << "docker rm #{Shellwords.escape(container_name)} 2>/dev/null || true" - script << 'echo vllm-stopped' - script.join("\n") - end - - def vllm_install_script(preset_config: nil, pull_image: true) - cfg = preset_config || {} - model = cfg['model'] || @config.vllm_model - cache_dir = @config.vllm_hug_cache_dir - compile_cache = @config.vllm_compile_cache_dir - container = cfg['container_name'] || @config.vllm_container_name - max_len = Integer(cfg['max_model_len'] || @config.vllm_max_model_len) - gpu_util = Float(cfg['gpu_memory_utilization'] || @config.vllm_gpu_memory_utilization) - tp_size = Integer(cfg['tensor_parallel_size'] || @config.vllm_tensor_parallel_size) - parser = cfg['tool_call_parser'] - # parser is nil only when preset explicitly omits the key and config has no default; - # empty string means "disable tool calling" (e.g. gpt-oss reasoning models). - parser = @config.vllm_tool_call_parser if parser.nil? - # Fall back to the top-level [vllm] config values when no preset is in use. - # This allows setting trust_remote_code / extra_vllm_args in the default [vllm] block - # without requiring a --model preset flag at create time. - trust_remote = cfg.key?('trust_remote_code') ? 
cfg['trust_remote_code'] : @config.vllm_trust_remote_code - port = @config.ollama_port - - docker_args = [ - 'docker run -d', - '--gpus all', '--ipc=host', '--network host', - "--name #{Shellwords.escape(container)}", - '--restart always', - "-v #{Shellwords.escape(cache_dir)}:/root/.cache/huggingface", - # Mount torch.compile cache so CUDA kernel compilation is skipped on warm restarts. - # Without this, every container restart recompiles (~30-60 s extra). - "-v #{Shellwords.escape(compile_cache)}:/root/.cache/vllm", - 'vllm/vllm-openai:latest', - "--model #{Shellwords.escape(model)}", - "--tensor-parallel-size #{tp_size}", - '--enable-prefix-caching', - "--gpu-memory-utilization #{gpu_util}", - "--max-model-len #{max_len}", - '--host 0.0.0.0', - "--port #{port}" - ] - # Tool calling is optional: empty/nil parser disables it. - unless parser.nil? || parser.empty? - docker_args << '--enable-auto-tool-choice' - docker_args << "--tool-call-parser #{Shellwords.escape(parser)}" - end - docker_args << '--trust-remote-code' if trust_remote - extra_args = cfg.key?('extra_vllm_args') ? 
Array(cfg['extra_vllm_args']) : @config.vllm_extra_args - extra_args.each { |arg| docker_args << arg } - docker_run = docker_args.join(' ') - - script = [] - script << 'set -euo pipefail' - script << "sudo mkdir -p #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}" - script << "sudo chmod -R 0777 #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}" - script << "docker stop #{Shellwords.escape(container)} 2>/dev/null || true" - script << "docker rm #{Shellwords.escape(container)} 2>/dev/null || true" - script << 'docker pull vllm/vllm-openai:latest' if pull_image - script << docker_run - script << 'echo "Waiting for vLLM to become ready (up to 10 min for first model download)..."' - script << 'for i in $(seq 1 120); do' - script << " if curl -sf http://localhost:#{port}/v1/models >/dev/null 2>&1; then echo vllm-ready; break; fi" - script << " state=$(docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null || echo unknown)" - script << ' echo " vLLM not ready yet ($i/120, container=$state)..."' - script << ' sleep 5' - script << 'done' - script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 10 minutes'; exit 1; }" - script << 'echo vllm-install-ok' - script.join("\n") - end - - def litellm_install_script(model_override: nil) - port = @config.litellm_port - model = model_override || @config.vllm_model - - script = [] - script << 'set -euo pipefail' - script << 'sudo apt-get install -y python3.12-venv' - script << 'sudo mkdir -p /ephemeral/litellm-env' - script << 'sudo chown ubuntu:ubuntu /ephemeral/litellm-env' - script << 'python3 -m venv /ephemeral/litellm-env' - script << '/ephemeral/litellm-env/bin/pip install --quiet "litellm[proxy]"' - script << "sudo tee /ephemeral/litellm-config.yaml > /dev/null << 'LITELLM_YAML'" - script << 'model_list:' - script.concat(litellm_model_entries(model)) - script << '' - script << 'litellm_settings:' - 
script << ' drop_params: true' - script << '' - script << 'general_settings:' - script << " master_key: \"#{@config.litellm_master_key}\"" - script << 'LITELLM_YAML' - script << "sudo tee /etc/systemd/system/litellm.service > /dev/null << 'LITELLM_UNIT'" - script << '[Unit]' - script << 'Description=LiteLLM Proxy' - script << 'After=network.target docker.service' - script << 'Requires=docker.service' - script << '' - script << '[Service]' - script << 'Type=simple' - script << 'User=ubuntu' - script << "ExecStart=/ephemeral/litellm-env/bin/litellm --config /ephemeral/litellm-config.yaml --host 0.0.0.0 --port #{port}" - script << 'Restart=always' - script << 'RestartSec=5' - script << '' - script << '[Install]' - script << 'WantedBy=multi-user.target' - script << 'LITELLM_UNIT' - script << 'sudo systemctl daemon-reload' - script << 'sudo systemctl enable --now litellm' - script << 'sleep 5' - script << 'systemctl is-active --quiet litellm' - script << 'echo litellm-install-ok' - script.join("\n") - end - - def litellm_reload_script(model) - script = [] - script << 'set -euo pipefail' - script << "sudo tee /ephemeral/litellm-config.yaml > /dev/null << 'LITELLM_YAML'" - script << 'model_list:' - script.concat(litellm_model_entries(model)) - script << '' - script << 'litellm_settings:' - script << ' drop_params: true' - script << '' - script << 'general_settings:' - script << " master_key: \"#{@config.litellm_master_key}\"" - script << 'LITELLM_YAML' - script << 'sudo systemctl restart litellm' - script << 'sleep 3' - script << 'systemctl is-active --quiet litellm' - script << 'echo litellm-reload-ok' - script.join("\n") - end - - private - - def normalized_model_list(models) - Array(models).each_with_object([]) do |model, ordered| - normalized = model.to_s.strip - next if normalized.empty? 
|| ordered.include?(normalized) - - ordered << normalized - end - end - - def litellm_model_entries(model) - vllm_port = @config.ollama_port - - @config.litellm_claude_model_names.flat_map do |name| - [ - " - model_name: \"#{name}\"", - ' litellm_params:', - " model: \"hosted_vllm/#{model}\"", - " api_base: \"http://localhost:#{vllm_port}/v1\"", - ' api_key: "EMPTY"' - ] - end - end - end - - class RemoteProvisioner - def initialize(config:, scripts:, out:, ssh_command_runner:, ssh_stream_runner:) - @config = config - @scripts = scripts - @out = out - @ssh_command_runner = ssh_command_runner - @ssh_stream_runner = ssh_stream_runner - end - - def bootstrap_guest(host) - info 'Bootstrapping Ubuntu guest over SSH...' - retries = 3 - retries.times do |attempt| - stdout, stderr, status = @ssh_command_runner.call(host, @scripts.guest_bootstrap_script) - return if status.success? - - msg = stderr.strip.empty? ? stdout : stderr - raise Error, "Guest bootstrap failed after #{retries} attempts: #{msg}" if attempt == retries - 1 - - warn "Bootstrap attempt #{attempt + 1}/#{retries} failed (#{msg.lines.last&.strip}), retrying in 15s..." - sleep 15 - end - end - - def install_ollama_service(host) - info "Installing and configuring Ollama on #{host}..." - output, status = @ssh_stream_runner.call(host, @scripts.ollama_install_script) - raise Error, "Ollama install failed: #{output.strip}" unless status.success? - end - - def pull_ollama_models(host) - info "Pulling Ollama models on #{host}..." - output, status = @ssh_stream_runner.call(host, @scripts.ollama_pull_script) - raise Error, "Ollama model pull failed: #{output.strip}" unless status.success? - - verify_remote_models(host) - end - - def stop_vllm_container(host, container_name) - info "Stopping old vLLM container #{container_name}..." 
- output, status = @ssh_stream_runner.call(host, @scripts.vllm_stop_script(container_name)) - raise Error, "Failed to stop container #{container_name}: #{output.strip}" unless status.success? - end - - def install_vllm(host, preset_config: nil, pull_image: true) - info "Setting up vLLM Docker container on #{host}..." - output, status = @ssh_stream_runner.call(host, @scripts.vllm_install_script(preset_config: preset_config, - pull_image: pull_image)) - raise Error, "vLLM install failed: #{output.strip}" unless status.success? - end - - def install_litellm(host, model:) - info "Setting up LiteLLM Anthropic-API proxy on #{host}..." - output, status = @ssh_stream_runner.call(host, @scripts.litellm_install_script(model_override: model)) - raise Error, "LiteLLM install failed: #{output.strip}" unless status.success? - end - - def reload_litellm(host, model) - info "Reloading LiteLLM proxy config for #{model}..." - output, status = @ssh_stream_runner.call(host, @scripts.litellm_reload_script(model)) - raise Error, "LiteLLM reload failed: #{output.strip}" unless status.success? - end - - def setup_vllm_stack(host, preset_config: nil) - install_vllm(host, preset_config: preset_config) - model = preset_config&.dig('model') || @config.vllm_model - install_litellm(host, model: model) - end - - private - - def verify_remote_models(host) - stdout, _stderr, status = @ssh_command_runner.call(host, 'ollama list') - return unless status.success? - - remote_models = stdout.lines.drop(1).map { |line| line.split.first }.compact - missing = @scripts.desired_ollama_models.reject { |model| remote_models.any? { |remote| remote.start_with?(model) } } - return if missing.empty? - - raise Error, "Models missing after setup: #{missing.join(', ')}. 
Remote has: #{remote_models.join(', ')}" - end - - def info(message) - @out.puts(message) - end - - def warn(message) - @out.puts("WARNING: #{message}") - end - end - - class Manager - # wg_setup_pre: optional Proc called just before this VM's WireGuard setup step runs. - # Used by create-both to block VM2 until VM1 has written the base wg1.conf. - # wg_setup_post: optional Proc called after the WireGuard step completes (or is skipped). - # Used by create-both to signal that VM1's base config is ready for VM2. - def initialize(config:, client:, state_store:, local_wireguard:, out: $stdout, - wg_setup_pre: nil, wg_setup_post: nil) - @config = config - @client = client - @state_store = state_store - @local_wireguard = local_wireguard - @out = out - @scripts = ProvisioningScripts.new(config: config) - @provisioner = RemoteProvisioner.new(config: config, scripts: @scripts, out: out, - ssh_command_runner: method(:run_ssh_command), - ssh_stream_runner: method(:run_ssh_command_streaming)) - @wg_setup_pre = wg_setup_pre - @wg_setup_post = wg_setup_post - end - - def create(replace: false, dry_run: false, install_vllm: nil, install_ollama: nil, vllm_preset: nil) - # CLI flags override config; nil means "use config default". - @effective_vllm = install_vllm.nil? ? @config.vllm_install_enabled? : install_vllm - @effective_ollama = install_ollama.nil? ? @config.ollama_install_enabled? : install_ollama - # Validate preset name early so we fail before touching any remote state. - @effective_vllm_preset = vllm_preset - @config.vllm_preset(vllm_preset) if vllm_preset - existing_state = @state_store.load - if existing_state && existing_state['vm_id'] - if replace - if dry_run - info "DRY RUN: would delete tracked VM #{existing_state['vm_id']} before creating a replacement." 
- else - delete(vm_id: existing_state['vm_id'], preserve_state_on_failure: true) - end - elsif resumable_state?(existing_state) - if dry_run - print_resume_dry_run(existing_state) - return - end - - info "Resuming tracked VM #{existing_state['vm_id']} provisioning..." - continue_create(existing_state) - return - else - raise Error, - "State file #{@state_store.path} already tracks VM #{existing_state['vm_id']}. Use --replace or delete first." - end - end - - resolved = resolve_dependencies - vm_name = @config.generated_vm_name - if dry_run - info "Planning VM #{vm_name} in #{resolved[:environment]['name']} using #{@config.flavor_name}..." - else - info "Creating VM #{vm_name} in #{resolved[:environment]['name']} using #{@config.flavor_name}..." - end - - payload = build_create_payload(vm_name, resolved) - if dry_run - print_create_dry_run(vm_name, resolved, payload) - return - end - - response = @client.create_vm(payload) - instance = Array(response['instances']).first - raise Error, 'Hyperstack create response did not include an instance ID.' unless instance && instance['id'] - - state = { - 'vm_id' => instance['id'], - 'vm_name' => vm_name, - 'environment_name' => resolved[:environment]['name'], - 'region' => resolved[:environment]['region'], - 'flavor_name' => resolved[:flavor]['name'], - 'image_name' => resolved[:image]['name'], - 'key_name' => resolved[:keypair]['name'], - 'public_ip' => instance['floating_ip'], - 'created_at' => Time.now.utc.iso8601 - } - sync_service_mode_state(state) - @state_store.save(state) - continue_create(state) - end - - def delete(vm_id: nil, preserve_state_on_failure: false, dry_run: false, skip_local_cleanup: false) - state = @state_store.load - target_vm_id = vm_id || state&.dig('vm_id') - raise Error, "No VM ID provided and no state file found at #{@state_store.path}." if target_vm_id.nil? 
- cleanup_local = !skip_local_cleanup && state && target_vm_id == state['vm_id'] - - if dry_run - print_delete_dry_run(target_vm_id, state, preserve_state_on_failure: preserve_state_on_failure) - return - end - - info "Deleting VM #{target_vm_id}..." - @client.delete_vm(target_vm_id) - wait_for_deletion(target_vm_id) - if cleanup_local - cleanup = cleanup_local_access(dry_run: false, hostnames: [@config.wireguard_gateway_hostname], - allowed_ips: ["#{@config.wireguard_gateway_ip}/32"]) - report_local_cleanup(@out, cleanup, dry_run: false) - end - delete_ssh_known_hosts_file - @state_store.delete unless preserve_state_on_failure - info "VM #{target_vm_id} deleted." - rescue Error - raise if preserve_state_on_failure - - @state_store.delete - raise - end - - def status(include_local_wireguard: true) - state = @state_store.load - if state.nil? - info "No tracked VM state file at #{@state_store.path}." - else - begin - vm = @client.get_vm(state['vm_id']) - desired = desired_security_rules_for_state(state).map { |rule| normalize_rule(rule) } - current = Array(vm['security_rules']).map { |rule| normalize_rule(rule) } - missing_rules = desired - current - vllm_enabled = state_vllm_enabled?(state) - ollama_enabled = state_ollama_enabled?(state) - - info "Tracked VM: #{state['vm_id']} #{vm['name']}" - info "Status: #{vm['status']} / #{vm['vm_state']}" - info "Public IP: #{connect_host_for(vm) || 'none'}" - info "Service mode: #{service_mode_summary(vllm_enabled: vllm_enabled, ollama_enabled: ollama_enabled)}" - info "Active model: #{state['vllm_model'] || @config.vllm_model}" if vllm_enabled - info "Missing firewall rules: #{missing_rules.empty? ? 
'none' : missing_rules.size}" - rescue Error => e - warn "Unable to load VM #{state['vm_id']}: #{e.message}" - end - end - - print_local_wireguard_summary(state&.dig('public_ip')) if include_local_wireguard - state&.dig('public_ip') - end - - def show_local_wireguard(expected_ips = nil) - print_local_wireguard_summary(expected_ips) - end - - # Lists configured model presets and marks the one currently running on the VM. - def list_models - presets = @config.vllm_preset_names - state = @state_store.load - current = state&.dig('vllm_model') - - if presets.empty? - info 'No presets configured in [vllm.presets.*].' - info "Active model: #{current || @config.vllm_model}" - return - end - - info 'Configured vLLM model presets:' - presets.each do |name| - p = @config.vllm_preset(name) - active = p['model'] == current - info " #{active ? '*' : ' '} #{name.ljust(24)} #{p['model']}" - end - info '' - info ' (* = currently loaded on VM)' if current - end - - # Switches the running VM to a different named model preset. - # Stops the old container, starts the new one, and hot-reloads LiteLLM config. - def switch_model(preset_name:, dry_run: false) - preset = @config.vllm_preset(preset_name) # raises if unknown - state = @state_store.load - - old_container = state&.dig('vllm_container_name') || @config.vllm_container_name - new_container = preset['container_name'] - current_model = state&.dig('vllm_model') - - if dry_run - info "DRY RUN: model switch to preset '#{preset_name}'" - info " #{current_model || 'none'} → #{preset['model']}" - info " container: #{old_container} → #{new_container}" - trust_note = preset['trust_remote_code'] ? ', trust_remote_code: true' : '' - parser_note = preset['tool_call_parser'].to_s.empty? ? 'none' : preset['tool_call_parser'] - extra_note = preset['extra_vllm_args']&.any? ? 
", extra_args: #{preset['extra_vllm_args'].join(' ')}" : '' - info " max_model_len: #{preset['max_model_len']}, tool_call_parser: #{parser_note}#{trust_note}#{extra_note}" - return - end - - raise Error, "No tracked VM. Run 'create' first." unless state&.dig('vm_id') - - host = state['public_ip'] - raise Error, 'No public IP in state file.' if host.nil? || host.empty? - - # Stop the old container only when it has a different name from the new one. - if old_container != new_container - @provisioner.stop_vllm_container(host, old_container) - end - - info "Starting vLLM with preset '#{preset_name}' (#{preset['model']})..." - # Skip docker pull: image is already present; pulling on every switch risks a - # surprise multi-GB download if the upstream image was updated. - @provisioner.install_vllm(host, preset_config: preset, pull_image: false) - - # Hot-reload LiteLLM: rewrite config for the new model and restart the service. - # Skips venv/apt install since those are already in place. - @provisioner.reload_litellm(host, preset['model']) - - state['vllm_model'] = preset['model'] - state['vllm_container_name'] = new_container - state['vllm_preset'] = preset_name - state['vllm_setup_at'] = Time.now.utc.iso8601 - state['services'] ||= {} - state['services']['vllm_enabled'] = true - state['services']['ollama_enabled'] = state_ollama_enabled?(state) - @state_store.save(state) - - info "Model switched to '#{preset_name}' (#{preset['model']})." - info "Run 'ruby hyperstack.rb test' to verify." - end - - # Runs end-to-end inference tests against vLLM and LiteLLM over WireGuard. - # Requires wg1 to be active and the VM to be fully provisioned. - def test - state = @state_store.load - raise Error, "No tracked VM state file found at #{@state_store.path}." if state.nil? - - wg_ip = @config.wireguard_gateway_hostname - vllm_enabled = state_vllm_enabled?(state) - ollama_enabled = state_ollama_enabled?(state) - info "Running end-to-end inference tests via WireGuard (#{wg_ip})..." 
- - if vllm_enabled - test_vllm(wg_ip) - test_litellm(wg_ip) - end - - info " Ollama test: connect via SSH and run 'ollama list' to verify models." if ollama_enabled - - info 'All inference tests passed.' - end - - private - - def resumable_state?(state) - state['vm_id'] && ( - state['bootstrapped_at'].nil? || - ollama_setup_needed?(state) || - vllm_setup_needed?(state) || - wireguard_setup_needed?(state) - ) - end - - def continue_create(state) - vm_id = state['vm_id'] - sync_service_mode_state(state) - - vm = wait_for_vm_ready(vm_id) - ensure_security_rules(vm) - vm = wait_for_connect_ip(vm_id) - state['public_ip'] = connect_host_for(vm) - state['security_rules'] = Array(vm['security_rules']).map { |rule| normalize_rule(rule) } - @state_store.save(state) - - wait_for_ssh(state['public_ip']) - if @config.guest_bootstrap_enabled? && state['bootstrapped_at'].nil? - @provisioner.bootstrap_guest(state['public_ip']) - state['bootstrapped_at'] = Time.now.utc.iso8601 - @state_store.save(state) - end - - # Install Ollama binary and configure the service (fast), but defer - # model pulls until after the WireGuard tunnel is up so that the user - # can monitor progress over the tunnel. - if effective_ollama? && state['ollama_installed_at'].nil? - @provisioner.install_ollama_service(state['public_ip']) - state['ollama_installed_at'] = Time.now.utc.iso8601 - @state_store.save(state) - end - - # Call pre-hook before deciding whether WireGuard setup is needed; this allows a concurrent - # sibling VM (e.g. VM2 in create-both) to block here until the primary VM (VM1) has - # already written the base wg1.conf, which VM2's setup will then extend with its own peer. - @wg_setup_pre&.call - if wireguard_setup_needed?(state) - run_wireguard_setup(state['public_ip']) - state['wireguard_setup_at'] = Time.now.utc.iso8601 - @state_store.save(state) - end - # Always signal post-hook so that a waiting sibling VM is unblocked even when - # WireGuard setup was not needed (e.g. 
already done on a resume). - @wg_setup_post&.call - - # Pull and verify Ollama models after the tunnel is established. - if ollama_setup_needed?(state) - @provisioner.pull_ollama_models(state['public_ip']) - state['ollama_setup_at'] = Time.now.utc.iso8601 - state['ollama_models_dir'] = @config.ollama_models_dir - state['ollama_pulled_models'] = @scripts.desired_ollama_models - @state_store.save(state) - end - - # Set up vLLM (Docker container) + LiteLLM (Anthropic-API proxy) after - # the tunnel is up so that model-download progress is visible locally. - if vllm_setup_needed?(state) - preset_cfg = effective_vllm_preset_config - @provisioner.setup_vllm_stack(state['public_ip'], preset_config: preset_cfg) - state['vllm_setup_at'] = Time.now.utc.iso8601 - state['vllm_model'] = preset_cfg&.dig('model') || @config.vllm_model - state['vllm_container_name'] = preset_cfg&.dig('container_name') || @config.vllm_container_name - state['vllm_preset'] = @effective_vllm_preset - @state_store.save(state) - end - - vm = @client.get_vm(vm_id) - state['security_rules'] = Array(vm['security_rules']).map { |rule| normalize_rule(rule) } - state['status'] = vm['status'] - state['vm_state'] = vm['vm_state'] - state['provisioned_at'] = Time.now.utc.iso8601 - @state_store.save(state) - - info "VM ready: #{state['public_ip']} (id=#{state['vm_id']})" - print_local_wireguard_summary(state['public_ip']) - return unless effective_vllm? - - wg_ip = @config.wireguard_gateway_hostname - info "Run 'ruby hyperstack.rb test' to verify vLLM and LiteLLM." 
- info " vLLM: http://#{wg_ip}:#{@config.ollama_port}/v1/models" - info " LiteLLM: http://#{wg_ip}:#{@config.litellm_port}/v1/messages" - end - - def build_create_payload(vm_name, resolved) - payload = { - 'name' => vm_name, - 'count' => 1, - 'environment_name' => resolved[:environment]['name'], - 'flavor_name' => resolved[:flavor]['name'], - 'image_name' => resolved[:image]['name'], - 'key_name' => resolved[:keypair]['name'], - 'assign_floating_ip' => @config.assign_floating_ip?, - 'create_bootable_volume' => @config.create_bootable_volume?, - 'enable_port_randomization' => @config.enable_port_randomization?, - 'security_rules' => desired_security_rules - } - payload['labels'] = @config.labels unless @config.labels.empty? - payload['user_data'] = @config.user_data if @config.user_data - payload - end - - def resolve_dependencies - environment = @client.list_environments.find { |item| item['name'] == @config.environment_name } - raise Error, "Environment #{@config.environment_name.inspect} was not found in Hyperstack." unless environment - - flavor = @client.list_flavors.find do |item| - item['name'] == @config.flavor_name && item['region_name'] == environment['region'] - end - raise Error, "Flavor #{@config.flavor_name.inspect} is not available in #{environment['region']}." unless flavor - - if flavor['stock_available'] == false - raise Error, - "Flavor #{@config.flavor_name.inspect} exists in #{environment['region']} but is out of stock." - end - - image = @client.list_images.find do |item| - item['name'] == @config.image_name && item['region_name'] == environment['region'] - end - raise Error, "Image #{@config.image_name.inspect} is not available in #{environment['region']}." 
unless image - - keypair = @client.list_keypairs.find do |item| - item['name'] == @config.ssh_key_name && item.dig('environment', 'name') == environment['name'] - end - unless keypair - raise Error, - "Keypair #{@config.ssh_key_name.inspect} was not found in environment #{environment['name']}." - end - - { - environment: environment, - flavor: flavor, - image: image, - keypair: keypair - } - end - - def wait_for_vm_ready(vm_id) - with_polling("VM #{vm_id} to become ready for firewall updates") do - vm = @client.get_vm(vm_id) - next nil if vm.nil? - - raise Error, "VM #{vm_id} entered failed state #{vm['status']} / #{vm['vm_state']}." if failed_vm?(vm) - - vm_ready_for_updates?(vm) ? vm : nil - end - end - - def wait_for_connect_ip(vm_id) - ip_label = @config.assign_floating_ip? ? 'floating IP' : 'reachable IP' - with_polling("VM #{vm_id} to receive a #{ip_label}") do - vm = @client.get_vm(vm_id) - raise Error, "VM #{vm_id} entered failed state #{vm['status']} / #{vm['vm_state']}." if failed_vm?(vm) - - connect_host_for(vm) ? vm : nil - end - end - - def wait_for_ssh(host) - info "Waiting for SSH on #{host}:#{@config.ssh_port}..." - with_polling("SSH on #{host}:#{@config.ssh_port}") do - next nil unless tcp_open?(host, @config.ssh_port) - next nil unless ensure_trusted_ssh_host(host) - - _, stderr, status = run_ssh_command(host, 'true') - if status.success? - true - else - warn "SSH not ready yet: #{stderr.strip}" unless stderr.to_s.strip.empty? - nil - end - end - end - - def ensure_security_rules(vm) - existing = Array(vm['security_rules']).map { |rule| normalize_rule(rule) } - desired = desired_security_rules.map { |rule| normalize_rule(rule) } - - (desired - existing).each do |rule| - info "Adding Hyperstack firewall rule #{rule['protocol']} #{rule['remote_ip_prefix']} #{rule['port_range_min']}..." - @client.create_vm_rule(vm['id'], rule) - end - end - - def ollama_setup_needed?(state) - return false unless effective_ollama? 
- # Re-run setup if state has no record, or if desired models changed - return true if state['ollama_setup_at'].nil? - - @scripts.model_list_signature(@scripts.desired_ollama_models) != - @scripts.model_list_signature(state['ollama_pulled_models']) - end - - def wireguard_setup_needed?(state) - return false unless @config.wireguard_auto_setup? - - public_ip = state['public_ip'].to_s.strip - return true if public_ip.empty? - - expected_endpoint = "#{public_ip}:#{@config.wireguard_udp_port}" - !Array(@local_wireguard.status['endpoints']).include?(expected_endpoint) - end - - def run_wireguard_setup(host) - validate_wireguard_setup_script! - retries = 3 - retries.times do |attempt| - info "Running WireGuard auto-setup via #{@config.wireguard_setup_script} #{host}..." - - status = run_wireguard_script(host) - return if status.success? - - if attempt == retries - 1 - raise Error, "WireGuard setup failed after #{retries} attempts (exit #{status.exitstatus})." - end - - delay = (attempt + 1) * 15 - warn "WireGuard setup attempt #{attempt + 1}/#{retries} failed (exit #{status.exitstatus}), retrying in #{delay}s..." - sleep delay - end - end - - def run_wireguard_script(host) - # Pass server WireGuard IP and WireGuard hostname as positional args so that - # wg1-setup.sh can configure the correct server-side tunnel address and update - # /etc/hosts on the client. The Enter keystroke via stdin bypasses the interactive prompt. - server_ip = @config.wireguard_gateway_ip - wg_hostname = @config.wireguard_gateway_hostname - env = { - 'HYPERSTACK_SSH_PORT' => @config.ssh_port.to_s, - 'HYPERSTACK_SSH_CONNECT_TIMEOUT' => @config.ssh_connect_timeout.to_s, - 'HYPERSTACK_SSH_KNOWN_HOSTS_PATH' => @config.ssh_known_hosts_path, - 'HYPERSTACK_SSH_PRIVATE_KEY_PATH' => (File.exist?(@config.ssh_private_key_path) ? 
@config.ssh_private_key_path : '') - } - - Open3.popen2e(env, 'bash', @config.wireguard_setup_script, host, server_ip, wg_hostname) do |stdin, output, wait_thr| - stdin.sync = true - stdin.puts - stdin.close - - output.each { |line| @out.print(line) } - wait_thr.value - end - end - - def wait_for_deletion(vm_id) - info "Waiting for VM #{vm_id} deletion to complete..." - with_polling("VM #{vm_id} deletion", timeout: 300) do - @client.get_vm(vm_id) - nil - rescue Error => e - raise unless e.message.include?('not_found') || e.message.include?('does not exists') - - true - end - end - - def connect_host_for(vm) - return vm['floating_ip'] if @config.assign_floating_ip? - - vm['floating_ip'] || vm['fixed_ip'] - end - - def validate_wireguard_setup_script! - script_path = @config.wireguard_setup_script - raise Error, "WireGuard setup script not found: #{script_path}" unless File.exist?(script_path) - - mismatches = [] - mismatches << "ssh.username must be 'ubuntu'" unless @config.ssh_username == 'ubuntu' - mismatches << "local_client.interface_name must be 'wg1'" unless @config.local_interface_name == 'wg1' - mismatches << 'network.wireguard_udp_port must be 56710' unless @config.wireguard_udp_port == 56_710 - unless @config.wireguard_subnet == '192.168.3.0/24' - mismatches << "network.wireguard_subnet must be '192.168.3.0/24'" - end - - # Validate that the resolved server IP is actually within the configured subnet. - begin - subnet = IPAddr.new(@config.wireguard_subnet) - server_ip = IPAddr.new(@config.wireguard_gateway_ip) - unless subnet.include?(server_ip) - mismatches << "wireguard_server_ip #{@config.wireguard_gateway_ip.inspect} is outside #{@config.wireguard_subnet}" - end - rescue IPAddr::InvalidAddressError => e - mismatches << "Invalid wireguard_server_ip: #{e.message}" - end - - return if mismatches.empty? 
- - raise Error, "Configured WireGuard settings do not match #{script_path}: #{mismatches.join('; ')}" - end - - def ensure_trusted_ssh_host(host) - scanned = scan_ssh_host_keys(host) - return false if scanned.empty? - - existing = known_host_entries - if existing.empty? - write_known_host_entries(scanned) - info "Pinned SSH host key for #{host} in #{@config.ssh_known_hosts_path}." - return true - end - - return true if existing == scanned - - raise Error, - "SSH host key mismatch for #{host}. Refusing to continue. Delete #{@config.ssh_known_hosts_path} only if you intentionally replaced this VM." - end - - def scan_ssh_host_keys(host) - stdout, stderr, status = Open3.capture3('ssh-keyscan', '-T', @config.ssh_connect_timeout.to_s, - '-p', @config.ssh_port.to_s, host) - unless status.success? - warn "ssh-keyscan not ready yet: #{stderr.strip}" unless stderr.to_s.strip.empty? - return [] - end - - stdout.lines.map(&:strip).reject { |line| line.empty? || line.start_with?('#') }.sort.uniq - rescue Errno::ENOENT - raise Error, 'ssh-keyscan is required to pin SSH host keys but was not found in PATH.' 
- end - - def known_host_entries - path = @config.ssh_known_hosts_path - return [] unless File.exist?(path) - - File.readlines(path, chomp: true).map(&:strip).reject(&:empty?).sort.uniq - rescue Errno::EACCES => e - raise Error, "Cannot read SSH known_hosts file #{path}: #{e.message}" - end - - def write_known_host_entries(entries) - path = @config.ssh_known_hosts_path - FileUtils.mkdir_p(File.dirname(path)) - temp_path = "#{path}.tmp" - File.write(temp_path, "#{entries.join("\n")}\n") - File.chmod(0o600, temp_path) - File.rename(temp_path, path) - rescue Errno::EACCES => e - raise Error, "Cannot write SSH known_hosts file #{path}: #{e.message}" - end - - def delete_ssh_known_hosts_file - File.delete(@config.ssh_known_hosts_path) if File.exist?(@config.ssh_known_hosts_path) - rescue Errno::EACCES => e - raise Error, "Cannot delete SSH known_hosts file #{@config.ssh_known_hosts_path}: #{e.message}" - end - - def failed_vm?(vm) - [vm['status'], vm['vm_state'], vm['power_state']].compact.any? 
do |value| - value.to_s.downcase.match?(/error|failed|deleted|shelved/) - end - end - - def vm_ready_for_updates?(vm) - %w[ACTIVE SHUTOFF HIBERNATED].include?(vm['status'].to_s.upcase) - end - - def tcp_open?(host, port) - Socket.tcp(host, port, connect_timeout: @config.ssh_connect_timeout) do |sock| - sock.close - true - end - rescue Errno::ECONNREFUSED, Errno::ETIMEDOUT, Errno::EHOSTUNREACH, Errno::ENETUNREACH, SocketError, IOError - false - end - - def run_ssh_command(host, remote_script) - Open3.capture3(*ssh_command(host), stdin_data: remote_script) - end - - def run_ssh_command_streaming(host, remote_script) - combined_output = +'' - Open3.popen2e(*ssh_command(host)) do |stdin, output, wait_thr| - stdin.write(remote_script) - stdin.close - - output.each do |line| - combined_output << line - @out.print(line) - end - - return [combined_output, wait_thr.value] - end - end - - def ssh_command(host) - command = [ - 'ssh', - '-o', 'BatchMode=yes', - '-o', 'StrictHostKeyChecking=yes', - '-o', "UserKnownHostsFile=#{@config.ssh_known_hosts_path}", - '-o', "ConnectTimeout=#{@config.ssh_connect_timeout}", - '-p', @config.ssh_port.to_s - ] - if File.exist?(@config.ssh_private_key_path) - command.concat(['-i', @config.ssh_private_key_path]) - else - warn "SSH private key #{@config.ssh_private_key_path} does not exist; falling back to default ssh-agent identity." - end - - command << "#{@config.ssh_username}@#{host}" - command << 'bash -se' - command - end - - def with_polling(description, timeout: 900, interval: 5) - deadline = Time.now + timeout - loop do - result = yield - return result if result - - raise Error, "Timed out waiting for #{description}." 
if Time.now >= deadline - - sleep interval - end - end - - def normalize_rule(rule) - { - 'direction' => rule['direction'].to_s.downcase, - 'ethertype' => rule['ethertype'].to_s, - 'protocol' => rule['protocol'].to_s.downcase, - 'port_range_min' => integer_or_nil(rule['port_range_min']), - 'port_range_max' => integer_or_nil(rule['port_range_max']), - 'remote_ip_prefix' => rule['remote_ip_prefix'].to_s - } - end - - def sync_service_mode_state(state) - state['services'] = { - 'vllm_enabled' => effective_vllm?, - 'ollama_enabled' => effective_ollama? - } - end - - def desired_security_rules(include_vllm: effective_vllm?, include_ollama: effective_ollama?) - @config.desired_security_rules(include_vllm: include_vllm, include_ollama: include_ollama) - end - - def desired_security_rules_for_state(state) - desired_security_rules(include_vllm: state_vllm_enabled?(state), include_ollama: state_ollama_enabled?(state)) - end - - def state_vllm_enabled?(state) - recorded = state&.dig('services', 'vllm_enabled') - return recorded unless recorded.nil? - - return true if state&.key?('vllm_setup_at') - - @config.vllm_install_enabled? - end - - def state_ollama_enabled?(state) - recorded = state&.dig('services', 'ollama_enabled') - return recorded unless recorded.nil? - - return true if state&.key?('ollama_installed_at') || state&.key?('ollama_setup_at') - - @config.ollama_install_enabled? 
- end - - def service_mode_summary(vllm_enabled:, ollama_enabled:) - return 'vLLM+LiteLLM enabled, Ollama enabled' if vllm_enabled && ollama_enabled - return 'vLLM+LiteLLM enabled, Ollama disabled' if vllm_enabled - return 'Ollama enabled, vLLM+LiteLLM disabled' if ollama_enabled - - 'All inference services disabled' - end - - def cleanup_local_access(dry_run:, hostnames:, allowed_ips:) - { - peers: @local_wireguard.remove_peers_by_allowed_ips(allowed_ips, dry_run: dry_run), - hostnames: @local_wireguard.remove_hostnames(hostnames, dry_run: dry_run) - } - end - - def report_local_cleanup(output, cleanup, dry_run:) - peer_summary = cleanup[:peers].map { |peer| peer['AllowedIPs'] || peer['Endpoint'] }.join(', ') - host_summary = cleanup[:hostnames].join(', ') - - if dry_run - if cleanup[:peers].empty? && cleanup[:hostnames].empty? - output.puts('DRY RUN: no matching local WireGuard peers or host entries would be removed.') - return - end - - output.puts("DRY RUN: local WireGuard peers would be removed for #{peer_summary}.") unless cleanup[:peers].empty? - output.puts("DRY RUN: local host entries would be removed for #{host_summary}.") unless cleanup[:hostnames].empty? - return - end - - output.puts('No matching local WireGuard peers needed removal.') if cleanup[:peers].empty? - output.puts('No matching local host entries needed removal.') if cleanup[:hostnames].empty? - output.puts("Local WireGuard peers removed for #{peer_summary}.") unless cleanup[:peers].empty? - output.puts("Local host entries removed for #{host_summary}.") unless cleanup[:hostnames].empty? - end - - def print_create_dry_run(vm_name, resolved, payload) - info 'DRY RUN: no VM or state file will be created.' 
- info "State file: #{@state_store.path}" - info "Resolved environment: #{resolved[:environment]['name']} (region #{resolved[:environment]['region']})" - info "Resolved flavor: #{format_flavor(resolved[:flavor])}" - info "Resolved image: #{resolved[:image]['name']}" - info "Resolved SSH keypair: #{resolved[:keypair]['name']}" - info "Planned VM name: #{vm_name}" - info "Allowed SSH CIDRs: #{@config.allowed_ssh_cidrs.join(', ')}" - info "Allowed WireGuard CIDRs: #{@config.allowed_wireguard_cidrs.join(', ')}" - info 'Create payload:' - @out.puts(JSON.pretty_generate(payload)) - if @config.guest_bootstrap_enabled? - info 'Guest bootstrap script:' - @out.puts(@scripts.guest_bootstrap_script) - else - info 'Guest bootstrap is disabled in config.' - end - if effective_ollama? - info "Ollama will be installed with models stored under #{@config.ollama_models_dir}" - models = @scripts.desired_ollama_models - info "Ollama models to pre-pull: #{models.join(', ')}" unless models.empty? - end - if effective_vllm? - preset_cfg = effective_vllm_preset_config - vllm_m = preset_cfg&.dig('model') || @config.vllm_model - vllm_cname = preset_cfg&.dig('container_name') || @config.vllm_container_name - vllm_maxlen = preset_cfg&.dig('max_model_len') || @config.vllm_max_model_len - preset_note = @effective_vllm_preset ? " (preset: #{@effective_vllm_preset})" : '' - info "vLLM will be installed: #{vllm_m}#{preset_note}" - info " Container: #{vllm_cname}, port #{@config.ollama_port}, max_model_len #{vllm_maxlen}" - info "LiteLLM proxy will be installed on port #{@config.litellm_port}" - info " Claude model aliases: #{@config.litellm_claude_model_names.join(', ')}" - end - if @config.wireguard_auto_setup? - info "WireGuard auto-setup script: #{@config.wireguard_setup_script} <vm_public_ip>" - end - print_local_wireguard_summary(nil) - end - - def print_resume_dry_run(state) - info "DRY RUN: would resume provisioning tracked VM #{state['vm_id']}." 
- begin - vm = @client.get_vm(state['vm_id']) - info "Tracked VM status: #{vm['status']} / #{vm['vm_state']}" - info "Tracked VM public IP: #{connect_host_for(vm) || 'none'}" - rescue Error => e - warn "Unable to inspect tracked VM #{state['vm_id']}: #{e.message}" - end - if @config.guest_bootstrap_enabled? - info 'Guest bootstrap script:' - @out.puts(@scripts.guest_bootstrap_script) - end - if ollama_setup_needed?(state) - info "Ollama would be installed with models stored under #{@config.ollama_models_dir}" - models = @scripts.desired_ollama_models - info "Ollama models to pre-pull: #{models.join(', ')}" unless models.empty? - end - if vllm_setup_needed?(state) - info "vLLM would be installed: #{@config.vllm_model}" - info "LiteLLM proxy would be installed on port #{@config.litellm_port}" - end - if wireguard_setup_needed?(state) - info "WireGuard auto-setup script would run: #{@config.wireguard_setup_script} #{state['public_ip'] || '<pending-public-ip>'}" - end - print_local_wireguard_summary(state['public_ip']) - end - - def print_delete_dry_run(target_vm_id, state, preserve_state_on_failure:) - info 'DRY RUN: no VM will be deleted.' - begin - vm = @client.get_vm(target_vm_id) - info "Delete target: #{target_vm_id} #{vm['name']} (#{vm['status']} / #{vm['vm_state']})" - info "Delete target public IP: #{connect_host_for(vm) || 'none'}" - rescue Error => e - warn "Unable to inspect VM #{target_vm_id} before delete: #{e.message}" - end - - if state && state['vm_id'].to_i == target_vm_id.to_i - action = preserve_state_on_failure ? 'would remain unchanged' : 'would be removed' - info "Tracked state file #{@state_store.path} #{action}." - cleanup = cleanup_local_access(dry_run: true, hostnames: [@config.wireguard_gateway_hostname], - allowed_ips: ["#{@config.wireguard_gateway_ip}/32"]) - report_local_cleanup(@out, cleanup, dry_run: true) - else - info 'No tracked state entry would be modified.' 
- end - end - - def format_flavor(flavor) - gpu = flavor['gpu'].to_s.empty? ? 'CPU-only' : flavor['gpu'] - [ - flavor['name'], - gpu, - "#{flavor['gpu_count']} GPU", - "#{flavor['ram']} GB RAM", - "#{flavor['cpu']} vCPU", - "stock=#{flavor['stock_available']}" - ].join(', ') - end - - # Returns the effective Ollama flag: CLI override if set, else config default. - def effective_ollama? - defined?(@effective_ollama) ? @effective_ollama : @config.ollama_install_enabled? - end - - # Returns the effective vLLM flag: CLI override if set, else config default. - def effective_vllm? - defined?(@effective_vllm) ? @effective_vllm : @config.vllm_install_enabled? - end - - # Returns the resolved preset config hash when a preset was selected via - # --model, or nil when using the top-level [vllm] defaults directly. - def effective_vllm_preset_config - name = defined?(@effective_vllm_preset) ? @effective_vllm_preset : nil - return nil unless name - - @config.vllm_preset(name) - end - - def vllm_setup_needed?(state) - return false unless effective_vllm? - return true if state['vllm_setup_at'].nil? - - # Re-run if the active model changed (direct config edit or --model preset flag). - desired = effective_vllm_preset_config&.dig('model') || @config.vllm_model - state['vllm_model'] != desired - end - - # Tests the vLLM OpenAI-compatible API: lists loaded models and runs a - # short inference request to confirm the model accepts requests. - def test_vllm(wg_ip) - port = @config.ollama_port - - info " Testing vLLM models list at http://#{wg_ip}:#{port}/v1/models..." - uri = URI("http://#{wg_ip}:#{port}/v1/models") - resp = Net::HTTP.get_response(uri) - raise Error, "vLLM /v1/models returned HTTP #{resp.code}" unless resp.code == '200' - - models = JSON.parse(resp.body).fetch('data', []).map { |m| m['id'] } - raise Error, 'vLLM returned an empty model list' if models.empty? - - # Use the currently loaded model (may differ from config default after a switch). 
- model = models.first - info " Models loaded: #{models.join(', ')}" - info ' Testing vLLM inference...' - reply = vllm_chat(wg_ip, port, model, 'Say hello in five words.') - info " vLLM response: #{reply}" - rescue Errno::ECONNREFUSED, Errno::EHOSTUNREACH, SocketError => e - raise Error, "Cannot reach vLLM at #{wg_ip}:#{port} — is WireGuard (wg1) active? (#{e.message})" - end - - # Tests the LiteLLM proxy using the Anthropic Messages API format, - # which is what Claude Code sends when pointed at a custom base URL. - def test_litellm(wg_ip) - port = @config.litellm_port - model = @config.litellm_claude_model_names.first - key = @config.litellm_master_key - - info " Testing LiteLLM proxy at http://#{wg_ip}:#{port}/v1/messages..." - uri = URI("http://#{wg_ip}:#{port}/v1/messages") - req = Net::HTTP::Post.new(uri) - req['Content-Type'] = 'application/json' - req['x-api-key'] = key - req['anthropic-version'] = '2023-06-01' - req.body = JSON.generate( - 'model' => model, - # 500 tokens: reasoning models (e.g. gpt-oss) consume tokens on chain-of-thought - # before producing content; 50 is too small and yields an empty content field. - 'max_tokens' => 500, - 'messages' => [{ 'role' => 'user', 'content' => 'Say hello in five words.' }] - ) - resp = Net::HTTP.start(uri.host, uri.port, open_timeout: 10, read_timeout: 120) { |h| h.request(req) } - raise Error, "LiteLLM returned HTTP #{resp.code}: #{resp.body}" unless resp.code == '200' - - text = JSON.parse(resp.body).fetch('content', []).find { |b| b['type'] == 'text' }&.dig('text').to_s.strip - info " LiteLLM response: #{text}" - rescue Errno::ECONNREFUSED, Errno::EHOSTUNREACH, SocketError => e - raise Error, "Cannot reach LiteLLM at #{wg_ip}:#{port} — is WireGuard (wg1) active? (#{e.message})" - end - - # Sends a single OpenAI chat completion request and returns the reply text. 
- def vllm_chat(host, port, model, prompt) - uri = URI("http://#{host}:#{port}/v1/chat/completions") - req = Net::HTTP::Post.new(uri) - req['Content-Type'] = 'application/json' - req['Authorization'] = 'Bearer EMPTY' - req.body = JSON.generate( - 'model' => model, - 'messages' => [{ 'role' => 'user', 'content' => prompt }], - # 500 tokens: reasoning models (e.g. gpt-oss) use tokens for chain-of-thought - # before content; 50 is too small and yields an empty content field. - 'max_tokens' => 500 - ) - resp = Net::HTTP.start(uri.host, uri.port, open_timeout: 10, read_timeout: 120) { |h| h.request(req) } - raise Error, "vLLM inference returned HTTP #{resp.code}" unless resp.code == '200' - - JSON.parse(resp.body).dig('choices', 0, 'message', 'content').to_s.strip - end - - def integer_or_nil(value) - value.nil? ? nil : Integer(value) - end - - def print_local_wireguard_summary(expected_ips) - return unless @config.local_client_checks_enabled? - - wg_status = @local_wireguard.status - endpoints = Array(wg_status['endpoints']).compact.uniq - info "Local WireGuard #{@config.local_interface_name}: #{wg_status['service_state']}" - if endpoints.empty? - if wg_status['config_readable'] - info 'Local WireGuard has no configured peers.' - else - warn "Unable to read #{@config.local_wg_config_path} for local WireGuard endpoint validation." - end - return - end - - label = endpoints.one? ? 'endpoint' : 'endpoints' - info "Local WireGuard #{label}: #{endpoints.join(', ')}" - - expected = Array(expected_ips).compact.map(&:to_s).map(&:strip).reject(&:empty?).uniq - return if expected.empty? - - expected_endpoints = expected.map { |ip| "#{ip}:#{@config.wireguard_udp_port}" } - missing = expected_endpoints.reject { |endpoint| endpoints.include?(endpoint) } - - if expected_endpoints.one? - if missing.empty? - info 'Local WireGuard endpoint matches the managed VM IP.' 
- else - hosts = endpoints.map { |endpoint| endpoint.split(':', 2).first }.uniq - warn "Local WireGuard endpoints point to #{hosts.join(', ')}, expected #{expected.first}." - end - return - end - - if missing.empty? - info 'Local WireGuard has peers for all managed VM IPs.' - else - present = expected_endpoints - missing - info "Local WireGuard has peers for: #{present.map { |endpoint| endpoint.split(':', 2).first }.join(', ')}" unless present.empty? - warn "Local WireGuard missing peers for: #{missing.map { |endpoint| endpoint.split(':', 2).first }.join(', ')}." - end - end - - def info(message) - @out.puts(message) - end - - def warn(message) - @out.puts("WARN: #{message}") - end - end - - class CLI - def initialize(argv) - @argv = argv.dup - @config_path = File.join(__dir__, 'hyperstack-vm.toml') - @config_explicit = false - end - - def show_help - puts @global_parser - puts - puts 'Commands:' - puts ' create [--replace] [--dry-run] [--vllm|--no-vllm] [--ollama|--no-ollama] [--model PRESET]' - puts ' create-both [--replace] [--dry-run] [--vllm|--no-vllm] [--ollama|--no-ollama]' - puts ' Provision hyperstack-vm1.toml and hyperstack-vm2.toml concurrently.' - puts ' WireGuard setup is serialized: VM1 writes the base wg1.conf first,' - puts ' then VM2 adds its peer. Requires both TOML files next to the script.' - puts ' delete [--vm-id ID] [--dry-run]' - puts ' delete-both [--dry-run]' - puts ' Delete the VMs tracked by hyperstack-vm1.toml and hyperstack-vm2.toml.' 
- puts ' status' - puts ' test' - puts ' model list' - puts ' model switch PRESET [--dry-run]' - end - - def run - @global_parser = OptionParser.new do |opts| - opts.banner = 'Usage: ruby hyperstack.rb [--config path] <create|delete|status> [options]' - opts.on('--config PATH', "Path to TOML config (default: #{@config_path})") do |value| - @config_path = value - @config_explicit = true - end - opts.on('-h', '--help', 'Show help') do - show_help - exit 0 - end - end - @global_parser.order!(@argv) - - command = @argv.shift - if command.nil? - show_help - exit 0 - end - - # create-both loads its own config files and does not use the default config path. - # Parse it before building the manager so we avoid loading the default config needlessly. - if command == 'create-both' - opts = parse_create_options(@argv, include_model_preset: false) - run_create_both(**opts) - return - end - - if command == 'delete-both' - opts = parse_delete_both_options(@argv) - run_delete_both(**opts) - return - end - - if command == 'status' - run_status - return - end - - # All other commands operate on a single VM defined by the --config path. - config_loader = ConfigLoader.load(@config_path) - manager = build_manager(config_loader.config) - - case command - when 'create' - opts = parse_create_options(@argv) - manager.create(**opts) - when 'delete' - vm_id = nil - dry_run = false - parser = OptionParser.new do |opts| - opts.on('--vm-id ID', Integer, 'Delete a VM by ID instead of using the local state file') do |value| - vm_id = value - end - opts.on('--dry-run', 'Show which VM would be deleted without deleting it') { dry_run = true } - end - parser.parse!(@argv) - manager.delete(vm_id: vm_id, dry_run: dry_run) - when 'test' - manager.test - when 'model' - sub = @argv.shift - raise Error, 'Missing model subcommand. Use: model list | model switch PRESET [--dry-run]' if sub.nil? 
- - case sub - when 'list' - manager.list_models - when 'switch' - preset = @argv.shift - raise Error, 'Missing preset name. Usage: model switch PRESET [--dry-run]' if preset.nil? - - dry_run = false - OptionParser.new { |o| o.on('--dry-run') { dry_run = true } }.parse!(@argv) - manager.switch_model(preset_name: preset, dry_run: dry_run) - else - raise Error, "Unknown model subcommand #{sub.inspect}. Use list or switch." - end - else - raise Error, "Unknown command #{command.inspect}. Use create, create-both, delete, delete-both, status, test, or model." - end - end - - private - - # Parses the shared --replace / --dry-run / --vllm / --ollama / --model flags - # used by both 'create' and 'create-both'. When include_model_preset is false - # (create-both), the --model flag is not registered because each VM uses its own - # TOML default. Returns a hash suitable for splatting into Manager#create. - def parse_create_options(argv, include_model_preset: true) - opts = { replace: false, dry_run: false, install_vllm: nil, install_ollama: nil, vllm_preset: nil } - OptionParser.new do |o| - o.on('--replace', 'Delete the tracked VM before creating a new one') { opts[:replace] = true } - o.on('--dry-run', 'Print the create plan without creating a VM') { opts[:dry_run] = true } - o.on('--vllm', 'Enable vLLM+LiteLLM setup (overrides config)') { opts[:install_vllm] = true } - o.on('--no-vllm', 'Disable vLLM+LiteLLM setup (overrides config)') { opts[:install_vllm] = false } - o.on('--ollama', 'Enable Ollama setup (overrides config)') { opts[:install_ollama] = true } - o.on('--no-ollama', 'Disable Ollama setup (overrides config)') { opts[:install_ollama] = false } - o.on('--model PRESET', 'Use a named vLLM preset at create time') { |v| opts[:vllm_preset] = v } if include_model_preset - end.parse!(argv) - opts - end - - def parse_delete_both_options(argv) - opts = { dry_run: false } - OptionParser.new do |o| - o.on('--dry-run', 'Show which VMs would be deleted without deleting 
them') { opts[:dry_run] = true } - end.parse!(argv) - opts - end - - # Constructs a Manager and all its dependencies from a Config object. - # Accepts optional output destination and WireGuard concurrency hooks. - def build_manager(config, out: $stdout, wg_setup_pre: nil, wg_setup_post: nil) - state_store = StateStore.new(config.state_file) - client = HyperstackClient.new(base_url: config.api_base_url, api_key: config.api_key) - local_wireguard = build_local_wireguard(config) - Manager.new( - config: config, - client: client, - state_store: state_store, - local_wireguard: local_wireguard, - out: out, - wg_setup_pre: wg_setup_pre, - wg_setup_post: wg_setup_post - ) - end - - def build_local_wireguard(config) - LocalWireGuard.new( - interface_name: config.local_interface_name, - config_path: config.local_wg_config_path - ) - end - - def run_status - loaders = status_config_loaders - if loaders.one? - build_manager(loaders.first.config).status - return - end - - expected_ips = [] - loaders.each_with_index do |loader, index| - puts if index.positive? - puts "[#{File.basename(loader.path)}]" - expected_ip = build_manager(loader.config).status(include_local_wireguard: false) - expected_ips << expected_ip if expected_ip - end - - puts - puts '[local-wireguard]' - build_manager(loaders.first.config).show_local_wireguard(expected_ips) - end - - def status_config_loaders - return [ConfigLoader.load(@config_path)] if @config_explicit - - candidates = [ - @config_path, - File.join(__dir__, 'hyperstack-vm1.toml'), - File.join(__dir__, 'hyperstack-vm2.toml') - ].uniq.select { |path| File.exist?(path) } - - loaders = candidates.map { |path| ConfigLoader.load(path) } - tracked = loaders.select { |loader| File.exist?(loader.config.state_file) } - tracked.empty? ? 
[ConfigLoader.load(@config_path)] : tracked - end - - def pair_config_loaders - [ - ConfigLoader.load(File.join(__dir__, 'hyperstack-vm1.toml')), - ConfigLoader.load(File.join(__dir__, 'hyperstack-vm2.toml')) - ] - end - - # Provisions hyperstack-vm1 and hyperstack-vm2 concurrently in separate threads. - # WireGuard setup is serialized: VM1 runs first (replacing the base wg1.conf), then - # VM2 adds its peer. A Mutex+ConditionVariable acts as a one-shot latch between threads. - # If VM1 fails before reaching the WG step the latch is still released so VM2 doesn't hang. - # vllm_preset is accepted but ignored — each VM uses its own TOML default preset. - def run_create_both(replace:, dry_run:, install_vllm:, install_ollama:, vllm_preset: nil) # rubocop:disable Lint/UnusedMethodArgument - vm1_loader, vm2_loader = pair_config_loaders - vm1_config = vm1_loader.config - vm2_config = vm2_loader.config - - out_mutex = Mutex.new - wg_mutex = Mutex.new - wg_cv = ConditionVariable.new - vm1_wg_state = { done: false, error: nil } - - # VM1 signals the latch after its WG step (whether WG ran or was already done). - vm1_wg_post = proc do - wg_mutex.synchronize { vm1_wg_state[:done] = true; wg_cv.broadcast } - end - - # VM2 blocks here until VM1's WG step resolves, then raises if VM1 failed. - vm2_wg_pre = proc do - wg_mutex.synchronize { wg_cv.wait(wg_mutex) until vm1_wg_state[:done] || vm1_wg_state[:error] } - raise Error, "VM1 WireGuard setup failed; cannot add VM2 peer." 
if vm1_wg_state[:error] - end - - manager1 = build_manager(vm1_config, - out: PrefixedOutput.new('[vm1] ', $stdout, out_mutex), - wg_setup_post: vm1_wg_post) - manager2 = build_manager(vm2_config, - out: PrefixedOutput.new('[vm2] ', $stdout, out_mutex), - wg_setup_pre: vm2_wg_pre) - - errors = {} - create_opts = { replace: replace, dry_run: dry_run, - install_vllm: install_vllm, install_ollama: install_ollama } - - vm1_thread = Thread.new do - manager1.create(**create_opts) - rescue Error => e - errors[:vm1] = e.message - # Unblock VM2 even if VM1 failed so the process doesn't hang. - wg_mutex.synchronize { vm1_wg_state[:error] = e.message; wg_cv.broadcast } - end - - vm2_thread = Thread.new do - manager2.create(**create_opts) - rescue Error => e - errors[:vm2] = e.message - end - - [vm1_thread, vm2_thread].each(&:join) - - errors.each { |vm, msg| $stderr.puts("ERROR [#{vm}]: #{msg}") } - exit 1 unless errors.empty? - end - - def run_delete_both(dry_run:) - out_mutex = Mutex.new - errors_mutex = Mutex.new - errors = {} - loaders = pair_config_loaders - local_wg_out = PrefixedOutput.new('[local-wireguard] ', $stdout, out_mutex) - threads = loaders.each_with_index.map do |loader, index| - label = "vm#{index + 1}" - manager = build_manager(loader.config, out: PrefixedOutput.new("[#{label}] ", $stdout, out_mutex)) - - Thread.new do - manager.delete(dry_run: dry_run, skip_local_cleanup: true) - rescue Error => e - errors_mutex.synchronize { errors[label.to_sym] = e.message } - end - end - threads.each(&:join) - - if errors.empty? 
- allowed_ips = loaders.map { |loader| "#{loader.config.wireguard_gateway_ip}/32" } - hostnames = loaders.map { |loader| loader.config.wireguard_gateway_hostname } - begin - local_manager = build_manager(loaders.first.config, out: local_wg_out) - cleanup = local_manager.send(:cleanup_local_access, dry_run: dry_run, hostnames: hostnames, - allowed_ips: allowed_ips) - local_manager.send(:report_local_cleanup, local_wg_out, cleanup, dry_run: dry_run) - rescue Error => e - errors[:local_wireguard] = e.message - end - end - - errors.each { |vm, msg| $stderr.puts("ERROR [#{vm}]: #{msg}") } - exit 1 unless errors.empty? - end - end -end - -begin - HyperstackVM::CLI.new(ARGV).run -rescue HyperstackVM::Error => e - warn "ERROR: #{e.message}" - exit 1 -end diff --git a/snippets/hyperstack/pi-vm1 b/snippets/hyperstack/pi-vm1 deleted file mode 100755 index a6f7937..0000000 --- a/snippets/hyperstack/pi-vm1 +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -cd "$script_dir" - -exec pi --model 'hyperstack1/cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit' "$@" diff --git a/snippets/hyperstack/pi-vm2 b/snippets/hyperstack/pi-vm2 deleted file mode 100755 index 5f07e7d..0000000 --- a/snippets/hyperstack/pi-vm2 +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -cd "$script_dir" - -exec pi --model 'hyperstack2/bullpoint/Qwen3-Coder-Next-AWQ-4bit' "$@" diff --git a/snippets/hyperstack/vllm-setup.txt b/snippets/hyperstack/vllm-setup.txt deleted file mode 100644 index 9ea44a7..0000000 --- a/snippets/hyperstack/vllm-setup.txt +++ /dev/null @@ -1,487 +0,0 @@ -# vLLM + LiteLLM + Claude Code Setup for Hyperstack VM -# -# This document describes the full deployment of qwen3-coder-next (AWQ 4-bit) -# via vLLM with a LiteLLM proxy for Claude Code compatibility. 
-# -# Architecture: -# -# Claude Code (earth) Hyperstack VM (A100 80GB) -# ┌─────────────┐ ┌──────────────────────────────┐ -# │ claude CLI │── Anthropic API ──> │ LiteLLM proxy (:4000) │ -# │ │ /v1/messages │ translates Anthropic → │ -# │ │ via WireGuard wg1 │ OpenAI chat completions │ -# └─────────────┘ │ │ │ -# │ ▼ │ -# OpenCode (earth) │ vLLM engine (:11434) │ -# ┌─────────────┐ │ /v1/chat/completions │ -# │ opencode │── OpenAI API ──────> │ FlashAttention v2 │ -# │ │ /v1/chat/completions│ prefix caching │ -# └─────────────┘ │ bullpoint/Qwen3-Coder- │ -# │ Next-AWQ-4bit (45GB) │ -# └──────────────────────────────┘ -# -# Why vLLM instead of Ollama: -# - FlashAttention v2: ~1.5-2x faster prefill for long prompts -# - Block-level prefix caching: partial KV cache reuse even when prompt -# changes mid-sequence (Ollama requires exact prefix match from token 0) -# - Chunked prefill: can interleave prefill and decode -# - Marlin kernels for AWQ MoE quantization -# -# Why LiteLLM: -# - Claude Code speaks Anthropic Messages API (/v1/messages) only -# - vLLM speaks OpenAI Chat Completions API (/v1/chat/completions) only -# - LiteLLM translates between them, mapping Claude model names to the -# actual vLLM model -# -# Model details: -# - Name: bullpoint/Qwen3-Coder-Next-AWQ-4bit (HuggingFace) -# - Architecture: MoE, 80B total params, 3B active per token -# - 512 experts, 10 activated + 1 shared per token -# - Hybrid attention: Gated DeltaNet + Gated Attention (48 layers) -# - Quantization: AWQ 4-bit, group size 32 -# - Disk size: ~45GB (vs ~151GB at BF16) -# - VRAM usage: ~45GB weights + ~27GB KV cache at 92% utilization -# - Context: 262,144 tokens (256k native) -# - vLLM requirement: >= 0.15.0 -# -# Hardware requirements: -# - Minimum: 1x A100 80GB (PCIe or SXM) -# - VRAM breakdown at gpu_memory_utilization=0.92: -# Model weights: ~45 GiB -# KV cache: ~27 GiB (298k tokens capacity, 4.49x concurrency at 262k) -# CUDA graphs: ~3 GiB -# Total: ~75 GiB / 80 GiB -# -# Ports: 
-# 11434/tcp - vLLM OpenAI-compatible API (reuses Ollama port for firewall compat) -# 4000/tcp - LiteLLM Anthropic-compatible proxy -# Both restricted to 192.168.3.0/24 (WireGuard wg1 subnet) - -# =========================================================================== -# STEP 1: Prerequisites -# =========================================================================== -# - VM with NVIDIA GPU, CUDA drivers, and Docker with nvidia-container-toolkit -# - WireGuard wg1 tunnel already configured (see wg1-setup.sh) -# - Ollama stopped and disabled if previously running: -# -# sudo systemctl stop ollama -# sudo systemctl disable ollama - -# =========================================================================== -# STEP 2: Storage setup -# =========================================================================== -# HuggingFace model cache on ephemeral storage (fast NVMe, survives reboots -# on some providers but not guaranteed — model will re-download if lost). -# -# sudo mkdir -p /ephemeral/hug -# sudo chmod -R 0777 /ephemeral/hug - -# =========================================================================== -# STEP 3: vLLM Docker container -# =========================================================================== -# Pull and run vLLM. The model downloads on first start (~45GB, ~2.5 min). -# After download, model loading takes ~65s and CUDA graph capture ~35s. -# Total cold start: ~4-5 minutes. 
-# -# docker pull vllm/vllm-openai:latest -# -# docker run -d \ -# --gpus all \ -# --ipc=host \ -# --network host \ -# --name vllm_qwen3 \ -# --restart always \ -# -v /ephemeral/hug:/root/.cache/huggingface \ -# vllm/vllm-openai:latest \ -# --model bullpoint/Qwen3-Coder-Next-AWQ-4bit \ -# --tensor-parallel-size 1 \ -# --enable-auto-tool-choice \ -# --tool-call-parser qwen3_coder \ -# --enable-prefix-caching \ -# --gpu-memory-utilization 0.92 \ -# --max-model-len 262144 \ -# --host 0.0.0.0 \ -# --port 11434 -# -# Flags explained: -# --tensor-parallel-size 1 Single GPU (use 2/4 for multi-GPU setups) -# --enable-auto-tool-choice Enables function/tool calling -# --tool-call-parser qwen3_coder Parser for qwen3-coder tool format -# --enable-prefix-caching Block-level KV cache reuse across requests -# --gpu-memory-utilization 0.92 Use 92% of VRAM (rest for OS/overhead) -# --max-model-len 262144 Full 256k context window -# --port 11434 Reuse Ollama port for firewall compatibility -# -# Verify startup (wait for "Application startup complete"): -# docker logs -f vllm_qwen3 2>&1 | grep -E "startup complete|Error" -# -# Verify model loaded: -# curl -s http://localhost:11434/v1/models | python3 -m json.tool -# -# Quick inference test: -# curl -s http://localhost:11434/v1/chat/completions \ -# -H "Content-Type: application/json" \ -# -H "Authorization: Bearer EMPTY" \ -# -d '{"model":"bullpoint/Qwen3-Coder-Next-AWQ-4bit", -# "messages":[{"role":"user","content":"Hello"}], -# "max_tokens":50}' -# -# Monitor performance (prefix cache hit rate, throughput): -# docker logs -f vllm_qwen3 2>&1 | grep "Engine 000" - -# =========================================================================== -# STEP 4: LiteLLM proxy (Anthropic API translation for Claude Code) -# =========================================================================== -# Install in a Python venv (Ubuntu 24.04 requires this): -# -# sudo apt-get install -y python3.12-venv -# sudo mkdir -p /ephemeral/litellm-env -# 
sudo chown ubuntu:ubuntu /ephemeral/litellm-env -# python3 -m venv /ephemeral/litellm-env -# /ephemeral/litellm-env/bin/pip install "litellm[proxy]" -# -# Write config file: -# -# sudo tee /ephemeral/litellm-config.yaml > /dev/null << "YAML" -# model_list: -# - model_name: "claude-sonnet-4-20250514" -# litellm_params: -# model: "hosted_vllm/bullpoint/Qwen3-Coder-Next-AWQ-4bit" -# api_base: "http://localhost:11434/v1" -# api_key: "EMPTY" -# - model_name: "claude-opus-4-20250514" -# litellm_params: -# model: "hosted_vllm/bullpoint/Qwen3-Coder-Next-AWQ-4bit" -# api_base: "http://localhost:11434/v1" -# api_key: "EMPTY" -# - model_name: "claude-opus-4-6-20260604" -# litellm_params: -# model: "hosted_vllm/bullpoint/Qwen3-Coder-Next-AWQ-4bit" -# api_base: "http://localhost:11434/v1" -# api_key: "EMPTY" -# - model_name: "claude-3-5-haiku-20241022" -# litellm_params: -# model: "hosted_vllm/bullpoint/Qwen3-Coder-Next-AWQ-4bit" -# api_base: "http://localhost:11434/v1" -# api_key: "EMPTY" -# -# litellm_settings: -# drop_params: true -# -# general_settings: -# master_key: "sk-litellm-master" -# YAML -# -# Config notes: -# - model_name values must match what Claude Code sends (Claude model IDs) -# - "hosted_vllm/" prefix forces LiteLLM to use /v1/chat/completions -# (not /v1/responses which vLLM doesn't fully support for complex messages) -# - drop_params: true — silently drops Claude-specific parameters like -# context_management that vLLM doesn't understand -# - master_key is the API key clients must send -# - Add new model_name entries when Anthropic releases new model IDs -# -# Start LiteLLM: -# -# nohup /ephemeral/litellm-env/bin/litellm \ -# --config /ephemeral/litellm-config.yaml \ -# --host 0.0.0.0 \ -# --port 4000 \ -# > /ephemeral/litellm.log 2>&1 & -# -# Verify: -# curl -s http://localhost:4000/v1/messages \ -# -H "Content-Type: application/json" \ -# -H "x-api-key: sk-litellm-master" \ -# -H "anthropic-version: 2023-06-01" \ -# -d 
'{"model":"claude-opus-4-6-20260604","max_tokens":50, -# "messages":[{"role":"user","content":"Hello"}]}' -# -# For production, create a systemd service instead of nohup: -# -# sudo tee /etc/systemd/system/litellm.service > /dev/null << "UNIT" -# [Unit] -# Description=LiteLLM Proxy -# After=network.target docker.service -# Requires=docker.service -# -# [Service] -# Type=simple -# User=ubuntu -# ExecStart=/ephemeral/litellm-env/bin/litellm \ -# --config /ephemeral/litellm-config.yaml \ -# --host 0.0.0.0 --port 4000 -# Restart=always -# RestartSec=5 -# -# [Install] -# WantedBy=multi-user.target -# UNIT -# -# sudo systemctl daemon-reload -# sudo systemctl enable --now litellm - -# =========================================================================== -# STEP 5: Firewall rules -# =========================================================================== -# Allow access from WireGuard subnet only: -# -# sudo ufw allow from 192.168.3.0/24 to any port 11434 proto tcp \ -# comment 'vLLM via wg1' -# sudo ufw allow from 192.168.3.0/24 to any port 4000 proto tcp \ -# comment 'LiteLLM proxy via wg1' - -# =========================================================================== -# STEP 6: Client configuration (on earth / local machine) -# =========================================================================== -# -# --- Claude Code --- -# Launch with environment variables pointing at LiteLLM proxy: -# -# ANTHROPIC_BASE_URL=http://192.168.3.1:4000 \ -# ANTHROPIC_API_KEY=sk-litellm-master \ -# claude --model claude-opus-4-6-20260604 --dangerously-skip-permissions -# -# Fish shell alias (add to ~/.config/fish/config.fish): -# -# alias claude-local='ANTHROPIC_BASE_URL=http://192.168.3.1:4000 \ -# ANTHROPIC_API_KEY=sk-litellm-master \ -# claude --model claude-opus-4-6-20260604 --dangerously-skip-permissions' -# -# --- OpenCode --- -# Connects directly to vLLM (no LiteLLM needed, speaks OpenAI natively): -# -# OPENAI_BASE_URL=http://192.168.3.1:11434/v1 \ -# 
OPENAI_API_KEY=EMPTY \ -# opencode -# -# Model name in OpenCode config: bullpoint/Qwen3-Coder-Next-AWQ-4bit - -# =========================================================================== -# STEP 7: Monitoring & troubleshooting -# =========================================================================== -# -# --- Live engine stats --- -# vLLM logs engine metrics every 10 seconds. Key fields: -# - Avg prompt throughput: prefill speed (tokens/s), higher = faster -# - Avg generation throughput: decode speed (tokens/s), ~40-99 on A100 PCIe -# - GPU KV cache usage: % of KV cache memory in use (proportional to -# active context length vs max capacity) -# - Prefix cache hit rate: % of prompt tokens served from cache (0% for -# Claude Code, higher for OpenCode) -# - Running/Waiting: active and queued request counts -# -# Follow live (all stats): -# docker logs -f vllm_qwen3 2>&1 | grep "Engine 000" -# -# Example output: -# Engine 000: Avg prompt throughput: 5555.2 tokens/s, -# Avg generation throughput: 49.4 tokens/s, -# Running: 1 reqs, Waiting: 0 reqs, -# GPU KV cache usage: 4.6%, -# Prefix cache hit rate: 0.0% -# -# --- Request-level monitoring --- -# See individual HTTP requests (method, status, duration): -# docker logs -f vllm_qwen3 2>&1 | grep "POST" -# -# Example output: -# 127.0.0.1:41864 - "POST /v1/chat/completions HTTP/1.1" 200 OK -# -# --- One-liner: last minute stats --- -# Useful for periodic checks without following the log: -# docker logs --since 1m vllm_qwen3 2>&1 | grep "Engine 000" -# -# --- LiteLLM proxy log --- -# tail -f /ephemeral/litellm.log -# -# --- GPU hardware stats --- -# Snapshot: -# nvidia-smi -# -# Continuous (every 5 seconds): -# nvidia-smi --query-gpu=temperature.gpu,utilization.gpu,power.draw,memory.used \ -# --format=csv -l 5 -# -# --- Interpreting the stats --- -# -# Healthy baseline (A100 80GB PCIe, qwen3-coder-next AWQ 4-bit): -# Prefill throughput: 5,000-11,000 tok/s (bursts higher during batch prefill) -# Decode throughput: 
40-99 tok/s (varies with output length per sample) -# KV cache usage: 0-5% for short conversations, grows with context -# (100% = 298k tokens, at which point requests queue) -# Prefix cache hit: 0% for Claude Code (expected, it mutates prompt prefix) -# >50% for OpenCode after a few turns -# Temperature: 44-60C under load, <45C idle -# Power: 70W idle, 230-240W under load, 300W max -# -# Warning signs: -# - Waiting > 0 for extended periods → requests queuing, model overloaded -# - KV cache usage near 100% → context too long, reduce --max-model-len -# - Decode throughput < 20 tok/s sustained → possible thermal throttling -# - Prefill throughput < 2,000 tok/s → check for CPU offload or driver issues -# -# Common issues: -# -# 1. OOM on startup with --max-model-len 262144 -# → Reduce to 131072 or 65536 -# -# 2. "model does not exist" from vLLM -# → Model name in LiteLLM config must exactly match HuggingFace repo name -# -# 3. LiteLLM returns UnsupportedParamsError -# → Ensure drop_params: true is in litellm_settings -# -# 4. LiteLLM routes to /v1/responses instead of /v1/chat/completions -# → Use "hosted_vllm/" prefix in model field, not "openai/" -# -# 5. Claude Code "Auth conflict" warning -# → Run `claude /logout` first to clear the claude.ai session token, -# then re-launch with ANTHROPIC_API_KEY=sk-litellm-master -# -# 6. Prefix cache hit rate stays at 0% -# → Normal for Claude Code (it mutates the prompt prefix each turn) -# → OpenCode should show increasing cache hit rates after a few turns -# -# 7. vLLM container won't start (CUDA version mismatch) -# → Check driver version: nvidia-smi -# → vLLM requires CUDA >= 12.x and driver >= 535 - -# =========================================================================== -# STEP 8: Loading / switching models -# =========================================================================== -# -# vLLM serves one model per container. 
To switch models, stop the current -# container and start a new one with different --model. -# -# --- Stop current model --- -# docker stop vllm_qwen3 -# docker rm vllm_qwen3 -# -# --- Run a different model --- -# Replace --model, --name, and adjust --max-model-len and --tool-call-parser -# as needed. The HuggingFace model downloads automatically on first start. -# -# Example: qwen3-coder:30b (smaller, faster, fits easily on A100 80GB) -# -# docker run -d \ -# --gpus all \ -# --ipc=host \ -# --network host \ -# --name vllm_qwen3_30b \ -# --restart always \ -# -v /ephemeral/hug:/root/.cache/huggingface \ -# vllm/vllm-openai:latest \ -# --model Qwen/Qwen3-Coder-30B-AWQ \ -# --tensor-parallel-size 1 \ -# --enable-auto-tool-choice \ -# --tool-call-parser qwen3_coder \ -# --enable-prefix-caching \ -# --gpu-memory-utilization 0.92 \ -# --max-model-len 131072 \ -# --host 0.0.0.0 \ -# --port 11434 -# -# Example: full-precision model on multi-GPU (e.g. 4x H100) -# -# docker run -d \ -# --gpus all \ -# --ipc=host \ -# --network host \ -# --name vllm_qwen3_fp16 \ -# --restart always \ -# -v /ephemeral/hug:/root/.cache/huggingface \ -# vllm/vllm-openai:latest \ -# --model Qwen/Qwen3-Coder-Next \ -# --tensor-parallel-size 4 \ -# --enable-auto-tool-choice \ -# --tool-call-parser qwen3_coder \ -# --enable-prefix-caching \ -# --gpu-memory-utilization 0.90 \ -# --max-model-len 262144 \ -# --host 0.0.0.0 \ -# --port 11434 -# -# --- Update LiteLLM config to match --- -# After switching models, update the model field in litellm-config.yaml -# to match the new HuggingFace model name: -# -# model: "hosted_vllm/<new-model-name>" -# -# Then restart LiteLLM: -# pkill -f litellm -# nohup /ephemeral/litellm-env/bin/litellm \ -# --config /ephemeral/litellm-config.yaml \ -# --host 0.0.0.0 --port 4000 \ -# > /ephemeral/litellm.log 2>&1 & -# -# --- Finding models --- -# Search HuggingFace for vLLM-compatible quantized models: -# https://huggingface.co/models?search=<model-name>+awq -# 
https://huggingface.co/models?search=<model-name>+gptq -# -# Supported quantization formats in vLLM: -# - AWQ (recommended): fast Marlin kernels, good quality -# - GPTQ: similar to AWQ, widely available -# - FP8: 8-bit, needs Hopper+ GPUs (H100/H200) -# - BF16/FP16: full precision, needs more VRAM -# -# --- VRAM sizing guide --- -# Rule of thumb for single A100 80GB at 92% utilization (~75 GiB usable): -# -# Model size (params) | AWQ 4-bit VRAM | Max context (remaining for KV) -# ---------------------|----------------|------------------------------- -# 7-8B | ~5 GiB | 262k+ (plenty of KV headroom) -# 14B | ~9 GiB | 262k+ (plenty of KV headroom) -# 30-32B | ~18 GiB | 262k (~57 GiB for KV cache) -# 70-80B (MoE, 3B act) | ~45 GiB | 262k (~27 GiB for KV cache) -# 70B (dense) | ~38 GiB | 131k (~37 GiB for KV cache) -# 120B+ | won't fit | use multi-GPU or smaller quant -# -# If vLLM OOMs on startup, reduce --max-model-len first (halving it roughly -# halves KV cache memory). If still OOM, reduce --gpu-memory-utilization -# to 0.85 or try a smaller model. 
-# -# --- Verifying the new model --- -# Check loaded model: -# curl -s http://localhost:11434/v1/models | python3 -m json.tool -# -# Test inference: -# curl -s http://localhost:11434/v1/chat/completions \ -# -H "Content-Type: application/json" \ -# -H "Authorization: Bearer EMPTY" \ -# -d '{"model":"<model-name>", -# "messages":[{"role":"user","content":"Hello"}], -# "max_tokens":50}' -# -# Test via LiteLLM (Anthropic API): -# curl -s http://localhost:4000/v1/messages \ -# -H "Content-Type: application/json" \ -# -H "x-api-key: sk-litellm-master" \ -# -H "anthropic-version: 2023-06-01" \ -# -d '{"model":"claude-opus-4-6-20260604","max_tokens":50, -# "messages":[{"role":"user","content":"Hello"}]}' - -# =========================================================================== -# Performance characteristics (A100 80GB PCIe, single GPU) -# =========================================================================== -# -# Measured on 2026-03-16 with bullpoint/Qwen3-Coder-Next-AWQ-4bit: -# -# vLLM prefill throughput: 5,000-11,000 tok/s (FlashAttention v2) -# vLLM decode throughput: 40-99 tok/s (memory-bandwidth limited) -# Per-turn latency: ~10-15s (small prompts, early conversation) -# KV cache usage: 2-5% for typical coding sessions -# Prefix cache hit rate: 0% (Claude Code), expected >50% (OpenCode) -# -# Comparison with Ollama on same hardware (A100 80GB PCIe): -# -# | Ollama (Q4_K_M) | vLLM (AWQ 4-bit) -# -----------------------|-----------------------|---------------------- -# Prefill throughput | ~1,000 tok/s (est.) 
| 5,000-11,000 tok/s -# Decode throughput | ~40 tok/s | 40-99 tok/s -# Per-turn latency | ~28s (32k ctx) | ~10-15s -# Context window | 32k (was truncating) | 262k (full, no truncation) -# Prefix cache (Claude) | 0% always | 0% always -# Prefix cache (OpenCode)| 85-95% when warm | expected similar or better -# VRAM usage | 52-61 GiB | 75 GiB (more KV cache) diff --git a/snippets/hyperstack/wg1-setup.sh b/snippets/hyperstack/wg1-setup.sh deleted file mode 100755 index 67f139d..0000000 --- a/snippets/hyperstack/wg1-setup.sh +++ /dev/null @@ -1,414 +0,0 @@ -#!/bin/bash -# -# wg1-setup.sh - Set up WireGuard wg1 tunnel between earth and a hyperstack VM -# -# USAGE: -# ./wg1-setup.sh <VM_PUBLIC_IP> [SERVER_WG_IP] [WG_HOSTNAME] -# -# VM_PUBLIC_IP Public IP of the hyperstack VM (required) -# SERVER_WG_IP WireGuard IP to assign to this VM's tunnel interface (default: 192.168.3.1) -# Use 192.168.3.3 for hyperstack2 when hyperstack1 is already set up. -# WG_HOSTNAME Hostname mapped to SERVER_WG_IP in /etc/hosts (default: <vmhostname>.wg1) -# -# EXAMPLES: -# ./wg1-setup.sh 185.216.20.163 # VM1 (hyperstack1, 192.168.3.1) -# ./wg1-setup.sh 185.216.20.200 192.168.3.3 hyperstack2.wg1 # VM2 added to existing tunnel -# -# NETWORK DESIGN: -# Subnet: 192.168.3.0/24 (separate from wg0's 192.168.2.0/24) -# Port: 56710/udp -# -# +----------------+ +------------------+ -# | earth (client) | | hyperstack1 (VM) | -# | 192.168.3.2 |<--- WireGuard ---> | 192.168.3.1 | -# +----------------+ tunnel +------------------+ -# | | vLLM :11434 | -# | +------------------+ -# | +------------------+ -# +--------- WireGuard ----------> | hyperstack2 (VM) | -# | 192.168.3.3 | -# +------------------+ -# | vLLM :11434 | -# +------------------+ -# -# WHAT THIS SCRIPT DOES: -# -# For the FIRST VM (SERVER_WG_IP = 192.168.3.1, default): -# Generates fresh key-pairs and REPLACES /etc/wireguard/wg1.conf on earth with -# a single-peer config pointing to this VM. 
-# -# For ADDITIONAL VMs (any other SERVER_WG_IP, e.g. 192.168.3.3): -# Generates new server-side keys and ADDS or UPDATES just the new [Peer] block -# in the existing /etc/wireguard/wg1.conf, preserving the [Interface] section -# (client key-pair) and any other peers already present. -# The existing client public key from wg1.conf is extracted and used in the new -# VM's server config so it can encrypt traffic to earth. -# -# On every hyperstack VM (via SSH): -# - Installs WireGuard if not present -# - Creates /etc/wireguard/wg1.conf with SERVER_WG_IP as the tunnel address -# - Opens UFW ports: 56710/udp (WireGuard), 11434/tcp from 192.168.3.0/24 -# - Starts wg-quick@wg1 -# -# On earth (locally): -# - Installs WireGuard if not present (dnf) -# - Creates or updates /etc/wireguard/wg1.conf (see above) -# - Adds SERVER_WG_IP <-> WG_HOSTNAME mapping to /etc/hosts -# - Restarts wg-quick@wg1 -# -# PREREQUISITES: -# - SSH access to ubuntu@<VM_IP> with key-based auth -# - UDP port 56710 open in cloud provider's firewall/security group -# -# RE-RUNNING: -# When a VM IP changes, simply re-run this script with the new IP. -# It will regenerate keys and update configs on both sides. -# - -set -euo pipefail - -# Fixed network constants that must match hyperstack-vm*.toml [network] section. 
WG_INTERFACE="wg1"
WG_PORT="56710"
DEFAULT_SERVER_WG_IP="192.168.3.1"
CLIENT_WG_IP="192.168.3.2"
SUBNET_MASK="24"
SSH_USER="ubuntu"
# SSH behaviour can be tuned through HYPERSTACK_SSH_* environment variables.
SSH_PORT="${HYPERSTACK_SSH_PORT:-22}"
SSH_CONNECT_TIMEOUT="${HYPERSTACK_SSH_CONNECT_TIMEOUT:-10}"
SSH_KNOWN_HOSTS_PATH="${HYPERSTACK_SSH_KNOWN_HOSTS_PATH:-}"
SSH_PRIVATE_KEY_PATH="${HYPERSTACK_SSH_PRIVATE_KEY_PATH:-}"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Print one message ($1) wrapped in the matching colour escape sequence.
print_warning() { echo -e "${YELLOW}$1${NC}"; }
print_success() { echo -e "${GREEN}$1${NC}"; }
print_error() { echo -e "${RED}$1${NC}"; }

# Identity of the script's own stdin (e.g. /dev/pts/0 or pipe:[inode]), recorded
# before any per-call redirection. retry_ssh compares against it to detect a
# heredoc / file that was attached to one specific call. Empty when /proc is
# unavailable, in which case retry_ssh never buffers stdin.
SCRIPT_STDIN_ID="$(readlink /proc/self/fd/0 2>/dev/null || true)"

# Retry wrapper for SSH/SCP commands that may fail due to transient
# connection resets (e.g. sshd restart from unattended-upgrades).
#
# Arguments: the command (and its arguments) to run.
# Returns:   0 as soon as one attempt succeeds, 1 after 5 failed attempts.
#
# When the caller redirected stdin for this call (`retry_ssh ... << 'EOF'`),
# the input is buffered once and replayed on every attempt. Without this the
# first attempt consumes the heredoc and each retry would run the remote
# `bash -s` on empty input and report success without having done anything.
retry_ssh() {
  local max_attempts=5
  local attempt=1
  local delay=10
  local stdin_id=""
  local replay_stdin=false
  local stdin_payload=""
  local succeeded

  stdin_id="$(readlink /proc/self/fd/0 2>/dev/null || true)"
  if [[ -n "$stdin_id" && "$stdin_id" != "${SCRIPT_STDIN_ID:-}" ]]; then
    replay_stdin=true
    # Only a per-call redirection is slurped, so this cannot block on the
    # terminal or on a long-lived pipe inherited from the parent process.
    stdin_payload="$(cat)" || true
  fi

  while true; do
    succeeded=false
    if [[ "$replay_stdin" == "true" ]]; then
      # The here-string re-adds the trailing newline stripped by $(...).
      if "$@" <<< "$stdin_payload"; then
        succeeded=true
      fi
    elif "$@"; then
      succeeded=true
    fi
    if [[ "$succeeded" == "true" ]]; then
      return 0
    fi
    if [[ $attempt -ge $max_attempts ]]; then
      print_error "Command failed after ${max_attempts} attempts: $*"
      return 1
    fi
    echo "  SSH attempt ${attempt}/${max_attempts} failed, retrying in ${delay}s..."
    sleep "$delay"
    attempt=$((attempt + 1))
    delay=$((delay + 5))
  done
}

# Option arrays shared by every ssh/scp invocation. They differ only in the
# port flag (-p for ssh, -P for scp).
SSH_BASE_OPTS=(-o "ConnectTimeout=${SSH_CONNECT_TIMEOUT}" -o BatchMode=yes -p "${SSH_PORT}")
SCP_BASE_OPTS=(-o "ConnectTimeout=${SSH_CONNECT_TIMEOUT}" -o BatchMode=yes -P "${SSH_PORT}")
if [[ -n "${SSH_KNOWN_HOSTS_PATH}" ]]; then
  SSH_BASE_OPTS+=(-o StrictHostKeyChecking=yes -o "UserKnownHostsFile=${SSH_KNOWN_HOSTS_PATH}")
  SCP_BASE_OPTS+=(-o StrictHostKeyChecking=yes -o "UserKnownHostsFile=${SSH_KNOWN_HOSTS_PATH}")
fi
if [[ -n "${SSH_PRIVATE_KEY_PATH}" ]]; then
  if [[ -f "${SSH_PRIVATE_KEY_PATH}" ]]; then
    SSH_BASE_OPTS+=(-i "${SSH_PRIVATE_KEY_PATH}")
    SCP_BASE_OPTS+=(-i "${SSH_PRIVATE_KEY_PATH}")
  else
    # A mistyped key path otherwise only shows up as an opaque auth failure.
    print_warning "HYPERSTACK_SSH_PRIVATE_KEY_PATH=${SSH_PRIVATE_KEY_PATH} is not a file; using default SSH identities." >&2
  fi
fi

# Run a command on the VM as ${SSH_USER}. VM_IP is set later from the script's
# first argument; it is only read when the function is called.
ssh_vm() {
  ssh "${SSH_BASE_OPTS[@]}" "${SSH_USER}@${VM_IP}" "$@"
}

# scp with the shared options; callers pass source and destination themselves.
scp_vm() {
  scp "${SCP_BASE_OPTS[@]}" "$@"
}

# Updates or adds a [Peer] block in the existing /etc/wireguard/wg1.conf.
# Preserves the [Interface] section and any other peers; only the block for
# SERVER_WG_IP (matched by AllowedIPs) is replaced.
# Uses python3 for regex-based manipulation of the INI-style [Peer] blocks.
#
# Arguments:
#   $1 - WireGuard tunnel IP of the VM (identifies its peer via AllowedIPs)
#   $2 - WireGuard public key of the VM
#   $3 - public IP of the VM (becomes the peer's Endpoint)
# Globals:  WG_PORT (read)
# Returns:  0 on success; non-zero when the config cannot be read, rewritten
#           or installed. Temp copies are removed on every path because they
#           contain the client private key.
update_peer_in_client_config() {
  local server_ip="$1"
  local server_pubkey="$2"
  local vm_ip="$3"
  local tmpfile conf_copy
  local rc=0

  tmpfile=$(mktemp) || return 1
  conf_copy=$(mktemp) || { rm -f "$tmpfile"; return 1; }

  # /etc/wireguard/wg1.conf is root-owned; read it via sudo into a user-readable temp copy.
  if ! sudo cat /etc/wireguard/wg1.conf > "$conf_copy" 2>/dev/null; then
    print_error "Cannot read /etc/wireguard/wg1.conf. Run wg1-setup.sh for VM1 (192.168.3.1) first."
    rm -f "$tmpfile" "$conf_copy"
    return 1
  fi

  # `|| rc=$?` captures the failure instead of letting `set -e` abort the
  # script here, which would skip the cleanup below.
  python3 - "$server_ip" "$server_pubkey" "$vm_ip" "$WG_PORT" "$conf_copy" "$tmpfile" << 'PYEOF' || rc=$?
import sys, re

server_ip, server_pubkey, vm_ip, wg_port, conf_copy, tmpfile = sys.argv[1:]

with open(conf_copy) as f:
    content = f.read()

if not content.strip():
    print("ERROR: wg1.conf is empty. Run wg1-setup.sh for VM1 (192.168.3.1) first.", file=sys.stderr)
    sys.exit(1)

# Split into sections: [Interface] block + any [Peer] blocks.
# Each section starts with a [ header; split on newline-[ boundaries.
parts = re.split(r'(?=\n\[)', content)

# A peer belongs to this VM when its AllowedIPs line names server_ip/32.
# Whitespace around '=' is tolerated so hand-edited configs still match.
allowed_ips = re.compile(
    rf'^\s*AllowedIPs\s*=\s*{re.escape(server_ip)}/32\b', re.MULTILINE)

def is_stale_peer(section):
    return bool(re.search(r'^\[Peer\]', section.lstrip())
                and allowed_ips.search(section))

kept = [p for p in parts if not is_stale_peer(p)]

new_peer = f"""
[Peer]
# hyperstack VM ({server_ip})
PublicKey = {server_pubkey}
Endpoint = {vm_ip}:{wg_port}
AllowedIPs = {server_ip}/32
PersistentKeepalive = 25"""

result = ''.join(kept).rstrip('\n') + '\n' + new_peer + '\n'

with open(tmpfile, 'w') as f:
    f.write(result)
print('peer-updated-ok')
PYEOF

  rm -f "$conf_copy"
  if [[ $rc -eq 0 ]]; then
    if ! { sudo cp "${tmpfile}" /etc/wireguard/wg1.conf \
      && sudo chmod 600 /etc/wireguard/wg1.conf; }; then
      rc=1
    fi
  fi
  rm -f "${tmpfile}"
  return "$rc"
}

# Validate arguments
if [[ $# -lt 1 ]]; then
  echo "Usage: $0 <VM_PUBLIC_IP> [SERVER_WG_IP] [WG_HOSTNAME]"
  echo "Example (VM1): $0 185.216.20.163"
  echo "Example (VM2): $0 185.216.20.200 192.168.3.3 hyperstack2.wg1"
  exit 1
fi

VM_IP="$1"
SERVER_WG_IP="${2:-${DEFAULT_SERVER_WG_IP}}"
# Default WG_HOSTNAME: replace the 192.168.3. prefix with 'hyperstack' and
# append .wg1 (192.168.3.3 -> hyperstack3.wg1). Any other address falls back
# to "<ip>.wg1". The former `sed ... || echo` fallback never ran, because sed
# exits 0 when nothing matches, so such addresses got the bare IP as hostname.
if [[ -n "${3:-}" ]]; then
  WG_HOSTNAME="$3"
elif [[ "$SERVER_WG_IP" == 192.168.3.* ]]; then
  WG_HOSTNAME="hyperstack${SERVER_WG_IP#192.168.3.}.wg1"
else
  WG_HOSTNAME="${SERVER_WG_IP}.wg1"
fi

# Determine mode: first VM replaces the entire client config; additional VMs add a peer.
IS_FIRST_VM=false
if [[ "$SERVER_WG_IP" == "$DEFAULT_SERVER_WG_IP" ]]; then
  IS_FIRST_VM=true
fi

echo "=============================================="
print_warning "IMPORTANT: Ensure UDP port ${WG_PORT} is open on the VM!"
print_warning "This must be configured in your cloud provider's"
print_warning "firewall/security group settings."
if [[ "$IS_FIRST_VM" == "false" ]]; then
  print_warning "Mode: ADD PEER — ${SERVER_WG_IP} (${WG_HOSTNAME}) will be added to existing wg1.conf."
  print_warning "Ensure the first VM (192.168.3.1) has already been set up."
fi
echo "=============================================="
echo ""
read -rp "Press Enter to continue (or Ctrl+C to abort)..."
echo ""

# Create temporary directory for key generation. It holds private keys, so it
# is removed on every exit path. The path is quoted inside the trap so that
# unusual characters cannot split the rm arguments.
TMPDIR=$(mktemp -d)
trap 'rm -rf -- "$TMPDIR"' EXIT

echo "=== Generating WireGuard keys locally ==="

# Generate server (hyperstack VM) keys — always fresh for each VM.
# Key generation needs the local `wg` binary, so make sure it exists before
# the first `wg genkey` rather than only at the later local-setup step.
if ! command -v wg >/dev/null 2>&1; then
  echo "Installing WireGuard locally..."
  sudo dnf install -y wireguard-tools
fi

wg genkey > "$TMPDIR/server-privatekey"
wg pubkey < "$TMPDIR/server-privatekey" > "$TMPDIR/server-publickey"
SERVER_PRIVATE_KEY=$(< "$TMPDIR/server-privatekey")
SERVER_PUBLIC_KEY=$(< "$TMPDIR/server-publickey")

if [[ "$IS_FIRST_VM" == "true" ]]; then
  # First VM: generate fresh client keys; the entire wg1.conf will be replaced.
  wg genkey > "$TMPDIR/client-privatekey"
  wg pubkey < "$TMPDIR/client-privatekey" > "$TMPDIR/client-publickey"
  CLIENT_PRIVATE_KEY=$(< "$TMPDIR/client-privatekey")
  CLIENT_PUBLIC_KEY=$(< "$TMPDIR/client-publickey")
  print_success "Keys generated (first VM — full config will be replaced)"
else
  # Additional VM: reuse the existing client keys from /etc/wireguard/wg1.conf so that
  # the first VM's server config (which already stores the client public key) keeps working.
  #
  # Only a real `PrivateKey = <key>` line is accepted (not a comment that
  # mentions the word). `|| true` stops `set -e` from aborting on a missing
  # file, so the explanatory error below is actually printed.
  CLIENT_PRIVATE_KEY=$(sudo awk '$1 == "PrivateKey" && $2 == "=" { print $3; exit }' \
    /etc/wireguard/wg1.conf || true)
  if [[ -z "$CLIENT_PRIVATE_KEY" ]]; then
    print_error "Cannot extract client private key from /etc/wireguard/wg1.conf."
    print_error "Run this script for VM1 (192.168.3.1) first."
    exit 1
  fi
  CLIENT_PUBLIC_KEY=$(wg pubkey <<< "$CLIENT_PRIVATE_KEY")
  print_success "Keys generated (additional VM — client keys reused from existing wg1.conf)"
fi

echo ""
echo "=== Creating server (hyperstack VM ${SERVER_WG_IP}) configuration ==="

# Server side: the VM listens on WG_PORT and accepts exactly one peer (earth).
cat > "$TMPDIR/server-wg1.conf" << EOF
# WireGuard wg1 configuration for hyperstack VM (${SERVER_WG_IP})
# Server side of earth <-> hyperstack tunnel
# Generated by wg1-setup.sh on $(date)

[Interface]
Address = ${SERVER_WG_IP}/${SUBNET_MASK}
ListenPort = ${WG_PORT}
PrivateKey = ${SERVER_PRIVATE_KEY}

[Peer]
# earth (client)
PublicKey = ${CLIENT_PUBLIC_KEY}
AllowedIPs = ${CLIENT_WG_IP}/32
EOF

print_success "Server config created (server IP: ${SERVER_WG_IP})"

if [[ "$IS_FIRST_VM" == "true" ]]; then
  echo ""
  echo "=== Creating client (earth) configuration ==="

  # Client side: AllowedIPs is limited to the VM's /32, so only traffic for
  # the tunnel address is routed through wg1.
  cat > "$TMPDIR/client-wg1.conf" << EOF
# WireGuard wg1 configuration for earth
# Client side of earth <-> hyperstack tunnel
# Generated by wg1-setup.sh on $(date)

[Interface]
Address = ${CLIENT_WG_IP}/${SUBNET_MASK}
PrivateKey = ${CLIENT_PRIVATE_KEY}

[Peer]
# hyperstack VM (${SERVER_WG_IP})
PublicKey = ${SERVER_PUBLIC_KEY}
Endpoint = ${VM_IP}:${WG_PORT}
AllowedIPs = ${SERVER_WG_IP}/32
PersistentKeepalive = 25
EOF

  print_success "Client config created"
fi

echo ""
echo "=== Setting up hyperstack VM (${VM_IP}, tunnel IP ${SERVER_WG_IP}) ==="

echo "Testing SSH connection..."
retry_ssh ssh_vm "echo 'SSH OK'"
print_success "SSH connection OK"

echo "Installing WireGuard on hyperstack..."
# apt-get is used because apt's command-line interface is not meant for scripts.
retry_ssh ssh_vm "command -v wg >/dev/null 2>&1 || (sudo apt-get update && sudo apt-get install -y wireguard)"
print_success "WireGuard installed"

echo "Copying wg1.conf to hyperstack..."
# The file contains the VM's private key. scp creates the remote file with the
# local mode, so restrict it first; otherwise it sits world-readable in the
# VM's /tmp until the mv below.
chmod 600 "$TMPDIR/server-wg1.conf"
retry_ssh scp_vm "$TMPDIR/server-wg1.conf" "${SSH_USER}@${VM_IP}:/tmp/wg1.conf"
retry_ssh ssh_vm "sudo mv /tmp/wg1.conf /etc/wireguard/wg1.conf && sudo chmod 600 /etc/wireguard/wg1.conf"
print_success "Server config installed"

echo "Configuring firewall (ufw) on hyperstack..."
# The WireGuard and SSH ports are passed as positional arguments so the remote
# rules follow WG_PORT / SSH_PORT. Allowing only the `ssh` service (22/tcp)
# before enabling ufw would lock out a VM reached on a custom SSH port.
retry_ssh ssh_vm "bash -s -- ${WG_PORT} ${SSH_PORT}" << 'REMOTE_SCRIPT'
wg_port="$1"
ssh_port="$2"
sudo ufw allow "${ssh_port}/tcp" comment 'Allow SSH' 2>/dev/null || true
sudo ufw --force enable >/dev/null 2>&1 || true
sudo ufw allow "${wg_port}/udp" comment 'WireGuard wg1' 2>/dev/null || true
sudo ufw allow from 192.168.3.0/24 to any port 11434 proto tcp comment 'Ollama/vLLM via wg1' 2>/dev/null || true
echo "Firewall rules added"
REMOTE_SCRIPT
print_success "Firewall configured"

echo "Configuring Ollama to listen on 0.0.0.0 (if installed)..."
# Writes a systemd drop-in unless one already sets OLLAMA_HOST; the restart is
# best-effort because Ollama may not be installed or running on this VM.
retry_ssh ssh_vm bash -s << 'REMOTE_SCRIPT'
if [ -f /etc/systemd/system/ollama.service.d/override.conf ] && \
   grep -q 'OLLAMA_HOST' /etc/systemd/system/ollama.service.d/override.conf; then
  echo "Ollama override already configured, skipping"
else
  sudo mkdir -p /etc/systemd/system/ollama.service.d
  cat << 'OVERRIDE' | sudo tee /etc/systemd/system/ollama.service.d/override.conf > /dev/null
[Service]
Environment="OLLAMA_HOST=0.0.0.0:11434"
OVERRIDE
  sudo systemctl daemon-reload
  sudo systemctl restart ollama 2>/dev/null || echo "Note: Ollama not running or not installed"
fi
REMOTE_SCRIPT
print_success "Ollama configured"

echo "Starting wg1 on hyperstack..."
# restart (not start): on a re-run the interface is already up, and `start`
# would be a no-op that leaves the old keys loaded.
retry_ssh ssh_vm "sudo systemctl restart wg-quick@wg1 2>/dev/null || { sudo wg-quick down wg1 2>/dev/null; sudo wg-quick up wg1; }"
print_success "wg1 started on hyperstack"

echo ""
echo "=== Setting up earth (local) ==="

if ! command -v wg >/dev/null 2>&1; then
  echo "Installing WireGuard locally..."
  sudo dnf install -y wireguard-tools
fi
print_success "WireGuard installed locally"

if [[ "$IS_FIRST_VM" == "true" ]]; then
  echo "Installing fresh wg1.conf locally (first VM — replaces any existing config)..."
  sudo cp "$TMPDIR/client-wg1.conf" /etc/wireguard/wg1.conf
  sudo chmod 600 /etc/wireguard/wg1.conf
  print_success "Client config installed"
else
  echo "Adding peer ${SERVER_WG_IP} to existing wg1.conf (additional VM)..."
  update_peer_in_client_config "$SERVER_WG_IP" "$SERVER_PUBLIC_KEY" "$VM_IP"
  print_success "Peer added to client config"
fi

# Update /etc/hosts so that WG_HOSTNAME resolves to the VM's WireGuard IP.
# hyperstack.rb uses this hostname in test URLs and informational output.
echo "Updating /etc/hosts: ${SERVER_WG_IP} ${WG_HOSTNAME}..."
# Dots are escaped so the hostname is matched literally; space- and
# tab-separated entries are both removed.
hosts_pattern="${WG_HOSTNAME//./\\.}"
sudo sed -i "/[[:space:]]${hosts_pattern}\$/d" /etc/hosts  # Remove stale entry if present
echo "${SERVER_WG_IP} ${WG_HOSTNAME}" | sudo tee -a /etc/hosts > /dev/null
print_success "/etc/hosts updated"

echo "Restarting wg1 locally..."
sudo systemctl stop wg-quick@wg1 2>/dev/null || true
sudo systemctl start wg-quick@wg1
print_success "wg1 restarted locally"

echo ""
echo "=============================================="
print_success "Setup complete!"
echo "=============================================="
echo ""
echo "WireGuard wg1 tunnel peer active:"
echo "  hyperstack VM (server): ${SERVER_WG_IP} (${WG_HOSTNAME})"
echo "  earth (client):         ${CLIENT_WG_IP}"
echo ""
echo "=== Verification commands ==="
echo ""
echo "# Check tunnel status:"
echo "sudo wg show wg1"
echo ""
echo "# Ping hyperstack via tunnel:"
echo "ping -c 3 ${SERVER_WG_IP}"
echo ""
echo "# Verify default route is UNCHANGED:"
echo "ip route | grep default"
echo ""
echo "# Test vLLM access:"
echo "curl http://${WG_HOSTNAME}:11434/v1/models"
echo ""
echo "=== Manual start/stop commands ==="
echo ""
echo "# Stop tunnel:"
echo "sudo systemctl stop wg-quick@wg1"
echo ""
echo "# Start tunnel:"
echo "sudo systemctl start wg-quick@wg1"
echo ""
echo "# Restart on hyperstack (if VM rebooted):"
echo "ssh -p ${SSH_PORT} ${SSH_USER}@${VM_IP} 'sudo systemctl start wg-quick@wg1'"
