diff options
| -rw-r--r-- | README.md | 52 | ||||
| -rw-r--r-- | hyperstack-vm1.toml | 9 | ||||
| -rw-r--r-- | hyperstack-vm2.toml | 9 | ||||
| -rw-r--r-- | hyperstack.fish | 12 |
4 files changed, 73 insertions, 9 deletions
@@ -279,7 +279,7 @@ Available presets (both VMs share the same set): | Preset | Model | VRAM | Context | |---|---|---|---| -| `nemotron-super` | Nemotron-3-Super 120B (Mamba+MoE, 12B active) | ~60 GB | 262K | +| `nemotron-super` | Nemotron-3-Super 120B (Mamba+MoE, 12B active) | ~60 GB | 131K | | `qwen3-coder-next` | Qwen3-Coder-Next 80B (MoE, AWQ-4bit) | ~45 GB | 262K | | `gpt-oss-120b` | GPT-OSS 120B (MoE, MXFP4) | ~65 GB | 131K | | `gpt-oss-20b` | GPT-OSS 20B (MoE, MXFP4) | ~14 GB | 65K | @@ -331,6 +331,56 @@ set `HYPERSTACK_OPERATOR_CIDR` to override that detection when needed. SSH host keys are pinned per state file in `<state>.known_hosts`. `delete` and `--replace` clear that trust file for intentional reprovisioning; unexpected host key changes now fail closed. +## Automated setup reference + +`hyperstack.rb` handles the full VM lifecycle automatically. All steps below +(VM creation, WireGuard tunnel, vLLM Docker container) run in a single command. + +### Single-VM setup + +```bash +# Deploy VM, configure WireGuard tunnel, pull and start vLLM (~10 min) +ruby hyperstack.rb create + +# Run end-to-end inference test over the tunnel +ruby hyperstack.rb test + +# Launch Pi coding agent connected to GPT-OSS 120B on the VM +pi-hyperstack # fish abbreviation from hyperstack.fish + +# Tear down the VM and remove WireGuard peer +ruby hyperstack.rb delete +``` + +### Two-VM setup + +```bash +# Deploy both VMs in parallel, set up tunnel and vLLM on each (~10 min) +ruby hyperstack.rb create-both + +# Test each VM individually +ruby hyperstack.rb --config hyperstack-vm1.toml test +ruby hyperstack.rb --config hyperstack-vm2.toml test + +# Launch Pi coding agents — one per terminal +pi-hyperstack-nemotron # fish abbreviation → Nemotron-3-Super 120B on VM1 +pi-hyperstack-coder # fish abbreviation → Qwen3-Coder-Next 80B on VM2 + +# Tear down both VMs +ruby hyperstack.rb delete-both +``` + +### Hot-switching models without reprovisioning + +```bash +# Switch the running vLLM container to a different model preset +ruby hyperstack.rb --config hyperstack-vm1.toml model switch qwen3-coder-next +ruby hyperstack.rb --config hyperstack-vm2.toml model switch nemotron-super +``` + +See the [VM configuration](#vm-configuration) and [Switching models](#switching-models) +sections for available presets and config options. + ## Manual vLLM Docker setup This section covers manual vLLM deployment for debugging or running outside the diff --git a/hyperstack-vm1.toml b/hyperstack-vm1.toml index 6109472..e101bec 100644 --- a/hyperstack-vm1.toml +++ b/hyperstack-vm1.toml @@ -86,17 +86,20 @@ tensor_parallel_size = 1 tool_call_parser = "qwen3_coder" # NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total). -# ~60 GB weights on A100 80GB. Uses NoPE so context can be set to 1M; no YaRN needed. +# ~60 GB weights on A100 80GB; ~13 GB remaining for KV cache at 0.92 utilisation. +# Uses NoPE so any context length is valid; capped at 131072 to keep KV cache within VRAM budget. # Requires trust_remote_code=true for the nemotron_h architecture. [vllm.presets.nemotron-super] model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit" container_name = "vllm_nemotron_super" -max_model_len = 262144 +max_model_len = 131072 gpu_memory_utilization = 0.92 tensor_parallel_size = 1 tool_call_parser = "qwen3_xml" trust_remote_code = true -extra_vllm_args = ["--reasoning-parser", "nemotron_v3"] +# --enforce-eager disables CUDA graph capture, freeing ~3-4 GB of VRAM the model +# otherwise needs alongside the ~60 GB weights. Trades some throughput for stability. +extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"] # OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100. [vllm.presets.gpt-oss-20b] diff --git a/hyperstack-vm2.toml b/hyperstack-vm2.toml index 202a340..d3c0a17 100644 --- a/hyperstack-vm2.toml +++ b/hyperstack-vm2.toml @@ -83,17 +83,20 @@ tensor_parallel_size = 1 tool_call_parser = "qwen3_coder" # NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total). -# ~60 GB weights on A100 80GB. Uses NoPE so context can be set to 1M; no YaRN needed. +# ~60 GB weights on A100 80GB; ~13 GB remaining for KV cache at 0.92 utilisation. +# Uses NoPE so any context length is valid; capped at 131072 to keep KV cache within VRAM budget. # Requires trust_remote_code=true for the nemotron_h architecture. [vllm.presets.nemotron-super] model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit" container_name = "vllm_nemotron_super" -max_model_len = 262144 +max_model_len = 131072 gpu_memory_utilization = 0.92 tensor_parallel_size = 1 tool_call_parser = "qwen3_xml" trust_remote_code = true -extra_vllm_args = ["--reasoning-parser", "nemotron_v3"] +# --enforce-eager disables CUDA graph capture, freeing ~3-4 GB of VRAM the model +# otherwise needs alongside the ~60 GB weights. Trades some throughput for stability. +extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"] # OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100. [vllm.presets.gpt-oss-20b] diff --git a/hyperstack.fish b/hyperstack.fish index 1b32a37..09706b5 100644 --- a/hyperstack.fish +++ b/hyperstack.fish @@ -1,3 +1,11 @@ +# Single-VM setup (hyperstack-vm.toml → hyperstack.wg1) abbr pi-hyperstack pi --model hyperstack/openai/gpt-oss-120b -abbr pi-hyperstack-nemotron pi --model hyperstack1/cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit -abbr pi-hyperstack-coder pi --model hyperstack2/bullpoint/Qwen3-Coder-Next-AWQ-4bit +abbr hyperstack-create ruby ~/git/hyperstack/hyperstack.rb create +abbr hyperstack-delete ruby ~/git/hyperstack/hyperstack.rb delete +abbr hyperstack-test ruby ~/git/hyperstack/hyperstack.rb test + +# Dual-VM setup (hyperstack-vm1/vm2.toml → hyperstack1/2.wg1) +abbr pi-hyperstack-nemotron pi --model hyperstack1/cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit +abbr pi-hyperstack-coder pi --model hyperstack2/bullpoint/Qwen3-Coder-Next-AWQ-4bit +abbr hyperstack-create-both ruby ~/git/hyperstack/hyperstack.rb create-both +abbr hyperstack-delete-both ruby ~/git/hyperstack/hyperstack.rb delete-both |
