summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- README.md 52
-rw-r--r-- hyperstack-vm1.toml 9
-rw-r--r-- hyperstack-vm2.toml 9
-rw-r--r-- hyperstack.fish 12
4 files changed, 73 insertions, 9 deletions
diff --git a/README.md b/README.md
index 97a4836..95cd30d 100644
--- a/README.md
+++ b/README.md
@@ -279,7 +279,7 @@ Available presets (both VMs share the same set):
| Preset | Model | VRAM | Context |
|---|---|---|---|
-| `nemotron-super` | Nemotron-3-Super 120B (Mamba+MoE, 12B active) | ~60 GB | 262K |
+| `nemotron-super` | Nemotron-3-Super 120B (Mamba+MoE, 12B active) | ~60 GB | 131K |
| `qwen3-coder-next` | Qwen3-Coder-Next 80B (MoE, AWQ-4bit) | ~45 GB | 262K |
| `gpt-oss-120b` | GPT-OSS 120B (MoE, MXFP4) | ~65 GB | 131K |
| `gpt-oss-20b` | GPT-OSS 20B (MoE, MXFP4) | ~14 GB | 65K |
@@ -331,6 +331,56 @@ set `HYPERSTACK_OPERATOR_CIDR` to override that detection when needed.
SSH host keys are pinned per state file in `<state>.known_hosts`. `delete` and `--replace`
clear that trust file for intentional reprovisioning; unexpected host key changes now fail closed.
+## Automated setup reference
+
+`hyperstack.rb` handles the full VM lifecycle automatically. Provisioning (VM creation,
+WireGuard tunnel, vLLM Docker container) runs in a single `create` or `create-both` command.
+
+### Single-VM setup
+
+```bash
+# Deploy VM, configure WireGuard tunnel, pull and start vLLM (~10 min)
+ruby hyperstack.rb create
+
+# Run end-to-end inference test over the tunnel
+ruby hyperstack.rb test
+
+# Launch Pi coding agent connected to GPT-OSS 120B on the VM
+pi-hyperstack # fish abbreviation from hyperstack.fish
+
+# Tear down the VM and remove WireGuard peer
+ruby hyperstack.rb delete
+```
+
+### Two-VM setup
+
+```bash
+# Deploy both VMs in parallel, set up tunnel and vLLM on each (~10 min)
+ruby hyperstack.rb create-both
+
+# Test each VM individually
+ruby hyperstack.rb --config hyperstack-vm1.toml test
+ruby hyperstack.rb --config hyperstack-vm2.toml test
+
+# Launch Pi coding agents — one per terminal
+pi-hyperstack-nemotron # fish abbreviation → Nemotron-3-Super 120B on VM1
+pi-hyperstack-coder # fish abbreviation → Qwen3-Coder-Next 80B on VM2
+
+# Tear down both VMs
+ruby hyperstack.rb delete-both
+```
+
+### Hot-switching models without reprovisioning
+
+```bash
+# Switch the running vLLM container to a different model preset
+ruby hyperstack.rb --config hyperstack-vm1.toml model switch qwen3-coder-next
+ruby hyperstack.rb --config hyperstack-vm2.toml model switch nemotron-super
+```
+
+See the [VM configuration](#vm-configuration) and [Switching models](#switching-models)
+sections for available presets and config options.
+
## Manual vLLM Docker setup
This section covers manual vLLM deployment for debugging or running outside the
diff --git a/hyperstack-vm1.toml b/hyperstack-vm1.toml
index 6109472..e101bec 100644
--- a/hyperstack-vm1.toml
+++ b/hyperstack-vm1.toml
@@ -86,17 +86,20 @@ tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"
# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
-# ~60 GB weights on A100 80GB. Uses NoPE so context can be set to 1M; no YaRN needed.
+# ~60 GB weights on A100 80GB; ~13 GB remaining for KV cache at 0.92 utilisation.
+# Uses NoPE so any context length is valid; capped at 131072 to keep KV cache within VRAM budget.
# Requires trust_remote_code=true for the nemotron_h architecture.
[vllm.presets.nemotron-super]
model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
container_name = "vllm_nemotron_super"
-max_model_len = 262144
+max_model_len = 131072
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_xml"
trust_remote_code = true
-extra_vllm_args = ["--reasoning-parser", "nemotron_v3"]
+# --enforce-eager disables CUDA graph capture, freeing the ~3-4 GB of VRAM that graph
+# capture would otherwise use on top of the ~60 GB weights. Trades some throughput for stability.
+extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"]
# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100.
[vllm.presets.gpt-oss-20b]
diff --git a/hyperstack-vm2.toml b/hyperstack-vm2.toml
index 202a340..d3c0a17 100644
--- a/hyperstack-vm2.toml
+++ b/hyperstack-vm2.toml
@@ -83,17 +83,20 @@ tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"
# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
-# ~60 GB weights on A100 80GB. Uses NoPE so context can be set to 1M; no YaRN needed.
+# ~60 GB weights on A100 80GB; ~13 GB remaining for KV cache at 0.92 utilisation.
+# Uses NoPE so any context length is valid; capped at 131072 to keep KV cache within VRAM budget.
# Requires trust_remote_code=true for the nemotron_h architecture.
[vllm.presets.nemotron-super]
model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
container_name = "vllm_nemotron_super"
-max_model_len = 262144
+max_model_len = 131072
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_xml"
trust_remote_code = true
-extra_vllm_args = ["--reasoning-parser", "nemotron_v3"]
+# --enforce-eager disables CUDA graph capture, freeing the ~3-4 GB of VRAM that graph
+# capture would otherwise use on top of the ~60 GB weights. Trades some throughput for stability.
+extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"]
# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100.
[vllm.presets.gpt-oss-20b]
diff --git a/hyperstack.fish b/hyperstack.fish
index 1b32a37..09706b5 100644
--- a/hyperstack.fish
+++ b/hyperstack.fish
@@ -1,3 +1,11 @@
+# Single-VM setup (hyperstack-vm.toml → hyperstack.wg1)
abbr pi-hyperstack pi --model hyperstack/openai/gpt-oss-120b
-abbr pi-hyperstack-nemotron pi --model hyperstack1/cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit
-abbr pi-hyperstack-coder pi --model hyperstack2/bullpoint/Qwen3-Coder-Next-AWQ-4bit
+abbr hyperstack-create ruby ~/git/hyperstack/hyperstack.rb create
+abbr hyperstack-delete ruby ~/git/hyperstack/hyperstack.rb delete
+abbr hyperstack-test ruby ~/git/hyperstack/hyperstack.rb test
+
+# Dual-VM setup (hyperstack-vm1/vm2.toml → hyperstack1/2.wg1)
+abbr pi-hyperstack-nemotron pi --model hyperstack1/cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit
+abbr pi-hyperstack-coder pi --model hyperstack2/bullpoint/Qwen3-Coder-Next-AWQ-4bit
+abbr hyperstack-create-both ruby ~/git/hyperstack/hyperstack.rb create-both
+abbr hyperstack-delete-both ruby ~/git/hyperstack/hyperstack.rb delete-both