# hyperstack-vm1.toml

# API credentials. The value is a path to a key file, not the key itself.
# "~" is a literal character in TOML; expanding it is left to the consuming tool.
[auth]
api_key_file = "~/.hyperstack"

# Base URL for Hyperstack API requests; note that it already includes the /v1 path segment.
[hyperstack]
base_url = "https://infrahub-api.nexgencloud.com/v1"

[state]
# Separate state file for VM1 so vm1 and vm2 can be managed independently.
# Relative path. NOTE(review): presumably resolved against the tool's working
# directory — confirm before running from a different directory.
file = ".hyperstack-vm1-state.json"

# Identity and instance shape for VM1.
[vm]
name_prefix = "hyperstack1"
hostname = "hyperstack1"
environment_name = "snonux-ollama"

# H100-80GB: switched from n3-A100x1 which was out of stock in CANADA-1.
# H100 also provides safer throughput and compatibility headroom for nemotron-3-super.
flavor_name = "n3-H100x1"
# Per its name, this image ships Ubuntu 24.04 with CUDA 12.8 and Docker preinstalled;
# the [vllm] section relies on Docker being available.
image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
assign_floating_ip = true
create_bootable_volume = false
enable_port_randomization = false
# Tags naming the default model and the VPN used on this VM.
labels = ["nemotron-3-super", "wireguard"]

# SSH access to the VM.
[ssh]
username = "ubuntu"
# Local private key. "~" is a literal in TOML; expansion is left to the consuming tool.
private_key_path = "~/.ssh/id_rsa"
# NOTE(review): presumably the name of the key pair registered with Hyperstack — confirm.
hyperstack_key_name = "earth"
port = 22
# Seconds, per the key's _sec suffix.
connect_timeout_sec = 10

# WireGuard addressing and firewall allow-lists for VM1.
[network]
wireguard_udp_port = 56710
wireguard_subnet = "192.168.3.0/24"
# VM1 takes the first host address of the WireGuard subnet above.
# earth (client) is 192.168.3.2; VM1 is 192.168.3.1; VM2 is 192.168.3.3.
wireguard_server_ip = "192.168.3.1"
# Secure default: "auto" resolves your current public egress IP to /32 at runtime.
# Override with explicit CIDRs if you deploy from multiple networks or want broader access.
allowed_ssh_cidrs = ["auto"]
allowed_wireguard_cidrs = ["auto"]
# Port 11434 is shared by both Ollama and vLLM for firewall compatibility.
# Keep it equal to the port in [ollama].listen_host.
ollama_port = 11434

# Guest-side provisioning switches.
[bootstrap]
enable_guest_bootstrap = true
install_wireguard = true
configure_ufw = true
# Disabled, matching [ollama].install = false (vLLM serves the model instead).
configure_ollama_host = false

[ollama]
# Disabled in favour of vLLM; set install = true to switch back to Ollama.
install = false
# NOTE(review): the remaining keys presumably only take effect when install = true — confirm.
models_dir = "/ephemeral/ollama/models"
# Binds all interfaces on the same port as [network].ollama_port.
listen_host = "0.0.0.0:11434"
gpu_overhead_mb = 2000
num_parallel = 1
context_length = 32768
pull_models = ["nemotron-3-super"]

# vLLM serves one model via Docker on the OpenAI-compatible API.
# VM1 defaults to nemotron-3-super; use 'model switch' to load any other preset.
[vllm]
install = true
model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
hug_cache_dir = "/ephemeral/hug"
container_name = "vllm_nemotron_super"
# Capped at 131072 to keep KV cache within an 80 GB VRAM budget. The figures were
# sized on A100 80GB; the [vm] flavor is now H100 80GB.
# 262144 OOMs without --enforce-eager (CUDA graph capture costs ~3-4 GB on top of ~60 GB weights).
max_model_len = 131072
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
# NVIDIA Nemotron-3-Super uses the same XML tool call format as Qwen3 XML.
tool_call_parser = "qwen3_xml"
# NOTE(review): the presets comment below says unset preset fields fall back to
# [vllm]. Presets that omit trust_remote_code or extra_vllm_args would then inherit
# these Nemotron-specific values — verify that is intended for non-Nemotron presets.
trust_remote_code = true
# --enforce-eager disables CUDA graph capture, freeing ~3-4 GB needed to fit within 80 GB of VRAM.
extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"]

# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm1.toml model switch <name>'.
# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.

# Qwen3-Coder-Next AWQ 4-bit (per the model id), served at a 262144-token context.
# NOTE(review): this preset sets neither trust_remote_code nor extra_vllm_args. If
# unset fields fall back to [vllm], it inherits the Nemotron reasoning parser and
# --enforce-eager — confirm that is intended.
[vllm.presets.qwen3-coder-next]
model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
container_name = "vllm_qwen3"
max_model_len = 262144
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"

# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
# ~60 GB weights on an 80 GB GPU; ~13 GB remaining for KV cache at 0.92 utilisation.
# Uses NoPE so any context length is valid; capped at 131072 to keep KV cache within VRAM budget.
# Requires trust_remote_code=true for the nemotron_h architecture.
# Every value here is identical to the [vllm] defaults; keep the two in sync.
[vllm.presets.nemotron-super]
model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
container_name = "vllm_nemotron_super"
max_model_len = 131072
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_xml"
trust_remote_code = true
# --enforce-eager disables CUDA graph capture, freeing ~3-4 GB of VRAM the model
# otherwise needs alongside the ~60 GB weights. Trades some throughput for stability.
extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"]

# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB of VRAM.
[vllm.presets.gpt-oss-20b]
model = "openai/gpt-oss-20b"
container_name = "vllm_gpt_oss_20b"
max_model_len = 65536
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
# Empty parser name. NOTE(review): presumably means "no tool-call parser" rather
# than "fall back to [vllm]" — confirm how the tool treats "".
tool_call_parser = ""

# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB of VRAM.
# Hard architecture limit: max_position_embeddings=131072 in model config.json,
# so max_model_len below must not be raised past that value.
[vllm.presets.gpt-oss-120b]
model = "openai/gpt-oss-120b"
container_name = "vllm_gpt_oss_120b"
max_model_len = 131072
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
# Empty parser name. NOTE(review): presumably means "no tool-call parser" rather
# than "fall back to [vllm]" — confirm how the tool treats "".
tool_call_parser = ""

# Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB of VRAM.
[vllm.presets.qwen25-coder-32b]
model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ"
container_name = "vllm_qwen25_coder32b"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "hermes"

# Qwen3-Coder-30B-A3B AWQ — Qwen3 generation coding MoE (3B active / 30B total), ~18 GB of VRAM.
# Uses the same tool-call parser as the qwen3-coder-next preset.
[vllm.presets.qwen3-coder-30b]
model = "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ"
container_name = "vllm_qwen3_coder30b"
max_model_len = 65536
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"

# DeepSeek-R1-Distill-Qwen-32B AWQ — R1 reasoning distillation of Qwen 32B, ~18 GB of VRAM.
[vllm.presets.deepseek-r1-32b]
model = "casperhansen/deepseek-r1-distill-qwen-32b-awq"
container_name = "vllm_deepseek_r1_32b"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
# Empty parser name. NOTE(review): presumably means "no tool-call parser" — confirm.
tool_call_parser = ""
# Overrides the [vllm] default args: selects the R1 reasoning parser and omits --enforce-eager.
extra_vllm_args = ["--reasoning-parser", "deepseek_r1"]

# Qwen3-32B AWQ — dense 32B reasoning model, ~18 GB of VRAM.
# Context is capped at 32768 tokens here.
[vllm.presets.qwen3-32b]
model = "Qwen/Qwen3-32B-AWQ"
container_name = "vllm_qwen3_32b"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
# Empty parser name. NOTE(review): presumably means "no tool-call parser" — confirm.
tool_call_parser = ""
# NOTE(review): vLLM also provides a dedicated "qwen3" reasoning parser; confirm
# deepseek_r1 is still the intended choice for this model.
extra_vllm_args = ["--reasoning-parser", "deepseek_r1"]

# Devstral-Small-2507 AWQ — Mistral's coding agent model (~15 GB of VRAM).
[vllm.presets.devstral]
model = "cyankiwi/Devstral-Small-2507-AWQ-4bit"
container_name = "vllm_devstral"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "mistral"
# Selects Mistral's tokenizer mode and config format in vLLM; also replaces the
# [vllm] default args, so no reasoning parser and no --enforce-eager here.
extra_vllm_args = ["--tokenizer_mode", "mistral", "--config_format", "mistral"]

# WireGuard tunnel setup for VM1.
[wireguard]
auto_setup = true
# Relative path. NOTE(review): presumably resolved against the tool's working directory — confirm.
setup_script = "./wg1-setup.sh"

# Client-side WireGuard settings on the local machine. The interface name and
# config path both refer to wg1 and must stay consistent with each other.
[local_client]
check_wg1_service = true
interface_name = "wg1"
config_path = "/etc/wireguard/wg1.conf"