# snippets/hyperstack/hyperstack-vm.toml

[auth]
# File the API key is read from. TOML itself performs no `~` expansion, so the
# consuming tool has to resolve the home directory.
api_key_file = "~/.hyperstack"

[hyperstack]
# API root; the version segment (/v1) is part of this value.
base_url = "https://infrahub-api.nexgencloud.com/v1"

[state]
# Relative path — presumably resolved against the tool's working directory; confirm.
file = ".hyperstack-vm-state.json"

[vm]
# Naming and tagging of the instance.
environment_name = "snonux-ollama"
hostname = "hyperstack"
name_prefix = "hyperstack"
labels = ["gpt-oss-120b", "wireguard"]

# Hardware and image. n3-A100x1 (A100-80GB) is the cost-first choice for
# gpt-oss-120b inference; n3-H100x1 is the alternative when safer throughput
# and compatibility headroom matter more than price.
flavor_name = "n3-A100x1"
image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"

# Provisioning switches.
assign_floating_ip = true
create_bootable_volume = false
enable_port_randomization = false

[ssh]
# Login account and endpoint on the VM.
username = "ubuntu"
port = 22
# Seconds, per the key suffix.
connect_timeout_sec = 10
# Local private key plus the key name on the Hyperstack side (presumably the
# registered counterpart of this key — confirm). TOML does not expand `~`.
private_key_path = "~/.ssh/id_rsa"
hyperstack_key_name = "earth"

[network]
# WireGuard UDP port and subnet.
wireguard_subnet = "192.168.3.0/24"
wireguard_udp_port = 56710
# 0.0.0.0/0 matches every IPv4 source address.
# NOTE(review): narrow these lists if the client address ranges are known.
allowed_wireguard_cidrs = ["0.0.0.0/0"]
allowed_ssh_cidrs = ["0.0.0.0/0"]
# Ollama and vLLM share port 11434 for firewall compatibility.
ollama_port = 11434
# LiteLLM Anthropic-API proxy (used with vLLM) sits on port 4000.
litellm_port = 4000

[bootstrap]
# Bootstrap steps that are switched on.
enable_guest_bootstrap = true
configure_ufw = true
install_wireguard = true
# Off, in line with install = false under [ollama].
configure_ollama_host = false

[ollama]
# Ollama is switched off in favour of vLLM; flip install to true to go back.
install = false
# Port here is the same as ollama_port under [network].
listen_host = "0.0.0.0:11434"
models_dir = "/ephemeral/ollama/models"
context_length = 32768
num_parallel = 1
# Megabytes, per the key suffix.
gpu_overhead_mb = 2000
pull_models = [
  "qwen3-coder-next",
  "qwen3-coder:30b",
  "gpt-oss:20b",
  "gpt-oss:120b",
  "nemotron-3-super",
]

# vLLM serves one model via Docker; LiteLLM translates Anthropic API → OpenAI.
# Use --vllm / --no-vllm CLI flags to override install at runtime.
[vllm]
install = true
model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
hug_cache_dir = "/ephemeral/hug"
container_name = "vllm_qwen3"
max_model_len = 262144
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"
# NOTE(review): this secret is stored in plain text and looks like a
# placeholder. If this file is shared, or the LiteLLM port is reachable from
# outside the WireGuard subnet, supply a real key out of band.
litellm_master_key = "sk-litellm-master"
# LiteLLM maps each entry to the vLLM model; add new Anthropic model IDs here.
litellm_claude_model_names = [
  "claude-sonnet-4-20250514",
  "claude-opus-4-20250514",
  "claude-opus-4-6-20260604",
  # NOTE(review): Anthropic's published ID for this model is
  # "claude-3-5-haiku-20241022"; confirm the spelling below is what clients send.
  "claude-haiku-3-5-20241022",
]

[wireguard]
auto_setup = true
# Relative path — presumably resolved against the tool's working directory; confirm.
setup_script = "./wg1-setup.sh"

[local_client]
# Local WireGuard interface and its configuration file.
interface_name = "wg1"
config_path = "/etc/wireguard/wg1.conf"
check_wg1_service = true