summary | refs | log | tree | commit | diff
path: root/hyperstack-vm2.toml
diff options
context:
space:
mode:
author Paul Buetow <paul@buetow.org> 2026-05-24 18:31:43 +0300
committer Paul Buetow <paul@buetow.org> 2026-05-24 18:31:43 +0300
commit 290ea93e7c7475996a11ce6651237d8a803228c0 (patch)
tree 19786d37a6c57c534f07e02cf842af42c82117fd /hyperstack-vm2.toml
parent dad54c12a35481144c6f91d3be2695685de40dad (diff)
chore(vm2): H100 provisioning, L40 plan, and H100-specific vLLM tuning
Diffstat (limited to 'hyperstack-vm2.toml')
-rw-r--r-- hyperstack-vm2.toml 9
1 files changed, 6 insertions, 3 deletions
diff --git a/hyperstack-vm2.toml b/hyperstack-vm2.toml
index 3e74aae..070b8aa 100644
--- a/hyperstack-vm2.toml
+++ b/hyperstack-vm2.toml
@@ -14,7 +14,8 @@ hostname = "hyperstack2"
environment_name = "snonux-ollama"
# A100-80GB for Qwen3.6 27B; H100 fallback if n3-A100x1 unavailable.
-flavor_name = "n3-A100x1"
+# 2026-05-24: A100 sold out, switched to H100.
+flavor_name = "n3-H100x1"
image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
assign_floating_ip = true
create_bootable_volume = false
@@ -69,10 +70,12 @@ hug_cache_dir = "/ephemeral/hug"
container_name = "vllm_qwen36_27b"
# Qwen3.6-27B-FP8: official FP8 checkpoint with native 262K context on a single 80 GB GPU.
max_model_len = 262144
-gpu_memory_utilization = 0.92
+# H100 needs 0.95 to fit Mamba cache blocks; A100 worked at 0.92.
+gpu_memory_utilization = 0.95
tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"
-extra_vllm_args = ["--reasoning-parser", "qwen3"]
+# --max-num-seqs 817 caps concurrent sequences to fit Mamba cache blocks on H100.
+extra_vllm_args = ["--reasoning-parser", "qwen3", "--max-num-seqs", "817"]
# Named model presets for 'ruby hyperstack.rb --vm 2 model switch <name>'.
# Core model fields override the matching [vllm] values; preset-only extras such as