diff options
| author | Paul Buetow <paul@buetow.org> | 2026-05-24 18:31:43 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-05-24 18:31:43 +0300 |
| commit | 290ea93e7c7475996a11ce6651237d8a803228c0 (patch) | |
| tree | 19786d37a6c57c534f07e02cf842af42c82117fd /hyperstack-vm2.toml | |
| parent | dad54c12a35481144c6f91d3be2695685de40dad (diff) | |
chore(vm2): H100 provisioning, L40 plan, and H100-specific vLLM tuning
Diffstat (limited to 'hyperstack-vm2.toml')
| -rw-r--r-- | hyperstack-vm2.toml | 9 |
1 files changed, 6 insertions, 3 deletions
```diff
diff --git a/hyperstack-vm2.toml b/hyperstack-vm2.toml
index 3e74aae..070b8aa 100644
--- a/hyperstack-vm2.toml
+++ b/hyperstack-vm2.toml
@@ -14,7 +14,8 @@ hostname = "hyperstack2"
 environment_name = "snonux-ollama"
 
 # A100-80GB for Qwen3.6 27B; H100 fallback if n3-A100x1 unavailable.
-flavor_name = "n3-A100x1"
+# 2026-05-24: A100 sold out, switched to H100.
+flavor_name = "n3-H100x1"
 image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
 assign_floating_ip = true
 create_bootable_volume = false
@@ -69,10 +70,12 @@ hug_cache_dir = "/ephemeral/hug"
 container_name = "vllm_qwen36_27b"
 # Qwen3.6-27B-FP8: official FP8 checkpoint with native 262K context on a single 80 GB GPU.
 max_model_len = 262144
-gpu_memory_utilization = 0.92
+# H100 needs 0.95 to fit Mamba cache blocks; A100 worked at 0.92.
+gpu_memory_utilization = 0.95
 tensor_parallel_size = 1
 tool_call_parser = "qwen3_coder"
-extra_vllm_args = ["--reasoning-parser", "qwen3"]
+# --max-num-seqs 817 caps concurrent sequences to fit Mamba cache blocks on H100.
+extra_vllm_args = ["--reasoning-parser", "qwen3", "--max-num-seqs", "817"]
 
 # Named model presets for 'ruby hyperstack.rb --vm 2 model switch <name>'.
 # Core model fields override the matching [vllm] values; preset-only extras such as
```
