hyperstack: switch to Gemma 4 31B on VM2, Qwen3-Coder-Next on VM1

VM1 (hyperstack-vm1-coder.toml, renamed from hyperstack-vm1-gptoss.toml):
- Default model switched from gpt-oss-120b to qwen3-coder-next
- Config file renamed to reflect actual default model

VM2 (hyperstack-vm2.toml):
- Default model switched from qwen3-coder-next to Gemma 4 31B AWQ
- Uses vLLM nightly image + transformers==5.5.0 workaround: Gemma 4 architecture is registered in transformers 5.x but vLLM stable pins <5
- max_model_len=131072 (128K context); KV cache fills ~95% of H100-80GB VRAM
- Added gemma4-31b preset

watcher.rb:
- Add loading_status field to VmSnapshot to show live model-load progress (last relevant log line during startup instead of generic "loading" message)
- fetch_vm_stats now captures both Engine 0 stats and loading-phase log lines in a single SSH call using a shell variable to avoid two docker log invocations
- clean_log_line() strips vLLM PID/timestamp prefix for readable display

cli.rb: update all hardcoded hyperstack-vm1-gptoss.toml references to hyperstack-vm1-coder.toml

hypr.fish: replace pi-hyperstack-nemotron with pi-hyperstack-coder (VM1), add pi-hyperstack-gemma4 (VM2)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
author: Paul Buetow <paul@buetow.org> 2026-04-06 11:02:43 +0300
committer: Paul Buetow <paul@buetow.org> 2026-04-06 11:02:43 +0300
commit: 0664ffcc62b2fb240286fde463635e510a41df84 (patch)
tree: c3528d94974a36e975d967c673bfc29890ac9fae /hyperstack-vm2.toml
parent: ce6adba0cfb47b06506976636bd2b4861112ddd8 (diff)
1 files changed, 28 insertions, 9 deletions
diff --git a/hyperstack-vm2.toml b/hyperstack-vm2.toml
index 32e3a99..bed09a1 100644
--- a/hyperstack-vm2.toml
+++ b/hyperstack-vm2.toml
@@ -13,14 +13,13 @@ name_prefix = "hyperstack2"
 hostname = "hyperstack2"
 environment_name = "snonux-ollama"
 
-# A100-80GB is the cost-first default for qwen3-coder-next inference.
-# Switch this to n3-H100x1 if you want safer throughput and compatibility headroom.
-flavor_name = "n3-A100x1"
+# H100-80GB for Gemma 4 31B inference; switched from n3-A100x1 (out of stock).
+flavor_name = "n3-H100x1"
 image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
 assign_floating_ip = true
 create_bootable_volume = false
 enable_port_randomization = false
-labels = ["qwen3-coder-next", "wireguard"]
+labels = ["gemma4-31b", "wireguard"]
 
 [ssh]
 username = "ubuntu"
@@ -59,21 +58,41 @@ context_length = 32768
 pull_models = ["qwen3-coder-next"]
 
 # vLLM serves one model via Docker on the OpenAI-compatible API.
-# VM2 defaults to qwen3-coder-next; use 'model switch' to load any other preset.
+# VM2 defaults to Gemma 4 31B; use 'model switch' to load any other preset.
+# NOTE: Gemma 4 requires transformers>=5.0 but vLLM stable pins transformers<5.
+# Workaround: use the vLLM nightly image and force-install transformers 5.5.0 at startup.
+# Remove the docker_image and pre_start_cmd overrides once vLLM stable adds Gemma 4 support.
 [vllm]
 install = true
-model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
+model = "cyankiwi/gemma-4-31B-it-AWQ-4bit"
 # HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
 hug_cache_dir = "/ephemeral/hug"
-container_name = "vllm_qwen3"
-max_model_len = 262144
+container_name = "vllm_gemma4_31b"
+# Gemma 4 31B AWQ 4-bit: ~19 GB weights, ~61 GB VRAM remaining for KV cache on H100-80GB.
+# 131072 = Gemma 4's architectural max (128K context); KV cache auto-scales to fit.
+max_model_len = 131072
 gpu_memory_utilization = 0.92
 tensor_parallel_size = 1
-tool_call_parser = "qwen3_coder"
+tool_call_parser = "gemma4"
+# Use nightly image: stable 0.19.0 lacks Gemma 4 architecture support in its bundled transformers.
+docker_image = "vllm/vllm-openai:nightly"
+# Upgrade transformers to 5.x (Gemma 4 arch added there) before starting vLLM.
+pre_start_cmd = "pip install -q transformers==5.5.0 2>/dev/null"
 
 # Named model presets for 'ruby hyperstack.rb --config hyperstack-vm2.toml model switch <name>'.
 # Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.
 
+# Gemma 4 31B AWQ 4-bit — Google's dense 31B multimodal model (~19 GB weights on H100-80GB).
+# ~61 GB VRAM remaining for KV cache; runs at max_model_len = 131072 (128K context).
+# Uses vLLM's gemma4 tool-call parser for function calling support.
+[vllm.presets.gemma4-31b]
+model = "cyankiwi/gemma-4-31B-it-AWQ-4bit"
+container_name = "vllm_gemma4_31b"
+max_model_len = 131072
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "gemma4"
+
 [vllm.presets.qwen3-coder-next]
 model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
 container_name = "vllm_qwen3"
author	Paul Buetow <paul@buetow.org>	2026-04-06 11:02:43 +0300
committer	Paul Buetow <paul@buetow.org>	2026-04-06 11:02:43 +0300
commit	0664ffcc62b2fb240286fde463635e510a41df84 (patch)
tree	c3528d94974a36e975d967c673bfc29890ac9fae /hyperstack-vm2.toml
parent	ce6adba0cfb47b06506976636bd2b4861112ddd8 (diff)