From 0664ffcc62b2fb240286fde463635e510a41df84 Mon Sep 17 00:00:00 2001
From: Paul Buetow <paul@buetow.org>
Date: Mon, 6 Apr 2026 11:02:43 +0300
Subject: hyperstack: switch to Gemma 4 31B on VM2, Qwen3-Coder-Next on VM1

VM1 (hyperstack-vm1-coder.toml, renamed from hyperstack-vm1-gptoss.toml):
- Default model switched from gpt-oss-120b to qwen3-coder-next
- Config file renamed to reflect actual default model

VM2 (hyperstack-vm2.toml):
- Default model switched from qwen3-coder-next to Gemma 4 31B AWQ
- Uses vLLM nightly image + transformers==5.5.0 workaround: Gemma 4
  architecture is registered in transformers 5.x but vLLM stable pins <5
- max_model_len=131072 (128K context); KV cache fills ~95% of H100-80GB VRAM
- Added gemma4-31b preset

watcher.rb:
- Add loading_status field to VmSnapshot to show live model-load progress
  (last relevant log line during startup instead of generic "loading" message)
- fetch_vm_stats now captures both Engine 0 stats and loading-phase log lines
  in a single SSH call using a shell variable to avoid two docker log invocations
- clean_log_line() strips vLLM PID/timestamp prefix for readable display

cli.rb: update all hardcoded hyperstack-vm1-gptoss.toml references to
hyperstack-vm1-coder.toml

hypr.fish: replace pi-hyperstack-nemotron with pi-hyperstack-coder (VM1),
add pi-hyperstack-gemma4 (VM2)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 hyperstack-vm1-coder.toml                          | 188 +++++++++++++++++++++
 hyperstack-vm1-gptoss.toml                         | 182 --------------------
 hyperstack-vm2.toml                                |  37 +++-
 hypr.fish                                          |   4 +-
 lib/hyperstack/cli.rb                              |   8 +-
 lib/hyperstack/watcher.rb                          |  57 +++++--
 pi/agent/extensions/loop-scheduler/loop-presets.md |   3 +-
 7 files changed, 266 insertions(+), 213 deletions(-)
 create mode 100644 hyperstack-vm1-coder.toml
 delete mode 100644 hyperstack-vm1-gptoss.toml

diff --git a/hyperstack-vm1-coder.toml b/hyperstack-vm1-coder.toml
new file mode 100644
index 0000000..cd127dd
--- /dev/null
+++ b/hyperstack-vm1-coder.toml
@@ -0,0 +1,188 @@
+[auth]
+api_key_file = "~/.hyperstack"
+
+[hyperstack]
+base_url = "https://infrahub-api.nexgencloud.com/v1"
+
+[state]
+# Separate state file for VM1 so vm1 and vm2 can be managed independently.
+file = ".hyperstack-vm1-state.json"
+
+[vm]
+name_prefix = "hyperstack1"
+hostname = "hyperstack1"
+environment_name = "snonux-ollama"
+
+# H100-80GB single GPU for qwen3-coder-next (default); other models available via presets.
+flavor_name = "n3-H100x1"
+image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
+assign_floating_ip = true
+create_bootable_volume = false
+enable_port_randomization = false
+labels = ["qwen3-coder-next", "wireguard"]
+
+[ssh]
+username = "ubuntu"
+private_key_path = "~/.ssh/id_rsa"
+hyperstack_key_name = "earth"
+port = 22
+connect_timeout_sec = 10
+
+[network]
+wireguard_udp_port = 56710
+wireguard_subnet = "192.168.3.0/24"
+# VM1 gets the first server-side WireGuard IP (gateway address + 0).
+# earth (client) is 192.168.3.2; VM1 is 192.168.3.1; VM2 is 192.168.3.3.
+wireguard_server_ip = "192.168.3.1"
+# Secure default: "auto" resolves your current public egress IP to /32 at runtime.
+# Override with explicit CIDRs if you deploy from multiple networks or want broader access.
+allowed_ssh_cidrs = ["auto"]
+allowed_wireguard_cidrs = ["auto"]
+# Port 11434 is shared by both Ollama and vLLM for firewall compatibility.
+ollama_port = 11434
+
+[bootstrap]
+enable_guest_bootstrap = true
+install_wireguard = true
+configure_ufw = true
+configure_ollama_host = false
+
+[ollama]
+# Disabled in favour of vLLM; set install = true to switch back to Ollama.
+install = false
+models_dir = "/ephemeral/ollama/models"
+listen_host = "0.0.0.0:11434"
+gpu_overhead_mb = 2000
+num_parallel = 1
+context_length = 32768
+pull_models = ["qwen3-coder-next", "qwen3-coder:30b", "gpt-oss:20b", "gpt-oss:120b", "nemotron-3-super"]
+
+# vLLM serves one model via Docker on the OpenAI-compatible API.
+# VM1 defaults to qwen3-coder-next; use 'model switch' to load any other preset.
+[vllm]
+install = true
+model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
+# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
+hug_cache_dir = "/ephemeral/hug"
+container_name = "vllm_qwen3"
+max_model_len = 262144
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "qwen3_coder"
+
+# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm1-coder.toml model switch <name>'.
+# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.
+
+[vllm.presets.qwen3-coder-next]
+model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
+container_name = "vllm_qwen3"
+max_model_len = 262144
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "qwen3_coder"
+
+# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
+# Single-GPU (A100-80GB) config: tensor_parallel_size=1, context capped at 32k to fit in VRAM.
+# Model weights occupy ~73.6 GiB of the 79.25 GiB A100; very little VRAM remains for KV cache.
+# enforce_eager=true disables CUDA graph capture, which avoids the large profiling-phase OOM.
+# gpu_memory_utilization=0.98 lets vLLM use nearly all available VRAM for KV blocks.
+# max_model_len reduced to 32768 to keep the KV cache footprint small enough to fit.
+[vllm.presets.nemotron-super]
+model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
+container_name = "vllm_nemotron_super"
+max_model_len = 32768
+gpu_memory_utilization = 0.98
+tensor_parallel_size = 1
+tool_call_parser = "qwen3_xml"
+trust_remote_code = true
+enable_prefix_caching = false
+extra_docker_env = ["VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "PYTORCH_ALLOC_CONF=expandable_segments:True"]
+extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"]
+
+# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100.
+[vllm.presets.gpt-oss-20b]
+model = "openai/gpt-oss-20b"
+container_name = "vllm_gpt_oss_20b"
+max_model_len = 65536
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = ""
+
+# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100.
+# Hard architecture limit: max_position_embeddings=131072 in model config.json.
+[vllm.presets.gpt-oss-120b]
+model = "openai/gpt-oss-120b"
+container_name = "vllm_gpt_oss_120b"
+max_model_len = 131072
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = ""
+extra_vllm_args = ["--reasoning-parser", "openai_gptoss"]
+
+# Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100.
+[vllm.presets.qwen25-coder-32b]
+model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ"
+container_name = "vllm_qwen25_coder32b"
+max_model_len = 32768
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "hermes"
+
+# Qwen3-Coder-30B-A3B AWQ — Qwen3 generation coding MoE (3B active / 30B total), ~18 GB.
+[vllm.presets.qwen3-coder-30b]
+model = "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ"
+container_name = "vllm_qwen3_coder30b"
+max_model_len = 65536
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "qwen3_coder"
+
+# DeepSeek-R1-Distill-Qwen-32B AWQ — R1 reasoning distillation of Qwen 32B, ~18 GB on A100.
+[vllm.presets.deepseek-r1-32b]
+model = "casperhansen/deepseek-r1-distill-qwen-32b-awq"
+container_name = "vllm_deepseek_r1_32b"
+max_model_len = 32768
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = ""
+extra_vllm_args = ["--reasoning-parser", "deepseek_r1"]
+
+# Qwen3-32B AWQ — dense 32B reasoning model with extended context, ~18 GB on A100.
+[vllm.presets.qwen3-32b]
+model = "Qwen/Qwen3-32B-AWQ"
+container_name = "vllm_qwen3_32b"
+max_model_len = 32768
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = ""
+extra_vllm_args = ["--reasoning-parser", "deepseek_r1"]
+
+# Devstral-Small-2507 AWQ — Mistral's coding agent model (~15 GB on A100).
+[vllm.presets.devstral]
+model = "cyankiwi/Devstral-Small-2507-AWQ-4bit"
+container_name = "vllm_devstral"
+max_model_len = 32768
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "mistral"
+extra_vllm_args = ["--tokenizer_mode", "mistral", "--config_format", "mistral"]
+
+# Gemma 4 31B AWQ 4-bit — Google's dense 31B multimodal model (~19 GB weights on A100-80GB).
+# ~61 GB VRAM remaining for KV cache; supports up to ~32K context comfortably.
+# Uses vLLM's gemma4 tool-call parser for function calling support.
+[vllm.presets.gemma4-31b]
+model = "cyankiwi/gemma-4-31B-it-AWQ-4bit"
+container_name = "vllm_gemma4_31b"
+max_model_len = 32768
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "gemma4"
+
+[wireguard]
+auto_setup = true
+setup_script = "./wg1-setup.sh"
+
+[local_client]
+check_wg1_service = true
+interface_name = "wg1"
+config_path = "/etc/wireguard/wg1.conf"
diff --git a/hyperstack-vm1-gptoss.toml b/hyperstack-vm1-gptoss.toml
deleted file mode 100644
index af25248..0000000
--- a/hyperstack-vm1-gptoss.toml
+++ /dev/null
@@ -1,182 +0,0 @@
-[auth]
-api_key_file = "~/.hyperstack"
-
-[hyperstack]
-base_url = "https://infrahub-api.nexgencloud.com/v1"
-
-[state]
-# Separate state file for VM1 so vm1 and vm2 can be managed independently.
-file = ".hyperstack-vm1-state.json"
-
-[vm]
-name_prefix = "hyperstack1"
-hostname = "hyperstack1"
-environment_name = "snonux-ollama"
-
-# A100-80GB single GPU for gpt-oss-120b
-flavor_name = "n3-A100x1"
-image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
-assign_floating_ip = true
-create_bootable_volume = false
-enable_port_randomization = false
-labels = ["gpt-oss-120b", "wireguard"]
-
-[ssh]
-username = "ubuntu"
-private_key_path = "~/.ssh/id_rsa"
-hyperstack_key_name = "earth"
-port = 22
-connect_timeout_sec = 10
-
-[network]
-wireguard_udp_port = 56710
-wireguard_subnet = "192.168.3.0/24"
-# VM1 gets the first server-side WireGuard IP (gateway address + 0).
-# earth (client) is 192.168.3.2; VM1 is 192.168.3.1; VM2 is 192.168.3.3.
-wireguard_server_ip = "192.168.3.1"
-# Secure default: "auto" resolves your current public egress IP to /32 at runtime.
-# Override with explicit CIDRs if you deploy from multiple networks or want broader access.
-allowed_ssh_cidrs = ["auto"]
-allowed_wireguard_cidrs = ["auto"]
-# Port 11434 is shared by both Ollama and vLLM for firewall compatibility.
-ollama_port = 11434
-
-[bootstrap]
-enable_guest_bootstrap = true
-install_wireguard = true
-configure_ufw = true
-configure_ollama_host = false
-
-[ollama]
-# Disabled in favour of vLLM; set install = true to switch back to Ollama.
-install = false
-models_dir = "/ephemeral/ollama/models"
-listen_host = "0.0.0.0:11434"
-gpu_overhead_mb = 2000
-num_parallel = 1
-context_length = 32768
-pull_models = ["qwen3-coder-next", "qwen3-coder:30b", "gpt-oss:20b", "gpt-oss:120b", "nemotron-3-super"]
-
-# vLLM serves one model via Docker on the OpenAI-compatible API.
-[vllm]
-install = true
-model = "openai/gpt-oss-120b"
-# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
-hug_cache_dir = "/ephemeral/hug"
-container_name = "vllm_gpt_oss_120b"
-# Hard architecture limit: max_position_embeddings=131072 in model config.json.
-max_model_len = 131072
-gpu_memory_utilization = 0.92
-tensor_parallel_size = 1
-# tool_call_parser="" disables --enable-auto-tool-choice; the llama3_json parser crashes
-# on gpt-oss responses (vLLM 0.17.1 adds token_ids to responses, breaking the parser API).
-tool_call_parser = ""
-# gpt-oss-120b is a reasoning model (o-series architecture); the openai_gptoss parser
-# extracts <|channel|>analysis…<|end|> thinking blocks into reasoning_content in the response.
-extra_vllm_args = ["--reasoning-parser", "openai_gptoss"]
-
-# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm1-gptoss.toml model switch <name>'.
-# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.
-
-[vllm.presets.qwen3-coder-next]
-model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
-container_name = "vllm_qwen3"
-max_model_len = 262144
-gpu_memory_utilization = 0.92
-tensor_parallel_size = 1
-tool_call_parser = "qwen3_coder"
-
-# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
-# Single-GPU (A100-80GB) config: tensor_parallel_size=1, context capped at 32k to fit in VRAM.
-# Model weights occupy ~73.6 GiB of the 79.25 GiB A100; very little VRAM remains for KV cache.
-# enforce_eager=true disables CUDA graph capture, which avoids the large profiling-phase OOM.
-# gpu_memory_utilization=0.98 lets vLLM use nearly all available VRAM for KV blocks.
-# max_model_len reduced to 32768 to keep the KV cache footprint small enough to fit.
-[vllm.presets.nemotron-super]
-model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
-container_name = "vllm_nemotron_super"
-max_model_len = 32768
-gpu_memory_utilization = 0.98
-tensor_parallel_size = 1
-tool_call_parser = "qwen3_xml"
-trust_remote_code = true
-enable_prefix_caching = false
-extra_docker_env = ["VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "PYTORCH_ALLOC_CONF=expandable_segments:True"]
-extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"]
-
-# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100.
-[vllm.presets.gpt-oss-20b]
-model = "openai/gpt-oss-20b"
-container_name = "vllm_gpt_oss_20b"
-max_model_len = 65536
-gpu_memory_utilization = 0.92
-tensor_parallel_size = 1
-tool_call_parser = ""
-
-# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100.
-# Hard architecture limit: max_position_embeddings=131072 in model config.json.
-[vllm.presets.gpt-oss-120b]
-model = "openai/gpt-oss-120b"
-container_name = "vllm_gpt_oss_120b"
-max_model_len = 131072
-gpu_memory_utilization = 0.92
-tensor_parallel_size = 1
-tool_call_parser = ""
-extra_vllm_args = ["--reasoning-parser", "openai_gptoss"]
-
-# Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100.
-[vllm.presets.qwen25-coder-32b]
-model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ"
-container_name = "vllm_qwen25_coder32b"
-max_model_len = 32768
-gpu_memory_utilization = 0.92
-tensor_parallel_size = 1
-tool_call_parser = "hermes"
-
-# Qwen3-Coder-30B-A3B AWQ — Qwen3 generation coding MoE (3B active / 30B total), ~18 GB.
-[vllm.presets.qwen3-coder-30b]
-model = "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ"
-container_name = "vllm_qwen3_coder30b"
-max_model_len = 65536
-gpu_memory_utilization = 0.92
-tensor_parallel_size = 1
-tool_call_parser = "qwen3_coder"
-
-# DeepSeek-R1-Distill-Qwen-32B AWQ — R1 reasoning distillation of Qwen 32B, ~18 GB on A100.
-[vllm.presets.deepseek-r1-32b]
-model = "casperhansen/deepseek-r1-distill-qwen-32b-awq"
-container_name = "vllm_deepseek_r1_32b"
-max_model_len = 32768
-gpu_memory_utilization = 0.92
-tensor_parallel_size = 1
-tool_call_parser = ""
-extra_vllm_args = ["--reasoning-parser", "deepseek_r1"]
-
-# Qwen3-32B AWQ — dense 32B reasoning model with extended context, ~18 GB on A100.
-[vllm.presets.qwen3-32b]
-model = "Qwen/Qwen3-32B-AWQ"
-container_name = "vllm_qwen3_32b"
-max_model_len = 32768
-gpu_memory_utilization = 0.92
-tensor_parallel_size = 1
-tool_call_parser = ""
-extra_vllm_args = ["--reasoning-parser", "deepseek_r1"]
-
-# Devstral-Small-2507 AWQ — Mistral's coding agent model (~15 GB on A100).
-[vllm.presets.devstral]
-model = "cyankiwi/Devstral-Small-2507-AWQ-4bit"
-container_name = "vllm_devstral"
-max_model_len = 32768
-gpu_memory_utilization = 0.92
-tensor_parallel_size = 1
-tool_call_parser = "mistral"
-extra_vllm_args = ["--tokenizer_mode", "mistral", "--config_format", "mistral"]
-
-[wireguard]
-auto_setup = true
-setup_script = "./wg1-setup.sh"
-
-[local_client]
-check_wg1_service = true
-interface_name = "wg1"
-config_path = "/etc/wireguard/wg1.conf"
diff --git a/hyperstack-vm2.toml b/hyperstack-vm2.toml
index 32e3a99..bed09a1 100644
--- a/hyperstack-vm2.toml
+++ b/hyperstack-vm2.toml
@@ -13,14 +13,13 @@ name_prefix = "hyperstack2"
 hostname = "hyperstack2"
 environment_name = "snonux-ollama"
 
-# A100-80GB is the cost-first default for qwen3-coder-next inference.
-# Switch this to n3-H100x1 if you want safer throughput and compatibility headroom.
-flavor_name = "n3-A100x1"
+# H100-80GB for Gemma 4 31B inference; switched from n3-A100x1 (out of stock).
+flavor_name = "n3-H100x1"
 image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
 assign_floating_ip = true
 create_bootable_volume = false
 enable_port_randomization = false
-labels = ["qwen3-coder-next", "wireguard"]
+labels = ["gemma4-31b", "wireguard"]
 
 [ssh]
 username = "ubuntu"
@@ -59,21 +58,41 @@ context_length = 32768
 pull_models = ["qwen3-coder-next"]
 
 # vLLM serves one model via Docker on the OpenAI-compatible API.
-# VM2 defaults to qwen3-coder-next; use 'model switch' to load any other preset.
+# VM2 defaults to Gemma 4 31B; use 'model switch' to load any other preset.
+# NOTE: Gemma 4 requires transformers>=5.0 but vLLM stable pins transformers<5.
+# Workaround: use the vLLM nightly image and force-install transformers 5.5.0 at startup.
+# Remove the docker_image and pre_start_cmd overrides once vLLM stable adds Gemma 4 support.
 [vllm]
 install = true
-model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
+model = "cyankiwi/gemma-4-31B-it-AWQ-4bit"
 # HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
 hug_cache_dir = "/ephemeral/hug"
-container_name = "vllm_qwen3"
-max_model_len = 262144
+container_name = "vllm_gemma4_31b"
+# Gemma 4 31B AWQ 4-bit: ~19 GB weights, ~61 GB VRAM remaining for KV cache on H100-80GB.
+# 131072 = Gemma 4's architectural max (128K context); KV cache auto-scales to fit.
+max_model_len = 131072
 gpu_memory_utilization = 0.92
 tensor_parallel_size = 1
-tool_call_parser = "qwen3_coder"
+tool_call_parser = "gemma4"
+# Use nightly image: stable 0.19.0 lacks Gemma 4 architecture support in its bundled transformers.
+docker_image = "vllm/vllm-openai:nightly"
+# Upgrade transformers to 5.x (Gemma 4 arch added there) before starting vLLM.
+pre_start_cmd = "pip install -q transformers==5.5.0 2>/dev/null"
 
 # Named model presets for 'ruby hyperstack.rb --config hyperstack-vm2.toml model switch <name>'.
 # Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.
 
+# Gemma 4 31B AWQ 4-bit — Google's dense 31B multimodal model (~19 GB weights on H100-80GB).
+# ~61 GB VRAM remaining for KV cache; configured below for the full 128K context (max_model_len = 131072).
+# Uses vLLM's gemma4 tool-call parser for function calling support.
+[vllm.presets.gemma4-31b]
+model = "cyankiwi/gemma-4-31B-it-AWQ-4bit"
+container_name = "vllm_gemma4_31b"
+max_model_len = 131072
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "gemma4"
+
 [vllm.presets.qwen3-coder-next]
 model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
 container_name = "vllm_qwen3"
diff --git a/hypr.fish b/hypr.fish
index 3d3633b..45839e3 100644
--- a/hypr.fish
+++ b/hypr.fish
@@ -5,7 +5,7 @@ abbr hyperstack-delete ruby ~/git/hyperstack/hyperstack.rb delete
 abbr hyperstack-test ruby ~/git/hyperstack/hyperstack.rb test
 
 # Dual-VM setup (hyperstack-vm1/vm2.toml → hyperstack1/2.wg1)
-abbr pi-hyperstack-nemotron pi --model hyperstack1/cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit
-abbr pi-hyperstack-coder pi --model hyperstack2/bullpoint/Qwen3-Coder-Next-AWQ-4bit
+abbr pi-hyperstack-coder pi --model hyperstack1/bullpoint/Qwen3-Coder-Next-AWQ-4bit
+abbr pi-hyperstack-gemma4 pi --model hyperstack2/cyankiwi/gemma-4-31B-it-AWQ-4bit
 abbr hyperstack-create-both ruby ~/git/hyperstack/hyperstack.rb create-both
 abbr hyperstack-delete-both ruby ~/git/hyperstack/hyperstack.rb delete-both
diff --git a/lib/hyperstack/cli.rb b/lib/hyperstack/cli.rb
index f4d1cef..d4679b1 100644
--- a/lib/hyperstack/cli.rb
+++ b/lib/hyperstack/cli.rb
@@ -21,12 +21,12 @@ module HyperstackVM
       puts 'Commands:'
       puts '  create [--replace] [--dry-run] [--vllm|--no-vllm] [--ollama|--no-ollama] [--model PRESET]'
       puts '  create-both [--replace] [--dry-run] [--vllm|--no-vllm] [--ollama|--no-ollama]'
-      puts '               Provision hyperstack-vm1-gptoss.toml and hyperstack-vm2.toml concurrently.'
+      puts '               Provision hyperstack-vm1-coder.toml and hyperstack-vm2.toml concurrently.'
       puts '               WireGuard setup is serialized: VM1 writes the base wg1.conf first,'
       puts '               then VM2 adds its peer. Requires both TOML files next to the script.'
       puts '  delete [--vm-id ID] [--dry-run]'
       puts '  delete-both [--dry-run]'
-      puts '               Delete the VMs tracked by hyperstack-vm1-gptoss.toml and hyperstack-vm2.toml.'
+      puts '               Delete the VMs tracked by hyperstack-vm1-coder.toml and hyperstack-vm2.toml.'
       puts '  status'
       puts '  watch'
       puts '               Poll all active VMs for vLLM and GPU stats every 60 s.'
@@ -237,7 +237,7 @@ module HyperstackVM
 
       candidates = [
         @config_path,
-        File.join(REPO_ROOT, 'hyperstack-vm1-gptoss.toml'),
+        File.join(REPO_ROOT, 'hyperstack-vm1-coder.toml'),
         File.join(REPO_ROOT, 'hyperstack-vm2.toml'),
         File.join(REPO_ROOT, 'hyperstack-vm-photo.toml')
       ].uniq.select { |path| File.exist?(path) }
@@ -249,7 +249,7 @@ module HyperstackVM
 
     def pair_config_loaders
       [
-        ConfigLoader.load(File.join(REPO_ROOT, 'hyperstack-vm1-gptoss.toml')),
+        ConfigLoader.load(File.join(REPO_ROOT, 'hyperstack-vm1-coder.toml')),
         ConfigLoader.load(File.join(REPO_ROOT, 'hyperstack-vm2.toml'))
       ]
     end
diff --git a/lib/hyperstack/watcher.rb b/lib/hyperstack/watcher.rb
index de3d71e..1c126c5 100644
--- a/lib/hyperstack/watcher.rb
+++ b/lib/hyperstack/watcher.rb
@@ -22,11 +22,14 @@ module HyperstackVM
 
     # Snapshot of one VM's stats at a point in time.
     # service_type is :vllm or :comfyui — controls which metrics section is rendered.
+    # loading_status holds the last meaningful log line while vLLM is still initialising;
+    # it is nil once the Engine 0 stats line starts appearing.
     VmSnapshot = Struct.new(
       :label, :wg_host, :service_type,
       :vllm_model, :container_name,
       :metrics, :gpus,
       :vllm_error, :gpu_error,
+      :loading_status,
       :fetched_at,
       keyword_init: true
     )
@@ -78,7 +81,7 @@ module HyperstackVM
                               vllm_model: nil, container_name: nil,
                               metrics: nil, gpus: nil,
                               vllm_error: 'no state file', gpu_error: nil,
-                              fetched_at: Time.now)
+                              loading_status: nil, fetched_at: Time.now)
       end
 
       if config.comfyui_install_enabled?
@@ -91,7 +94,7 @@ module HyperstackVM
                      vllm_model: nil, container_name: nil,
                      metrics: nil, gpus: nil,
                      vllm_error: e.message, gpu_error: nil,
-                     fetched_at: Time.now)
+                     loading_status: nil, fetched_at: Time.now)
     end
 
     # Fetches GPU + vLLM container stats for a vLLM VM.
@@ -99,13 +102,13 @@ module HyperstackVM
       vllm_model     = state['vllm_model'] || config.vllm_model
       container_name = state['vllm_container_name'] || config.vllm_container_name
 
-      gpus, metrics, ssh_error = fetch_vm_stats(config, wg_host, container_name)
+      gpus, metrics, loading_status, ssh_error = fetch_vm_stats(config, wg_host, container_name)
 
       VmSnapshot.new(label: label, wg_host: wg_host, service_type: :vllm,
                      vllm_model: vllm_model, container_name: container_name,
                      metrics: metrics, gpus: gpus,
                      vllm_error: ssh_error, gpu_error: ssh_error,
-                     fetched_at: Time.now)
+                     loading_status: loading_status, fetched_at: Time.now)
     end
 
     # Fetches GPU + ComfyUI queue stats for a ComfyUI VM.
@@ -117,7 +120,7 @@ module HyperstackVM
                      vllm_model: nil, container_name: nil,
                      metrics: metrics, gpus: gpus,
                      vllm_error: ssh_error, gpu_error: ssh_error,
-                     fetched_at: Time.now)
+                     loading_status: nil, fetched_at: Time.now)
     end
 
     def load_state(path)
@@ -167,26 +170,37 @@ module HyperstackVM
     end
 
     # Single SSH call that runs nvidia-smi and tails the vLLM container logs.
-    # The two sections are separated by a sentinel line so we can split them.
-    # Returns [gpus, metrics, error_or_nil].
+    # Captures the Engine 0 stats line (present once the model is running) and,
+    # when that line is absent, the last relevant loading-phase log line so the
+    # watch display can show model-download / weight-load progress.
+    # Returns [gpus, metrics, loading_status, error_or_nil].
     def fetch_vm_stats(config, wg_host, container_name)
       gpu_query = 'index,name,temperature.gpu,utilization.gpu,power.draw,memory.used,memory.total'
-      # --tail 200 instead of --since N so we always get the last stats line
+      # Capture logs once into a shell variable to avoid two docker calls.
+      # --tail 300 instead of --since N so we always get the last stats line
       # even when the VM has been idle for longer than the refresh interval.
-      script    = <<~BASH
+      # grep exit 1 (no match) is swallowed by the pipeline tail -1, which
+      # always succeeds, so bash -se does not abort on an empty grep result.
+      script = <<~BASH
         nvidia-smi --query-gpu=#{gpu_query} --format=csv,noheader,nounits
         echo ===VLLM===
-        docker logs --tail 200 #{container_name} 2>&1 | grep 'Engine 0' | tail -1
+        _logs=$(docker logs --tail 300 #{container_name} 2>&1)
+        echo "$_logs" | grep 'Engine 0' | tail -1
+        echo ===LOADING===
+        echo "$_logs" | grep -E 'Starting to load|Loading model|model weight|Downloading|GPU block|Profil|shard|Initializ|quantiz|AWQ' | tail -1
       BASH
 
       ssh = build_ssh_command(config, wg_host)
       stdout, stderr, status = Timeout.timeout(15) { Open3.capture3(*ssh, stdin_data: script) }
-      return [nil, nil, "exit #{status.exitstatus}: #{stderr.strip}"] unless status.success?
+      return [nil, nil, nil, "exit #{status.exitstatus}: #{stderr.strip}"] unless status.success?
 
-      gpu_section, vllm_section = stdout.split("===VLLM===\n", 2)
+      gpu_section, rest       = stdout.split("===VLLM===\n", 2)
+      vllm_section, load_section = rest.to_s.split("===LOADING===\n", 2)
       gpus    = parse_nvidia_smi(gpu_section.to_s)
       metrics = parse_engine_log_line(vllm_section.to_s.strip)
-      [gpus, metrics, nil]
+      # Only surface the loading line while the engine stats aren't available yet.
+      loading_status = metrics.empty? ? clean_log_line(load_section.to_s.strip) : nil
+      [gpus, metrics, loading_status, nil]
     end
 
     # Parse a vLLM "Engine 0" log line into a plain Hash.
@@ -216,6 +230,14 @@ module HyperstackVM
       m ? m[1].to_f : nil
     end
 
+    # Strips the vLLM log prefix "(<process> pid=N) LEVEL <date> <time> [file.py:NN]"
+    # so only the human-readable message is shown in the watch display.
+    def clean_log_line(line)
+      return line if line.empty?
+
+      line.sub(/^\(.*?pid=\d+\)\s+\w+\s+[\d-]+\s+[\d:]+\s+\[[\w.]+:\d+\]\s*/, '').strip
+    end
+
     # Build an SSH command array for the watcher.
     # Uses accept-new rather than yes because the known-hosts file was populated
     # with the VM's public IP during provisioning; the WireGuard hostname
@@ -330,8 +352,13 @@ module HyperstackVM
           lines.concat(render_comfyui_metrics(snap.metrics))
         elsif snap.metrics&.any?
           lines.concat(render_vllm_metrics(snap.metrics))
-        elsif snap.metrics && snap.metrics.empty?
-          lines << "  #{DIM}(no Engine log line yet — container may still be loading)#{RESET}"
+        elsif snap.metrics
+          # Engine stats not yet available — model is still loading.
+          if snap.loading_status && !snap.loading_status.empty?
+            lines << row('loading', "#{YELLOW}#{snap.loading_status}#{RESET}")
+          else
+            lines << "  #{DIM}(container starting…)#{RESET}"
+          end
         end
       end
 
diff --git a/pi/agent/extensions/loop-scheduler/loop-presets.md b/pi/agent/extensions/loop-scheduler/loop-presets.md
index 61f30d7..8d141fe 100644
--- a/pi/agent/extensions/loop-scheduler/loop-presets.md
+++ b/pi/agent/extensions/loop-scheduler/loop-presets.md
@@ -7,4 +7,5 @@
 # * review:  1h  review the last 10 git commits
 # * monitor: 10m check if there are any errors in the logs
 
-* tasks: 1m automatically start with the next task with fresh context if the current task completed following the agent-task-management skill. 
+* tasks: 1m automatically start with the next task with fresh context if the current task completed following the agent-task-management skill.
+* proceed: 1m proceed
-- 
cgit v1.2.3