From 0664ffcc62b2fb240286fde463635e510a41df84 Mon Sep 17 00:00:00 2001 From: Paul Buetow Date: Mon, 6 Apr 2026 11:02:43 +0300 Subject: hyperstack: switch to Gemma 4 31B on VM2, Qwen3-Coder-Next on VM1 VM1 (hyperstack-vm1-coder.toml, renamed from hyperstack-vm1-gptoss.toml): - Default model switched from gpt-oss-120b to qwen3-coder-next - Config file renamed to reflect actual default model VM2 (hyperstack-vm2.toml): - Default model switched from qwen3-coder-next to Gemma 4 31B AWQ - Uses vLLM nightly image + transformers==5.5.0 workaround: Gemma 4 architecture is registered in transformers 5.x but vLLM stable pins <5 - max_model_len=131072 (128K context); KV cache fills ~95% of H100-80GB VRAM - Added gemma4-31b preset watcher.rb: - Add loading_status field to VmSnapshot to show live model-load progress (last relevant log line during startup instead of generic "loading" message) - fetch_vm_stats now captures both Engine 0 stats and loading-phase log lines in a single SSH call using a shell variable to avoid two docker log invocations - clean_log_line() strips vLLM PID/timestamp prefix for readable display cli.rb: update all hardcoded hyperstack-vm1-gptoss.toml references to hyperstack-vm1-coder.toml hypr.fish: replace pi-hyperstack-nemotron with pi-hyperstack-coder (VM1), add pi-hyperstack-gemma4 (VM2) Co-Authored-By: Claude Sonnet 4.6 --- hyperstack-vm1-coder.toml | 188 +++++++++++++++++++++ hyperstack-vm1-gptoss.toml | 182 -------------------- hyperstack-vm2.toml | 37 +++- hypr.fish | 4 +- lib/hyperstack/cli.rb | 8 +- lib/hyperstack/watcher.rb | 57 +++++-- pi/agent/extensions/loop-scheduler/loop-presets.md | 3 +- 7 files changed, 266 insertions(+), 213 deletions(-) create mode 100644 hyperstack-vm1-coder.toml delete mode 100644 hyperstack-vm1-gptoss.toml diff --git a/hyperstack-vm1-coder.toml b/hyperstack-vm1-coder.toml new file mode 100644 index 0000000..cd127dd --- /dev/null +++ b/hyperstack-vm1-coder.toml @@ -0,0 +1,188 @@ +[auth] +api_key_file = "~/.hyperstack" 
+ +[hyperstack] +base_url = "https://infrahub-api.nexgencloud.com/v1" + +[state] +# Separate state file for VM1 so vm1 and vm2 can be managed independently. +file = ".hyperstack-vm1-state.json" + +[vm] +name_prefix = "hyperstack1" +hostname = "hyperstack1" +environment_name = "snonux-ollama" + +# H100-80GB single GPU for qwen3-coder-next (default); other models available via presets. +flavor_name = "n3-H100x1" +image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker" +assign_floating_ip = true +create_bootable_volume = false +enable_port_randomization = false +labels = ["qwen3-coder-next", "wireguard"] + +[ssh] +username = "ubuntu" +private_key_path = "~/.ssh/id_rsa" +hyperstack_key_name = "earth" +port = 22 +connect_timeout_sec = 10 + +[network] +wireguard_udp_port = 56710 +wireguard_subnet = "192.168.3.0/24" +# VM1 gets the first server-side WireGuard IP (gateway address + 0). +# earth (client) is 192.168.3.2; VM1 is 192.168.3.1; VM2 is 192.168.3.3. +wireguard_server_ip = "192.168.3.1" +# Secure default: "auto" resolves your current public egress IP to /32 at runtime. +# Override with explicit CIDRs if you deploy from multiple networks or want broader access. +allowed_ssh_cidrs = ["auto"] +allowed_wireguard_cidrs = ["auto"] +# Port 11434 is shared by both Ollama and vLLM for firewall compatibility. +ollama_port = 11434 + +[bootstrap] +enable_guest_bootstrap = true +install_wireguard = true +configure_ufw = true +configure_ollama_host = false + +[ollama] +# Disabled in favour of vLLM; set install = true to switch back to Ollama. +install = false +models_dir = "/ephemeral/ollama/models" +listen_host = "0.0.0.0:11434" +gpu_overhead_mb = 2000 +num_parallel = 1 +context_length = 32768 +pull_models = ["qwen3-coder-next", "qwen3-coder:30b", "gpt-oss:20b", "gpt-oss:120b", "nemotron-3-super"] + +# vLLM serves one model via Docker on the OpenAI-compatible API. +# VM1 defaults to qwen3-coder-next; use 'model switch' to load any other preset. 
+[vllm] +install = true +model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit" +# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers). +hug_cache_dir = "/ephemeral/hug" +container_name = "vllm_qwen3" +max_model_len = 262144 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "qwen3_coder" + +# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm1-coder.toml model switch '. +# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults. + +[vllm.presets.qwen3-coder-next] +model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit" +container_name = "vllm_qwen3" +max_model_len = 262144 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "qwen3_coder" + +# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total). +# Single-GPU (A100-80GB) config: tensor_parallel_size=1, context capped at 32k to fit in VRAM. +# Model weights occupy ~73.6 GiB of the 79.25 GiB A100; very little VRAM remains for KV cache. +# enforce_eager=true disables CUDA graph capture, which avoids the large profiling-phase OOM. +# gpu_memory_utilization=0.98 lets vLLM use nearly all available VRAM for KV blocks. +# max_model_len reduced to 32768 to keep the KV cache footprint small enough to fit. +[vllm.presets.nemotron-super] +model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit" +container_name = "vllm_nemotron_super" +max_model_len = 32768 +gpu_memory_utilization = 0.98 +tensor_parallel_size = 1 +tool_call_parser = "qwen3_xml" +trust_remote_code = true +enable_prefix_caching = false +extra_docker_env = ["VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "PYTORCH_ALLOC_CONF=expandable_segments:True"] +extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"] + +# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100. 
+[vllm.presets.gpt-oss-20b] +model = "openai/gpt-oss-20b" +container_name = "vllm_gpt_oss_20b" +max_model_len = 65536 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "" + +# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100. +# Hard architecture limit: max_position_embeddings=131072 in model config.json. +[vllm.presets.gpt-oss-120b] +model = "openai/gpt-oss-120b" +container_name = "vllm_gpt_oss_120b" +max_model_len = 131072 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "" +extra_vllm_args = ["--reasoning-parser", "openai_gptoss"] + +# Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100. +[vllm.presets.qwen25-coder-32b] +model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ" +container_name = "vllm_qwen25_coder32b" +max_model_len = 32768 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "hermes" + +# Qwen3-Coder-30B-A3B AWQ — Qwen3 generation coding MoE (3B active / 30B total), ~18 GB. +[vllm.presets.qwen3-coder-30b] +model = "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ" +container_name = "vllm_qwen3_coder30b" +max_model_len = 65536 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "qwen3_coder" + +# DeepSeek-R1-Distill-Qwen-32B AWQ — R1 reasoning distillation of Qwen 32B, ~18 GB on A100. +[vllm.presets.deepseek-r1-32b] +model = "casperhansen/deepseek-r1-distill-qwen-32b-awq" +container_name = "vllm_deepseek_r1_32b" +max_model_len = 32768 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "" +extra_vllm_args = ["--reasoning-parser", "deepseek_r1"] + +# Qwen3-32B AWQ — dense 32B reasoning model with extended context, ~18 GB on A100. 
+[vllm.presets.qwen3-32b] +model = "Qwen/Qwen3-32B-AWQ" +container_name = "vllm_qwen3_32b" +max_model_len = 32768 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "" +extra_vllm_args = ["--reasoning-parser", "deepseek_r1"] + +# Devstral-Small-2507 AWQ — Mistral's coding agent model (~15 GB on A100). +[vllm.presets.devstral] +model = "cyankiwi/Devstral-Small-2507-AWQ-4bit" +container_name = "vllm_devstral" +max_model_len = 32768 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "mistral" +extra_vllm_args = ["--tokenizer_mode", "mistral", "--config_format", "mistral"] + +# Gemma 4 31B AWQ 4-bit — Google's dense 31B multimodal model (~19 GB weights on A100-80GB). +# ~61 GB VRAM remaining for KV cache; supports up to ~32K context comfortably. +# Uses vLLM's gemma4 tool-call parser for function calling support. +[vllm.presets.gemma4-31b] +model = "cyankiwi/gemma-4-31B-it-AWQ-4bit" +container_name = "vllm_gemma4_31b" +max_model_len = 32768 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "gemma4" + +[wireguard] +auto_setup = true +setup_script = "./wg1-setup.sh" + +[local_client] +check_wg1_service = true +interface_name = "wg1" +config_path = "/etc/wireguard/wg1.conf" diff --git a/hyperstack-vm1-gptoss.toml b/hyperstack-vm1-gptoss.toml deleted file mode 100644 index af25248..0000000 --- a/hyperstack-vm1-gptoss.toml +++ /dev/null @@ -1,182 +0,0 @@ -[auth] -api_key_file = "~/.hyperstack" - -[hyperstack] -base_url = "https://infrahub-api.nexgencloud.com/v1" - -[state] -# Separate state file for VM1 so vm1 and vm2 can be managed independently. 
-file = ".hyperstack-vm1-state.json" - -[vm] -name_prefix = "hyperstack1" -hostname = "hyperstack1" -environment_name = "snonux-ollama" - -# A100-80GB single GPU for gpt-oss-120b -flavor_name = "n3-A100x1" -image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker" -assign_floating_ip = true -create_bootable_volume = false -enable_port_randomization = false -labels = ["gpt-oss-120b", "wireguard"] - -[ssh] -username = "ubuntu" -private_key_path = "~/.ssh/id_rsa" -hyperstack_key_name = "earth" -port = 22 -connect_timeout_sec = 10 - -[network] -wireguard_udp_port = 56710 -wireguard_subnet = "192.168.3.0/24" -# VM1 gets the first server-side WireGuard IP (gateway address + 0). -# earth (client) is 192.168.3.2; VM1 is 192.168.3.1; VM2 is 192.168.3.3. -wireguard_server_ip = "192.168.3.1" -# Secure default: "auto" resolves your current public egress IP to /32 at runtime. -# Override with explicit CIDRs if you deploy from multiple networks or want broader access. -allowed_ssh_cidrs = ["auto"] -allowed_wireguard_cidrs = ["auto"] -# Port 11434 is shared by both Ollama and vLLM for firewall compatibility. -ollama_port = 11434 - -[bootstrap] -enable_guest_bootstrap = true -install_wireguard = true -configure_ufw = true -configure_ollama_host = false - -[ollama] -# Disabled in favour of vLLM; set install = true to switch back to Ollama. -install = false -models_dir = "/ephemeral/ollama/models" -listen_host = "0.0.0.0:11434" -gpu_overhead_mb = 2000 -num_parallel = 1 -context_length = 32768 -pull_models = ["qwen3-coder-next", "qwen3-coder:30b", "gpt-oss:20b", "gpt-oss:120b", "nemotron-3-super"] - -# vLLM serves one model via Docker on the OpenAI-compatible API. -[vllm] -install = true -model = "openai/gpt-oss-120b" -# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers). -hug_cache_dir = "/ephemeral/hug" -container_name = "vllm_gpt_oss_120b" -# Hard architecture limit: max_position_embeddings=131072 in model config.json. 
-max_model_len = 131072 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -# tool_call_parser="" disables --enable-auto-tool-choice; the llama3_json parser crashes -# on gpt-oss responses (vLLM 0.17.1 adds token_ids to responses, breaking the parser API). -tool_call_parser = "" -# gpt-oss-120b is a reasoning model (o-series architecture); the openai_gptoss parser -# extracts <|channel|>analysis…<|end|> thinking blocks into reasoning_content in the response. -extra_vllm_args = ["--reasoning-parser", "openai_gptoss"] - -# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm1-gptoss.toml model switch '. -# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults. - -[vllm.presets.qwen3-coder-next] -model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit" -container_name = "vllm_qwen3" -max_model_len = 262144 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_coder" - -# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total). -# Single-GPU (A100-80GB) config: tensor_parallel_size=1, context capped at 32k to fit in VRAM. -# Model weights occupy ~73.6 GiB of the 79.25 GiB A100; very little VRAM remains for KV cache. -# enforce_eager=true disables CUDA graph capture, which avoids the large profiling-phase OOM. -# gpu_memory_utilization=0.98 lets vLLM use nearly all available VRAM for KV blocks. -# max_model_len reduced to 32768 to keep the KV cache footprint small enough to fit. 
-[vllm.presets.nemotron-super] -model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit" -container_name = "vllm_nemotron_super" -max_model_len = 32768 -gpu_memory_utilization = 0.98 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_xml" -trust_remote_code = true -enable_prefix_caching = false -extra_docker_env = ["VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "PYTORCH_ALLOC_CONF=expandable_segments:True"] -extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"] - -# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100. -[vllm.presets.gpt-oss-20b] -model = "openai/gpt-oss-20b" -container_name = "vllm_gpt_oss_20b" -max_model_len = 65536 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" - -# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100. -# Hard architecture limit: max_position_embeddings=131072 in model config.json. -[vllm.presets.gpt-oss-120b] -model = "openai/gpt-oss-120b" -container_name = "vllm_gpt_oss_120b" -max_model_len = 131072 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" -extra_vllm_args = ["--reasoning-parser", "openai_gptoss"] - -# Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100. -[vllm.presets.qwen25-coder-32b] -model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ" -container_name = "vllm_qwen25_coder32b" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "hermes" - -# Qwen3-Coder-30B-A3B AWQ — Qwen3 generation coding MoE (3B active / 30B total), ~18 GB. -[vllm.presets.qwen3-coder-30b] -model = "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ" -container_name = "vllm_qwen3_coder30b" -max_model_len = 65536 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_coder" - -# DeepSeek-R1-Distill-Qwen-32B AWQ — R1 reasoning distillation of Qwen 32B, ~18 GB on A100. 
-[vllm.presets.deepseek-r1-32b] -model = "casperhansen/deepseek-r1-distill-qwen-32b-awq" -container_name = "vllm_deepseek_r1_32b" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" -extra_vllm_args = ["--reasoning-parser", "deepseek_r1"] - -# Qwen3-32B AWQ — dense 32B reasoning model with extended context, ~18 GB on A100. -[vllm.presets.qwen3-32b] -model = "Qwen/Qwen3-32B-AWQ" -container_name = "vllm_qwen3_32b" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" -extra_vllm_args = ["--reasoning-parser", "deepseek_r1"] - -# Devstral-Small-2507 AWQ — Mistral's coding agent model (~15 GB on A100). -[vllm.presets.devstral] -model = "cyankiwi/Devstral-Small-2507-AWQ-4bit" -container_name = "vllm_devstral" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "mistral" -extra_vllm_args = ["--tokenizer_mode", "mistral", "--config_format", "mistral"] - -[wireguard] -auto_setup = true -setup_script = "./wg1-setup.sh" - -[local_client] -check_wg1_service = true -interface_name = "wg1" -config_path = "/etc/wireguard/wg1.conf" diff --git a/hyperstack-vm2.toml b/hyperstack-vm2.toml index 32e3a99..bed09a1 100644 --- a/hyperstack-vm2.toml +++ b/hyperstack-vm2.toml @@ -13,14 +13,13 @@ name_prefix = "hyperstack2" hostname = "hyperstack2" environment_name = "snonux-ollama" -# A100-80GB is the cost-first default for qwen3-coder-next inference. -# Switch this to n3-H100x1 if you want safer throughput and compatibility headroom. -flavor_name = "n3-A100x1" +# H100-80GB for Gemma 4 31B inference; switched from n3-A100x1 (out of stock). 
+flavor_name = "n3-H100x1" image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker" assign_floating_ip = true create_bootable_volume = false enable_port_randomization = false -labels = ["qwen3-coder-next", "wireguard"] +labels = ["gemma4-31b", "wireguard"] [ssh] username = "ubuntu" @@ -59,21 +58,41 @@ context_length = 32768 pull_models = ["qwen3-coder-next"] # vLLM serves one model via Docker on the OpenAI-compatible API. -# VM2 defaults to qwen3-coder-next; use 'model switch' to load any other preset. +# VM2 defaults to Gemma 4 31B; use 'model switch' to load any other preset. +# NOTE: Gemma 4 requires transformers>=5.0 but vLLM stable pins transformers<5. +# Workaround: use the vLLM nightly image and force-install transformers 5.5.0 at startup. +# Remove the docker_image and pre_start_cmd overrides once vLLM stable adds Gemma 4 support. [vllm] install = true -model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit" +model = "cyankiwi/gemma-4-31B-it-AWQ-4bit" # HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers). hug_cache_dir = "/ephemeral/hug" -container_name = "vllm_qwen3" -max_model_len = 262144 +container_name = "vllm_gemma4_31b" +# Gemma 4 31B AWQ 4-bit: ~19 GB weights, ~61 GB VRAM remaining for KV cache on H100-80GB. +# 131072 = Gemma 4's architectural max (128K context); KV cache auto-scales to fit. +max_model_len = 131072 gpu_memory_utilization = 0.92 tensor_parallel_size = 1 -tool_call_parser = "qwen3_coder" +tool_call_parser = "gemma4" +# Use nightly image: stable 0.19.0 lacks Gemma 4 architecture support in its bundled transformers. +docker_image = "vllm/vllm-openai:nightly" +# Upgrade transformers to 5.x (Gemma 4 arch added there) before starting vLLM. +pre_start_cmd = "pip install -q transformers==5.5.0 2>/dev/null" # Named model presets for 'ruby hyperstack.rb --config hyperstack-vm2.toml model switch '. # Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults. 
+# Gemma 4 31B AWQ 4-bit — Google's dense 31B multimodal model (~19 GB weights on H100-80GB). +# ~61 GB VRAM remaining for KV cache; max_model_len below is 131072 (128K context), matching the [vllm] default. +# Uses vLLM's gemma4 tool-call parser for function calling support. +[vllm.presets.gemma4-31b] +model = "cyankiwi/gemma-4-31B-it-AWQ-4bit" +container_name = "vllm_gemma4_31b" +max_model_len = 131072 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "gemma4" + [vllm.presets.qwen3-coder-next] model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit" container_name = "vllm_qwen3" diff --git a/hypr.fish b/hypr.fish index 3d3633b..45839e3 100644 --- a/hypr.fish +++ b/hypr.fish @@ -5,7 +5,7 @@ abbr hyperstack-delete ruby ~/git/hyperstack/hyperstack.rb delete abbr hyperstack-test ruby ~/git/hyperstack/hyperstack.rb test # Dual-VM setup (hyperstack-vm1/vm2.toml → hyperstack1/2.wg1) -abbr pi-hyperstack-nemotron pi --model hyperstack1/cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit -abbr pi-hyperstack-coder pi --model hyperstack2/bullpoint/Qwen3-Coder-Next-AWQ-4bit +abbr pi-hyperstack-coder pi --model hyperstack1/bullpoint/Qwen3-Coder-Next-AWQ-4bit +abbr pi-hyperstack-gemma4 pi --model hyperstack2/cyankiwi/gemma-4-31B-it-AWQ-4bit abbr hyperstack-create-both ruby ~/git/hyperstack/hyperstack.rb create-both abbr hyperstack-delete-both ruby ~/git/hyperstack/hyperstack.rb delete-both diff --git a/lib/hyperstack/cli.rb b/lib/hyperstack/cli.rb index f4d1cef..d4679b1 100644 --- a/lib/hyperstack/cli.rb +++ b/lib/hyperstack/cli.rb @@ -21,12 +21,12 @@ module HyperstackVM puts 'Commands:' puts ' create [--replace] [--dry-run] [--vllm|--no-vllm] [--ollama|--no-ollama] [--model PRESET]' puts ' create-both [--replace] [--dry-run] [--vllm|--no-vllm] [--ollama|--no-ollama]' - puts ' Provision hyperstack-vm1-gptoss.toml and hyperstack-vm2.toml concurrently.' + puts ' Provision hyperstack-vm1-coder.toml and hyperstack-vm2.toml concurrently.' 
puts ' WireGuard setup is serialized: VM1 writes the base wg1.conf first,' puts ' then VM2 adds its peer. Requires both TOML files next to the script.' puts ' delete [--vm-id ID] [--dry-run]' puts ' delete-both [--dry-run]' - puts ' Delete the VMs tracked by hyperstack-vm1-gptoss.toml and hyperstack-vm2.toml.' + puts ' Delete the VMs tracked by hyperstack-vm1-coder.toml and hyperstack-vm2.toml.' puts ' status' puts ' watch' puts ' Poll all active VMs for vLLM and GPU stats every 60 s.' @@ -237,7 +237,7 @@ module HyperstackVM candidates = [ @config_path, - File.join(REPO_ROOT, 'hyperstack-vm1-gptoss.toml'), + File.join(REPO_ROOT, 'hyperstack-vm1-coder.toml'), File.join(REPO_ROOT, 'hyperstack-vm2.toml'), File.join(REPO_ROOT, 'hyperstack-vm-photo.toml') ].uniq.select { |path| File.exist?(path) } @@ -249,7 +249,7 @@ module HyperstackVM def pair_config_loaders [ - ConfigLoader.load(File.join(REPO_ROOT, 'hyperstack-vm1-gptoss.toml')), + ConfigLoader.load(File.join(REPO_ROOT, 'hyperstack-vm1-coder.toml')), ConfigLoader.load(File.join(REPO_ROOT, 'hyperstack-vm2.toml')) ] end diff --git a/lib/hyperstack/watcher.rb b/lib/hyperstack/watcher.rb index de3d71e..1c126c5 100644 --- a/lib/hyperstack/watcher.rb +++ b/lib/hyperstack/watcher.rb @@ -22,11 +22,14 @@ module HyperstackVM # Snapshot of one VM's stats at a point in time. # service_type is :vllm or :comfyui — controls which metrics section is rendered. + # loading_status holds the last meaningful log line while vLLM is still initialising; + # it is nil once the Engine 0 stats line starts appearing. 
VmSnapshot = Struct.new( :label, :wg_host, :service_type, :vllm_model, :container_name, :metrics, :gpus, :vllm_error, :gpu_error, + :loading_status, :fetched_at, keyword_init: true ) @@ -78,7 +81,7 @@ module HyperstackVM vllm_model: nil, container_name: nil, metrics: nil, gpus: nil, vllm_error: 'no state file', gpu_error: nil, - fetched_at: Time.now) + loading_status: nil, fetched_at: Time.now) end if config.comfyui_install_enabled? @@ -91,7 +94,7 @@ module HyperstackVM vllm_model: nil, container_name: nil, metrics: nil, gpus: nil, vllm_error: e.message, gpu_error: nil, - fetched_at: Time.now) + loading_status: nil, fetched_at: Time.now) end # Fetches GPU + vLLM container stats for a vLLM VM. @@ -99,13 +102,13 @@ module HyperstackVM vllm_model = state['vllm_model'] || config.vllm_model container_name = state['vllm_container_name'] || config.vllm_container_name - gpus, metrics, ssh_error = fetch_vm_stats(config, wg_host, container_name) + gpus, metrics, loading_status, ssh_error = fetch_vm_stats(config, wg_host, container_name) VmSnapshot.new(label: label, wg_host: wg_host, service_type: :vllm, vllm_model: vllm_model, container_name: container_name, metrics: metrics, gpus: gpus, vllm_error: ssh_error, gpu_error: ssh_error, - fetched_at: Time.now) + loading_status: loading_status, fetched_at: Time.now) end # Fetches GPU + ComfyUI queue stats for a ComfyUI VM. @@ -117,7 +120,7 @@ module HyperstackVM vllm_model: nil, container_name: nil, metrics: metrics, gpus: gpus, vllm_error: ssh_error, gpu_error: ssh_error, - fetched_at: Time.now) + loading_status: nil, fetched_at: Time.now) end def load_state(path) @@ -167,26 +170,37 @@ module HyperstackVM end # Single SSH call that runs nvidia-smi and tails the vLLM container logs. - # The two sections are separated by a sentinel line so we can split them. - # Returns [gpus, metrics, error_or_nil]. 
+ # Captures the Engine 0 stats line (present once the model is running) and, + # when that line is absent, the last relevant loading-phase log line so the + # watch display can show model-download / weight-load progress. + # Returns [gpus, metrics, loading_status, error_or_nil]. def fetch_vm_stats(config, wg_host, container_name) gpu_query = 'index,name,temperature.gpu,utilization.gpu,power.draw,memory.used,memory.total' - # --tail 200 instead of --since N so we always get the last stats line + # Capture logs once into a shell variable to avoid two docker calls. + # --tail 300 instead of --since N so we always get the last stats line # even when the VM has been idle for longer than the refresh interval. - script = <<~BASH + # grep exit 1 (no match) is swallowed by the pipeline tail -1, which + # always succeeds, so bash -se does not abort on an empty grep result. + script = <<~BASH nvidia-smi --query-gpu=#{gpu_query} --format=csv,noheader,nounits echo ===VLLM=== - docker logs --tail 200 #{container_name} 2>&1 | grep 'Engine 0' | tail -1 + _logs=$(docker logs --tail 300 #{container_name} 2>&1) + echo "$_logs" | grep 'Engine 0' | tail -1 + echo ===LOADING=== + echo "$_logs" | grep -E 'Starting to load|Loading model|model weight|Downloading|GPU block|Profil|shard|Initializ|quantiz|AWQ' | tail -1 BASH ssh = build_ssh_command(config, wg_host) stdout, stderr, status = Timeout.timeout(15) { Open3.capture3(*ssh, stdin_data: script) } - return [nil, nil, "exit #{status.exitstatus}: #{stderr.strip}"] unless status.success? + return [nil, nil, nil, "exit #{status.exitstatus}: #{stderr.strip}"] unless status.success? 
- gpu_section, vllm_section = stdout.split("===VLLM===\n", 2) + gpu_section, rest = stdout.split("===VLLM===\n", 2) + vllm_section, load_section = rest.to_s.split("===LOADING===\n", 2) gpus = parse_nvidia_smi(gpu_section.to_s) metrics = parse_engine_log_line(vllm_section.to_s.strip) - [gpus, metrics, nil] + # Only surface the loading line while the engine stats aren't available yet. + loading_status = metrics.empty? ? clean_log_line(load_section.to_s.strip) : nil + [gpus, metrics, loading_status, nil] end # Parse a vLLM "Engine 0" log line into a plain Hash. @@ -216,6 +230,14 @@ module HyperstackVM m ? m[1].to_f : nil end + # Strips the vLLM log prefix "(EngineCore pid=N) INFO YYYY-MM-DD HH:MM:SS [file.py:NN]" + # so only the human-readable message is shown in the watch display. + def clean_log_line(line) + return line if line.empty? + + line.sub(/^\(.*?pid=\d+\)\s+\w+\s+[\d-]+\s+[\d:]+\s+\[[\w.]+:\d+\]\s*/, '').strip + end + # Build an SSH command array for the watcher. # Uses accept-new rather than yes because the known-hosts file was populated # with the VM's public IP during provisioning; the WireGuard hostname @@ -330,8 +352,13 @@ module HyperstackVM lines.concat(render_comfyui_metrics(snap.metrics)) elsif snap.metrics&.any? lines.concat(render_vllm_metrics(snap.metrics)) - elsif snap.metrics && snap.metrics.empty? - lines << " #{DIM}(no Engine log line yet — container may still be loading)#{RESET}" + elsif snap.metrics + # Engine stats not yet available — model is still loading. + if snap.loading_status && !snap.loading_status.empty? 
+ lines << row('loading', "#{YELLOW}#{snap.loading_status}#{RESET}") + else + lines << " #{DIM}(container starting…)#{RESET}" + end end end diff --git a/pi/agent/extensions/loop-scheduler/loop-presets.md b/pi/agent/extensions/loop-scheduler/loop-presets.md index 61f30d7..8d141fe 100644 --- a/pi/agent/extensions/loop-scheduler/loop-presets.md +++ b/pi/agent/extensions/loop-scheduler/loop-presets.md @@ -7,4 +7,5 @@ # * review: 1h review the last 10 git commits # * monitor: 10m check if there are any errors in the logs -* tasks: 1m automatically start with the next task with fresh context if the current task completed following the agent-task-management skill. +* tasks: 1m automatically start with the next task with fresh context if the current task completed following the agent-task-management skill. +* proceed: 1m proceed -- cgit v1.2.3