summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- hyperstack-vm1-coder.toml (renamed from hyperstack-vm1-gptoss.toml) 32
-rw-r--r-- hyperstack-vm2.toml 37
-rw-r--r-- hypr.fish 4
-rw-r--r-- lib/hyperstack/cli.rb 8
-rw-r--r-- lib/hyperstack/watcher.rb 57
-rw-r--r-- pi/agent/extensions/loop-scheduler/loop-presets.md 3
6 files changed, 97 insertions, 44 deletions
diff --git a/hyperstack-vm1-gptoss.toml b/hyperstack-vm1-coder.toml
index af25248..cd127dd 100644
--- a/hyperstack-vm1-gptoss.toml
+++ b/hyperstack-vm1-coder.toml
@@ -13,13 +13,13 @@ name_prefix = "hyperstack1"
hostname = "hyperstack1"
environment_name = "snonux-ollama"
-# A100-80GB single GPU for gpt-oss-120b
-flavor_name = "n3-A100x1"
+# H100-80GB single GPU for qwen3-coder-next (default); other models available via presets.
+flavor_name = "n3-H100x1"
image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
assign_floating_ip = true
create_bootable_volume = false
enable_port_randomization = false
-labels = ["gpt-oss-120b", "wireguard"]
+labels = ["qwen3-coder-next", "wireguard"]
[ssh]
username = "ubuntu"
@@ -58,22 +58,17 @@ context_length = 32768
pull_models = ["qwen3-coder-next", "qwen3-coder:30b", "gpt-oss:20b", "gpt-oss:120b", "nemotron-3-super"]
# vLLM serves one model via Docker on the OpenAI-compatible API.
+# VM1 defaults to qwen3-coder-next; use 'model switch' to load any other preset.
[vllm]
install = true
-model = "openai/gpt-oss-120b"
+model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
hug_cache_dir = "/ephemeral/hug"
-container_name = "vllm_gpt_oss_120b"
-# Hard architecture limit: max_position_embeddings=131072 in model config.json.
-max_model_len = 131072
+container_name = "vllm_qwen3"
+max_model_len = 262144
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
-# tool_call_parser="" disables --enable-auto-tool-choice; the llama3_json parser crashes
-# on gpt-oss responses (vLLM 0.17.1 adds token_ids to responses, breaking the parser API).
-tool_call_parser = ""
-# gpt-oss-120b is a reasoning model (o-series architecture); the openai_gptoss parser
-# extracts <|channel|>analysis…<|end|> thinking blocks into reasoning_content in the response.
-extra_vllm_args = ["--reasoning-parser", "openai_gptoss"]
+tool_call_parser = "qwen3_coder"
# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm1-gptoss.toml model switch <name>'.
# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.
@@ -172,6 +167,17 @@ tensor_parallel_size = 1
tool_call_parser = "mistral"
extra_vllm_args = ["--tokenizer_mode", "mistral", "--config_format", "mistral"]
+# Gemma 4 31B AWQ 4-bit — Google's dense 31B multimodal model (~19 GB weights on H100-80GB).
+# ~61 GB VRAM remaining for KV cache; supports up to ~32K context comfortably.
+# Uses vLLM's gemma4 tool-call parser for function calling support.
+[vllm.presets.gemma4-31b]
+model = "cyankiwi/gemma-4-31B-it-AWQ-4bit"
+container_name = "vllm_gemma4_31b"
+max_model_len = 32768
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "gemma4"
+
[wireguard]
auto_setup = true
setup_script = "./wg1-setup.sh"
diff --git a/hyperstack-vm2.toml b/hyperstack-vm2.toml
index 32e3a99..bed09a1 100644
--- a/hyperstack-vm2.toml
+++ b/hyperstack-vm2.toml
@@ -13,14 +13,13 @@ name_prefix = "hyperstack2"
hostname = "hyperstack2"
environment_name = "snonux-ollama"
-# A100-80GB is the cost-first default for qwen3-coder-next inference.
-# Switch this to n3-H100x1 if you want safer throughput and compatibility headroom.
-flavor_name = "n3-A100x1"
+# H100-80GB for Gemma 4 31B inference; switched from n3-A100x1 (out of stock).
+flavor_name = "n3-H100x1"
image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
assign_floating_ip = true
create_bootable_volume = false
enable_port_randomization = false
-labels = ["qwen3-coder-next", "wireguard"]
+labels = ["gemma4-31b", "wireguard"]
[ssh]
username = "ubuntu"
@@ -59,21 +58,41 @@ context_length = 32768
pull_models = ["qwen3-coder-next"]
# vLLM serves one model via Docker on the OpenAI-compatible API.
-# VM2 defaults to qwen3-coder-next; use 'model switch' to load any other preset.
+# VM2 defaults to Gemma 4 31B; use 'model switch' to load any other preset.
+# NOTE: Gemma 4 requires transformers>=5.0 but vLLM stable pins transformers<5.
+# Workaround: use the vLLM nightly image and force-install transformers 5.5.0 at startup.
+# Remove the docker_image and pre_start_cmd overrides once vLLM stable adds Gemma 4 support.
[vllm]
install = true
-model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
+model = "cyankiwi/gemma-4-31B-it-AWQ-4bit"
# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
hug_cache_dir = "/ephemeral/hug"
-container_name = "vllm_qwen3"
-max_model_len = 262144
+container_name = "vllm_gemma4_31b"
+# Gemma 4 31B AWQ 4-bit: ~19 GB weights, ~61 GB VRAM remaining for KV cache on H100-80GB.
+# 131072 = Gemma 4's architectural max (128K context); KV cache auto-scales to fit.
+max_model_len = 131072
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
-tool_call_parser = "qwen3_coder"
+tool_call_parser = "gemma4"
+# Use nightly image: stable 0.19.0 lacks Gemma 4 architecture support in its bundled transformers.
+docker_image = "vllm/vllm-openai:nightly"
+# Upgrade transformers to 5.x (Gemma 4 arch added there) before starting vLLM.
+pre_start_cmd = "pip install -q transformers==5.5.0 2>/dev/null"
# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm2.toml model switch <name>'.
# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.
+# Gemma 4 31B AWQ 4-bit — Google's dense 31B multimodal model (~19 GB weights on H100-80GB).
+# ~61 GB VRAM remaining for KV cache; this preset runs at Gemma 4's 128K (131072-token) architectural max.
+# Uses vLLM's gemma4 tool-call parser for function calling support.
+[vllm.presets.gemma4-31b]
+model = "cyankiwi/gemma-4-31B-it-AWQ-4bit"
+container_name = "vllm_gemma4_31b"
+max_model_len = 131072
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "gemma4"
+
[vllm.presets.qwen3-coder-next]
model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
container_name = "vllm_qwen3"
diff --git a/hypr.fish b/hypr.fish
index 3d3633b..45839e3 100644
--- a/hypr.fish
+++ b/hypr.fish
@@ -5,7 +5,7 @@ abbr hyperstack-delete ruby ~/git/hyperstack/hyperstack.rb delete
abbr hyperstack-test ruby ~/git/hyperstack/hyperstack.rb test
# Dual-VM setup (hyperstack-vm1/vm2.toml → hyperstack1/2.wg1)
-abbr pi-hyperstack-nemotron pi --model hyperstack1/cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit
-abbr pi-hyperstack-coder pi --model hyperstack2/bullpoint/Qwen3-Coder-Next-AWQ-4bit
+abbr pi-hyperstack-coder pi --model hyperstack1/bullpoint/Qwen3-Coder-Next-AWQ-4bit
+abbr pi-hyperstack-gemma4 pi --model hyperstack2/cyankiwi/gemma-4-31B-it-AWQ-4bit
abbr hyperstack-create-both ruby ~/git/hyperstack/hyperstack.rb create-both
abbr hyperstack-delete-both ruby ~/git/hyperstack/hyperstack.rb delete-both
diff --git a/lib/hyperstack/cli.rb b/lib/hyperstack/cli.rb
index f4d1cef..d4679b1 100644
--- a/lib/hyperstack/cli.rb
+++ b/lib/hyperstack/cli.rb
@@ -21,12 +21,12 @@ module HyperstackVM
puts 'Commands:'
puts ' create [--replace] [--dry-run] [--vllm|--no-vllm] [--ollama|--no-ollama] [--model PRESET]'
puts ' create-both [--replace] [--dry-run] [--vllm|--no-vllm] [--ollama|--no-ollama]'
- puts ' Provision hyperstack-vm1-gptoss.toml and hyperstack-vm2.toml concurrently.'
+ puts ' Provision hyperstack-vm1-coder.toml and hyperstack-vm2.toml concurrently.'
puts ' WireGuard setup is serialized: VM1 writes the base wg1.conf first,'
puts ' then VM2 adds its peer. Requires both TOML files next to the script.'
puts ' delete [--vm-id ID] [--dry-run]'
puts ' delete-both [--dry-run]'
- puts ' Delete the VMs tracked by hyperstack-vm1-gptoss.toml and hyperstack-vm2.toml.'
+ puts ' Delete the VMs tracked by hyperstack-vm1-coder.toml and hyperstack-vm2.toml.'
puts ' status'
puts ' watch'
puts ' Poll all active VMs for vLLM and GPU stats every 60 s.'
@@ -237,7 +237,7 @@ module HyperstackVM
candidates = [
@config_path,
- File.join(REPO_ROOT, 'hyperstack-vm1-gptoss.toml'),
+ File.join(REPO_ROOT, 'hyperstack-vm1-coder.toml'),
File.join(REPO_ROOT, 'hyperstack-vm2.toml'),
File.join(REPO_ROOT, 'hyperstack-vm-photo.toml')
].uniq.select { |path| File.exist?(path) }
@@ -249,7 +249,7 @@ module HyperstackVM
def pair_config_loaders
[
- ConfigLoader.load(File.join(REPO_ROOT, 'hyperstack-vm1-gptoss.toml')),
+ ConfigLoader.load(File.join(REPO_ROOT, 'hyperstack-vm1-coder.toml')),
ConfigLoader.load(File.join(REPO_ROOT, 'hyperstack-vm2.toml'))
]
end
diff --git a/lib/hyperstack/watcher.rb b/lib/hyperstack/watcher.rb
index de3d71e..1c126c5 100644
--- a/lib/hyperstack/watcher.rb
+++ b/lib/hyperstack/watcher.rb
@@ -22,11 +22,14 @@ module HyperstackVM
# Snapshot of one VM's stats at a point in time.
# service_type is :vllm or :comfyui — controls which metrics section is rendered.
+ # loading_status holds the last meaningful log line while vLLM is still initialising;
+ # it is nil once the Engine 0 stats line starts appearing.
VmSnapshot = Struct.new(
:label, :wg_host, :service_type,
:vllm_model, :container_name,
:metrics, :gpus,
:vllm_error, :gpu_error,
+ :loading_status,
:fetched_at,
keyword_init: true
)
@@ -78,7 +81,7 @@ module HyperstackVM
vllm_model: nil, container_name: nil,
metrics: nil, gpus: nil,
vllm_error: 'no state file', gpu_error: nil,
- fetched_at: Time.now)
+ loading_status: nil, fetched_at: Time.now)
end
if config.comfyui_install_enabled?
@@ -91,7 +94,7 @@ module HyperstackVM
vllm_model: nil, container_name: nil,
metrics: nil, gpus: nil,
vllm_error: e.message, gpu_error: nil,
- fetched_at: Time.now)
+ loading_status: nil, fetched_at: Time.now)
end
# Fetches GPU + vLLM container stats for a vLLM VM.
@@ -99,13 +102,13 @@ module HyperstackVM
vllm_model = state['vllm_model'] || config.vllm_model
container_name = state['vllm_container_name'] || config.vllm_container_name
- gpus, metrics, ssh_error = fetch_vm_stats(config, wg_host, container_name)
+ gpus, metrics, loading_status, ssh_error = fetch_vm_stats(config, wg_host, container_name)
VmSnapshot.new(label: label, wg_host: wg_host, service_type: :vllm,
vllm_model: vllm_model, container_name: container_name,
metrics: metrics, gpus: gpus,
vllm_error: ssh_error, gpu_error: ssh_error,
- fetched_at: Time.now)
+ loading_status: loading_status, fetched_at: Time.now)
end
# Fetches GPU + ComfyUI queue stats for a ComfyUI VM.
@@ -117,7 +120,7 @@ module HyperstackVM
vllm_model: nil, container_name: nil,
metrics: metrics, gpus: gpus,
vllm_error: ssh_error, gpu_error: ssh_error,
- fetched_at: Time.now)
+ loading_status: nil, fetched_at: Time.now)
end
def load_state(path)
@@ -167,26 +170,37 @@ module HyperstackVM
end
# Single SSH call that runs nvidia-smi and tails the vLLM container logs.
- # The two sections are separated by a sentinel line so we can split them.
- # Returns [gpus, metrics, error_or_nil].
+ # Captures the Engine 0 stats line (present once the model is running) and,
+ # when that line is absent, the last relevant loading-phase log line so the
+ # watch display can show model-download / weight-load progress.
+ # Returns [gpus, metrics, loading_status, error_or_nil].
def fetch_vm_stats(config, wg_host, container_name)
gpu_query = 'index,name,temperature.gpu,utilization.gpu,power.draw,memory.used,memory.total'
- # --tail 200 instead of --since N so we always get the last stats line
+ # Capture logs once into a shell variable to avoid two docker calls.
+ # --tail 300 instead of --since N so we always get the last stats line
# even when the VM has been idle for longer than the refresh interval.
- script = <<~BASH
+ # grep exit 1 (no match) is swallowed by the pipeline tail -1, which
+ # always succeeds, so bash -se does not abort on an empty grep result.
+ script = <<~BASH
nvidia-smi --query-gpu=#{gpu_query} --format=csv,noheader,nounits
echo ===VLLM===
- docker logs --tail 200 #{container_name} 2>&1 | grep 'Engine 0' | tail -1
+ _logs=$(docker logs --tail 300 #{container_name} 2>&1)
+ echo "$_logs" | grep 'Engine 0' | tail -1
+ echo ===LOADING===
+ echo "$_logs" | grep -E 'Starting to load|Loading model|model weight|Downloading|GPU block|Profil|shard|Initializ|quantiz|AWQ' | tail -1
BASH
ssh = build_ssh_command(config, wg_host)
stdout, stderr, status = Timeout.timeout(15) { Open3.capture3(*ssh, stdin_data: script) }
- return [nil, nil, "exit #{status.exitstatus}: #{stderr.strip}"] unless status.success?
+ return [nil, nil, nil, "exit #{status.exitstatus}: #{stderr.strip}"] unless status.success?
- gpu_section, vllm_section = stdout.split("===VLLM===\n", 2)
+ gpu_section, rest = stdout.split("===VLLM===\n", 2)
+ vllm_section, load_section = rest.to_s.split("===LOADING===\n", 2)
gpus = parse_nvidia_smi(gpu_section.to_s)
metrics = parse_engine_log_line(vllm_section.to_s.strip)
- [gpus, metrics, nil]
+ # Only surface the loading line while the engine stats aren't available yet.
+ loading_status = metrics.empty? ? clean_log_line(load_section.to_s.strip) : nil
+ [gpus, metrics, loading_status, nil]
end
# Parse a vLLM "Engine 0" log line into a plain Hash.
@@ -216,6 +230,14 @@ module HyperstackVM
m ? m[1].to_f : nil
end
+ # Strips the vLLM log prefix "(EngineCore pid=N) INFO YYYY-MM-DD HH:MM:SS [file.py:NN]"
+ # so only the human-readable message is shown in the watch display.
+ def clean_log_line(line)
+ return line if line.empty?
+
+ line.sub(/^\(.*?pid=\d+\)\s+\w+\s+[\d-]+\s+[\d:]+\s+\[[\w.]+:\d+\]\s*/, '').strip
+ end
+
# Build an SSH command array for the watcher.
# Uses accept-new rather than yes because the known-hosts file was populated
# with the VM's public IP during provisioning; the WireGuard hostname
@@ -330,8 +352,13 @@ module HyperstackVM
lines.concat(render_comfyui_metrics(snap.metrics))
elsif snap.metrics&.any?
lines.concat(render_vllm_metrics(snap.metrics))
- elsif snap.metrics && snap.metrics.empty?
- lines << " #{DIM}(no Engine log line yet — container may still be loading)#{RESET}"
+ elsif snap.metrics
+ # Engine stats not yet available — model is still loading.
+ if snap.loading_status && !snap.loading_status.empty?
+ lines << row('loading', "#{YELLOW}#{snap.loading_status}#{RESET}")
+ else
+ lines << " #{DIM}(container starting…)#{RESET}"
+ end
end
end
diff --git a/pi/agent/extensions/loop-scheduler/loop-presets.md b/pi/agent/extensions/loop-scheduler/loop-presets.md
index 61f30d7..8d141fe 100644
--- a/pi/agent/extensions/loop-scheduler/loop-presets.md
+++ b/pi/agent/extensions/loop-scheduler/loop-presets.md
@@ -7,4 +7,5 @@
# * review: 1h review the last 10 git commits
# * monitor: 10m check if there are any errors in the logs
-* tasks: 1m automatically start with the next task with fresh context if the current task completed following the agent-task-management skill.
+* tasks: 1m automatically start with the next task with fresh context if the current task completed following the agent-task-management skill.
+* proceed: 1m proceed