summary | refs | log | tree | commit | diff
path: root/lib/hyperstack
diff options
context:
space:
mode:
author: Paul Buetow <paul@buetow.org> 2026-05-24 22:56:19 +0300
committer: Paul Buetow <paul@buetow.org> 2026-05-24 22:56:19 +0300
commit: 5343872a58f30fa7470011d740b404cfdd7ecdf2 (patch)
tree: b5add2fd535e44eb08bedbbb42af0d955b233e3f /lib/hyperstack
parent: d3787698a8a16b92006d4b5a9d285b170881f225 (diff)
fix(provisioning): recover from vLLM readiness timeout and increase poll window
When `create` timed out during vLLM readiness polling (common for large models like Qwen3.6-27B-FP8), rerunning `create` would stop and restart the already-running container, restarting the whole startup sequence. Now the vLLM install script checks whether the container is already running and serving the correct model before touching it. If it detects a healthy container, it skips the stop/pull/start cycle entirely and reports success immediately. A container that is running but not yet serving the model on /v1/models is still stopped, removed, and started again, so the rerun only avoids a restart once the container has finished coming up. Also increases the readiness timeout from 20 min (240 x 5 s) to 30 min (360 x 5 s) to accommodate cold starts with model download and CUDA graph capture on large models.
Diffstat (limited to 'lib/hyperstack')
-rw-r--r-- lib/hyperstack/provisioning.rb 24
1 files changed, 18 insertions, 6 deletions
diff --git a/lib/hyperstack/provisioning.rb b/lib/hyperstack/provisioning.rb
index 948cd2a..2738275 100644
--- a/lib/hyperstack/provisioning.rb
+++ b/lib/hyperstack/provisioning.rb
@@ -212,10 +212,22 @@ module HyperstackVM
script = []
script << 'set -euo pipefail'
+ # If the container is already running and already serving the correct model,
+ # skip the stop/pull/start cycle and report success immediately (no polling).
+ # This recovers from a previous create that timed out during the readiness
+ # poll if the container has since come up; otherwise it is stopped and removed.
+ script << "if docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null | grep -q '^running$'; then"
+ script << " if curl -sf http://localhost:#{Shellwords.escape(port.to_s)}/v1/models 2>/dev/null | grep -q #{Shellwords.escape(model)}; then"
+ script << " echo 'vLLM container already running with #{model}; skipping restart.'"
+ script << ' echo vllm-install-ok'
+ script << ' exit 0'
+ script << ' fi'
+ script << " echo 'Container #{container} is running but not serving #{model}; restarting.'"
+ script << " docker stop #{Shellwords.escape(container)} 2>/dev/null || true"
+ script << " docker rm #{Shellwords.escape(container)} 2>/dev/null || true"
+ script << 'fi'
script << "sudo mkdir -p #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}"
script << "sudo chmod -R 0777 #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}"
- script << "docker stop #{Shellwords.escape(container)} 2>/dev/null || true"
- script << "docker rm #{Shellwords.escape(container)} 2>/dev/null || true"
script << "docker pull #{Shellwords.escape(image)}" if pull_image
script << docker_run
# Stage patterns cover the full vLLM startup sequence:
@@ -229,18 +241,18 @@ module HyperstackVM
script << 'echo "Waiting for vLLM to become ready (live progress from container logs)..."'
script << "stage_pat='#{stage_pat}'"
script << "strip_pfx='#{strip_pfx}'"
- script << 'for i in $(seq 1 240); do'
+ script << 'for i in $(seq 1 360); do'
script << " if curl -sf http://localhost:#{port}/v1/models >/dev/null 2>&1; then echo vllm-ready; break; fi"
script << " state=$(docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null || echo unknown)"
script << " progress=$(docker logs --tail 100 #{Shellwords.escape(container)} 2>&1 | grep -E \"$stage_pat\" | tail -1 | sed -E \"$strip_pfx\" | cut -c1-100)"
script << ' if [ -n "$progress" ]; then'
- script << ' echo " vLLM ($i/240, $state): $progress"'
+ script << ' echo " vLLM ($i/360, $state): $progress"'
script << ' else'
- script << ' echo " vLLM not ready yet ($i/240, container=$state)..."'
+ script << ' echo " vLLM not ready yet ($i/360, container=$state)..."'
script << ' fi'
script << ' sleep 5'
script << 'done'
- script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 20 minutes'; exit 1; }"
+ script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 30 minutes'; exit 1; }"
script << 'echo vllm-install-ok'
script.join("\n")
end