diff options
| author | Paul Buetow <paul@buetow.org> | 2026-05-24 22:56:19 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-05-24 22:56:19 +0300 |
| commit | 5343872a58f30fa7470011d740b404cfdd7ecdf2 (patch) | |
| tree | b5add2fd535e44eb08bedbbb42af0d955b233e3f /lib/hyperstack | |
| parent | d3787698a8a16b92006d4b5a9d285b170881f225 (diff) | |
fix(provisioning): recover from vLLM readiness timeout and increase poll window
When `create` timed out during vLLM readiness polling (common for large
models like Qwen3.6-27B-FP8), rerunning `create` would stop and restart
the already-running container, discarding its progress and repeating the
whole startup sequence from the beginning.
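The vLLM install script now checks whether the container is already running
and serving the correct model before touching it. If it detects such a
healthy container, it skips the stop/pull/start cycle entirely and exits
successfully. A container that is running but not serving the expected model
is still stopped, removed, and started again.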
Also increases the readiness timeout from 20 min (240x5s) to 30 min
(360x5s) to accommodate cold starts with model download and CUDA graph
capture on large models.
Diffstat (limited to 'lib/hyperstack')
| -rw-r--r-- | lib/hyperstack/provisioning.rb | 24 |
1 file changed, 18 insertions, 6 deletions
diff --git a/lib/hyperstack/provisioning.rb b/lib/hyperstack/provisioning.rb index 948cd2a..2738275 100644 --- a/lib/hyperstack/provisioning.rb +++ b/lib/hyperstack/provisioning.rb @@ -212,10 +212,22 @@ module HyperstackVM script = [] script << 'set -euo pipefail' + # If the container is already running and serving the correct model, skip + # the stop/pull/start cycle entirely — just wait for it to become ready. + # This recovers gracefully from a previous create that timed out during the + # readiness poll but left the container running successfully. + script << "if docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null | grep -q '^running$'; then" + script << " if curl -sf http://localhost:#{Shellwords.escape(port.to_s)}/v1/models 2>/dev/null | grep -q #{Shellwords.escape(model)}; then" + script << " echo 'vLLM container already running with #{model}; skipping restart.'" + script << ' echo vllm-install-ok' + script << ' exit 0' + script << ' fi' + script << " echo 'Container #{container} is running but not serving #{model}; restarting.'" + script << " docker stop #{Shellwords.escape(container)} 2>/dev/null || true" + script << " docker rm #{Shellwords.escape(container)} 2>/dev/null || true" + script << 'fi' script << "sudo mkdir -p #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}" script << "sudo chmod -R 0777 #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}" - script << "docker stop #{Shellwords.escape(container)} 2>/dev/null || true" - script << "docker rm #{Shellwords.escape(container)} 2>/dev/null || true" script << "docker pull #{Shellwords.escape(image)}" if pull_image script << docker_run # Stage patterns cover the full vLLM startup sequence: @@ -229,18 +241,18 @@ module HyperstackVM script << 'echo "Waiting for vLLM to become ready (live progress from container logs)..."' script << "stage_pat='#{stage_pat}'" script << "strip_pfx='#{strip_pfx}'" - script << 'for i in $(seq 1 240); do' + 
script << 'for i in $(seq 1 360); do' script << " if curl -sf http://localhost:#{port}/v1/models >/dev/null 2>&1; then echo vllm-ready; break; fi" script << " state=$(docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null || echo unknown)" script << " progress=$(docker logs --tail 100 #{Shellwords.escape(container)} 2>&1 | grep -E \"$stage_pat\" | tail -1 | sed -E \"$strip_pfx\" | cut -c1-100)" script << ' if [ -n "$progress" ]; then' - script << ' echo " vLLM ($i/240, $state): $progress"' + script << ' echo " vLLM ($i/360, $state): $progress"' script << ' else' - script << ' echo " vLLM not ready yet ($i/240, container=$state)..."' + script << ' echo " vLLM not ready yet ($i/360, container=$state)..."' script << ' fi' script << ' sleep 5' script << 'done' - script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 20 minutes'; exit 1; }" + script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 30 minutes'; exit 1; }" script << 'echo vllm-install-ok' script.join("\n") end |
