From 5343872a58f30fa7470011d740b404cfdd7ecdf2 Mon Sep 17 00:00:00 2001 From: Paul Buetow Date: Sun, 24 May 2026 22:56:19 +0300 Subject: fix(provisioning): recover from vLLM readiness timeout and increase poll window When create timed out during vLLM readiness polling (common for large models like Qwen3.6-27B-FP8), rerunning create would stop and restart the already-running container, restarting the whole startup sequence. Now the vLLM install script checks if the container is already running and serving the correct model before touching it. If it detects a healthy container, it skips the stop/pull/start cycle entirely. Also increases the readiness timeout from 20 min (240x5s) to 30 min (360x5s) to accommodate cold starts with model download and CUDA graph capture on large models. --- lib/hyperstack/provisioning.rb | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'lib/hyperstack') diff --git a/lib/hyperstack/provisioning.rb b/lib/hyperstack/provisioning.rb index 948cd2a..2738275 100644 --- a/lib/hyperstack/provisioning.rb +++ b/lib/hyperstack/provisioning.rb @@ -212,10 +212,22 @@ module HyperstackVM script = [] script << 'set -euo pipefail' + # If the container is already running and serving the correct model, skip + # the stop/pull/start cycle entirely — report success and exit immediately. + # This recovers gracefully from a previous create that timed out during the + # readiness poll but left the container running successfully. 
+ script << "if docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null | grep -q '^running$'; then" + script << " if curl -sf http://localhost:#{Shellwords.escape(port.to_s)}/v1/models 2>/dev/null | grep -q #{Shellwords.escape(model)}; then" + script << " echo 'vLLM container already running with #{model}; skipping restart.'" + script << ' echo vllm-install-ok' + script << ' exit 0' + script << ' fi' + script << " echo 'Container #{container} is running but not serving #{model}; restarting.'" + script << " docker stop #{Shellwords.escape(container)} 2>/dev/null || true" + script << " docker rm #{Shellwords.escape(container)} 2>/dev/null || true" + script << 'fi' script << "sudo mkdir -p #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}" script << "sudo chmod -R 0777 #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}" - script << "docker stop #{Shellwords.escape(container)} 2>/dev/null || true" - script << "docker rm #{Shellwords.escape(container)} 2>/dev/null || true" script << "docker pull #{Shellwords.escape(image)}" if pull_image script << docker_run # Stage patterns cover the full vLLM startup sequence: @@ -229,18 +241,18 @@ module HyperstackVM script << 'echo "Waiting for vLLM to become ready (live progress from container logs)..."' script << "stage_pat='#{stage_pat}'" script << "strip_pfx='#{strip_pfx}'" - script << 'for i in $(seq 1 240); do' + script << 'for i in $(seq 1 360); do' script << " if curl -sf http://localhost:#{port}/v1/models >/dev/null 2>&1; then echo vllm-ready; break; fi" script << " state=$(docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null || echo unknown)" script << " progress=$(docker logs --tail 100 #{Shellwords.escape(container)} 2>&1 | grep -E \"$stage_pat\" | tail -1 | sed -E \"$strip_pfx\" | cut -c1-100)" script << ' if [ -n "$progress" ]; then' - script << ' echo " vLLM ($i/240, $state): $progress"' + script << ' echo " vLLM 
($i/360, $state): $progress"' script << ' else' - script << ' echo " vLLM not ready yet ($i/240, container=$state)..."' + script << ' echo " vLLM not ready yet ($i/360, container=$state)..."' script << ' fi' script << ' sleep 5' script << 'done' - script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 20 minutes'; exit 1; }" + script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 30 minutes'; exit 1; }" script << 'echo vllm-install-ok' script.join("\n") end -- cgit v1.2.3