fix(provisioning): recover from vLLM readiness timeout and increase poll window

When `create` timed out during vLLM readiness polling (common for large models like Qwen3.6-27B-FP8), rerunning `create` would stop and restart the already-running container, restarting the whole startup sequence from scratch. The vLLM install script now checks, before touching the container, whether it is already running and serving the correct model (its /v1/models response mentions the model name). If so, the script skips the stop/pull/start cycle entirely and reports success; a container that is running but not yet serving the model is still stopped and recreated. This change also increases the readiness timeout from 20 min (240 × 5 s) to 30 min (360 × 5 s) to accommodate cold starts that include model download and CUDA graph capture on large models.
author: Paul Buetow <paul@buetow.org> 2026-05-24 22:56:19 +0300
committer: Paul Buetow <paul@buetow.org> 2026-05-24 22:56:19 +0300
commit: 5343872a58f30fa7470011d740b404cfdd7ecdf2 (patch)
tree: b5add2fd535e44eb08bedbbb42af0d955b233e3f /lib/hyperstack
parent: d3787698a8a16b92006d4b5a9d285b170881f225 (diff)
1 files changed, 18 insertions, 6 deletions
diff --git a/lib/hyperstack/provisioning.rb b/lib/hyperstack/provisioning.rb
index 948cd2a..2738275 100644
--- a/lib/hyperstack/provisioning.rb
+++ b/lib/hyperstack/provisioning.rb
@@ -212,10 +212,22 @@ module HyperstackVM
 
       script = []
       script << 'set -euo pipefail'
+      # If the container is already running and its /v1/models response already
+      # mentions the requested model, skip the stop/pull/start cycle and report
+      # success at once. This recovers from a previous create that timed out during
+      # the readiness poll; a running container not yet serving it is recreated.
+      script << "if docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null | grep -q '^running$'; then"
+      script << "  if curl -sf http://localhost:#{Shellwords.escape(port.to_s)}/v1/models 2>/dev/null | grep -q #{Shellwords.escape(model)}; then"
+      script << "    echo 'vLLM container already running with #{model}; skipping restart.'"
+      script << '    echo vllm-install-ok'
+      script << '    exit 0'
+      script << '  fi'
+      script << "  echo 'Container #{container} is running but not serving #{model}; restarting.'"
+      script << "  docker stop #{Shellwords.escape(container)} 2>/dev/null || true"
+      script << "  docker rm #{Shellwords.escape(container)} 2>/dev/null || true"
+      script << 'fi'
       script << "sudo mkdir -p #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}"
       script << "sudo chmod -R 0777 #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}"
-      script << "docker stop #{Shellwords.escape(container)} 2>/dev/null || true"
-      script << "docker rm #{Shellwords.escape(container)} 2>/dev/null || true"
       script << "docker pull #{Shellwords.escape(image)}" if pull_image
       script << docker_run
       # Stage patterns cover the full vLLM startup sequence:
@@ -229,18 +241,18 @@ module HyperstackVM
       script << 'echo "Waiting for vLLM to become ready (live progress from container logs)..."'
       script << "stage_pat='#{stage_pat}'"
       script << "strip_pfx='#{strip_pfx}'"
-      script << 'for i in $(seq 1 240); do'
+      script << 'for i in $(seq 1 360); do'
       script << "  if curl -sf http://localhost:#{port}/v1/models >/dev/null 2>&1; then echo vllm-ready; break; fi"
       script << "  state=$(docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null || echo unknown)"
       script << "  progress=$(docker logs --tail 100 #{Shellwords.escape(container)} 2>&1 | grep -E \"$stage_pat\" | tail -1 | sed -E \"$strip_pfx\" | cut -c1-100)"
       script << '  if [ -n "$progress" ]; then'
-      script << '    echo "  vLLM ($i/240, $state): $progress"'
+      script << '    echo "  vLLM ($i/360, $state): $progress"'
       script << '  else'
-      script << '    echo "  vLLM not ready yet ($i/240, container=$state)..."'
+      script << '    echo "  vLLM not ready yet ($i/360, container=$state)..."'
       script << '  fi'
       script << '  sleep 5'
       script << 'done'
-      script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 20 minutes'; exit 1; }"
+      script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 30 minutes'; exit 1; }"
       script << 'echo vllm-install-ok'
       script.join("\n")
     end
author	Paul Buetow <paul@buetow.org>	2026-05-24 22:56:19 +0300
committer	Paul Buetow <paul@buetow.org>	2026-05-24 22:56:19 +0300
commit	5343872a58f30fa7470011d740b404cfdd7ecdf2 (patch)
tree	b5add2fd535e44eb08bedbbb42af0d955b233e3f /lib/hyperstack
parent	d3787698a8a16b92006d4b5a9d285b170881f225 (diff)