From 5343872a58f30fa7470011d740b404cfdd7ecdf2 Mon Sep 17 00:00:00 2001
From: Paul Buetow <paul@buetow.org>
Date: Sun, 24 May 2026 22:56:19 +0300
Subject: fix(provisioning): recover from vLLM readiness timeout and increase
 poll window

When create timed out during vLLM readiness polling (common for large
models like Qwen3.6-27B-FP8), rerunning create would stop and restart
the already-running container, restarting the whole startup sequence.

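Now the vLLM install script checks whether the container is already
running and serving the correct model before touching it. If it detects
a healthy container, it skips the stop/pull/start cycle entirely and
exits successfully. A container that is running but not serving the
model is still stopped, removed, and recreated.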

Also increases the readiness timeout from 20 min (240x5s) to 30 min
(360x5s) to accommodate cold starts with model download and CUDA graph
capture on large models.
---
 lib/hyperstack/provisioning.rb | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

(limited to 'lib/hyperstack')

diff --git a/lib/hyperstack/provisioning.rb b/lib/hyperstack/provisioning.rb
index 948cd2a..2738275 100644
--- a/lib/hyperstack/provisioning.rb
+++ b/lib/hyperstack/provisioning.rb
@@ -212,10 +212,22 @@ module HyperstackVM
 
       script = []
       script << 'set -euo pipefail'
+      # If the container is already running and already serving the correct
+      # model, skip the stop/pull/start cycle and the readiness poll: report
+      # success and exit. This recovers from a previous create that timed out
+      # during the readiness poll but whose container has since become ready.
+      script << "if docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null | grep -q '^running$'; then"
+      script << "  if curl -sf http://localhost:#{Shellwords.escape(port.to_s)}/v1/models 2>/dev/null | grep -q #{Shellwords.escape(model)}; then"
+      script << "    echo 'vLLM container already running with #{model}; skipping restart.'"
+      script << '    echo vllm-install-ok'
+      script << '    exit 0'
+      script << '  fi'
+      script << "  echo 'Container #{container} is running but not serving #{model}; restarting.'"
+      script << "  docker stop #{Shellwords.escape(container)} 2>/dev/null || true"
+      script << "  docker rm #{Shellwords.escape(container)} 2>/dev/null || true"
+      script << 'fi'
       script << "sudo mkdir -p #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}"
       script << "sudo chmod -R 0777 #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}"
-      script << "docker stop #{Shellwords.escape(container)} 2>/dev/null || true"
-      script << "docker rm #{Shellwords.escape(container)} 2>/dev/null || true"
       script << "docker pull #{Shellwords.escape(image)}" if pull_image
       script << docker_run
       # Stage patterns cover the full vLLM startup sequence:
@@ -229,18 +241,18 @@ module HyperstackVM
       script << 'echo "Waiting for vLLM to become ready (live progress from container logs)..."'
       script << "stage_pat='#{stage_pat}'"
       script << "strip_pfx='#{strip_pfx}'"
-      script << 'for i in $(seq 1 240); do'
+      script << 'for i in $(seq 1 360); do'
       script << "  if curl -sf http://localhost:#{port}/v1/models >/dev/null 2>&1; then echo vllm-ready; break; fi"
       script << "  state=$(docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null || echo unknown)"
       script << "  progress=$(docker logs --tail 100 #{Shellwords.escape(container)} 2>&1 | grep -E \"$stage_pat\" | tail -1 | sed -E \"$strip_pfx\" | cut -c1-100)"
       script << '  if [ -n "$progress" ]; then'
-      script << '    echo "  vLLM ($i/240, $state): $progress"'
+      script << '    echo "  vLLM ($i/360, $state): $progress"'
       script << '  else'
-      script << '    echo "  vLLM not ready yet ($i/240, container=$state)..."'
+      script << '    echo "  vLLM not ready yet ($i/360, container=$state)..."'
       script << '  fi'
       script << '  sleep 5'
       script << 'done'
-      script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 20 minutes'; exit 1; }"
+      script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 30 minutes'; exit 1; }"
       script << 'echo vllm-install-ok'
       script.join("\n")
     end
-- 
cgit v1.2.3