summary | refs | log | tree | commit | diff
path: root/lib/hyperstack
diff options
context:
space:
mode:
Diffstat (limited to 'lib/hyperstack')
-rw-r--r-- lib/hyperstack/provisioning.rb 24
1 files changed, 18 insertions, 6 deletions
diff --git a/lib/hyperstack/provisioning.rb b/lib/hyperstack/provisioning.rb
index 948cd2a..2738275 100644
--- a/lib/hyperstack/provisioning.rb
+++ b/lib/hyperstack/provisioning.rb
@@ -212,10 +212,22 @@ module HyperstackVM
script = []
script << 'set -euo pipefail'
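+ # If the container is already running and already serving the requested
+ # model, skip the stop/pull/start cycle entirely and report success right
+ # away (no readiness wait is needed: /v1/models has just answered).
+ # This recovers gracefully from a previous create that timed out during the
+ # readiness poll but left the container running successfully.
+ # A running container that does not list the model is stopped and removed so
+ # the normal pull/start path below recreates it.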
+ script << "if docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null | grep -q '^running$'; then"
+ script << " if curl -sf http://localhost:#{Shellwords.escape(port.to_s)}/v1/models 2>/dev/null | grep -q #{Shellwords.escape(model)}; then"
+ script << " echo 'vLLM container already running with #{model}; skipping restart.'"
+ script << ' echo vllm-install-ok'
+ script << ' exit 0'
+ script << ' fi'
+ script << " echo 'Container #{container} is running but not serving #{model}; restarting.'"
+ script << " docker stop #{Shellwords.escape(container)} 2>/dev/null || true"
+ script << " docker rm #{Shellwords.escape(container)} 2>/dev/null || true"
+ script << 'fi'
script << "sudo mkdir -p #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}"
script << "sudo chmod -R 0777 #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}"
- script << "docker stop #{Shellwords.escape(container)} 2>/dev/null || true"
- script << "docker rm #{Shellwords.escape(container)} 2>/dev/null || true"
script << "docker pull #{Shellwords.escape(image)}" if pull_image
script << docker_run
# Stage patterns cover the full vLLM startup sequence:
@@ -229,18 +241,18 @@ module HyperstackVM
script << 'echo "Waiting for vLLM to become ready (live progress from container logs)..."'
script << "stage_pat='#{stage_pat}'"
script << "strip_pfx='#{strip_pfx}'"
- script << 'for i in $(seq 1 240); do'
+ script << 'for i in $(seq 1 360); do'
script << " if curl -sf http://localhost:#{port}/v1/models >/dev/null 2>&1; then echo vllm-ready; break; fi"
script << " state=$(docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null || echo unknown)"
script << " progress=$(docker logs --tail 100 #{Shellwords.escape(container)} 2>&1 | grep -E \"$stage_pat\" | tail -1 | sed -E \"$strip_pfx\" | cut -c1-100)"
script << ' if [ -n "$progress" ]; then'
- script << ' echo " vLLM ($i/240, $state): $progress"'
+ script << ' echo " vLLM ($i/360, $state): $progress"'
script << ' else'
- script << ' echo " vLLM not ready yet ($i/240, container=$state)..."'
+ script << ' echo " vLLM not ready yet ($i/360, container=$state)..."'
script << ' fi'
script << ' sleep 5'
script << 'done'
- script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 20 minutes'; exit 1; }"
+ script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 30 minutes'; exit 1; }"
script << 'echo vllm-install-ok'
script.join("\n")
end