vllm: skip docker pull on model switch, persist torch compile cache

- model switch now passes pull_image: false to avoid surprise multi-GB image downloads when the upstream vLLM image was updated upstream; docker pull is still run on initial install (pull_image: true default) - mount /ephemeral/vllm_cache → /root/.cache/vllm so torch.compile artifacts survive container restarts; saves ~30-60 s on warm switches - add vllm_compile_cache_dir helper (sibling of hug_cache_dir) Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
author: Paul Buetow <paul@buetow.org> 2026-03-18 19:08:17 +0200
committer: Paul Buetow <paul@buetow.org> 2026-03-18 19:08:17 +0200
commit: 1c906b2378c49d28b47889e0db659cb6d9cf5395 (patch)
tree: c6ed75b7f2566c0fdca7b012a8a26667dbd70f8e /snippets/hyperstack/hyperstack.rb
parent: 3b01d5cb2c8932207127e7dd72848cea91c6347d (diff)
1 files changed, 26 insertions, 9 deletions
diff --git a/snippets/hyperstack/hyperstack.rb b/snippets/hyperstack/hyperstack.rb
index dd83225..69bb6f6 100755
--- a/snippets/hyperstack/hyperstack.rb
+++ b/snippets/hyperstack/hyperstack.rb
@@ -380,6 +380,12 @@ module HyperstackVM
       fetch('vllm', 'hug_cache_dir')
     end
 
+    # Derived from hug_cache_dir: sibling directory for torch.compile artifacts.
+    # Persisted across container restarts so recompilation is skipped on warm switches.
+    def vllm_compile_cache_dir
+      File.join(File.dirname(fetch('vllm', 'hug_cache_dir')), 'vllm_cache')
+    end
+
     def vllm_container_name
       fetch('vllm', 'container_name')
     end
@@ -921,7 +927,9 @@ module HyperstackVM
       end
 
       info "Starting vLLM with preset '#{preset_name}' (#{preset['model']})..."
-      output, status = run_ssh_command_streaming(host, vllm_install_script(preset_config: preset))
+      # Skip docker pull: image is already present; pulling on every switch risks a
+      # surprise multi-GB download if the upstream image was updated.
+      output, status = run_ssh_command_streaming(host, vllm_install_script(preset_config: preset, pull_image: false))
       raise Error, "vLLM install failed: #{output.strip}" unless status.success?
 
       # Hot-reload LiteLLM: rewrite config for the new model and restart the service.
@@ -1632,14 +1640,17 @@ module HyperstackVM
       raise Error, "LiteLLM install failed: #{output.strip}" unless status.success?
     end
 
-    # Generates the remote shell script that pulls the vLLM Docker image, starts
-    # the container, and polls until the model is fully loaded (up to 10 minutes
+    # Generates the remote shell script that (optionally) pulls the vLLM Docker image,
+    # starts the container, and polls until the model is fully loaded (up to 10 minutes
     # to cover the first-run ~45 GB model download).
     # preset_config overrides individual fields; unset fields fall back to [vllm] defaults.
-    def vllm_install_script(preset_config: nil)
+    # pull_image: true on initial install; false on model switch (image already present,
+    # and pulling every switch can trigger a multi-GB download if the image was updated).
+    def vllm_install_script(preset_config: nil, pull_image: true)
       cfg              = preset_config || {}
       model            = cfg['model'] || @config.vllm_model
-      cache_dir        = @config.vllm_hug_cache_dir # always use main config for shared cache
+      cache_dir        = @config.vllm_hug_cache_dir     # always use main config for shared HF cache
+      compile_cache    = @config.vllm_compile_cache_dir # persisted torch.compile artifacts
       container        = cfg['container_name']         || @config.vllm_container_name
       max_len          = Integer(cfg['max_model_len']  || @config.vllm_max_model_len)
       gpu_util         = Float(cfg['gpu_memory_utilization'] || @config.vllm_gpu_memory_utilization)
@@ -1657,6 +1668,9 @@ module HyperstackVM
         "--name #{Shellwords.escape(container)}",
         '--restart always',
         "-v #{Shellwords.escape(cache_dir)}:/root/.cache/huggingface",
+        # Mount torch.compile cache so CUDA kernel compilation is skipped on warm restarts.
+        # Without this, every container restart recompiles (~30-60 s extra).
+        "-v #{Shellwords.escape(compile_cache)}:/root/.cache/vllm",
         'vllm/vllm-openai:latest',
         "--model #{Shellwords.escape(model)}",
         "--tensor-parallel-size #{tp_size}",
@@ -1679,16 +1693,19 @@ module HyperstackVM
 
       script = []
       script << 'set -euo pipefail'
-      script << "sudo mkdir -p #{Shellwords.escape(cache_dir)}"
-      script << "sudo chmod -R 0777 #{Shellwords.escape(cache_dir)}"
+      script << "sudo mkdir -p #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}"
+      script << "sudo chmod -R 0777 #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}"
       # Stop and remove any existing container so re-runs are idempotent.
       script << "docker stop #{Shellwords.escape(container)} 2>/dev/null || true"
       script << "docker rm #{Shellwords.escape(container)} 2>/dev/null || true"
-      script << 'docker pull vllm/vllm-openai:latest'
+      # Pull the image only when explicitly requested (initial install / forced update).
+      # Skipping pull on model switches avoids surprise multi-GB downloads when the image
+      # has been updated upstream, which would otherwise add several minutes to a switch.
+      script << 'docker pull vllm/vllm-openai:latest' if pull_image
       script << docker_run
       # Poll until the model is loaded:
       #   first run:    ~45 GB download (~2.5 min) + model load (~65 s) + CUDA graphs (~35 s) ≈ 4-5 min
-      #   warm restart: model load + CUDA graphs ≈ 100 s
+      #   warm restart: model load (~65 s) + CUDA graphs (~35 s, skipped if compile cache warm) ≈ 65-100 s
       # Timeout: 120 × 5 s = 10 minutes
       script << 'echo "Waiting for vLLM to become ready (up to 10 min for first model download)..."'
       script << 'for i in $(seq 1 120); do'
author	Paul Buetow <paul@buetow.org>	2026-03-18 19:08:17 +0200
committer	Paul Buetow <paul@buetow.org>	2026-03-18 19:08:17 +0200
commit	1c906b2378c49d28b47889e0db659cb6d9cf5395 (patch)
tree	c6ed75b7f2566c0fdca7b012a8a26667dbd70f8e /snippets/hyperstack/hyperstack.rb
parent	3b01d5cb2c8932207127e7dd72848cea91c6347d (diff)