provisioner: support docker_image and pre_start_cmd for Gemma 4 startup

Adds docker_image and pre_start_cmd config fields to config.rb and provisioning.rb so the Gemma 4 31B workarounds are baked in:

- docker_image = "vllm/vllm-openai:nightly" (stable lacks Gemma 4 support)
- pre_start_cmd = "pip install -q transformers==5.5.0" (stable pins transformers<5)
- extra_docker_env = ["CUDA_VISIBLE_DEVICES=0"] (required with --entrypoint bash)

When pre_start_cmd is set, the provisioner switches to --entrypoint bash and chains the patch command before launching vLLM, so create-both works end-to-end without manual container replacement.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
author: Paul Buetow <paul@buetow.org> 2026-04-06 20:47:39 +0300
committer: Paul Buetow <paul@buetow.org> 2026-04-06 20:47:39 +0300
commit: eb800cdf31176584ee0b604f5bda65f0d2880909 (patch)
tree: 0bf9ef9491137e9e5e6600f1819b1b8d048a24af
parent: 0664ffcc62b2fb240286fde463635e510a41df84 (diff)
3 files changed, 48 insertions, 11 deletions
diff --git a/hyperstack-vm2.toml b/hyperstack-vm2.toml
index bed09a1..aeb796f 100644
--- a/hyperstack-vm2.toml
+++ b/hyperstack-vm2.toml
@@ -61,7 +61,7 @@ pull_models = ["qwen3-coder-next"]
 # VM2 defaults to Gemma 4 31B; use 'model switch' to load any other preset.
 # NOTE: Gemma 4 requires transformers>=5.0 but vLLM stable pins transformers<5.
 # Workaround: use the vLLM nightly image and force-install transformers 5.5.0 at startup.
-# Remove the docker_image and pre_start_cmd overrides once vLLM stable adds Gemma 4 support.
+# Remove docker_image and pre_start_cmd once vLLM stable gains Gemma 4 support.
 [vllm]
 install = true
 model = "cyankiwi/gemma-4-31B-it-AWQ-4bit"
@@ -78,6 +78,9 @@ tool_call_parser = "gemma4"
 docker_image = "vllm/vllm-openai:nightly"
 # Upgrade transformers to 5.x (Gemma 4 arch added there) before starting vLLM.
 pre_start_cmd = "pip install -q transformers==5.5.0 2>/dev/null"
+# CUDA_VISIBLE_DEVICES=0 is required when using --entrypoint bash (which pre_start_cmd triggers):
+# the EngineCore subprocess loses GPU visibility without it, causing a rank OOB error on startup.
+extra_docker_env = ["CUDA_VISIBLE_DEVICES=0"]
 
 # Named model presets for 'ruby hyperstack.rb --config hyperstack-vm2.toml model switch <name>'.
 # Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.
diff --git a/lib/hyperstack/config.rb b/lib/hyperstack/config.rb
index 402f45d..178429d 100644
--- a/lib/hyperstack/config.rb
+++ b/lib/hyperstack/config.rb
@@ -445,6 +445,19 @@ module HyperstackVM
       Array(fetch('vllm', 'extra_docker_env')).map(&:to_s)
     end
 
+    # Docker image for vLLM. Defaults to the stable release.
+    # Override to 'vllm/vllm-openai:nightly' for models not yet supported by stable vLLM.
+    def vllm_docker_image
+      fetch('vllm', 'docker_image') || 'vllm/vllm-openai:latest'
+    end
+
+    # Shell command to run inside the container before starting vLLM (via --entrypoint bash).
+    # Used to patch dependencies at startup, e.g. upgrading transformers for new model architectures.
+    # nil means no pre-start command — vLLM is started directly (default entrypoint).
+    def vllm_pre_start_cmd
+      fetch('vllm', 'pre_start_cmd')
+    end
+
     # Whether to pass --enable-prefix-caching to vLLM. Defaults to true.
     # Disable for hybrid Mamba models (NemotronH): prefix caching forces Mamba into "all" cache
     # mode which pre-allocates states for all sequences, consuming extra VRAM on startup.
@@ -477,6 +490,9 @@ module HyperstackVM
         'trust_remote_code' => raw.key?('trust_remote_code') ? raw['trust_remote_code'] : false,
         'extra_vllm_args' => raw.key?('extra_vllm_args') ? Array(raw['extra_vllm_args']) : [],
         'extra_docker_env' => raw.key?('extra_docker_env') ? Array(raw['extra_docker_env']) : [],
+        # docker_image / pre_start_cmd: nil means "not set in preset" — fall back to [vllm] defaults.
+        'docker_image' => raw.key?('docker_image') ? raw['docker_image'] : nil,
+        'pre_start_cmd' => raw.key?('pre_start_cmd') ? raw['pre_start_cmd'] : nil,
         # nil means "not set in preset" — fall back to the top-level [vllm] value in the script.
         'enable_prefix_caching' => raw.key?('enable_prefix_caching') ? raw['enable_prefix_caching'] : nil
       }
diff --git a/lib/hyperstack/provisioning.rb b/lib/hyperstack/provisioning.rb
index fd1e212..4d4c2bb 100644
--- a/lib/hyperstack/provisioning.rb
+++ b/lib/hyperstack/provisioning.rb
@@ -160,6 +160,12 @@ module HyperstackVM
                        @config.vllm_prefix_caching_enabled?
                      end
       extra_env = cfg.key?('extra_docker_env') ? Array(cfg['extra_docker_env']) : @config.vllm_extra_docker_env
+      # docker_image: preset value takes priority; nil falls back to [vllm] top-level or default.
+      image = (cfg.key?('docker_image') ? cfg['docker_image'] : nil) || @config.vllm_docker_image
+      # pre_start_cmd: shell command to run inside the container before vLLM starts.
+      # When set, --entrypoint bash is used so the command can patch dependencies at runtime
+      # (e.g. upgrading transformers for Gemma 4, which requires transformers>=5.x).
+      pre_cmd = (cfg.key?('pre_start_cmd') ? cfg['pre_start_cmd'] : nil) || @config.vllm_pre_start_cmd
       port = @config.ollama_port
 
       docker_args = [
@@ -172,10 +178,11 @@ module HyperstackVM
         # Without this, every container restart recompiles (~30-60 s extra).
         "-v #{Shellwords.escape(compile_cache)}:/root/.cache/vllm"
       ]
-      # Extra Docker env vars (e.g. VLLM_ALLOW_LONG_MAX_MODEL_LEN=1) injected before the image name.
+      # Extra Docker env vars (e.g. CUDA_VISIBLE_DEVICES=0) injected before the image name.
       extra_env.each { |kv| docker_args << "-e #{Shellwords.escape(kv)}" }
-      docker_args += [
-        'vllm/vllm-openai:latest',
+      # vllm_flags holds the vLLM CLI arguments (everything passed after the image name).
+      # Kept separate from docker_args so pre_start_cmd can wrap them in a bash -c string.
+      vllm_flags = [
         "--model #{Shellwords.escape(model)}",
         "--tensor-parallel-size #{tp_size}",
         "--gpu-memory-utilization #{gpu_util}",
@@ -185,16 +192,27 @@ module HyperstackVM
       ]
       # Prefix caching is beneficial for most models but forces Mamba "all" cache mode on
       # NemotronH, which pre-allocates states for all sequences and can OOM on startup.
-      docker_args << '--enable-prefix-caching' if prefix_cache
+      vllm_flags << '--enable-prefix-caching' if prefix_cache
       # Tool calling is optional: empty/nil parser disables it.
       unless parser.nil? || parser.empty?
-        docker_args << '--enable-auto-tool-choice'
-        docker_args << "--tool-call-parser #{Shellwords.escape(parser)}"
+        vllm_flags << '--enable-auto-tool-choice'
+        vllm_flags << "--tool-call-parser #{Shellwords.escape(parser)}"
       end
-      docker_args << '--trust-remote-code' if trust_remote
+      vllm_flags << '--trust-remote-code' if trust_remote
       extra_args = cfg.key?('extra_vllm_args') ? Array(cfg['extra_vllm_args']) : @config.vllm_extra_args
-      extra_args.each { |arg| docker_args << arg }
-      docker_run = docker_args.join(' ')
+      extra_args.each { |arg| vllm_flags << arg }
+
+      # When pre_start_cmd is set (e.g. to upgrade transformers for Gemma 4), override the
+      # container entrypoint to bash and chain the patch command before vLLM starts.
+      # CUDA_VISIBLE_DEVICES must be set via extra_docker_env when using --entrypoint bash because
+      # the EngineCore subprocess loses GPU visibility without it (DP adjusted local rank OOB error).
+      docker_run = if pre_cmd
+                     vllm_cmd = "python3 -m vllm.entrypoints.openai.api_server #{vllm_flags.join(' ')}"
+                     entrypoint_cmd = Shellwords.escape("#{pre_cmd}; #{vllm_cmd}")
+                     "#{docker_args.join(' ')} --entrypoint bash #{image} -c #{entrypoint_cmd}"
+                   else
+                     "#{docker_args.join(' ')} #{image} #{vllm_flags.join(' ')}"
+                   end
 
       script = []
       script << 'set -euo pipefail'
@@ -202,7 +220,7 @@ module HyperstackVM
       script << "sudo chmod -R 0777 #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}"
       script << "docker stop #{Shellwords.escape(container)} 2>/dev/null || true"
       script << "docker rm #{Shellwords.escape(container)} 2>/dev/null || true"
-      script << 'docker pull vllm/vllm-openai:latest' if pull_image
+      script << "docker pull #{Shellwords.escape(image)}" if pull_image
       script << docker_run
       # Stage patterns cover the full vLLM startup sequence:
       #   HuggingFace download → safetensors shard loading → torch.compile → CUDA graphs → API up.
author	Paul Buetow <paul@buetow.org>	2026-04-06 20:47:39 +0300
committer	Paul Buetow <paul@buetow.org>	2026-04-06 20:47:39 +0300
commit	eb800cdf31176584ee0b604f5bda65f0d2880909 (patch)
tree	0bf9ef9491137e9e5e6600f1819b1b8d048a24af
parent	0664ffcc62b2fb240286fde463635e510a41df84 (diff)