summaryrefslogtreecommitdiff
path: root/lib/hyperstack/provisioning.rb
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2026-04-06 20:47:39 +0300
committerPaul Buetow <paul@buetow.org>2026-04-06 20:47:39 +0300
commiteb800cdf31176584ee0b604f5bda65f0d2880909 (patch)
tree0bf9ef9491137e9e5e6600f1819b1b8d048a24af /lib/hyperstack/provisioning.rb
parent0664ffcc62b2fb240286fde463635e510a41df84 (diff)
provisioner: support docker_image and pre_start_cmd for Gemma 4 startup
Adds docker_image and pre_start_cmd config fields to config.rb and provisioning.rb so the Gemma 4 31B workarounds are baked in:
- docker_image = "vllm/vllm-openai:nightly" (stable lacks Gemma 4 support)
- pre_start_cmd = "pip install -q transformers==5.5.0" (stable pins <5)
- extra_docker_env = ["CUDA_VISIBLE_DEVICES=0"] (required with --entrypoint bash)

When pre_start_cmd is set, the provisioner switches to --entrypoint bash and chains the patch command before launching vLLM, so create-both works end-to-end without manual container replacement.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Diffstat (limited to 'lib/hyperstack/provisioning.rb')
-rw-r--r--lib/hyperstack/provisioning.rb38
1 files changed, 28 insertions, 10 deletions
diff --git a/lib/hyperstack/provisioning.rb b/lib/hyperstack/provisioning.rb
index fd1e212..4d4c2bb 100644
--- a/lib/hyperstack/provisioning.rb
+++ b/lib/hyperstack/provisioning.rb
@@ -160,6 +160,12 @@ module HyperstackVM
@config.vllm_prefix_caching_enabled?
end
extra_env = cfg.key?('extra_docker_env') ? Array(cfg['extra_docker_env']) : @config.vllm_extra_docker_env
+ # docker_image: preset value takes priority; nil falls back to [vllm] top-level or default.
+ image = (cfg.key?('docker_image') ? cfg['docker_image'] : nil) || @config.vllm_docker_image
+ # pre_start_cmd: shell command to run inside the container before vLLM starts.
+ # When set, --entrypoint bash is used so the command can patch dependencies at runtime
+ # (e.g. upgrading transformers for Gemma 4, which requires transformers>=5.x).
+ pre_cmd = (cfg.key?('pre_start_cmd') ? cfg['pre_start_cmd'] : nil) || @config.vllm_pre_start_cmd
port = @config.ollama_port
docker_args = [
@@ -172,10 +178,11 @@ module HyperstackVM
# Without this, every container restart recompiles (~30-60 s extra).
"-v #{Shellwords.escape(compile_cache)}:/root/.cache/vllm"
]
- # Extra Docker env vars (e.g. VLLM_ALLOW_LONG_MAX_MODEL_LEN=1) injected before the image name.
+ # Extra Docker env vars (e.g. CUDA_VISIBLE_DEVICES=0) injected before the image name.
extra_env.each { |kv| docker_args << "-e #{Shellwords.escape(kv)}" }
- docker_args += [
- 'vllm/vllm-openai:latest',
+ # vllm_flags holds the vLLM CLI arguments (everything passed after the image name).
+ # Kept separate from docker_args so pre_start_cmd can wrap them in a bash -c string.
+ vllm_flags = [
"--model #{Shellwords.escape(model)}",
"--tensor-parallel-size #{tp_size}",
"--gpu-memory-utilization #{gpu_util}",
@@ -185,16 +192,27 @@ module HyperstackVM
]
# Prefix caching is beneficial for most models but forces Mamba "all" cache mode on
# NemotronH, which pre-allocates states for all sequences and can OOM on startup.
- docker_args << '--enable-prefix-caching' if prefix_cache
+ vllm_flags << '--enable-prefix-caching' if prefix_cache
# Tool calling is optional: empty/nil parser disables it.
unless parser.nil? || parser.empty?
- docker_args << '--enable-auto-tool-choice'
- docker_args << "--tool-call-parser #{Shellwords.escape(parser)}"
+ vllm_flags << '--enable-auto-tool-choice'
+ vllm_flags << "--tool-call-parser #{Shellwords.escape(parser)}"
end
- docker_args << '--trust-remote-code' if trust_remote
+ vllm_flags << '--trust-remote-code' if trust_remote
extra_args = cfg.key?('extra_vllm_args') ? Array(cfg['extra_vllm_args']) : @config.vllm_extra_args
- extra_args.each { |arg| docker_args << arg }
- docker_run = docker_args.join(' ')
+ extra_args.each { |arg| vllm_flags << arg }
+
+ # When pre_start_cmd is set (e.g. to upgrade transformers for Gemma 4), override the
+ # container entrypoint to bash and chain the patch command before vLLM starts.
+ # CUDA_VISIBLE_DEVICES must be set via extra_docker_env when using --entrypoint bash because
+ # the EngineCore subprocess loses GPU visibility without it (DP adjusted local rank OOB error).
+ docker_run = if pre_cmd
+ vllm_cmd = "python3 -m vllm.entrypoints.openai.api_server #{vllm_flags.join(' ')}"
+ entrypoint_cmd = Shellwords.escape("#{pre_cmd}; #{vllm_cmd}")
+ "#{docker_args.join(' ')} --entrypoint bash #{image} -c #{entrypoint_cmd}"
+ else
+ "#{docker_args.join(' ')} #{image} #{vllm_flags.join(' ')}"
+ end
script = []
script << 'set -euo pipefail'
@@ -202,7 +220,7 @@ module HyperstackVM
script << "sudo chmod -R 0777 #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}"
script << "docker stop #{Shellwords.escape(container)} 2>/dev/null || true"
script << "docker rm #{Shellwords.escape(container)} 2>/dev/null || true"
- script << 'docker pull vllm/vllm-openai:latest' if pull_image
+ script << "docker pull #{Shellwords.escape(image)}" if pull_image
script << docker_run
# Stage patterns cover the full vLLM startup sequence:
# HuggingFace download → safetensors shard loading → torch.compile → CUDA graphs → API up.