diff options
| author | Paul Buetow <paul@buetow.org> | 2026-04-06 20:47:39 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-04-06 20:47:39 +0300 |
| commit | eb800cdf31176584ee0b604f5bda65f0d2880909 (patch) | |
| tree | 0bf9ef9491137e9e5e6600f1819b1b8d048a24af /lib/hyperstack | |
| parent | 0664ffcc62b2fb240286fde463635e510a41df84 (diff) | |
provisioner: support docker_image and pre_start_cmd for Gemma 4 startup
Adds docker_image and pre_start_cmd config fields to config.rb and
provisioning.rb so the Gemma 4 31B workarounds are baked in:
- docker_image = "vllm/vllm-openai:nightly" (stable lacks Gemma 4 support)
- pre_start_cmd = "pip install -q transformers==5.5.0" (the stable image pins transformers <5)
- extra_docker_env = ["CUDA_VISIBLE_DEVICES=0"] (required with --entrypoint bash)
When pre_start_cmd is set, the provisioner switches to --entrypoint bash and
chains the patch command before launching vLLM, so create-both works end-to-end
without manual container replacement.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Diffstat (limited to 'lib/hyperstack')
| -rw-r--r-- | lib/hyperstack/config.rb | 16 | ||||
| -rw-r--r-- | lib/hyperstack/provisioning.rb | 38 |
2 files changed, 44 insertions, 10 deletions
diff --git a/lib/hyperstack/config.rb b/lib/hyperstack/config.rb index 402f45d..178429d 100644 --- a/lib/hyperstack/config.rb +++ b/lib/hyperstack/config.rb @@ -445,6 +445,19 @@ module HyperstackVM Array(fetch('vllm', 'extra_docker_env')).map(&:to_s) end + # Docker image for vLLM. Defaults to the stable release. + # Override to 'vllm/vllm-openai:nightly' for models not yet supported by stable vLLM. + def vllm_docker_image + fetch('vllm', 'docker_image') || 'vllm/vllm-openai:latest' + end + + # Shell command to run inside the container before starting vLLM (via --entrypoint bash). + # Used to patch dependencies at startup, e.g. upgrading transformers for new model architectures. + # nil means no pre-start command — vLLM is started directly (default entrypoint). + def vllm_pre_start_cmd + fetch('vllm', 'pre_start_cmd') + end + # Whether to pass --enable-prefix-caching to vLLM. Defaults to true. # Disable for hybrid Mamba models (NemotronH): prefix caching forces Mamba into "all" cache # mode which pre-allocates states for all sequences, consuming extra VRAM on startup. @@ -477,6 +490,9 @@ module HyperstackVM 'trust_remote_code' => raw.key?('trust_remote_code') ? raw['trust_remote_code'] : false, 'extra_vllm_args' => raw.key?('extra_vllm_args') ? Array(raw['extra_vllm_args']) : [], 'extra_docker_env' => raw.key?('extra_docker_env') ? Array(raw['extra_docker_env']) : [], + # docker_image / pre_start_cmd: nil means "not set in preset" — fall back to [vllm] defaults. + 'docker_image' => raw.key?('docker_image') ? raw['docker_image'] : nil, + 'pre_start_cmd' => raw.key?('pre_start_cmd') ? raw['pre_start_cmd'] : nil, # nil means "not set in preset" — fall back to the top-level [vllm] value in the script. 'enable_prefix_caching' => raw.key?('enable_prefix_caching') ? 
raw['enable_prefix_caching'] : nil } diff --git a/lib/hyperstack/provisioning.rb b/lib/hyperstack/provisioning.rb index fd1e212..4d4c2bb 100644 --- a/lib/hyperstack/provisioning.rb +++ b/lib/hyperstack/provisioning.rb @@ -160,6 +160,12 @@ module HyperstackVM @config.vllm_prefix_caching_enabled? end extra_env = cfg.key?('extra_docker_env') ? Array(cfg['extra_docker_env']) : @config.vllm_extra_docker_env + # docker_image: preset value takes priority; nil falls back to [vllm] top-level or default. + image = (cfg.key?('docker_image') ? cfg['docker_image'] : nil) || @config.vllm_docker_image + # pre_start_cmd: shell command to run inside the container before vLLM starts. + # When set, --entrypoint bash is used so the command can patch dependencies at runtime + # (e.g. upgrading transformers for Gemma 4, which requires transformers>=5.x). + pre_cmd = (cfg.key?('pre_start_cmd') ? cfg['pre_start_cmd'] : nil) || @config.vllm_pre_start_cmd port = @config.ollama_port docker_args = [ @@ -172,10 +178,11 @@ module HyperstackVM # Without this, every container restart recompiles (~30-60 s extra). "-v #{Shellwords.escape(compile_cache)}:/root/.cache/vllm" ] - # Extra Docker env vars (e.g. VLLM_ALLOW_LONG_MAX_MODEL_LEN=1) injected before the image name. + # Extra Docker env vars (e.g. CUDA_VISIBLE_DEVICES=0) injected before the image name. extra_env.each { |kv| docker_args << "-e #{Shellwords.escape(kv)}" } - docker_args += [ - 'vllm/vllm-openai:latest', + # vllm_flags holds the vLLM CLI arguments (everything passed after the image name). + # Kept separate from docker_args so pre_start_cmd can wrap them in a bash -c string. + vllm_flags = [ "--model #{Shellwords.escape(model)}", "--tensor-parallel-size #{tp_size}", "--gpu-memory-utilization #{gpu_util}", @@ -185,16 +192,27 @@ module HyperstackVM ] # Prefix caching is beneficial for most models but forces Mamba "all" cache mode on # NemotronH, which pre-allocates states for all sequences and can OOM on startup. 
- docker_args << '--enable-prefix-caching' if prefix_cache + vllm_flags << '--enable-prefix-caching' if prefix_cache # Tool calling is optional: empty/nil parser disables it. unless parser.nil? || parser.empty? - docker_args << '--enable-auto-tool-choice' - docker_args << "--tool-call-parser #{Shellwords.escape(parser)}" + vllm_flags << '--enable-auto-tool-choice' + vllm_flags << "--tool-call-parser #{Shellwords.escape(parser)}" end - docker_args << '--trust-remote-code' if trust_remote + vllm_flags << '--trust-remote-code' if trust_remote extra_args = cfg.key?('extra_vllm_args') ? Array(cfg['extra_vllm_args']) : @config.vllm_extra_args - extra_args.each { |arg| docker_args << arg } - docker_run = docker_args.join(' ') + extra_args.each { |arg| vllm_flags << arg } + + # When pre_start_cmd is set (e.g. to upgrade transformers for Gemma 4), override the + # container entrypoint to bash and chain the patch command before vLLM starts. + # CUDA_VISIBLE_DEVICES must be set via extra_docker_env when using --entrypoint bash because + # the EngineCore subprocess loses GPU visibility without it (DP adjusted local rank OOB error). 
+ docker_run = if pre_cmd + vllm_cmd = "python3 -m vllm.entrypoints.openai.api_server #{vllm_flags.join(' ')}" + entrypoint_cmd = Shellwords.escape("#{pre_cmd}; #{vllm_cmd}") + "#{docker_args.join(' ')} --entrypoint bash #{image} -c #{entrypoint_cmd}" + else + "#{docker_args.join(' ')} #{image} #{vllm_flags.join(' ')}" + end script = [] script << 'set -euo pipefail' @@ -202,7 +220,7 @@ module HyperstackVM script << "sudo chmod -R 0777 #{Shellwords.escape(cache_dir)} #{Shellwords.escape(compile_cache)}" script << "docker stop #{Shellwords.escape(container)} 2>/dev/null || true" script << "docker rm #{Shellwords.escape(container)} 2>/dev/null || true" - script << 'docker pull vllm/vllm-openai:latest' if pull_image + script << "docker pull #{Shellwords.escape(image)}" if pull_image script << docker_run # Stage patterns cover the full vLLM startup sequence: # HuggingFace download → safetensors shard loading → torch.compile → CUDA graphs → API up. |
