diff options
Diffstat (limited to 'snippets/hyperstack')
| -rw-r--r-- | snippets/hyperstack/hyperstack-vm.toml | 15 | ||||
| -rw-r--r-- | snippets/hyperstack/hyperstack.rb | 41 |
2 files changed, 32 insertions, 24 deletions
diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml index c19c8d5..9ed3abe 100644 --- a/snippets/hyperstack/hyperstack-vm.toml +++ b/snippets/hyperstack/hyperstack-vm.toml @@ -89,14 +89,15 @@ gpu_memory_utilization = 0.92 tensor_parallel_size = 1 tool_call_parser = "qwen3_coder" -# Nemotron-Super 49B AWQ — deep reasoning / extended code analysis. -# ~25 GB weights + KV cache fits comfortably on A100 80GB. -# Verify the exact HuggingFace AWQ model ID before first use: -# curl -s http://192.168.3.1:11434/v1/models | python3 -m json.tool +# Llama-3.3-70B-Instruct AWQ 4-bit — deep reasoning / extended code analysis. +# ~35 GB weights on A100 80GB; 32K context window fits within KV budget. +# Replaces nemotron-super: the NAS model (cyankiwi AWQ) has num_key_value_heads=null +# in its config.json (by design for the heterogeneous architecture), which is +# incompatible with vLLM's pydantic ModelConfig validation (requires int). [vllm.presets.nemotron-super] -model = "solidrust/Llama-3.3-Nemotron-Super-49B-v1-AWQ" -container_name = "vllm_nemotron" -max_model_len = 131072 +model = "casperhansen/llama-3.3-70b-instruct-awq" +container_name = "vllm_llama70b" +max_model_len = 32768 gpu_memory_utilization = 0.92 tensor_parallel_size = 1 tool_call_parser = "llama3_json" diff --git a/snippets/hyperstack/hyperstack.rb b/snippets/hyperstack/hyperstack.rb index 139129e..639c3e1 100644 --- a/snippets/hyperstack/hyperstack.rb +++ b/snippets/hyperstack/hyperstack.rb @@ -371,7 +371,9 @@ module HyperstackVM 'max_model_len' => Integer(raw['max_model_len'] || vllm_max_model_len), 'gpu_memory_utilization' => Float(raw['gpu_memory_utilization'] || vllm_gpu_memory_utilization), 'tensor_parallel_size' => Integer(raw['tensor_parallel_size'] || vllm_tensor_parallel_size), - 'tool_call_parser' => raw['tool_call_parser'] || vllm_tool_call_parser + 'tool_call_parser' => raw['tool_call_parser'] || vllm_tool_call_parser, + # trust_remote_code: required by some models (e.g. Nemotron) for custom architectures. + 'trust_remote_code' => raw.key?('trust_remote_code') ? raw['trust_remote_code'] : false } end @@ -877,7 +879,8 @@ module HyperstackVM info "DRY RUN: model switch to preset '#{preset_name}'" info " #{current_model || 'none'} → #{preset['model']}" info " container: #{old_container} → #{new_container}" - info " max_model_len: #{preset['max_model_len']}, tool_call_parser: #{preset['tool_call_parser']}" + trust_note = preset['trust_remote_code'] ? ', trust_remote_code: true' : '' + info " max_model_len: #{preset['max_model_len']}, tool_call_parser: #{preset['tool_call_parser']}#{trust_note}" return end @@ -1611,17 +1614,18 @@ module HyperstackVM # to cover the first-run ~45 GB model download). # preset_config overrides individual fields; unset fields fall back to [vllm] defaults. def vllm_install_script(preset_config: nil) - cfg = preset_config || {} - model = cfg['model'] || @config.vllm_model - cache_dir = @config.vllm_hug_cache_dir # always use main config for shared cache - container = cfg['container_name'] || @config.vllm_container_name - max_len = Integer(cfg['max_model_len'] || @config.vllm_max_model_len) - gpu_util = Float(cfg['gpu_memory_utilization'] || @config.vllm_gpu_memory_utilization) - tp_size = Integer(cfg['tensor_parallel_size'] || @config.vllm_tensor_parallel_size) - parser = cfg['tool_call_parser'] || @config.vllm_tool_call_parser - port = @config.ollama_port # vLLM reuses the Ollama port for firewall compat - - docker_run = [ + cfg = preset_config || {} + model = cfg['model'] || @config.vllm_model + cache_dir = @config.vllm_hug_cache_dir # always use main config for shared cache + container = cfg['container_name'] || @config.vllm_container_name + max_len = Integer(cfg['max_model_len'] || @config.vllm_max_model_len) + gpu_util = Float(cfg['gpu_memory_utilization'] || @config.vllm_gpu_memory_utilization) + tp_size = Integer(cfg['tensor_parallel_size'] || @config.vllm_tensor_parallel_size) + parser = cfg['tool_call_parser'] || @config.vllm_tool_call_parser + trust_remote = cfg.key?('trust_remote_code') ? cfg['trust_remote_code'] : false + port = @config.ollama_port # vLLM reuses the Ollama port for firewall compat + + docker_args = [ 'docker run -d', '--gpus all', '--ipc=host', '--network host', "--name #{Shellwords.escape(container)}", @@ -1637,7 +1641,9 @@ module HyperstackVM "--max-model-len #{max_len}", '--host 0.0.0.0', "--port #{port}" - ].join(' ') + ] + docker_args << '--trust-remote-code' if trust_remote + docker_run = docker_args.join(' ') script = [] script << 'set -euo pipefail' @@ -1775,8 +1781,7 @@ module HyperstackVM # Tests the vLLM OpenAI-compatible API: lists loaded models and runs a # short inference request to confirm the model accepts requests. def test_vllm(wg_ip) - port = @config.ollama_port - model = @config.vllm_model + port = @config.ollama_port info " Testing vLLM models list at http://#{wg_ip}:#{port}/v1/models..." uri = URI("http://#{wg_ip}:#{port}/v1/models") @@ -1784,8 +1789,10 @@ module HyperstackVM raise Error, "vLLM /v1/models returned HTTP #{resp.code}" unless resp.code == '200' models = JSON.parse(resp.body).fetch('data', []).map { |m| m['id'] } - raise Error, "vLLM returned an empty model list (expected #{model})" if models.empty? + raise Error, 'vLLM returned an empty model list' if models.empty? + # Use the currently loaded model (may differ from config default after a switch). + model = models.first info " Models loaded: #{models.join(', ')}" info " Testing vLLM inference..." reply = vllm_chat(wg_ip, port, model, 'Say hello in five words.') |
