Add extra_vllm_args support; fix nemotron-super to real 120B; add deepseek-r1-32b, qwen3-32b, devstral presets

- hyperstack.rb: add extra_vllm_args array field to preset resolver and vllm_install_script; flags are appended verbatim to the docker run command, enabling per-preset vLLM flags (reasoning parsers, Mistral loader)
- hyperstack.rb: show extra_args in dry-run model switch output
- hyperstack-vm.toml: fix nemotron-super to use actual NVIDIA Nemotron-3-Super-120B-A12B AWQ (cyankiwi) with trust_remote_code=true; previous preset incorrectly used llama-3.3-70b
- hyperstack-vm.toml: add deepseek-r1-32b (--reasoning-parser deepseek_r1, ~18 GB)
- hyperstack-vm.toml: add qwen3-32b (--reasoning-parser deepseek_r1, ~18 GB)
- hyperstack-vm.toml: add devstral (Mistral tokenizer+config format, ~15 GB); --load_format mistral omitted because AWQ weights are in standard HF safetensors format

All 6 new/updated presets end-to-end tested on A100 80GB (vLLM 0.17.1).

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
author: Paul Buetow <paul@buetow.org> 2026-03-18 16:50:38 +0200
committer: Paul Buetow <paul@buetow.org> 2026-03-18 16:50:38 +0200
commit: d3821c76ecd18bf6256d7493596c304fff784d29 (patch)
tree: 4c940bbff57ba48ede1d057c6803aa03635f8bc3 /snippets/hyperstack
parent: e9f57c66ba76b11e11a715c112e35394386a7831 (diff)
2 files changed, 55 insertions, 11 deletions
diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml
index 14d9ed0..f1c80a7 100644
--- a/snippets/hyperstack/hyperstack-vm.toml
+++ b/snippets/hyperstack/hyperstack-vm.toml
@@ -89,18 +89,19 @@ gpu_memory_utilization = 0.92
 tensor_parallel_size = 1
 tool_call_parser = "qwen3_coder"
 
-# Llama-3.3-70B-Instruct AWQ 4-bit — deep reasoning / extended code analysis.
-# ~35 GB weights on A100 80GB; 32K context window fits within KV budget.
-# Replaces nemotron-super: the NAS model (cyankiwi AWQ) has num_key_value_heads=null
-# in its config.json (by design for the heterogeneous architecture), which is
-# incompatible with vLLM's pydantic ModelConfig validation (requires int).
+# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
+# ~60 GB weights on A100 80GB; 256K context window (most of 80 GB available for KV cache).
+# Requires trust_remote_code=true for the nemotron_h architecture.
+# Note: cyankiwi AWQ has model_type="nemotron_nas" (underscore); vLLM keys on "nemotron-nas"
+# (hyphen), so vLLM may not recognise it unless trust_remote_code is set and a recent vLLM release is used.
 [vllm.presets.nemotron-super]
-model = "casperhansen/llama-3.3-70b-instruct-awq"
-container_name = "vllm_llama70b"
-max_model_len = 32768
+model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
+container_name = "vllm_nemotron_super"
+max_model_len = 65536
 gpu_memory_utilization = 0.92
 tensor_parallel_size = 1
-tool_call_parser = "llama3_json"
+tool_call_parser = ""
+trust_remote_code = true
 
 # OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100.
 # Native MXFP4 quantization; vLLM auto-detects it (no --quantization flag needed).
@@ -147,6 +148,43 @@ gpu_memory_utilization = 0.92
 tensor_parallel_size = 1
 tool_call_parser = "qwen3_coder"
 
+# DeepSeek-R1-Distill-Qwen-32B AWQ — R1 reasoning distillation of Qwen 32B, ~18 GB on A100.
+# Generates <think> reasoning tokens; --reasoning-parser deepseek_r1 exposes them in the API.
+# tool_call_parser="" disables tool calling (reasoning models don't support it reliably).
+[vllm.presets.deepseek-r1-32b]
+model = "casperhansen/deepseek-r1-distill-qwen-32b-awq"
+container_name = "vllm_deepseek_r1_32b"
+max_model_len = 32768
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = ""
+extra_vllm_args = ["--reasoning-parser", "deepseek_r1"]
+
+# Qwen3-32B AWQ — dense 32B reasoning model with extended context, ~18 GB on A100.
+# Native thinking mode; --reasoning-parser deepseek_r1 is compatible with Qwen3 thinking format.
+# tool_call_parser="" disables tool calling (reasoning models don't support it reliably).
+[vllm.presets.qwen3-32b]
+model = "Qwen/Qwen3-32B-AWQ"
+container_name = "vllm_qwen3_32b"
+max_model_len = 32768
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = ""
+extra_vllm_args = ["--reasoning-parser", "deepseek_r1"]
+
+# Devstral-Small-2507 AWQ — Mistral's coding agent model (~15 GB on A100).
+# Uses HF safetensors weights but Mistral tokenizer (tekken.json) and config (params.json).
+# --load_format mistral is NOT used: AWQ weights are in standard HF safetensors format.
+# --tokenizer_mode mistral and --config_format mistral handle the Mistral-native files.
+[vllm.presets.devstral]
+model = "cyankiwi/Devstral-Small-2507-AWQ-4bit"
+container_name = "vllm_devstral"
+max_model_len = 32768
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "mistral"
+extra_vllm_args = ["--tokenizer_mode", "mistral", "--config_format", "mistral"]
+
 [wireguard]
 auto_setup = true
 setup_script = "./wg1-setup.sh"
diff --git a/snippets/hyperstack/hyperstack.rb b/snippets/hyperstack/hyperstack.rb
index cf2ee0d..2ab53c9 100644
--- a/snippets/hyperstack/hyperstack.rb
+++ b/snippets/hyperstack/hyperstack.rb
@@ -382,7 +382,10 @@ module HyperstackVM
         # Empty string means "no tool calling"; use key? so empty string doesn't fall back to default.
         'tool_call_parser'       => raw.key?('tool_call_parser') ? raw['tool_call_parser'] : vllm_tool_call_parser,
         # trust_remote_code: required by some models (e.g. Nemotron) for custom architectures.
-        'trust_remote_code'      => raw.key?('trust_remote_code') ? raw['trust_remote_code'] : false
+        'trust_remote_code'      => raw.key?('trust_remote_code') ? raw['trust_remote_code'] : false,
+        # extra_vllm_args: arbitrary additional flags passed verbatim to the vLLM docker command.
+        # Used for Mistral tokenizer/config-format flags or reasoning parsers (deepseek_r1).
+        'extra_vllm_args'        => raw.key?('extra_vllm_args') ? Array(raw['extra_vllm_args']) : []
       }
     end
 
@@ -890,7 +893,8 @@ module HyperstackVM
         info "  container: #{old_container} → #{new_container}"
         trust_note  = preset['trust_remote_code'] ? ', trust_remote_code: true' : ''
         parser_note = preset['tool_call_parser'].to_s.empty? ? 'none' : preset['tool_call_parser']
-        info "  max_model_len: #{preset['max_model_len']}, tool_call_parser: #{parser_note}#{trust_note}"
+        extra_note  = preset['extra_vllm_args']&.any? ? ", extra_args: #{preset['extra_vllm_args'].join(' ')}" : ''
+        info "  max_model_len: #{preset['max_model_len']}, tool_call_parser: #{parser_note}#{trust_note}#{extra_note}"
         return
       end
 
@@ -1660,6 +1664,8 @@ module HyperstackVM
         docker_args << "--tool-call-parser #{Shellwords.escape(parser)}"
       end
       docker_args << '--trust-remote-code' if trust_remote
+      # Append any extra flags verbatim, without shell escaping (e.g. Mistral tokenizer/config flags, reasoning parser).
+      (cfg['extra_vllm_args'] || []).each { |arg| docker_args << arg }
       docker_run = docker_args.join(' ')
 
       script = []
author	Paul Buetow <paul@buetow.org>	2026-03-18 16:50:38 +0200
committer	Paul Buetow <paul@buetow.org>	2026-03-18 16:50:38 +0200
commit	d3821c76ecd18bf6256d7493596c304fff784d29 (patch)
tree	4c940bbff57ba48ede1d057c6803aa03635f8bc3 /snippets/hyperstack
parent	e9f57c66ba76b11e11a715c112e35394386a7831 (diff)