diff options
Diffstat (limited to 'snippets/hyperstack')
| -rw-r--r-- | snippets/hyperstack/hyperstack-vm.toml | 9 |
1 file changed, 6 insertions, 3 deletions
diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml
index 4ec6879..e739d5f 100644
--- a/snippets/hyperstack/hyperstack-vm.toml
+++ b/snippets/hyperstack/hyperstack-vm.toml
@@ -94,16 +94,19 @@ tool_call_parser = "qwen3_coder"
 # Requires trust_remote_code=true for the nemotron_h architecture.
 # Note: cyankiwi AWQ has model_type="nemotron_nas" (underscore); vLLM keys on "nemotron-nas"
 # (hyphen), so vLLM may not recognise it without trust_remote_code and latest vLLM.
+# Tool calling: Nemotron uses a custom XML format (<tool_call><function=...><parameter=...>)
+# not supported by any vLLM 0.17.1 built-in parser. tool_call_parser="" disables tool calling.
+# Use for long-context analysis and reasoning; switch to qwen3-coder-next for agentic work.
 [vllm.presets.nemotron-super]
 model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
 container_name = "vllm_nemotron_super"
 max_model_len = 65536
 gpu_memory_utilization = 0.92
 tensor_parallel_size = 1
-# llama3_json lets vLLM accept tool_choice requests (required by opencode).
-# Nemotron won't spontaneously call tools, so the vLLM 0.17.1 token_ids bug won't trigger.
-tool_call_parser = "llama3_json"
+tool_call_parser = ""
 trust_remote_code = true
+# nemotron_v3 reasoning parser exposes <think> tokens as reasoning_content in the API.
+extra_vllm_args = ["--reasoning-parser", "nemotron_v3"]
 
 # OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100.
 # Native MXFP4 quantization; vLLM auto-detects it (no --quantization flag needed).
