nemotron-super: revert to no tool calling; add nemotron_v3 reasoning parser

vLLM 0.17.1 has no tool call parser for Nemotron's custom XML format (<tool_call><function=...><parameter=...>). Setting tool_call_parser="llama3_json" produced garbage output, so this reverts the preset to tool_call_parser="" (tool calling disabled) and adds a comment explaining why. Also adds --reasoning-parser nemotron_v3 via extra_vllm_args so that <think> tokens are properly exposed as reasoning_content in the API response. For agentic work requiring tool calls, switch to qwen3-coder-next or devstral.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
author: Paul Buetow <paul@buetow.org> 2026-03-18 17:42:00 +0200
committer: Paul Buetow <paul@buetow.org> 2026-03-18 17:42:00 +0200
commit: a7d3d2d4339815cf4a39b58873069b07a0ac1d47 (patch)
tree: a8271bd320e846965b36fd8d430b4da3130d422d /snippets/hyperstack
parent: bda86a3c91b307e25507e975927c3dde38f65a74 (diff)
1 files changed, 6 insertions, 3 deletions
diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml
index 4ec6879..e739d5f 100644
--- a/snippets/hyperstack/hyperstack-vm.toml
+++ b/snippets/hyperstack/hyperstack-vm.toml
@@ -94,16 +94,19 @@ tool_call_parser = "qwen3_coder"
 # Requires trust_remote_code=true for the nemotron_h architecture.
 # Note: cyankiwi AWQ has model_type="nemotron_nas" (underscore); vLLM keys on "nemotron-nas"
 # (hyphen), so vLLM may not recognise it without trust_remote_code and latest vLLM.
+# Tool calling: Nemotron emits a custom XML format (<tool_call><function=...><parameter=...>)
+# that no built-in vLLM 0.17.1 parser supports, so tool_call_parser="" disables tool calling.
+# Use this preset for long-context analysis and reasoning; switch to qwen3-coder-next for agentic work.
 [vllm.presets.nemotron-super]
 model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
 container_name = "vllm_nemotron_super"
 max_model_len = 65536
 gpu_memory_utilization = 0.92
 tensor_parallel_size = 1
-# llama3_json lets vLLM accept tool_choice requests (required by opencode).
-# Nemotron won't spontaneously call tools, so the vLLM 0.17.1 token_ids bug won't trigger.
-tool_call_parser = "llama3_json"
+tool_call_parser = ""
 trust_remote_code = true
+# nemotron_v3 reasoning parser exposes <think> tokens as reasoning_content in the API.
+extra_vllm_args = ["--reasoning-parser", "nemotron_v3"]
 
 # OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100.
 # Native MXFP4 quantization; vLLM auto-detects it (no --quantization flag needed).
author	Paul Buetow <paul@buetow.org>	2026-03-18 17:42:00 +0200
committer	Paul Buetow <paul@buetow.org>	2026-03-18 17:42:00 +0200
commit	a7d3d2d4339815cf4a39b58873069b07a0ac1d47 (patch)
tree	a8271bd320e846965b36fd8d430b4da3130d422d /snippets/hyperstack
parent	bda86a3c91b307e25507e975927c3dde38f65a74 (diff)