Fix nemotron-super preset and test_vllm model detection

Replace cyankiwi/Llama-3_3-Nemotron-Super-49B-v1_5-AWQ-4bit with
casperhansen/llama-3.3-70b-instruct-awq for the nemotron-super preset.
The NAS model's config.json has num_key_value_heads=null by design for
its heterogeneous per-layer attention architecture, which is incompatible
with vLLM's pydantic ModelConfig validation (requires int). No working
AWQ quant for this architecture exists; Llama-3.3-70B-Instruct AWQ is a
proven drop-in for the extended-analysis use case.

Also fix test_vllm to use the model reported by /v1/models instead of the
static config default, so tests pass after a model switch.

Add trust_remote_code support to vllm_install_script for future models
that require custom HuggingFace model code.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
author: Paul Buetow <paul@buetow.org> 2026-03-18 13:14:26 +0200
committer: Paul Buetow <paul@buetow.org> 2026-03-18 13:14:26 +0200
commit: 5dc0878f9617b3472dc819b3662f52bfefad892a (patch)
tree: ed974db930596dc826305d04174ba47a018bd50e /snippets/hyperstack/hyperstack-vm.toml
parent: 2a2704fa4cac96a6754d4fea1bc341a27c5bb6c8 (diff)
1 files changed, 8 insertions, 7 deletions
diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml
index c19c8d5..9ed3abe 100644
--- a/snippets/hyperstack/hyperstack-vm.toml
+++ b/snippets/hyperstack/hyperstack-vm.toml
@@ -89,14 +89,15 @@ gpu_memory_utilization = 0.92
 tensor_parallel_size = 1
 tool_call_parser = "qwen3_coder"
 
-# Nemotron-Super 49B AWQ — deep reasoning / extended code analysis.
-# ~25 GB weights + KV cache fits comfortably on A100 80GB.
-# Verify the exact HuggingFace AWQ model ID before first use:
-#   curl -s http://192.168.3.1:11434/v1/models | python3 -m json.tool
+# Llama-3.3-70B-Instruct AWQ 4-bit — deep reasoning / extended code analysis.
+# ~35 GB weights on A100 80GB; 32K context window fits within KV budget.
+# Replaces nemotron-super: the NAS model (cyankiwi AWQ) has num_key_value_heads=null
+# in its config.json (by design for the heterogeneous architecture), which is
+# incompatible with vLLM's pydantic ModelConfig validation (requires int).
 [vllm.presets.nemotron-super]
-model = "solidrust/Llama-3.3-Nemotron-Super-49B-v1-AWQ"
-container_name = "vllm_nemotron"
-max_model_len = 131072
+model = "casperhansen/llama-3.3-70b-instruct-awq"
+container_name = "vllm_llama70b"
+max_model_len = 32768
 gpu_memory_utilization = 0.92
 tensor_parallel_size = 1
 tool_call_parser = "llama3_json"
author	Paul Buetow <paul@buetow.org>	2026-03-18 13:14:26 +0200
committer	Paul Buetow <paul@buetow.org>	2026-03-18 13:14:26 +0200
commit	5dc0878f9617b3472dc819b3662f52bfefad892a (patch)
tree	ed974db930596dc826305d04174ba47a018bd50e /snippets/hyperstack/hyperstack-vm.toml
parent	2a2704fa4cac96a6754d4fea1bc341a27c5bb6c8 (diff)