diff options
| author | Paul Buetow <paul@buetow.org> | 2026-03-18 18:06:42 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-03-18 18:06:42 +0200 |
| commit | 07f91d85eb7d115ccfbecb9841712a12d36e874e (patch) | |
| tree | 3f9c2db006ae22dd7a5deb8d243675fdb32b09c7 | |
| parent | 1122c9373cadb90d28b8d588e73f84b86237fd15 (diff) | |
nemotron-super: set max_model_len=262144 (256K); document NoPE and OOM risk
Tested 1M context (NoPE allows arbitrary max_position_embeddings without
YaRN) — OOMs on A100 80GB due to insufficient VRAM after 60GB model weights.
256K (262144) is the practical ceiling on this hardware.
Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
| -rw-r--r-- | snippets/hyperstack/hyperstack-vm.toml | 5 |
1 file changed, 3 insertions, 2 deletions
```diff
diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml
index 4e47a76..e8f6251 100644
--- a/snippets/hyperstack/hyperstack-vm.toml
+++ b/snippets/hyperstack/hyperstack-vm.toml
@@ -90,7 +90,8 @@ tensor_parallel_size = 1
 tool_call_parser = "qwen3_coder"
 
 # NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
-# ~60 GB weights on A100 80GB; 256K context window (most of 80 GB available for KV cache).
+# ~60 GB weights on A100 80GB. Uses NoPE (no positional embeddings) so context can be set to
+# 1M by just raising max_model_len; no YaRN needed. May OOM above 256K on A100 80GB.
 # Requires trust_remote_code=true for the nemotron_h architecture.
 # Note: cyankiwi AWQ has model_type="nemotron_nas" (underscore); vLLM keys on "nemotron-nas"
 # (hyphen), so vLLM may not recognise it without trust_remote_code and latest vLLM.
@@ -100,7 +101,7 @@ tool_call_parser = "qwen3_coder"
 [vllm.presets.nemotron-super]
 model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
 container_name = "vllm_nemotron_super"
-max_model_len = 65536
+max_model_len = 262144
 gpu_memory_utilization = 0.92
 tensor_parallel_size = 1
 tool_call_parser = "qwen3_xml"
```
