From 07f91d85eb7d115ccfbecb9841712a12d36e874e Mon Sep 17 00:00:00 2001
From: Paul Buetow <paul@buetow.org>
Date: Wed, 18 Mar 2026 18:06:42 +0200
Subject: nemotron-super: set max_model_len=262144 (256K); document NoPE and
 OOM risk
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tested 1M context (NoPE allows arbitrary max_position_embeddings without
YaRN), but it OOMs on an A100 80GB: too little VRAM is left once the ~60GB of
model weights are loaded. 256K (262144) is the practical ceiling on this hardware.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 snippets/hyperstack/hyperstack-vm.toml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'snippets/hyperstack')

diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml
index 4e47a76..e8f6251 100644
--- a/snippets/hyperstack/hyperstack-vm.toml
+++ b/snippets/hyperstack/hyperstack-vm.toml
@@ -90,7 +90,8 @@ tensor_parallel_size = 1
 tool_call_parser = "qwen3_coder"
 
 # NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
-# ~60 GB weights on A100 80GB; 256K context window (most of 80 GB available for KV cache).
+# ~60 GB weights on A100 80GB. Uses NoPE (no positional embeddings), so max_model_len can be
+# raised without YaRN. 1M was tested and OOMs on A100 80GB; 256K is the practical ceiling.
 # Requires trust_remote_code=true for the nemotron_h architecture.
 # Note: cyankiwi AWQ has model_type="nemotron_nas" (underscore); vLLM keys on "nemotron-nas"
 # (hyphen), so vLLM may not recognise it without trust_remote_code and latest vLLM.
@@ -100,7 +101,7 @@ tool_call_parser = "qwen3_coder"
 [vllm.presets.nemotron-super]
 model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
 container_name = "vllm_nemotron_super"
-max_model_len = 65536
+max_model_len = 262144
 gpu_memory_utilization = 0.92
 tensor_parallel_size = 1
 tool_call_parser = "qwen3_xml"
-- 
cgit v1.2.3