diff options
Diffstat (limited to 'hyperstack-vm2.toml')
| -rw-r--r-- | hyperstack-vm2.toml | 9 |
1 files changed, 6 insertions, 3 deletions
diff --git a/hyperstack-vm2.toml b/hyperstack-vm2.toml
index 202a340..d3c0a17 100644
--- a/hyperstack-vm2.toml
+++ b/hyperstack-vm2.toml
@@ -83,17 +83,20 @@ tensor_parallel_size = 1
 tool_call_parser = "qwen3_coder"
 
 # NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
-# ~60 GB weights on A100 80GB. Uses NoPE so context can be set to 1M; no YaRN needed.
+# ~60 GB weights on A100 80GB; ~13 GB remaining for KV cache at 0.92 utilisation.
+# Uses NoPE so any context length is valid; capped at 131072 to keep KV cache within VRAM budget.
 # Requires trust_remote_code=true for the nemotron_h architecture.
 [vllm.presets.nemotron-super]
 model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
 container_name = "vllm_nemotron_super"
-max_model_len = 262144
+max_model_len = 131072
 gpu_memory_utilization = 0.92
 tensor_parallel_size = 1
 tool_call_parser = "qwen3_xml"
 trust_remote_code = true
-extra_vllm_args = ["--reasoning-parser", "nemotron_v3"]
+# --enforce-eager disables CUDA graph capture, freeing ~3-4 GB of VRAM the model
+# otherwise needs alongside the ~60 GB weights. Trades some throughput for stability.
+extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"]
 
 # OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100.
 [vllm.presets.gpt-oss-20b]
|
