From 0fff87a18044cdbf7af72157451ebdd801e3ff69 Mon Sep 17 00:00:00 2001
From: Paul Buetow <paul@buetow.org>
Date: Wed, 18 Mar 2026 18:56:28 +0200
Subject: gpt-oss-120b: raise max_model_len to 163840 (160K)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

131K was still too small — observed 135K token conversations in practice.
Physical KV capacity is ~168K tokens (10560 blocks × 16), so 160K is safe without OOM.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 snippets/hyperstack/hyperstack-vm.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'snippets/hyperstack')

diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml
index d7d09e3..cd16615 100644
--- a/snippets/hyperstack/hyperstack-vm.toml
+++ b/snippets/hyperstack/hyperstack-vm.toml
@@ -124,12 +124,12 @@ tool_call_parser = ""
 
 # OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100.
 # MXFP4 KV cache is very compact: vLLM allocates 168K token blocks (10560×16) at 0.92 util.
-# 131072 fits Claude Code's ~33K system prompt with room for long conversations.
+# 163840 (160K) stays within physical KV capacity; handles long Claude Code sessions (135K+).
 # tool_call_parser = "" disables --enable-auto-tool-choice (same reason as gpt-oss-20b).
 [vllm.presets.gpt-oss-120b]
 model = "openai/gpt-oss-120b"
 container_name = "vllm_gpt_oss_120b"
-max_model_len = 131072
+max_model_len = 163840
 gpu_memory_utilization = 0.92
 tensor_parallel_size = 1
 tool_call_parser = ""
-- 
cgit v1.2.3