From 3b01d5cb2c8932207127e7dd72848cea91c6347d Mon Sep 17 00:00:00 2001
From: Paul Buetow
Date: Wed, 18 Mar 2026 18:58:41 +0200
Subject: =?UTF-8?q?gpt-oss-120b:=20revert=20to=20131072=20=E2=80=94=20hard?=
 =?UTF-8?q?=20architecture=20limit?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

max_position_embeddings=131072 in model config.json; exceeding it causes
NaN/CUDA OOB. 163840 was rejected by vLLM at startup. The 135K error
requires starting a fresh opencode conversation instead.

Co-Authored-By: Claude Sonnet 4.6 (1M context)
---
 snippets/hyperstack/hyperstack-vm.toml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'snippets/hyperstack')

diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml
index cd16615..e23294f 100644
--- a/snippets/hyperstack/hyperstack-vm.toml
+++ b/snippets/hyperstack/hyperstack-vm.toml
@@ -123,13 +123,14 @@ tensor_parallel_size = 1
 tool_call_parser = ""
 
 # OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100.
-# MXFP4 KV cache is very compact: vLLM allocates 168K token blocks (10560×16) at 0.92 util.
-# 163840 (160K) stays within physical KV capacity; handles long Claude Code sessions (135K+).
+# Hard architecture limit: max_position_embeddings=131072 in model config.json.
+# 131072 is the absolute ceiling — exceeding it causes NaN or CUDA OOB errors.
+# For sessions approaching this limit, start a fresh opencode conversation.
 # tool_call_parser = "" disables --enable-auto-tool-choice (same reason as gpt-oss-20b).
 [vllm.presets.gpt-oss-120b]
 model = "openai/gpt-oss-120b"
 container_name = "vllm_gpt_oss_120b"
-max_model_len = 163840
+max_model_len = 131072
 gpu_memory_utilization = 0.92
 tensor_parallel_size = 1
 tool_call_parser = ""
-- 
cgit v1.2.3