diff options
| author | Paul Buetow <paul@buetow.org> | 2026-03-18 18:52:31 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-03-18 18:52:31 +0200 |
| commit | 98858030d4c9c81849dcd49d6212255cbda28755 (patch) | |
| tree | 32ba6ce9f519ca1bca9b499d62407d7489b1a957 /snippets/hyperstack | |
| parent | 3fe076087ea50ca56f211c4f4c00c8c08b0479da (diff) | |
gpt-oss-120b: raise max_model_len to 131072
MXFP4 KV cache is compact enough that vLLM allocated 168K token blocks
(10560×16) at 0.92 utilization — the 40K limit was too conservative and
caused negative max_tokens errors in long Claude Code sessions.
Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
Diffstat (limited to 'snippets/hyperstack')
| -rw-r--r-- | snippets/hyperstack/hyperstack-vm.toml | 6 |
1 files changed, 3 insertions, 3 deletions
```diff
diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml
index e8f6251..d7d09e3 100644
--- a/snippets/hyperstack/hyperstack-vm.toml
+++ b/snippets/hyperstack/hyperstack-vm.toml
@@ -123,13 +123,13 @@ tensor_parallel_size = 1
 tool_call_parser = ""
 
 # OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100.
-# Leaves ~8 GB for KV cache; 40K context is the practical ceiling on a single A100 80GB.
-# Set >= 40K so Claude Code's ~33K system prompt fits (opencode needs only ~14K).
+# MXFP4 KV cache is very compact: vLLM allocates 168K token blocks (10560×16) at 0.92 util.
+# 131072 fits Claude Code's ~33K system prompt with room for long conversations.
 # tool_call_parser = "" disables --enable-auto-tool-choice (same reason as gpt-oss-20b).
 [vllm.presets.gpt-oss-120b]
 model = "openai/gpt-oss-120b"
 container_name = "vllm_gpt_oss_120b"
-max_model_len = 40960
+max_model_len = 131072
 gpu_memory_utilization = 0.92
 tensor_parallel_size = 1
 tool_call_parser = ""
```
