summaryrefslogtreecommitdiff
path: root/snippets/hyperstack/hyperstack-vm.toml
diff options
context:
space:
mode:
Diffstat (limited to 'snippets/hyperstack/hyperstack-vm.toml')
-rw-r--r--snippets/hyperstack/hyperstack-vm.toml4
1 files changed, 2 insertions, 2 deletions
diff --git a/snippets/hyperstack/hyperstack-vm.toml b/snippets/hyperstack/hyperstack-vm.toml
index d7d09e3..cd16615 100644
--- a/snippets/hyperstack/hyperstack-vm.toml
+++ b/snippets/hyperstack/hyperstack-vm.toml
@@ -124,12 +124,12 @@ tool_call_parser = ""
# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100.
# MXFP4 KV cache is very compact: vLLM allocates 10560 blocks × 16 tokens = 168,960 tokens of KV cache at 0.92 util.
-# 131072 fits Claude Code's ~33K system prompt with room for long conversations.
+# 163840 (160 × 1024) stays within the 168,960-token physical KV capacity; handles long Claude Code sessions (135K+ tokens).
# tool_call_parser = "" disables --enable-auto-tool-choice (same reason as gpt-oss-20b).
[vllm.presets.gpt-oss-120b]
model = "openai/gpt-oss-120b"
container_name = "vllm_gpt_oss_120b"
-max_model_len = 131072
+max_model_len = 163840
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = ""