replace qwen3-coder-next with qwen3.6-27b across configs, docs, and tooling

author: Paul Buetow <paul@buetow.org> 2026-05-24 14:02:34 +0300
committer: Paul Buetow <paul@buetow.org> 2026-05-24 14:02:34 +0300
commit: c8bd4d1e7a34ebf452d3d6c843d5cef785abe608 (patch)
tree: ec1e6c19379c3ba86f6d80d90286eceae393b983 /hyperstack-vm2.toml
parent: f16f4b753b3bf317e6da79f479ff5f506ed34b47 (diff)
1 files changed, 1 insertions, 9 deletions
diff --git a/hyperstack-vm2.toml b/hyperstack-vm2.toml
index c3605ff..faa8054 100644
--- a/hyperstack-vm2.toml
+++ b/hyperstack-vm2.toml
@@ -55,7 +55,7 @@ listen_host = "0.0.0.0:11434"
 gpu_overhead_mb = 2000
 num_parallel = 1
 context_length = 32768
-pull_models = ["qwen3-coder-next"]
+pull_models = ["qwen36-27b"]
 
 # vLLM serves one model via Docker on the OpenAI-compatible API.
 # VM2 defaults to Qwen3.6 27B; use 'model switch' to load any other preset.
@@ -102,14 +102,6 @@ docker_image = "vllm/vllm-openai:nightly"
 pre_start_cmd = "pip install -q transformers==5.5.0 2>/dev/null"
 extra_docker_env = ["CUDA_VISIBLE_DEVICES=0"]
 
-[vllm.presets.qwen3-coder-next]
-model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
-container_name = "vllm_qwen3"
-max_model_len = 262144
-gpu_memory_utilization = 0.92
-tensor_parallel_size = 1
-tool_call_parser = "qwen3_coder"
-
 # NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
 # ~60 GB weights on A100 80GB; ~13 GB remaining for KV cache at 0.92 utilisation.
 # Uses NoPE so any context length is valid; capped at 131072 to keep KV cache within VRAM budget.
author	Paul Buetow <paul@buetow.org>	2026-05-24 14:02:34 +0300
committer	Paul Buetow <paul@buetow.org>	2026-05-24 14:02:34 +0300
commit	c8bd4d1e7a34ebf452d3d6c843d5cef785abe608 (patch)
tree	ec1e6c19379c3ba86f6d80d90286eceae393b983 /hyperstack-vm2.toml
parent	f16f4b753b3bf317e6da79f479ff5f506ed34b47 (diff)