diff options
| author | Paul Buetow <paul@buetow.org> | 2026-05-24 14:02:34 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-05-24 14:02:34 +0300 |
| commit | c8bd4d1e7a34ebf452d3d6c843d5cef785abe608 (patch) | |
| tree | ec1e6c19379c3ba86f6d80d90286eceae393b983 /hyperstack-vm2.toml | |
| parent | f16f4b753b3bf317e6da79f479ff5f506ed34b47 (diff) | |
replace qwen3-coder-next with qwen3.6-27b across configs, docs, and tooling
Diffstat (limited to 'hyperstack-vm2.toml')
| -rw-r--r-- | hyperstack-vm2.toml | 10 |
1 file changed, 1 insertion, 9 deletions
```diff
diff --git a/hyperstack-vm2.toml b/hyperstack-vm2.toml
index c3605ff..faa8054 100644
--- a/hyperstack-vm2.toml
+++ b/hyperstack-vm2.toml
@@ -55,7 +55,7 @@ listen_host = "0.0.0.0:11434"
 gpu_overhead_mb = 2000
 num_parallel = 1
 context_length = 32768
-pull_models = ["qwen3-coder-next"]
+pull_models = ["qwen36-27b"]
 
 # vLLM serves one model via Docker on the OpenAI-compatible API.
 # VM2 defaults to Qwen3.6 27B; use 'model switch' to load any other preset.
@@ -102,14 +102,6 @@ docker_image = "vllm/vllm-openai:nightly"
 pre_start_cmd = "pip install -q transformers==5.5.0 2>/dev/null"
 extra_docker_env = ["CUDA_VISIBLE_DEVICES=0"]
 
-[vllm.presets.qwen3-coder-next]
-model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
-container_name = "vllm_qwen3"
-max_model_len = 262144
-gpu_memory_utilization = 0.92
-tensor_parallel_size = 1
-tool_call_parser = "qwen3_coder"
-
 # NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
 # ~60 GB weights on A100 80GB; ~13 GB remaining for KV cache at 0.92 utilisation.
 # Uses NoPE so any context length is valid; capped at 131072 to keep KV cache within VRAM budget.
```
