diff options
| -rw-r--r-- | README.md | 10 | ||||
| -rw-r--r-- | hyperstack-vm1.toml | 22 | ||||
| -rw-r--r-- | hyperstack-vm2.toml | 20 | ||||
| -rw-r--r-- | hypr.fish | 1 | ||||
| -rw-r--r-- | lib/hyperstack/config.rb | 4 | ||||
| -rw-r--r-- | lib/hyperstack/manager.rb | 2 | ||||
| -rw-r--r-- | lib/hyperstack/provisioning.rb | 2 | ||||
| -rw-r--r-- | pi/agent/models.json | 96 |
8 files changed, 10 insertions, 147 deletions
@@ -166,7 +166,7 @@ definitions are available without any manual config editing. Source `hyperstack.fish` or copy the abbreviations into your Fish config: ```fish -abbr pi-hyperstack pi --model hyperstack/openai/gpt-oss-120b +abbr pi-hyperstack pi --model hyperstack/Qwen/Qwen3.6-27B-FP8 abbr pi-hyperstack-coder pi --model hyperstack1/Qwen/Qwen3.6-27B-FP8 abbr pi-hyperstack-qwen36 pi --model hyperstack2/Qwen/Qwen3.6-27B-FP8 abbr pi-hyperstack-gemma4 pi --model hyperstack2/cyankiwi/gemma-4-31B-it-AWQ-4bit @@ -175,7 +175,7 @@ abbr pi-hyperstack-gemma4 pi --model hyperstack2/cyankiwi/gemma-4-31B-it-AWQ-4b Then launch a session after the VM(s) are up: ```fish -pi-hyperstack # GPT-OSS 120B on VM1 +pi-hyperstack # Qwen3.6 27B FP8 on VM1 pi-hyperstack-coder # Qwen3.6 27B FP8 on VM1 pi-hyperstack-qwen36 # Qwen3.6 27B FP8 on VM2 pi-hyperstack-gemma4 # Gemma 4 31B on VM2 @@ -187,7 +187,7 @@ Three providers are defined, one per setup, each pointing at its vLLM endpoint o | Provider | Base URL | Primary model | |----------|----------|---------------| -| `hyperstack` | `http://hyperstack.wg1:11434/v1` | GPT-OSS 120B (single-VM) | +| `hyperstack` | `http://hyperstack.wg1:11434/v1` | Qwen3.6 27B FP8 (single-VM) | | `hyperstack1` | `http://hyperstack1.wg1:11434/v1` | Qwen3.6 27B FP8 (default; presets in TOML) | | `hyperstack2` | `http://hyperstack2.wg1:11434/v1` | Gemma 4 31B (default; presets in TOML) | @@ -212,7 +212,7 @@ After loading a different model on a VM with `model switch` (see [Switching mode tell Pi to use it without restarting the session: ``` -model switch hyperstack1/openai/gpt-oss-120b +model switch hyperstack1/Qwen/Qwen3.6-27B-FP8 ``` Pi sends subsequent requests to the new model ID immediately; the provider base URL stays the same. @@ -281,8 +281,6 @@ Available presets (both VMs share the same set): | `gemma4-31b` | Gemma 4 31B IT (AWQ-4bit) | ~19 GB | 32K–128K (see TOML) | | `nemotron-super` | Nemotron-3-Super 120B (Mamba+MoE, 12B active) | ~60 GB | 131K | | `qwen36-27b` | Qwen3.6 27B FP8 | ~45 GB | 262K | -| `gpt-oss-120b` | GPT-OSS 120B (MoE, MXFP4) | ~65 GB | 131K | -| `gpt-oss-20b` | GPT-OSS 20B (MoE, MXFP4) | ~14 GB | 65K | | `qwen25-coder-32b` | Qwen2.5-Coder-32B-Instruct (AWQ) | ~18 GB | 32K | | `qwen3-coder-30b` | Qwen3-Coder-30B-A3B (MoE, AWQ) | ~18 GB | 65K | | `deepseek-r1-32b` | DeepSeek-R1-Distill-Qwen-32B (AWQ) | ~18 GB | 32K | diff --git a/hyperstack-vm1.toml b/hyperstack-vm1.toml index 75c313c..d28dbb8 100644 --- a/hyperstack-vm1.toml +++ b/hyperstack-vm1.toml @@ -55,7 +55,7 @@ listen_host = "0.0.0.0:11434" gpu_overhead_mb = 2000 num_parallel = 1 context_length = 32768 -pull_models = ["qwen36-27b", "qwen3-coder:30b", "gpt-oss:20b", "gpt-oss:120b", "nemotron-3-super"] +pull_models = ["qwen36-27b", "qwen3-coder:30b", "nemotron-3-super"] # vLLM serves one model via Docker on the OpenAI-compatible API. # VM1 defaults to Qwen3.6 27B; use 'model switch' to load any other preset. @@ -102,26 +102,6 @@ enable_prefix_caching = false extra_docker_env = ["VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "PYTORCH_ALLOC_CONF=expandable_segments:True"] extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"] -# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100. -[vllm.presets.gpt-oss-20b] -model = "openai/gpt-oss-20b" -container_name = "vllm_gpt_oss_20b" -max_model_len = 65536 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" - -# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100. -# Hard architecture limit: max_position_embeddings=131072 in model config.json. -[vllm.presets.gpt-oss-120b] -model = "openai/gpt-oss-120b" -container_name = "vllm_gpt_oss_120b" -max_model_len = 131072 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" -extra_vllm_args = ["--reasoning-parser", "openai_gptoss"] - # Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100. [vllm.presets.qwen25-coder-32b] model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ" diff --git a/hyperstack-vm2.toml b/hyperstack-vm2.toml index faa8054..3e74aae 100644 --- a/hyperstack-vm2.toml +++ b/hyperstack-vm2.toml @@ -118,26 +118,6 @@ trust_remote_code = true # otherwise needs alongside the ~60 GB weights. Trades some throughput for stability. extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"] -# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100. -[vllm.presets.gpt-oss-20b] -model = "openai/gpt-oss-20b" -container_name = "vllm_gpt_oss_20b" -max_model_len = 65536 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" - -# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100. -# Hard architecture limit: max_position_embeddings=131072 in model config.json. -[vllm.presets.gpt-oss-120b] -model = "openai/gpt-oss-120b" -container_name = "vllm_gpt_oss_120b" -max_model_len = 131072 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" -extra_vllm_args = ["--reasoning-parser", "openai_gptoss"] - # Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100. [vllm.presets.qwen25-coder-32b] model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ" @@ -1,4 +1,5 @@ # Dual-VM setup (hyperstack-vm1/vm2.toml -> hyperstack1/2.wg1) +abbr pi-hyperstack pi --model hyperstack1/Qwen/Qwen3.6-27B-FP8 abbr pi-hyperstack-coder pi --model hyperstack1/Qwen/Qwen3.6-27B-FP8 abbr pi-hyperstack-qwen36 pi --model hyperstack2/Qwen/Qwen3.6-27B-FP8 abbr pi-hyperstack-gemma4 pi --model hyperstack2/cyankiwi/gemma-4-31B-it-AWQ-4bit diff --git a/lib/hyperstack/config.rb b/lib/hyperstack/config.rb index 7057b4f..e41d1dd 100644 --- a/lib/hyperstack/config.rb +++ b/lib/hyperstack/config.rb @@ -49,7 +49,7 @@ module HyperstackVM 'assign_floating_ip' => true, 'create_bootable_volume' => false, 'enable_port_randomization' => false, - 'labels' => %w[gpt-oss-120b wireguard] + 'labels' => %w[qwen36-27b wireguard] }, 'ssh' => { 'username' => 'ubuntu', @@ -81,7 +81,7 @@ module HyperstackVM 'gpu_overhead_mb' => 2000, 'num_parallel' => 1, 'context_length' => 32_768, - 'pull_models' => ['qwen3-coder:30b', 'gpt-oss:20b', 'gpt-oss:120b', 'nemotron-3-super'] + 'pull_models' => ['qwen3-coder:30b', 'qwen36-27b', 'nemotron-3-super'] }, 'vllm' => { 'install' => true, diff --git a/lib/hyperstack/manager.rb b/lib/hyperstack/manager.rb index 7a68199..e8382bb 100644 --- a/lib/hyperstack/manager.rb +++ b/lib/hyperstack/manager.rb @@ -938,7 +938,7 @@ module HyperstackVM req.body = JSON.generate( 'model' => model, 'messages' => [{ 'role' => 'user', 'content' => prompt }], - # 500 tokens: reasoning models (e.g. gpt-oss) use tokens for chain-of-thought + # 500 tokens: reasoning models use tokens for chain-of-thought # before content; 50 is too small and yields an empty content field. 'max_tokens' => 500 ) diff --git a/lib/hyperstack/provisioning.rb b/lib/hyperstack/provisioning.rb index 0b56559..948cd2a 100644 --- a/lib/hyperstack/provisioning.rb +++ b/lib/hyperstack/provisioning.rb @@ -143,7 +143,7 @@ module HyperstackVM tp_size = Integer(cfg['tensor_parallel_size'] || @config.vllm_tensor_parallel_size) parser = cfg['tool_call_parser'] # parser is nil only when preset explicitly omits the key and config has no default; - # empty string means "disable tool calling" (e.g. gpt-oss reasoning models). + # empty string means "disable tool calling" (e.g. reasoning models). parser = @config.vllm_tool_call_parser if parser.nil? # Fall back to the top-level [vllm] config values when no preset is in use. # This allows setting trust_remote_code / extra_vllm_args in the default [vllm] block diff --git a/pi/agent/models.json b/pi/agent/models.json index a5e8200..3636503 100644 --- a/pi/agent/models.json +++ b/pi/agent/models.json @@ -11,38 +11,6 @@ }, "models": [ { - "id": "openai/gpt-oss-120b", - "name": "GPT-OSS 120B [vm]", - "reasoning": true, - "input": [ - "text" - ], - "cost": { - "input": 0, - "output": 0, - "cacheRead": 0, - "cacheWrite": 0 - }, - "contextWindow": 131072, - "maxTokens": 8192 - }, - { - "id": "openai/gpt-oss-20b", - "name": "GPT-OSS 20B [vm]", - "reasoning": false, - "input": [ - "text" - ], - "cost": { - "input": 0, - "output": 0, - "cacheRead": 0, - "cacheWrite": 0 - }, - "contextWindow": 65536, - "maxTokens": 8192 - }, - { "id": "Qwen/Qwen3.6-27B-FP8", "name": "Qwen3.6 27B FP8 [vm]", "reasoning": true, @@ -255,38 +223,6 @@ } }, { - "id": "openai/gpt-oss-20b", - "name": "GPT-OSS 20B [vm1]", - "reasoning": false, - "input": [ - "text" - ], - "cost": { - "input": 0, - "output": 0, - "cacheRead": 0, - "cacheWrite": 0 - }, - "contextWindow": 65536, - "maxTokens": 8192 - }, - { - "id": "openai/gpt-oss-120b", - "name": "GPT-OSS 120B [vm1]", - "reasoning": true, - "input": [ - "text" - ], - "cost": { - "input": 0, - "output": 0, - "cacheRead": 0, - "cacheWrite": 0 - }, - "contextWindow": 131072, - "maxTokens": 8192 - }, - { "id": "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ", "name": "Qwen2.5 Coder 32B [vm1]", "reasoning": false, @@ -467,38 +403,6 @@ "maxTokens": 8192 }, { - "id": "openai/gpt-oss-20b", - "name": "GPT-OSS 20B [vm2]", - "reasoning": false, - "input": [ - "text" - ], - "cost": { - "input": 0, - "output": 0, - "cacheRead": 0, - "cacheWrite": 0 - }, - "contextWindow": 65536, - "maxTokens": 8192 - }, - { - "id": "openai/gpt-oss-120b", - "name": "GPT-OSS 120B [vm2]", - "reasoning": true, - "input": [ - "text" - ], - "cost": { - "input": 0, - "output": 0, - "cacheRead": 0, - "cacheWrite": 0 - }, - "contextWindow": 131072, - "maxTokens": 8192 - }, - { "id": "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ", "name": "Qwen2.5 Coder 32B [vm2]", "reasoning": false, |
