summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r--  README.md  10
-rw-r--r--  hyperstack-vm1.toml  22
-rw-r--r--  hyperstack-vm2.toml  20
-rw-r--r--  hypr.fish  1
-rw-r--r--  lib/hyperstack/config.rb  4
-rw-r--r--  lib/hyperstack/manager.rb  2
-rw-r--r--  lib/hyperstack/provisioning.rb  2
-rw-r--r--  pi/agent/models.json  96
8 files changed, 10 insertions, 147 deletions
diff --git a/README.md b/README.md
index cdb4df4..39c669e 100644
--- a/README.md
+++ b/README.md
@@ -166,7 +166,7 @@ definitions are available without any manual config editing.
Source `hyperstack.fish` or copy the abbreviations into your Fish config:
```fish
-abbr pi-hyperstack pi --model hyperstack/openai/gpt-oss-120b
+abbr pi-hyperstack pi --model hyperstack/Qwen/Qwen3.6-27B-FP8
abbr pi-hyperstack-coder pi --model hyperstack1/Qwen/Qwen3.6-27B-FP8
abbr pi-hyperstack-qwen36 pi --model hyperstack2/Qwen/Qwen3.6-27B-FP8
abbr pi-hyperstack-gemma4 pi --model hyperstack2/cyankiwi/gemma-4-31B-it-AWQ-4bit
@@ -175,7 +175,7 @@ abbr pi-hyperstack-gemma4 pi --model hyperstack2/cyankiwi/gemma-4-31B-it-AWQ-4b
Then launch a session after the VM(s) are up:
```fish
-pi-hyperstack # GPT-OSS 120B on VM1
+pi-hyperstack # Qwen3.6 27B FP8 on VM1
pi-hyperstack-coder # Qwen3.6 27B FP8 on VM1
pi-hyperstack-qwen36 # Qwen3.6 27B FP8 on VM2
pi-hyperstack-gemma4 # Gemma 4 31B on VM2
@@ -187,7 +187,7 @@ Three providers are defined, one per setup, each pointing at its vLLM endpoint o
| Provider | Base URL | Primary model |
|----------|----------|---------------|
-| `hyperstack` | `http://hyperstack.wg1:11434/v1` | GPT-OSS 120B (single-VM) |
+| `hyperstack` | `http://hyperstack.wg1:11434/v1` | Qwen3.6 27B FP8 (single-VM) |
| `hyperstack1` | `http://hyperstack1.wg1:11434/v1` | Qwen3.6 27B FP8 (default; presets in TOML) |
| `hyperstack2` | `http://hyperstack2.wg1:11434/v1` | Gemma 4 31B (default; presets in TOML) |
@@ -212,7 +212,7 @@ After loading a different model on a VM with `model switch` (see [Switching mode
tell Pi to use it without restarting the session:
```
-model switch hyperstack1/openai/gpt-oss-120b
+model switch hyperstack1/Qwen/Qwen3.6-27B-FP8
```
Pi sends subsequent requests to the new model ID immediately; the provider base URL stays the same.
@@ -281,8 +281,6 @@ Available presets (both VMs share the same set):
| `gemma4-31b` | Gemma 4 31B IT (AWQ-4bit) | ~19 GB | 32K–128K (see TOML) |
| `nemotron-super` | Nemotron-3-Super 120B (Mamba+MoE, 12B active) | ~60 GB | 131K |
| `qwen36-27b` | Qwen3.6 27B FP8 | ~45 GB | 262K |
-| `gpt-oss-120b` | GPT-OSS 120B (MoE, MXFP4) | ~65 GB | 131K |
-| `gpt-oss-20b` | GPT-OSS 20B (MoE, MXFP4) | ~14 GB | 65K |
| `qwen25-coder-32b` | Qwen2.5-Coder-32B-Instruct (AWQ) | ~18 GB | 32K |
| `qwen3-coder-30b` | Qwen3-Coder-30B-A3B (MoE, AWQ) | ~18 GB | 65K |
| `deepseek-r1-32b` | DeepSeek-R1-Distill-Qwen-32B (AWQ) | ~18 GB | 32K |
diff --git a/hyperstack-vm1.toml b/hyperstack-vm1.toml
index 75c313c..d28dbb8 100644
--- a/hyperstack-vm1.toml
+++ b/hyperstack-vm1.toml
@@ -55,7 +55,7 @@ listen_host = "0.0.0.0:11434"
gpu_overhead_mb = 2000
num_parallel = 1
context_length = 32768
-pull_models = ["qwen36-27b", "qwen3-coder:30b", "gpt-oss:20b", "gpt-oss:120b", "nemotron-3-super"]
+pull_models = ["qwen36-27b", "qwen3-coder:30b", "nemotron-3-super"]
# vLLM serves one model via Docker on the OpenAI-compatible API.
# VM1 defaults to Qwen3.6 27B; use 'model switch' to load any other preset.
@@ -102,26 +102,6 @@ enable_prefix_caching = false
extra_docker_env = ["VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "PYTORCH_ALLOC_CONF=expandable_segments:True"]
extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"]
-# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100.
-[vllm.presets.gpt-oss-20b]
-model = "openai/gpt-oss-20b"
-container_name = "vllm_gpt_oss_20b"
-max_model_len = 65536
-gpu_memory_utilization = 0.92
-tensor_parallel_size = 1
-tool_call_parser = ""
-
-# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100.
-# Hard architecture limit: max_position_embeddings=131072 in model config.json.
-[vllm.presets.gpt-oss-120b]
-model = "openai/gpt-oss-120b"
-container_name = "vllm_gpt_oss_120b"
-max_model_len = 131072
-gpu_memory_utilization = 0.92
-tensor_parallel_size = 1
-tool_call_parser = ""
-extra_vllm_args = ["--reasoning-parser", "openai_gptoss"]
-
# Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100.
[vllm.presets.qwen25-coder-32b]
model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ"
diff --git a/hyperstack-vm2.toml b/hyperstack-vm2.toml
index faa8054..3e74aae 100644
--- a/hyperstack-vm2.toml
+++ b/hyperstack-vm2.toml
@@ -118,26 +118,6 @@ trust_remote_code = true
# otherwise needs alongside the ~60 GB weights. Trades some throughput for stability.
extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"]
-# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100.
-[vllm.presets.gpt-oss-20b]
-model = "openai/gpt-oss-20b"
-container_name = "vllm_gpt_oss_20b"
-max_model_len = 65536
-gpu_memory_utilization = 0.92
-tensor_parallel_size = 1
-tool_call_parser = ""
-
-# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100.
-# Hard architecture limit: max_position_embeddings=131072 in model config.json.
-[vllm.presets.gpt-oss-120b]
-model = "openai/gpt-oss-120b"
-container_name = "vllm_gpt_oss_120b"
-max_model_len = 131072
-gpu_memory_utilization = 0.92
-tensor_parallel_size = 1
-tool_call_parser = ""
-extra_vllm_args = ["--reasoning-parser", "openai_gptoss"]
-
# Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100.
[vllm.presets.qwen25-coder-32b]
model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ"
diff --git a/hypr.fish b/hypr.fish
index 60f8356..f243255 100644
--- a/hypr.fish
+++ b/hypr.fish
@@ -1,4 +1,5 @@
# Dual-VM setup (hyperstack-vm1/vm2.toml -> hyperstack1/2.wg1)
+abbr pi-hyperstack pi --model hyperstack1/Qwen/Qwen3.6-27B-FP8
abbr pi-hyperstack-coder pi --model hyperstack1/Qwen/Qwen3.6-27B-FP8
abbr pi-hyperstack-qwen36 pi --model hyperstack2/Qwen/Qwen3.6-27B-FP8
abbr pi-hyperstack-gemma4 pi --model hyperstack2/cyankiwi/gemma-4-31B-it-AWQ-4bit
diff --git a/lib/hyperstack/config.rb b/lib/hyperstack/config.rb
index 7057b4f..e41d1dd 100644
--- a/lib/hyperstack/config.rb
+++ b/lib/hyperstack/config.rb
@@ -49,7 +49,7 @@ module HyperstackVM
'assign_floating_ip' => true,
'create_bootable_volume' => false,
'enable_port_randomization' => false,
- 'labels' => %w[gpt-oss-120b wireguard]
+ 'labels' => %w[qwen36-27b wireguard]
},
'ssh' => {
'username' => 'ubuntu',
@@ -81,7 +81,7 @@ module HyperstackVM
'gpu_overhead_mb' => 2000,
'num_parallel' => 1,
'context_length' => 32_768,
- 'pull_models' => ['qwen3-coder:30b', 'gpt-oss:20b', 'gpt-oss:120b', 'nemotron-3-super']
+ 'pull_models' => ['qwen3-coder:30b', 'qwen36-27b', 'nemotron-3-super']
},
'vllm' => {
'install' => true,
diff --git a/lib/hyperstack/manager.rb b/lib/hyperstack/manager.rb
index 7a68199..e8382bb 100644
--- a/lib/hyperstack/manager.rb
+++ b/lib/hyperstack/manager.rb
@@ -938,7 +938,7 @@ module HyperstackVM
req.body = JSON.generate(
'model' => model,
'messages' => [{ 'role' => 'user', 'content' => prompt }],
- # 500 tokens: reasoning models (e.g. gpt-oss) use tokens for chain-of-thought
+ # 500 tokens: reasoning models use tokens for chain-of-thought
# before content; 50 is too small and yields an empty content field.
'max_tokens' => 500
)
diff --git a/lib/hyperstack/provisioning.rb b/lib/hyperstack/provisioning.rb
index 0b56559..948cd2a 100644
--- a/lib/hyperstack/provisioning.rb
+++ b/lib/hyperstack/provisioning.rb
@@ -143,7 +143,7 @@ module HyperstackVM
tp_size = Integer(cfg['tensor_parallel_size'] || @config.vllm_tensor_parallel_size)
parser = cfg['tool_call_parser']
# parser is nil only when preset explicitly omits the key and config has no default;
- # empty string means "disable tool calling" (e.g. gpt-oss reasoning models).
+ # empty string means "disable tool calling" (e.g. reasoning models).
parser = @config.vllm_tool_call_parser if parser.nil?
# Fall back to the top-level [vllm] config values when no preset is in use.
# This allows setting trust_remote_code / extra_vllm_args in the default [vllm] block
diff --git a/pi/agent/models.json b/pi/agent/models.json
index a5e8200..3636503 100644
--- a/pi/agent/models.json
+++ b/pi/agent/models.json
@@ -11,38 +11,6 @@
},
"models": [
{
- "id": "openai/gpt-oss-120b",
- "name": "GPT-OSS 120B [vm]",
- "reasoning": true,
- "input": [
- "text"
- ],
- "cost": {
- "input": 0,
- "output": 0,
- "cacheRead": 0,
- "cacheWrite": 0
- },
- "contextWindow": 131072,
- "maxTokens": 8192
- },
- {
- "id": "openai/gpt-oss-20b",
- "name": "GPT-OSS 20B [vm]",
- "reasoning": false,
- "input": [
- "text"
- ],
- "cost": {
- "input": 0,
- "output": 0,
- "cacheRead": 0,
- "cacheWrite": 0
- },
- "contextWindow": 65536,
- "maxTokens": 8192
- },
- {
"id": "Qwen/Qwen3.6-27B-FP8",
"name": "Qwen3.6 27B FP8 [vm]",
"reasoning": true,
@@ -255,38 +223,6 @@
}
},
{
- "id": "openai/gpt-oss-20b",
- "name": "GPT-OSS 20B [vm1]",
- "reasoning": false,
- "input": [
- "text"
- ],
- "cost": {
- "input": 0,
- "output": 0,
- "cacheRead": 0,
- "cacheWrite": 0
- },
- "contextWindow": 65536,
- "maxTokens": 8192
- },
- {
- "id": "openai/gpt-oss-120b",
- "name": "GPT-OSS 120B [vm1]",
- "reasoning": true,
- "input": [
- "text"
- ],
- "cost": {
- "input": 0,
- "output": 0,
- "cacheRead": 0,
- "cacheWrite": 0
- },
- "contextWindow": 131072,
- "maxTokens": 8192
- },
- {
"id": "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ",
"name": "Qwen2.5 Coder 32B [vm1]",
"reasoning": false,
@@ -467,38 +403,6 @@
"maxTokens": 8192
},
{
- "id": "openai/gpt-oss-20b",
- "name": "GPT-OSS 20B [vm2]",
- "reasoning": false,
- "input": [
- "text"
- ],
- "cost": {
- "input": 0,
- "output": 0,
- "cacheRead": 0,
- "cacheWrite": 0
- },
- "contextWindow": 65536,
- "maxTokens": 8192
- },
- {
- "id": "openai/gpt-oss-120b",
- "name": "GPT-OSS 120B [vm2]",
- "reasoning": true,
- "input": [
- "text"
- ],
- "cost": {
- "input": 0,
- "output": 0,
- "cacheRead": 0,
- "cacheWrite": 0
- },
- "contextWindow": 131072,
- "maxTokens": 8192
- },
- {
"id": "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ",
"name": "Qwen2.5 Coder 32B [vm2]",
"reasoning": false,