diff options
| author | Paul Buetow <paul@buetow.org> | 2026-03-21 09:46:58 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-03-21 09:46:58 +0200 |
| commit | c693f37a6115f3567cd4fcff4c256a6d20dd6fac (patch) | |
| tree | 04e18f502616535013bab0c7c513a1aabdb9c2f2 /snippets/hyperstack/hyperstack-vm2.toml | |
| parent | 3f6ef419f52c3361c8914a27c7949c2c8f2be1c8 (diff) | |
moved
Diffstat (limited to 'snippets/hyperstack/hyperstack-vm2.toml')
| -rw-r--r-- | snippets/hyperstack/hyperstack-vm2.toml | 182 |
1 files changed, 0 insertions, 182 deletions
diff --git a/snippets/hyperstack/hyperstack-vm2.toml b/snippets/hyperstack/hyperstack-vm2.toml deleted file mode 100644 index e8e9b00..0000000 --- a/snippets/hyperstack/hyperstack-vm2.toml +++ /dev/null @@ -1,182 +0,0 @@ -[auth] -api_key_file = "~/.hyperstack" - -[hyperstack] -base_url = "https://infrahub-api.nexgencloud.com/v1" - -[state] -# Separate state file for VM2 so vm1 and vm2 can be managed independently. -file = ".hyperstack-vm2-state.json" - -[vm] -name_prefix = "hyperstack2" -hostname = "hyperstack2" -environment_name = "snonux-ollama" - -# A100-80GB is the cost-first default for qwen3-coder-next inference. -# Switch this to n3-H100x1 if you want safer throughput and compatibility headroom. -flavor_name = "n3-A100x1" -image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker" -assign_floating_ip = true -create_bootable_volume = false -enable_port_randomization = false -labels = ["qwen3-coder-next", "wireguard"] - -[ssh] -username = "ubuntu" -private_key_path = "~/.ssh/id_rsa" -hyperstack_key_name = "earth" -port = 22 -connect_timeout_sec = 10 - -[network] -wireguard_udp_port = 56710 -wireguard_subnet = "192.168.3.0/24" -# VM2 gets the third server-side WireGuard IP (skipping .2 which is the earth client). -# earth (client) is 192.168.3.2; VM1 is 192.168.3.1; VM2 is 192.168.3.3. -wireguard_server_ip = "192.168.3.3" -# Secure default: "auto" resolves your current public egress IP to /32 at runtime. -# Override with explicit CIDRs if you deploy from multiple networks or want broader access. -allowed_ssh_cidrs = ["auto"] -allowed_wireguard_cidrs = ["auto"] -# Port 11434 is shared by both Ollama and vLLM for firewall compatibility. -ollama_port = 11434 -# Port 4000: LiteLLM Anthropic-API proxy (used with vLLM). -litellm_port = 4000 - -[bootstrap] -enable_guest_bootstrap = true -install_wireguard = true -configure_ufw = true -configure_ollama_host = false - -[ollama] -# Disabled in favour of vLLM; set install = true to switch back to Ollama. -install = false -models_dir = "/ephemeral/ollama/models" -listen_host = "0.0.0.0:11434" -gpu_overhead_mb = 2000 -num_parallel = 1 -context_length = 32768 -pull_models = ["qwen3-coder-next"] - -# vLLM serves one model via Docker; LiteLLM translates Anthropic API → OpenAI. -# VM2 defaults to qwen3-coder-next; use 'model switch' to load any other preset. -[vllm] -install = true -model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit" -# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers). -hug_cache_dir = "/ephemeral/hug" -container_name = "vllm_qwen3" -max_model_len = 262144 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_coder" -# LiteLLM maps each entry to the vLLM model; add new Anthropic model IDs here. -litellm_master_key = "sk-litellm-master" -litellm_claude_model_names = [ - "claude-sonnet-4-20250514", - "claude-opus-4-20250514", - "claude-opus-4-6-20260604", - "claude-haiku-3-5-20241022" -] - -# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm2.toml model switch <name>'. -# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults. - -[vllm.presets.qwen3-coder-next] -model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit" -container_name = "vllm_qwen3" -max_model_len = 262144 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_coder" - -# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total). -# ~60 GB weights on A100 80GB. Uses NoPE so context can be set to 1M; no YaRN needed. -# Requires trust_remote_code=true for the nemotron_h architecture. -[vllm.presets.nemotron-super] -model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit" -container_name = "vllm_nemotron_super" -max_model_len = 262144 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_xml" -trust_remote_code = true -extra_vllm_args = ["--reasoning-parser", "nemotron_v3"] - -# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100. -[vllm.presets.gpt-oss-20b] -model = "openai/gpt-oss-20b" -container_name = "vllm_gpt_oss_20b" -max_model_len = 65536 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" - -# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100. -# Hard architecture limit: max_position_embeddings=131072 in model config.json. -[vllm.presets.gpt-oss-120b] -model = "openai/gpt-oss-120b" -container_name = "vllm_gpt_oss_120b" -max_model_len = 131072 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" - -# Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100. -[vllm.presets.qwen25-coder-32b] -model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ" -container_name = "vllm_qwen25_coder32b" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "hermes" - -# Qwen3-Coder-30B-A3B AWQ — Qwen3 generation coding MoE (3B active / 30B total), ~18 GB. -[vllm.presets.qwen3-coder-30b] -model = "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ" -container_name = "vllm_qwen3_coder30b" -max_model_len = 65536 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "qwen3_coder" - -# DeepSeek-R1-Distill-Qwen-32B AWQ — R1 reasoning distillation of Qwen 32B, ~18 GB on A100. -[vllm.presets.deepseek-r1-32b] -model = "casperhansen/deepseek-r1-distill-qwen-32b-awq" -container_name = "vllm_deepseek_r1_32b" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" -extra_vllm_args = ["--reasoning-parser", "deepseek_r1"] - -# Qwen3-32B AWQ — dense 32B reasoning model with extended context, ~18 GB on A100. -[vllm.presets.qwen3-32b] -model = "Qwen/Qwen3-32B-AWQ" -container_name = "vllm_qwen3_32b" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "" -extra_vllm_args = ["--reasoning-parser", "deepseek_r1"] - -# Devstral-Small-2507 AWQ — Mistral's coding agent model (~15 GB on A100). -[vllm.presets.devstral] -model = "cyankiwi/Devstral-Small-2507-AWQ-4bit" -container_name = "vllm_devstral" -max_model_len = 32768 -gpu_memory_utilization = 0.92 -tensor_parallel_size = 1 -tool_call_parser = "mistral" -extra_vllm_args = ["--tokenizer_mode", "mistral", "--config_format", "mistral"] - -[wireguard] -auto_setup = true -setup_script = "./wg1-setup.sh" - -[local_client] -check_wg1_service = true -interface_name = "wg1" -config_path = "/etc/wireguard/wg1.conf" |
