[auth]
api_key_file = "~/.hyperstack"

[hyperstack]
base_url = "https://infrahub-api.nexgencloud.com/v1"

[state]
file = ".hyperstack-vm-state.json"

[vm]
name_prefix = "hyperstack"
hostname = "hyperstack"
environment_name = "snonux-ollama"
# A100-80GB is the cost-first default for gpt-oss-120b inference.
# Switch this to n3-H100x1 if you want safer throughput and compatibility headroom.
flavor_name = "n3-A100x1"
image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
assign_floating_ip = true
create_bootable_volume = false
enable_port_randomization = false
labels = ["gpt-oss-120b", "wireguard"]

[ssh]
username = "ubuntu"
private_key_path = "~/.ssh/id_rsa"
hyperstack_key_name = "earth"
port = 22
connect_timeout_sec = 10

[network]
wireguard_udp_port = 56710
wireguard_subnet = "192.168.3.0/24"
# Port 11434 is shared by both Ollama and vLLM for firewall compatibility.
ollama_port = 11434
# Port 4000: LiteLLM Anthropic-API proxy (used with vLLM).
litellm_port = 4000
# 0.0.0.0/0 matches every IPv4 address.
# NOTE(review): narrow these to known source ranges if the consumer uses
# them as firewall allow-lists - confirm against the provisioning code.
allowed_ssh_cidrs = ["0.0.0.0/0"]
allowed_wireguard_cidrs = ["0.0.0.0/0"]

[bootstrap]
enable_guest_bootstrap = true
install_wireguard = true
configure_ufw = true
configure_ollama_host = false

[ollama]
# Disabled in favour of vLLM; set install = true to switch back to Ollama.
install = false
models_dir = "/ephemeral/ollama/models"
listen_host = "0.0.0.0:11434"
gpu_overhead_mb = 2000
num_parallel = 1
context_length = 32768
pull_models = [
    "qwen3-coder-next",
    "qwen3-coder:30b",
    "gpt-oss:20b",
    "gpt-oss:120b",
    "nemotron-3-super",
]

# vLLM serves one model via Docker; LiteLLM translates Anthropic API → OpenAI.
# Use --vllm / --no-vllm CLI flags to override install at runtime.
[vllm]
install = true
model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
hug_cache_dir = "/ephemeral/hug"
container_name = "vllm_qwen3"
max_model_len = 262144
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"
# NOTE(review): this looks like a placeholder credential stored in plain
# text; confirm it is overridden before the proxy is reachable by others.
litellm_master_key = "sk-litellm-master"
# LiteLLM maps each entry to the vLLM model; add new Anthropic model IDs here.
litellm_claude_model_names = [
    "claude-sonnet-4-20250514",
    "claude-opus-4-20250514",
    "claude-opus-4-6-20260604",
    "claude-haiku-3-5-20241022",
]

[wireguard]
auto_setup = true
setup_script = "./wg1-setup.sh"

[local_client]
check_wg1_service = true
interface_name = "wg1"
config_path = "/etc/wireguard/wg1.conf"