From 290ea93e7c7475996a11ce6651237d8a803228c0 Mon Sep 17 00:00:00 2001
From: Paul Buetow <paul@buetow.org>
Date: Sun, 24 May 2026 18:31:43 +0300
Subject: chore(vm2): H100 provisioning, L40 plan, and H100-specific vLLM
 tuning

---
 PLAN-L40.md         | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 hyperstack-vm2.toml |   9 ++-
 2 files changed, 163 insertions(+), 3 deletions(-)
 create mode 100644 PLAN-L40.md

diff --git a/PLAN-L40.md b/PLAN-L40.md
new file mode 100644
index 0000000..3d0b1ff
--- /dev/null
+++ b/PLAN-L40.md
@@ -0,0 +1,157 @@
+# Plan: VM1 on Hyperstack L40 with Qwen3.6 MoE + TurboQuant
+
+**Prepared:** 2026-05-24  
+**Scope:** Research and planning only — no code changes, no provisioning.
+
+---
+
+## 1. GPU and VM sizing (Hyperstack L40)
+
+| Item | Assessment |
+|---|---|
+| **Flavor** | Hyperstack’s GPU flavors use the `n3-*` prefix (see current `n3-A100x1` / `n3-H100x1`). The L40 48 GB flavor is expected to be named `n3-L40x1` or `n3-L40Sx1`; exact string must be verified via the Hyperstack console/API before updating `hyperstack-vm1.toml`. |
+| **VRAM** | 48 GB (vs 80 GB on the current A100). That is a hard ceiling for both model weights and KV cache. |
+| **Cost** | L40/L40S nodes are generally cheaper than A100/H100 on Hyperstack. Assuming the tiered pricing model, an L40 should reduce the hourly cost of VM1, but the final price depends on the exact `flavor_name` and any egress charges. |
+
+## 2. Model choice: what actually fits on 48 GB
+
+The prompt mentions **Qwen3.6 MoE (e.g. 235B-A22B)**. A 235B-parameter model in BF16 would require **> 400 GB** of VRAM, which is impossible on a single L40. The only Qwen3.6 MoE that is publicly released and could *potentially* fit is **Qwen3.6-35B-A3B** (35B total / 3B active), but even that is **~70 GB in BF16**.
+
+**Realistic options to make it fit in 48 GB:**
+
+| Option | Weight size (est.) | Fit on 48 GB? | Notes |
+|---|---|---|---|
+| **AWQ 4-bit** Qwen3.6-35B-A3B | ~18 GB | Yes | Needs a community or official AWQ checkpoint (not yet listed as official at the time of writing, but AWQ/GPTQ variants usually appear quickly). |
+| **FP8** Qwen3.6-35B-A3B (if available) | ~35 GB | Tight | Leaves ~10 GB for KV cache, activations and CUDA graphs. vLLM profiling may tip it over. |
+| **Qwen3.6-27B dense** (current VM2 default) | ~27 GB FP8 | Yes | Not MoE; defeats the purpose of the task. |
+
+**Recommendation:** Target an **AWQ 4-bit (or GPTQ 4-bit) Qwen3.6-35B-A3B** checkpoint, or wait for an official **FP8** checkpoint and accept a reduced `max_model_len`. Do not attempt the 235B-A22B variant on a single L40.
+
+## 3. vLLM + TurboQuant compatibility
+
+TurboQuant is a KV-cache compression backend in vLLM. Key upstream state:
+
+- **PR #39931** (merged 2026-05-05) added TurboQuant support for *hybrid* architectures (attention + Mamba/MoE).
+- **Issue #41726** reports a fatal crash during **chunked continuation prefill** on hybrid MoE models (e.g. Qwen3.5-9B NVFP4). Root cause: TurboQuant’s `_continuation_prefill` path requests workspace memory that was not reserved during warmup.
+- **PR #40798** is open as a candidate fix but **not yet merged**.
+
+**Implications for Qwen3.6-35B-A3B:**
+- Because Qwen3.6 uses a hybrid attention+Mamba architecture, it is in the exact class of models affected by #41726.
+- If TurboQuant is enabled (`--kv-cache-dtype turboquant_k8v4`, `--kv-cache-dtype turboquant_4bit_nc`, etc.), any long prompt that crosses a chunked-prefill boundary will likely trigger:
+  ```
+  AssertionError: Workspace is locked but allocation ... requires X MB, current size is Y MB.
+  ```
+
+**Mitigations available today:**
+1. **Disable chunked prefill:** Pass `--no-enable-chunked-prefill` in `extra_vllm_args`. This avoids the `_continuation_prefill` path entirely. Trade-off: large prefills are no longer split into chunks, which can increase latency for long inputs and may OOM if a single prefill is very large.
+2. **Use `--enforce-eager`:** Disables CUDA graph capture, which slightly changes memory layout but does **not** solve the workspace lock issue by itself. It is useful mainly to save a few GB of VRAM on tight GPUs.
+3. **Wait for PR #40798** to merge and land in a stable vLLM image.
+
+## 4. Recommended `hyperstack-vm1.toml` changes (conceptual)
+
+```toml
+[vm]
+# Verify exact flavor string with Hyperstack API before deploying.
+flavor_name = "n3-L40x1"          # or n3-L40Sx1
+labels = ["qwen36-moe", "wireguard"]
+
+[vllm]
+install = true
+model = "Qwen/Qwen3.6-35B-A3B-AWQ"   # or the best available quantized MoE
+container_name = "vllm_qwen36_moe"
+max_model_len = 65536                  # conservative for 48 GB; can raise if AWQ
+gpu_memory_utilization = 0.92
+tensor_parallel_size = 1
+tool_call_parser = "qwen3_coder"
+
+# TurboQuant KV cache on a hybrid MoE
+extra_vllm_args = [
+  "--reasoning-parser", "qwen3",
+  "--kv-cache-dtype", "turboquant_k8v4",
+  "--no-enable-chunked-prefill"        # mitigation for issue #41726
+]
+
+# Nightly image post-PR-39931 is required; pin to a known-good digest until 0.20.2+
+docker_image = "vllm/vllm-openai:nightly"
+```
+
+**VRAM estimate (AWQ 4-bit + TurboQuant K8V4 on L40 48 GB):**
+
+| Consumer | Est. size |
+|---|---|
+| AWQ weights (35B params @ 4-bit) | ~18 GB |
+| Activations / MoE routing / logits | ~4–6 GB |
+| CUDA graphs (if not eager) | ~2 GB |
+| KV cache (TurboQuant) | ~20–24 GB |
+| **Headroom** | **~0–4 GB** |
+
+Because headroom is thin, `gpu_memory_utilization=0.92` is appropriate. If profiling OOMs, raise it to `0.95` or drop `max_model_len`. If vLLM still OOMs during startup, try `--enforce-eager` to reclaim the CUDA-graph memory.
+
+## 5. CLI and WireGuard implications
+
+| Area | Impact |
+|---|---|
+| `--vm 1 / 2 / both` | No structural changes. The CLI already resolves `hyperstack-vm1.toml` independently via its own state file. Switching the flavor/model is transparent to `--vm 2`. |
+| WireGuard | `wireguard_server_ip = "192.168.3.1"` stays the same. Recreating VM1 yields a new public IP, so the local `wg1.conf` peer endpoint must be refreshed (`ruby hyperstack.rb --vm 1 create` already handles this via `wg1-setup.sh`). The tunnel subnet `192.168.3.0/24` is unchanged. |
+| Port 11434 / firewall | Unchanged. Port 56710 UDP and 22 TCP remain locked to `allowed_wireguard_cidrs` / `allowed_ssh_cidrs`. |
+| Dual-VM routing | The client can continue to round-robin or fallback between `192.168.3.1` (VM1, MoE) and `192.168.3.3` (VM2, dense). No code changes needed. |
+
+## 6. Risks
+
+| Risk | Severity | Mitigation |
+|---|---|---|
+| **TurboQuant crash (#41726)** on hybrid MoE | High | Disable chunked prefill now; migrate to fixed vLLM nightly once PR #40798 lands. |
+| **Model does not fit** in 48 GB if no AWQ/FP8 checkpoint exists | High | Confirm a 4-bit or FP8 checkpoint is on HuggingFace before provisioning. Fallback to Qwen3.6-27B dense (moves goalposts). |
+| **Performance regression** from no chunked prefill | Medium | Expect higher TTFB on long prompts. Monitor with `ruby hyperstack.rb --vm 1 test`. |
+| **Flavor unavailability** | Medium | Have a fallback flavor ready (e.g. `n3-A100x1` on VM1 if L40 is sold out), or accept A100 pricing. |
+| **Nightly Docker image instability** | Medium | Pin to a specific digest (`vllm/vllm-openai@sha256:...`) after first successful smoke test. |
+
+## 7. Step-by-step migration plan (if you decide to proceed)
+
+1. **Verify asset availability**
+   - Confirm Hyperstack offers an L40 flavor and note its exact name.
+   - Locate a Qwen3.6-35B-A3B AWQ/FP8 checkpoint on HuggingFace. If none exists, abort or pivot to the dense 27B.
+
+2. **Snapshot / backup**
+   - Ensure VM2 (A100 dense) is stable and passing tests (`ruby hyperstack.rb --vm 2 test`).
+   - Save current VM1 state file as `.hyperstack-vm1-state.json.bak` in case a fast rollback is needed.
+
+3. **Update configuration**
+   - Edit `hyperstack-vm1.toml`:
+     - `flavor_name` → L40 flavor.
+     - `[vllm]` block → new model ID, container name, conservative `max_model_len`.
+     - Add `docker_image = "vllm/vllm-openai:nightly"` (or a pinned digest).
+     - Add TurboQuant arg and chunked-prefill mitigation to `extra_vllm_args`.
+   - Update `[vm] labels` to reflect the new model.
+
+4. **Provision**
+   ```bash
+   ruby hyperstack.rb --vm 1 create --replace
+   ```
+   The `--replace` flag tears down the old A100 VM1 and rebuilds it on L40.
+
+5. **Post-create validation**
+   - Check WireGuard handshake: `sudo wg show wg1 latest-handshakes`.
+   - Ping tunnel IP: `ping -c 3 192.168.3.1`.
+   - Query vLLM: `curl -s http://192.168.3.1:11434/v1/models`.
+   - Run the automated test suite: `ruby hyperstack.rb --vm 1 test`.
+
+6. **Smoke test for TurboQuant stability**
+   - Send a conversation with a very long system prompt (> 4096 tokens) and tool schemas to force a chunked-prefill boundary.
+   - If the engine crashes with the workspace assertion, apply the fallback:
+     - Add `--enforce-eager` to `extra_vllm_args`, or
+     - Fall back to `--kv-cache-dtype fp8` (loses TurboQuant compression but is stable).
+
+7. **Dual-VM confirmation**
+   - Run `ruby hyperstack.rb --vm both test` to ensure both endpoints are healthy and reachable through the WireGuard tunnel.
+
+8. **Monitor and iterate**
+   - Watch VRAM usage with `nvidia-smi` inside the VM.
+   - Adjust `max_model_len` and `gpu_memory_utilization` as needed.
+   - Once upstream PR #40798 merges, rebuild the Docker image with the fixed vLLM version and re-enable chunked prefill.
+
+---
+
+## Bottom line
+
+The L40 is a cost-efficient target *if* a quantized Qwen3.6-35B-A3B checkpoint is available. The biggest blocker is the open vLLM issue #41726 (TurboQuant + hybrid MoE crash on chunked prefill). Disabling chunked prefill is a viable short-term workaround, but it comes with a latency trade-off and must be validated before making VM1 the default endpoint.
diff --git a/hyperstack-vm2.toml b/hyperstack-vm2.toml
index 3e74aae..070b8aa 100644
--- a/hyperstack-vm2.toml
+++ b/hyperstack-vm2.toml
@@ -14,7 +14,8 @@ hostname = "hyperstack2"
 environment_name = "snonux-ollama"
 
 # A100-80GB for Qwen3.6 27B; H100 fallback if n3-A100x1 unavailable.
-flavor_name = "n3-A100x1"
+# 2026-05-24: A100 sold out, switched to H100.
+flavor_name = "n3-H100x1"
 image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
 assign_floating_ip = true
 create_bootable_volume = false
@@ -69,10 +70,12 @@ hug_cache_dir = "/ephemeral/hug"
 container_name = "vllm_qwen36_27b"
 # Qwen3.6-27B-FP8: official FP8 checkpoint with native 262K context on a single 80 GB GPU.
 max_model_len = 262144
-gpu_memory_utilization = 0.92
+# H100 needs 0.95 to fit Mamba cache blocks; A100 worked at 0.92.
+gpu_memory_utilization = 0.95
 tensor_parallel_size = 1
 tool_call_parser = "qwen3_coder"
-extra_vllm_args = ["--reasoning-parser", "qwen3"]
+# --max-num-seqs 817 caps concurrent sequences to fit Mamba cache blocks on H100.
+extra_vllm_args = ["--reasoning-parser", "qwen3", "--max-num-seqs", "817"]
 
 # Named model presets for 'ruby hyperstack.rb --vm 2 model switch <name>'.
 # Core model fields override the matching [vllm] values; preset-only extras such as
-- 
cgit v1.2.3