From 4074e77d93f2641d013d76917d023f0fcb76977b Mon Sep 17 00:00:00 2001
From: pewdiepie-archdaemon <pewdiepie-archdaemon@users.noreply.github.com>
Date: Sun, 14 Jun 2026 08:57:29 +0900
Subject: [PATCH] Cookbook: auto-set KV cache to fp8 for DeepSeek V3/V4/R1 MoE
 families
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These models run out of memory with --kv-cache-dtype auto (≈bf16) at
any usable context length under current tensor-parallel layouts.
_detectModelOptimizations now sets opts.kvCacheDtype='fp8' for them,
and the serve panel's KV Cache select uses that value as its default
unless the user has a saved override on this skill.
---
 static/js/cookbook.js      | 6 +++++-
 static/js/cookbookServe.js | 9 ++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/static/js/cookbook.js b/static/js/cookbook.js
index 8df4027e8..8b1264690 100644
--- a/static/js/cookbook.js
+++ b/static/js/cookbook.js
@@ -249,10 +249,14 @@ function _detectModelOptimizations(modelName) {
   }
   // DeepSeek MoE — V3 / V3.1 / V4 (and future Vx), R1 / R2 reasoning.
   // Anything v-{integer} or r-{integer} family from DeepSeek is MoE in
-  // current architectures.
+  // current architectures. These models also require fp8 KV cache to
+  // fit at meaningful context with current tensor-parallel layouts —
+  // the launch crashes otherwise (--kv-cache-dtype auto → bf16 OOMs).
   else if (n.includes('deepseek') && /\b(v[3-9]|v\d{2,}|r[1-9])\b/.test(n)) {
     opts.flags.push('--enable-expert-parallel');
     opts.tips.push('MoE expert parallel for DeepSeek');
+    opts.kvCacheDtype = 'fp8';
+    opts.tips.push('fp8 KV cache required — bf16 OOMs at usable context');
   }
   // Reasoning parser — applies independently of MoE detection. Without this
   // flag, models like MiniMax-M2.x, DeepSeek-R1, Qwen3 reasoning, GLM-4.x,
diff --git a/static/js/cookbookServe.js b/static/js/cookbookServe.js
index 33274dc19..f31d3189b 100644
--- a/static/js/cookbookServe.js
+++ b/static/js/cookbookServe.js
@@ -546,7 +546,14 @@ function _rerenderCachedModels() {
           : (_es.gpus || detectedGpuIds));
       const tpOpts = [1,2,4,8].map(n => `<option${defaultTp==String(n)?' selected':''}>${n}</option>`).join('');
       const dtypeOpts = ['auto','float16','bfloat16'].map(d => `<option value="${d}"${sv('dtype','auto')===d?' selected':''}>${d}</option>`).join('');
-      const vllmKvCacheOpts = ['auto','fp8'].map(d => `<option value="${d}"${sv('vllm_kv_cache_dtype','auto')===d?' selected':''}>${d}</option>`).join('');
+      // KV cache default — most models are fine on auto, but a few
+      // (e.g. DeepSeek-V3/V4/R1 MoE) need fp8 explicitly or the launch
+      // OOMs. _detectModelOptimizations seeds opts.kvCacheDtype for
+      // those families; honour it unless the user has a saved override.
+      const _kvOptsCheck = _detectModelOptimizations(repo);
+      const _kvAutoDefault = (_kvOptsCheck && _kvOptsCheck.kvCacheDtype) || 'auto';
+      const _kvSelected = sv('vllm_kv_cache_dtype', _kvAutoDefault);
+      const vllmKvCacheOpts = ['auto','fp8'].map(d => `<option value="${d}"${_kvSelected===d?' selected':''}>${d}</option>`).join('');
       const _l = (name, tip) => `<span>${name}<span class="hwfit-hint" title="${tip}">?</span></span>`;
       const _ggufChoices = _runnableGgufFiles(m);
       const _savedGguf = String(sv('gguf_file', '') || '');