From 4074e77d93f2641d013d76917d023f0fcb76977b Mon Sep 17 00:00:00 2001 From: pewdiepie-archdaemon Date: Sun, 14 Jun 2026 08:57:29 +0900 Subject: [PATCH] Cookbook: auto-set KV cache to fp8 for DeepSeek V3/V4/R1 MoE families MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These models OOM on --kv-cache-dtype auto (≈bf16) at any usable context with current tensor-parallel layouts. _detectModelOptimizations now seeds opts.kvCacheDtype='fp8' for them, and the serve panel's KV Cache select picks that up as the default unless the user has a saved override on this skill. --- static/js/cookbook.js | 6 +++++- static/js/cookbookServe.js | 9 ++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/static/js/cookbook.js b/static/js/cookbook.js index 8df4027e8..8b1264690 100644 --- a/static/js/cookbook.js +++ b/static/js/cookbook.js @@ -249,10 +249,14 @@ function _detectModelOptimizations(modelName) { } // DeepSeek MoE — V3 / V3.1 / V4 (and future Vx), R1 / R2 reasoning. // Anything v-{integer} or r-{integer} family from DeepSeek is MoE in - // current architectures. + // current architectures. These models also require fp8 KV cache to + // fit at meaningful context with current tensor-parallel layouts — + // the launch crashes otherwise (--kv-cache-dtype auto → bf16 OOMs). else if (n.includes('deepseek') && /\b(v[3-9]|v\d{2,}|r[1-9])\b/.test(n)) { opts.flags.push('--enable-expert-parallel'); opts.tips.push('MoE expert parallel for DeepSeek'); + opts.kvCacheDtype = 'fp8'; + opts.tips.push('fp8 KV cache required — bf16 OOMs at usable context'); } // Reasoning parser — applies independently of MoE detection. Without this // flag, models like MiniMax-M2.x, DeepSeek-R1, Qwen3 reasoning, GLM-4.x, diff --git a/static/js/cookbookServe.js b/static/js/cookbookServe.js index 33274dc19..f31d3189b 100644 --- a/static/js/cookbookServe.js +++ b/static/js/cookbookServe.js @@ -546,7 +546,14 @@ function _rerenderCachedModels() { : (_es.gpus || detectedGpuIds)); const tpOpts = [1,2,4,8].map(n => `${n}`).join(''); const dtypeOpts = ['auto','float16','bfloat16'].map(d => ``).join(''); - const vllmKvCacheOpts = ['auto','fp8'].map(d => ``).join(''); + // KV cache default — most models are fine on auto, but a few + // (e.g. DeepSeek-V3/V4/R1 MoE) need fp8 explicitly or the launch + // OOMs. _detectModelOptimizations seeds opts.kvCacheDtype for + // those families; honour it unless the user has a saved override. + const _kvOptsCheck = _detectModelOptimizations(repo); + const _kvAutoDefault = (_kvOptsCheck && _kvOptsCheck.kvCacheDtype) || 'auto'; + const _kvSelected = sv('vllm_kv_cache_dtype', _kvAutoDefault); + const vllmKvCacheOpts = ['auto','fp8'].map(d => ``).join(''); const _l = (name, tip) => `${name}?`; const _ggufChoices = _runnableGgufFiles(m); const _savedGguf = String(sv('gguf_file', '') || '');