diff --git a/static/js/cookbook.js b/static/js/cookbook.js
index 8df4027e8..8b1264690 100644
--- a/static/js/cookbook.js
+++ b/static/js/cookbook.js
@@ -249,10 +249,14 @@ function _detectModelOptimizations(modelName) {
}
// DeepSeek MoE — V3 / V3.1 / V4 (and future Vx), R1 / R2 reasoning.
// Anything v-{integer} or r-{integer} family from DeepSeek is MoE in
- // current architectures.
+ // current architectures. Under current tensor-parallel layouts these
+ // models also need an fp8 KV cache to fit a usable context length:
+ // --kv-cache-dtype auto resolves to bf16, which OOMs and crashes the launch.
else if (n.includes('deepseek') && /\b(v[3-9]|v\d{2,}|r[1-9])\b/.test(n)) {
opts.flags.push('--enable-expert-parallel');
opts.tips.push('MoE expert parallel for DeepSeek');
+ opts.kvCacheDtype = 'fp8';
+ opts.tips.push('fp8 KV cache required — bf16 OOMs at usable context');
}
// Reasoning parser — applies independently of MoE detection. Without this
// flag, models like MiniMax-M2.x, DeepSeek-R1, Qwen3 reasoning, GLM-4.x,
diff --git a/static/js/cookbookServe.js b/static/js/cookbookServe.js
index 33274dc19..f31d3189b 100644
--- a/static/js/cookbookServe.js
+++ b/static/js/cookbookServe.js
@@ -546,7 +546,14 @@ function _rerenderCachedModels() {
: (_es.gpus || detectedGpuIds));
const tpOpts = [1,2,4,8].map(n => ``).join('');
const dtypeOpts = ['auto','float16','bfloat16'].map(d => ``).join('');
- const vllmKvCacheOpts = ['auto','fp8'].map(d => ``).join('');
+ // KV cache default — most models are fine on auto, but a few
+ // (e.g. DeepSeek-V3/V4/R1 MoE) need fp8 explicitly or the launch
+ // OOMs. _detectModelOptimizations seeds opts.kvCacheDtype for
+ // those families; honour it unless the user has a saved override.
+ const _kvOptsCheck = _detectModelOptimizations(repo);
+ const _kvAutoDefault = (_kvOptsCheck && _kvOptsCheck.kvCacheDtype) || 'auto';
+ const _kvSelected = sv('vllm_kv_cache_dtype', _kvAutoDefault);
+ const vllmKvCacheOpts = ['auto','fp8'].map(d => ``).join('');
const _l = (name, tip) => `${name}?`;
const _ggufChoices = _runnableGgufFiles(m);
const _savedGguf = String(sv('gguf_file', '') || '');