Cookbook: auto-set KV cache to fp8 for DeepSeek V3/V4/R1 MoE families

These models run out of memory (OOM) with --kv-cache-dtype auto (≈bf16) at any usable context length under current tensor-parallel layouts. _detectModelOptimizations now seeds opts.kvCacheDtype='fp8' for them, and the serve panel's KV Cache select uses that value as its default unless the user has a saved override on this skill.
2026-06-17 10:15:27 -04:00 · 2026-06-14 08:57:29 +09:00
parent d3944be1be
commit 4074e77d93
2 changed files with 13 additions and 2 deletions
@@ -546,7 +546,14 @@ function _rerenderCachedModels() {
          : (_es.gpus || detectedGpuIds));
      const tpOpts = [1,2,4,8].map(n => `<option${defaultTp==String(n)?' selected':''}>${n}</option>`).join('');
      const dtypeOpts = ['auto','float16','bfloat16'].map(d => `<option value="${d}"${sv('dtype','auto')===d?' selected':''}>${d}</option>`).join('');
-      const vllmKvCacheOpts = ['auto','fp8'].map(d => `<option value="${d}"${sv('vllm_kv_cache_dtype','auto')===d?' selected':''}>${d}</option>`).join('');
+      // KV cache default — most models are fine on auto, but a few
+      // (e.g. DeepSeek-V3/V4/R1 MoE) need fp8 explicitly or the launch
+      // OOMs. _detectModelOptimizations seeds opts.kvCacheDtype for
+      // those families; honour it unless the user has a saved override.
+      const _kvOptsCheck = _detectModelOptimizations(repo);
+      const _kvAutoDefault = (_kvOptsCheck && _kvOptsCheck.kvCacheDtype) || 'auto';
+      const _kvSelected = sv('vllm_kv_cache_dtype', _kvAutoDefault);
+      const vllmKvCacheOpts = ['auto','fp8'].map(d => `<option value="${d}"${_kvSelected===d?' selected':''}>${d}</option>`).join('');
      const _l = (name, tip) => `<span>${name}<span class="hwfit-hint" title="${tip}">?</span></span>`;
      const _ggufChoices = _runnableGgufFiles(m);
      const _savedGguf = String(sv('gguf_file', '') || '');