diff --git a/static/js/cookbookServe.js b/static/js/cookbookServe.js
index f31d3189b..145b5c08c 100644
--- a/static/js/cookbookServe.js
+++ b/static/js/cookbookServe.js
@@ -9,6 +9,7 @@ import spinnerModule from './spinner.js';
import { providerLogo } from './providers.js';
import { modelColor } from './chatRenderer.js';
import { bindMenuDismiss, dismissOrRemove } from './escMenuStack.js';
+import { openCookbookDependencies } from './cookbook-diagnosis.js';
// Shared state/functions injected by init()
let _envState;
@@ -579,10 +580,15 @@ function _rerenderCachedModels() {
const _arrowTitle = _modelPresets.length > 0
? `${_modelPresets.length} saved launch config${_modelPresets.length === 1 ? '' : 's'} for ${_repoShort} — click ▾ to load or delete`
: `No saved launch configs for ${_repoShort} yet — click Save to add one`;
- let _slotsHtml = `
`
- + ``
+ // Wrap the Save split in a
`;
let panelHtml = `
`;
// Runtime-readiness note pinned at the top of the serve area so the
@@ -617,6 +623,10 @@ function _rerenderCachedModels() {
panelHtml += `${_l('Backend','Inference engine: vLLM, SGLang, llama.cpp, Ollama, or Diffusers')}
`;
panelHtml += ``;
panelHtml += `${_l('venv','Path to Python venv or conda env activate script')}`;
+ // Dtype lives in Row 1 (next to venv) — it's the first knob people
+ // change when matching the model to the box, so it earns top-row
+ // real estate over Row 2's launch-tuning controls.
+ panelHtml += `${_l('Dtype','Data type for weights. auto picks best for GPU')}`;
const defaultPort = defaultBackend === 'ollama' ? '11434' : _nextAvailablePort();
panelHtml += `${_l('Port','HTTP port for the API server')}`;
const _activeGpus = (defaultGpus || '').split(',').map(s => s.trim()).filter(Boolean);
@@ -627,9 +637,13 @@ function _rerenderCachedModels() {
const on = _activeGpus.includes(String(i));
_gpuBtnsHtml += ``;
}
- panelHtml += `${_l('GPUs','Toggle which GPUs to use')}
${_gpuBtnsHtml}
`;
- // Save / saved-configs split button — moved into Row 1 (next to GPUs)
- // so it shares the same baseline as the rest of the top controls.
+ // GPUs button strip moved to Row 2 (next to GPU Mem) below. 4px
+ // margin on the left, 8px on the right — extra 4px right-side gap
+ // separates the GPU chiclets from the GPU Mem field that follows
+ // (asked-for breathing room; 4px on either side felt cramped on
+ // the GPU-Mem boundary).
+ const _gpusLabelHtml = `${_l('GPUs','Toggle which GPUs to use')}
${_gpuBtnsHtml}
`;
+ // Save / saved-configs split button — sits at the right end of Row 1.
panelHtml += _slotsHtml;
panelHtml += `
`;
// (hwfit-serve-runtime-note moved to the top of the panel — see above.)
@@ -650,17 +664,21 @@ function _rerenderCachedModels() {
// (Swap, KV Cache, Attention backend, Env vars, llama.cpp batch/ubatch)
// moved to the Advanced fold below to keep this row scannable.
panelHtml += `
`;
- // Order: Dtype → TP → Context → GPU → GPU Mem → Max Seqs.
- // Dtype moved left of TP at user's request — it's the first knob
- // people typically check when matching the model to the box.
- panelHtml += `${_l('Dtype','Data type for weights. auto picks best for GPU')}`;
+ // Order: TP → Context → Max Seqs → GPU Mem → GPUs.
+ // Dtype moved up to Row 1. GPUs moved here next to GPU Mem so the
+ // "which devices + how much of them" decisions sit adjacent. Max
+ // Seqs follows Context per the "request-shape" cluster.
panelHtml += `${_l('TP','Tensor Parallelism — split model across N GPUs')}`;
// ctx resets to the model's max on every panel open (the real ctx slider
// lives in the Scan/Download toolbar — see cookbook.js .hwfit-ctx-control).
panelHtml += `${_l('Context','Max tokens per request — resets to the model max on every open. Lower = less VRAM')}`;
- panelHtml += `${_l('GPU','Which GPU to use. Leave empty for default')}`;
- panelHtml += `${_l('GPU Mem','Fraction of GPU memory (0.0–1.0). Lower if OOM')}`;
panelHtml += `${_l('Max Seqs','Maximum concurrent requests. Lower = less memory. Default 4 — prosumer GPUs often OOM on vLLM default 256 during CUDA graph capture.')}`;
+ // GPU "auto" field removed — the GPU button strip below already
+ // writes data-field="gpus" (the canonical comma-separated device
+ // list) and the command builders now read from that single source.
+ panelHtml += `${_l('GPU Mem','Fraction of GPU memory (0.0–1.0). Lower if OOM')}`;
+ // GPUs button strip at the far right of Row 2.
+ panelHtml += _gpusLabelHtml;
panelHtml += `
`;
// ── Advanced (collapsed by default) ──
// Everything below the fold is tuning users only touch occasionally:
@@ -688,7 +706,10 @@ function _rerenderCachedModels() {
// tuning, or any other KEY=VALUE pair that doesn't have a dedicated
// field. After the venv activate runs, $VIRTUAL_ENV / $PATH / etc. are
// already exported so they expand correctly here.
- panelHtml += `${_l('Env','Extra KEY=VALUE env-var pairs prepended to the launch (space-separated). Example: CUDACXX=$VIRTUAL_ENV/lib/python3.10/site-packages/nvidia/cuda_nvcc/bin/nvcc — points flashinfer at the venv-bundled nvcc when the system one is too old for your GPU.')}`;
+ // grid-column: 1 / -1 makes Env span every column of the Advanced
+ // row's CSS grid (the old flex:1 1 100% did nothing in a grid
+ // container — left an empty trailing column gap on wide modals).
+ panelHtml += `${_l('Env','Extra KEY=VALUE env-var pairs prepended to the launch (space-separated). Example: CUDACXX=$VIRTUAL_ENV/lib/python3.10/site-packages/nvidia/cuda_nvcc/bin/nvcc — points flashinfer at the venv-bundled nvcc when the system one is too old for your GPU.')}`;
panelHtml += ``;
// Advanced llama.cpp row (Batch / UBatch — moved out of Core for the
// same "rarely touched" reason as the vLLM extras above).
@@ -722,6 +743,21 @@ function _rerenderCachedModels() {
if (_rp_name) panelHtml += ` Reasoning Parser ${_rp_name}`;
panelHtml += ` Enforce Eager${_h('Disable CUDA graphs. Slower but uses less memory')}`;
panelHtml += ` Prefix Caching${_h('Cache shared prompt prefixes across requests')}`;
+ // Inline the previously-second vLLM checks row so Expert Parallel /
+ // Speculative / MoE Env sit next to Prefix Caching with no gap. All
+ // three are vLLM-only — class-gated so they hide on SGLang.
+ if (_opts2_row3.flags.includes('--enable-expert-parallel')) panelHtml += ` Expert Parallel`;
+ {
+ const _specDef = _opts2_row3.spec || { method: 'mtp', tokens: 3 };
+ const _specMethod = sv('spec_method', _specDef.method);
+ const _specTokens = sv('spec_tokens', String(_specDef.tokens));
+ const _specMethods = ['mtp', 'qwen3_next_mtp', 'eagle', 'medusa', 'ngram'];
+ if (!_specMethods.includes(_specMethod)) _specMethods.unshift(_specMethod);
+ const _specOpts = _specMethods.map(m =>
+ ``).join('');
+ panelHtml += ` Speculative ?`;
+ }
+ if (_opts2_row3.envVars.length) panelHtml += ` MoE Env Vars`;
panelHtml += ``;
// Row 2c: llama.cpp fit/perf flags (set by Auto profiles, editable by hand)
const _kvOpts = ['', 'q4_0', 'q8_0', 'f16'].map(k => ``).join('');
@@ -774,28 +810,8 @@ function _rerenderCachedModels() {
// vLLM backend so the Speculative (MTP) control is ALWAYS reachable —
// even for models the auto-detector doesn't recognize. Expert-parallel,
// reasoning-parser and MoE-env still only appear when auto-detected.
- const _opts2 = _detectModelOptimizations(repo);
- panelHtml += `
`;
- if (_opts2.flags.includes('--enable-expert-parallel')) panelHtml += ` Expert Parallel`;
- // Reasoning Parser moved to Row 3 (inline with Trust Remote / Auto
- // Tool) so the per-model toggles sit together — the duplicate that
- // lived here previously left two copies of the same checkbox.
- {
- // Speculative decoding (vLLM --speculative-config). Default OFF; the
- // method/token defaults come from auto-detection when available,
- // else fall back to MTP/3. Toggling the checkbox is what actually
- // adds the flag at launch (see cookbook.js command builder).
- const _specDef = _opts2.spec || { method: 'mtp', tokens: 3 };
- const _specMethod = sv('spec_method', _specDef.method);
- const _specTokens = sv('spec_tokens', String(_specDef.tokens));
- const _specMethods = ['mtp', 'qwen3_next_mtp', 'eagle', 'medusa', 'ngram'];
- if (!_specMethods.includes(_specMethod)) _specMethods.unshift(_specMethod);
- const _specOpts = _specMethods.map(m =>
- ``).join('');
- panelHtml += ` Speculative ?`;
- }
- if (_opts2.envVars.length) panelHtml += ` MoE Env Vars`;
- panelHtml += `
`;
+ // Expert Parallel / Speculative / MoE Env moved into Row 3 above so
+ // the vLLM-only toggles sit next to Prefix Caching with no gap.
// Extra args sits below the vLLM checks (Reasoning Parser + Spec)
// so it reads as "after the advanced toggles, any other flags".
panelHtml += `
`;
@@ -1143,6 +1159,22 @@ function _rerenderCachedModels() {
note.style.color = 'var(--red)';
note.style.borderColor = 'color-mix(in srgb, var(--red) 40%, transparent)';
note.style.background = 'color-mix(in srgb, var(--red) 8%, transparent)';
+ // Append an accent-color link straight to the Dependencies
+ // recipe panel for this backend so the user has one click
+ // to the fix instead of hunting for the right row.
+ if (noteText) {
+ const pkgName = pkg?.name || ({ vllm: 'vllm', sglang: 'sglang', llamacpp: 'llama_cpp', diffusers: 'diffusers' }[backend]);
+ const repo = (panel.closest('.doclib-card, .memory-item')?.dataset?.repo) || '';
+ const link = document.createElement('a');
+ link.href = '#';
+ link.textContent = ' Install in Dependencies →';
+ link.style.cssText = 'color:var(--accent, var(--red));text-decoration:underline;font-weight:600;margin-left:4px;';
+ link.addEventListener('click', (ev) => {
+ ev.preventDefault();
+ if (pkgName) openCookbookDependencies(pkgName, { expandRecipe: pkgName, model: repo });
+ });
+ noteText.appendChild(link);
+ }
} else {
// Healthy / ready → green so the user reads "good to go" at a
// glance instead of scanning fg-muted for a state.