diff --git a/static/js/cookbookServe.js b/static/js/cookbookServe.js index f31d3189b..145b5c08c 100644 --- a/static/js/cookbookServe.js +++ b/static/js/cookbookServe.js @@ -9,6 +9,7 @@ import spinnerModule from './spinner.js'; import { providerLogo } from './providers.js'; import { modelColor } from './chatRenderer.js'; import { bindMenuDismiss, dismissOrRemove } from './escMenuStack.js'; +import { openCookbookDependencies } from './cookbook-diagnosis.js'; // Shared state/functions injected by init() let _envState; @@ -579,10 +580,15 @@ function _rerenderCachedModels() { const _arrowTitle = _modelPresets.length > 0 ? `${_modelPresets.length} saved launch config${_modelPresets.length === 1 ? '' : 's'} for ${_repoShort} — click ▾ to load or delete` : `No saved launch configs for ${_repoShort} yet — click Save to add one`; - let _slotsHtml = `
` - + `` + // Wrap the Save split in a
`; let panelHtml = `
`; // Runtime-readiness note pinned at the top of the serve area so the @@ -617,6 +623,10 @@ function _rerenderCachedModels() { panelHtml += ``; panelHtml += ``; panelHtml += ``; + // Dtype lives in Row 1 (next to venv) — it's the first knob people + // change when matching the model to the box, so it earns top-row + // real estate over Row 2's launch-tuning controls. + panelHtml += ``; const defaultPort = defaultBackend === 'ollama' ? '11434' : _nextAvailablePort(); panelHtml += ``; const _activeGpus = (defaultGpus || '').split(',').map(s => s.trim()).filter(Boolean); @@ -627,9 +637,13 @@ function _rerenderCachedModels() { const on = _activeGpus.includes(String(i)); _gpuBtnsHtml += ``; } - panelHtml += ``; - // Save / saved-configs split button — moved into Row 1 (next to GPUs) - // so it shares the same baseline as the rest of the top controls. + // GPUs button strip moved to Row 2 (next to GPU Mem) below. 4px + // margin on the left, 8px on the right — extra 4px right-side gap + // separates the GPU chiclets from the GPU Mem field that follows + // (asked-for breathing room; 4px on either side felt cramped on + // the GPU-Mem boundary). + const _gpusLabelHtml = ``; + // Save / saved-configs split button — sits at the right end of Row 1. panelHtml += _slotsHtml; panelHtml += `
`; // (hwfit-serve-runtime-note moved to the top of the panel — see above.) @@ -650,17 +664,21 @@ function _rerenderCachedModels() { // (Swap, KV Cache, Attention backend, Env vars, llama.cpp batch/ubatch) // moved to the Advanced fold below to keep this row scannable. panelHtml += `
`; - // Order: Dtype → TP → Context → GPU → GPU Mem → Max Seqs. - // Dtype moved left of TP at user's request — it's the first knob - // people typically check when matching the model to the box. - panelHtml += ``; + // Order: TP → Context → Max Seqs → GPUs → GPU Mem. + // Dtype moved up to Row 1. GPUs moved here next to GPU Mem so the + // "which devices + how much of them" decisions sit adjacent. Max + // Seqs follows Context per the "request-shape" cluster. panelHtml += ``; // ctx resets to the model's max on every panel open (the real ctx slider // lives in the Scan/Download toolbar — see cookbook.js .hwfit-ctx-control). panelHtml += ``; - panelHtml += ``; - panelHtml += ``; panelHtml += ``; + // GPU "auto" field removed — the GPU button strip below already + // writes data-field="gpus" (the canonical comma-separated device + // list) and the command builders now read from that single source. + panelHtml += ``; + // GPUs button strip at the far right of Row 2. + panelHtml += _gpusLabelHtml; panelHtml += `
`; // ── Advanced (collapsed by default) ── // Everything below the fold is tuning users only touch occasionally: @@ -688,7 +706,10 @@ function _rerenderCachedModels() { // tuning, or any other KEY=VALUE pair that doesn't have a dedicated // field. After the venv activate runs, $VIRTUAL_ENV / $PATH / etc. are // already exported so they expand correctly here. - panelHtml += ``; + // grid-column: 1 / -1 makes Env span every column of the Advanced + // row's CSS grid (the old flex:1 1 100% did nothing in a grid + // container — left an empty trailing column gap on wide modals). + panelHtml += ``; panelHtml += ``; // Advanced llama.cpp row (Batch / UBatch — moved out of Core for the // same "rarely touched" reason as the vLLM extras above). @@ -722,6 +743,21 @@ function _rerenderCachedModels() { if (_rp_name) panelHtml += ``; panelHtml += ``; panelHtml += ``; + // Inline the previously-second vLLM checks row so Expert Parallel / + // Speculative / MoE Env sit next to Prefix Caching with no gap. All + // three are vLLM-only — class-gated so they hide on SGLang. + if (_opts2_row3.flags.includes('--enable-expert-parallel')) panelHtml += ``; + { + const _specDef = _opts2_row3.spec || { method: 'mtp', tokens: 3 }; + const _specMethod = sv('spec_method', _specDef.method); + const _specTokens = sv('spec_tokens', String(_specDef.tokens)); + const _specMethods = ['mtp', 'qwen3_next_mtp', 'eagle', 'medusa', 'ngram']; + if (!_specMethods.includes(_specMethod)) _specMethods.unshift(_specMethod); + const _specOpts = _specMethods.map(m => + ``).join(''); + panelHtml += ``; + } + if (_opts2_row3.envVars.length) panelHtml += ``; panelHtml += ``; // Row 2c: llama.cpp fit/perf flags (set by Auto profiles, editable by hand) const _kvOpts = ['', 'q4_0', 'q8_0', 'f16'].map(k => ``).join(''); @@ -774,28 +810,8 @@ function _rerenderCachedModels() { // vLLM backend so the Speculative (MTP) control is ALWAYS reachable — // even for models the auto-detector doesn't recognize. Expert-parallel, // reasoning-parser and MoE-env still only appear when auto-detected. - const _opts2 = _detectModelOptimizations(repo); - panelHtml += `
`; - if (_opts2.flags.includes('--enable-expert-parallel')) panelHtml += ``; - // Reasoning Parser moved to Row 3 (inline with Trust Remote / Auto - // Tool) so the per-model toggles sit together — the duplicate that - // lived here previously left two copies of the same checkbox. - { - // Speculative decoding (vLLM --speculative-config). Default OFF; the - // method/token defaults come from auto-detection when available, - // else fall back to MTP/3. Toggling the checkbox is what actually - // adds the flag at launch (see cookbook.js command builder). - const _specDef = _opts2.spec || { method: 'mtp', tokens: 3 }; - const _specMethod = sv('spec_method', _specDef.method); - const _specTokens = sv('spec_tokens', String(_specDef.tokens)); - const _specMethods = ['mtp', 'qwen3_next_mtp', 'eagle', 'medusa', 'ngram']; - if (!_specMethods.includes(_specMethod)) _specMethods.unshift(_specMethod); - const _specOpts = _specMethods.map(m => - ``).join(''); - panelHtml += ``; - } - if (_opts2.envVars.length) panelHtml += ``; - panelHtml += `
`; + // Expert Parallel / Speculative / MoE Env moved into Row 3 above so + // the vLLM-only toggles sit next to Prefix Caching with no gap. // Extra args sits below the vLLM checks (Reasoning Parser + Spec) // so it reads as "after the advanced toggles, any other flags". panelHtml += `
`; @@ -1143,6 +1159,22 @@ function _rerenderCachedModels() { note.style.color = 'var(--red)'; note.style.borderColor = 'color-mix(in srgb, var(--red) 40%, transparent)'; note.style.background = 'color-mix(in srgb, var(--red) 8%, transparent)'; + // Append an accent-color link straight to the Dependencies + // recipe panel for this backend so the user has one click + // to the fix instead of hunting for the right row. + if (noteText) { + const pkgName = pkg?.name || ({ vllm: 'vllm', sglang: 'sglang', llamacpp: 'llama_cpp', diffusers: 'diffusers' }[backend]); + const repo = (panel.closest('.doclib-card, .memory-item')?.dataset?.repo) || ''; + const link = document.createElement('a'); + link.href = '#'; + link.textContent = ' Install in Dependencies →'; + link.style.cssText = 'color:var(--accent, var(--red));text-decoration:underline;font-weight:600;margin-left:4px;'; + link.addEventListener('click', (ev) => { + ev.preventDefault(); + if (pkgName) openCookbookDependencies(pkgName, { expandRecipe: pkgName, model: repo }); + }); + noteText.appendChild(link); + } } else { // Healthy / ready → green so the user reads "good to go" at a // glance instead of scanning fg-muted for a state.