Merge remote-tracking branch 'origin/main' into visual-pr-playground

# Conflicts:
#	routes/cookbook_routes.py
#	routes/hwfit_routes.py
#	services/hwfit/fit.py
#	services/hwfit/models.py
#	static/js/cookbook-diagnosis.js
#	static/js/cookbook-hwfit.js
#	static/js/cookbook.js
#	static/js/cookbookRunning.js
This commit is contained in:
pewdiepie-archdaemon
2026-06-03 16:49:10 +09:00
569 changed files with 35252 additions and 3489 deletions
+229 -44
View File
@@ -223,11 +223,20 @@ function _detectModelOptimizations(modelName) {
return opts;
}
/** Detect the right vLLM tool-call-parser based on model name */
/** Detect the right vLLM tool-call-parser based on model name.
* Qwen tool-call formats split by generation:
* - Qwen3-Coder → qwen3_coder (XML <tool_call> with named params)
* - Qwen3 (non-coder) → qwen3_xml (reasoning/instruct, XML wrapper)
* - Qwen2.5 / Qwen2 / 1.5 → hermes (Qwen2.5 was trained on Hermes format)
* Catching "qwen" first and labelling everything qwen3_xml breaks tool
* calls on the Qwen2.5 line (the model emits hermes-style which the
* qwen3_xml parser doesn't recognise, so the call leaks through as text).
*/
export function _detectToolParser(modelName) {
const n = (modelName || '').toLowerCase();
if (n.includes('qwen3') && n.includes('coder')) return 'qwen3_coder';
if (n.includes('qwen')) return 'qwen3_xml';
if (n.includes('qwen3')) return 'qwen3_xml';
if (n.includes('qwen')) return 'hermes'; // Qwen2.5 / Qwen2 / Qwen1.5
if (n.includes('llama-4') || n.includes('llama4')) return 'llama4_json';
if (n.includes('llama') || n.includes('nemotron')) return 'llama3_json';
if (n.includes('mistral') || n.includes('mixtral')) return 'mistral';
@@ -251,37 +260,43 @@ export function _detectBackend(model) {
const q = (model.quant || '').toUpperCase();
const sysBackend = String(_hwfitCache?.system?.backend || '').toLowerCase();
const isRocm = sysBackend === 'rocm';
const isAppleSilicon = ['metal', 'mps', 'apple'].includes(sysBackend);
const _nm = `${model.repo_id || ''} ${model.path || ''} ${model.name || ''}`.toLowerCase();
if (/\bmlx\b|mlx-|_mlx/i.test(_nm) || q.startsWith('MLX')) {
return { backend: 'unsupported', label: 'Unsupported' };
}
const isAwqLike = /^AWQ|^GPTQ|^NVFP4/.test(q) || ['FP8', 'FP4', 'MXFP4', 'NF4', 'INT4', 'INT8', 'W4A16', 'W8A8', 'W8A16'].includes(q) || /\b(awq|gptq|fp8|fp4|nvfp4|mxfp4|nf4|int4|int8|w4a16|w8a8|w8a16)\b/i.test(_nm);
const isGgufLike = model.is_gguf || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || _nm.includes('gguf');
// Image gen models → diffusers
if (model.is_image_gen || model.is_diffusion || model._tag === 'image') {
return { backend: 'diffusers', label: 'Diffusers' };
}
// AWQ / GPTQ / FP8 are safetensors GPU-serving formats. Never route them
// through llama.cpp/Ollama just because the host is Mac/Windows; those engines
// need GGUF. The UI will warn/block on Metal where vLLM/SGLang aren't viable.
if (isAwqLike) {
return { backend: 'vllm', label: 'vLLM' };
}
// GGUF → llama.cpp/Ollama-compatible.
if (isGgufLike) {
return { backend: 'llamacpp', label: 'llama.cpp' };
}
// Windows → default to llama.cpp (no vLLM support on Windows)
if (_isWindows()) {
return { backend: 'llamacpp', label: 'llama.cpp' };
}
// Apple Silicon (Metal) → llama.cpp (GGUF). vLLM/SGLang are CUDA/ROCm-only and
// don't run on macOS; AWQ/GPTQ/FP8 (vLLM-only) models are already filtered out
// don't run on macOS; vLLM-native quantized models are already filtered out
// of metal Cookbook results, so llama.cpp is always the right engine here.
if (['metal', 'mps', 'apple'].includes(sysBackend)) {
return { backend: 'llamacpp', label: 'llama.cpp' };
}
// AWQ / GPTQ / FP8 → vLLM
if (/^AWQ|^GPTQ/.test(q) || q === 'FP8') {
return { backend: 'vllm', label: 'vLLM' };
}
// GGUF → llama.cpp. Match the quant tag OR a gguf hint in the repo/path/name:
// a raw .gguf file often has no quant field, which made it fall through to the
// vLLM default below.
const _nm = `${model.repo_id || ''} ${model.path || ''} ${model.name || ''}`.toLowerCase();
if (model.is_gguf || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || _nm.includes('gguf')) {
return { backend: 'llamacpp', label: 'llama.cpp' };
}
// ROCm/AMD machines should not blindly default HF safetensors models to
// vLLM. SGLang is the safer OpenAI-compatible default for plain HF text
// repos there; llama.cpp still wins above whenever the model is GGUF.
@@ -351,6 +366,8 @@ export function _buildServeCmd(f, modelName, backend) {
cmd += ` --gpu-memory-utilization ${f.gpu_mem || '0.90'}`;
if (f.swap && f.swap !== '0') cmd += ` --swap-space ${f.swap}`;
cmd += ` --dtype ${f.dtype || 'auto'}`;
const _kv = (f.vllm_kv_cache_dtype ?? '').toString().trim();
if (_kv === 'fp8') cmd += ' --kv-cache-dtype fp8';
if (f.max_seqs && f.max_seqs.toString().trim()) cmd += ` --max-num-seqs ${f.max_seqs.toString().trim()}`;
if (f.enforce_eager) cmd += ' --enforce-eager';
if (f.trust_remote) cmd += ' --trust-remote-code';
@@ -384,13 +401,17 @@ export function _buildServeCmd(f, modelName, backend) {
const ggufPath = f._gguf_path || 'model.gguf';
const gpuId = f.gpu_id?.trim() || '';
const py = _isWindows() ? 'python' : 'python3';
// CPU-only serve (-ngl 0): drop the GPU-only flags, otherwise the command
// mixes "zero GPU layers" with CUDA unified-memory + flash-attn and fails to
// start (issue #1291). Only affects the ngl=0 path; GPU serving is unchanged.
const _cpuOnly = String(f.ngl).trim() === '0';
const lcPrefix = (() => {
let p = '';
if (f.unified_mem && !_isWindows()) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
if (f.unified_mem && !_cpuOnly && !_isWindows()) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
if (gpuId && !_isWindows()) p += `CUDA_VISIBLE_DEVICES=${gpuId} `;
return p;
})();
if (f.unified_mem && _isWindows()) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
if (f.unified_mem && !_cpuOnly && _isWindows()) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
if (gpuId && _isWindows()) cmd += `$env:CUDA_VISIBLE_DEVICES="${gpuId}"; `;
if (!_isWindows()) {
// Resolve GGUF path once, fail loudly if nothing matched (prevents
@@ -402,16 +423,75 @@ export function _buildServeCmd(f, modelName, backend) {
// renders modern GGUF chat templates that the Python bindings' Jinja2
// rejects (do_tojson ensure_ascii). Fall back to llama_cpp.server.
// Don't suppress stderr — surface real errors (missing file, lib, OOM).
const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}`;
// Optional perf/fit flags from a hardware profile (see services/hwfit/
// profiles.py). n_cpu_moe offloads MoE expert layers to CPU when the model
// is bigger than VRAM; flash-attn + a quantized KV cache cut KV memory and
// speed things up. Only emitted when set, so manual/older flows are unchanged.
//
// Every value handled here is spliced into a shell command line, so each one
// is checked against an allow-list or a strict pattern first. A value that
// fails its check is dropped instead of being emitted.
const _llamaNum = (v) => {
  // Non-negative integer only; anything else yields ''.
  const s = String(v || '').trim();
  return /^\d+$/.test(s) ? s : '';
};
const _llamaCsv = (v) => {
  // Comma-separated list of non-negative numbers (whitespace ignored).
  const s = String(v || '').replace(/\s+/g, '');
  return /^\d+(?:\.\d+)?(?:,\d+(?:\.\d+)?)*$/.test(s) ? s : '';
};
const _llamaToken = (v) => {
  // Bare identifier-like token (letters, digits, underscore), e.g. q8_0 / f16.
  const s = String(v ?? '').trim();
  return /^[A-Za-z0-9_]+$/.test(s) ? s : '';
};
const _ncm = (f.n_cpu_moe ?? '').toString().trim();
// cache_type was previously interpolated verbatim; restrict it to a plain
// token so it cannot carry whitespace, quotes or shell metacharacters.
const _kv = _llamaToken(f.cache_type);
let _lcExtra = '';
let _lcpExtra = '';
// Require plain decimal digits: Number() alone also accepts forms such as
// "1.5", "0x10", "1e3" or "Infinity", which would be emitted as-is.
if (/^\d+$/.test(_ncm) && Number(_ncm) > 0) {
  _lcExtra += ` --n-cpu-moe ${_ncm}`;
  _lcpExtra += ` --n_cpu_moe ${_ncm}`; // llama-cpp-python uses underscores
}
// Flash attention is skipped on the CPU-only (-ngl 0) path.
if (f.flash_attn && !_cpuOnly) {
  _lcExtra += ' --flash-attn on';
  _lcpExtra += ' --flash_attn true';
}
if (_kv) {
  _lcExtra += ` --cache-type-k ${_kv} --cache-type-v ${_kv}`;
  // llama-cpp-python exposes these as type_k/type_v; pass through best-effort.
  _lcpExtra += ` --type_k ${_kv} --type_v ${_kv}`;
}
// The remaining flags are only appended to the llama-server command
// (_lcExtra), not to the llama_cpp.server fallback.
const _llamaFit = String(f.llama_fit || '').trim();
if (['on', 'off'].includes(_llamaFit)) _lcExtra += ` --fit ${_llamaFit}`;
if (f.llama_no_mmap) _lcExtra += ' --no-mmap';
if (f.llama_no_warmup) _lcExtra += ' --no-warmup';
const _llamaSplitMode = String(f.llama_split_mode || '').trim();
if (['none', 'layer', 'row', 'tensor'].includes(_llamaSplitMode)) _lcExtra += ` --split-mode ${_llamaSplitMode}`;
const _llamaTensorSplit = _llamaCsv(f.llama_tensor_split);
if (_llamaTensorSplit) _lcExtra += ` --tensor-split ${_llamaTensorSplit}`;
const _llamaMainGpu = _llamaNum(f.llama_main_gpu);
if (_llamaMainGpu) _lcExtra += ` --main-gpu ${_llamaMainGpu}`;
const _llamaParallel = _llamaNum(f.llama_parallel);
if (_llamaParallel) _lcExtra += ` --parallel ${_llamaParallel}`;
const _llamaBatch = _llamaNum(f.llama_batch_size);
if (_llamaBatch) _lcExtra += ` --batch-size ${_llamaBatch}`;
const _llamaUBatch = _llamaNum(f.llama_ubatch_size);
if (_llamaUBatch) _lcExtra += ` --ubatch-size ${_llamaUBatch}`;
if (f.llama_speculative_mtp) {
  // Fall back to 3 draft tokens when the field is missing or not a positive integer.
  const specTokens = parseInt(f.llama_spec_tokens, 10);
  const specN = Number.isFinite(specTokens) && specTokens > 0 ? specTokens : 3;
  _lcExtra += ` --spec-type draft-mtp --spec-draft-n-max ${specN}`;
}
// Vision: serve the multimodal projector so the model can read images. The
// mmproj path is resolved at runtime (find mmproj-*.gguf next to the model);
// only emitted when the Vision toggle is on AND a projector was found.
if (f.vision && f._mmproj_path) {
  _lcExtra += ` --mmproj "${f._mmproj_path}" --image-max-tokens 1024`;
  // llama-cpp-python takes the projector via --clip_model_path.
  _lcpExtra += ` --clip_model_path "${f._mmproj_path}"`;
}
const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}${_lcpExtra}`;
if (_isWindows()) {
cmd += _lcpServer;
} else {
cmd += `${lcPrefix}llama-server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} -ngl ${f.ngl || '99'} -c ${f.ctx || '8192'}`;
cmd += `${lcPrefix}llama-server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} -ngl ${f.ngl || '99'} -c ${f.ctx || '8192'}${_lcExtra}`;
cmd += ` || ${_lcpServer}`;
}
} else if (backend === 'ollama') {
const ollamaPort = f.port || '11434';
const hostEnv = ollamaPort !== '11434' ? `OLLAMA_HOST=0.0.0.0:${ollamaPort} ` : '';
const bindHost = _envState.remoteHost ? '0.0.0.0' : '127.0.0.1';
const hostEnv = ollamaPort !== '11434' ? `OLLAMA_HOST=${bindHost}:${ollamaPort} ` : '';
cmd = `${hostEnv}ollama serve`;
} else if (backend === 'diffusers') {
const gpuStr = f.gpus?.trim();
@@ -542,6 +622,10 @@ async function _fetchDependencies() {
const _statusTag = (pkg, isLocal, isSystemDep, winBlocked) => {
if (winBlocked) return `<span class="cookbook-dep-tag cookbook-dep-na">N/A</span>`;
if (pkg.installed && isSystemDep) return `<span class="cookbook-dep-tag cookbook-dep-installed" title="Found on selected server">Installed</span>`;
if (pkg.installed && pkg.pip_update_available === false) {
const tip = esc(pkg.update_note || pkg.status_note || 'Found externally; update outside Odysseus.');
return `<span class="cookbook-dep-tag cookbook-dep-installed" title="${tip}">Installed</span>`;
}
if (pkg.installed) return `<button class="cookbook-dep-tag cookbook-dep-installed cookbook-dep-installed-btn" title="Installed — click for actions"><span class="cookbook-dep-installed-label">Installed</span><span class="cookbook-dep-caret">&#9662;</span></button>`;
if (isSystemDep) {
const depTip = esc(pkg.install_hint || 'Install this OS package on the selected server.');
@@ -556,11 +640,13 @@ async function _fetchDependencies() {
const isSystemDep = pkg.kind === 'system';
const winBlocked = !isLocal && _isWindows() && _winUnsupported.has(pkg.name);
const note = pkg.status_note ? `<div class="memory-item-meta" style="font-size:10px;opacity:0.65;margin-top:3px;">${esc(pkg.status_note)}</div>` : '';
const updateNote = pkg.installed && pkg.pip_update_available === false && pkg.update_note ? `<div class="memory-item-meta" style="font-size:10px;opacity:0.55;margin-top:3px;">${esc(pkg.update_note)}</div>` : '';
return `<div class="cookbook-dep-row${winBlocked ? ' cookbook-dep-blocked' : ''}" data-pkg-name="${esc(pkg.name)}" data-dep-pip="${esc(pkg.pip || '')}" data-dep-target="${isLocal ? 'local' : 'remote'}" data-dep-kind="${esc(pkg.kind || 'python')}">`
+ `<div class="cookbook-dep-info">`
+ `<div class="memory-item-title">${esc(pkg.name)}</div>`
+ `<div class="memory-item-meta" style="font-size:10px;opacity:0.5;margin-top:2px;">${esc(pkg.desc)}</div>`
+ note
+ updateNote
+ `</div>`
+ `<span class="cookbook-dep-tag cookbook-dep-cat">${esc(pkg.category)}</span>`
+ _statusTag(pkg, isLocal, isSystemDep, winBlocked)
@@ -642,7 +728,7 @@ async function _fetchDependencies() {
}
// _dep flags this as a pip dependency/driver install (not a servable
// model) so the running-task card doesn't offer a "Serve →" button.
const payload = { repo_id: pipName, _cmd: cmd, remote_host: _envState.remoteHost || '', _dep: true };
const payload = { repo_id: pipName, _cmd: cmd, remote_host: _envState.remoteHost || '', _dep: true, env_path: _envState.envPath || '' };
_addTask(data.session_id, 'pip ' + pkgName, 'download', payload);
if (statusEl) { statusEl.textContent = upgrade ? 'Updating...' : 'Installing...'; statusEl.disabled = true; }
uiModule.showToast(`${upgrade ? 'Updating' : 'Installing'} ${pkgName} on ${targetHost}...`);
@@ -932,6 +1018,51 @@ function _wireTabEvents(body) {
});
}
// "Rebuild llama.cpp" clears the cached build so the next serve recompiles.
// The serve bootstrap only builds llama-server when it is missing from PATH,
// so a host that first built CPU-only (no nvcc at build time) keeps reusing
// that binary forever; this is the lever to force a fresh GPU build after a
// CUDA/ROCm toolkit is installed.
const rebuildBtn = document.getElementById('cookbook-rebuild-engine');
if (rebuildBtn && !rebuildBtn._wired) {
  // Flag the element so the click listener is attached at most once.
  rebuildBtn._wired = true;

  const onRebuildClick = async () => {
    // Match _installDep: honor the Dependencies server selector so the clear
    // runs on the same host the build runs on.
    const serverSelect = document.getElementById('hwfit-deps-server');
    if (serverSelect) _applyServerSelection(serverSelect.value);

    const remoteHost = _envState.remoteHost || '';
    const targetLabel = remoteHost || 'this server';
    const confirmed = confirm(`Rebuild the llama.cpp engine on ${targetLabel}?\n\nThis clears the cached llama-server build so the next serve recompiles from source (with CUDA/HIP if a toolchain is present). It does not download or install anything.`);
    if (!confirmed) return;

    // Show a busy state; the original label is restored in `finally`.
    const idleLabel = rebuildBtn.textContent;
    rebuildBtn.disabled = true;
    rebuildBtn.textContent = 'Clearing...';
    try {
      const requestBody = JSON.stringify({
        engine: 'llamacpp',
        remote_host: remoteHost || undefined,
        ssh_port: _getPort(remoteHost) || undefined,
      });
      const res = await fetch('/api/cookbook/rebuild-engine', {
        method: 'POST',
        credentials: 'same-origin',
        headers: { 'Content-Type': 'application/json' },
        body: requestBody,
      });
      // A body that is not valid JSON is treated as an empty object.
      const data = await res.json().catch(() => ({}));
      if (res.ok && data.ok) {
        uiModule.showToast(`Cleared llama.cpp build on ${targetLabel}. Re-launch the serve task to rebuild with GPU support.`);
      } else {
        const reason = data.detail || data.error || `HTTP ${res.status}`;
        uiModule.showToast(`Rebuild failed: ${String(reason).slice(0, 200)}`);
      }
    } catch (err) {
      uiModule.showToast(`Rebuild failed: ${err.message}`);
    } finally {
      rebuildBtn.disabled = false;
      rebuildBtn.textContent = idleLabel;
    }
  };

  rebuildBtn.addEventListener('click', onRebuildClick);
}
// Serve sort
const serveSort = document.getElementById('serve-sort');
if (serveSort) {
@@ -985,6 +1116,7 @@ function _wireTabEvents(body) {
document.getElementById('serve-bulk-cancel')?.addEventListener('click', () => {
selectBtn.classList.remove('active');
selectBtn.textContent = 'Select'; // reset label so the button doesn't stay reading "Cancel" after exit
bulkBar.classList.add('hidden');
document.querySelectorAll('.serve-select-cb').forEach(dot => { dot.style.display = 'none'; dot.classList.remove('selected'); });
});
@@ -1003,6 +1135,7 @@ function _wireTabEvents(body) {
if (item) await _deleteCachedModel(repo, item, true);
}
selectBtn.classList.remove('active');
selectBtn.textContent = 'Select'; // same reset as bulk-cancel
bulkBar.classList.add('hidden');
document.querySelectorAll('.serve-select-cb').forEach(dot => { dot.style.display = 'none'; dot.classList.remove('selected'); });
});
@@ -1011,6 +1144,16 @@ function _wireTabEvents(body) {
// Download input
const dlBtn = document.getElementById('cookbook-dl-btn');
const dlInput = document.getElementById('cookbook-dl-repo');
// Collapsible download card: the toggle shows/hides the body and rotates
// the arrow (when one is present) to mirror the open/closed state.
const dlCardToggle = document.getElementById('cookbook-download-card-toggle');
const dlCardBody = document.getElementById('cookbook-download-card-body');
const dlCardArrow = document.getElementById('cookbook-download-card-arrow');
if (dlCardToggle && dlCardBody) {
  const onDownloadCardToggle = () => {
    // Any inline display value other than 'none' counts as currently open.
    const wasOpen = dlCardBody.style.display !== 'none';
    if (wasOpen) {
      dlCardBody.style.display = 'none';
    } else {
      dlCardBody.style.display = 'block';
    }
    if (dlCardArrow) {
      dlCardArrow.style.transform = wasOpen ? 'rotate(0deg)' : 'rotate(90deg)';
    }
  };
  dlCardToggle.addEventListener('click', onDownloadCardToggle);
}
if (dlBtn && dlInput) {
function _stripHfUrl(input) {
let repo = input.trim();
@@ -1104,8 +1247,12 @@ function _wireTabEvents(body) {
if (hfToggle && hfList) {
let _loaded = false;
// Per-server VRAM cache so we don't re-probe on every expand
const _vramCache = {};
async function _getSelectedServerVram() {
const _hwCache = {};
/**
 * Heuristic: does this Hugging Face listing entry look like an
 * AWQ / GPTQ / FP8 / 4-bit / INT4 quantized repo?
 *
 * The repo id and tags are lower-cased and scanned for one of the quant
 * tokens standing on its own, i.e. not embedded in a longer alphanumeric
 * run. Underscores count as separators, so `model_awq` matches the same
 * way `model-awq` does (a plain `\b` treats `_` as a word character).
 *
 * @param {{repo_id?: string, tags?: (string[]|string)}|null|undefined} m
 *   Model entry; a missing entry or missing fields are treated as empty.
 * @returns {boolean} true when a quant token is found.
 */
function _hfModelLooksAwqLike(m) {
  // Tolerate a missing or non-array `tags` value instead of throwing on .join.
  const rawTags = m?.tags;
  const tags = Array.isArray(rawTags) ? rawTags : (rawTags ? [rawTags] : []);
  const text = `${m?.repo_id || ''} ${tags.join(' ')}`.toLowerCase();
  return /(?:^|[^a-z0-9])(?:awq|gptq|fp8|4bit|int4)(?![a-z0-9])/.test(text);
}
async function _getSelectedServerHw() {
// Prefer the "What Fits" dropdown (the main control that shows hardware);
// fall back to the download dropdown. This is the server the list ranks for.
const dlSrv = document.getElementById('hwfit-server-select') || document.getElementById('hwfit-dl-server');
@@ -1122,7 +1269,7 @@ function _wireTabEvents(body) {
}
}
const cacheKey = host || 'local';
if (_vramCache[cacheKey] !== undefined) return _vramCache[cacheKey];
if (_hwCache[cacheKey]) return _hwCache[cacheKey];
// Fetch system info for this server from hwfit
try {
const qp = new URLSearchParams();
@@ -1132,13 +1279,13 @@ function _wireTabEvents(body) {
const r = await fetch(`/api/hwfit/system?${qp}`);
if (r.ok) {
const sys = await r.json();
const v = sys?.gpu_vram_gb || 0;
_vramCache[cacheKey] = v;
return v;
const hw = { vram: sys?.gpu_vram_gb || 0, backend: String(sys?.backend || '').toLowerCase() };
_hwCache[cacheKey] = hw;
return hw;
}
} catch {}
_vramCache[cacheKey] = 0;
return 0;
_hwCache[cacheKey] = { vram: 0, backend: '' };
return _hwCache[cacheKey];
}
async function _loadLatest() {
// Match the Dependencies loader: whirlpool spinner + text label so the
@@ -1157,7 +1304,8 @@ function _wireTabEvents(body) {
} catch {
hfList.innerHTML = '<div class="hwfit-loading">Scanning models…</div>';
}
const vram = await _getSelectedServerVram();
const hwInfo = await _getSelectedServerHw();
const vram = hwInfo.vram || 0;
try {
let lastErr = '';
const _fetchLatest = async (v) => {
@@ -1173,6 +1321,9 @@ function _wireTabEvents(body) {
if (!models.length && vram > 0) {
models = await _fetchLatest(0);
}
if (['rocm', 'metal', 'mps', 'apple', 'generic', 'cpu'].includes(hwInfo.backend)) {
models = models.filter(m => !_hfModelLooksAwqLike(m));
}
if (!models.length) {
// Distinguish "the HF API failed" from "nothing matched" so an outage
// doesn't masquerade as no-fitting-models.
@@ -1254,9 +1405,32 @@ function _wireTabEvents(body) {
// HF token — save on change
const hfInput = document.getElementById('hwfit-hftoken');
if (hfInput) {
hfInput.addEventListener('change', () => {
_envState.hfToken = hfInput.value.trim();
_persistEnvState();
hfInput.addEventListener('change', async () => {
const val = hfInput.value.trim();
_envState.hfToken = val;
try { await _persistEnvState(); } catch {}
if (val) {
_envState.hfTokenConfigured = true;
const masked = val.length > 6 ? val.slice(0, 3) + '…' + val.slice(-3) : '••••';
_envState.hfTokenMasked = masked;
hfInput.placeholder = `Stored (${masked}) - enter a new token to replace`;
hfInput.value = '';
let check = hfInput.parentNode.querySelector('.hwfit-hf-check');
if (!check) {
check = document.createElement('span');
check.className = 'hwfit-hf-check';
check.title = 'Token stored';
check.textContent = '✓';
check.style.cssText = 'font-weight:800;color:var(--green,#50fa7b);font-size:15px;line-height:1;flex-shrink:0;position:relative;top:2px;';
hfInput.parentNode.insertBefore(check, hfInput);
}
const flash = document.createElement('span');
flash.textContent = 'Saved';
flash.style.cssText = 'margin-left:8px;font-size:11px;color:var(--green,#50fa7b);opacity:0;transition:opacity 0.18s;flex-shrink:0;position:relative;top:1px;';
hfInput.parentNode.appendChild(flash);
requestAnimationFrame(() => { flash.style.opacity = '1'; });
setTimeout(() => { flash.style.opacity = '0'; setTimeout(() => flash.remove(), 220); }, 1400);
}
});
}
}
@@ -1393,7 +1567,7 @@ function _renderRecipes() {
// silently sending downloads to the wrong server. An empty selection means Local; the user
// chooses a remote server explicitly via the dropdown.
// Download input
// Manual download input
html += `<div style="margin-top:7px;margin-bottom:2px;display:flex;gap:4px;align-items:center;">`;
if (_es.servers.length > 1) {
html += `<select class="cookbook-field-input hwfit-dl-server" id="hwfit-dl-server" style="height:28px;position:relative;top:0px;">`;
@@ -1409,7 +1583,7 @@ function _renderRecipes() {
html += `<button class="cookbook-btn cookbook-dl-btn" id="cookbook-dl-btn">Download</button>`;
html += `</div>`;
// Latest HF models that fit — collapsible card list
html += `<div style="margin-top:2px;position:relative;top:-8px;">`;
html += `<div style="margin-top:5px;position:relative;top:-3px;">`;
html += `<div style="display:flex;gap:4px;align-items:center;">`;
html += `<button type="button" class="memory-toolbar-btn" id="cookbook-hf-latest-toggle" style="flex:1;text-align:left;height:26px;display:flex;align-items:center;gap:6px;border-radius:4px;">`;
html += `<span id="cookbook-hf-latest-arrow" style="display:inline-block;transition:transform 0.15s;pointer-events:none;">\u25B8</span>`;
@@ -1422,7 +1596,7 @@ function _renderRecipes() {
html += `</div>`; // /#cookbook-dl-tab-fold-body (whole Download card body)
// Search section
html += '</div></div></div>';
html += '</div></div></div></div>';
html += '<div class="cookbook-group" data-backend-group="Search">';
html += '<div class="admin-card" style="flex:1;display:flex;flex-direction:column;overflow:hidden;">';
html += '<div style="display:flex;align-items:baseline;gap:8px;margin-bottom:2px;">';
@@ -1445,13 +1619,21 @@ function _renderRecipes() {
html += '<option value="Q4_K_M">Q4</option><option value="Q8_0">Q8</option>';
html += '<option value="Q6_K">Q6</option><option value="Q5_K_M">Q5</option>';
html += '<option value="Q3_K_M">Q3</option><option value="Q2_K">Q2</option>';
html += '<option value="AWQ-4bit">AWQ</option><option value="FP8">FP8</option></select>';
// Ctx slider — ported from origin/main. Lets you target a context length
// for fit estimates; the hwfit ranking uses _ctxValue() to factor that into
// VRAM math, so dragging this re-sorts the list toward models that fit
// your chosen ctx.
html += '<option value="AWQ-4bit">AWQ</option><option value="FP8">FP8</option><option value="FP4">FP4</option><option value="NVFP4">NVFP4</option></select>';
// Engine filter — show only models whose serve engine matches. Composes
// with quant / type / search filters.
html += '<select class="cookbook-field-input hwfit-engine" id="hwfit-engine" style="height:28px;" title="Filter by serving engine">';
html += '<option value="">Engine</option>';
html += '<option value="llamacpp">llama.cpp</option>';
html += '<option value="vllm">vLLM</option>';
html += '<option value="sglang">SGLang</option>';
html += '</select>';
html += '<span class="hwfit-help-chip" title="Higher numbers usually mean better quality, but they need more memory. Lower numbers fit on more hardware.">?</span>';
// Ctx slider — lets you target a context length for fit estimates; the
// hwfit ranking uses _ctxValue() to factor that into VRAM math, so
// dragging this re-sorts the list toward models that fit your chosen ctx.
html += '<label class="hwfit-ctx-control" title="Context length for fit estimates. Lower it to find more models that could fit your hardware.">';
html += '<span>Ctx</span><input type="range" id="hwfit-context" min="0" max="5" step="1" value="3" />';
html += '<span>Ctx</span><span class="hwfit-help-chip hwfit-help-chip-inline" title="Context length. Lower it to find more models that could fit your hardware; raise it when you need longer chats or documents.">?</span><input type="range" id="hwfit-context" min="0" max="5" step="1" value="3" />';
html += '<output id="hwfit-context-label">50k</output></label>';
html += '</div>';
html += '<div class="hwfit-toolbar" style="margin-top:7px;">';
@@ -1462,8 +1644,10 @@ function _renderRecipes() {
// Scan/refresh button (icon-only) where the quant dropdown used to sit.
html += '<button type="button" class="hwfit-gpu-btn" id="hwfit-rescan" title="Re-scan hardware" style="flex-shrink:0;position:relative;top:-3px;left:-1px;">↻ RESCAN</button>';
html += '<button type="button" class="hwfit-gpu-btn hwfit-hw-manual-btn" id="hwfit-hw-manual-btn" title="Set hardware manually" style="flex-shrink:0;position:relative;top:-3px;left:-1px;">EDIT</button>';
// Sort state — the clickable column headers read/write this (pewds' original
// sort paradigm). Newest is reachable by clicking the Model column header.
html += '<select class="cookbook-field-input hwfit-sort" id="hwfit-sort" style="display:none">';
html += '<option value="score">Score</option><option value="vram">VRAM</option>';
html += '<option value="fit">Fit</option><option value="score">Score</option><option value="vram">VRAM</option>';
html += '<option value="speed">Speed</option><option value="params">Params</option>';
html += '<option value="context">Context</option></select>';
html += '</div>';
@@ -1523,6 +1707,7 @@ function _renderRecipes() {
html += '<div class="admin-card" style="flex:1;display:flex;flex-direction:column;overflow:hidden;">';
html += '<div style="display:flex;align-items:center;gap:8px;margin-bottom:4px;">';
html += '<h2 style="margin:0;padding:0;line-height:1;">Dependencies</h2>';
html += '<button class="cookbook-field-input" id="cookbook-rebuild-engine" title="Clear the cached llama.cpp build so the next serve recompiles from source (use after installing a CUDA/ROCm toolkit to turn a CPU-only build into a GPU build)." style="height:24px;font-size:10px;padding:0 8px;cursor:pointer;width:auto;">Rebuild llama.cpp</button>';
html += '<span style="font-size:10px;opacity:0.5;margin-left:auto;">Server</span>';
html += '<select class="cookbook-field-input" id="hwfit-deps-server" style="height:28px;min-width:70px;">';
html += _buildServerOpts(false);