Merge remote-tracking branch 'origin/main' into visual-pr-playground

# Conflicts: # routes/cookbook_routes.py # routes/hwfit_routes.py # services/hwfit/fit.py # services/hwfit/models.py # static/js/cookbook-diagnosis.js # static/js/cookbook-hwfit.js # static/js/cookbook.js # static/js/cookbookRunning.js
2026-06-17 02:05:22 -04:00 · 2026-06-03 16:49:10 +09:00
parent eb79b76432 41a928f21b
commit 3706d756f3
569 changed files with 35252 additions and 3489 deletions
@@ -223,11 +223,20 @@ function _detectModelOptimizations(modelName) {
  return opts;
 }

-/** Detect the right vLLM tool-call-parser based on model name */
+/** Detect the right vLLM tool-call-parser based on model name.
+ *  Qwen tool-call formats split by generation:
+ *   - Qwen3-Coder           → qwen3_coder  (XML <tool_call> with named params)
+ *   - Qwen3 (non-coder)     → qwen3_xml    (reasoning/instruct, XML wrapper)
+ *   - Qwen2.5 / Qwen2 / 1.5 → hermes       (Qwen2.5 was trained on Hermes format)
+ *  Catching "qwen" first and labelling everything qwen3_xml breaks tool
+ *  calls on the Qwen2.5 line (the model emits hermes-style which the
+ *  qwen3_xml parser doesn't recognise, so the call leaks through as text).
+ */
 export function _detectToolParser(modelName) {
  const n = (modelName || '').toLowerCase();
  if (n.includes('qwen3') && n.includes('coder')) return 'qwen3_coder';
-  if (n.includes('qwen')) return 'qwen3_xml';
+  if (n.includes('qwen3')) return 'qwen3_xml';
+  if (n.includes('qwen')) return 'hermes';   // Qwen2.5 / Qwen2 / Qwen1.5
  if (n.includes('llama-4') || n.includes('llama4')) return 'llama4_json';
  if (n.includes('llama') || n.includes('nemotron')) return 'llama3_json';
  if (n.includes('mistral') || n.includes('mixtral')) return 'mistral';
@@ -251,37 +260,43 @@ export function _detectBackend(model) {
  const q = (model.quant || '').toUpperCase();
  const sysBackend = String(_hwfitCache?.system?.backend || '').toLowerCase();
  const isRocm = sysBackend === 'rocm';
+  const isAppleSilicon = ['metal', 'mps', 'apple'].includes(sysBackend);
+  const _nm = `${model.repo_id || ''} ${model.path || ''} ${model.name || ''}`.toLowerCase();
+  if (/\bmlx\b|mlx-|_mlx/i.test(_nm) || q.startsWith('MLX')) {
+    return { backend: 'unsupported', label: 'Unsupported' };
+  }
+  const isAwqLike = /^AWQ|^GPTQ|^NVFP4/.test(q) || ['FP8', 'FP4', 'MXFP4', 'NF4', 'INT4', 'INT8', 'W4A16', 'W8A8', 'W8A16'].includes(q) || /\b(awq|gptq|fp8|fp4|nvfp4|mxfp4|nf4|int4|int8|w4a16|w8a8|w8a16)\b/i.test(_nm);
+  const isGgufLike = model.is_gguf || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || _nm.includes('gguf');

  // Image gen models → diffusers
  if (model.is_image_gen || model.is_diffusion || model._tag === 'image') {
    return { backend: 'diffusers', label: 'Diffusers' };
  }

+  // AWQ / GPTQ / FP8 are safetensors GPU-serving formats. Never route them
+  // through llama.cpp/Ollama just because the host is Mac/Windows; those engines
+  // need GGUF. The UI will warn/block on Metal where vLLM/SGLang aren't viable.
+  if (isAwqLike) {
+    return { backend: 'vllm', label: 'vLLM' };
+  }
+
+  // GGUF → llama.cpp/Ollama-compatible.
+  if (isGgufLike) {
+    return { backend: 'llamacpp', label: 'llama.cpp' };
+  }
+
  // Windows → default to llama.cpp (no vLLM support on Windows)
  if (_isWindows()) {
    return { backend: 'llamacpp', label: 'llama.cpp' };
  }

  // Apple Silicon (Metal) → llama.cpp (GGUF). vLLM/SGLang are CUDA/ROCm-only and
-  // don't run on macOS; AWQ/GPTQ/FP8 (vLLM-only) models are already filtered out
+  // don't run on macOS; vLLM-native quantized models are already filtered out
  // of metal Cookbook results, so llama.cpp is always the right engine here.
  if (['metal', 'mps', 'apple'].includes(sysBackend)) {
    return { backend: 'llamacpp', label: 'llama.cpp' };
  }

-  // AWQ / GPTQ / FP8 → vLLM
-  if (/^AWQ|^GPTQ/.test(q) || q === 'FP8') {
-    return { backend: 'vllm', label: 'vLLM' };
-  }
-
-  // GGUF → llama.cpp. Match the quant tag OR a gguf hint in the repo/path/name:
-  // a raw .gguf file often has no quant field, which made it fall through to the
-  // vLLM default below.
-  const _nm = `${model.repo_id || ''} ${model.path || ''} ${model.name || ''}`.toLowerCase();
-  if (model.is_gguf || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || _nm.includes('gguf')) {
-    return { backend: 'llamacpp', label: 'llama.cpp' };
-  }
-
  // ROCm/AMD machines should not blindly default HF safetensors models to
  // vLLM. SGLang is the safer OpenAI-compatible default for plain HF text
  // repos there; llama.cpp still wins above whenever the model is GGUF.
@@ -351,6 +366,8 @@ export function _buildServeCmd(f, modelName, backend) {
    cmd += ` --gpu-memory-utilization ${f.gpu_mem || '0.90'}`;
    if (f.swap && f.swap !== '0') cmd += ` --swap-space ${f.swap}`;
    cmd += ` --dtype ${f.dtype || 'auto'}`;
+    const _kv = (f.vllm_kv_cache_dtype ?? '').toString().trim();
+    if (_kv === 'fp8') cmd += ' --kv-cache-dtype fp8';
    if (f.max_seqs && f.max_seqs.toString().trim()) cmd += ` --max-num-seqs ${f.max_seqs.toString().trim()}`;
    if (f.enforce_eager) cmd += ' --enforce-eager';
    if (f.trust_remote) cmd += ' --trust-remote-code';
@@ -384,13 +401,17 @@ export function _buildServeCmd(f, modelName, backend) {
    const ggufPath = f._gguf_path || 'model.gguf';
    const gpuId = f.gpu_id?.trim() || '';
    const py = _isWindows() ? 'python' : 'python3';
+    // CPU-only serve (-ngl 0): drop the GPU-only flags, otherwise the command
+    // mixes "zero GPU layers" with CUDA unified-memory + flash-attn and fails to
+    // start (issue #1291). Only affects the ngl=0 path; GPU serving is unchanged.
+    const _cpuOnly = String(f.ngl).trim() === '0';
    const lcPrefix = (() => {
      let p = '';
-      if (f.unified_mem && !_isWindows()) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
+      if (f.unified_mem && !_cpuOnly && !_isWindows()) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
      if (gpuId && !_isWindows()) p += `CUDA_VISIBLE_DEVICES=${gpuId} `;
      return p;
    })();
-    if (f.unified_mem && _isWindows()) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
+    if (f.unified_mem && !_cpuOnly && _isWindows()) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
    if (gpuId && _isWindows()) cmd += `$env:CUDA_VISIBLE_DEVICES="${gpuId}"; `;
    if (!_isWindows()) {
      // Resolve GGUF path once, fail loudly if nothing matched (prevents
@@ -402,16 +423,75 @@ export function _buildServeCmd(f, modelName, backend) {
    // renders modern GGUF chat templates that the Python bindings' Jinja2
    // rejects (do_tojson ensure_ascii). Fall back to llama_cpp.server.
    // Don't suppress stderr — surface real errors (missing file, lib, OOM).
-    const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}`;
+    // Optional perf/fit flags from a hardware profile (see services/hwfit/
+    // profiles.py). n_cpu_moe offloads MoE expert layers to CPU when the model
+    // is bigger than VRAM; flash-attn + a quantized KV cache cut KV memory and
+    // speed things up. Only emitted when set, so manual/older flows are unchanged.
+    const _ncm = (f.n_cpu_moe ?? '').toString().trim();
+    const _kv = (f.cache_type ?? '').toString().trim();
+    const _llamaNum = (v) => {
+      const s = String(v || '').trim();
+      return /^\d+$/.test(s) ? s : '';
+    };
+    const _llamaCsv = (v) => {
+      const s = String(v || '').replace(/\s+/g, '');
+      return /^\d+(?:\.\d+)?(?:,\d+(?:\.\d+)?)*$/.test(s) ? s : '';
+    };
+    let _lcExtra = '';
+    let _lcpExtra = '';
+    if (_ncm !== '' && Number(_ncm) > 0) {
+      _lcExtra += ` --n-cpu-moe ${_ncm}`;
+      _lcpExtra += ` --n_cpu_moe ${_ncm}`;   // llama-cpp-python uses underscores
+    }
+    if (f.flash_attn && !_cpuOnly) {
+      _lcExtra += ' --flash-attn on';
+      _lcpExtra += ' --flash_attn true';
+    }
+    if (_kv) {
+      _lcExtra += ` --cache-type-k ${_kv} --cache-type-v ${_kv}`;
+      // llama-cpp-python exposes these as type_k/type_v; pass through best-effort.
+      _lcpExtra += ` --type_k ${_kv} --type_v ${_kv}`;
+    }
+    const _llamaFit = String(f.llama_fit || '').trim();
+    if (['on', 'off'].includes(_llamaFit)) _lcExtra += ` --fit ${_llamaFit}`;
+    if (f.llama_no_mmap) _lcExtra += ' --no-mmap';
+    if (f.llama_no_warmup) _lcExtra += ' --no-warmup';
+    const _llamaSplitMode = String(f.llama_split_mode || '').trim();
+    if (['none', 'layer', 'row', 'tensor'].includes(_llamaSplitMode)) _lcExtra += ` --split-mode ${_llamaSplitMode}`;
+    const _llamaTensorSplit = _llamaCsv(f.llama_tensor_split);
+    if (_llamaTensorSplit) _lcExtra += ` --tensor-split ${_llamaTensorSplit}`;
+    const _llamaMainGpu = _llamaNum(f.llama_main_gpu);
+    if (_llamaMainGpu) _lcExtra += ` --main-gpu ${_llamaMainGpu}`;
+    const _llamaParallel = _llamaNum(f.llama_parallel);
+    if (_llamaParallel) _lcExtra += ` --parallel ${_llamaParallel}`;
+    const _llamaBatch = _llamaNum(f.llama_batch_size);
+    if (_llamaBatch) _lcExtra += ` --batch-size ${_llamaBatch}`;
+    const _llamaUBatch = _llamaNum(f.llama_ubatch_size);
+    if (_llamaUBatch) _lcExtra += ` --ubatch-size ${_llamaUBatch}`;
+    if (f.llama_speculative_mtp) {
+      const specTokens = parseInt(f.llama_spec_tokens, 10);
+      const specN = Number.isFinite(specTokens) && specTokens > 0 ? specTokens : 3;
+      _lcExtra += ` --spec-type draft-mtp --spec-draft-n-max ${specN}`;
+    }
+    // Vision: serve the multimodal projector so the model can read images. The
+    // mmproj path is resolved at runtime (find mmproj-*.gguf next to the model);
+    // only emitted when the Vision toggle is on AND a projector was found.
+    if (f.vision && f._mmproj_path) {
+      _lcExtra += ` --mmproj "${f._mmproj_path}" --image-max-tokens 1024`;
+      // llama-cpp-python takes the projector via --clip_model_path.
+      _lcpExtra += ` --clip_model_path "${f._mmproj_path}"`;
+    }
+    const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}${_lcpExtra}`;
    if (_isWindows()) {
      cmd += _lcpServer;
    } else {
-      cmd += `${lcPrefix}llama-server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} -ngl ${f.ngl || '99'} -c ${f.ctx || '8192'}`;
+      cmd += `${lcPrefix}llama-server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} -ngl ${f.ngl || '99'} -c ${f.ctx || '8192'}${_lcExtra}`;
      cmd += ` || ${_lcpServer}`;
    }
  } else if (backend === 'ollama') {
    const ollamaPort = f.port || '11434';
-    const hostEnv = ollamaPort !== '11434' ? `OLLAMA_HOST=0.0.0.0:${ollamaPort} ` : '';
+    const bindHost = _envState.remoteHost ? '0.0.0.0' : '127.0.0.1';
+    const hostEnv = ollamaPort !== '11434' ? `OLLAMA_HOST=${bindHost}:${ollamaPort} ` : '';
    cmd = `${hostEnv}ollama serve`;
  } else if (backend === 'diffusers') {
    const gpuStr = f.gpus?.trim();
@@ -542,6 +622,10 @@ async function _fetchDependencies() {
    const _statusTag = (pkg, isLocal, isSystemDep, winBlocked) => {
      if (winBlocked) return `<span class="cookbook-dep-tag cookbook-dep-na">N/A</span>`;
      if (pkg.installed && isSystemDep) return `<span class="cookbook-dep-tag cookbook-dep-installed" title="Found on selected server">Installed</span>`;
+      if (pkg.installed && pkg.pip_update_available === false) {
+        const tip = esc(pkg.update_note || pkg.status_note || 'Found externally; update outside Odysseus.');
+        return `<span class="cookbook-dep-tag cookbook-dep-installed" title="${tip}">Installed</span>`;
+      }
      if (pkg.installed) return `<button class="cookbook-dep-tag cookbook-dep-installed cookbook-dep-installed-btn" title="Installed — click for actions"><span class="cookbook-dep-installed-label">Installed</span><span class="cookbook-dep-caret">&#9662;</span></button>`;
      if (isSystemDep) {
        const depTip = esc(pkg.install_hint || 'Install this OS package on the selected server.');
@@ -556,11 +640,13 @@ async function _fetchDependencies() {
      const isSystemDep = pkg.kind === 'system';
      const winBlocked = !isLocal && _isWindows() && _winUnsupported.has(pkg.name);
      const note = pkg.status_note ? `<div class="memory-item-meta" style="font-size:10px;opacity:0.65;margin-top:3px;">${esc(pkg.status_note)}</div>` : '';
+      const updateNote = pkg.installed && pkg.pip_update_available === false && pkg.update_note ? `<div class="memory-item-meta" style="font-size:10px;opacity:0.55;margin-top:3px;">${esc(pkg.update_note)}</div>` : '';
      return `<div class="cookbook-dep-row${winBlocked ? ' cookbook-dep-blocked' : ''}" data-pkg-name="${esc(pkg.name)}" data-dep-pip="${esc(pkg.pip || '')}" data-dep-target="${isLocal ? 'local' : 'remote'}" data-dep-kind="${esc(pkg.kind || 'python')}">`
        + `<div class="cookbook-dep-info">`
        + `<div class="memory-item-title">${esc(pkg.name)}</div>`
        + `<div class="memory-item-meta" style="font-size:10px;opacity:0.5;margin-top:2px;">${esc(pkg.desc)}</div>`
        + note
+        + updateNote
        + `</div>`
        + `<span class="cookbook-dep-tag cookbook-dep-cat">${esc(pkg.category)}</span>`
        + _statusTag(pkg, isLocal, isSystemDep, winBlocked)
@@ -642,7 +728,7 @@ async function _fetchDependencies() {
        }
        // _dep flags this as a pip dependency/driver install (not a servable
        // model) so the running-task card doesn't offer a "Serve →" button.
-        const payload = { repo_id: pipName, _cmd: cmd, remote_host: _envState.remoteHost || '', _dep: true };
+        const payload = { repo_id: pipName, _cmd: cmd, remote_host: _envState.remoteHost || '', _dep: true, env_path: _envState.envPath || '' };
        _addTask(data.session_id, 'pip ' + pkgName, 'download', payload);
        if (statusEl) { statusEl.textContent = upgrade ? 'Updating...' : 'Installing...'; statusEl.disabled = true; }
        uiModule.showToast(`${upgrade ? 'Updating' : 'Installing'} ${pkgName} on ${targetHost}...`);
@@ -932,6 +1018,51 @@ function _wireTabEvents(body) {
    });
  }

+  // "Rebuild llama.cpp" clears the cached build so the next serve recompiles.
+  // The serve bootstrap only builds llama-server when it is missing from PATH,
+  // so a host that first built CPU-only (no nvcc at build time) keeps reusing
+  // that binary forever; this is the lever to force a fresh GPU build after a
+  // CUDA/ROCm toolkit is installed.
+  const rebuildBtn = document.getElementById('cookbook-rebuild-engine');
+  if (rebuildBtn && !rebuildBtn._wired) {
+    rebuildBtn._wired = true;
+    rebuildBtn.addEventListener('click', async () => {
+      // Match _installDep: honor the Dependencies server selector so the clear
+      // runs on the same host the build runs on.
+      const sel = document.getElementById('hwfit-deps-server');
+      if (sel) _applyServerSelection(sel.value);
+      const host = _envState.remoteHost || '';
+      const where = host || 'this server';
+      if (!confirm(`Rebuild the llama.cpp engine on ${where}?\n\nThis clears the cached llama-server build so the next serve recompiles from source (with CUDA/HIP if a toolchain is present). It does not download or install anything.`)) return;
+      const _label = rebuildBtn.textContent;
+      rebuildBtn.disabled = true;
+      rebuildBtn.textContent = 'Clearing...';
+      try {
+        const res = await fetch('/api/cookbook/rebuild-engine', {
+          method: 'POST', credentials: 'same-origin',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({
+            engine: 'llamacpp',
+            remote_host: host || undefined,
+            ssh_port: _getPort(host) || undefined,
+          }),
+        });
+        const data = await res.json().catch(() => ({}));
+        if (!res.ok || !data.ok) {
+          const reason = data.detail || data.error || `HTTP ${res.status}`;
+          uiModule.showToast('Rebuild failed: ' + String(reason).slice(0, 200));
+        } else {
+          uiModule.showToast(`Cleared llama.cpp build on ${where}. Re-launch the serve task to rebuild with GPU support.`);
+        }
+      } catch (err) {
+        uiModule.showToast('Rebuild failed: ' + err.message);
+      } finally {
+        rebuildBtn.disabled = false;
+        rebuildBtn.textContent = _label;
+      }
+    });
+  }
+
  // Serve sort
  const serveSort = document.getElementById('serve-sort');
  if (serveSort) {
@@ -985,6 +1116,7 @@ function _wireTabEvents(body) {

    document.getElementById('serve-bulk-cancel')?.addEventListener('click', () => {
      selectBtn.classList.remove('active');
+      selectBtn.textContent = 'Select';  // reset label so the button doesn't stay reading "Cancel" after exit
      bulkBar.classList.add('hidden');
      document.querySelectorAll('.serve-select-cb').forEach(dot => { dot.style.display = 'none'; dot.classList.remove('selected'); });
    });
@@ -1003,6 +1135,7 @@ function _wireTabEvents(body) {
        if (item) await _deleteCachedModel(repo, item, true);
      }
      selectBtn.classList.remove('active');
+      selectBtn.textContent = 'Select';  // same reset as bulk-cancel
      bulkBar.classList.add('hidden');
      document.querySelectorAll('.serve-select-cb').forEach(dot => { dot.style.display = 'none'; dot.classList.remove('selected'); });
    });
@@ -1011,6 +1144,16 @@ function _wireTabEvents(body) {
  // Download input
  const dlBtn = document.getElementById('cookbook-dl-btn');
  const dlInput = document.getElementById('cookbook-dl-repo');
+  const dlCardToggle = document.getElementById('cookbook-download-card-toggle');
+  const dlCardBody = document.getElementById('cookbook-download-card-body');
+  const dlCardArrow = document.getElementById('cookbook-download-card-arrow');
+  if (dlCardToggle && dlCardBody) {
+    dlCardToggle.addEventListener('click', () => {
+      const isOpen = dlCardBody.style.display !== 'none';
+      dlCardBody.style.display = isOpen ? 'none' : 'block';
+      if (dlCardArrow) dlCardArrow.style.transform = isOpen ? 'rotate(0deg)' : 'rotate(90deg)';
+    });
+  }
  if (dlBtn && dlInput) {
    function _stripHfUrl(input) {
      let repo = input.trim();
@@ -1104,8 +1247,12 @@ function _wireTabEvents(body) {
  if (hfToggle && hfList) {
    let _loaded = false;
    // Per-server VRAM cache so we don't re-probe on every expand
-    const _vramCache = {};
-    async function _getSelectedServerVram() {
+    const _hwCache = {};
+    function _hfModelLooksAwqLike(m) {
+      const text = `${m?.repo_id || ''} ${(m?.tags || []).join(' ')}`.toLowerCase();
+      return /\b(awq|gptq|fp8|4bit|int4)\b/.test(text);
+    }
+    async function _getSelectedServerHw() {
      // Prefer the "What Fits" dropdown (the main control that shows hardware);
      // fall back to the download dropdown. This is the server the list ranks for.
      const dlSrv = document.getElementById('hwfit-server-select') || document.getElementById('hwfit-dl-server');
@@ -1122,7 +1269,7 @@ function _wireTabEvents(body) {
        }
      }
      const cacheKey = host || 'local';
-      if (_vramCache[cacheKey] !== undefined) return _vramCache[cacheKey];
+      if (_hwCache[cacheKey]) return _hwCache[cacheKey];
      // Fetch system info for this server from hwfit
      try {
        const qp = new URLSearchParams();
@@ -1132,13 +1279,13 @@ function _wireTabEvents(body) {
        const r = await fetch(`/api/hwfit/system?${qp}`);
        if (r.ok) {
          const sys = await r.json();
-          const v = sys?.gpu_vram_gb || 0;
-          _vramCache[cacheKey] = v;
-          return v;
+          const hw = { vram: sys?.gpu_vram_gb || 0, backend: String(sys?.backend || '').toLowerCase() };
+          _hwCache[cacheKey] = hw;
+          return hw;
        }
      } catch {}
-      _vramCache[cacheKey] = 0;
-      return 0;
+      _hwCache[cacheKey] = { vram: 0, backend: '' };
+      return _hwCache[cacheKey];
    }
    async function _loadLatest() {
      // Match the Dependencies loader: whirlpool spinner + text label so the
@@ -1157,7 +1304,8 @@ function _wireTabEvents(body) {
      } catch {
        hfList.innerHTML = '<div class="hwfit-loading">Scanning models…</div>';
      }
-      const vram = await _getSelectedServerVram();
+      const hwInfo = await _getSelectedServerHw();
+      const vram = hwInfo.vram || 0;
      try {
        let lastErr = '';
        const _fetchLatest = async (v) => {
@@ -1173,6 +1321,9 @@ function _wireTabEvents(body) {
        if (!models.length && vram > 0) {
          models = await _fetchLatest(0);
        }
+        if (['rocm', 'metal', 'mps', 'apple', 'generic', 'cpu'].includes(hwInfo.backend)) {
+          models = models.filter(m => !_hfModelLooksAwqLike(m));
+        }
        if (!models.length) {
          // Distinguish "the HF API failed" from "nothing matched" so an outage
          // doesn't masquerade as no-fitting-models.
@@ -1254,9 +1405,32 @@ function _wireTabEvents(body) {
  // HF token — save on change
  const hfInput = document.getElementById('hwfit-hftoken');
  if (hfInput) {
-    hfInput.addEventListener('change', () => {
-      _envState.hfToken = hfInput.value.trim();
-      _persistEnvState();
+    hfInput.addEventListener('change', async () => {
+      const val = hfInput.value.trim();
+      _envState.hfToken = val;
+      try { await _persistEnvState(); } catch {}
+      if (val) {
+        _envState.hfTokenConfigured = true;
+        const masked = val.length > 6 ? val.slice(0, 3) + '…' + val.slice(-3) : '••••';
+        _envState.hfTokenMasked = masked;
+        hfInput.placeholder = `Stored (${masked}) - enter a new token to replace`;
+        hfInput.value = '';
+        let check = hfInput.parentNode.querySelector('.hwfit-hf-check');
+        if (!check) {
+          check = document.createElement('span');
+          check.className = 'hwfit-hf-check';
+          check.title = 'Token stored';
+          check.textContent = '✓';
+          check.style.cssText = 'font-weight:800;color:var(--green,#50fa7b);font-size:15px;line-height:1;flex-shrink:0;position:relative;top:2px;';
+          hfInput.parentNode.insertBefore(check, hfInput);
+        }
+        const flash = document.createElement('span');
+        flash.textContent = 'Saved';
+        flash.style.cssText = 'margin-left:8px;font-size:11px;color:var(--green,#50fa7b);opacity:0;transition:opacity 0.18s;flex-shrink:0;position:relative;top:1px;';
+        hfInput.parentNode.appendChild(flash);
+        requestAnimationFrame(() => { flash.style.opacity = '1'; });
+        setTimeout(() => { flash.style.opacity = '0'; setTimeout(() => flash.remove(), 220); }, 1400);
+      }
    });
  }
 }
@@ -1393,7 +1567,7 @@ function _renderRecipes() {
  // silently sending downloads to the wrong server. An empty selection means Local; the user
  // chooses a remote server explicitly via the dropdown.

-  // Download input
+  // Manual download input
  html += `<div style="margin-top:7px;margin-bottom:2px;display:flex;gap:4px;align-items:center;">`;
  if (_es.servers.length > 1) {
    html += `<select class="cookbook-field-input hwfit-dl-server" id="hwfit-dl-server" style="height:28px;position:relative;top:0px;">`;
@@ -1409,7 +1583,7 @@ function _renderRecipes() {
  html += `<button class="cookbook-btn cookbook-dl-btn" id="cookbook-dl-btn">Download</button>`;
  html += `</div>`;
  // Latest HF models that fit — collapsible card list
-  html += `<div style="margin-top:2px;position:relative;top:-8px;">`;
+  html += `<div style="margin-top:5px;position:relative;top:-3px;">`;
  html += `<div style="display:flex;gap:4px;align-items:center;">`;
  html += `<button type="button" class="memory-toolbar-btn" id="cookbook-hf-latest-toggle" style="flex:1;text-align:left;height:26px;display:flex;align-items:center;gap:6px;border-radius:4px;">`;
  html += `<span id="cookbook-hf-latest-arrow" style="display:inline-block;transition:transform 0.15s;pointer-events:none;">\u25B8</span>`;
@@ -1422,7 +1596,7 @@ function _renderRecipes() {
  html += `</div>`;  // /#cookbook-dl-tab-fold-body (whole Download card body)

  // Search section
-  html += '</div></div></div>';
+  html += '</div></div></div></div>';
  html += '<div class="cookbook-group" data-backend-group="Search">';
  html += '<div class="admin-card" style="flex:1;display:flex;flex-direction:column;overflow:hidden;">';
  html += '<div style="display:flex;align-items:baseline;gap:8px;margin-bottom:2px;">';
@@ -1445,13 +1619,21 @@ function _renderRecipes() {
  html += '<option value="Q4_K_M">Q4</option><option value="Q8_0">Q8</option>';
  html += '<option value="Q6_K">Q6</option><option value="Q5_K_M">Q5</option>';
  html += '<option value="Q3_K_M">Q3</option><option value="Q2_K">Q2</option>';
-  html += '<option value="AWQ-4bit">AWQ</option><option value="FP8">FP8</option></select>';
-  // Ctx slider — ported from origin/main. Lets you target a context length
-  // for fit estimates; the hwfit ranking uses _ctxValue() to factor that into
-  // VRAM math, so dragging this re-sorts the list toward models that fit
-  // your chosen ctx.
+  html += '<option value="AWQ-4bit">AWQ</option><option value="FP8">FP8</option><option value="FP4">FP4</option><option value="NVFP4">NVFP4</option></select>';
+  // Engine filter — show only models whose serve engine matches. Composes
+  // with quant / type / search filters.
+  html += '<select class="cookbook-field-input hwfit-engine" id="hwfit-engine" style="height:28px;" title="Filter by serving engine">';
+  html += '<option value="">Engine</option>';
+  html += '<option value="llamacpp">llama.cpp</option>';
+  html += '<option value="vllm">vLLM</option>';
+  html += '<option value="sglang">SGLang</option>';
+  html += '</select>';
+  html += '<span class="hwfit-help-chip" title="Higher numbers usually mean better quality, but they need more memory. Lower numbers fit on more hardware.">?</span>';
+  // Ctx slider — lets you target a context length for fit estimates; the
+  // hwfit ranking uses _ctxValue() to factor that into VRAM math, so
+  // dragging this re-sorts the list toward models that fit your chosen ctx.
  html += '<label class="hwfit-ctx-control" title="Context length for fit estimates. Lower it to find more models that could fit your hardware.">';
-  html += '<span>Ctx</span><input type="range" id="hwfit-context" min="0" max="5" step="1" value="3" />';
+  html += '<span>Ctx</span><span class="hwfit-help-chip hwfit-help-chip-inline" title="Context length. Lower it to find more models that could fit your hardware; raise it when you need longer chats or documents.">?</span><input type="range" id="hwfit-context" min="0" max="5" step="1" value="3" />';
  html += '<output id="hwfit-context-label">50k</output></label>';
  html += '</div>';
  html += '<div class="hwfit-toolbar" style="margin-top:7px;">';
@@ -1462,8 +1644,10 @@ function _renderRecipes() {
  // Scan/refresh button (icon-only) where the quant dropdown used to sit.
  html += '<button type="button" class="hwfit-gpu-btn" id="hwfit-rescan" title="Re-scan hardware" style="flex-shrink:0;position:relative;top:-3px;left:-1px;">↻ RESCAN</button>';
  html += '<button type="button" class="hwfit-gpu-btn hwfit-hw-manual-btn" id="hwfit-hw-manual-btn" title="Set hardware manually" style="flex-shrink:0;position:relative;top:-3px;left:-1px;">EDIT</button>';
+  // Sort state — the clickable column headers read/write this (pewds' original
+  // sort paradigm). Newest is reachable by clicking the Model column header.
  html += '<select class="cookbook-field-input hwfit-sort" id="hwfit-sort" style="display:none">';
-  html += '<option value="score">Score</option><option value="vram">VRAM</option>';
+  html += '<option value="fit">Fit</option><option value="score">Score</option><option value="vram">VRAM</option>';
  html += '<option value="speed">Speed</option><option value="params">Params</option>';
  html += '<option value="context">Context</option></select>';
  html += '</div>';
@@ -1523,6 +1707,7 @@ function _renderRecipes() {
  html += '<div class="admin-card" style="flex:1;display:flex;flex-direction:column;overflow:hidden;">';
  html += '<div style="display:flex;align-items:center;gap:8px;margin-bottom:4px;">';
  html += '<h2 style="margin:0;padding:0;line-height:1;">Dependencies</h2>';
+  html += '<button class="cookbook-field-input" id="cookbook-rebuild-engine" title="Clear the cached llama.cpp build so the next serve recompiles from source (use after installing a CUDA/ROCm toolkit to turn a CPU-only build into a GPU build)." style="height:24px;font-size:10px;padding:0 8px;cursor:pointer;width:auto;">Rebuild llama.cpp</button>';
  html += '<span style="font-size:10px;opacity:0.5;margin-left:auto;">Server</span>';
  html += '<select class="cookbook-field-input" id="hwfit-deps-server" style="height:28px;min-width:70px;">';
  html += _buildServerOpts(false);