Merge remote-tracking branch 'origin/main' into visual-pr-playground

# Conflicts: # routes/cookbook_routes.py # routes/hwfit_routes.py # services/hwfit/fit.py # services/hwfit/models.py # static/js/cookbook-diagnosis.js # static/js/cookbook-hwfit.js # static/js/cookbook.js # static/js/cookbookRunning.js
2026-06-30 00:22:10 -04:00 · 2026-06-03 16:49:10 +09:00
parent eb79b76432 41a928f21b
commit 3706d756f3
569 changed files with 35252 additions and 3489 deletions
@@ -41,6 +41,48 @@ const SERVE_STATE_KEY = 'cookbook-serve-state';

 let _cachedAllModels = [];

+function _repoLooksAwqLike(model, repo) {
+  const q = String(model?.quant || '').toUpperCase();
+  const n = `${repo || ''} ${model?.repo_id || ''} ${model?.name || ''} ${model?.path || ''}`.toLowerCase();
+  return /^AWQ|^GPTQ/.test(q) || q === 'FP8' || /\b(awq|gptq|fp8)\b/i.test(n);
+}
+
+function _repoLooksGgufLike(model, repo) {
+  const q = String(model?.quant || '').toUpperCase();
+  const n = `${repo || ''} ${model?.repo_id || ''} ${model?.name || ''} ${model?.path || ''}`.toLowerCase();
+  return !!model?.is_gguf || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || n.includes('gguf');
+}
+
+function _serveBackendWarning(model, repo, backend, fields = {}) {
+  const awqLike = _repoLooksAwqLike(model, repo);
+  const ggufLike = _repoLooksGgufLike(model, repo);
+  if (awqLike && (backend === 'llamacpp' || backend === 'ollama')) {
+    return {
+      title: 'AWQ needs vLLM or SGLang',
+      body: 'This model looks like AWQ/GPTQ/FP8 safetensors. llama.cpp and Ollama need GGUF files, so this backend cannot serve it. Choose vLLM/SGLang on a CUDA/ROCm GPU server, or download a GGUF version for llama.cpp/Ollama.',
+    };
+  }
+  if (awqLike && _isMetal() && (backend === 'vllm' || backend === 'sglang')) {
+    return {
+      title: 'AWQ is not a unified-memory path',
+      body: 'This model looks like AWQ/GPTQ/FP8 safetensors. AWQ is for vLLM/SGLang on CUDA/ROCm-style GPU servers, not local unified-memory llama.cpp/Ollama serving. For unified memory, download a GGUF model and use llama.cpp/Ollama.',
+    };
+  }
+  if (awqLike && fields.unified_mem) {
+    return {
+      title: 'AWQ is not a unified-memory path',
+      body: 'This model looks like AWQ/GPTQ/FP8 safetensors, but unified-memory local serving expects GGUF. Use vLLM/SGLang on a compatible GPU server, or download a GGUF version for llama.cpp/Ollama.',
+    };
+  }
+  if (ggufLike && (backend === 'vllm' || backend === 'sglang')) {
+    return {
+      title: 'GGUF needs llama.cpp or Ollama',
+      body: 'This model looks like GGUF. vLLM/SGLang expect HuggingFace safetensors-style repos. Choose llama.cpp/Ollama for GGUF, or download a safetensors model for vLLM/SGLang.',
+    };
+  }
+  return null;
+}
+
 function _hasOwn(obj, key) {
  return Object.prototype.hasOwnProperty.call(obj || {}, key);
 }
@@ -51,6 +93,67 @@ function _allGpuIds(count) {
  return Array.from({ length: Math.floor(n) }, (_, i) => String(i)).join(',');
 }

+function _selectedServeTarget(panel) {
+  const select = document.getElementById('hwfit-server-select') || document.getElementById('hwfit-dl-server');
+  const servers = Array.isArray(_envState.servers) ? _envState.servers : [];
+  let host = _envState.remoteHost || '';
+  let server = host ? servers.find(s => s.host === host) : null;
+  if (select && select.value != null) {
+    if (select.value === 'local') {
+      host = '';
+      server = servers.find(s => !s.host || s.host === 'local') || null;
+    } else {
+      const idx = /^\d+$/.test(String(select.value)) ? parseInt(select.value, 10) : -1;
+      server = servers.find(s => s.host === select.value) || (idx >= 0 ? servers[idx] : null) || null;
+      host = server?.host || '';
+    }
+  }
+  const venv = panel?.querySelector('[data-field="venv"]')?.value?.trim() || server?.envPath || _envState.envPath || '';
+  const label = host
+    ? (server?.name ? `${server.name} (${host})` : host)
+    : (server?.name || 'local server');
+  return {
+    host,
+    port: host ? (_getPort(host) || server?.port || '') : '',
+    venv,
+    label,
+  };
+}
+
+async function _fetchServeRuntimePackage(panel, backend) {
+  const packageByBackend = {
+    vllm: 'vllm',
+    sglang: 'sglang',
+    llamacpp: 'llama_cpp',
+    diffusers: 'diffusers',
+  };
+  const packageName = packageByBackend[backend];
+  if (!packageName) return null;
+  const target = _selectedServeTarget(panel);
+  const params = new URLSearchParams();
+  if (target.host) {
+    params.set('host', target.host);
+    if (target.port) params.set('ssh_port', target.port);
+    if (target.venv) params.set('venv', target.venv);
+  }
+  const res = await fetch('/api/cookbook/packages' + (params.toString() ? '?' + params.toString() : ''), { credentials: 'same-origin' });
+  if (!res.ok) throw new Error(`HTTP ${res.status}`);
+  const data = await res.json();
+  const pkg = (data.packages || []).find(p => p.name === packageName);
+  return { pkg, target };
+}
+
+function _runtimeNoteText(backend, pkg, target) {
+  const labels = { vllm: 'vLLM', sglang: 'SGLang', llamacpp: 'llama.cpp', diffusers: 'Diffusers' };
+  const label = labels[backend] || backend;
+  if (!pkg) return `${label} readiness unavailable for ${target.label}.`;
+  const note = pkg.status_note || pkg.update_note || '';
+  if (pkg.installed) {
+    return note ? `${label} ready on ${target.label}: ${note}` : `${label} ready on ${target.label}.`;
+  }
+  return note ? `${label} missing on ${target.label}: ${note}` : `${label} missing on ${target.label}.`;
+}
+
 // ── Filter/sort cached model list ──

 function _filterCachedList() {
@@ -99,6 +202,64 @@ function _isActivelyServing(repoId) {
  } catch { return false; }
 }

+function _formatGgufSize(bytes) {
+  const n = Number(bytes || 0);
+  if (!Number.isFinite(n) || n <= 0) return '';
+  if (n >= 1024 ** 3) return `${(n / (1024 ** 3)).toFixed(1)} GB`;
+  if (n >= 1024 ** 2) return `${Math.round(n / (1024 ** 2))} MB`;
+  return `${Math.max(1, Math.round(n / 1024))} KB`;
+}
+
+function _ggufFilesForModel(model) {
+  return Array.isArray(model?.gguf_files)
+    ? model.gguf_files.filter(f => f && typeof f.rel_path === 'string' && f.rel_path)
+    : [];
+}
+
+function _runnableGgufFiles(model) {
+  const files = _ggufFilesForModel(model);
+  const primary = files.filter(f => (f.role || 'model') === 'model');
+  return primary.length ? primary : files;
+}
+
+function _ggufFileLabel(file) {
+  const base = (file.name || file.rel_path || '').split('/').pop();
+  const size = _formatGgufSize(file.size_bytes);
+  const quant = file.quant ? `${file.quant} ` : '';
+  const parts = Number(file.parts || 0);
+  const split = parts > 1 ? `, ${parts} parts` : '';
+  const role = file.role && file.role !== 'model' ? ` ${file.role}` : '';
+  return `${quant}${base}${size || split ? ` (${[size, split.replace(/^, /, '')].filter(Boolean).join(', ')})` : ''}${role}`;
+}
+
+function _shellPathExpr(path) {
+  const s = String(path || '');
+  if (s === '~') return '${HOME}';
+  if (s.startsWith('~/')) return '${HOME}' + _shellQuote(s.slice(1));
+  return _shellQuote(s);
+}
+
+function _selectedGgufExpr(model, repo, relPath) {
+  const rel = String(relPath || '').replace(/^\/+/, '');
+  if (!rel) return '';
+  if (model.is_local_dir && model.path) {
+    const base = String(model.path || '').replace(/\/+$/, '');
+    return `$(printf %s ${_shellPathExpr(`${base}/${repo}/${rel}`)})`;
+  }
+  if (model.path) {
+    const base = String(model.path || '').replace(/\/+$/, '');
+    return `$(printf %s ${_shellPathExpr(`${base}/models--${repo.replace(/\//g, '--')}/snapshots/${rel}`)})`;
+  }
+  const cacheRepo = repo.replace(/\//g, '--');
+  return `$(printf %s \${HOME}${_shellQuote(`/.cache/huggingface/hub/models--${cacheRepo}/snapshots/${rel}`)})`;
+}
+
+function _ggufSearchDirExpr(model, repo) {
+  if (model.is_local_dir && model.path) return _shellQuote(`${String(model.path || '').replace(/\/+$/, '')}/${repo}`);
+  if (model.path) return _shellQuote(`${String(model.path || '').replace(/\/+$/, '')}/models--${repo.replace(/\//g, '--')}/snapshots`);
+  return `"$HOME/.cache/huggingface/hub/models--${repo.replace(/\//g, '--')}/snapshots"`;
+}
+
 function _rerenderCachedModels() {
  const list = document.getElementById('hwfit-cached-list');
  const tagContainer = document.getElementById('serve-tags');
@@ -131,6 +292,8 @@ function _rerenderCachedModels() {
    if (m.path) {
      metaParts.push(`<span style="opacity:0.7;">${esc(m.path)}</span>`);
    }
+    const ggufCount = _runnableGgufFiles(m).length;
+    if (ggufCount > 1) metaParts.push(`${ggufCount} GGUFs`);
    if (m.status === 'downloading') {
      const _active = _isActivelyDownloading(m.repo_id);
      metaParts.push(`<span class="cookbook-dl-status" style="color:var(--accent,var(--red));">${_active ? 'downloading' : 'download stalled'}</span>`);
@@ -307,7 +470,9 @@ function _rerenderCachedModels() {

      // Toggle — close if already open
      if (item.classList.contains('doclib-card-expanded')) {
-        item.querySelector('.hwfit-serve-panel')?.remove();
+        const existingPanel = item.querySelector('.hwfit-serve-panel');
+        existingPanel?._cleanupRuntimeReadiness?.();
+        existingPanel?.remove();
        item.classList.remove('doclib-card-expanded');
        item.style.flexDirection = '';
        item.style.alignItems = '';
@@ -318,18 +483,14 @@ function _rerenderCachedModels() {

      // Collapse any other expanded
      list.querySelectorAll('.doclib-card-expanded').forEach(c => {
-        c.querySelector('.hwfit-serve-panel')?.remove();
+        const openPanel = c.querySelector('.hwfit-serve-panel');
+        openPanel?._cleanupRuntimeReadiness?.();
+        openPanel?.remove();
        c.classList.remove('doclib-card-expanded');
        c.style.flexDirection = '';
        c.style.alignItems = '';
      });

-      // Capture grid height
-      const _tb = list.closest('.admin-card')?.querySelector('.memory-toolbar');
-      const _tbH = _tb ? _tb.offsetHeight : 0;
-      list.style.minHeight = (list.offsetHeight + _tbH) + 'px';
-      list.style.maxHeight = (list.offsetHeight + _tbH) + 'px';
-
      const shortName = repo.split('/').pop();
      const _es = _envState;
      // The venv set per-server in Settings (server.envPath). Used as the venv
@@ -350,8 +511,13 @@ function _rerenderCachedModels() {
        ? _byRepo[repo]
        : (_lastUsed || (_isLegacyFlat ? _allSs : {}));
      const detectedBackend = _detectBackend(m).backend;
-      const defaultBackend = detectedBackend;
-      const savedMatchesBackend = (ss.backend || 'vllm') === detectedBackend;
+      const _allowedBackends = new Set(_isWindows()
+        ? ['llamacpp']
+        : (_isMetal() ? ['llamacpp', 'ollama'] : ['vllm', 'sglang', 'llamacpp', 'ollama', 'diffusers']));
+      const defaultBackend = (ss._forceBackend && ss.backend && _allowedBackends.has(ss.backend))
+        ? ss.backend
+        : detectedBackend;
+      const savedMatchesBackend = !!ss._forceBackend || (ss.backend || 'vllm') === detectedBackend;
      const sv = (k, def) => (ss[k] !== undefined && savedMatchesBackend) ? ss[k] : def;
      const defaultTp = defaultBackend === 'llamacpp' ? '1' : sv('tp', '1');
      const detectedGpuIds = _allGpuIds(_getGpuToggleTotal?.());
@@ -362,7 +528,16 @@ function _rerenderCachedModels() {
          : (_es.gpus || detectedGpuIds));
      const tpOpts = [1,2,4,8].map(n => `<option${defaultTp==String(n)?' selected':''}>${n}</option>`).join('');
      const dtypeOpts = ['auto','float16','bfloat16'].map(d => `<option value="${d}"${sv('dtype','auto')===d?' selected':''}>${d}</option>`).join('');
+      const vllmKvCacheOpts = ['auto','fp8'].map(d => `<option value="${d}"${sv('vllm_kv_cache_dtype','auto')===d?' selected':''}>${d}</option>`).join('');
      const _l = (name, tip) => `<span>${name}<span class="hwfit-hint" title="${tip}">?</span></span>`;
+      const _ggufChoices = _runnableGgufFiles(m);
+      const _savedGguf = String(sv('gguf_file', '') || '');
+      const _defaultGguf = _ggufChoices.some(f => f.rel_path === _savedGguf)
+        ? _savedGguf
+        : (_ggufChoices[0]?.rel_path || '');
+      const _ggufOptions = _ggufChoices.map(f =>
+        `<option value="${esc(f.rel_path)}"${f.rel_path === _defaultGguf ? ' selected' : ''}>${esc(_ggufFileLabel(f))}</option>`
+      ).join('');
      // Build save slots
      const _allPresets = _loadPresets();
      const _repoShort = repo.split('/').pop();
@@ -372,10 +547,16 @@ function _rerenderCachedModels() {
      // load, × to delete) plus a "Save current config" row — see _showSavedConfigMenu.
      // Split button: "Save" saves the current config directly; the arrow opens
      // the dropdown of saved configs (load / delete). Arrow shows the count.
+      // The arrow button shows just the saved-config count next to a "▾".
+      // Spell out what the number means in the tooltip so users don't have
+      // to click it to find out the badge isn't a notification dot.
      const _arrowLabel = _modelPresets.length > 0 ? `${_modelPresets.length} ▾` : '▾';
+      const _arrowTitle = _modelPresets.length > 0
+        ? `${_modelPresets.length} saved launch config${_modelPresets.length === 1 ? '' : 's'} for ${_repoShort} — click ▾ to load or delete`
+        : `No saved launch configs for ${_repoShort} yet — click Save to add one`;
      let _slotsHtml = `<div class="cookbook-serve-slots cookbook-saved-split">`
        + `<button type="button" class="cookbook-slot-btn cookbook-saved-save" title="Save current config"><svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M19 21H5a2 2 0 0 1-2-2V5a2 2 0 0 1 2-2h11l5 5v11a2 2 0 0 1-2 2z"/><polyline points="17 21 17 13 7 13 7 21"/><polyline points="7 3 7 8 15 8"/></svg>Save</button>`
-        + `<button type="button" class="cookbook-slot-btn cookbook-saved-arrow" title="Saved launch configs">${_arrowLabel}</button>`
+        + `<button type="button" class="cookbook-slot-btn cookbook-saved-arrow" title="${esc(_arrowTitle)}">${_arrowLabel}</button>`
        + `</div>`;

      let panelHtml = `<div class="hwfit-serve-panel">${_slotsHtml}`;
@@ -403,6 +584,14 @@ function _rerenderCachedModels() {
      }
      panelHtml += `<label>${_l('GPUs','Toggle which GPUs to use')}<div class="cookbook-gpu-group">${_gpuBtnsHtml}</div><input type="hidden" class="hwfit-sf" data-field="gpus" value="${esc(defaultGpus)}" /></label>`;
      panelHtml += `</div>`;
+      panelHtml += `<div class="hwfit-serve-runtime-note" style="display:none;font-size:11px;line-height:1.35;color:var(--fg-muted);margin-top:-4px;"></div>`;
+      if (_ggufChoices.length > 1) {
+        panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
+        panelHtml += `<label class="hwfit-backend-llamacpp">${_l('GGUF File','Choose the exact GGUF artifact to serve from this cached model folder.')}<select class="hwfit-sf hwfit-sf-wide" data-field="gguf_file">${_ggufOptions}</select></label>`;
+        panelHtml += `</div>`;
+      } else if (_defaultGguf) {
+        panelHtml += `<input type="hidden" class="hwfit-sf" data-field="gguf_file" value="${esc(_defaultGguf)}" />`;
+      }
      // Row 2: Core settings
      panelHtml += `<div class="hwfit-serve-row hwfit-backend-vllm hwfit-backend-sglang hwfit-backend-llamacpp">`;
      panelHtml += `<label class="hwfit-backend-vllm hwfit-backend-sglang">${_l('TP','Tensor Parallelism — split model across N GPUs')}<select class="hwfit-sf" data-field="tp">${tpOpts}</select></label>`;
@@ -414,6 +603,7 @@ function _rerenderCachedModels() {
      panelHtml += `<label class="hwfit-backend-vllm">${_l('Swap','CPU swap space in GB. Leave empty to omit (removed in newer vLLM)')}<input type="text" class="hwfit-sf" data-field="swap" value="${esc(sv('swap', ''))}" placeholder="off" /></label>`;
      panelHtml += `<label class="hwfit-backend-vllm hwfit-backend-sglang">${_l('Max Seqs','Maximum concurrent requests. Lower = less memory. Default 8 — prosumer GPUs often OOM on vLLM default 256 during CUDA graph capture.')}<input type="text" class="hwfit-sf" data-field="max_seqs" value="${esc(sv('max_seqs', '8'))}" placeholder="8" /></label>`;
      panelHtml += `<label>${_l('Dtype','Data type for weights. auto picks best for GPU')}<select class="hwfit-sf" data-field="dtype">${dtypeOpts}</select></label>`;
+      panelHtml += `<label class="hwfit-backend-vllm">${_l('KV Cache','vLLM --kv-cache-dtype. auto uses the model/runtime default; fp8 reduces KV memory for long context.')}<select class="hwfit-sf" data-field="vllm_kv_cache_dtype">${vllmKvCacheOpts}</select></label>`;
      panelHtml += `</div>`;
      // Row 2b: Diffusers settings
      const diffDtypeOpts = ['bfloat16','float16','float32'].map(d => `<option value="${d}"${sv('diff_dtype','bfloat16')===d?' selected':''}>${d}</option>`).join('');
@@ -432,9 +622,47 @@ function _rerenderCachedModels() {
      panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="prefix_cache"${sv('prefix_cache',false)?' checked':''} /> Prefix Caching${_h('Cache shared prompt prefixes across requests')}</label>`;
      panelHtml += `<label class="hwfit-sf-cb hwfit-backend-vllm"><input type="checkbox" class="hwfit-sf" data-field="auto_tool"${sv('auto_tool',false)?' checked':''} /> Auto Tool Choice${_h('Enable function/tool calling for agent mode')}</label>`;
      panelHtml += `</div>`;
+      // Row 2c: llama.cpp fit/perf flags (set by Auto profiles, editable by hand)
+      const _kvOpts = ['', 'q4_0', 'q8_0', 'f16'].map(k => `<option value="${k}"${sv('cache_type','')===k?' selected':''}>${k||'default'}</option>`).join('');
+      const llamaFitOpts = ['', 'off', 'on'].map(d => `<option value="${d}"${sv('llama_fit','')===d?' selected':''}>${d||'default'}</option>`).join('');
+      const llamaSplitModeOpts = ['', 'layer', 'tensor', 'row', 'none'].map(d => `<option value="${d}"${sv('llama_split_mode','')===d?' selected':''}>${d||'default'}</option>`).join('');
+      panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
+      panelHtml += `<label>${_l('CPU MoE','n-cpu-moe: number of MoE expert layers to run on CPU when the model is bigger than VRAM. 0 = all on GPU. Set automatically by the Auto profiles below.')}<input type="text" class="hwfit-sf" data-field="n_cpu_moe" value="${esc(sv('n_cpu_moe',''))}" placeholder="0" style="width:54px;" /></label>`;
+      panelHtml += `<label>${_l('KV Cache','cache-type-k/v: quantize the KV cache. q4_0 = smallest (more context), q8_0 = sharp long-context, f16 = full. Blank = llama.cpp default.')}<select class="hwfit-sf" data-field="cache_type">${_kvOpts}</select></label>`;
+      panelHtml += `<label class="hwfit-sf-cb" style="align-self:end;"><input type="checkbox" class="hwfit-sf" data-field="flash_attn"${sv('flash_attn',false)?' checked':''} /> Flash Attn${_h('--flash-attn on: faster attention + needed for quantized KV cache.')}</label>`;
+      panelHtml += `<label class="hwfit-sf-cb" style="align-self:end;"><input type="checkbox" class="hwfit-sf" data-field="vision"${sv('vision',false)?' checked':''} /> Vision${_h('Serve with the vision encoder so the model can read images. Auto-finds an mmproj-*.gguf next to the model (download one into the model folder). Adds ~1 GB VRAM + a small per-image cost.')}</label>`;
+      panelHtml += `<label>${_l('Fit','llama.cpp --fit. Leave default unless you need explicit off/on behavior for a preset.')}<select class="hwfit-sf" data-field="llama_fit">${llamaFitOpts}</select></label>`;
+      panelHtml += `</div>`;
+      // Row 2d: native llama-server placement/runtime controls. These are
+      // explicit overrides for known-good advanced presets; blank keeps
+      // llama.cpp/profile defaults.
+      panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
+      panelHtml += `<label>${_l('Split Mode','llama.cpp GPU placement. layer is the usual default; tensor splits weights and KV across GPUs.')}<select class="hwfit-sf" data-field="llama_split_mode">${llamaSplitModeOpts}</select></label>`;
+      panelHtml += `<label>${_l('Tensor Split','GPU proportions for llama.cpp, e.g. 50,50 across two visible GPUs. Leave blank for auto.')}<input type="text" class="hwfit-sf" data-field="llama_tensor_split" value="${esc(sv('llama_tensor_split', ''))}" placeholder="50,50" /></label>`;
+      panelHtml += `<label>${_l('Main GPU','llama.cpp --main-gpu index inside the visible GPU set. Mostly useful for split mode none/row.')}<input type="text" class="hwfit-sf" data-field="llama_main_gpu" value="${esc(sv('llama_main_gpu', ''))}" placeholder="auto" /></label>`;
+      panelHtml += `<label>${_l('Parallel','llama.cpp parallel slots. Leave blank for llama.cpp default; 1 matches single-lane presets.')}<input type="text" class="hwfit-sf" data-field="llama_parallel" value="${esc(sv('llama_parallel', ''))}" placeholder="1" /></label>`;
+      panelHtml += `<label>${_l('Batch','llama.cpp prompt batch size. Leave blank for llama.cpp default.')}<input type="text" class="hwfit-sf" data-field="llama_batch_size" value="${esc(sv('llama_batch_size', ''))}" placeholder="2048" /></label>`;
+      panelHtml += `<label>${_l('UBatch','llama.cpp physical micro-batch size. Leave blank for llama.cpp default.')}<input type="text" class="hwfit-sf" data-field="llama_ubatch_size" value="${esc(sv('llama_ubatch_size', ''))}" placeholder="512" /></label>`;
+      panelHtml += `</div>`;
+      // Row 2d: Auto profiles — computed from detected hardware (see profiles.py).
+      // Buttons are injected after the panel mounts (needs an async fetch).
+      panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp hwfit-serve-profiles" style="align-items:center;gap:8px;">`;
+      panelHtml += `<span style="opacity:0.7;font-size:11px;">Auto profiles:</span>`;
+      panelHtml += `<span class="hwfit-profile-btns" style="display:flex;gap:6px;flex-wrap:wrap;"><span style="opacity:0.5;font-size:11px;">computing…</span></span>`;
+      panelHtml += `</div>`;
+      // Live VRAM / RAM-spillover monitor for the serve target's GPU. Polls
+      // /api/cookbook/gpus while the panel is open so you can SEE whether the
+      // config fits VRAM (fast) or spills to system RAM (slow). Populated after mount.
+      panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp hwfit-vram-monitor" style="align-items:center;gap:8px;font-size:11px;">`;
+      panelHtml += `<span style="opacity:0.7;">GPU memory:</span>`;
+      panelHtml += `<span class="hwfit-vram-readout" style="opacity:0.5;">checking…</span>`;
+      panelHtml += `</div>`;
      // Row 3a: Checkboxes (llama.cpp-only)
      panelHtml += `<div class="hwfit-serve-checks hwfit-backend-llamacpp">`;
      panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="unified_mem"${sv('unified_mem',false)?' checked':''} /> Unified Memory${_h('For AMD APUs / Strix Halo: exports GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 so llama.cpp can address the full BIOS VRAM carveout instead of the default ~28 GB cap. No-op on discrete GPUs.')}</label>`;
+      panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="llama_no_mmap"${sv('llama_no_mmap',false)?' checked':''} /> No mmap${_h('Adds --no-mmap for native llama-server. Useful for some high-context/local-storage setups, but not a universal default.')}</label>`;
+      panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="llama_no_warmup"${sv('llama_no_warmup',false)?' checked':''} /> Skip warmup${_h('Adds --no-warmup. Can reduce startup memory spikes for tight launches, but llama.cpp defaults to warming up.')}</label>`;
+      panelHtml += `<label class="hwfit-sf-cb hwfit-spec-group"><input type="checkbox" class="hwfit-sf" data-field="llama_speculative_mtp"${sv('llama_speculative_mtp',false)?' checked':''} /> MTP Spec${_h('llama.cpp native MTP speculative decoding: --spec-type draft-mtp. Requires a GGUF with MTP heads and a recent llama-server build.')} <span class="hwfit-numstep"><button type="button" class="hwfit-numstep-btn" data-step="-1" tabindex="-1" aria-label="Decrease">‹</button><input type="number" class="hwfit-sf hwfit-spec-tokens" data-field="llama_spec_tokens" value="${esc(sv('llama_spec_tokens', '3'))}" min="1" max="10" title="--spec-draft-n-max" /><button type="button" class="hwfit-numstep-btn" data-step="1" tabindex="-1" aria-label="Increase">›</button></span></label>`;
      panelHtml += `</div>`;
      // Row 3b: Checkboxes (diffusers)
      panelHtml += `<div class="hwfit-serve-checks hwfit-backend-diffusers">`;
@@ -500,9 +728,10 @@ function _rerenderCachedModels() {
      item.classList.add('doclib-card-expanded');
      item.style.flexDirection = 'column';
      item.style.alignItems = 'stretch';
-      if (list) list.scrollTop = 0;
      item.insertAdjacentHTML('beforeend', panelHtml);
      const panel = item.querySelector('.hwfit-serve-panel');
+      // Scroll the serve panel into view within its nearest scrollable ancestor
+      requestAnimationFrame(() => panel.scrollIntoView({ block: 'nearest', behavior: 'smooth' }));

      // Build command preview
      function updateCmd() {
@@ -514,19 +743,27 @@ function _rerenderCachedModels() {
        const backend = f.backend || 'vllm';
        const serveModel = m.is_local_dir && m.path ? `${m.path}/${repo}` : repo;
        if (backend === 'llamacpp') {
+          const ggufChoices = _runnableGgufFiles(m);
+          const selectedGguf = ggufChoices.find(file => file.rel_path === f.gguf_file);
          // For multi-part GGUFs, llama.cpp requires the first split
          // (-00001-of-NNNNN.gguf). Prefer it (sorted, so UD-IQ4_XS/001 comes
          // before Q4_K_M/001 etc); fall back to any single GGUF sorted.
-          // Use $HOME (not ~) so tilde survives variable interpolation inside $(...).
-          const dir = `"$HOME/.cache/huggingface/hub/models--${repo.replace(/\//g, '--')}/snapshots"`;
+          const dir = _ggufSearchDirExpr(m, repo);
          // GGUF needs the actual .gguf FILE, not the folder. For a custom-dir
          // model the file lives under "<path>/<repo>" — search there just like we
          // search the HF snapshots dir, so serving a GGUF from a custom dir works
          // instead of handing llama.cpp a directory (which fails).
-          const _ldir = `"${m.path}/${repo}"`;
-          f._gguf_path = m.is_local_dir && m.path
+          const _ldir = m.path ? _shellQuote(`${m.path}/${repo}`) : '""';
+          f._gguf_path = selectedGguf
+            ? _selectedGgufExpr(m, repo, selectedGguf.rel_path)
+            : m.is_local_dir && m.path
            ? `$({ find ${_ldir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${_ldir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`
            : `$({ find ${dir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${dir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`;
+          // Vision: auto-find the mmproj (CLIP/projector) file in the same dir.
+          // Resolved at runtime so the toggle just works if an mmproj-*.gguf is
+          // present (downloaded alongside the model). Empty if none → cmd omits it.
+          const _vsearchdir = (m.is_local_dir && m.path) ? _ldir : dir;
+          f._mmproj_path = `$(find ${_vsearchdir} -iname 'mmproj*.gguf' 2>/dev/null | sort | head -1)`;
        }
        if (f.reasoning_parser) {
          const _rpEl2 = panel.querySelector('[data-field="reasoning_parser"]');
@@ -541,6 +778,151 @@ function _rerenderCachedModels() {
      }
      updateCmd();

+      // Context clamp. Two ceilings:
+      //  - ABSOLUTE_CTX_MAX: a hard sanity cap (no LLM trains past ~1M tokens),
+      //    so an obvious typo like 16000000 can never reach llama.cpp even when
+      //    we don't know the model's real limit (not in catalog / profiles
+      //    fetch failed). This is what stops the radv ErrorDeviceLost crash.
+      //  - panel._modelCtxMax: the model's actual trained limit (set by the
+      //    profiles fetch below) — a tighter, model-specific cap when known.
+      const ABSOLUTE_CTX_MAX = 1048576;   // 1M tokens — above any real n_ctx_train
+      const _ctxEl0 = panel.querySelector('[data-field="ctx"]');
+      function _clampCtx(announce) {
+        if (!_ctxEl0) return;
+        const cap = panel._modelCtxMax > 0 ? panel._modelCtxMax : ABSOLUTE_CTX_MAX;
+        const v = parseInt(_ctxEl0.value, 10);
+        if (Number.isFinite(v) && v > cap) {
+          _ctxEl0.value = String(cap);
+          _ctxEl0.title = `Capped to ${panel._modelCtxMax > 0 ? "this model's trained limit" : "the maximum sane context"} (${cap}).`;
+          if (announce) uiModule.showToast(`Context capped to ${cap}`);
+          updateCmd();
+        }
+      }
+      if (_ctxEl0) {
+        _ctxEl0.addEventListener('change', () => _clampCtx(false));
+        _ctxEl0.addEventListener('blur', () => _clampCtx(false));
+        _clampCtx(false);   // fix any stale/preset value already present
+      }
+
+      // Auto profiles — fetch hardware-computed llama.cpp profiles and render
+      // them as clickable chips. Clicking one fills the ctx/CPU-MoE/KV/flash
+      // fields and rebuilds the command. Computed from detected VRAM (see
+      // services/hwfit/profiles.py); rough on t/s, accurate on fit.
+      async function _loadServeProfiles() {
+        const wrap = panel.querySelector('.hwfit-profile-btns');
+        if (!wrap) return;
+        try {
+          const host = (_es.remoteHost || '').trim();
+          const params = new URLSearchParams({ model: repo });
+          if (host) {
+            params.set('host', host);
+            const _sp = (_es.servers || []).find(s => s.host === host)?.port;
+            if (_sp) params.set('ssh_port', _sp);
+          }
+          // SERVE mode: this is a specific GGUF file already on disk, so its quant
+          // is fixed — tell the profiler the file's real size + quant so it varies
+          // only the serving knobs (KV/ctx/offload), not the quant. Parse the size
+          // from m.size (e.g. "20.6 GB") and the quant from the file/repo name.
+          const _sizeMatch = String(m.size || '').match(/([\d.]+)\s*GB/i);
+          if (_sizeMatch) params.set('serve_weights_gb', _sizeMatch[1]);
+          const _qMatch = String(repo).match(/(Q\d[\w]*|IQ\d[\w]*|F16|BF16|FP8)/i);
+          if (_qMatch) params.set('serve_quant', _qMatch[1]);
+          const res = await fetch(`/api/hwfit/profiles?${params}`);
+          const data = await res.json();
+          // Remember the model's trained context limit and clamp the ctx field
+          // to it — asking llama.cpp for ctx > n_ctx_train overflows and, with a
+          // quantized KV cache, can crash the GPU (radv ErrorDeviceLost).
+          const ctxMax = Number(data && data.model_ctx_max) || 0;
+          if (ctxMax > 0) {
+            panel._modelCtxMax = ctxMax;   // tighten the clamp to the real limit
+            _clampCtx(false);              // re-apply now that we know the model's max
+          }
+          const profs = (data && Array.isArray(data.profiles)) ? data.profiles : [];
+          if (!profs.length) { wrap.innerHTML = `<span style="opacity:0.5;font-size:11px;">no auto profile for this model</span>`; return; }
+          wrap.innerHTML = '';
+          for (const p of profs) {
+            const b = document.createElement('button');
+            b.type = 'button';
+            b.className = 'cookbook-btn hwfit-profile-chip';
+            b.style.cssText = 'height:24px;padding:0 9px;font-size:11px;';
+            const off = p.offloads ? `, ncm${p.n_cpu_moe}` : ', all-GPU';
+            b.textContent = `${p.label} · ${p.quant} · ${Math.round(p.ctx/1024)}k${off}`;
+            b.title = `${p.note}\nKV ${p.cache_type}, ~${p.est_vram_gb} GB VRAM`;
+            b.addEventListener('click', () => {
+              const set = (field, val) => {
+                const el = panel.querySelector(`[data-field="${field}"]`);
+                if (!el) return;
+                if (el.type === 'checkbox') el.checked = !!val; else el.value = val;
+              };
+              set('ctx', p.ctx);
+              set('n_cpu_moe', p.n_cpu_moe || '');
+              set('cache_type', p.cache_type || '');
+              set('flash_attn', true);   // required for a quantized KV cache
+              wrap.querySelectorAll('.hwfit-profile-chip').forEach(x => x.classList.remove('cookbook-btn-active'));
+              b.classList.add('cookbook-btn-active');
+              updateCmd();
+            });
+            wrap.appendChild(b);
+          }
+        } catch {
+          wrap.innerHTML = `<span style="opacity:0.5;font-size:11px;">profile compute failed</span>`;
+        }
+      }
+      _loadServeProfiles();
+
+      // Live GPU-memory monitor: poll /api/cookbook/gpus and show VRAM usage +
+      // RAM-spillover, with a plain-language health/speed hint. Lets you tell at
+      // a glance whether the chosen config fits VRAM (fast) or is paging into
+      // system RAM over PCIe (slow). AMD sysfs reports gtt_used_mb for spillover.
+      async function _refreshVramMonitor() {
+        const el = panel.querySelector('.hwfit-vram-readout');
+        if (!el || !document.body.contains(el)) return false;  // panel closed → stop
+        try {
+          const host = (_es.remoteHost || '').trim();
+          const params = new URLSearchParams();
+          if (host) {
+            params.set('host', host);
+            const _sp = (_es.servers || []).find(s => s.host === host)?.port;
+            if (_sp) params.set('ssh_port', _sp);
+          }
+          const res = await fetch('/api/cookbook/gpus' + (params.toString() ? '?' + params : ''));
+          const data = await res.json();
+          const gpus = Array.isArray(data) ? data : (data.gpus || []);
+          if (!gpus.length) { el.textContent = 'no GPU detected'; el.style.color = ''; return true; }
+          const g = gpus[0];
+          const usedG = (g.used_mb / 1024), totG = (g.total_mb / 1024);
+          const pct = totG ? Math.round((usedG / totG) * 100) : 0;
+          const freeG = Math.max(0, totG - usedG);
+          const spillG = (g.gtt_used_mb || 0) / 1024;
+          // Color: green < 85%, amber 85-97%, red > 97% or spilling.
+          const spilling = spillG > 0.5 && !g.unified_memory;   // unified APUs always use GTT; not a spill
+          let color = 'var(--green, #50fa7b)';
+          if (pct >= 97 || spilling) color = 'var(--red, #ff5555)';
+          else if (pct >= 85) color = 'var(--orange, #ffb86c)';
+          let txt = `${usedG.toFixed(1)} / ${totG.toFixed(1)} GB (${pct}%) · ${freeG.toFixed(1)} GB free`;
+          if (spilling) {
+            txt += ` · ⚠ ${spillG.toFixed(1)} GB spilled to RAM — slow (raise CPU MoE or lower context)`;
+          } else if (pct >= 90) {
+            txt += ` · tight — risk of OOM/spill on long context or images`;
+          } else {
+            txt += ` · healthy`;
+          }
+          el.textContent = txt;
+          el.style.color = color;
+          return true;
+        } catch {
+          el.textContent = 'unavailable';
+          el.style.color = '';
+          return true;
+        }
+      }
+      _refreshVramMonitor();
+      // Poll every 4s while the panel is open; stop when it's removed from the DOM.
+      const _vramTimer = setInterval(async () => {
+        const ok = await _refreshVramMonitor();
+        if (ok === false) clearInterval(_vramTimer);
+      }, 4000);
+
      // Show/hide backend-specific sections
      function updateBackendVisibility() {
        const b = panel.querySelector('[data-field="backend"]')?.value || 'vllm';
@@ -551,6 +933,38 @@ function _rerenderCachedModels() {
      }
      updateBackendVisibility();

+      async function updateRuntimeReadinessNote() {
+        const note = panel.querySelector('.hwfit-serve-runtime-note');
+        if (!note) return;
+        const backend = panel.querySelector('[data-field="backend"]')?.value || 'vllm';
+        if (!['vllm', 'sglang', 'llamacpp', 'diffusers'].includes(backend)) {
+          note.style.display = 'none';
+          note.textContent = '';
+          return;
+        }
+        const seq = (panel._runtimeReadinessSeq || 0) + 1;
+        panel._runtimeReadinessSeq = seq;
+        note.style.display = '';
+        note.textContent = 'Checking runtime on selected server...';
+        try {
+          const { pkg, target } = await _fetchServeRuntimePackage(panel, backend);
+          if (panel._runtimeReadinessSeq !== seq) return;
+          note.textContent = _runtimeNoteText(backend, pkg, target);
+          note.style.color = pkg?.installed ? 'var(--fg-muted)' : 'var(--red)';
+        } catch (err) {
+          if (panel._runtimeReadinessSeq !== seq) return;
+          note.textContent = `Runtime readiness unavailable: ${err?.message || err}`;
+          note.style.color = 'var(--fg-muted)';
+        }
+      }
+      updateRuntimeReadinessNote();
+      const runtimeServerSelect = document.getElementById('hwfit-server-select') || document.getElementById('hwfit-dl-server');
+      if (runtimeServerSelect) {
+        const refreshRuntimeOnServerChange = () => updateRuntimeReadinessNote();
+        runtimeServerSelect.addEventListener('change', refreshRuntimeOnServerChange);
+        panel._cleanupRuntimeReadiness = () => runtimeServerSelect.removeEventListener('change', refreshRuntimeOnServerChange);
+      }
+
      // Wire save slots
      function _loadSlotIntoPanel(slotIdx) {
        const presets = _loadPresets();
@@ -580,7 +994,17 @@ function _rerenderCachedModels() {
            gpu_mem: _ex(/--gpu-memory-utilization\s+([\d.]+)/) || '0.90',
            swap: _ex(/--swap-space\s+(\d+)/) || '',
            dtype: _ex(/--dtype\s+(\w+)/) || 'auto',
+            vllm_kv_cache_dtype: _ex(/--kv-cache-dtype\s+([\w.-]+)/) || 'auto',
            max_seqs: _ex(/--max-num-seqs\s+(\d+)/) || '',
+            cache_type: _ex(/(?:--cache-type-k|-ctk)\s+(\S+)/) || '',
+            llama_fit: _ex(/(?:--fit|-fit)\s+(on|off)/) || '',
+            llama_split_mode: _ex(/(?:--split-mode|-sm)\s+(none|layer|row|tensor)/) || '',
+            llama_tensor_split: _ex(/(?:--tensor-split|-ts)\s+([0-9.,]+)/) || '',
+            llama_main_gpu: _ex(/(?:--main-gpu|-mg)\s+(\d+)/) || '',
+            llama_parallel: _ex(/(?:--parallel|-np)\s+(\d+)/) || '',
+            llama_batch_size: _ex(/(?:--batch-size|-b)\s+(\d+)/) || '',
+            llama_ubatch_size: _ex(/(?:--ubatch-size|-ub)\s+(\d+)/) || '',
+            llama_spec_tokens: _ex(/--spec-draft-n-max\s+(\d+)/) || '3',
            venv: p.envPath || '',
          };
          const checks = {
@@ -588,6 +1012,11 @@ function _rerenderCachedModels() {
            trust_remote: cmd.includes('--trust-remote-code'),
            prefix_cache: cmd.includes('--enable-prefix-caching'),
            auto_tool: cmd.includes('--enable-auto-tool-choice'),
+            flash_attn: /--flash-attn\s+on\b/.test(cmd),
+            unified_mem: /GGML_CUDA_ENABLE_UNIFIED_MEMORY=1/.test(cmd),
+            llama_no_mmap: /--no-mmap\b/.test(cmd),
+            llama_no_warmup: /--no-warmup\b/.test(cmd),
+            llama_speculative_mtp: /--spec-type\s+\S*draft-mtp/.test(cmd),
            speculative: cmd.includes('--speculative-config'),
          };
          const _specMatch = cmd.match(/--speculative-config\s+'?\{[^}]*"method"\s*:\s*"([^"]+)"[^}]*"num_speculative_tokens"\s*:\s*(\d+)/);
@@ -619,16 +1048,21 @@ function _rerenderCachedModels() {
        const _gf = panel.querySelector('[data-field="gpus"]');
        if (_gf) _gf.value = activeGpus.join(',');
        updateBackendVisibility();
+        updateRuntimeReadinessNote();
        updateCmd();
        panel.querySelectorAll('.cookbook-slot-btn').forEach(b => b.classList.remove('active'));
        panel.querySelector(`.cookbook-slot-btn[data-slot="${slotIdx}"]`)?.classList.add('active');
      }

-      // Keep the arrow button's count in sync with the stored presets.
+      // Keep the arrow button's count + tooltip in sync with stored presets.
      function _updateSavedToggleLabel() {
        const n = _presetsForModel(_loadPresets(), repo).length;
        const t = panel.querySelector('.cookbook-saved-arrow');
-        if (t) t.textContent = n > 0 ? `${n} ▾` : '▾';
+        if (!t) return;
+        t.textContent = n > 0 ? `${n} ▾` : '▾';
+        t.title = n > 0
+          ? `${n} saved launch config${n === 1 ? '' : 's'} for ${_repoShort} — click ▾ to load or delete`
+          : `No saved launch configs for ${_repoShort} yet — click Save to add one`;
      }

      // Save the current panel fields as a new named preset (shared by the menu's
@@ -1154,6 +1588,10 @@ function _rerenderCachedModels() {
            const extraEl = panel.querySelector('[data-field="extra"]');
            if (extraEl) extraEl.value = '';
            updateBackendVisibility();
+            updateRuntimeReadinessNote();
+          }
+          if (e.target.dataset.field === 'venv') {
+            updateRuntimeReadinessNote();
          }
          updateCmd();
        });
@@ -1185,6 +1623,7 @@ function _rerenderCachedModels() {
      // "back out" affordance next to Launch.
      panel.querySelector('.hwfit-serve-cancel')?.addEventListener('click', (ev) => {
        ev.stopPropagation();
+        panel._cleanupRuntimeReadiness?.();
        panel.remove();
        item.classList.remove('doclib-card-expanded');
        item.style.flexDirection = '';
@@ -1195,6 +1634,12 @@ function _rerenderCachedModels() {
      // Launch button
      panel.querySelector('.hwfit-serve-launch').addEventListener('click', async (ev) => {
        const _launchBtn = ev.currentTarget;
+        // Final safety net: never launch with ctx beyond the model's trained
+        // limit (or the absolute sanity ceiling when the limit is unknown). A
+        // stale preset or typo (e.g. 16000000) overflows and, with a quantized
+        // KV cache, can crash the GPU. Skip only if the user hand-edited the raw
+        // command (then we respect their literal text).
+        if (!_cmdManuallyEdited) _clampCtx(true);
        if (!_cmdManuallyEdited) updateCmd();
        const launchCmd = _cmdTextarea ? _cmdTextarea.value.trim() : panel._cmd;
        const serveState = {};
@@ -1202,7 +1647,16 @@ function _rerenderCachedModels() {
          if (el.type === 'checkbox') serveState[el.dataset.field] = el.checked;
          else serveState[el.dataset.field] = el.value;
        });
-        serveState.backend = (_detectBackend(m).backend) || serveState.backend || 'vllm';
+        serveState.backend = serveState.backend || (_detectBackend(m).backend) || 'vllm';
+        const backendWarning = _serveBackendWarning(m, repo, serveState.backend, serveState);
+        if (backendWarning) {
+          await window.styledConfirm(backendWarning.body, {
+            title: backendWarning.title,
+            confirmText: 'Edit settings',
+            cancelText: 'Close',
+          });
+          return;
+        }
        // Save in the { _byRepo, _lastUsed } schema — no legacy flat keys at
        // the root so per-model state doesn't leak between models.
        try {
@@ -1515,7 +1969,10 @@ export async function _fetchCachedModels() {
    const data = await res.json();
    _dlWp.destroy();

-    const ready = data.models.filter(m => m.status === 'ready' && (m.backend === 'ollama' || !m.size.includes('MB')));
+    // CHANGELOG: 'ready' already excludes partial downloads; 
+    // show every complete model regardless of size/backend.
+    const ready = data.models.filter(m => m.status === 'ready');
+
    const downloading = data.models.filter(m => m.status === 'downloading');
    const allModels = [...ready, ...downloading];
    _cachedAllModels = allModels;