Merge remote-tracking branch 'origin/main' into visual-pr-playground

# Conflicts:
#   routes/cookbook_routes.py
#   routes/hwfit_routes.py
#   services/hwfit/fit.py
#   services/hwfit/models.py
#   static/js/cookbook-diagnosis.js
#   static/js/cookbook-hwfit.js
#   static/js/cookbook.js
#   static/js/cookbookRunning.js
2026-06-17 18:25:26 -04:00 · 2026-06-03 16:49:10 +09:00
parent eb79b76432 41a928f21b
commit 3706d756f3
569 changed files with 35252 additions and 3489 deletions
@@ -213,6 +213,8 @@ export function _renderGpuToggles(system) {
        if (quantSel && quantSel.value !== '') {
          if (count <= 1) {
            quantSel.value = 'Q4_K_M'; // RAM or 1 GPU -> Q4 sweet spot
+          } else if (String(system?.backend || '').toLowerCase() === 'rocm') {
+            quantSel.value = 'Q4_K_M'; // ROCm default stays GGUF/local-safe; AWQ is explicit only
          } else {
            quantSel.value = 'AWQ-4bit'; // Multi-GPU -> AWQ for vLLM
          }
@@ -244,11 +246,13 @@ function _ctxLabel(value) {
  if (!n) return 'Max';
  return n >= 1000 ? Math.round(n / 1000) + 'k' : String(n);
 }
+
 // Returns the context-length preset selected by the #hwfit-context slider.
 // The slider value is treated as an index into _CTX_PRESETS: index 3 is used
 // when the slider is absent, the index is clamped to the valid range, and a
 // missing or zero preset yields 0.
 function _ctxValue() {
  const sliderEl = document.getElementById('hwfit-context');
  const lastIdx = _CTX_PRESETS.length - 1;
  // Non-numeric slider values collapse to 0 before clamping.
  const requested = Number(sliderEl?.value ?? 3) || 0;
  const clamped = Math.max(0, Math.min(lastIdx, requested));
  return _CTX_PRESETS[clamped] || 0;
 }
+
 function _syncCtxControl() {
  const slider = document.getElementById('hwfit-context');
  const label = document.getElementById('hwfit-context-label');
@@ -359,6 +363,7 @@ function _scanSig() {
    o: sortEl?.value || 'score',
    r: sortEl?.dataset.reverse === '1' ? 1 : 0,
    q: document.getElementById('hwfit-quant')?.value || '',
+    c: _ctxValue(),
    g: (tc && typeof tc._activeCount === 'number') ? String(tc._activeCount) : '',
    gg: (tc && tc._activeGroup) ? String(tc._activeGroup) : '',
    m: _manualHwParams(),
@@ -408,6 +413,17 @@ function _hwfitShowError(list, host, detail) {
  if (rb) rb.addEventListener('click', () => { _resetGpuToggleState(); _hwfitFetch(true); });
 }

+// Client-side "Engine" filter (llama.cpp / vLLM / SGLang). Empty = show all.
+// Uses the same _detectBackend() the serve commands use, so what you filter to
+// is exactly what would be launched. Pure view filter — no refetch needed.
+function _applyEngineFilter(models) {
+  const want = document.getElementById('hwfit-engine')?.value || '';
+  if (!want || !Array.isArray(models)) return models || [];
+  return models.filter(m => {
+    try { return _detectBackend(m).backend === want; } catch { return true; }
+  });
+}
+
 export async function _hwfitFetch(fresh = false) {
  const _tk = ++_hwfitFetchToken;
  const useCase = document.getElementById('hwfit-usecase')?.value || '';
@@ -427,7 +443,7 @@ export async function _hwfitFetch(fresh = false) {
  if (_cached) {
    _hwfitCache = _cached;
    _hwfitRenderHw(hw, _cached.system);
-    _hwfitRenderList(list, _cached.models);
+    _hwfitRenderList(list, _applyEngineFilter(_cached.models));
  } else {
    // Show spinner while scanning — stack the spinner above a text label
    // (the .hwfit-loading class is a centered flex ROW, so force column here).
@@ -456,7 +472,9 @@ export async function _hwfitFetch(fresh = false) {
    fetch(`/api/model/cached?${_cacheParams}`, { credentials: 'same-origin' })
      .then(r => r.json())
      .then(d => {
-        _cachedModelIds = new Set((d.models || []).map(m => m.repo_id));
+        // Exclude stalled (download-shell) entries — a 12 KB README-only
+        // folder shouldn't count as "downloaded" in the Scan/Download list.
+        _cachedModelIds = new Set((d.models || []).filter(m => m.status !== 'stalled').map(m => m.repo_id));
        // Re-mark rows if already rendered
        list.querySelectorAll('.hwfit-row[data-model]').forEach(row => {
          const name = row.dataset.model;
@@ -472,6 +490,7 @@ export async function _hwfitFetch(fresh = false) {
  try {
    const sortBy = document.getElementById('hwfit-sort')?.value || 'score';
    const quantPref = document.getElementById('hwfit-quant')?.value || '';
+    const targetCtx = _ctxValue();
    // Get active GPU count from toggles
    const toggleContainer = document.getElementById('hwfit-gpu-toggles');
    let gpuCountOverride = '';
@@ -507,6 +526,7 @@ export async function _hwfitFetch(fresh = false) {
    if (!isImageMode) {
      if (useCase) params.set('use_case', useCase);
      if (quantPref) params.set('quant', quantPref);
+      if (targetCtx) params.set('ctx', String(targetCtx));
    }
    const endpoint = isImageMode ? `/api/hwfit/image-models?${params}` : `/api/hwfit/models?${params}`;
    const res = await fetch(endpoint);
@@ -562,13 +582,26 @@ export async function _hwfitFetch(fresh = false) {
      const sortSel = document.getElementById('hwfit-sort');
      const sortKey = sortSel?.value || 'score';
      const asc = sortSel?.dataset.reverse === '1';   // reversed → ascending (lowest first)
-      const field = { score: 'score', vram: 'required_gb', speed: 'speed_tps', params: 'params_b', context: 'context' }[sortKey] || 'score';
-      data.models.sort((a, b) => {
-        const av = Number(a[field]) || 0, bv = Number(b[field]) || 0;
-        return asc ? av - bv : bv - av;
-      });
+      if (sortKey === 'fit') {
+        // fit_level is categorical (perfect→good→marginal→too_tight), not numeric,
+        // so rank it explicitly instead of falling through to the score column.
+        // Tie-break by score so rows within one fit tier stay meaningfully ordered.
+        const fitRank = { perfect: 4, good: 3, marginal: 2, too_tight: 1, no_fit: 0 };
+        data.models.sort((a, b) => {
+          const ar = fitRank[a.fit_level] ?? -1, br = fitRank[b.fit_level] ?? -1;
+          if (ar !== br) return asc ? ar - br : br - ar;
+          const as = Number(a.score) || 0, bs = Number(b.score) || 0;
+          return asc ? as - bs : bs - as;
+        });
+      } else {
+        const field = { score: 'score', vram: 'required_gb', speed: 'speed_tps', params: 'params_b', context: 'context' }[sortKey] || 'score';
+        data.models.sort((a, b) => {
+          const av = Number(a[field]) || 0, bv = Number(b[field]) || 0;
+          return asc ? av - bv : bv - av;
+        });
+      }
    }
-    _hwfitRenderList(list, data.models);
+    _hwfitRenderList(list, _applyEngineFilter(data.models));
    // Persist this result so the next page load can paint it instantly.
    _writeScanCache(_sig, data);
    // Render GPU toggles — only on first scan (no override active)
@@ -614,8 +647,36 @@ export function _hwfitRenderHw(el, sys) {
  };
  let gpuChip;
  if (sys.gpu_name) {
-    const label = gpuCount > 1 ? `${gpuCount}x ${esc(sys.gpu_name)}` : esc(sys.gpu_name);
-    gpuChip = chip('gpu', label);
+    // Mixed-GPU boxes (#711): `${gpuCount}x ${gpu_name}` uses gpus[0].name for
+    // every card, so a 4090+3060 reads as "2x RTX 4090". Use gpu_groups (the
+    // backend already groups identical cards) to render each pool separately
+    // and put the per-card index+VRAM into the tooltip so it's actually
+    // useful for picking CUDA_VISIBLE_DEVICES.
+    const groups = Array.isArray(sys.gpu_groups) ? sys.gpu_groups : [];
+    // Shorten vendor prefixes so a mixed-GPU label fits in the chip row
+    // without overflowing. Single-GPU label still shows the full name
+    // (that's what users are used to seeing). Tooltip carries the full
+    // unmodified names regardless, so no information is lost.
+    const _shortGpuName = (n) => String(n || '')
+      .replace(/^NVIDIA\s+GeForce\s+/i, '')
+      .replace(/^NVIDIA\s+/i, '')
+      .replace(/^AMD\s+Radeon\s+/i, '')
+      .replace(/^AMD\s+/i, '')
+      .replace(/^Intel\s+/i, '');
+    let label;
+    if (groups.length > 1) {
+      // Heterogeneous: "1× RTX 4090 + 1× RTX 3060"
+      label = groups.map(g => `${g.count}× ${esc(_shortGpuName(g.name))}`).join(' + ');
+    } else if (gpuCount > 1) {
+      label = `${gpuCount}× ${esc(sys.gpu_name)}`;
+    } else {
+      label = esc(sys.gpu_name);
+    }
+    const gpus = Array.isArray(sys.gpus) ? sys.gpus : [];
+    const tip = gpus.length
+      ? gpus.map(g => `GPU ${g.index}: ${g.name} · ${(+g.vram_gb).toFixed(1)} GB`).join('\n')
+      : 'Click to toggle off (X to hide)';
+    gpuChip = chip('gpu', label, tip);
  } else if (sys.gpu_error) {
    gpuChip = _removedHwChips.has('gpu')
      ? ''
@@ -761,8 +822,22 @@ function _wireManualHardwareControls(el) {

 // Colour per fit level: a CSS theme variable with a hex fallback.
 export const _fitColors = {
   perfect: 'var(--green, #50fa7b)',
   good: 'var(--yellow, #f1fa8c)',
   marginal: 'var(--orange, #ffb86c)',
   too_tight: 'var(--red, #ff5555)',
 };

+function _requiresAcceleratorBackend(model) {
+  const q = String(model?.quant || model?.quantization || '').toUpperCase();
+  const text = `${model?.name || ''} ${model?.repo_id || ''} ${model?.path || ''}`.toLowerCase();
+  return /^AWQ|^GPTQ|^NVFP4/.test(q) || q === 'FP8' || /\b(awq|gptq|fp8|nvfp4)\b/i.test(text);
+}
+
+function _modeLabel(model) {
+  if (model?.is_image_gen) return 'image';
+  if (_requiresAcceleratorBackend(model)) return 'vLLM/SGLang';
+  const detected = _detectBackend(model);
+  if (detected?.label) return detected.label;
+  return String(model?.run_mode || '').replace('_', '+');
+}
+
 export const _hwfitColumns = [
-  { key: 'score', label: 'Fit',    cls: 'hwfit-fit' },
+  { key: 'fit', label: 'Fit',    cls: 'hwfit-fit' },
  { key: null,    label: 'Model',  cls: 'hwfit-name' },
  { key: 'params',label: 'Param', cls: 'hwfit-c-params' },
  { key: null,    label: 'Quant',  cls: 'hwfit-c-quant' },
@@ -783,9 +858,10 @@ export function _hwfitRenderList(el, models) {
    const hasHw = sys && ((sys.gpu_vram_gb || 0) > 0 || (sys.total_ram_gb || 0) > 8);
    const hasFilters = !!(document.getElementById('hwfit-search')?.value?.trim()
      || document.getElementById('hwfit-usecase')?.value
-      || document.getElementById('hwfit-quant')?.value);
+      || document.getElementById('hwfit-quant')?.value
+      || document.getElementById('hwfit-engine')?.value);
    let msg;
-    if (hasFilters) msg = 'No models match these filters — try clearing the search, use-case, or quant.';
+    if (hasFilters) msg = 'No models match these filters — try clearing the search, use-case, quant, or engine.';
    else if (hasHw) msg = 'No models fit — the hardware probe may have under-reported. Try Rescan.';
    else msg = 'No models fit your hardware';
    el.innerHTML = `<div class="hwfit-loading">${msg}</div>`;
@@ -827,7 +903,7 @@ export function _hwfitRenderList(el, models) {
    const pcount = m.parameter_count || '?';
    const ctx = m.context ? (m.context >= 1024 ? (m.context / 1024).toFixed(0) + 'k' : m.context) : '?';
    const fitLabel = (m.fit_level || '').replace('_', ' ');
-    const modeLabel = (m.run_mode || '').replace('_', '+');
+    const modeLabel = _modeLabel(m);
    const vramLabel = m.required_gb ? m.required_gb.toFixed(1) + 'G' : '?';
    const moeBadge = m.is_moe ? '<span class="hwfit-badge hwfit-moe">MoE</span>' : '';
    const imgBadge = m.is_image_gen ? '<span class="hwfit-badge" style="background:color-mix(in srgb, var(--red) 20%, transparent);color:var(--red);font-size:8px;padding:1px 4px;border-radius:3px;margin-left:4px;">IMG</span>' : '';
@@ -841,7 +917,7 @@ export function _hwfitRenderList(el, models) {
    html += `<span class="hwfit-col hwfit-c-ctx">${m.is_image_gen ? '\u2014' : ctx}</span>`;
    html += `<span class="hwfit-col hwfit-c-speed">${m.is_image_gen ? '\u2014' : tps + ' t/s'}</span>`;
    html += `<span class="hwfit-col hwfit-c-score">${score}</span>`;
-    html += `<span class="hwfit-col hwfit-c-mode">${m.is_image_gen ? 'image' : esc(modeLabel)}</span>`;
+    html += `<span class="hwfit-col hwfit-c-mode" title="${_requiresAcceleratorBackend(m) ? 'Requires vLLM or SGLang with a visible CUDA/ROCm accelerator. llama.cpp and Ollama need GGUF files.' : ''}">${esc(modeLabel)}</span>`;
    html += `</div>`;
  }
  el.innerHTML = html;
@@ -941,6 +1017,8 @@ export function _expandModelRow(row, modelData) {
  html += `</div>`;
  if (modelData.is_image_gen) {
    html += `<div style="font-size:10px;opacity:0.5;margin-top:4px;">${esc((modelData.capabilities || []).join(' \u00B7 ') || '')}${modelData.description ? ' \u2014 ' + esc(modelData.description) : ''}</div>`;
+  } else if (_requiresAcceleratorBackend(modelData)) {
+    html += `<div class="hwfit-panel-note">This is a safetensors GPU-serving format. Use vLLM/SGLang with a visible CUDA/ROCm accelerator, or pick a GGUF download for llama.cpp/Ollama.</div>`;
  }
  html += `</div>`;

@@ -1145,6 +1223,17 @@ export function _hwfitInit() {
  if (uc) uc.addEventListener('change', () => _hwfitFetch());
  if (sort) sort.addEventListener('change', () => _hwfitFetch());
  if (qpref) qpref.addEventListener('change', () => _hwfitFetch());
+  // Engine filter is a pure client-side view filter over the already-fetched
+  // list, so just re-render from cache instead of re-probing hardware.
+  const engine = document.getElementById('hwfit-engine');
+  if (engine) engine.addEventListener('change', () => {
+    const list = document.getElementById('hwfit-list');
+    if (list && _hwfitCache && Array.isArray(_hwfitCache.models)) {
+      _hwfitRenderList(list, _applyEngineFilter(_hwfitCache.models));
+    } else {
+      _hwfitFetch();
+    }
+  });
  if (ctx && !ctx.dataset.bound) {
    ctx.dataset.bound = '1';
    ctx.addEventListener('input', () => {