Merge remote-tracking branch 'origin/main' into visual-pr-playground

# Conflicts:
#	routes/cookbook_routes.py
#	routes/hwfit_routes.py
#	services/hwfit/fit.py
#	services/hwfit/models.py
#	static/js/cookbook-diagnosis.js
#	static/js/cookbook-hwfit.js
#	static/js/cookbook.js
#	static/js/cookbookRunning.js
This commit is contained in:
pewdiepie-archdaemon
2026-06-03 16:49:10 +09:00
569 changed files with 35252 additions and 3489 deletions
+477 -20
View File
@@ -41,6 +41,48 @@ const SERVE_STATE_KEY = 'cookbook-serve-state';
let _cachedAllModels = [];
/**
 * Heuristic: does this cached model look like an AWQ/GPTQ/FP8 quantization?
 *
 * The model's `quant` tag is checked first (prefix `AWQ`/`GPTQ`, or exactly
 * `FP8`); otherwise the repo id, name and path are scanned for a standalone
 * `awq`, `gptq` or `fp8` word.
 *
 * @param {object} [model] - Cached-model record; `quant`, `repo_id`, `name` and `path` are read.
 * @param {string} [repo] - Repo id being served.
 * @returns {boolean} True when the quant tag or any of the names matches.
 */
function _repoLooksAwqLike(model, repo) {
  const quantTag = String(model?.quant || '').toUpperCase();
  const haystack = [repo, model?.repo_id, model?.name, model?.path]
    .map((part) => part || '')
    .join(' ')
    .toLowerCase();

  if (/^AWQ|^GPTQ/.test(quantTag)) return true;
  if (quantTag === 'FP8') return true;
  // \b treats "_" as a word character, so "model_awq" does not count as a match.
  return /\b(awq|gptq|fp8)\b/i.test(haystack);
}
/**
 * Heuristic: does this cached model look like a GGUF model?
 *
 * True when the record is flagged `is_gguf`, when its `quant` tag is `GGUF`
 * or starts with `Q2`..`Q8` / `IQ`, or when "gguf" appears anywhere in the
 * repo id, name or path.
 *
 * @param {object} [model] - Cached-model record; `is_gguf`, `quant`, `repo_id`, `name` and `path` are read.
 * @param {string} [repo] - Repo id being served.
 * @returns {boolean} True when any of the GGUF signals is present.
 */
function _repoLooksGgufLike(model, repo) {
  const quantTag = String(model?.quant || '').toUpperCase();
  const haystack = [repo, model?.repo_id, model?.name, model?.path]
    .map((part) => part || '')
    .join(' ')
    .toLowerCase();

  if (model?.is_gguf) return true;
  if (/^Q[2-8]/.test(quantTag) || /^IQ/.test(quantTag)) return true;
  if (quantTag === 'GGUF') return true;
  return haystack.includes('gguf');
}
/**
 * Build a warning for a model/backend combination that cannot work.
 *
 * Rules are evaluated in order and the first match wins:
 *   1. AWQ-like model on llama.cpp / Ollama.
 *   2. AWQ-like model on vLLM / SGLang while `_isMetal()` is true.
 *   3. AWQ-like model with the `unified_mem` field set.
 *   4. GGUF-like model on vLLM / SGLang.
 *
 * @param {object} model - Cached-model record, passed to the AWQ/GGUF heuristics.
 * @param {string} repo - Repo id, passed to the AWQ/GGUF heuristics.
 * @param {string} backend - Selected backend id (`vllm`, `sglang`, `llamacpp`, `ollama`, ...).
 * @param {object} [fields={}] - Current serve-form values; only `unified_mem` is read.
 * @returns {{title: string, body: string}|null} Warning to display, or null when no rule matches.
 */
function _serveBackendWarning(model, repo, backend, fields = {}) {
  const isAwq = _repoLooksAwqLike(model, repo);
  const isGguf = _repoLooksGgufLike(model, repo);
  const onGgufRuntime = backend === 'llamacpp' || backend === 'ollama';
  const onSafetensorsRuntime = backend === 'vllm' || backend === 'sglang';
  const unifiedTitle = 'AWQ is not a unified-memory path';

  if (isAwq) {
    if (onGgufRuntime) {
      return {
        title: 'AWQ needs vLLM or SGLang',
        body: 'This model looks like AWQ/GPTQ/FP8 safetensors. llama.cpp and Ollama need GGUF files, so this backend cannot serve it. Choose vLLM/SGLang on a CUDA/ROCm GPU server, or download a GGUF version for llama.cpp/Ollama.',
      };
    }
    if (_isMetal() && onSafetensorsRuntime) {
      return {
        title: unifiedTitle,
        body: 'This model looks like AWQ/GPTQ/FP8 safetensors. AWQ is for vLLM/SGLang on CUDA/ROCm-style GPU servers, not local unified-memory llama.cpp/Ollama serving. For unified memory, download a GGUF model and use llama.cpp/Ollama.',
      };
    }
    if (fields.unified_mem) {
      return {
        title: unifiedTitle,
        body: 'This model looks like AWQ/GPTQ/FP8 safetensors, but unified-memory local serving expects GGUF. Use vLLM/SGLang on a compatible GPU server, or download a GGUF version for llama.cpp/Ollama.',
      };
    }
  }

  if (isGguf && onSafetensorsRuntime) {
    return {
      title: 'GGUF needs llama.cpp or Ollama',
      body: 'This model looks like GGUF. vLLM/SGLang expect HuggingFace safetensors-style repos. Choose llama.cpp/Ollama for GGUF, or download a safetensors model for vLLM/SGLang.',
    };
  }

  return null;
}
/**
 * Null-safe own-property check.
 *
 * @param {*} obj - Value to inspect; any falsy value is treated as an empty object.
 * @param {string|symbol} key - Property key to look for.
 * @returns {boolean} True when `key` is an own (non-inherited) property of `obj`.
 */
function _hasOwn(obj, key) {
  const target = obj || {};
  return Object.prototype.hasOwnProperty.call(target, key);
}
@@ -51,6 +93,67 @@ function _allGpuIds(count) {
return Array.from({ length: Math.floor(n) }, (_, i) => String(i)).join(',');
}
/**
 * Resolve which server a serve panel is currently targeting.
 *
 * Starts from `_envState.remoteHost`. If a server picker (`#hwfit-server-select`,
 * falling back to `#hwfit-dl-server`) exists and has a value, the picker wins:
 * `local` selects the first server without a host (or with host `local`); any
 * other value is matched against server hosts first, then treated as an index
 * into `_envState.servers` when it is all digits.
 *
 * @param {Element|null} [panel] - Serve panel; its `[data-field="venv"]` input, when non-empty, overrides the venv.
 * @returns {{host: string, port: (string|number), venv: string, label: string}}
 *   `host` is '' for the local machine, and `port` is '' whenever `host` is ''.
 */
function _selectedServeTarget(panel) {
  const picker =
    document.getElementById('hwfit-server-select') || document.getElementById('hwfit-dl-server');
  const knownServers = Array.isArray(_envState.servers) ? _envState.servers : [];

  let host = _envState.remoteHost || '';
  let server = host ? knownServers.find((entry) => entry.host === host) : null;

  if (picker && picker.value != null) {
    const choice = picker.value;
    if (choice === 'local') {
      host = '';
      server = knownServers.find((entry) => !entry.host || entry.host === 'local') || null;
    } else {
      const byHost = knownServers.find((entry) => entry.host === choice);
      // All-digit picker values are treated as positions in the server list.
      const position = /^\d+$/.test(String(choice)) ? parseInt(choice, 10) : -1;
      const byPosition = position >= 0 ? knownServers[position] : null;
      server = byHost || byPosition || null;
      host = server?.host || '';
    }
  }

  const venvField = panel?.querySelector('[data-field="venv"]');
  const venv = venvField?.value?.trim() || server?.envPath || _envState.envPath || '';

  let label;
  if (host) {
    label = server?.name ? `${server.name} (${host})` : host;
  } else {
    label = server?.name || 'local server';
  }

  return {
    host,
    port: host ? (_getPort(host) || server?.port || '') : '',
    venv,
    label,
  };
}
/**
 * Look up the runtime package for a backend on the currently selected server.
 *
 * Calls `/api/cookbook/packages`, adding `host` / `ssh_port` / `venv` query
 * parameters only when a remote host is selected, and picks the entry whose
 * `name` matches the backend's package.
 *
 * @param {Element|null} panel - Serve panel, forwarded to `_selectedServeTarget`.
 * @param {string} backend - Backend id; `vllm`, `sglang`, `llamacpp` and `diffusers` have a package mapping.
 * @returns {Promise<{pkg: (object|undefined), target: object}|null>} Null when the backend has no mapping;
 *   `pkg` is undefined when the response does not list the package.
 * @throws {Error} `HTTP <status>` when the response is not OK.
 */
async function _fetchServeRuntimePackage(panel, backend) {
  const packageByBackend = {
    vllm: 'vllm',
    sglang: 'sglang',
    llamacpp: 'llama_cpp',
    diffusers: 'diffusers',
  };
  const wanted = packageByBackend[backend];
  if (!wanted) return null;

  const target = _selectedServeTarget(panel);
  const query = new URLSearchParams();
  if (target.host) {
    query.set('host', target.host);
    if (target.port) query.set('ssh_port', target.port);
    if (target.venv) query.set('venv', target.venv);
  }
  const queryString = query.toString();
  const url = '/api/cookbook/packages' + (queryString ? '?' + queryString : '');

  const response = await fetch(url, { credentials: 'same-origin' });
  if (!response.ok) throw new Error(`HTTP ${response.status}`);

  const payload = await response.json();
  const packages = payload.packages || [];
  const pkg = packages.find((entry) => entry.name === wanted);
  return { pkg, target };
}
/**
 * Format the one-line runtime readiness message shown under the serve form.
 *
 * @param {string} backend - Backend id; known ids get a display name, others are shown verbatim.
 * @param {object|null|undefined} pkg - Package record (`installed`, `status_note`, `update_note`), if any.
 * @param {{label: string}} target - Serve target whose `label` is embedded in the message.
 * @returns {string} "<runtime> ready|missing on <target>", with the package note appended when present,
 *   or a "readiness unavailable" message when `pkg` is missing.
 */
function _runtimeNoteText(backend, pkg, target) {
  const displayNames = { vllm: 'vLLM', sglang: 'SGLang', llamacpp: 'llama.cpp', diffusers: 'Diffusers' };
  const runtime = displayNames[backend] || backend;
  if (!pkg) {
    return `${runtime} readiness unavailable for ${target.label}.`;
  }

  const state = pkg.installed ? 'ready' : 'missing';
  const headline = `${runtime} ${state} on ${target.label}`;
  const detail = pkg.status_note || pkg.update_note || '';
  return detail ? `${headline}: ${detail}` : `${headline}.`;
}
// ── Filter/sort cached model list ──
function _filterCachedList() {
@@ -99,6 +202,64 @@ function _isActivelyServing(repoId) {
} catch { return false; }
}
/**
 * Format a byte count as a short human-readable size (binary units).
 *
 * GB values keep one decimal; MB and KB are whole numbers, with a floor of
 * "1 KB" for any positive size. The unit is chosen after rounding, so a size
 * that rounds up to 1024 of a unit is promoted to the next one (e.g. one byte
 * short of 1 GiB renders as "1.0 GB", not "1024 MB").
 *
 * @param {number|string|null|undefined} bytes - Size in bytes.
 * @returns {string} Formatted size, or '' when the input is missing, non-finite, zero or negative.
 */
function _formatGgufSize(bytes) {
  const n = Number(bytes || 0);
  if (!Number.isFinite(n) || n <= 0) return '';

  const KIB = 1024;
  const MIB = 1024 ** 2;
  const GIB = 1024 ** 3;

  if (n < MIB) {
    const wholeKb = Math.max(1, Math.round(n / KIB));
    if (wholeKb < 1024) return `${wholeKb} KB`;
    // Rounded up to a full megabyte: fall through to the MB branch.
  }
  if (n < GIB) {
    const wholeMb = Math.max(1, Math.round(n / MIB));
    if (wholeMb < 1024) return `${wholeMb} MB`;
    // Rounded up to a full gigabyte: fall through to the GB branch.
  }
  return `${(n / GIB).toFixed(1)} GB`;
}
/**
 * Return the usable entries of a model's `gguf_files` list.
 *
 * @param {object} [model] - Cached-model record.
 * @returns {object[]} Entries that are truthy and carry a non-empty string `rel_path`;
 *   an empty array when `gguf_files` is not an array.
 */
function _ggufFilesForModel(model) {
  const listed = model?.gguf_files;
  if (!Array.isArray(listed)) return [];
  return listed.filter(
    (entry) => Boolean(entry) && typeof entry.rel_path === 'string' && entry.rel_path !== '',
  );
}
/**
 * Pick the GGUF files a user can actually serve.
 *
 * Files whose `role` is `model` (or unset) are preferred; when there are none,
 * every listed file is returned instead.
 *
 * @param {object} [model] - Cached-model record, forwarded to `_ggufFilesForModel`.
 * @returns {object[]} Primary model files, or all GGUF files as a fallback.
 */
function _runnableGgufFiles(model) {
  const everyFile = _ggufFilesForModel(model);
  const modelFiles = [];
  for (const file of everyFile) {
    if ((file.role || 'model') === 'model') modelFiles.push(file);
  }
  return modelFiles.length ? modelFiles : everyFile;
}
/**
 * Build the option label for one GGUF file in the file picker.
 *
 * Shape: `[<quant> ]<basename>[ (<size>[, <N> parts])][ <role>]`. The size
 * comes from `_formatGgufSize`, the part count is shown only when above 1,
 * and the role is shown only when it is set and is not `model`.
 *
 * @param {object} file - GGUF file record (`name`, `rel_path`, `size_bytes`, `quant`, `parts`, `role`).
 * @returns {string} Human-readable label.
 */
function _ggufFileLabel(file) {
  const baseName = (file.name || file.rel_path || '').split('/').pop();
  const prefix = file.quant ? `${file.quant} ` : '';

  const details = [];
  const sizeText = _formatGgufSize(file.size_bytes);
  if (sizeText) details.push(sizeText);
  const partCount = Number(file.parts || 0);
  if (partCount > 1) details.push(`${partCount} parts`);
  const detailText = details.length ? ` (${details.join(', ')})` : '';

  const roleSuffix = file.role && file.role !== 'model' ? ` ${file.role}` : '';
  return `${prefix}${baseName}${detailText}${roleSuffix}`;
}
/**
 * Turn a filesystem path into a shell expression that evaluates to that path.
 *
 * A leading `~` is rewritten to the HOME variable, because a tilde inside
 * quotes is not expanded by the shell; everything else goes through
 * `_shellQuote`. The HOME reference is double-quoted so a home directory
 * containing whitespace or glob characters is not field-split or expanded
 * when the expression is evaluated.
 *
 * @param {string|null|undefined} path - Path, optionally starting with `~` or `~/`.
 * @returns {string} Shell expression. `~user/...` forms are quoted literally, not expanded.
 */
function _shellPathExpr(path) {
  // Plain string on purpose: this is shell syntax, not a JS template placeholder.
  const homeExpr = '"${HOME}"';
  const s = String(path || '');
  if (s === '~') return homeExpr;
  if (s.startsWith('~/')) return homeExpr + _shellQuote(s.slice(1));
  return _shellQuote(s);
}
/**
 * Build the shell expression for one explicitly selected GGUF file.
 *
 * The location depends on where the model lives:
 *   - custom local dir (`is_local_dir` + `path`): `<path>/<repo>/<rel>`
 *   - custom cache dir (`path` only): `<path>/models--<org>--<name>/snapshots/<rel>`
 *   - default cache: `$HOME/.cache/huggingface/hub/models--<org>--<name>/snapshots/<rel>`
 *
 * The result is wrapped in `$(printf %s …)`. In the default-cache branch the
 * HOME reference is double-quoted, because an unquoted variable inside the
 * command substitution is field-split and a home directory containing
 * whitespace would lose those characters.
 *
 * @param {object} model - Cached-model record; `is_local_dir` and `path` are read.
 * @param {string} repo - Repo id (`org/name`).
 * @param {string} relPath - File path as listed in the model's GGUF files; leading slashes are dropped.
 * @returns {string} Shell expression, or '' when `relPath` is empty.
 */
function _selectedGgufExpr(model, repo, relPath) {
  const rel = String(relPath || '').replace(/^\/+/, '');
  if (!rel) return '';

  if (model.path) {
    const base = String(model.path).replace(/\/+$/, '');
    const fullPath = model.is_local_dir
      ? `${base}/${repo}/${rel}`
      : `${base}/models--${repo.replace(/\//g, '--')}/snapshots/${rel}`;
    return `$(printf %s ${_shellPathExpr(fullPath)})`;
  }

  const cacheRepo = repo.replace(/\//g, '--');
  const tail = _shellQuote(`/.cache/huggingface/hub/models--${cacheRepo}/snapshots/${rel}`);
  return `$(printf %s "\${HOME}"${tail})`;
}
/**
 * Build the shell expression for the directory to search for a model's GGUF files.
 *
 *   - custom local dir (`is_local_dir` + `path`): `<path>/<repo>`
 *   - custom cache dir (`path` only): `<path>/models--<org>--<name>/snapshots`
 *   - default cache: `$HOME/.cache/huggingface/hub/models--<org>--<name>/snapshots`
 *
 * Custom paths go through `_shellPathExpr` so a `~/…` path expands to the home
 * directory rather than being quoted as a literal tilde. In the default-cache
 * branch only the HOME reference is double-quoted; the repo-derived part is
 * passed through `_shellQuote`, so characters in the repo id are never
 * interpreted by the shell.
 *
 * @param {object} model - Cached-model record; `is_local_dir` and `path` are read.
 * @param {string} repo - Repo id (`org/name`).
 * @returns {string} Shell expression that evaluates to the search directory.
 */
function _ggufSearchDirExpr(model, repo) {
  if (model.path) {
    const base = String(model.path).replace(/\/+$/, '');
    if (model.is_local_dir) return _shellPathExpr(`${base}/${repo}`);
    return _shellPathExpr(`${base}/models--${repo.replace(/\//g, '--')}/snapshots`);
  }
  const cacheRepo = repo.replace(/\//g, '--');
  return `"$HOME"${_shellQuote(`/.cache/huggingface/hub/models--${cacheRepo}/snapshots`)}`;
}
function _rerenderCachedModels() {
const list = document.getElementById('hwfit-cached-list');
const tagContainer = document.getElementById('serve-tags');
@@ -131,6 +292,8 @@ function _rerenderCachedModels() {
if (m.path) {
metaParts.push(`<span style="opacity:0.7;">${esc(m.path)}</span>`);
}
const ggufCount = _runnableGgufFiles(m).length;
if (ggufCount > 1) metaParts.push(`${ggufCount} GGUFs`);
if (m.status === 'downloading') {
const _active = _isActivelyDownloading(m.repo_id);
metaParts.push(`<span class="cookbook-dl-status" style="color:var(--accent,var(--red));">${_active ? 'downloading' : 'download stalled'}</span>`);
@@ -307,7 +470,9 @@ function _rerenderCachedModels() {
// Toggle — close if already open
if (item.classList.contains('doclib-card-expanded')) {
item.querySelector('.hwfit-serve-panel')?.remove();
const existingPanel = item.querySelector('.hwfit-serve-panel');
existingPanel?._cleanupRuntimeReadiness?.();
existingPanel?.remove();
item.classList.remove('doclib-card-expanded');
item.style.flexDirection = '';
item.style.alignItems = '';
@@ -318,18 +483,14 @@ function _rerenderCachedModels() {
// Collapse any other expanded
list.querySelectorAll('.doclib-card-expanded').forEach(c => {
c.querySelector('.hwfit-serve-panel')?.remove();
const openPanel = c.querySelector('.hwfit-serve-panel');
openPanel?._cleanupRuntimeReadiness?.();
openPanel?.remove();
c.classList.remove('doclib-card-expanded');
c.style.flexDirection = '';
c.style.alignItems = '';
});
// Capture grid height
const _tb = list.closest('.admin-card')?.querySelector('.memory-toolbar');
const _tbH = _tb ? _tb.offsetHeight : 0;
list.style.minHeight = (list.offsetHeight + _tbH) + 'px';
list.style.maxHeight = (list.offsetHeight + _tbH) + 'px';
const shortName = repo.split('/').pop();
const _es = _envState;
// The venv set per-server in Settings (server.envPath). Used as the venv
@@ -350,8 +511,13 @@ function _rerenderCachedModels() {
? _byRepo[repo]
: (_lastUsed || (_isLegacyFlat ? _allSs : {}));
const detectedBackend = _detectBackend(m).backend;
const defaultBackend = detectedBackend;
const savedMatchesBackend = (ss.backend || 'vllm') === detectedBackend;
const _allowedBackends = new Set(_isWindows()
? ['llamacpp']
: (_isMetal() ? ['llamacpp', 'ollama'] : ['vllm', 'sglang', 'llamacpp', 'ollama', 'diffusers']));
const defaultBackend = (ss._forceBackend && ss.backend && _allowedBackends.has(ss.backend))
? ss.backend
: detectedBackend;
const savedMatchesBackend = !!ss._forceBackend || (ss.backend || 'vllm') === detectedBackend;
const sv = (k, def) => (ss[k] !== undefined && savedMatchesBackend) ? ss[k] : def;
const defaultTp = defaultBackend === 'llamacpp' ? '1' : sv('tp', '1');
const detectedGpuIds = _allGpuIds(_getGpuToggleTotal?.());
@@ -362,7 +528,16 @@ function _rerenderCachedModels() {
: (_es.gpus || detectedGpuIds));
const tpOpts = [1,2,4,8].map(n => `<option${defaultTp==String(n)?' selected':''}>${n}</option>`).join('');
const dtypeOpts = ['auto','float16','bfloat16'].map(d => `<option value="${d}"${sv('dtype','auto')===d?' selected':''}>${d}</option>`).join('');
const vllmKvCacheOpts = ['auto','fp8'].map(d => `<option value="${d}"${sv('vllm_kv_cache_dtype','auto')===d?' selected':''}>${d}</option>`).join('');
const _l = (name, tip) => `<span>${name}<span class="hwfit-hint" title="${tip}">?</span></span>`;
const _ggufChoices = _runnableGgufFiles(m);
const _savedGguf = String(sv('gguf_file', '') || '');
const _defaultGguf = _ggufChoices.some(f => f.rel_path === _savedGguf)
? _savedGguf
: (_ggufChoices[0]?.rel_path || '');
const _ggufOptions = _ggufChoices.map(f =>
`<option value="${esc(f.rel_path)}"${f.rel_path === _defaultGguf ? ' selected' : ''}>${esc(_ggufFileLabel(f))}</option>`
).join('');
// Build save slots
const _allPresets = _loadPresets();
const _repoShort = repo.split('/').pop();
@@ -372,10 +547,16 @@ function _rerenderCachedModels() {
// load, × to delete) plus a "Save current config" row — see _showSavedConfigMenu.
// Split button: "Save" saves the current config directly; the arrow opens
// the dropdown of saved configs (load / delete). Arrow shows the count.
// The arrow button shows just the saved-config count next to a "▾".
// Spell out what the number means in the tooltip so users don't have
// to click it to find out the badge isn't a notification dot.
const _arrowLabel = _modelPresets.length > 0 ? `${_modelPresets.length}` : '▾';
const _arrowTitle = _modelPresets.length > 0
? `${_modelPresets.length} saved launch config${_modelPresets.length === 1 ? '' : 's'} for ${_repoShort} — click ▾ to load or delete`
: `No saved launch configs for ${_repoShort} yet — click Save to add one`;
let _slotsHtml = `<div class="cookbook-serve-slots cookbook-saved-split">`
+ `<button type="button" class="cookbook-slot-btn cookbook-saved-save" title="Save current config"><svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M19 21H5a2 2 0 0 1-2-2V5a2 2 0 0 1 2-2h11l5 5v11a2 2 0 0 1-2 2z"/><polyline points="17 21 17 13 7 13 7 21"/><polyline points="7 3 7 8 15 8"/></svg>Save</button>`
+ `<button type="button" class="cookbook-slot-btn cookbook-saved-arrow" title="Saved launch configs">${_arrowLabel}</button>`
+ `<button type="button" class="cookbook-slot-btn cookbook-saved-arrow" title="${esc(_arrowTitle)}">${_arrowLabel}</button>`
+ `</div>`;
let panelHtml = `<div class="hwfit-serve-panel">${_slotsHtml}`;
@@ -403,6 +584,14 @@ function _rerenderCachedModels() {
}
panelHtml += `<label>${_l('GPUs','Toggle which GPUs to use')}<div class="cookbook-gpu-group">${_gpuBtnsHtml}</div><input type="hidden" class="hwfit-sf" data-field="gpus" value="${esc(defaultGpus)}" /></label>`;
panelHtml += `</div>`;
panelHtml += `<div class="hwfit-serve-runtime-note" style="display:none;font-size:11px;line-height:1.35;color:var(--fg-muted);margin-top:-4px;"></div>`;
if (_ggufChoices.length > 1) {
panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
panelHtml += `<label class="hwfit-backend-llamacpp">${_l('GGUF File','Choose the exact GGUF artifact to serve from this cached model folder.')}<select class="hwfit-sf hwfit-sf-wide" data-field="gguf_file">${_ggufOptions}</select></label>`;
panelHtml += `</div>`;
} else if (_defaultGguf) {
panelHtml += `<input type="hidden" class="hwfit-sf" data-field="gguf_file" value="${esc(_defaultGguf)}" />`;
}
// Row 2: Core settings
panelHtml += `<div class="hwfit-serve-row hwfit-backend-vllm hwfit-backend-sglang hwfit-backend-llamacpp">`;
panelHtml += `<label class="hwfit-backend-vllm hwfit-backend-sglang">${_l('TP','Tensor Parallelism — split model across N GPUs')}<select class="hwfit-sf" data-field="tp">${tpOpts}</select></label>`;
@@ -414,6 +603,7 @@ function _rerenderCachedModels() {
panelHtml += `<label class="hwfit-backend-vllm">${_l('Swap','CPU swap space in GB. Leave empty to omit (removed in newer vLLM)')}<input type="text" class="hwfit-sf" data-field="swap" value="${esc(sv('swap', ''))}" placeholder="off" /></label>`;
panelHtml += `<label class="hwfit-backend-vllm hwfit-backend-sglang">${_l('Max Seqs','Maximum concurrent requests. Lower = less memory. Default 8 — prosumer GPUs often OOM on vLLM default 256 during CUDA graph capture.')}<input type="text" class="hwfit-sf" data-field="max_seqs" value="${esc(sv('max_seqs', '8'))}" placeholder="8" /></label>`;
panelHtml += `<label>${_l('Dtype','Data type for weights. auto picks best for GPU')}<select class="hwfit-sf" data-field="dtype">${dtypeOpts}</select></label>`;
panelHtml += `<label class="hwfit-backend-vllm">${_l('KV Cache','vLLM --kv-cache-dtype. auto uses the model/runtime default; fp8 reduces KV memory for long context.')}<select class="hwfit-sf" data-field="vllm_kv_cache_dtype">${vllmKvCacheOpts}</select></label>`;
panelHtml += `</div>`;
// Row 2b: Diffusers settings
const diffDtypeOpts = ['bfloat16','float16','float32'].map(d => `<option value="${d}"${sv('diff_dtype','bfloat16')===d?' selected':''}>${d}</option>`).join('');
@@ -432,9 +622,47 @@ function _rerenderCachedModels() {
panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="prefix_cache"${sv('prefix_cache',false)?' checked':''} /> Prefix Caching${_h('Cache shared prompt prefixes across requests')}</label>`;
panelHtml += `<label class="hwfit-sf-cb hwfit-backend-vllm"><input type="checkbox" class="hwfit-sf" data-field="auto_tool"${sv('auto_tool',false)?' checked':''} /> Auto Tool Choice${_h('Enable function/tool calling for agent mode')}</label>`;
panelHtml += `</div>`;
// Row 2c: llama.cpp fit/perf flags (set by Auto profiles, editable by hand)
const _kvOpts = ['', 'q4_0', 'q8_0', 'f16'].map(k => `<option value="${k}"${sv('cache_type','')===k?' selected':''}>${k||'default'}</option>`).join('');
const llamaFitOpts = ['', 'off', 'on'].map(d => `<option value="${d}"${sv('llama_fit','')===d?' selected':''}>${d||'default'}</option>`).join('');
const llamaSplitModeOpts = ['', 'layer', 'tensor', 'row', 'none'].map(d => `<option value="${d}"${sv('llama_split_mode','')===d?' selected':''}>${d||'default'}</option>`).join('');
panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
panelHtml += `<label>${_l('CPU MoE','n-cpu-moe: number of MoE expert layers to run on CPU when the model is bigger than VRAM. 0 = all on GPU. Set automatically by the Auto profiles below.')}<input type="text" class="hwfit-sf" data-field="n_cpu_moe" value="${esc(sv('n_cpu_moe',''))}" placeholder="0" style="width:54px;" /></label>`;
panelHtml += `<label>${_l('KV Cache','cache-type-k/v: quantize the KV cache. q4_0 = smallest (more context), q8_0 = sharp long-context, f16 = full. Blank = llama.cpp default.')}<select class="hwfit-sf" data-field="cache_type">${_kvOpts}</select></label>`;
panelHtml += `<label class="hwfit-sf-cb" style="align-self:end;"><input type="checkbox" class="hwfit-sf" data-field="flash_attn"${sv('flash_attn',false)?' checked':''} /> Flash Attn${_h('--flash-attn on: faster attention + needed for quantized KV cache.')}</label>`;
panelHtml += `<label class="hwfit-sf-cb" style="align-self:end;"><input type="checkbox" class="hwfit-sf" data-field="vision"${sv('vision',false)?' checked':''} /> Vision${_h('Serve with the vision encoder so the model can read images. Auto-finds an mmproj-*.gguf next to the model (download one into the model folder). Adds ~1 GB VRAM + a small per-image cost.')}</label>`;
panelHtml += `<label>${_l('Fit','llama.cpp --fit. Leave default unless you need explicit off/on behavior for a preset.')}<select class="hwfit-sf" data-field="llama_fit">${llamaFitOpts}</select></label>`;
panelHtml += `</div>`;
// Row 2d: native llama-server placement/runtime controls. These are
// explicit overrides for known-good advanced presets; blank keeps
// llama.cpp/profile defaults.
panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
panelHtml += `<label>${_l('Split Mode','llama.cpp GPU placement. layer is the usual default; tensor splits weights and KV across GPUs.')}<select class="hwfit-sf" data-field="llama_split_mode">${llamaSplitModeOpts}</select></label>`;
panelHtml += `<label>${_l('Tensor Split','GPU proportions for llama.cpp, e.g. 50,50 across two visible GPUs. Leave blank for auto.')}<input type="text" class="hwfit-sf" data-field="llama_tensor_split" value="${esc(sv('llama_tensor_split', ''))}" placeholder="50,50" /></label>`;
panelHtml += `<label>${_l('Main GPU','llama.cpp --main-gpu index inside the visible GPU set. Mostly useful for split mode none/row.')}<input type="text" class="hwfit-sf" data-field="llama_main_gpu" value="${esc(sv('llama_main_gpu', ''))}" placeholder="auto" /></label>`;
panelHtml += `<label>${_l('Parallel','llama.cpp parallel slots. Leave blank for llama.cpp default; 1 matches single-lane presets.')}<input type="text" class="hwfit-sf" data-field="llama_parallel" value="${esc(sv('llama_parallel', ''))}" placeholder="1" /></label>`;
panelHtml += `<label>${_l('Batch','llama.cpp prompt batch size. Leave blank for llama.cpp default.')}<input type="text" class="hwfit-sf" data-field="llama_batch_size" value="${esc(sv('llama_batch_size', ''))}" placeholder="2048" /></label>`;
panelHtml += `<label>${_l('UBatch','llama.cpp physical micro-batch size. Leave blank for llama.cpp default.')}<input type="text" class="hwfit-sf" data-field="llama_ubatch_size" value="${esc(sv('llama_ubatch_size', ''))}" placeholder="512" /></label>`;
panelHtml += `</div>`;
// Row 2e: Auto profiles — computed from detected hardware (see profiles.py).
// Buttons are injected after the panel mounts (needs an async fetch).
panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp hwfit-serve-profiles" style="align-items:center;gap:8px;">`;
panelHtml += `<span style="opacity:0.7;font-size:11px;">Auto profiles:</span>`;
panelHtml += `<span class="hwfit-profile-btns" style="display:flex;gap:6px;flex-wrap:wrap;"><span style="opacity:0.5;font-size:11px;">computing…</span></span>`;
panelHtml += `</div>`;
// Live VRAM / RAM-spillover monitor for the serve target's GPU. Polls
// /api/cookbook/gpus while the panel is open so you can SEE whether the
// config fits VRAM (fast) or spills to system RAM (slow). Populated after mount.
panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp hwfit-vram-monitor" style="align-items:center;gap:8px;font-size:11px;">`;
panelHtml += `<span style="opacity:0.7;">GPU memory:</span>`;
panelHtml += `<span class="hwfit-vram-readout" style="opacity:0.5;">checking…</span>`;
panelHtml += `</div>`;
// Row 3a: Checkboxes (llama.cpp-only)
panelHtml += `<div class="hwfit-serve-checks hwfit-backend-llamacpp">`;
panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="unified_mem"${sv('unified_mem',false)?' checked':''} /> Unified Memory${_h('For AMD APUs / Strix Halo: exports GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 so llama.cpp can address the full BIOS VRAM carveout instead of the default ~28 GB cap. No-op on discrete GPUs.')}</label>`;
panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="llama_no_mmap"${sv('llama_no_mmap',false)?' checked':''} /> No mmap${_h('Adds --no-mmap for native llama-server. Useful for some high-context/local-storage setups, but not a universal default.')}</label>`;
panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="llama_no_warmup"${sv('llama_no_warmup',false)?' checked':''} /> Skip warmup${_h('Adds --no-warmup. Can reduce startup memory spikes for tight launches, but llama.cpp defaults to warming up.')}</label>`;
panelHtml += `<label class="hwfit-sf-cb hwfit-spec-group"><input type="checkbox" class="hwfit-sf" data-field="llama_speculative_mtp"${sv('llama_speculative_mtp',false)?' checked':''} /> MTP Spec${_h('llama.cpp native MTP speculative decoding: --spec-type draft-mtp. Requires a GGUF with MTP heads and a recent llama-server build.')} <span class="hwfit-numstep"><button type="button" class="hwfit-numstep-btn" data-step="-1" tabindex="-1" aria-label="Decrease"></button><input type="number" class="hwfit-sf hwfit-spec-tokens" data-field="llama_spec_tokens" value="${esc(sv('llama_spec_tokens', '3'))}" min="1" max="10" title="--spec-draft-n-max" /><button type="button" class="hwfit-numstep-btn" data-step="1" tabindex="-1" aria-label="Increase"></button></span></label>`;
panelHtml += `</div>`;
// Row 3b: Checkboxes (diffusers)
panelHtml += `<div class="hwfit-serve-checks hwfit-backend-diffusers">`;
@@ -500,9 +728,10 @@ function _rerenderCachedModels() {
item.classList.add('doclib-card-expanded');
item.style.flexDirection = 'column';
item.style.alignItems = 'stretch';
if (list) list.scrollTop = 0;
item.insertAdjacentHTML('beforeend', panelHtml);
const panel = item.querySelector('.hwfit-serve-panel');
// Scroll the serve panel into view within its nearest scrollable ancestor
requestAnimationFrame(() => panel.scrollIntoView({ block: 'nearest', behavior: 'smooth' }));
// Build command preview
function updateCmd() {
@@ -514,19 +743,27 @@ function _rerenderCachedModels() {
const backend = f.backend || 'vllm';
const serveModel = m.is_local_dir && m.path ? `${m.path}/${repo}` : repo;
if (backend === 'llamacpp') {
const ggufChoices = _runnableGgufFiles(m);
const selectedGguf = ggufChoices.find(file => file.rel_path === f.gguf_file);
// For multi-part GGUFs, llama.cpp requires the first split
// (-00001-of-NNNNN.gguf). Prefer it (sorted, so UD-IQ4_XS/001 comes
// before Q4_K_M/001 etc); fall back to any single GGUF sorted.
// Use $HOME (not ~) so tilde survives variable interpolation inside $(...).
const dir = `"$HOME/.cache/huggingface/hub/models--${repo.replace(/\//g, '--')}/snapshots"`;
const dir = _ggufSearchDirExpr(m, repo);
// GGUF needs the actual .gguf FILE, not the folder. For a custom-dir
// model the file lives under "<path>/<repo>" — search there just like we
// search the HF snapshots dir, so serving a GGUF from a custom dir works
// instead of handing llama.cpp a directory (which fails).
const _ldir = `"${m.path}/${repo}"`;
f._gguf_path = m.is_local_dir && m.path
const _ldir = m.path ? _shellQuote(`${m.path}/${repo}`) : '""';
f._gguf_path = selectedGguf
? _selectedGgufExpr(m, repo, selectedGguf.rel_path)
: m.is_local_dir && m.path
? `$({ find ${_ldir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${_ldir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`
: `$({ find ${dir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${dir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`;
// Vision: auto-find the mmproj (CLIP/projector) file in the same dir.
// Resolved at runtime so the toggle just works if an mmproj-*.gguf is
// present (downloaded alongside the model). Empty if none → cmd omits it.
const _vsearchdir = (m.is_local_dir && m.path) ? _ldir : dir;
f._mmproj_path = `$(find ${_vsearchdir} -iname 'mmproj*.gguf' 2>/dev/null | sort | head -1)`;
}
if (f.reasoning_parser) {
const _rpEl2 = panel.querySelector('[data-field="reasoning_parser"]');
@@ -541,6 +778,151 @@ function _rerenderCachedModels() {
}
updateCmd();
// Context clamp. Two ceilings:
// - ABSOLUTE_CTX_MAX: a hard sanity cap (no LLM trains past ~1M tokens),
// so an obvious typo like 16000000 can never reach llama.cpp even when
// we don't know the model's real limit (not in catalog / profiles
// fetch failed). This is what stops the radv ErrorDeviceLost crash.
// - panel._modelCtxMax: the model's actual trained limit (set by the
// profiles fetch below) — a tighter, model-specific cap when known.
const ABSOLUTE_CTX_MAX = 1048576; // 1M tokens — above any real n_ctx_train
const _ctxEl0 = panel.querySelector('[data-field="ctx"]');
/**
 * Clamp the context-length field to the active ceiling.
 *
 * The ceiling is `panel._modelCtxMax` when it is positive, otherwise
 * `ABSOLUTE_CTX_MAX`. When the field holds a number above the ceiling it is
 * rewritten to the ceiling, its tooltip explains which limit applied, and the
 * command preview is rebuilt. Non-numeric or in-range values are left alone.
 *
 * @param {boolean} announce - When truthy, also show a toast about the cap.
 */
function _clampCtx(announce) {
  if (!_ctxEl0) return;

  const hasModelLimit = panel._modelCtxMax > 0;
  const ceiling = hasModelLimit ? panel._modelCtxMax : ABSOLUTE_CTX_MAX;
  const requested = parseInt(_ctxEl0.value, 10);
  if (!Number.isFinite(requested) || !(requested > ceiling)) return;

  const reason = hasModelLimit ? "this model's trained limit" : "the maximum sane context";
  _ctxEl0.value = String(ceiling);
  _ctxEl0.title = `Capped to ${reason} (${ceiling}).`;
  if (announce) uiModule.showToast(`Context capped to ${ceiling}`);
  updateCmd();
}
if (_ctxEl0) {
_ctxEl0.addEventListener('change', () => _clampCtx(false));
_ctxEl0.addEventListener('blur', () => _clampCtx(false));
_clampCtx(false); // fix any stale/preset value already present
}
// Auto profiles — fetch hardware-computed llama.cpp profiles and render
// them as clickable chips. Clicking one fills the ctx/CPU-MoE/KV/flash
// fields and rebuilds the command. Computed from detected VRAM (see
// services/hwfit/profiles.py); rough on t/s, accurate on fit.
/**
 * Fetch the hardware-computed llama.cpp profiles for this model and render
 * them as clickable chips inside `.hwfit-profile-btns`.
 *
 * Side effects: may set `panel._modelCtxMax` (and re-run the context clamp)
 * from the response's `model_ctx_max`; clicking a chip fills the `ctx`,
 * `n_cpu_moe`, `cache_type` and `flash_attn` fields and rebuilds the command.
 * Any failure — network error, non-OK HTTP status, unparsable body — is
 * logged and shown as "profile compute failed".
 *
 * @returns {Promise<void>}
 */
async function _loadServeProfiles() {
  const wrap = panel.querySelector('.hwfit-profile-btns');
  if (!wrap) return;
  try {
    const host = (_es.remoteHost || '').trim();
    const params = new URLSearchParams({ model: repo });
    if (host) {
      params.set('host', host);
      const _sp = (_es.servers || []).find(s => s.host === host)?.port;
      if (_sp) params.set('ssh_port', _sp);
    }
    // SERVE mode: this is a specific GGUF file already on disk, so its quant
    // is fixed — tell the profiler the file's real size + quant so it varies
    // only the serving knobs (KV/ctx/offload), not the quant. The size is
    // parsed from m.size (e.g. "20.6 GB") and the quant from the repo id.
    const _sizeMatch = String(m.size || '').match(/([\d.]+)\s*GB/i);
    if (_sizeMatch) params.set('serve_weights_gb', _sizeMatch[1]);
    const _qMatch = String(repo).match(/(Q\d[\w]*|IQ\d[\w]*|F16|BF16|FP8)/i);
    if (_qMatch) params.set('serve_quant', _qMatch[1]);
    const res = await fetch(`/api/hwfit/profiles?${params}`);
    // Without this check an error response with a JSON body would fall
    // through and be reported as "no auto profile for this model".
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    const data = await res.json();
    // Remember the model's trained context limit and clamp the ctx field
    // to it — asking llama.cpp for ctx > n_ctx_train overflows and, with a
    // quantized KV cache, can crash the GPU (radv ErrorDeviceLost).
    const ctxMax = Number(data && data.model_ctx_max) || 0;
    if (ctxMax > 0) {
      panel._modelCtxMax = ctxMax; // tighten the clamp to the real limit
      _clampCtx(false); // re-apply now that we know the model's max
    }
    const profs = (data && Array.isArray(data.profiles)) ? data.profiles : [];
    if (!profs.length) {
      wrap.innerHTML = `<span style="opacity:0.5;font-size:11px;">no auto profile for this model</span>`;
      return;
    }
    wrap.innerHTML = '';
    for (const p of profs) {
      const b = document.createElement('button');
      b.type = 'button';
      b.className = 'cookbook-btn hwfit-profile-chip';
      b.style.cssText = 'height:24px;padding:0 9px;font-size:11px;';
      const off = p.offloads ? `, ncm${p.n_cpu_moe}` : ', all-GPU';
      // textContent/title (not innerHTML): profile fields come from the server.
      b.textContent = `${p.label} · ${p.quant} · ${Math.round(p.ctx/1024)}k${off}`;
      b.title = `${p.note}\nKV ${p.cache_type}, ~${p.est_vram_gb} GB VRAM`;
      b.addEventListener('click', () => {
        const set = (field, val) => {
          const el = panel.querySelector(`[data-field="${field}"]`);
          if (!el) return;
          if (el.type === 'checkbox') el.checked = !!val; else el.value = val;
        };
        set('ctx', p.ctx);
        set('n_cpu_moe', p.n_cpu_moe || '');
        set('cache_type', p.cache_type || '');
        set('flash_attn', true); // required for a quantized KV cache
        wrap.querySelectorAll('.hwfit-profile-chip').forEach(x => x.classList.remove('cookbook-btn-active'));
        b.classList.add('cookbook-btn-active');
        updateCmd();
      });
      wrap.appendChild(b);
    }
  } catch (err) {
    // Best-effort feature: keep the panel usable, but leave a trace for debugging.
    console.warn('Auto profile fetch failed:', err);
    wrap.innerHTML = `<span style="opacity:0.5;font-size:11px;">profile compute failed</span>`;
  }
}
_loadServeProfiles();
// Live GPU-memory monitor: poll /api/cookbook/gpus and show VRAM usage +
// RAM-spillover, with a plain-language health/speed hint. Lets you tell at
// a glance whether the chosen config fits VRAM (fast) or is paging into
// system RAM over PCIe (slow). AMD sysfs reports gtt_used_mb for spillover.
/**
 * Refresh the GPU-memory readout once from `/api/cookbook/gpus`.
 *
 * Shows used / total / free memory for the first reported GPU, coloured by
 * usage, plus a spill warning when `gtt_used_mb` exceeds 0.5 GB on a GPU that
 * is not flagged `unified_memory`. Any failure — network error, non-OK HTTP
 * status, unparsable body — renders "unavailable".
 *
 * @returns {Promise<boolean>} False when the readout element is no longer in
 *   the document (the caller uses this to stop polling); true otherwise.
 */
async function _refreshVramMonitor() {
  const el = panel.querySelector('.hwfit-vram-readout');
  if (!el || !document.body.contains(el)) return false; // panel closed → stop
  try {
    const host = (_es.remoteHost || '').trim();
    const params = new URLSearchParams();
    if (host) {
      params.set('host', host);
      const _sp = (_es.servers || []).find(s => s.host === host)?.port;
      if (_sp) params.set('ssh_port', _sp);
    }
    const res = await fetch('/api/cookbook/gpus' + (params.toString() ? '?' + params : ''));
    // Without this check an error response with a JSON body would fall
    // through and be reported as "no GPU detected".
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    const data = await res.json();
    const gpus = Array.isArray(data) ? data : (data.gpus || []);
    if (!gpus.length) {
      el.textContent = 'no GPU detected';
      el.style.color = '';
      return true;
    }
    const g = gpus[0];
    const usedG = (g.used_mb / 1024), totG = (g.total_mb / 1024);
    const pct = totG ? Math.round((usedG / totG) * 100) : 0;
    const freeG = Math.max(0, totG - usedG);
    const spillG = (g.gtt_used_mb || 0) / 1024;
    // Color: green < 85%, amber 85-97%, red > 97% or spilling.
    const spilling = spillG > 0.5 && !g.unified_memory; // unified APUs always use GTT; not a spill
    let color = 'var(--green, #50fa7b)';
    if (pct >= 97 || spilling) color = 'var(--red, #ff5555)';
    else if (pct >= 85) color = 'var(--orange, #ffb86c)';
    let txt = `${usedG.toFixed(1)} / ${totG.toFixed(1)} GB (${pct}%) · ${freeG.toFixed(1)} GB free`;
    if (spilling) {
      txt += ` · ⚠ ${spillG.toFixed(1)} GB spilled to RAM — slow (raise CPU MoE or lower context)`;
    } else if (pct >= 90) {
      txt += ` · tight — risk of OOM/spill on long context or images`;
    } else {
      txt += ` · healthy`;
    }
    el.textContent = txt;
    el.style.color = color;
    return true;
  } catch {
    // Deliberately quiet: this runs on a polling timer, so the readout itself
    // is the error report. Returning true keeps polling so it can recover.
    el.textContent = 'unavailable';
    el.style.color = '';
    return true;
  }
}
// Paint the readout once immediately. _refreshVramMonitor reports its own
// request failures in the readout, so the promise is intentionally not awaited.
_refreshVramMonitor();
// Poll every 4s while the panel is open; stop when it's removed from the DOM.
// setInterval does not wait for an async callback, so a GPU query slower than
// the interval would otherwise stack up requests and let an older response
// overwrite a newer one. Skip the tick while a poll is still in flight.
let _vramPollInFlight = false;
const _vramTimer = setInterval(async () => {
  if (_vramPollInFlight) return;
  _vramPollInFlight = true;
  try {
    const ok = await _refreshVramMonitor();
    if (ok === false) clearInterval(_vramTimer);
  } finally {
    _vramPollInFlight = false;
  }
}, 4000);
// Show/hide backend-specific sections
function updateBackendVisibility() {
const b = panel.querySelector('[data-field="backend"]')?.value || 'vllm';
@@ -551,6 +933,38 @@ function _rerenderCachedModels() {
}
updateBackendVisibility();
// Update the runtime-readiness note for the currently selected backend.
//
// For vllm/sglang/llamacpp/diffusers it shows a "Checking…" placeholder, asks
// _fetchServeRuntimePackage(panel, backend) for { pkg, target }, then renders
// _runtimeNoteText(backend, pkg, target) — red when pkg.installed is falsy,
// muted otherwise. For any other backend the note is hidden and cleared.
//
// Calls can overlap (backend, venv and server changes all trigger one), so
// each call takes a sequence number stored on the panel and only the newest
// call is allowed to write its result.
async function updateRuntimeReadinessNote() {
  const note = panel.querySelector('.hwfit-serve-runtime-note');
  if (!note) return;
  const backend = panel.querySelector('[data-field="backend"]')?.value || 'vllm';
  // Take the sequence number before any early return so that switching to an
  // unsupported backend also invalidates a check that is still pending.
  const seq = (panel._runtimeReadinessSeq || 0) + 1;
  panel._runtimeReadinessSeq = seq;
  if (!['vllm', 'sglang', 'llamacpp', 'diffusers'].includes(backend)) {
    note.style.display = 'none';
    note.textContent = '';
    return;
  }
  note.style.display = '';
  note.textContent = 'Checking runtime on selected server...';
  // Reset the colour so the placeholder does not inherit the red left behind
  // by a previous "not installed" result.
  note.style.color = 'var(--fg-muted)';
  try {
    const { pkg, target } = await _fetchServeRuntimePackage(panel, backend);
    if (panel._runtimeReadinessSeq !== seq) return; // superseded by a newer call
    note.textContent = _runtimeNoteText(backend, pkg, target);
    note.style.color = pkg?.installed ? 'var(--fg-muted)' : 'var(--red)';
  } catch (err) {
    if (panel._runtimeReadinessSeq !== seq) return; // superseded by a newer call
    note.textContent = `Runtime readiness unavailable: ${err?.message || err}`;
    note.style.color = 'var(--fg-muted)';
  }
}
// Run the readiness check once for the panel's initial state.
updateRuntimeReadinessNote();
// Re-run the check whenever the server dropdown changes. Two element ids are
// tried; the first one present in the document is used.
const runtimeServerSelect = document.getElementById('hwfit-server-select') || document.getElementById('hwfit-dl-server');
if (runtimeServerSelect) {
// Keep a named handler so the exact same function can be removed later.
const refreshRuntimeOnServerChange = () => updateRuntimeReadinessNote();
runtimeServerSelect.addEventListener('change', refreshRuntimeOnServerChange);
// The dropdown is looked up on the document rather than inside the panel, so
// the listener is not removed along with the panel; expose a cleanup hook on
// the panel for whoever tears it down.
panel._cleanupRuntimeReadiness = () => runtimeServerSelect.removeEventListener('change', refreshRuntimeOnServerChange);
}
// Wire save slots
function _loadSlotIntoPanel(slotIdx) {
const presets = _loadPresets();
@@ -580,7 +994,17 @@ function _rerenderCachedModels() {
gpu_mem: _ex(/--gpu-memory-utilization\s+([\d.]+)/) || '0.90',
swap: _ex(/--swap-space\s+(\d+)/) || '',
dtype: _ex(/--dtype\s+(\w+)/) || 'auto',
vllm_kv_cache_dtype: _ex(/--kv-cache-dtype\s+([\w.-]+)/) || 'auto',
max_seqs: _ex(/--max-num-seqs\s+(\d+)/) || '',
cache_type: _ex(/(?:--cache-type-k|-ctk)\s+(\S+)/) || '',
llama_fit: _ex(/(?:--fit|-fit)\s+(on|off)/) || '',
llama_split_mode: _ex(/(?:--split-mode|-sm)\s+(none|layer|row|tensor)/) || '',
llama_tensor_split: _ex(/(?:--tensor-split|-ts)\s+([0-9.,]+)/) || '',
llama_main_gpu: _ex(/(?:--main-gpu|-mg)\s+(\d+)/) || '',
llama_parallel: _ex(/(?:--parallel|-np)\s+(\d+)/) || '',
llama_batch_size: _ex(/(?:--batch-size|-b)\s+(\d+)/) || '',
llama_ubatch_size: _ex(/(?:--ubatch-size|-ub)\s+(\d+)/) || '',
llama_spec_tokens: _ex(/--spec-draft-n-max\s+(\d+)/) || '3',
venv: p.envPath || '',
};
const checks = {
@@ -588,6 +1012,11 @@ function _rerenderCachedModels() {
trust_remote: cmd.includes('--trust-remote-code'),
prefix_cache: cmd.includes('--enable-prefix-caching'),
auto_tool: cmd.includes('--enable-auto-tool-choice'),
flash_attn: /--flash-attn\s+on\b/.test(cmd),
unified_mem: /GGML_CUDA_ENABLE_UNIFIED_MEMORY=1/.test(cmd),
llama_no_mmap: /--no-mmap\b/.test(cmd),
llama_no_warmup: /--no-warmup\b/.test(cmd),
llama_speculative_mtp: /--spec-type\s+\S*draft-mtp/.test(cmd),
speculative: cmd.includes('--speculative-config'),
};
const _specMatch = cmd.match(/--speculative-config\s+'?\{[^}]*"method"\s*:\s*"([^"]+)"[^}]*"num_speculative_tokens"\s*:\s*(\d+)/);
@@ -619,16 +1048,21 @@ function _rerenderCachedModels() {
const _gf = panel.querySelector('[data-field="gpus"]');
if (_gf) _gf.value = activeGpus.join(',');
updateBackendVisibility();
updateRuntimeReadinessNote();
updateCmd();
panel.querySelectorAll('.cookbook-slot-btn').forEach(b => b.classList.remove('active'));
panel.querySelector(`.cookbook-slot-btn[data-slot="${slotIdx}"]`)?.classList.add('active');
}
// Keep the arrow button's count + tooltip in sync with the stored presets.
// Refresh the saved-presets arrow button for the current model: its label is
// the number of stored presets (or '▾' when there are none) and its tooltip
// explains what the button offers. Does nothing when the button is absent.
function _updateSavedToggleLabel() {
  const n = _presetsForModel(_loadPresets(), repo).length;
  const t = panel.querySelector('.cookbook-saved-arrow');
  if (!t) return;
  // Label is set exactly once; the earlier duplicate guarded assignment was
  // redundant with this one.
  t.textContent = n > 0 ? `${n}` : '▾';
  t.title = n > 0
    ? `${n} saved launch config${n === 1 ? '' : 's'} for ${_repoShort} — click ▾ to load or delete`
    : `No saved launch configs for ${_repoShort} yet — click Save to add one`;
}
// Save the current panel fields as a new named preset (shared by the menu's
@@ -1154,6 +1588,10 @@ function _rerenderCachedModels() {
const extraEl = panel.querySelector('[data-field="extra"]');
if (extraEl) extraEl.value = '';
updateBackendVisibility();
updateRuntimeReadinessNote();
}
if (e.target.dataset.field === 'venv') {
updateRuntimeReadinessNote();
}
updateCmd();
});
@@ -1185,6 +1623,7 @@ function _rerenderCachedModels() {
// "back out" affordance next to Launch.
panel.querySelector('.hwfit-serve-cancel')?.addEventListener('click', (ev) => {
ev.stopPropagation();
panel._cleanupRuntimeReadiness?.();
panel.remove();
item.classList.remove('doclib-card-expanded');
item.style.flexDirection = '';
@@ -1195,6 +1634,12 @@ function _rerenderCachedModels() {
// Launch button
panel.querySelector('.hwfit-serve-launch').addEventListener('click', async (ev) => {
const _launchBtn = ev.currentTarget;
// Final safety net: never launch with ctx beyond the model's trained
// limit (or the absolute sanity ceiling when the limit is unknown). A
// stale preset or typo (e.g. 16000000) overflows and, with a quantized
// KV cache, can crash the GPU. Skip only if the user hand-edited the raw
// command (then we respect their literal text).
if (!_cmdManuallyEdited) _clampCtx(true);
if (!_cmdManuallyEdited) updateCmd();
const launchCmd = _cmdTextarea ? _cmdTextarea.value.trim() : panel._cmd;
const serveState = {};
@@ -1202,7 +1647,16 @@ function _rerenderCachedModels() {
if (el.type === 'checkbox') serveState[el.dataset.field] = el.checked;
else serveState[el.dataset.field] = el.value;
});
serveState.backend = (_detectBackend(m).backend) || serveState.backend || 'vllm';
serveState.backend = serveState.backend || (_detectBackend(m).backend) || 'vllm';
const backendWarning = _serveBackendWarning(m, repo, serveState.backend, serveState);
if (backendWarning) {
await window.styledConfirm(backendWarning.body, {
title: backendWarning.title,
confirmText: 'Edit settings',
cancelText: 'Close',
});
return;
}
// Save in the { _byRepo, _lastUsed } schema — no legacy flat keys at
// the root so per-model state doesn't leak between models.
try {
@@ -1515,7 +1969,10 @@ export async function _fetchCachedModels() {
const data = await res.json();
_dlWp.destroy();
const ready = data.models.filter(m => m.status === 'ready' && (m.backend === 'ollama' || !m.size.includes('MB')));
// CHANGELOG: 'ready' already excludes partial downloads;
// show every complete model regardless of size/backend.
const ready = data.models.filter(m => m.status === 'ready');
const downloading = data.models.filter(m => m.status === 'downloading');
const allModels = [...ready, ...downloading];
_cachedAllModels = allModels;