mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 02:05:22 -04:00
Merge remote-tracking branch 'origin/main' into visual-pr-playground
# Conflicts: # routes/cookbook_routes.py # routes/hwfit_routes.py # services/hwfit/fit.py # services/hwfit/models.py # static/js/cookbook-diagnosis.js # static/js/cookbook-hwfit.js # static/js/cookbook.js # static/js/cookbookRunning.js
This commit is contained in:
+229
-44
@@ -223,11 +223,20 @@ function _detectModelOptimizations(modelName) {
|
||||
return opts;
|
||||
}
|
||||
|
||||
/** Detect the right vLLM tool-call-parser based on model name */
|
||||
/** Detect the right vLLM tool-call-parser based on model name.
|
||||
* Qwen tool-call formats split by generation:
|
||||
* - Qwen3-Coder → qwen3_coder (XML <tool_call> with named params)
|
||||
* - Qwen3 (non-coder) → qwen3_xml (reasoning/instruct, XML wrapper)
|
||||
* - Qwen2.5 / Qwen2 / 1.5 → hermes (Qwen2.5 was trained on Hermes format)
|
||||
* Catching "qwen" first and labelling everything qwen3_xml breaks tool
|
||||
* calls on the Qwen2.5 line (the model emits hermes-style which the
|
||||
* qwen3_xml parser doesn't recognise, so the call leaks through as text).
|
||||
*/
|
||||
export function _detectToolParser(modelName) {
|
||||
const n = (modelName || '').toLowerCase();
|
||||
if (n.includes('qwen3') && n.includes('coder')) return 'qwen3_coder';
|
||||
if (n.includes('qwen')) return 'qwen3_xml';
|
||||
if (n.includes('qwen3')) return 'qwen3_xml';
|
||||
if (n.includes('qwen')) return 'hermes'; // Qwen2.5 / Qwen2 / Qwen1.5
|
||||
if (n.includes('llama-4') || n.includes('llama4')) return 'llama4_json';
|
||||
if (n.includes('llama') || n.includes('nemotron')) return 'llama3_json';
|
||||
if (n.includes('mistral') || n.includes('mixtral')) return 'mistral';
|
||||
@@ -251,37 +260,43 @@ export function _detectBackend(model) {
|
||||
const q = (model.quant || '').toUpperCase();
|
||||
const sysBackend = String(_hwfitCache?.system?.backend || '').toLowerCase();
|
||||
const isRocm = sysBackend === 'rocm';
|
||||
const isAppleSilicon = ['metal', 'mps', 'apple'].includes(sysBackend);
|
||||
const _nm = `${model.repo_id || ''} ${model.path || ''} ${model.name || ''}`.toLowerCase();
|
||||
if (/\bmlx\b|mlx-|_mlx/i.test(_nm) || q.startsWith('MLX')) {
|
||||
return { backend: 'unsupported', label: 'Unsupported' };
|
||||
}
|
||||
const isAwqLike = /^AWQ|^GPTQ|^NVFP4/.test(q) || ['FP8', 'FP4', 'MXFP4', 'NF4', 'INT4', 'INT8', 'W4A16', 'W8A8', 'W8A16'].includes(q) || /\b(awq|gptq|fp8|fp4|nvfp4|mxfp4|nf4|int4|int8|w4a16|w8a8|w8a16)\b/i.test(_nm);
|
||||
const isGgufLike = model.is_gguf || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || _nm.includes('gguf');
|
||||
|
||||
// Image gen models → diffusers
|
||||
if (model.is_image_gen || model.is_diffusion || model._tag === 'image') {
|
||||
return { backend: 'diffusers', label: 'Diffusers' };
|
||||
}
|
||||
|
||||
// AWQ / GPTQ / FP8 are safetensors GPU-serving formats. Never route them
|
||||
// through llama.cpp/Ollama just because the host is Mac/Windows; those engines
|
||||
// need GGUF. The UI will warn/block on Metal where vLLM/SGLang aren't viable.
|
||||
if (isAwqLike) {
|
||||
return { backend: 'vllm', label: 'vLLM' };
|
||||
}
|
||||
|
||||
// GGUF → llama.cpp/Ollama-compatible.
|
||||
if (isGgufLike) {
|
||||
return { backend: 'llamacpp', label: 'llama.cpp' };
|
||||
}
|
||||
|
||||
// Windows → default to llama.cpp (no vLLM support on Windows)
|
||||
if (_isWindows()) {
|
||||
return { backend: 'llamacpp', label: 'llama.cpp' };
|
||||
}
|
||||
|
||||
// Apple Silicon (Metal) → llama.cpp (GGUF). vLLM/SGLang are CUDA/ROCm-only and
|
||||
// don't run on macOS; AWQ/GPTQ/FP8 (vLLM-only) models are already filtered out
|
||||
// don't run on macOS; vLLM-native quantized models are already filtered out
|
||||
// of metal Cookbook results, so llama.cpp is always the right engine here.
|
||||
if (['metal', 'mps', 'apple'].includes(sysBackend)) {
|
||||
return { backend: 'llamacpp', label: 'llama.cpp' };
|
||||
}
|
||||
|
||||
// AWQ / GPTQ / FP8 → vLLM
|
||||
if (/^AWQ|^GPTQ/.test(q) || q === 'FP8') {
|
||||
return { backend: 'vllm', label: 'vLLM' };
|
||||
}
|
||||
|
||||
// GGUF → llama.cpp. Match the quant tag OR a gguf hint in the repo/path/name:
|
||||
// a raw .gguf file often has no quant field, which made it fall through to the
|
||||
// vLLM default below.
|
||||
const _nm = `${model.repo_id || ''} ${model.path || ''} ${model.name || ''}`.toLowerCase();
|
||||
if (model.is_gguf || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || _nm.includes('gguf')) {
|
||||
return { backend: 'llamacpp', label: 'llama.cpp' };
|
||||
}
|
||||
|
||||
// ROCm/AMD machines should not blindly default HF safetensors models to
|
||||
// vLLM. SGLang is the safer OpenAI-compatible default for plain HF text
|
||||
// repos there; llama.cpp still wins above whenever the model is GGUF.
|
||||
@@ -351,6 +366,8 @@ export function _buildServeCmd(f, modelName, backend) {
|
||||
cmd += ` --gpu-memory-utilization ${f.gpu_mem || '0.90'}`;
|
||||
if (f.swap && f.swap !== '0') cmd += ` --swap-space ${f.swap}`;
|
||||
cmd += ` --dtype ${f.dtype || 'auto'}`;
|
||||
const _kv = (f.vllm_kv_cache_dtype ?? '').toString().trim();
|
||||
if (_kv === 'fp8') cmd += ' --kv-cache-dtype fp8';
|
||||
if (f.max_seqs && f.max_seqs.toString().trim()) cmd += ` --max-num-seqs ${f.max_seqs.toString().trim()}`;
|
||||
if (f.enforce_eager) cmd += ' --enforce-eager';
|
||||
if (f.trust_remote) cmd += ' --trust-remote-code';
|
||||
@@ -384,13 +401,17 @@ export function _buildServeCmd(f, modelName, backend) {
|
||||
const ggufPath = f._gguf_path || 'model.gguf';
|
||||
const gpuId = f.gpu_id?.trim() || '';
|
||||
const py = _isWindows() ? 'python' : 'python3';
|
||||
// CPU-only serve (-ngl 0): drop the GPU-only flags, otherwise the command
|
||||
// mixes "zero GPU layers" with CUDA unified-memory + flash-attn and fails to
|
||||
// start (issue #1291). Only affects the ngl=0 path; GPU serving is unchanged.
|
||||
const _cpuOnly = String(f.ngl).trim() === '0';
|
||||
const lcPrefix = (() => {
|
||||
let p = '';
|
||||
if (f.unified_mem && !_isWindows()) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
|
||||
if (f.unified_mem && !_cpuOnly && !_isWindows()) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
|
||||
if (gpuId && !_isWindows()) p += `CUDA_VISIBLE_DEVICES=${gpuId} `;
|
||||
return p;
|
||||
})();
|
||||
if (f.unified_mem && _isWindows()) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
|
||||
if (f.unified_mem && !_cpuOnly && _isWindows()) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
|
||||
if (gpuId && _isWindows()) cmd += `$env:CUDA_VISIBLE_DEVICES="${gpuId}"; `;
|
||||
if (!_isWindows()) {
|
||||
// Resolve GGUF path once, fail loudly if nothing matched (prevents
|
||||
@@ -402,16 +423,75 @@ export function _buildServeCmd(f, modelName, backend) {
|
||||
// renders modern GGUF chat templates that the Python bindings' Jinja2
|
||||
// rejects (do_tojson ensure_ascii). Fall back to llama_cpp.server.
|
||||
// Don't suppress stderr — surface real errors (missing file, lib, OOM).
|
||||
const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}`;
|
||||
// Optional perf/fit flags from a hardware profile (see services/hwfit/
|
||||
// profiles.py). n_cpu_moe offloads MoE expert layers to CPU when the model
|
||||
// is bigger than VRAM; flash-attn + a quantized KV cache cut KV memory and
|
||||
// speed things up. Only emitted when set, so manual/older flows are unchanged.
|
||||
const _ncm = (f.n_cpu_moe ?? '').toString().trim();
|
||||
const _kv = (f.cache_type ?? '').toString().trim();
|
||||
// Normalise an integer-valued llama.cpp option to a string of ASCII digits.
// Returns '' when the value is missing or is anything other than a plain
// non-negative integer, so callers can skip the flag with a truthiness check
// and nothing but digits is ever spliced into the generated command line.
//
// Uses `??` rather than `||` for the default: a numeric 0 (e.g. GPU index 0
// for --main-gpu) is a legitimate value and must normalise to '0' exactly
// like the string '0' does, instead of being silently treated as "unset".
const _llamaNum = (v) => {
  const s = String(v ?? '').trim();
  return /^\d+$/.test(s) ? s : '';
};
|
||||
// Normalise a comma-separated list of non-negative decimals (e.g. a
// --tensor-split ratio such as "3,1" or "0.5, 0.5") to a whitespace-free
// string. Returns '' when the value is missing or contains anything other
// than digits, single decimal points and single commas between numbers, so
// callers can skip the flag with a truthiness check.
//
// Uses `??` rather than `||` for the default so a numeric 0 is normalised
// the same way as the string '0' instead of being treated as "unset".
const _llamaCsv = (v) => {
  const s = String(v ?? '').replace(/\s+/g, '');
  return /^\d+(?:\.\d+)?(?:,\d+(?:\.\d+)?)*$/.test(s) ? s : '';
};
|
||||
let _lcExtra = '';
|
||||
let _lcpExtra = '';
|
||||
if (_ncm !== '' && Number(_ncm) > 0) {
|
||||
_lcExtra += ` --n-cpu-moe ${_ncm}`;
|
||||
_lcpExtra += ` --n_cpu_moe ${_ncm}`; // llama-cpp-python uses underscores
|
||||
}
|
||||
if (f.flash_attn && !_cpuOnly) {
|
||||
_lcExtra += ' --flash-attn on';
|
||||
_lcpExtra += ' --flash_attn true';
|
||||
}
|
||||
if (_kv) {
|
||||
_lcExtra += ` --cache-type-k ${_kv} --cache-type-v ${_kv}`;
|
||||
// llama-cpp-python exposes these as type_k/type_v; pass through best-effort.
|
||||
_lcpExtra += ` --type_k ${_kv} --type_v ${_kv}`;
|
||||
}
|
||||
const _llamaFit = String(f.llama_fit || '').trim();
|
||||
if (['on', 'off'].includes(_llamaFit)) _lcExtra += ` --fit ${_llamaFit}`;
|
||||
if (f.llama_no_mmap) _lcExtra += ' --no-mmap';
|
||||
if (f.llama_no_warmup) _lcExtra += ' --no-warmup';
|
||||
const _llamaSplitMode = String(f.llama_split_mode || '').trim();
|
||||
if (['none', 'layer', 'row', 'tensor'].includes(_llamaSplitMode)) _lcExtra += ` --split-mode ${_llamaSplitMode}`;
|
||||
const _llamaTensorSplit = _llamaCsv(f.llama_tensor_split);
|
||||
if (_llamaTensorSplit) _lcExtra += ` --tensor-split ${_llamaTensorSplit}`;
|
||||
const _llamaMainGpu = _llamaNum(f.llama_main_gpu);
|
||||
if (_llamaMainGpu) _lcExtra += ` --main-gpu ${_llamaMainGpu}`;
|
||||
const _llamaParallel = _llamaNum(f.llama_parallel);
|
||||
if (_llamaParallel) _lcExtra += ` --parallel ${_llamaParallel}`;
|
||||
const _llamaBatch = _llamaNum(f.llama_batch_size);
|
||||
if (_llamaBatch) _lcExtra += ` --batch-size ${_llamaBatch}`;
|
||||
const _llamaUBatch = _llamaNum(f.llama_ubatch_size);
|
||||
if (_llamaUBatch) _lcExtra += ` --ubatch-size ${_llamaUBatch}`;
|
||||
if (f.llama_speculative_mtp) {
|
||||
const specTokens = parseInt(f.llama_spec_tokens, 10);
|
||||
const specN = Number.isFinite(specTokens) && specTokens > 0 ? specTokens : 3;
|
||||
_lcExtra += ` --spec-type draft-mtp --spec-draft-n-max ${specN}`;
|
||||
}
|
||||
// Vision: serve the multimodal projector so the model can read images. The
|
||||
// mmproj path is resolved at runtime (find mmproj-*.gguf next to the model);
|
||||
// only emitted when the Vision toggle is on AND a projector was found.
|
||||
if (f.vision && f._mmproj_path) {
|
||||
_lcExtra += ` --mmproj "${f._mmproj_path}" --image-max-tokens 1024`;
|
||||
// llama-cpp-python takes the projector via --clip_model_path.
|
||||
_lcpExtra += ` --clip_model_path "${f._mmproj_path}"`;
|
||||
}
|
||||
const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}${_lcpExtra}`;
|
||||
if (_isWindows()) {
|
||||
cmd += _lcpServer;
|
||||
} else {
|
||||
cmd += `${lcPrefix}llama-server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} -ngl ${f.ngl || '99'} -c ${f.ctx || '8192'}`;
|
||||
cmd += `${lcPrefix}llama-server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} -ngl ${f.ngl || '99'} -c ${f.ctx || '8192'}${_lcExtra}`;
|
||||
cmd += ` || ${_lcpServer}`;
|
||||
}
|
||||
} else if (backend === 'ollama') {
|
||||
const ollamaPort = f.port || '11434';
|
||||
const hostEnv = ollamaPort !== '11434' ? `OLLAMA_HOST=0.0.0.0:${ollamaPort} ` : '';
|
||||
const bindHost = _envState.remoteHost ? '0.0.0.0' : '127.0.0.1';
|
||||
const hostEnv = ollamaPort !== '11434' ? `OLLAMA_HOST=${bindHost}:${ollamaPort} ` : '';
|
||||
cmd = `${hostEnv}ollama serve`;
|
||||
} else if (backend === 'diffusers') {
|
||||
const gpuStr = f.gpus?.trim();
|
||||
@@ -542,6 +622,10 @@ async function _fetchDependencies() {
|
||||
const _statusTag = (pkg, isLocal, isSystemDep, winBlocked) => {
|
||||
if (winBlocked) return `<span class="cookbook-dep-tag cookbook-dep-na">N/A</span>`;
|
||||
if (pkg.installed && isSystemDep) return `<span class="cookbook-dep-tag cookbook-dep-installed" title="Found on selected server">Installed</span>`;
|
||||
if (pkg.installed && pkg.pip_update_available === false) {
|
||||
const tip = esc(pkg.update_note || pkg.status_note || 'Found externally; update outside Odysseus.');
|
||||
return `<span class="cookbook-dep-tag cookbook-dep-installed" title="${tip}">Installed</span>`;
|
||||
}
|
||||
if (pkg.installed) return `<button class="cookbook-dep-tag cookbook-dep-installed cookbook-dep-installed-btn" title="Installed — click for actions"><span class="cookbook-dep-installed-label">Installed</span><span class="cookbook-dep-caret">▾</span></button>`;
|
||||
if (isSystemDep) {
|
||||
const depTip = esc(pkg.install_hint || 'Install this OS package on the selected server.');
|
||||
@@ -556,11 +640,13 @@ async function _fetchDependencies() {
|
||||
const isSystemDep = pkg.kind === 'system';
|
||||
const winBlocked = !isLocal && _isWindows() && _winUnsupported.has(pkg.name);
|
||||
const note = pkg.status_note ? `<div class="memory-item-meta" style="font-size:10px;opacity:0.65;margin-top:3px;">${esc(pkg.status_note)}</div>` : '';
|
||||
const updateNote = pkg.installed && pkg.pip_update_available === false && pkg.update_note ? `<div class="memory-item-meta" style="font-size:10px;opacity:0.55;margin-top:3px;">${esc(pkg.update_note)}</div>` : '';
|
||||
return `<div class="cookbook-dep-row${winBlocked ? ' cookbook-dep-blocked' : ''}" data-pkg-name="${esc(pkg.name)}" data-dep-pip="${esc(pkg.pip || '')}" data-dep-target="${isLocal ? 'local' : 'remote'}" data-dep-kind="${esc(pkg.kind || 'python')}">`
|
||||
+ `<div class="cookbook-dep-info">`
|
||||
+ `<div class="memory-item-title">${esc(pkg.name)}</div>`
|
||||
+ `<div class="memory-item-meta" style="font-size:10px;opacity:0.5;margin-top:2px;">${esc(pkg.desc)}</div>`
|
||||
+ note
|
||||
+ updateNote
|
||||
+ `</div>`
|
||||
+ `<span class="cookbook-dep-tag cookbook-dep-cat">${esc(pkg.category)}</span>`
|
||||
+ _statusTag(pkg, isLocal, isSystemDep, winBlocked)
|
||||
@@ -642,7 +728,7 @@ async function _fetchDependencies() {
|
||||
}
|
||||
// _dep flags this as a pip dependency/driver install (not a servable
|
||||
// model) so the running-task card doesn't offer a "Serve →" button.
|
||||
const payload = { repo_id: pipName, _cmd: cmd, remote_host: _envState.remoteHost || '', _dep: true };
|
||||
const payload = { repo_id: pipName, _cmd: cmd, remote_host: _envState.remoteHost || '', _dep: true, env_path: _envState.envPath || '' };
|
||||
_addTask(data.session_id, 'pip ' + pkgName, 'download', payload);
|
||||
if (statusEl) { statusEl.textContent = upgrade ? 'Updating...' : 'Installing...'; statusEl.disabled = true; }
|
||||
uiModule.showToast(`${upgrade ? 'Updating' : 'Installing'} ${pkgName} on ${targetHost}...`);
|
||||
@@ -932,6 +1018,51 @@ function _wireTabEvents(body) {
|
||||
});
|
||||
}
|
||||
|
||||
// "Rebuild llama.cpp" clears the cached build so the next serve recompiles.
|
||||
// The serve bootstrap only builds llama-server when it is missing from PATH,
|
||||
// so a host that first built CPU-only (no nvcc at build time) keeps reusing
|
||||
// that binary forever; this is the lever to force a fresh GPU build after a
|
||||
// CUDA/ROCm toolkit is installed.
|
||||
const rebuildBtn = document.getElementById('cookbook-rebuild-engine');
|
||||
if (rebuildBtn && !rebuildBtn._wired) {
|
||||
rebuildBtn._wired = true;
|
||||
rebuildBtn.addEventListener('click', async () => {
|
||||
// Match _installDep: honor the Dependencies server selector so the clear
|
||||
// runs on the same host the build runs on.
|
||||
const sel = document.getElementById('hwfit-deps-server');
|
||||
if (sel) _applyServerSelection(sel.value);
|
||||
const host = _envState.remoteHost || '';
|
||||
const where = host || 'this server';
|
||||
if (!confirm(`Rebuild the llama.cpp engine on ${where}?\n\nThis clears the cached llama-server build so the next serve recompiles from source (with CUDA/HIP if a toolchain is present). It does not download or install anything.`)) return;
|
||||
const _label = rebuildBtn.textContent;
|
||||
rebuildBtn.disabled = true;
|
||||
rebuildBtn.textContent = 'Clearing...';
|
||||
try {
|
||||
const res = await fetch('/api/cookbook/rebuild-engine', {
|
||||
method: 'POST', credentials: 'same-origin',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
engine: 'llamacpp',
|
||||
remote_host: host || undefined,
|
||||
ssh_port: _getPort(host) || undefined,
|
||||
}),
|
||||
});
|
||||
const data = await res.json().catch(() => ({}));
|
||||
if (!res.ok || !data.ok) {
|
||||
const reason = data.detail || data.error || `HTTP ${res.status}`;
|
||||
uiModule.showToast('Rebuild failed: ' + String(reason).slice(0, 200));
|
||||
} else {
|
||||
uiModule.showToast(`Cleared llama.cpp build on ${where}. Re-launch the serve task to rebuild with GPU support.`);
|
||||
}
|
||||
} catch (err) {
|
||||
uiModule.showToast('Rebuild failed: ' + err.message);
|
||||
} finally {
|
||||
rebuildBtn.disabled = false;
|
||||
rebuildBtn.textContent = _label;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Serve sort
|
||||
const serveSort = document.getElementById('serve-sort');
|
||||
if (serveSort) {
|
||||
@@ -985,6 +1116,7 @@ function _wireTabEvents(body) {
|
||||
|
||||
document.getElementById('serve-bulk-cancel')?.addEventListener('click', () => {
|
||||
selectBtn.classList.remove('active');
|
||||
selectBtn.textContent = 'Select'; // reset label so the button doesn't stay reading "Cancel" after exit
|
||||
bulkBar.classList.add('hidden');
|
||||
document.querySelectorAll('.serve-select-cb').forEach(dot => { dot.style.display = 'none'; dot.classList.remove('selected'); });
|
||||
});
|
||||
@@ -1003,6 +1135,7 @@ function _wireTabEvents(body) {
|
||||
if (item) await _deleteCachedModel(repo, item, true);
|
||||
}
|
||||
selectBtn.classList.remove('active');
|
||||
selectBtn.textContent = 'Select'; // same reset as bulk-cancel
|
||||
bulkBar.classList.add('hidden');
|
||||
document.querySelectorAll('.serve-select-cb').forEach(dot => { dot.style.display = 'none'; dot.classList.remove('selected'); });
|
||||
});
|
||||
@@ -1011,6 +1144,16 @@ function _wireTabEvents(body) {
|
||||
// Download input
|
||||
const dlBtn = document.getElementById('cookbook-dl-btn');
|
||||
const dlInput = document.getElementById('cookbook-dl-repo');
|
||||
const dlCardToggle = document.getElementById('cookbook-download-card-toggle');
|
||||
const dlCardBody = document.getElementById('cookbook-download-card-body');
|
||||
const dlCardArrow = document.getElementById('cookbook-download-card-arrow');
|
||||
if (dlCardToggle && dlCardBody) {
|
||||
dlCardToggle.addEventListener('click', () => {
|
||||
const isOpen = dlCardBody.style.display !== 'none';
|
||||
dlCardBody.style.display = isOpen ? 'none' : 'block';
|
||||
if (dlCardArrow) dlCardArrow.style.transform = isOpen ? 'rotate(0deg)' : 'rotate(90deg)';
|
||||
});
|
||||
}
|
||||
if (dlBtn && dlInput) {
|
||||
function _stripHfUrl(input) {
|
||||
let repo = input.trim();
|
||||
@@ -1104,8 +1247,12 @@ function _wireTabEvents(body) {
|
||||
if (hfToggle && hfList) {
|
||||
let _loaded = false;
|
||||
// Per-server VRAM cache so we don't re-probe on every expand
|
||||
const _vramCache = {};
|
||||
async function _getSelectedServerVram() {
|
||||
const _hwCache = {};
|
||||
// Heuristic: does this Hugging Face model entry look like a pre-quantized
// safetensors build (AWQ / GPTQ / FP8 / 4-bit ...) rather than GGUF or plain
// weights? Looks for a quantization token in the repo id and tags.
//
// - The token list mirrors the quant names _detectBackend routes to vLLM,
//   plus the "4bit" spelling, so both checks agree on what is "AWQ-like".
// - Tokens must be delimited by a non-alphanumeric character (or the string
//   edge). Unlike `\b`, this also accepts "_" as a delimiter, so names such
//   as "model_awq" are recognised while "rawquant" is not.
// - `tags` is only joined when it really is an array (a plain string is used
//   as-is), so a malformed API entry cannot throw here.
function _hfModelLooksAwqLike(m) {
  const rawTags = m?.tags;
  let tagText = '';
  if (Array.isArray(rawTags)) {
    tagText = rawTags.join(' ');
  } else if (typeof rawTags === 'string') {
    tagText = rawTags;
  }
  const text = `${m?.repo_id || ''} ${tagText}`.toLowerCase();
  return /(?:^|[^a-z0-9])(?:awq|gptq|fp8|fp4|nvfp4|mxfp4|nf4|4bit|int4|int8|w4a16|w8a8|w8a16)(?![a-z0-9])/.test(text);
}
|
||||
async function _getSelectedServerHw() {
|
||||
// Prefer the "What Fits" dropdown (the main control that shows hardware);
|
||||
// fall back to the download dropdown. This is the server the list ranks for.
|
||||
const dlSrv = document.getElementById('hwfit-server-select') || document.getElementById('hwfit-dl-server');
|
||||
@@ -1122,7 +1269,7 @@ function _wireTabEvents(body) {
|
||||
}
|
||||
}
|
||||
const cacheKey = host || 'local';
|
||||
if (_vramCache[cacheKey] !== undefined) return _vramCache[cacheKey];
|
||||
if (_hwCache[cacheKey]) return _hwCache[cacheKey];
|
||||
// Fetch system info for this server from hwfit
|
||||
try {
|
||||
const qp = new URLSearchParams();
|
||||
@@ -1132,13 +1279,13 @@ function _wireTabEvents(body) {
|
||||
const r = await fetch(`/api/hwfit/system?${qp}`);
|
||||
if (r.ok) {
|
||||
const sys = await r.json();
|
||||
const v = sys?.gpu_vram_gb || 0;
|
||||
_vramCache[cacheKey] = v;
|
||||
return v;
|
||||
const hw = { vram: sys?.gpu_vram_gb || 0, backend: String(sys?.backend || '').toLowerCase() };
|
||||
_hwCache[cacheKey] = hw;
|
||||
return hw;
|
||||
}
|
||||
} catch {}
|
||||
_vramCache[cacheKey] = 0;
|
||||
return 0;
|
||||
_hwCache[cacheKey] = { vram: 0, backend: '' };
|
||||
return _hwCache[cacheKey];
|
||||
}
|
||||
async function _loadLatest() {
|
||||
// Match the Dependencies loader: whirlpool spinner + text label so the
|
||||
@@ -1157,7 +1304,8 @@ function _wireTabEvents(body) {
|
||||
} catch {
|
||||
hfList.innerHTML = '<div class="hwfit-loading">Scanning models…</div>';
|
||||
}
|
||||
const vram = await _getSelectedServerVram();
|
||||
const hwInfo = await _getSelectedServerHw();
|
||||
const vram = hwInfo.vram || 0;
|
||||
try {
|
||||
let lastErr = '';
|
||||
const _fetchLatest = async (v) => {
|
||||
@@ -1173,6 +1321,9 @@ function _wireTabEvents(body) {
|
||||
if (!models.length && vram > 0) {
|
||||
models = await _fetchLatest(0);
|
||||
}
|
||||
if (['rocm', 'metal', 'mps', 'apple', 'generic', 'cpu'].includes(hwInfo.backend)) {
|
||||
models = models.filter(m => !_hfModelLooksAwqLike(m));
|
||||
}
|
||||
if (!models.length) {
|
||||
// Distinguish "the HF API failed" from "nothing matched" so an outage
|
||||
// doesn't masquerade as no-fitting-models.
|
||||
@@ -1254,9 +1405,32 @@ function _wireTabEvents(body) {
|
||||
// HF token — save on change
|
||||
const hfInput = document.getElementById('hwfit-hftoken');
|
||||
if (hfInput) {
|
||||
hfInput.addEventListener('change', () => {
|
||||
_envState.hfToken = hfInput.value.trim();
|
||||
_persistEnvState();
|
||||
hfInput.addEventListener('change', async () => {
|
||||
const val = hfInput.value.trim();
|
||||
_envState.hfToken = val;
|
||||
try { await _persistEnvState(); } catch {}
|
||||
if (val) {
|
||||
_envState.hfTokenConfigured = true;
|
||||
const masked = val.length > 6 ? val.slice(0, 3) + '…' + val.slice(-3) : '••••';
|
||||
_envState.hfTokenMasked = masked;
|
||||
hfInput.placeholder = `Stored (${masked}) - enter a new token to replace`;
|
||||
hfInput.value = '';
|
||||
let check = hfInput.parentNode.querySelector('.hwfit-hf-check');
|
||||
if (!check) {
|
||||
check = document.createElement('span');
|
||||
check.className = 'hwfit-hf-check';
|
||||
check.title = 'Token stored';
|
||||
check.textContent = '✓';
|
||||
check.style.cssText = 'font-weight:800;color:var(--green,#50fa7b);font-size:15px;line-height:1;flex-shrink:0;position:relative;top:2px;';
|
||||
hfInput.parentNode.insertBefore(check, hfInput);
|
||||
}
|
||||
const flash = document.createElement('span');
|
||||
flash.textContent = 'Saved';
|
||||
flash.style.cssText = 'margin-left:8px;font-size:11px;color:var(--green,#50fa7b);opacity:0;transition:opacity 0.18s;flex-shrink:0;position:relative;top:1px;';
|
||||
hfInput.parentNode.appendChild(flash);
|
||||
requestAnimationFrame(() => { flash.style.opacity = '1'; });
|
||||
setTimeout(() => { flash.style.opacity = '0'; setTimeout(() => flash.remove(), 220); }, 1400);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -1393,7 +1567,7 @@ function _renderRecipes() {
|
||||
// silently sending downloads to the wrong server. An empty selection means Local; the user
|
||||
// chooses a remote server explicitly via the dropdown.
|
||||
|
||||
// Download input
|
||||
// Manual download input
|
||||
html += `<div style="margin-top:7px;margin-bottom:2px;display:flex;gap:4px;align-items:center;">`;
|
||||
if (_es.servers.length > 1) {
|
||||
html += `<select class="cookbook-field-input hwfit-dl-server" id="hwfit-dl-server" style="height:28px;position:relative;top:0px;">`;
|
||||
@@ -1409,7 +1583,7 @@ function _renderRecipes() {
|
||||
html += `<button class="cookbook-btn cookbook-dl-btn" id="cookbook-dl-btn">Download</button>`;
|
||||
html += `</div>`;
|
||||
// Latest HF models that fit — collapsible card list
|
||||
html += `<div style="margin-top:2px;position:relative;top:-8px;">`;
|
||||
html += `<div style="margin-top:5px;position:relative;top:-3px;">`;
|
||||
html += `<div style="display:flex;gap:4px;align-items:center;">`;
|
||||
html += `<button type="button" class="memory-toolbar-btn" id="cookbook-hf-latest-toggle" style="flex:1;text-align:left;height:26px;display:flex;align-items:center;gap:6px;border-radius:4px;">`;
|
||||
html += `<span id="cookbook-hf-latest-arrow" style="display:inline-block;transition:transform 0.15s;pointer-events:none;">\u25B8</span>`;
|
||||
@@ -1422,7 +1596,7 @@ function _renderRecipes() {
|
||||
html += `</div>`; // /#cookbook-dl-tab-fold-body (whole Download card body)
|
||||
|
||||
// Search section
|
||||
html += '</div></div></div>';
|
||||
html += '</div></div></div></div>';
|
||||
html += '<div class="cookbook-group" data-backend-group="Search">';
|
||||
html += '<div class="admin-card" style="flex:1;display:flex;flex-direction:column;overflow:hidden;">';
|
||||
html += '<div style="display:flex;align-items:baseline;gap:8px;margin-bottom:2px;">';
|
||||
@@ -1445,13 +1619,21 @@ function _renderRecipes() {
|
||||
html += '<option value="Q4_K_M">Q4</option><option value="Q8_0">Q8</option>';
|
||||
html += '<option value="Q6_K">Q6</option><option value="Q5_K_M">Q5</option>';
|
||||
html += '<option value="Q3_K_M">Q3</option><option value="Q2_K">Q2</option>';
|
||||
html += '<option value="AWQ-4bit">AWQ</option><option value="FP8">FP8</option></select>';
|
||||
// Ctx slider — ported from origin/main. Lets you target a context length
|
||||
// for fit estimates; the hwfit ranking uses _ctxValue() to factor that into
|
||||
// VRAM math, so dragging this re-sorts the list toward models that fit
|
||||
// your chosen ctx.
|
||||
html += '<option value="AWQ-4bit">AWQ</option><option value="FP8">FP8</option><option value="FP4">FP4</option><option value="NVFP4">NVFP4</option></select>';
|
||||
// Engine filter — show only models whose serve engine matches. Composes
|
||||
// with quant / type / search filters.
|
||||
html += '<select class="cookbook-field-input hwfit-engine" id="hwfit-engine" style="height:28px;" title="Filter by serving engine">';
|
||||
html += '<option value="">Engine</option>';
|
||||
html += '<option value="llamacpp">llama.cpp</option>';
|
||||
html += '<option value="vllm">vLLM</option>';
|
||||
html += '<option value="sglang">SGLang</option>';
|
||||
html += '</select>';
|
||||
html += '<span class="hwfit-help-chip" title="Higher numbers usually mean better quality, but they need more memory. Lower numbers fit on more hardware.">?</span>';
|
||||
// Ctx slider — lets you target a context length for fit estimates; the
|
||||
// hwfit ranking uses _ctxValue() to factor that into VRAM math, so
|
||||
// dragging this re-sorts the list toward models that fit your chosen ctx.
|
||||
html += '<label class="hwfit-ctx-control" title="Context length for fit estimates. Lower it to find more models that could fit your hardware.">';
|
||||
html += '<span>Ctx</span><input type="range" id="hwfit-context" min="0" max="5" step="1" value="3" />';
|
||||
html += '<span>Ctx</span><span class="hwfit-help-chip hwfit-help-chip-inline" title="Context length. Lower it to find more models that could fit your hardware; raise it when you need longer chats or documents.">?</span><input type="range" id="hwfit-context" min="0" max="5" step="1" value="3" />';
|
||||
html += '<output id="hwfit-context-label">50k</output></label>';
|
||||
html += '</div>';
|
||||
html += '<div class="hwfit-toolbar" style="margin-top:7px;">';
|
||||
@@ -1462,8 +1644,10 @@ function _renderRecipes() {
|
||||
// Scan/refresh button (icon-only) where the quant dropdown used to sit.
|
||||
html += '<button type="button" class="hwfit-gpu-btn" id="hwfit-rescan" title="Re-scan hardware" style="flex-shrink:0;position:relative;top:-3px;left:-1px;">↻ RESCAN</button>';
|
||||
html += '<button type="button" class="hwfit-gpu-btn hwfit-hw-manual-btn" id="hwfit-hw-manual-btn" title="Set hardware manually" style="flex-shrink:0;position:relative;top:-3px;left:-1px;">EDIT</button>';
|
||||
// Sort state — the clickable column headers read/write this (pewds' original
|
||||
// sort paradigm). Newest is reachable by clicking the Model column header.
|
||||
html += '<select class="cookbook-field-input hwfit-sort" id="hwfit-sort" style="display:none">';
|
||||
html += '<option value="score">Score</option><option value="vram">VRAM</option>';
|
||||
html += '<option value="fit">Fit</option><option value="score">Score</option><option value="vram">VRAM</option>';
|
||||
html += '<option value="speed">Speed</option><option value="params">Params</option>';
|
||||
html += '<option value="context">Context</option></select>';
|
||||
html += '</div>';
|
||||
@@ -1523,6 +1707,7 @@ function _renderRecipes() {
|
||||
html += '<div class="admin-card" style="flex:1;display:flex;flex-direction:column;overflow:hidden;">';
|
||||
html += '<div style="display:flex;align-items:center;gap:8px;margin-bottom:4px;">';
|
||||
html += '<h2 style="margin:0;padding:0;line-height:1;">Dependencies</h2>';
|
||||
html += '<button class="cookbook-field-input" id="cookbook-rebuild-engine" title="Clear the cached llama.cpp build so the next serve recompiles from source (use after installing a CUDA/ROCm toolkit to turn a CPU-only build into a GPU build)." style="height:24px;font-size:10px;padding:0 8px;cursor:pointer;width:auto;">Rebuild llama.cpp</button>';
|
||||
html += '<span style="font-size:10px;opacity:0.5;margin-left:auto;">Server</span>';
|
||||
html += '<select class="cookbook-field-input" id="hwfit-deps-server" style="height:28px;min-width:70px;">';
|
||||
html += _buildServerOpts(false);
|
||||
|
||||
Reference in New Issue
Block a user