From ee6fd8ffe8321a80fee2571ef42b7a3bf6e1be4c Mon Sep 17 00:00:00 2001 From: pewdiepie-archdaemon Date: Fri, 19 Jun 2026 00:33:37 +0000 Subject: [PATCH] Cookbook UI: backend-aware env vars, always-show MoE/EP/Reasoning toggles, GPU default, Firefox-mobile expand Frontend half of the backend-detection + per-OS install command work, plus a pile of mobile/UX fixes: Backend awareness: - _gpuEnvPrefix() picks CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES / nothing based on detected hwfit backend + scanned-host match (so a stale ajax scan does not leak CUDA env vars into a kierkegaard Vulkan launch). Replaces 6 hardcoded CUDA_VISIBLE_DEVICES sites. - GGML_CUDA_ENABLE_UNIFIED_MEMORY only emitted when backend is actually CUDA (was leaking onto Vulkan/ROCm via saved presets). Per-target install command: - Dep rows render a single mono command box + Copy button when the server resolved pkg.install_cmd_for_target. Reused in the build-deps install failure toast so the toast and the row show the same line. - Diagnosis patterns split cmake/g++/git out of the generic llama-cpp-python catch-all so a missing-cmake failure surfaces a cmake-specific message + per-distro Copy buttons. Form toggles always visible: - Reasoning Parser, Expert Parallel, MoE Env Vars no longer gated on model-family detection. Detection still hints (parser tag shown when matched); toggle works with sensible defaults otherwise. MiniMax M-series added to MoE family detector so the auto-fill is right. Mobile + GPU default: - Launch tab cached-list flex collapsed to 0px on mobile because the desktop `flex: 1 1 0` had no parent height to grow into. Override to `flex: 0 0 auto` in the cookbook mobile @media block. - doclib-card expand on mobile (Firefox has no :has() support) pins explicit px heights so the launch form actually appears.
- llama_mode defaults to gpu when hwfit detected cuda/rocm/vulkan/ metal on the current target, instead of always cpu (which was forcing -ngl 0 on first-open and burning 35GB models on CPU). --- static/js/cookbook-diagnosis.js | 46 +++- static/js/cookbook-hwfit.js | 12 +- static/js/cookbook.js | 364 +++++++++++++++++++++++++++++--- static/js/cookbookServe.js | 340 +++++++++++++++++++++++++---- static/style.css | 20 +- 5 files changed, 706 insertions(+), 76 deletions(-) diff --git a/static/js/cookbook-diagnosis.js b/static/js/cookbook-diagnosis.js index 5ac387178..200803313 100644 --- a/static/js/cookbook-diagnosis.js +++ b/static/js/cookbook-diagnosis.js @@ -461,6 +461,40 @@ export const ERROR_PATTERNS = [ { label: 'Copy install command', action: () => _copyText('curl -fsSL https://ollama.com/install.sh | sh') }, ], }, + // System build deps must be checked BEFORE the llama-server catch-all: + // a `cmake: command not found` failure ALSO produces `llama-server: + // command not found` later in the script (the build aborts then the + // run line fails) — pattern order is first-match-wins, so without + // these specific entries the user gets the misleading "install + // llama-cpp-python[server]" suggestion when the actual blocker is a + // missing OS-package toolchain that pip can't ship. + { + pattern: /cmake: command not found|cmake.*not found.*Could not/i, + message: 'cmake is required to compile llama.cpp from source, but it is not installed on this server.', + suggestion: 'Suggested action: install cmake via the OS package manager — apt: cmake build-essential / pacman: cmake base-devel / dnf: cmake gcc-c++ make / brew: cmake. 
Cookbook can do this automatically on the next launch if your user has passwordless sudo for apt/pacman/dnf.', + fixes: [ + { label: 'Open Dependencies', action: () => _openCookbookDependencies('llama_cpp') }, + { label: 'Copy apt install', action: () => _copyText('sudo apt install -y cmake build-essential git') }, + { label: 'Copy pacman install', action: () => _copyText('sudo pacman -Sy --needed cmake base-devel git') }, + { label: 'Copy dnf install', action: () => _copyText('sudo dnf install -y cmake gcc gcc-c++ make git') }, + ], + }, + { + pattern: /^(make|g\+\+|gcc): command not found|Could not find C\+\+ compiler/i, + message: 'A C/C++ compiler (build-essential / base-devel) is required to compile llama.cpp.', + fixes: [ + { label: 'Open Dependencies', action: () => _openCookbookDependencies('llama_cpp') }, + { label: 'Copy apt install', action: () => _copyText('sudo apt install -y build-essential') }, + ], + }, + { + pattern: /^git: command not found/i, + message: 'git is required to clone the llama.cpp source tree.', + fixes: [ + { label: 'Open Dependencies', action: () => _openCookbookDependencies('llama_cpp') }, + { label: 'Copy apt install', action: () => _copyText('sudo apt install -y git') }, + ], + }, { pattern: /llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'/i, message: 'llama-cpp-python server is not installed. Run: pip install "llama-cpp-python[server]"', @@ -688,11 +722,15 @@ export function _showDiagnosis(panel, diagnosis, sourceText) { copyBtn.addEventListener('click', async (e) => { e.stopPropagation(); const bundle = _diagnosisCopyBundle(task, diagnosis, sourceText, suggestionText); - try { - await navigator.clipboard.writeText(bundle); + // Use the shared helper which falls back to execCommand('copy') on + // non-HTTPS origins (Tailscale IPs, LAN IPs, etc.) 
— navigator.clipboard + // is silently a no-op on those, which is why the button appeared dead + // for users on http://100.113.161.2:7011 over Tailscale/mobile. + const ok = await _copyText(bundle); + if (ok) { copyBtn.classList.add('copied'); setTimeout(() => { if (copyBtn.isConnected) copyBtn.classList.remove('copied'); }, 1200); - } catch (_) {} + } }); const dismissBtn = document.createElement('button'); @@ -757,7 +795,7 @@ export function _showDiagnosis(panel, diagnosis, sourceText) { }); row.appendChild(btn); } - body.appendChild(row); + diag.appendChild(row); } } diff --git a/static/js/cookbook-hwfit.js b/static/js/cookbook-hwfit.js index 243d3c9c7..9098d8082 100644 --- a/static/js/cookbook-hwfit.js +++ b/static/js/cookbook-hwfit.js @@ -578,7 +578,9 @@ export async function _hwfitFetch(fresh = false) { const _cached = fresh ? null : _readScanCache(_sig); const wp = spinnerModule.createWhirlpool(18); if (_cached) { - _hwfitCache = _cached; + // Tag the restored cache with its host too (scan-sig keys cache per + // host, so a hit here is always for the current remoteHost). + _hwfitCache = { ..._cached, _scannedHost: remoteHost || '' }; _hwfitRenderHw(hw, _cached.system); if (!remoteHost && _cached.system && _cached.system.platform) { _envState.platform = _cached.system.platform; @@ -750,7 +752,11 @@ export async function _hwfitFetch(fresh = false) { : _olRows; data.models = (data.models || []).concat(_olFiltered); } - _hwfitCache = data; + // Tag the cache with the host this scan was for, so downstream + // code (_gpuEnvVarName, backend-aware command builders) can avoid + // trusting a stale scan when the user switches the server picker + // to a different target without re-running hwfit. + _hwfitCache = { ...data, _scannedHost: remoteHost || '' }; _hwfitRenderHw(hw, data.system); // Propagate local platform from hardware probe so _isWindows(task) works // for local tasks (menu items, shell commands, etc.). 
@@ -1679,7 +1685,7 @@ export function _expandModelRow(row, modelData) { } else if (runBackend === 'llamacpp') { const dir = `"$HOME/.cache/huggingface/hub/models--${modelData.name.replace(/\//g, '--')}/snapshots"`; const ggufPath = `$({ find ${dir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${dir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`; - cmd = `MODEL_FILE=${ggufPath} && { [ -n "$MODEL_FILE" ] && [ -f "$MODEL_FILE" ]; } || { echo "ERROR: No GGUF found on this host. Download a GGUF quant or switch backend."; exit 1; } && llama-server --model "$MODEL_FILE" --host 0.0.0.0 --port 8080 -ngl 99 -c ${maxCtx} || python3 -m llama_cpp.server --model "$MODEL_FILE" --host 0.0.0.0 --port 8080 --n_gpu_layers 99 --n_ctx ${maxCtx}`; + cmd = `llama-server --model "${ggufPath}" --host 0.0.0.0 --port 8080 -ngl 99 -c ${maxCtx} --flash-attn auto`; } else { cmd = `vllm serve ${modelData.name} --host 0.0.0.0 --port ${port}`; cmd += ` --tensor-parallel-size ${tp}`; diff --git a/static/js/cookbook.js b/static/js/cookbook.js index 81acc9e0d..3aaa70465 100644 --- a/static/js/cookbook.js +++ b/static/js/cookbook.js @@ -259,6 +259,15 @@ function _detectModelOptimizations(modelName) { opts.kvCacheDtype = 'fp8'; opts.tips.push('fp8 KV cache required — bf16 OOMs at usable context'); } + // MiniMax MoE — Abab/M1/M2/M2.5/M2.7 are all MoE (Lightning Attention + + // MoE in M1, full sparse MoE from M2 onward). They benefit from the + // same --enable-expert-parallel flag as the Qwen/DeepSeek families, + // and the toggle has to be detectable here for the Expert Parallel + // checkbox in the serve form to render at all. + else if (n.includes('minimax')) { + opts.flags.push('--enable-expert-parallel'); + opts.tips.push('MoE expert parallel for MiniMax'); + } // Reasoning parser — applies independently of MoE detection. 
Without this // flag, models like MiniMax-M2.x, DeepSeek-R1, Qwen3 reasoning, GLM-4.x, // gpt-oss leak blocks as plain text instead of separating them @@ -419,6 +428,38 @@ export function _psQuote(value) { return "'" + String(value ?? '').replace(/'/g, "''") + "'"; } +// Pick the GPU-pinning env-var name for the detected backend. NVIDIA uses +// CUDA_VISIBLE_DEVICES; ROCm/HIP uses HIP_VISIBLE_DEVICES; Vulkan and +// Apple Metal don't take an index env var at all (and CUDA_VISIBLE_DEVICES +// is a silent no-op on those, which silently hides "wrong backend" config +// bugs). Returns 'cmd ' style prefix ('CUDA_VISIBLE_DEVICES=0 ') or '' when +// the backend doesn't support pinning. Pass isWindows=true to get PowerShell +// `$env:` syntax instead. backend defaults to whatever hwfit detected. +function _gpuEnvVarName() { + // Only emit a pinning env var when we POSITIVELY know the backend AND + // the hwfit scan was actually run against the currently-targeted host. + // Without the target-match guard, switching the server picker from an + // NVIDIA box (cuda) to a local/Vulkan target preserved the stale + // `cuda` backend in the cache, leaking `CUDA_VISIBLE_DEVICES=` into + // launches that don't have an NVIDIA GPU at all. Default to "" when + // unsure — the user sees a clean command and is prompted to scan. 
+ const cachedHost = String(_hwfitCache?._scannedHost || ''); + const currentHost = String(_envState.remoteHost || ''); + if (cachedHost !== currentHost) return ''; + const sb = String(_hwfitCache?.system?.backend || '').toLowerCase(); + if (sb === 'cuda') return 'CUDA_VISIBLE_DEVICES'; + if (sb === 'rocm') return 'HIP_VISIBLE_DEVICES'; + return ''; // vulkan / metal / mps / apple / cpu / generic / unknown — no env-var pinning +} +function _gpuEnvPrefix(gpuId, isWindows = false) { + const id = String(gpuId || '').trim(); + if (!id) return ''; + const varName = _gpuEnvVarName(); + if (!varName) return ''; + if (isWindows) return `$env:${varName}="${id}"; `; + return `${varName}=${id} `; +} + export function _buildEnvPrefix() { if (_isWindows()) return _buildEnvPrefixWindows(); let parts = []; @@ -431,7 +472,8 @@ export function _buildEnvPrefix() { } let envVars = []; if (_envState.hfToken) envVars.push('export HF_TOKEN=' + _shellQuote(_envState.hfToken)); - if (_envState.gpus) envVars.push('export CUDA_VISIBLE_DEVICES=' + _shellQuote(_envState.gpus)); + const _envGpuVar = _gpuEnvVarName(); + if (_envState.gpus && _envGpuVar) envVars.push(`export ${_envGpuVar}=` + _shellQuote(_envState.gpus)); if (envVars.length) parts.push(envVars.join(' && ')); if (parts.length === 0) return ''; return parts.join(' && ') + ' &&'; @@ -447,7 +489,8 @@ function _buildEnvPrefixWindows() { parts.push('conda activate ' + _psQuote(_envState.envPath)); } if (_envState.hfToken) parts.push('$env:HF_TOKEN=' + _psQuote(_envState.hfToken)); - if (_envState.gpus) parts.push('$env:CUDA_VISIBLE_DEVICES=' + _psQuote(_envState.gpus)); + const _winGpuVar = _gpuEnvVarName(); + if (_envState.gpus && _winGpuVar) parts.push(`$env:${_winGpuVar}=` + _psQuote(_envState.gpus)); if (parts.length === 0) return ''; return parts.join('; ') + ';'; } @@ -468,10 +511,18 @@ export function _buildServeCmd(f, modelName, backend) { // the bare "auto" input that used to back gpu_id is gone, and the // button strip is 
the only source for which devices to pin. const gpuId = (f.gpus || f.gpu_id || '').toString().trim(); - if (gpuId) cmd += `CUDA_VISIBLE_DEVICES=${gpuId} `; + cmd += _gpuEnvPrefix(gpuId); if (f.moe_env) { const _opts = _detectModelOptimizations(modelName); - if (_opts.envVars.length) cmd += _opts.envVars.join(' ') + ' '; + if (_opts.envVars.length) { + cmd += _opts.envVars.join(' ') + ' '; + } else { + // Fallback when the user toggles MoE Env on for a model the + // family detector didn't classify as MoE — emit the generic + // vLLM MoE optimization env vars so the toggle is never a + // silent no-op (was the case before the "always show" change). + cmd += 'VLLM_USE_DEEP_GEMM=0 VLLM_USE_FLASHINFER_MOE_FP16=1 OMP_NUM_THREADS=4 '; + } } // Pinned attention backend (Attention field). Empty = let vLLM pick. const _attn = (f.vllm_attn_backend ?? '').toString().trim(); @@ -513,7 +564,7 @@ export function _buildServeCmd(f, modelName, backend) { // the bare "auto" input that used to back gpu_id is gone, and the // button strip is the only source for which devices to pin. const gpuId = (f.gpus || f.gpu_id || '').toString().trim(); - if (gpuId) cmd += `CUDA_VISIBLE_DEVICES=${gpuId} `; + cmd += _gpuEnvPrefix(gpuId); const _extraEnv = (f.extra_env ?? '').toString().replace(/\s+/g, ' ').trim(); if (_extraEnv) cmd += _extraEnv + ' '; cmd += `${_py3Bin} -m sglang.launch_server --model-path ${modelName} --host 0.0.0.0 --port ${f.port || '30000'}`; @@ -536,24 +587,39 @@ export function _buildServeCmd(f, modelName, backend) { // CPU-only serve (-ngl 0): drop the GPU-only flags, otherwise the command // mixes "zero GPU layers" with CUDA unified-memory + flash-attn and fails to // start (issue #1291). Only affects the ngl=0 path; GPU serving is unchanged. + // The Inference mode pill (GPU/CPU) above gates this — when the user picks + // CPU, force ngl=0 here so all downstream flag-suppression fires + // consistently regardless of what the (now-hidden) ngl input shows. 
+ if (String(f.llama_mode || '').toLowerCase() === 'cpu') { + f.ngl = '0'; + } else if (String(f.llama_mode || '').toLowerCase() === 'gpu' && (!f.ngl || String(f.ngl).trim() === '0')) { + f.ngl = '99'; + } const _cpuOnly = String(f.ngl).trim() === '0'; + // GGML_CUDA_* env vars are no-ops on Vulkan/ROCm/Metal/CPU. Only emit + // them when the detected backend is actually CUDA AND the hwfit scan + // was run against the currently-targeted host, so a saved preset + // from a prior NVIDIA target doesn't pollute a non-NVIDIA launch + // with misleading prefixes. + const _sb = String(_hwfitCache?.system?.backend || '').toLowerCase(); + const _hwfitHost = String(_hwfitCache?._scannedHost || ''); + const _curHost = String(_envState.remoteHost || ''); + const _isCudaTarget = (_sb === 'cuda') && (_hwfitHost === _curHost); const lcPrefix = (() => { let p = ''; - if (f.unified_mem && !_cpuOnly && !_isWindows()) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `; - if (gpuId && !_isWindows()) p += `CUDA_VISIBLE_DEVICES=${gpuId} `; + if (f.unified_mem && !_cpuOnly && !_isWindows() && _isCudaTarget) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `; + // No GPU env var in CPU mode — `-ngl 0` already disables offload + // so CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES would be misleading + // clutter ("why is CUDA pinned for a CPU run?"). + if (!_isWindows() && !_cpuOnly) p += _gpuEnvPrefix(gpuId); return p; })(); - if (f.unified_mem && !_cpuOnly && _isWindows()) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `; - if (gpuId && _isWindows()) cmd += `$env:CUDA_VISIBLE_DEVICES="${gpuId}"; `; - if (!_isWindows()) { - // Resolve GGUF path once, fail loudly if nothing matched (prevents - // `--model ""` which causes confusing downstream errors). - cmd += `MODEL_FILE=${ggufPath} && { [ -n "$MODEL_FILE" ] && [ -f "$MODEL_FILE" ]; } || { echo "ERROR: No GGUF found on this host. 
Either download the model here, or switch to the server where it's cached."; exit 1; } && `; - } - const modelArg = _isWindows() ? `"${ggufPath}"` : `"$MODEL_FILE"`; - // Prefer the native llama-server binary on Linux — its minja templating - // renders modern GGUF chat templates that the Python bindings' Jinja2 - // rejects (do_tojson ensure_ascii). Fall back to llama_cpp.server. + if (f.unified_mem && !_cpuOnly && _isWindows() && _isCudaTarget) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `; + if (_isWindows() && !_cpuOnly) cmd += _gpuEnvPrefix(gpuId, true); + const modelArg = `"${ggufPath}"`; + // Prefer native llama-server. The backend bootstrap resolves/builds the + // right binary (Vulkan/HIP/CUDA/Metal/CPU), so keep the generated command + // as a validator-safe binary + args with no shell chaining. // Don't suppress stderr — surface real errors (missing file, lib, OOM). // Optional perf/fit flags from a hardware profile (see services/hwfit/ // profiles.py). n_cpu_moe offloads MoE expert layers to CPU when the model @@ -575,9 +641,16 @@ export function _buildServeCmd(f, modelName, backend) { _lcExtra += ` --n-cpu-moe ${_ncm}`; _lcpExtra += ` --n_cpu_moe ${_ncm}`; // llama-cpp-python uses underscores } + // Flash-attn default = auto: native llama-server picks whether to + // enable based on the build/model; explicit ON (the Flash-attn + // toggle in the form) forces it. "auto" is a meaningful arg, not + // omission — older builds without flash-attn ignore it cleanly, + // newer ones get the speedup without the user having to know. if (f.flash_attn && !_cpuOnly) { _lcExtra += ' --flash-attn on'; _lcpExtra += ' --flash_attn true'; + } else if (!_cpuOnly) { + _lcExtra += ' --flash-attn auto'; } if (_kv) { _lcExtra += ` --cache-type-k ${_kv} --cache-type-v ${_kv}`; @@ -613,12 +686,11 @@ export function _buildServeCmd(f, modelName, backend) { // llama-cpp-python takes the projector via --clip_model_path. 
_lcpExtra += ` --clip_model_path "${f._mmproj_path}"`; } - const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}${_lcpExtra}`; if (_isWindows()) { + const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}${_lcpExtra}`; cmd += _lcpServer; } else { cmd += `${lcPrefix}llama-server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} -ngl ${f.ngl || '99'} -c ${f.ctx || '8192'}${_lcExtra}`; - cmd += ` || ${_lcpServer}`; } } else if (backend === 'ollama') { const ollamaPort = f.port || '11434'; @@ -652,7 +724,7 @@ export function _buildServeCmd(f, modelName, backend) { } } else if (backend === 'diffusers') { const gpuStr = f.gpus?.trim(); - if (gpuStr) cmd += `CUDA_VISIBLE_DEVICES=${gpuStr} `; + cmd += _gpuEnvPrefix(gpuStr); const diffusersPy = _isWindows() ? 'python' : _py3Bin; cmd += `${diffusersPy} scripts/diffusion_server.py --model ${modelName} --port ${f.port || '8100'}`; if (f.diff_dtype && f.diff_dtype !== 'bfloat16') cmd += ` --dtype ${f.diff_dtype}`; @@ -771,6 +843,14 @@ async function _fetchDependencies() { if (_depPort) _pkgParams.set('ssh_port', _depPort); if (_depVenv) _pkgParams.set('venv', _depVenv); } + // Pass the detected backend so the server can build a single + // OS+backend-aware install command per row (e.g. add nvidia-cuda-toolkit + // on a CUDA-Debian box, vulkan-headers on a Vulkan-Arch box, etc.) + // instead of dumping every distro's syntax as a hint. + const _depBackend = String(_hwfitCache?.system?.backend || '').toLowerCase(); + if (_depBackend && _hwfitCache?._scannedHost === _depHost) { + _pkgParams.set('backend', _depBackend); + } const resp = await fetch('/api/cookbook/packages' + (_pkgParams.toString() ? '?' 
+ _pkgParams.toString() : '')); const data = await resp.json(); const pkgs = data.packages || []; @@ -832,18 +912,61 @@ async function _fetchDependencies() { // For backends with a recipe catalog (vllm / sglang / llama_cpp), // append a caret button that toggles a per-row recipe panel below. const hasRecipe = RECIPE_BACKENDS.has(pkg.name); - const recipeCaret = hasRecipe - ? `` - : ''; + // Standalone recipe-caret button removed — the "Pick install + // command" action lives inside the Installed ▾ dropdown menu + // (see _showDepMenu) so each row only has ONE caret to click. + // Kept the variable so downstream concat code stays the same. + const recipeCaret = ''; const recipePanel = hasRecipe ? _recipePanelHtml(pkg.name) : ''; + // When llama_cpp (or any future engine) reports build_deps_missing + // from its system_prereqs probe, surface a one-tap install button + // that fires the OS package manager on the target via + // /api/cookbook/install-system-deps. Keeps the user inside Cookbook + // instead of forcing them out to a shell to apt/pacman/dnf. + const _bdm = Array.isArray(pkg.build_deps_missing) ? pkg.build_deps_missing : []; + const _buildDepsBtn = _bdm.length + ? `` + : ''; + // Render the target-specific install command as a compact mono box + // when the server resolved it (target's /etc/os-release was readable + // AND the backend is known). The box doubles as the source of truth + // for the "Install build deps" button's failure toast — both surfaces + // show the same string for the same target. + const _instCmd = (_bdm.length && pkg.install_cmd_for_target) ? String(pkg.install_cmd_for_target) : ''; + const _instCmdOs = pkg.install_cmd_os ? String(pkg.install_cmd_os) : ''; + const _instCmdBe = pkg.install_cmd_backend ? String(pkg.install_cmd_backend) : ''; + const _instLabel = (_instCmdOs && _instCmdBe) ? `${_instCmdOs} + ${_instCmdBe}` : (_instCmdOs || _instCmdBe || 'this target'); + const _instCmdBox = _instCmd + ? `
` + + `
Install on ${esc(_instLabel)}:
` + + `
` + + `${esc(_instCmd)}` + + `` + + `
` + : ''; + // Partial-state row (replaces the cryptic yellow "Partial ▾" tag). + // Renders inline as a yellow banner with two clear actions: one-tap + // Install (runs the reinstall in cookbook) or Copy command (paste + // into a terminal). Same content surfaces whether the user solves + // it from inside Cookbook or from a shell. + const _gpuWheelCmd = 'CMAKE_ARGS="-DGGML_CUDA=on" python3 -m pip install --user --break-system-packages --force-reinstall --no-cache-dir "llama-cpp-python[server]" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124'; + const _gpuUpgradeBox = (pkg.partial && pkg.partial_action === 'reinstall_llama_cpp_cuda') + ? `
` + + `Installed CPU-only — GPU detected on this target. Upgrade for ~10× faster inference.` + + `` + + `` + + `
` + : ''; return `
` + `
` + `
${_depGlyphHtml(pkg.name)}${esc(pkg.name)}
` + `
${esc(pkg.desc)}
` + note + updateNote + + _instCmdBox + `
` + _rebuildBtn + + _buildDepsBtn + `${esc(pkg.category)}` + _statusTag(pkg, isLocal, isSystemDep, winBlocked) + recipeCaret @@ -985,8 +1108,15 @@ async function _fetchDependencies() { if (!res.ok || !data.ok) { // FastAPI HTTPException returns {detail: …}; the route's own // path returns {ok:false, error:…}. Surface whichever we get. + // Long duration + an OK button — the default 1.2s toast was + // disappearing before the user could read multi-clause errors + // like "tmux missing on remote". const reason = data.detail || data.error || `HTTP ${res.status}`; - uiModule.showToast('Install failed: ' + String(reason).slice(0, 200)); + uiModule.showToast('Install failed: ' + String(reason).slice(0, 400), { + duration: 20000, + action: 'OK', + onAction: () => {}, + }); return; } // _dep flags this as a pip dependency/driver install (not a servable @@ -996,12 +1126,16 @@ async function _fetchDependencies() { if (statusEl) { statusEl.textContent = upgrade ? 'Updating...' : 'Installing...'; statusEl.disabled = true; } uiModule.showToast(`${upgrade ? 'Updating' : 'Installing'} ${pkgName} on ${targetHost}...`); } catch (err) { - uiModule.showToast('Install failed: ' + err.message); + uiModule.showToast('Install failed: ' + err.message, { + duration: 20000, + action: 'OK', + onAction: () => {}, + }); } } // Wire install buttons (not-installed packages) - list.querySelectorAll('.cookbook-dep-install:not(.cookbook-dep-recipe-run)').forEach(btn => { + list.querySelectorAll('.cookbook-dep-install:not(.cookbook-dep-recipe-run):not(.cookbook-dep-install-sysdeps)').forEach(btn => { btn.addEventListener('click', async (e) => { e.stopPropagation(); const pipName = btn.dataset.depPip; @@ -1010,6 +1144,143 @@ async function _fetchDependencies() { }); }); + // Wire "Install build deps" buttons — surfaced on rows whose + // system_prereqs are missing (e.g. llama_cpp with no cmake on the + // target). 
One-tap call to /api/cookbook/install-system-deps; the + // route enforces a per-package allowlist and uses passwordless + // sudo only, so it can never silently hang or stretch beyond the + // build-toolchain set the catalog declares. + // "Partial ▾" upgrade tag: clicking it fires the action-specific + // install routine (currently only `reinstall_llama_cpp_cuda` — + // forces pip install with the abetlen CUDA wheel index to add GPU + // offload). Same install flow used at launch-time auto-fix, but + // user-initiated here so they don't have to launch + wait + retry. + list.querySelectorAll('.cookbook-dep-partial').forEach(btn => { + btn.addEventListener('click', async (e) => { + e.stopPropagation(); + const action = btn.dataset.depPartialAction || ''; + if (action !== 'reinstall_llama_cpp_cuda') return; + const isLocal = btn.dataset.depTarget === 'local'; + if (!isLocal) { + const depsServerSel = document.getElementById('hwfit-deps-server'); + if (depsServerSel) _applyServerSelection(depsServerSel.value); + } + const targetLabel = isLocal ? 
'this server' : (_envState.remoteHost || 'remote'); + const cmd = 'CMAKE_ARGS="-DGGML_CUDA=on" python3 -m pip install --user --break-system-packages --force-reinstall --no-cache-dir "llama-cpp-python[server]" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124'; + try { + const reqBody = { + repo_id: 'llama-cpp-python-cuda', + cmd, + remote_host: _envState.remoteHost || undefined, + ssh_port: _getPort(_envState.remoteHost) || undefined, + platform: _envState.platform || undefined, + }; + const res = await fetch('/api/model/serve', { + method: 'POST', credentials: 'same-origin', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(reqBody), + }); + const data = await res.json().catch(() => ({})); + if (res.ok && data.ok) { + const payload = { repo_id: 'pip llama-cpp-python[CUDA]', _cmd: cmd, remote_host: _envState.remoteHost || '', _dep: true }; + _addTask(data.session_id, 'pip llama-cpp-python[CUDA]', 'download', payload); + uiModule.showToast(`Reinstalling llama-cpp-python with CUDA wheels on ${targetLabel} (~1-3 min)…`, 4000); + } else { + uiModule.showToast('Upgrade failed: ' + String(data.detail || data.error || `HTTP ${res.status}`).slice(0, 300), { + duration: 20000, action: 'OK', onAction: () => {}, + }); + } + } catch (err) { + uiModule.showToast('Upgrade request failed: ' + err.message, { duration: 20000, action: 'OK', onAction: () => {} }); + } + }); + }); + + // Inline command-box "Copy" buttons — one per row that has a + // resolved per-target install command. Same string surfaces here + // and in the toast/diagnosis so the user always sees one answer. 
+ list.querySelectorAll('.cookbook-dep-cmd-copy').forEach(btn => { + btn.addEventListener('click', async (e) => { + e.stopPropagation(); + const cmd = btn.dataset.depCmdCopy || ''; + if (!cmd) return; + try { await navigator.clipboard.writeText(cmd); } + catch { /* fall through */ } + const orig = btn.textContent; + btn.textContent = 'Copied'; + setTimeout(() => { if (btn.isConnected) btn.textContent = orig; }, 1200); + }); + }); + list.querySelectorAll('.cookbook-dep-install-sysdeps').forEach(btn => { + btn.addEventListener('click', async (e) => { + e.stopPropagation(); + const names = (btn.dataset.depSysdeps || '').split(',').map(s => s.trim()).filter(Boolean); + if (!names.length) return; + const isLocal = btn.dataset.depTarget === 'local'; + // Pull the per-target install command from the sibling box on + // the same row, so failure toasts surface the SAME line the + // user already sees inline. No duplicated formatting logic. + const _row = btn.closest('.cookbook-dep-row'); + const _cmdBox = _row?.querySelector('.cookbook-dep-install-cmd'); + const _resolvedCmd = _cmdBox?.dataset.depCmd || ''; + // Mirror _installDep: the Dependencies tab has its own server + // picker that can override _envState. Apply it before reading + // remoteHost, otherwise the install silently runs on the wrong + // target (container ends up with the packages, the real remote + // host stays broken, success toast misleads the user). + if (!isLocal) { + const depsServerSel = document.getElementById('hwfit-deps-server'); + if (depsServerSel) _applyServerSelection(depsServerSel.value); + } + const targetLabel = isLocal ? 
'this server' : (_envState.remoteHost || 'remote'); + const origText = btn.textContent; + btn.textContent = 'Installing…'; + btn.disabled = true; + try { + const body = { packages: names }; + if (!isLocal && _envState.remoteHost) { + body.remote_host = _envState.remoteHost; + const _p = _getPort(_envState.remoteHost); + if (_p) body.ssh_port = _p; + } + const res = await fetch('/api/cookbook/install-system-deps', { + method: 'POST', credentials: 'same-origin', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(body), + }); + const data = await res.json().catch(() => ({})); + if (res.ok && data.ok) { + uiModule.showToast(`Installed ${names.join(', ')} on ${targetLabel}. Refreshing…`, 4000); + // Refresh the deps panel so the row updates (prereqs now present). + try { await _fetchDependencies(); } catch {} + } else { + const reason = data.error || data.detail || `HTTP ${res.status}`; + // Append the per-target install command (if we already know it + // from the row) so the user can copy-paste it without leaving + // the toast. Otherwise just surface the error. + const _suffix = _resolvedCmd ? `\n\nRun on ${targetLabel}: ${_resolvedCmd}` : ''; + uiModule.showToast('Build-deps install failed: ' + String(reason).slice(0, 300) + _suffix, { + duration: 25000, + action: _resolvedCmd ? 'Copy command' : 'OK', + onAction: async () => { + if (_resolvedCmd) { + try { await navigator.clipboard.writeText(_resolvedCmd); } catch {} + } + }, + }); + btn.textContent = origText; + btn.disabled = false; + } + } catch (err) { + uiModule.showToast('Install request failed: ' + err.message, { + duration: 20000, action: 'OK', onAction: () => {}, + }); + btn.textContent = origText; + btn.disabled = false; + } + }); + }); + // ── Recipe panel wiring (per-backend dropdown with model + commands) ── // Caret toggle: shows/hides the panel directly below the backend row. 
list.querySelectorAll('[data-dep-recipe-toggle]').forEach(btn => { @@ -1577,8 +1848,22 @@ function _wireTabEvents(body) { if (dlBtn && dlInput) { function _stripHfUrl(input) { let repo = input.trim(); + // Strip a leading `hf download` / `hf-cli download` / `huggingface-cli + // download` wrapper so a paste from CLI docs Just Works. Drop the + // command prefix; the rest is parsed by the existing strippers. + repo = repo.replace(/^(?:huggingface-cli|hf-cli|hf)\s+(?:download|d)\s+/i, ''); + // Strip the `hf://` (and `huggingface://`) scheme — the HF CLI + // accepts it as an alias and users naturally copy it. Same effect + // as the bare `org/repo[/file.gguf]` form after the strip. + repo = repo.replace(/^(?:hf|huggingface):\/\//i, ''); // Strip Ollama-style "hf.co/" prefix if present (e.g. hf.co/unsloth/...:tag) repo = repo.replace(/^hf\.co\//, ''); + // Full HF blob/resolve URL → turn into `org/repo/path/to/file` so + // the downstream `_splitRepoFile` can pick the file out. + // Matches: https://huggingface.co/org/repo/blob/branch/path/to/file.gguf + // https://huggingface.co/org/repo/resolve/branch/path/to/file.gguf + const hfBlob = repo.match(/^https?:\/\/huggingface\.co\/([^/]+\/[^/?#]+)\/(?:blob|resolve)\/[^/?#]+\/([^?#]+)/); + if (hfBlob) return `${hfBlob[1]}/${hfBlob[2]}`; const hfMatch = repo.match(/^https?:\/\/huggingface\.co\/([^/]+\/[^/?#]+(?::[^/?#\s]+)?)/); if (hfMatch) repo = hfMatch[1]; return repo; @@ -1590,6 +1875,22 @@ function _wireTabEvents(body) { if (!m) return { repo: raw, include: null }; return { repo: m[1], include: `*${m[2]}*` }; } + // Split `org/repo/path/to/file.gguf` (or `.safetensors`/`.bin`) into + // repo + exact file include. Lets the user paste a path straight out + // of a HuggingFace "Files and versions" page or a copied filename + // without needing to peel the repo/file apart by hand. Returns null + // when the input doesn't look like a deep file path. 
+ function _splitRepoFile(raw) { + // Must have at least 3 slash-separated segments AND end in a + // model-file extension to avoid eating Ollama tags or repo-only + // inputs like `org/repo`. + const parts = raw.split('/'); + if (parts.length < 3) return null; + const fname = parts[parts.length - 1]; + if (!/\.(gguf|safetensors|bin|pt|pth|onnx|mlx)(\?[^?]*)?$/i.test(fname)) return null; + const repo = parts.slice(0, 2).join('/'); + return { repo, include: fname.replace(/\?.*$/, '') }; + } // Ollama-library name. Matches `qwen2.5:14b`, `llama3:latest`, and the // (rare) `library/:` form which we normalize by stripping the // namespace. The backend's _is_ollama_download check expects the same @@ -1605,7 +1906,14 @@ function _wireTabEvents(body) { const rawRepo = _stripHfUrl(dlInput.value); if (!rawRepo) return; const ollamaName = _ollamaName(rawRepo); - const { repo, include: autoInclude } = ollamaName ? { repo: ollamaName, include: null } : _splitRepoTag(rawRepo); + // Prefer the deep-file split (org/repo/file.gguf → repo + exact + // include) over the tag split (org/repo:tag → glob include), and + // both over the plain repo case. Ollama names still take priority + // since they go through a different backend. + const _fileSplit = !ollamaName ? _splitRepoFile(rawRepo) : null; + const { repo, include: autoInclude } = ollamaName + ? { repo: ollamaName, include: null } + : (_fileSplit || _splitRepoTag(rawRepo)); // HuggingFace repo IDs must be `org/model`. A bare model name would 404 // at snapshot_download time with a raw traceback, so reject it up front. 
// Ollama names (single-segment with a tag) skip this check — they go diff --git a/static/js/cookbookServe.js b/static/js/cookbookServe.js index f3b5842b2..aba3f7926 100644 --- a/static/js/cookbookServe.js +++ b/static/js/cookbookServe.js @@ -10,6 +10,7 @@ import { providerLogo } from './providers.js'; import { modelColor } from './chatRenderer.js'; import { bindMenuDismiss, dismissOrRemove } from './escMenuStack.js'; import { openCookbookDependencies } from './cookbook-diagnosis.js'; +import { _hwfitCache } from './cookbook-hwfit.js'; // Shared state/functions injected by init() let _envState; @@ -495,6 +496,7 @@ function _rerenderCachedModels() { item.classList.remove('doclib-card-expanded'); item.style.flexDirection = ''; item.style.alignItems = ''; + item.style.maxHeight = ''; list.style.minHeight = ''; list.style.maxHeight = ''; return; @@ -508,6 +510,7 @@ function _rerenderCachedModels() { c.classList.remove('doclib-card-expanded'); c.style.flexDirection = ''; c.style.alignItems = ''; + c.style.maxHeight = ''; }); const shortName = repo.split('/').pop(); @@ -620,13 +623,31 @@ function _rerenderCachedModels() { // stays as the source-of-truth so every existing change handler // (updateBackendVisibility, runtime readiness, command builder) // still fires via dispatchEvent('change') on selection. - panelHtml += ``; + panelHtml += ``; panelHtml += ``; + // Inference mode pill (llama.cpp only) — lives directly to the + // RIGHT of Backend in Row 1 so the engine and the GPU/CPU choice + // are read together. .hwfit-backend-llamacpp visibility class + // hides it when the user switches to vLLM/SGLang/Ollama. + { + // Default CPU — works on every host without GPU/wheel matching + // hassle. User picks GPU explicitly if they have the right setup + // (avoids "click Launch → silent CPU fallback because the wheel + // is CPU-only" surprises that ate hours of debugging). 
+ // Layout: CPU on left, GPU on right → mode-right triggers when + // GPU is selected so the sliding pill animates rightward. + // Default to GPU mode when hwfit detected a GPU backend on the + // current target — CPU as a global default sent the user down a + // 35GB-model-on-CPU rabbit hole (-ngl 0, no flash-attn, no GPU + // offload). Falls back to CPU only when hwfit detected no GPU + // (cpu_x86 / generic / unscanned) or the cache is stale. + const _hwBackend = String(_hwfitCache?.system?.backend || '').toLowerCase(); + const _hwScanMatch = String(_hwfitCache?._scannedHost || '') === String(_envState.remoteHost || ''); + const _llamaModeDefault = (_hwScanMatch && ['cuda', 'rocm', 'vulkan', 'metal', 'mps', 'apple'].includes(_hwBackend)) ? 'gpu' : 'cpu'; + const _llamaMode = sv('llama_mode', _llamaModeDefault); + panelHtml += ``; + } panelHtml += ``; - // Dtype lives in Row 1 (next to venv) — it's the first knob people - // change when matching the model to the box, so it earns top-row - // real estate over Row 2's launch-tuning controls. - panelHtml += ``; const defaultPort = defaultBackend === 'ollama' ? '11434' : _nextAvailablePort(); panelHtml += ``; const _activeGpus = (defaultGpus || '').split(',').map(s => s.trim()).filter(Boolean); @@ -642,7 +663,7 @@ function _rerenderCachedModels() { // separates the GPU chiclets from the GPU Mem field that follows // (asked-for breathing room; 4px on either side felt cramped on // the GPU-Mem boundary). - const _gpusLabelHtml = ``; + const _gpusLabelHtml = ``; // Save / saved-configs split button — sits at the right end of Row 1. panelHtml += _slotsHtml; panelHtml += `
`; @@ -664,10 +685,12 @@ function _rerenderCachedModels() { // (Swap, KV Cache, Attention backend, Env vars, llama.cpp batch/ubatch) // moved to the Advanced fold below to keep this row scannable. panelHtml += `
`; - // Order: TP → Context → Max Seqs → GPUs → GPU Mem. - // Dtype moved up to Row 1. GPUs moved here next to GPU Mem so the - // "which devices + how much of them" decisions sit adjacent. Max - // Seqs follows Context per the "request-shape" cluster. + // Order: Dtype → TP → Context → Max Seqs → GPUs → GPU Mem. + // Dtype moved down from Row 1 to make space for the Inference pill + // (llama.cpp GPU/CPU toggle, llamacpp-only). GPUs lives next to + // GPU Mem so "which devices + how much" sit adjacent. Max Seqs + // follows Context per the "request-shape" cluster. + panelHtml += ``; panelHtml += ``; // ctx resets to the model's max on every panel open (the real ctx slider // lives in the Scan/Download toolbar — see cookbook.js .hwfit-ctx-control). @@ -711,12 +734,6 @@ function _rerenderCachedModels() { // container — left an empty trailing column gap on wide modals). panelHtml += ``; panelHtml += `
`; - // Advanced llama.cpp row (Batch / UBatch — moved out of Core for the - // same "rarely touched" reason as the vLLM extras above). - panelHtml += `
`; - panelHtml += ``; - panelHtml += ``; - panelHtml += `
`; // Row 2b: Diffusers settings const diffDtypeOpts = ['bfloat16','float16','float32'].map(d => ``).join(''); const deviceMapOpts = ['balanced','auto','sequential'].map(d => ``).join(''); @@ -740,13 +757,19 @@ function _rerenderCachedModels() { panelHtml += `
`; panelHtml += ``; panelHtml += ``; - if (_rp_name) panelHtml += ``; + // Always-render the Reasoning Parser, Expert Parallel, and MoE Env + // checkboxes — the model-family detection above is a hint, not a + // hard gate. User asked to keep these visible regardless so that + // a borderline-undetected MoE/reasoning model can still toggle + // them without dropping back to the raw command box. + panelHtml += ``; panelHtml += ``; panelHtml += ``; // Inline the previously-second vLLM checks row so Expert Parallel / // Speculative / MoE Env sit next to Prefix Caching with no gap. All - // three are vLLM-only — class-gated so they hide on SGLang. - if (_opts2_row3.flags.includes('--enable-expert-parallel')) panelHtml += ``; + // three are vLLM-only — class-gated so they hide on SGLang. Always + // render so the user can flip them on for any MoE model. + panelHtml += ``; { const _specDef = _opts2_row3.spec || { method: 'mtp', tokens: 3 }; const _specMethod = sv('spec_method', _specDef.method); @@ -757,27 +780,39 @@ function _rerenderCachedModels() { ``).join(''); panelHtml += ``; } - if (_opts2_row3.envVars.length) panelHtml += ``; + // Always-render MoE Env Vars — the env vars dict is empty for + // most dense models (toggle is a no-op then), but for MoE families + // the user can still flip it on without re-fitting model detection. + panelHtml += ``; panelHtml += `
`; - // Row 2c: llama.cpp fit/perf flags (set by Auto profiles, editable by hand) + // ── llama.cpp Advanced — grouped by purpose ── + // Three clean field rows + one checkbox row, all selects/inputs the + // same 28px height (no per-field `top:-Npx` nudges). Groups follow + // user mental model: (1) where it runs on GPU, (2) how memory is + // shaped, (3) how requests are batched, (4) on/off toggles. const _kvOpts = ['', 'q4_0', 'q8_0', 'f16'].map(k => ``).join(''); const llamaFitOpts = ['', 'off', 'on'].map(d => ``).join(''); const llamaSplitModeOpts = ['', 'layer', 'tensor', 'row', 'none'].map(d => ``).join(''); + + // Group 1 — GPU placement (GPU-only, hides in CPU mode) + panelHtml += `
`; + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; + panelHtml += `
`; + + // Group 2 — Memory tuning (KV cache + MoE-on-CPU + Fit policy) panelHtml += `
`; - panelHtml += ``; - panelHtml += ``; - panelHtml += ``; - panelHtml += ``; + panelHtml += ``; + panelHtml += ``; panelHtml += ``; panelHtml += `
`; - // Row 2d: native llama-server placement/runtime controls. These are - // explicit overrides for known-good advanced presets; blank keeps - // llama.cpp/profile defaults. + + // Group 3 — Request batching (Batch / UBatch / Parallel) panelHtml += `
`; - panelHtml += ``; - panelHtml += ``; - panelHtml += ``; - panelHtml += ``; + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; panelHtml += `
`; // Auto-profile chips row removed — visual fit with the rest of the // serve panel was off, and the manual ctx/n_cpu_moe/cache controls @@ -791,12 +826,19 @@ function _rerenderCachedModels() { panelHtml += `GPU memory:`; panelHtml += `checking…`; panelHtml += ``; - // Row 3a: Checkboxes (llama.cpp-only) + // Group 4 — llama.cpp toggles. Single row of checkboxes, GPU-only + // ones (Flash Attn, Unified Memory, Allow CPU overflow) hide + // automatically in CPU mode. Order: perf-critical → safety → I/O → + // niche. MTP Spec sits last because it owns its own numstep widget + // and is the widest item. panelHtml += `
`; - panelHtml += ``; - panelHtml += ``; - panelHtml += ``; - panelHtml += ``; + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; + panelHtml += ``; panelHtml += `
`; // Row 3b: Checkboxes (diffusers) panelHtml += `
`; @@ -859,6 +901,21 @@ function _rerenderCachedModels() { const panel = item.querySelector('.hwfit-serve-panel'); // Scroll the serve panel into view within its nearest scrollable ancestor requestAnimationFrame(() => panel.scrollIntoView({ block: 'nearest', behavior: 'smooth' })); + // Firefox-mobile fallback: the CSS that grows the cached-list and + // expanded card uses :has(.doclib-card-expanded), which Firefox + // mobile doesn't support — so the panel stays collapsed and the + // form is unusable. Pin explicit px heights here. On Chromium/ + // WebKit the !important CSS still wins, so this is a no-op there. + // (See project_skills_expand_firefox memory note.) + requestAnimationFrame(() => { + try { + const _itemH = Math.max(item.scrollHeight, item.getBoundingClientRect().height); + if (_itemH > 0) item.style.maxHeight = _itemH + 'px'; + const _listH = Math.max(list.scrollHeight, list.getBoundingClientRect().height); + if (_listH > 0) list.style.maxHeight = _listH + 'px'; + list.style.minHeight = _listH + 'px'; + } catch {} + }); // Build command preview function updateCmd() { @@ -1859,6 +1916,49 @@ function _rerenderCachedModels() { updateCmd(); }); }); + // llama.cpp GPU/CPU mode-toggle pill wiring. Clicking GPU or CPU + // flips the .active classes + .mode-right marker (so the sliding + // pill matches Agent/Chat), updates the hidden data-field input, + // and fires a change event so the existing field-change handler + // rebuilds the serve cmd (sets -ngl 99 vs -ngl 0). + panel.querySelectorAll('[data-llama-mode-toggle]').forEach(group => { + group.querySelectorAll('.mode-toggle-btn').forEach(btn => { + btn.addEventListener('click', (e) => { + e.preventDefault(); e.stopPropagation(); + const want = btn.dataset.llamaMode; + if (!want) return; + group.querySelectorAll('.mode-toggle-btn').forEach(b => { + const isActive = b.dataset.llamaMode === want; + b.classList.toggle('active', isActive); + b.setAttribute('aria-pressed', isActive ? 
'true' : 'false'); + }); + group.classList.toggle('mode-right', want === 'gpu'); + const hidden = group.parentElement.querySelector('[data-field="llama_mode"]'); + if (hidden) { + hidden.value = want; + hidden.dispatchEvent(new Event('change', { bubbles: true })); + } + // Hide every GPU-only control (chiclets, Tensor Split, + // Split Mode, Main GPU, Flash Attn, Unified Memory, etc.) + // in CPU mode — `-ngl 0` ignores them and showing them + // implies they matter. + panel.classList.toggle('cookbook-llama-cpu-mode', want === 'cpu'); + panel.querySelectorAll('.cookbook-llama-gpu-only').forEach(el => { + el.style.display = (want === 'cpu') ? 'none' : ''; + }); + }); + }); + }); + // Apply the CPU-mode visibility on first render too, so a saved + // preset that loaded with llama_mode=cpu hides GPU controls + // immediately instead of flashing them then disappearing. + { + const _saved = panel.querySelector('[data-field="llama_mode"]')?.value || 'gpu'; + if (_saved === 'cpu') { + panel.classList.add('cookbook-llama-cpu-mode'); + panel.querySelectorAll('.cookbook-llama-gpu-only').forEach(el => { el.style.display = 'none'; }); + } + } // Themed +/- buttons next to spec_tokens — step the adjacent number input. panel.querySelectorAll('.hwfit-numstep-btn').forEach(btn => { btn.addEventListener('click', (e) => { @@ -2025,6 +2125,140 @@ function _rerenderCachedModels() { }); return; } + // llama.cpp VRAM-fit preflight. Catches the silent-CPU-fallback + // trap: when the model + KV cache exceed the selected GPUs' free + // VRAM, llama-cpp-python doesn't error — it pushes layers/KV to + // CPU and inference crawls at sub-1 tok/s. Off by default; can + // be bypassed per-launch via the dialog's "Allow CPU overflow" + // action, OR persistently by ticking the same-named checkbox. 
+ if (serveState.backend === 'llamacpp' + && String(serveState.llama_mode || 'gpu') !== 'cpu' + && !serveState.llama_cpu_overflow) { + try { + const _ctx = Math.max(1, parseInt(serveState.ctx, 10) || 8192); + // Model size on disk — close enough for GPU footprint of a GGUF. + const _modelBytes = Number(m?.size_bytes || 0) || Math.round((Number(m?.size_gb || 0)) * 1024 * 1024 * 1024); + const _modelGb = _modelBytes / (1024 ** 3); + // KV cache heuristic. ~0.7MB / token / 7.5GB-of-model at fp16 + // KV, scaled linearly by model size. Imperfect but covers + // the common 7B–70B range within ~20% — good enough to catch + // overflow before it silently happens. + const _kvGbPerToken = _modelGb > 0 ? (_modelGb / 7.5) * 0.0007 : 0.0007; + const _kvGb = _ctx * _kvGbPerToken; + const _needGb = _modelGb + _kvGb; + const _selStr = (serveState.gpus || '').trim(); + const _selIdx = _selStr ? _selStr.split(',').map(s => parseInt(s.trim(), 10)).filter(n => Number.isFinite(n)) : [0]; + // Fetch FRESH GPU data per-launch — the hwfit cache may be + // stale or for a different host (e.g. user switched server + // picker without scanning), which used to silently skip the + // preflight and let the launch silently fall to CPU. + let _hwGpus = []; + try { + const _gh = (_envState.remoteHost || '').trim(); + const _gp = new URLSearchParams(); + if (_gh) { + _gp.set('host', _gh); + const _sp = (_serverByVal?.(_envState.remoteServerKey || _gh) || {}).port; + if (_sp) _gp.set('ssh_port', _sp); + } + const _gr = await fetch('/api/cookbook/gpus' + (_gp.toString() ? '?' + _gp : ''), { credentials: 'same-origin' }); + if (_gr.ok) { + const _gd = await _gr.json(); + _hwGpus = Array.isArray(_gd) ? _gd : (_gd.gpus || []); + } + } catch {} + const _freeFor = (idx) => { + const g = _hwGpus[idx]; + const mb = g?.free_mb; + return Number.isFinite(mb) ? 
mb / 1024 : 0; + }; + const _selFreeGb = _selIdx.reduce((s, i) => s + _freeFor(i), 0); + // Skip the gate when we don't have any free-VRAM data (probe + // failed) — better to let the launch try than silently refuse + // on a missing data point. + if (_selFreeGb > 0 && _needGb > _selFreeGb && _modelGb > 0) { + // Suggest the smallest set of additional GPUs whose free + // VRAM closes the gap. Greedy by largest-free-first. + const _candidates = _hwGpus + .map((g, i) => ({ i, free: _freeFor(i) })) + .filter(x => !_selIdx.includes(x.i) && x.free > 0) + .sort((a, b) => b.free - a.free); + const _addGpus = []; + let _runFree = _selFreeGb; + for (const c of _candidates) { + _addGpus.push(c.i); _runFree += c.free; + if (_runFree >= _needGb) break; + } + const _canAddGpu = _runFree >= _needGb && _addGpus.length > 0; + // Recommend ctx that just-fits on current selection. + const _recCtxRaw = Math.floor((_selFreeGb - _modelGb) / _kvGbPerToken); + const _recCtx = Math.max(1024, Math.floor(_recCtxRaw / 1024) * 1024); + // Custom modal — styledConfirm only takes 2 buttons; this + // surface needs up to 4 actions (Reduce / Add GPUs / Allow / Cancel). 
+ const _action = await new Promise(resolve => { + const ov = document.createElement('div'); + ov.className = 'modal'; + ov.style.cssText = 'display:flex;align-items:center;justify-content:center;z-index:10050;position:fixed;inset:0;background:rgba(0,0,0,0.4);'; + const _btnRow = []; + if (_recCtx > 1024 && _recCtx < _ctx) { + _btnRow.push(``); + } + if (_canAddGpu) { + _btnRow.push(``); + } + _btnRow.push(``); + _btnRow.push(``); + ov.innerHTML = ''; + document.body.appendChild(ov); + ov.addEventListener('click', (e) => { + const b = e.target.closest('[data-vram-action]'); + if (b) { ov.remove(); resolve(b.dataset.vramAction); } + else if (e.target === ov) { ov.remove(); resolve('cancel'); } + }); + }); + if (_action === 'cancel' || !_action) { _restoreLaunchBtn(); return; } + if (_action === 'reduce') { + const _ctxEl = panel.querySelector('[data-field="ctx"]'); + if (_ctxEl) { + _ctxEl.value = String(_recCtx); + serveState.ctx = String(_recCtx); + _ctxEl.dispatchEvent(new Event('change', { bubbles: true })); + } + } else if (_action === 'add_gpus') { + for (const i of _addGpus) { + const _b = panel.querySelector(`.cookbook-gpu-btn[data-gpu="${i}"]`); + if (_b && !_b.classList.contains('active')) _b.click(); + } + const _gpusEl = panel.querySelector('[data-field="gpus"]'); + if (_gpusEl) serveState.gpus = _gpusEl.value; + } else if (_action === 'allow_cpu') { + const _ov = panel.querySelector('[data-field="llama_cpu_overflow"]'); + if (_ov) { + _ov.checked = true; + _ov.dispatchEvent(new Event('change', { bubbles: true })); + } + serveState.llama_cpu_overflow = true; + } + // After mutation, rebuild the serve cmd preview so the + // launched cmd matches what the user just chose. + try { updateCmd(); } catch {} + } + } catch (_e) { + // Preflight is best-effort — never block on its own failure. 
+ } + } // Pre-launch GPU probe — common failure pattern: vLLM/SGLang launched // on a host where no GPU is visible (driver missing, $CUDA_VISIBLE_DEVICES // unset, container without --gpus). Catch it BEFORE the user spends @@ -2151,6 +2385,38 @@ function _rerenderCachedModels() { if (venvVal) { _envState.env = 'venv'; _envState.envPath = venvVal; } else if (_srvEnvPath) { _envState.env = (_srvEnv === 'conda' ? 'conda' : 'venv'); _envState.envPath = _srvEnvPath; } if (gpusVal) _envState.gpus = gpusVal; + // Preflight: launching a GPU engine (llama.cpp / vLLM / SGLang) + // against the local-in-container target on a host whose hwfit + // scan reports no GPU backend. That falls through to a CPU build + // / CPU inference path and is usually NOT what the user wants — + // they typically have a host-side GPU (AMD/Vulkan, NVIDIA on a + // different box) that the container can't see. Surface this so + // the user can pick the host as a remote target instead, or + // confirm they really meant CPU. + try { + const _isLocalInContainer = !serveHost; // empty serveHost == cookbook container's local + const _wantsGpu = ['llamacpp', 'vllm', 'sglang', 'diffusers'].includes(serveState.backend); + const _detectedBackend = String(_hwfitCache?.system?.backend || '').toLowerCase(); + const _gpuBackends = ['cuda', 'rocm', 'vulkan', 'metal', 'mps', 'apple']; + if (_isLocalInContainer && _wantsGpu && _detectedBackend && !_gpuBackends.includes(_detectedBackend)) { + const _proceed = await window.styledConfirm( + `The local (in-container) target has no GPU backend detected (hwfit reports: "${_detectedBackend || 'none'}"). ${serveState.backend.toUpperCase()} will run on CPU only and may be unusably slow.\n\nIf this machine has a GPU on the host, add the host as a server in Settings and target that instead. 
Otherwise launch anyway for CPU inference.`, + { + title: 'No GPU on local target', + confirmText: 'Launch anyway (CPU)', + cancelText: 'Cancel', + danger: true, + }, + ); + if (!_proceed) { + if (typeof _restoreLaunchBtn === 'function') _restoreLaunchBtn(); + _envState.env = origEnv; + _envState.envPath = origEnvPath; + _envState.gpus = origGpus; + return; + } + } + } catch { /* preflight is best-effort */ } try { await _withSpinner(_launchBtn, async () => { // Pass the exact form values so the running task can be re-opened diff --git a/static/style.css b/static/style.css index cd1adeb8c..f4a331c47 100644 --- a/static/style.css +++ b/static/style.css @@ -15930,6 +15930,17 @@ body:not(.email-doc-split-active) #email-lib-modal.email-lib-fullscreen:not(.mod flex: 0 0 auto !important; height: auto !important; } + /* Launch tab's cached-list normally has `flex: 1 1 0; min-height: 0` + (so it fills the modal on desktop). On mobile the parent now has + `height: auto`, which collapses `flex: 1 1 0` to ZERO PX — + models render but the list area is invisible because the flex + basis is 0 and there's no free space to grow into. Switch to + content-sized flex so the list grows with its children. */ + #cookbook-modal .cookbook-group[data-backend-group="Serve"] > .admin-card > .hwfit-cached-list, + #cookbook-modal .cookbook-group[data-backend-group="Serve"] > .admin-card > #hwfit-cached-list { + flex: 0 0 auto !important; + overflow: visible !important; + } } #cookbook-modal .hwfit-cached-list { flex-shrink: 0; @@ -18560,7 +18571,7 @@ body.gallery-selecting .gallery-dl-btn, label and center it vertically so the descenders don't clip. 
*/ #hwfit-cache-select { min-width: 58px; - height: 32px; + height: 28px; display: inline-flex; align-items: center; justify-content: center; @@ -19316,7 +19327,7 @@ body.gallery-selecting .gallery-dl-btn, margin-bottom: 4px; } .cookbook-slot-btn { - min-width: 22px; height: 22px; + min-width: 22px; height: 28px; padding: 0 6px; font-size: 10px; font-weight: 600; border: 1px solid var(--border); @@ -19733,11 +19744,12 @@ body.gallery-selecting .gallery-dl-btn, font-size: 12px; padding: 0 6px; height: 28px; + box-sizing: border-box; } .hwfit-sf[data-field="backend"], .hwfit-sf[data-field="dtype"], .hwfit-sf[data-field="tp"] { - height: 32px; + height: 28px; box-sizing: border-box; width: 100%; } @@ -23569,7 +23581,7 @@ details.hwfit-serve-advanced > .hwfit-serve-checks:last-of-type { width: 51px; } #serve-search { - height: 32px; + height: 28px; } #cookbook-dl-btn { position: relative;