mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-22 12:45:25 -04:00
Cookbook UI: backend-aware env vars, always-show MoE/EP/Reasoning toggles, GPU default, Firefox-mobile expand
Frontend half of the backend-detection + per-OS install command work, plus a pile of mobile/UX fixes:

Backend awareness:
- _gpuEnvPrefix() picks CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES / nothing based on the detected hwfit backend + scanned-host match (so a stale ajax scan does not leak CUDA env vars into a kierkegaard Vulkan launch). Replaces 6 hardcoded CUDA_VISIBLE_DEVICES sites.
- GGML_CUDA_ENABLE_UNIFIED_MEMORY is only emitted when the backend is actually CUDA (it was leaking onto Vulkan/ROCm via saved presets).

Per-target install command:
- Dep rows render a single mono command box + Copy button when the server resolved pkg.install_cmd_for_target. Reused in the build-deps install failure toast so the toast and the row show the same line.
- Diagnosis patterns split cmake/g++/git out of the generic llama-cpp-python catch-all so a missing-cmake failure surfaces a cmake-specific message + per-distro Copy buttons.

Form toggles always visible:
- Reasoning Parser, Expert Parallel, MoE Env Vars are no longer gated on model-family detection. Detection still hints (parser tag shown when matched); the toggle works with sensible defaults otherwise. MiniMax M-series added to the MoE family detector so the auto-fill is right.

Mobile + GPU default:
- Launch tab cached-list flex collapsed to 0px on mobile because the desktop `flex: 1 1 0` had no parent height to grow into. Override to `flex: 0 0 auto` in the cookbook mobile @media block.
- doclib-card expand on mobile (Firefox has no :has() support) pins explicit px heights so the launch form actually appears.
- llama_mode defaults to gpu when hwfit detected cuda/rocm/vulkan/metal on the current target, instead of always cpu (which was forcing -ngl 0 on first-open and burning 35GB models on CPU).
This commit is contained in:
@@ -461,6 +461,40 @@ export const ERROR_PATTERNS = [
|
||||
{ label: 'Copy install command', action: () => _copyText('curl -fsSL https://ollama.com/install.sh | sh') },
|
||||
],
|
||||
},
|
||||
// System build deps must be checked BEFORE the llama-server catch-all:
|
||||
// a `cmake: command not found` failure ALSO produces `llama-server:
|
||||
// command not found` later in the script (the build aborts then the
|
||||
// run line fails) — pattern order is first-match-wins, so without
|
||||
// these specific entries the user gets the misleading "install
|
||||
// llama-cpp-python[server]" suggestion when the actual blocker is a
|
||||
// missing OS-package toolchain that pip can't ship.
|
||||
{
|
||||
pattern: /cmake: command not found|cmake.*not found.*Could not/i,
|
||||
message: 'cmake is required to compile llama.cpp from source, but it is not installed on this server.',
|
||||
suggestion: 'Suggested action: install cmake via the OS package manager — apt: cmake build-essential / pacman: cmake base-devel / dnf: cmake gcc-c++ make / brew: cmake. Cookbook can do this automatically on the next launch if your user has passwordless sudo for apt/pacman/dnf.',
|
||||
fixes: [
|
||||
{ label: 'Open Dependencies', action: () => _openCookbookDependencies('llama_cpp') },
|
||||
{ label: 'Copy apt install', action: () => _copyText('sudo apt install -y cmake build-essential git') },
|
||||
{ label: 'Copy pacman install', action: () => _copyText('sudo pacman -Sy --needed cmake base-devel git') },
|
||||
{ label: 'Copy dnf install', action: () => _copyText('sudo dnf install -y cmake gcc gcc-c++ make git') },
|
||||
],
|
||||
},
|
||||
{
|
||||
pattern: /^(make|g\+\+|gcc): command not found|Could not find C\+\+ compiler/i,
|
||||
message: 'A C/C++ compiler (build-essential / base-devel) is required to compile llama.cpp.',
|
||||
fixes: [
|
||||
{ label: 'Open Dependencies', action: () => _openCookbookDependencies('llama_cpp') },
|
||||
{ label: 'Copy apt install', action: () => _copyText('sudo apt install -y build-essential') },
|
||||
],
|
||||
},
|
||||
{
|
||||
pattern: /^git: command not found/i,
|
||||
message: 'git is required to clone the llama.cpp source tree.',
|
||||
fixes: [
|
||||
{ label: 'Open Dependencies', action: () => _openCookbookDependencies('llama_cpp') },
|
||||
{ label: 'Copy apt install', action: () => _copyText('sudo apt install -y git') },
|
||||
],
|
||||
},
|
||||
{
|
||||
pattern: /llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'/i,
|
||||
message: 'llama-cpp-python server is not installed. Run: pip install "llama-cpp-python[server]"',
|
||||
@@ -688,11 +722,15 @@ export function _showDiagnosis(panel, diagnosis, sourceText) {
|
||||
copyBtn.addEventListener('click', async (e) => {
|
||||
e.stopPropagation();
|
||||
const bundle = _diagnosisCopyBundle(task, diagnosis, sourceText, suggestionText);
|
||||
try {
|
||||
await navigator.clipboard.writeText(bundle);
|
||||
// Use the shared helper which falls back to execCommand('copy') on
|
||||
// non-HTTPS origins (Tailscale IPs, LAN IPs, etc.) — navigator.clipboard
|
||||
// is silently a no-op on those, which is why the button appeared dead
|
||||
// for users on http://100.113.161.2:7011 over Tailscale/mobile.
|
||||
const ok = await _copyText(bundle);
|
||||
if (ok) {
|
||||
copyBtn.classList.add('copied');
|
||||
setTimeout(() => { if (copyBtn.isConnected) copyBtn.classList.remove('copied'); }, 1200);
|
||||
} catch (_) {}
|
||||
}
|
||||
});
|
||||
|
||||
const dismissBtn = document.createElement('button');
|
||||
@@ -757,7 +795,7 @@ export function _showDiagnosis(panel, diagnosis, sourceText) {
|
||||
});
|
||||
row.appendChild(btn);
|
||||
}
|
||||
body.appendChild(row);
|
||||
diag.appendChild(row);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -578,7 +578,9 @@ export async function _hwfitFetch(fresh = false) {
|
||||
const _cached = fresh ? null : _readScanCache(_sig);
|
||||
const wp = spinnerModule.createWhirlpool(18);
|
||||
if (_cached) {
|
||||
_hwfitCache = _cached;
|
||||
// Tag the restored cache with its host too (scan-sig keys cache per
|
||||
// host, so a hit here is always for the current remoteHost).
|
||||
_hwfitCache = { ..._cached, _scannedHost: remoteHost || '' };
|
||||
_hwfitRenderHw(hw, _cached.system);
|
||||
if (!remoteHost && _cached.system && _cached.system.platform) {
|
||||
_envState.platform = _cached.system.platform;
|
||||
@@ -750,7 +752,11 @@ export async function _hwfitFetch(fresh = false) {
|
||||
: _olRows;
|
||||
data.models = (data.models || []).concat(_olFiltered);
|
||||
}
|
||||
_hwfitCache = data;
|
||||
// Tag the cache with the host this scan was for, so downstream
|
||||
// code (_gpuEnvVarName, backend-aware command builders) can avoid
|
||||
// trusting a stale scan when the user switches the server picker
|
||||
// to a different target without re-running hwfit.
|
||||
_hwfitCache = { ...data, _scannedHost: remoteHost || '' };
|
||||
_hwfitRenderHw(hw, data.system);
|
||||
// Propagate local platform from hardware probe so _isWindows(task) works
|
||||
// for local tasks (menu items, shell commands, etc.).
|
||||
@@ -1679,7 +1685,7 @@ export function _expandModelRow(row, modelData) {
|
||||
} else if (runBackend === 'llamacpp') {
|
||||
const dir = `"$HOME/.cache/huggingface/hub/models--${modelData.name.replace(/\//g, '--')}/snapshots"`;
|
||||
const ggufPath = `$({ find ${dir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${dir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`;
|
||||
cmd = `MODEL_FILE=${ggufPath} && { [ -n "$MODEL_FILE" ] && [ -f "$MODEL_FILE" ]; } || { echo "ERROR: No GGUF found on this host. Download a GGUF quant or switch backend."; exit 1; } && llama-server --model "$MODEL_FILE" --host 0.0.0.0 --port 8080 -ngl 99 -c ${maxCtx} || python3 -m llama_cpp.server --model "$MODEL_FILE" --host 0.0.0.0 --port 8080 --n_gpu_layers 99 --n_ctx ${maxCtx}`;
|
||||
cmd = `llama-server --model "${ggufPath}" --host 0.0.0.0 --port 8080 -ngl 99 -c ${maxCtx} --flash-attn auto`;
|
||||
} else {
|
||||
cmd = `vllm serve ${modelData.name} --host 0.0.0.0 --port ${port}`;
|
||||
cmd += ` --tensor-parallel-size ${tp}`;
|
||||
|
||||
+336
-28
@@ -259,6 +259,15 @@ function _detectModelOptimizations(modelName) {
|
||||
opts.kvCacheDtype = 'fp8';
|
||||
opts.tips.push('fp8 KV cache required — bf16 OOMs at usable context');
|
||||
}
|
||||
// MiniMax MoE — Abab/M1/M2/M2.5/M2.7 are all MoE (Lightning Attention +
|
||||
// MoE in M1, full sparse MoE from M2 onward). They benefit from the
|
||||
// same --enable-expert-parallel flag as the Qwen/DeepSeek families,
|
||||
// and the toggle has to be detectable here for the Expert Parallel
|
||||
// checkbox in the serve form to render at all.
|
||||
else if (n.includes('minimax')) {
|
||||
opts.flags.push('--enable-expert-parallel');
|
||||
opts.tips.push('MoE expert parallel for MiniMax');
|
||||
}
|
||||
// Reasoning parser — applies independently of MoE detection. Without this
|
||||
// flag, models like MiniMax-M2.x, DeepSeek-R1, Qwen3 reasoning, GLM-4.x,
|
||||
// gpt-oss leak <think> blocks as plain text instead of separating them
|
||||
@@ -419,6 +428,38 @@ export function _psQuote(value) {
|
||||
return "'" + String(value ?? '').replace(/'/g, "''") + "'";
|
||||
}
|
||||
|
||||
// Pick the GPU-pinning env-var NAME for the backend hwfit detected on the
// currently targeted host. NVIDIA uses CUDA_VISIBLE_DEVICES; ROCm/HIP uses
// HIP_VISIBLE_DEVICES; Vulkan and Apple Metal don't take an index env var
// at all (and CUDA_VISIBLE_DEVICES is a silent no-op on those, which
// silently hides "wrong backend" config bugs).
//
// Takes no arguments: reads the module-level `_hwfitCache` (backend +
// `_scannedHost` tag) and `_envState.remoteHost`.
//
// Returns 'CUDA_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES', or '' when no
// pinning variable should be emitted. Callers turn the name into an actual
// `VAR=value ` / `$env:VAR="value"; ` prefix themselves.
function _gpuEnvVarName() {
  // Only emit a pinning env var when we POSITIVELY know the backend AND
  // the hwfit scan was actually run against the currently-targeted host.
  // Without the target-match guard, switching the server picker from an
  // NVIDIA box (cuda) to a local/Vulkan target preserved the stale
  // `cuda` backend in the cache, leaking `CUDA_VISIBLE_DEVICES=` into
  // launches that don't have an NVIDIA GPU at all. Default to "" when
  // unsure — the user sees a clean command and is prompted to scan.
  const cachedHost = String(_hwfitCache?._scannedHost || '');
  const currentHost = String(_envState.remoteHost || '');
  if (cachedHost !== currentHost) return '';

  // Backend string is compared case-insensitively; a missing cache or
  // missing `system.backend` collapses to '' and falls through to "no var".
  const sb = String(_hwfitCache?.system?.backend || '').toLowerCase();
  if (sb === 'cuda') return 'CUDA_VISIBLE_DEVICES';
  if (sb === 'rocm') return 'HIP_VISIBLE_DEVICES';
  return ''; // vulkan / metal / mps / apple / cpu / generic / unknown — no env-var pinning
}
|
||||
// Build the inline GPU-pinning prefix for a launch command.
//
//   gpuId     — device index / comma list to pin (string or number).
//               null, undefined and blank strings mean "don't pin".
//   isWindows — emit PowerShell `$env:` syntax instead of POSIX `VAR=value`.
//
// Returns e.g. 'CUDA_VISIBLE_DEVICES=0 ' (POSIX, trailing space so it can be
// prepended directly to the command) or '$env:CUDA_VISIBLE_DEVICES="0"; '
// (PowerShell). Returns '' when there is nothing to pin or when
// _gpuEnvVarName() reports no pinning variable for the current target.
function _gpuEnvPrefix(gpuId, isWindows = false) {
  // `??` rather than `||`: a numeric device index 0 is a valid GPU and
  // must not be mistaken for "no GPU selected".
  const id = String(gpuId ?? '').trim();
  if (!id) return '';

  const varName = _gpuEnvVarName();
  if (!varName) return '';

  return isWindows
    ? `$env:${varName}="${id}"; `
    : `${varName}=${id} `;
}
|
||||
|
||||
export function _buildEnvPrefix() {
|
||||
if (_isWindows()) return _buildEnvPrefixWindows();
|
||||
let parts = [];
|
||||
@@ -431,7 +472,8 @@ export function _buildEnvPrefix() {
|
||||
}
|
||||
let envVars = [];
|
||||
if (_envState.hfToken) envVars.push('export HF_TOKEN=' + _shellQuote(_envState.hfToken));
|
||||
if (_envState.gpus) envVars.push('export CUDA_VISIBLE_DEVICES=' + _shellQuote(_envState.gpus));
|
||||
const _envGpuVar = _gpuEnvVarName();
|
||||
if (_envState.gpus && _envGpuVar) envVars.push(`export ${_envGpuVar}=` + _shellQuote(_envState.gpus));
|
||||
if (envVars.length) parts.push(envVars.join(' && '));
|
||||
if (parts.length === 0) return '';
|
||||
return parts.join(' && ') + ' &&';
|
||||
@@ -447,7 +489,8 @@ function _buildEnvPrefixWindows() {
|
||||
parts.push('conda activate ' + _psQuote(_envState.envPath));
|
||||
}
|
||||
if (_envState.hfToken) parts.push('$env:HF_TOKEN=' + _psQuote(_envState.hfToken));
|
||||
if (_envState.gpus) parts.push('$env:CUDA_VISIBLE_DEVICES=' + _psQuote(_envState.gpus));
|
||||
const _winGpuVar = _gpuEnvVarName();
|
||||
if (_envState.gpus && _winGpuVar) parts.push(`$env:${_winGpuVar}=` + _psQuote(_envState.gpus));
|
||||
if (parts.length === 0) return '';
|
||||
return parts.join('; ') + ';';
|
||||
}
|
||||
@@ -468,10 +511,18 @@ export function _buildServeCmd(f, modelName, backend) {
|
||||
// the bare "auto" input that used to back gpu_id is gone, and the
|
||||
// button strip is the only source for which devices to pin.
|
||||
const gpuId = (f.gpus || f.gpu_id || '').toString().trim();
|
||||
if (gpuId) cmd += `CUDA_VISIBLE_DEVICES=${gpuId} `;
|
||||
cmd += _gpuEnvPrefix(gpuId);
|
||||
if (f.moe_env) {
|
||||
const _opts = _detectModelOptimizations(modelName);
|
||||
if (_opts.envVars.length) cmd += _opts.envVars.join(' ') + ' ';
|
||||
if (_opts.envVars.length) {
|
||||
cmd += _opts.envVars.join(' ') + ' ';
|
||||
} else {
|
||||
// Fallback when the user toggles MoE Env on for a model the
|
||||
// family detector didn't classify as MoE — emit the generic
|
||||
// vLLM MoE optimization env vars so the toggle is never a
|
||||
// silent no-op (was the case before the "always show" change).
|
||||
cmd += 'VLLM_USE_DEEP_GEMM=0 VLLM_USE_FLASHINFER_MOE_FP16=1 OMP_NUM_THREADS=4 ';
|
||||
}
|
||||
}
|
||||
// Pinned attention backend (Attention field). Empty = let vLLM pick.
|
||||
const _attn = (f.vllm_attn_backend ?? '').toString().trim();
|
||||
@@ -513,7 +564,7 @@ export function _buildServeCmd(f, modelName, backend) {
|
||||
// the bare "auto" input that used to back gpu_id is gone, and the
|
||||
// button strip is the only source for which devices to pin.
|
||||
const gpuId = (f.gpus || f.gpu_id || '').toString().trim();
|
||||
if (gpuId) cmd += `CUDA_VISIBLE_DEVICES=${gpuId} `;
|
||||
cmd += _gpuEnvPrefix(gpuId);
|
||||
const _extraEnv = (f.extra_env ?? '').toString().replace(/\s+/g, ' ').trim();
|
||||
if (_extraEnv) cmd += _extraEnv + ' ';
|
||||
cmd += `${_py3Bin} -m sglang.launch_server --model-path ${modelName} --host 0.0.0.0 --port ${f.port || '30000'}`;
|
||||
@@ -536,24 +587,39 @@ export function _buildServeCmd(f, modelName, backend) {
|
||||
// CPU-only serve (-ngl 0): drop the GPU-only flags, otherwise the command
|
||||
// mixes "zero GPU layers" with CUDA unified-memory + flash-attn and fails to
|
||||
// start (issue #1291). Only affects the ngl=0 path; GPU serving is unchanged.
|
||||
// The Inference mode pill (GPU/CPU) above gates this — when the user picks
|
||||
// CPU, force ngl=0 here so all downstream flag-suppression fires
|
||||
// consistently regardless of what the (now-hidden) ngl input shows.
|
||||
if (String(f.llama_mode || '').toLowerCase() === 'cpu') {
|
||||
f.ngl = '0';
|
||||
} else if (String(f.llama_mode || '').toLowerCase() === 'gpu' && (!f.ngl || String(f.ngl).trim() === '0')) {
|
||||
f.ngl = '99';
|
||||
}
|
||||
const _cpuOnly = String(f.ngl).trim() === '0';
|
||||
// GGML_CUDA_* env vars are no-ops on Vulkan/ROCm/Metal/CPU. Only emit
|
||||
// them when the detected backend is actually CUDA AND the hwfit scan
|
||||
// was run against the currently-targeted host, so a saved preset
|
||||
// from a prior NVIDIA target doesn't pollute a non-NVIDIA launch
|
||||
// with misleading prefixes.
|
||||
const _sb = String(_hwfitCache?.system?.backend || '').toLowerCase();
|
||||
const _hwfitHost = String(_hwfitCache?._scannedHost || '');
|
||||
const _curHost = String(_envState.remoteHost || '');
|
||||
const _isCudaTarget = (_sb === 'cuda') && (_hwfitHost === _curHost);
|
||||
const lcPrefix = (() => {
|
||||
let p = '';
|
||||
if (f.unified_mem && !_cpuOnly && !_isWindows()) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
|
||||
if (gpuId && !_isWindows()) p += `CUDA_VISIBLE_DEVICES=${gpuId} `;
|
||||
if (f.unified_mem && !_cpuOnly && !_isWindows() && _isCudaTarget) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
|
||||
// No GPU env var in CPU mode — `-ngl 0` already disables offload
|
||||
// so CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES would be misleading
|
||||
// clutter ("why is CUDA pinned for a CPU run?").
|
||||
if (!_isWindows() && !_cpuOnly) p += _gpuEnvPrefix(gpuId);
|
||||
return p;
|
||||
})();
|
||||
if (f.unified_mem && !_cpuOnly && _isWindows()) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
|
||||
if (gpuId && _isWindows()) cmd += `$env:CUDA_VISIBLE_DEVICES="${gpuId}"; `;
|
||||
if (!_isWindows()) {
|
||||
// Resolve GGUF path once, fail loudly if nothing matched (prevents
|
||||
// `--model ""` which causes confusing downstream errors).
|
||||
cmd += `MODEL_FILE=${ggufPath} && { [ -n "$MODEL_FILE" ] && [ -f "$MODEL_FILE" ]; } || { echo "ERROR: No GGUF found on this host. Either download the model here, or switch to the server where it's cached."; exit 1; } && `;
|
||||
}
|
||||
const modelArg = _isWindows() ? `"${ggufPath}"` : `"$MODEL_FILE"`;
|
||||
// Prefer the native llama-server binary on Linux — its minja templating
|
||||
// renders modern GGUF chat templates that the Python bindings' Jinja2
|
||||
// rejects (do_tojson ensure_ascii). Fall back to llama_cpp.server.
|
||||
if (f.unified_mem && !_cpuOnly && _isWindows() && _isCudaTarget) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
|
||||
if (_isWindows() && !_cpuOnly) cmd += _gpuEnvPrefix(gpuId, true);
|
||||
const modelArg = `"${ggufPath}"`;
|
||||
// Prefer native llama-server. The backend bootstrap resolves/builds the
|
||||
// right binary (Vulkan/HIP/CUDA/Metal/CPU), so keep the generated command
|
||||
// as a validator-safe binary + args with no shell chaining.
|
||||
// Don't suppress stderr — surface real errors (missing file, lib, OOM).
|
||||
// Optional perf/fit flags from a hardware profile (see services/hwfit/
|
||||
// profiles.py). n_cpu_moe offloads MoE expert layers to CPU when the model
|
||||
@@ -575,9 +641,16 @@ export function _buildServeCmd(f, modelName, backend) {
|
||||
_lcExtra += ` --n-cpu-moe ${_ncm}`;
|
||||
_lcpExtra += ` --n_cpu_moe ${_ncm}`; // llama-cpp-python uses underscores
|
||||
}
|
||||
// Flash-attn default = auto: native llama-server picks whether to
|
||||
// enable based on the build/model; explicit ON (the Flash-attn
|
||||
// toggle in the form) forces it. "auto" is a meaningful arg, not
|
||||
// omission — older builds without flash-attn ignore it cleanly,
|
||||
// newer ones get the speedup without the user having to know.
|
||||
if (f.flash_attn && !_cpuOnly) {
|
||||
_lcExtra += ' --flash-attn on';
|
||||
_lcpExtra += ' --flash_attn true';
|
||||
} else if (!_cpuOnly) {
|
||||
_lcExtra += ' --flash-attn auto';
|
||||
}
|
||||
if (_kv) {
|
||||
_lcExtra += ` --cache-type-k ${_kv} --cache-type-v ${_kv}`;
|
||||
@@ -613,12 +686,11 @@ export function _buildServeCmd(f, modelName, backend) {
|
||||
// llama-cpp-python takes the projector via --clip_model_path.
|
||||
_lcpExtra += ` --clip_model_path "${f._mmproj_path}"`;
|
||||
}
|
||||
const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}${_lcpExtra}`;
|
||||
if (_isWindows()) {
|
||||
const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}${_lcpExtra}`;
|
||||
cmd += _lcpServer;
|
||||
} else {
|
||||
cmd += `${lcPrefix}llama-server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} -ngl ${f.ngl || '99'} -c ${f.ctx || '8192'}${_lcExtra}`;
|
||||
cmd += ` || ${_lcpServer}`;
|
||||
}
|
||||
} else if (backend === 'ollama') {
|
||||
const ollamaPort = f.port || '11434';
|
||||
@@ -652,7 +724,7 @@ export function _buildServeCmd(f, modelName, backend) {
|
||||
}
|
||||
} else if (backend === 'diffusers') {
|
||||
const gpuStr = f.gpus?.trim();
|
||||
if (gpuStr) cmd += `CUDA_VISIBLE_DEVICES=${gpuStr} `;
|
||||
cmd += _gpuEnvPrefix(gpuStr);
|
||||
const diffusersPy = _isWindows() ? 'python' : _py3Bin;
|
||||
cmd += `${diffusersPy} scripts/diffusion_server.py --model ${modelName} --port ${f.port || '8100'}`;
|
||||
if (f.diff_dtype && f.diff_dtype !== 'bfloat16') cmd += ` --dtype ${f.diff_dtype}`;
|
||||
@@ -771,6 +843,14 @@ async function _fetchDependencies() {
|
||||
if (_depPort) _pkgParams.set('ssh_port', _depPort);
|
||||
if (_depVenv) _pkgParams.set('venv', _depVenv);
|
||||
}
|
||||
// Pass the detected backend so the server can build a single
|
||||
// OS+backend-aware install command per row (e.g. add nvidia-cuda-toolkit
|
||||
// on a CUDA-Debian box, vulkan-headers on a Vulkan-Arch box, etc.)
|
||||
// instead of dumping every distro's syntax as a hint.
|
||||
const _depBackend = String(_hwfitCache?.system?.backend || '').toLowerCase();
|
||||
if (_depBackend && _hwfitCache?._scannedHost === _depHost) {
|
||||
_pkgParams.set('backend', _depBackend);
|
||||
}
|
||||
const resp = await fetch('/api/cookbook/packages' + (_pkgParams.toString() ? '?' + _pkgParams.toString() : ''));
|
||||
const data = await resp.json();
|
||||
const pkgs = data.packages || [];
|
||||
@@ -832,18 +912,61 @@ async function _fetchDependencies() {
|
||||
// For backends with a recipe catalog (vllm / sglang / llama_cpp),
|
||||
// append a caret button that toggles a per-row recipe panel below.
|
||||
const hasRecipe = RECIPE_BACKENDS.has(pkg.name);
|
||||
const recipeCaret = hasRecipe
|
||||
? `<button class="cookbook-dep-tag cookbook-dep-recipe-caret" data-dep-recipe-toggle="${esc(pkg.name)}" title="Pick a model to see the exact install commands" aria-expanded="false" style="background:none;border:1px solid var(--border);padding:2px 6px;display:inline-flex;align-items:center;cursor:pointer;"><svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round" style="transition:transform 0.15s"><polyline points="6 9 12 15 18 9"/></svg></button>`
|
||||
: '';
|
||||
// Standalone recipe-caret button removed — the "Pick install
|
||||
// command" action lives inside the Installed ▾ dropdown menu
|
||||
// (see _showDepMenu) so each row only has ONE caret to click.
|
||||
// Kept the variable so downstream concat code stays the same.
|
||||
const recipeCaret = '';
|
||||
const recipePanel = hasRecipe ? _recipePanelHtml(pkg.name) : '';
|
||||
// When llama_cpp (or any future engine) reports build_deps_missing
|
||||
// from its system_prereqs probe, surface a one-tap install button
|
||||
// that fires the OS package manager on the target via
|
||||
// /api/cookbook/install-system-deps. Keeps the user inside Cookbook
|
||||
// instead of forcing them out to a shell to apt/pacman/dnf.
|
||||
const _bdm = Array.isArray(pkg.build_deps_missing) ? pkg.build_deps_missing : [];
|
||||
const _buildDepsBtn = _bdm.length
|
||||
? `<button type="button" class="cookbook-dep-tag cookbook-dep-install cookbook-dep-install-sysdeps" data-dep-sysdeps="${esc(_bdm.join(','))}" data-dep-target="${isLocal ? 'local' : 'remote'}" title="Install ${esc(_bdm.join(', '))} via the OS package manager on this target (requires passwordless sudo or root).">Install build deps</button>`
|
||||
: '';
|
||||
// Render the target-specific install command as a compact mono box
|
||||
// when the server resolved it (target's /etc/os-release was readable
|
||||
// AND the backend is known). The box doubles as the source of truth
|
||||
// for the "Install build deps" button's failure toast — both surfaces
|
||||
// show the same string for the same target.
|
||||
const _instCmd = (_bdm.length && pkg.install_cmd_for_target) ? String(pkg.install_cmd_for_target) : '';
|
||||
const _instCmdOs = pkg.install_cmd_os ? String(pkg.install_cmd_os) : '';
|
||||
const _instCmdBe = pkg.install_cmd_backend ? String(pkg.install_cmd_backend) : '';
|
||||
const _instLabel = (_instCmdOs && _instCmdBe) ? `${_instCmdOs} + ${_instCmdBe}` : (_instCmdOs || _instCmdBe || 'this target');
|
||||
const _instCmdBox = _instCmd
|
||||
? `<div class="cookbook-dep-install-cmd" data-dep-cmd="${esc(_instCmd)}" style="margin-top:6px;font-size:10.5px;opacity:0.85;">`
|
||||
+ `<div style="opacity:0.65;margin-bottom:2px;">Install on ${esc(_instLabel)}:</div>`
|
||||
+ `<div style="display:flex;gap:4px;align-items:stretch;">`
|
||||
+ `<code style="flex:1;padding:4px 6px;background:color-mix(in srgb, var(--fg) 6%, transparent);border:1px solid var(--border);border-radius:4px;font-family:var(--mono, ui-monospace, monospace);font-size:10.5px;white-space:pre-wrap;word-break:break-all;">${esc(_instCmd)}</code>`
|
||||
+ `<button type="button" class="cookbook-dep-cmd-copy" data-dep-cmd-copy="${esc(_instCmd)}" title="Copy install command" style="padding:2px 8px;font-size:10px;border:1px solid var(--border);border-radius:4px;background:none;cursor:pointer;color:var(--fg-muted);">Copy</button>`
|
||||
+ `</div></div>`
|
||||
: '';
|
||||
// Partial-state row (replaces the cryptic yellow "Partial ▾" tag).
|
||||
// Renders inline as a yellow banner with two clear actions: one-tap
|
||||
// Install (runs the reinstall in cookbook) or Copy command (paste
|
||||
// into a terminal). Same content surfaces whether the user solves
|
||||
// it from inside Cookbook or from a shell.
|
||||
const _gpuWheelCmd = 'CMAKE_ARGS="-DGGML_CUDA=on" python3 -m pip install --user --break-system-packages --force-reinstall --no-cache-dir "llama-cpp-python[server]" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124';
|
||||
const _gpuUpgradeBox = (pkg.partial && pkg.partial_action === 'reinstall_llama_cpp_cuda')
|
||||
? `<div class="cookbook-dep-gpu-upgrade" style="margin-top:6px;font-size:11px;display:flex;align-items:center;gap:6px;flex-wrap:wrap;background:color-mix(in srgb, var(--yellow, #f1fa8c) 14%, transparent);border:1px solid color-mix(in srgb, var(--yellow, #f1fa8c) 40%, var(--border));padding:6px 8px;border-radius:6px;">`
|
||||
+ `<span style="flex:1;min-width:160px;">Installed CPU-only — GPU detected on this target. Upgrade for ~10× faster inference.</span>`
|
||||
+ `<button type="button" class="cookbook-dep-tag cookbook-dep-install cookbook-dep-install-gpu-wheel" data-dep-target="${isLocal ? 'local' : 'remote'}" data-dep-gpu-cmd="${esc(_gpuWheelCmd)}" style="font-weight:600;">Install GPU wheel</button>`
|
||||
+ `<button type="button" class="cookbook-dep-tag cookbook-dep-cmd-copy" data-dep-cmd-copy="${esc(_gpuWheelCmd)}" title="Copy command to clipboard">Copy command</button>`
|
||||
+ `</div>`
|
||||
: '';
|
||||
return `<div class="cookbook-dep-row${winBlocked ? ' cookbook-dep-blocked' : ''}" data-pkg-name="${esc(pkg.name)}" data-dep-pip="${esc(pkg.pip || '')}" data-dep-target="${isLocal ? 'local' : 'remote'}" data-dep-kind="${esc(pkg.kind || 'python')}">`
|
||||
+ `<div class="cookbook-dep-info">`
|
||||
+ `<div class="memory-item-title">${_depGlyphHtml(pkg.name)}${esc(pkg.name)}</div>`
|
||||
+ `<div class="memory-item-meta" style="font-size:10px;opacity:0.5;margin-top:2px;">${esc(pkg.desc)}</div>`
|
||||
+ note
|
||||
+ updateNote
|
||||
+ _instCmdBox
|
||||
+ `</div>`
|
||||
+ _rebuildBtn
|
||||
+ _buildDepsBtn
|
||||
+ `<span class="cookbook-dep-tag cookbook-dep-cat">${esc(pkg.category)}</span>`
|
||||
+ _statusTag(pkg, isLocal, isSystemDep, winBlocked)
|
||||
+ recipeCaret
|
||||
@@ -985,8 +1108,15 @@ async function _fetchDependencies() {
|
||||
if (!res.ok || !data.ok) {
|
||||
// FastAPI HTTPException returns {detail: …}; the route's own
|
||||
// path returns {ok:false, error:…}. Surface whichever we get.
|
||||
// Long duration + an OK button — the default 1.2s toast was
|
||||
// disappearing before the user could read multi-clause errors
|
||||
// like "tmux missing on remote".
|
||||
const reason = data.detail || data.error || `HTTP ${res.status}`;
|
||||
uiModule.showToast('Install failed: ' + String(reason).slice(0, 200));
|
||||
uiModule.showToast('Install failed: ' + String(reason).slice(0, 400), {
|
||||
duration: 20000,
|
||||
action: 'OK',
|
||||
onAction: () => {},
|
||||
});
|
||||
return;
|
||||
}
|
||||
// _dep flags this as a pip dependency/driver install (not a servable
|
||||
@@ -996,12 +1126,16 @@ async function _fetchDependencies() {
|
||||
if (statusEl) { statusEl.textContent = upgrade ? 'Updating...' : 'Installing...'; statusEl.disabled = true; }
|
||||
uiModule.showToast(`${upgrade ? 'Updating' : 'Installing'} ${pkgName} on ${targetHost}...`);
|
||||
} catch (err) {
|
||||
uiModule.showToast('Install failed: ' + err.message);
|
||||
uiModule.showToast('Install failed: ' + err.message, {
|
||||
duration: 20000,
|
||||
action: 'OK',
|
||||
onAction: () => {},
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Wire install buttons (not-installed packages)
|
||||
list.querySelectorAll('.cookbook-dep-install:not(.cookbook-dep-recipe-run)').forEach(btn => {
|
||||
list.querySelectorAll('.cookbook-dep-install:not(.cookbook-dep-recipe-run):not(.cookbook-dep-install-sysdeps)').forEach(btn => {
|
||||
btn.addEventListener('click', async (e) => {
|
||||
e.stopPropagation();
|
||||
const pipName = btn.dataset.depPip;
|
||||
@@ -1010,6 +1144,143 @@ async function _fetchDependencies() {
|
||||
});
|
||||
});
|
||||
|
||||
// Wire "Install build deps" buttons — surfaced on rows whose
|
||||
// system_prereqs are missing (e.g. llama_cpp with no cmake on the
|
||||
// target). One-tap call to /api/cookbook/install-system-deps; the
|
||||
// route enforces a per-package allowlist and uses passwordless
|
||||
// sudo only, so it can never silently hang or stretch beyond the
|
||||
// build-toolchain set the catalog declares.
|
||||
// "Partial ▾" upgrade tag: clicking it fires the action-specific
|
||||
// install routine (currently only `reinstall_llama_cpp_cuda` —
|
||||
// forces pip install with the abetlen CUDA wheel index to add GPU
|
||||
// offload). Same install flow used at launch-time auto-fix, but
|
||||
// user-initiated here so they don't have to launch + wait + retry.
|
||||
list.querySelectorAll('.cookbook-dep-partial').forEach(btn => {
  // Click handler for the "Partial" upgrade tag on a dependency row.
  // Only the `reinstall_llama_cpp_cuda` action is handled; any other
  // action value is ignored.
  btn.addEventListener('click', async (e) => {
    e.stopPropagation();
    const action = btn.dataset.depPartialAction || '';
    if (action !== 'reinstall_llama_cpp_cuda') return;
    const isLocal = btn.dataset.depTarget === 'local';
    if (!isLocal) {
      // The Dependencies tab has its own server picker; apply it before
      // reading _envState so the request targets the selected host.
      const depsServerSel = document.getElementById('hwfit-deps-server');
      if (depsServerSel) _applyServerSelection(depsServerSel.value);
    }
    // A local row must never inherit a remote host left over in
    // _envState — otherwise the install runs remotely while the toast
    // claims "this server". Same gating as the system-deps installer.
    const remoteHost = isLocal ? '' : (_envState.remoteHost || '');
    const sshPort = isLocal ? undefined : (_getPort(_envState.remoteHost) || undefined);
    const targetLabel = isLocal ? 'this server' : (remoteHost || 'remote');
    const cmd = 'CMAKE_ARGS="-DGGML_CUDA=on" python3 -m pip install --user --break-system-packages --force-reinstall --no-cache-dir "llama-cpp-python[server]" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124';
    try {
      const reqBody = {
        repo_id: 'llama-cpp-python-cuda',
        cmd,
        // `undefined` fields are dropped by JSON.stringify.
        remote_host: remoteHost || undefined,
        ssh_port: sshPort,
        platform: _envState.platform || undefined,
      };
      const res = await fetch('/api/model/serve', {
        method: 'POST', credentials: 'same-origin',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(reqBody),
      });
      // A non-JSON body is treated as an empty object so the failure
      // branch can still report the HTTP status.
      const data = await res.json().catch(() => ({}));
      if (res.ok && data.ok) {
        const payload = { repo_id: 'pip llama-cpp-python[CUDA]', _cmd: cmd, remote_host: remoteHost, _dep: true };
        _addTask(data.session_id, 'pip llama-cpp-python[CUDA]', 'download', payload);
        uiModule.showToast(`Reinstalling llama-cpp-python with CUDA wheels on ${targetLabel} (~1-3 min)…`, 4000);
      } else {
        uiModule.showToast('Upgrade failed: ' + String(data.detail || data.error || `HTTP ${res.status}`).slice(0, 300), {
          duration: 20000, action: 'OK', onAction: () => {},
        });
      }
    } catch (err) {
      uiModule.showToast('Upgrade request failed: ' + err.message, { duration: 20000, action: 'OK', onAction: () => {} });
    }
  });
});
|
||||
|
||||
// Inline command-box "Copy" buttons — one per row that has a
// resolved per-target install command. Same string surfaces here
// and in the toast/diagnosis so the user always sees one answer.
list.querySelectorAll('.cookbook-dep-cmd-copy').forEach(btn => {
  // Capture the idle label once at wiring time. Reading it inside the
  // click handler would pick up the transient feedback text on a rapid
  // second click and leave the button stuck on it.
  const idleLabel = btn.textContent;
  let resetTimer = null;
  btn.addEventListener('click', async (e) => {
    e.stopPropagation();
    const cmd = btn.dataset.depCmdCopy || '';
    if (!cmd) return;
    // navigator.clipboard can be missing (insecure context) or reject
    // (permission denied); both land in the catch and must not be
    // reported to the user as a successful copy.
    let copied = true;
    try { await navigator.clipboard.writeText(cmd); }
    catch { copied = false; }
    btn.textContent = copied ? 'Copied' : 'Copy failed';
    // Restart the restore timer so overlapping clicks cannot race.
    clearTimeout(resetTimer);
    resetTimer = setTimeout(() => { if (btn.isConnected) btn.textContent = idleLabel; }, 1200);
  });
});
|
||||
list.querySelectorAll('.cookbook-dep-install-sysdeps').forEach(btn => {
  // Click handler for the one-tap system-deps install button. Posts the
  // comma-separated package names from `data-dep-sysdeps` to
  // /api/cookbook/install-system-deps and reports the outcome by toast.
  btn.addEventListener('click', async (e) => {
    e.stopPropagation();
    const names = (btn.dataset.depSysdeps || '').split(',').map(s => s.trim()).filter(Boolean);
    if (!names.length) return;
    const isLocal = btn.dataset.depTarget === 'local';
    // Pull the per-target install command from the sibling box on
    // the same row, so failure toasts surface the SAME line the
    // user already sees inline. No duplicated formatting logic.
    const _row = btn.closest('.cookbook-dep-row');
    const _cmdBox = _row?.querySelector('.cookbook-dep-install-cmd');
    const _resolvedCmd = _cmdBox?.dataset.depCmd || '';
    // Mirror _installDep: the Dependencies tab has its own server
    // picker that can override _envState. Apply it before reading
    // remoteHost, otherwise the install silently runs on the wrong
    // target (container ends up with the packages, the real remote
    // host stays broken, success toast misleads the user).
    if (!isLocal) {
      const depsServerSel = document.getElementById('hwfit-deps-server');
      if (depsServerSel) _applyServerSelection(depsServerSel.value);
    }
    const targetLabel = isLocal ? 'this server' : (_envState.remoteHost || 'remote');
    const origText = btn.textContent;
    btn.textContent = 'Installing…';
    btn.disabled = true;
    try {
      const body = { packages: names };
      if (!isLocal && _envState.remoteHost) {
        body.remote_host = _envState.remoteHost;
        const _p = _getPort(_envState.remoteHost);
        if (_p) body.ssh_port = _p;
      }
      const res = await fetch('/api/cookbook/install-system-deps', {
        method: 'POST', credentials: 'same-origin',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(body),
      });
      // A non-JSON body is treated as an empty object so the failure
      // branch can still report the HTTP status.
      const data = await res.json().catch(() => ({}));
      if (res.ok && data.ok) {
        uiModule.showToast(`Installed ${names.join(', ')} on ${targetLabel}. Refreshing…`, 4000);
        // Refresh the deps panel so the row updates (prereqs now present).
        // Best-effort: a failed refresh must not turn a successful
        // install into an error, but it should not vanish silently.
        try { await _fetchDependencies(); }
        catch (refreshErr) { console.warn('Dependency refresh after install failed:', refreshErr); }
        return;
      }
      const reason = data.error || data.detail || `HTTP ${res.status}`;
      // Append the per-target install command (if we already know it
      // from the row) so the user can copy-paste it without leaving
      // the toast. Otherwise just surface the error.
      const _suffix = _resolvedCmd ? `\n\nRun on ${targetLabel}: ${_resolvedCmd}` : '';
      uiModule.showToast('Build-deps install failed: ' + String(reason).slice(0, 300) + _suffix, {
        duration: 25000,
        action: _resolvedCmd ? 'Copy command' : 'OK',
        onAction: async () => {
          if (_resolvedCmd) {
            try { await navigator.clipboard.writeText(_resolvedCmd); } catch {}
          }
        },
      });
    } catch (err) {
      uiModule.showToast('Install request failed: ' + err.message, {
        duration: 20000, action: 'OK', onAction: () => {},
      });
    } finally {
      // Always hand the button back. Previously the success path left it
      // disabled on "Installing…", which stuck whenever the refresh
      // failed or did not replace this row.
      btn.textContent = origText;
      btn.disabled = false;
    }
  });
});
|
||||
|
||||
// ── Recipe panel wiring (per-backend dropdown with model + commands) ──
|
||||
// Caret toggle: shows/hides the panel directly below the backend row.
|
||||
list.querySelectorAll('[data-dep-recipe-toggle]').forEach(btn => {
|
||||
@@ -1577,8 +1848,22 @@ function _wireTabEvents(body) {
|
||||
if (dlBtn && dlInput) {
|
||||
function _stripHfUrl(input) {
|
||||
let repo = input.trim();
|
||||
// Strip a leading `hf download` / `hf-cli download` / `huggingface-cli
|
||||
// download` wrapper so a paste from CLI docs Just Works. Drop the
|
||||
// command prefix; the rest is parsed by the existing strippers.
|
||||
repo = repo.replace(/^(?:huggingface-cli|hf-cli|hf)\s+(?:download|d)\s+/i, '');
|
||||
// Strip the `hf://` (and `huggingface://`) scheme — the HF CLI
|
||||
// accepts it as an alias and users naturally copy it. Same effect
|
||||
// as the bare `org/repo[/file.gguf]` form after the strip.
|
||||
repo = repo.replace(/^(?:hf|huggingface):\/\//i, '');
|
||||
// Strip Ollama-style "hf.co/" prefix if present (e.g. hf.co/unsloth/...:tag)
|
||||
repo = repo.replace(/^hf\.co\//, '');
|
||||
// Full HF blob/resolve URL → turn into `org/repo/path/to/file` so
|
||||
// the downstream `_splitRepoFile` can pick the file out.
|
||||
// Matches: https://huggingface.co/org/repo/blob/branch/path/to/file.gguf
|
||||
// https://huggingface.co/org/repo/resolve/branch/path/to/file.gguf
|
||||
const hfBlob = repo.match(/^https?:\/\/huggingface\.co\/([^/]+\/[^/?#]+)\/(?:blob|resolve)\/[^/?#]+\/([^?#]+)/);
|
||||
if (hfBlob) return `${hfBlob[1]}/${hfBlob[2]}`;
|
||||
const hfMatch = repo.match(/^https?:\/\/huggingface\.co\/([^/]+\/[^/?#]+(?::[^/?#\s]+)?)/);
|
||||
if (hfMatch) repo = hfMatch[1];
|
||||
return repo;
|
||||
@@ -1590,6 +1875,22 @@ function _wireTabEvents(body) {
|
||||
if (!m) return { repo: raw, include: null };
|
||||
return { repo: m[1], include: `*${m[2]}*` };
|
||||
}
|
||||
// Split `org/repo/path/to/file.gguf` (or `.safetensors`/`.bin`/…) into
// repo + exact file include. Lets the user paste a path straight out
// of a HuggingFace "Files and versions" page or a copied filename
// without needing to peel the repo/file apart by hand.
//
// @param {string} raw  Input already stripped of URL/CLI wrappers.
// @returns {{repo: string, include: string} | null}
//   `repo` is the first two segments (`org/repo`); `include` is the
//   remaining repo-relative path with any `?query` suffix removed.
//   Returns null when the input doesn't look like a deep file path.
function _splitRepoFile(raw) {
  // Must have at least 3 slash-separated segments AND end in a
  // model-file extension to avoid eating Ollama tags or repo-only
  // inputs like `org/repo`.
  const parts = raw.split('/');
  if (parts.length < 3) return null;
  const fname = parts[parts.length - 1];
  if (!/\.(gguf|safetensors|bin|pt|pth|onnx|mlx)(\?[^?]*)?$/i.test(fname)) return null;
  const repo = parts.slice(0, 2).join('/');
  // Keep every segment after `org/repo`, not just the basename: a file
  // in a subfolder (e.g. `Q4_K_M/model-00001-of-00002.gguf`) is only an
  // exact include when its directory is part of the pattern.
  // NOTE(review): assumes the backend matches `include` against the
  // repo-relative path (as HF allow-patterns do) — confirm.
  const relPath = parts.slice(2).join('/').replace(/\?.*$/, '');
  return { repo, include: relPath };
}
|
||||
// Ollama-library name. Matches `qwen2.5:14b`, `llama3:latest`, and the
|
||||
// (rare) `library/<name>:<tag>` form which we normalize by stripping the
|
||||
// namespace. The backend's _is_ollama_download check expects the same
|
||||
@@ -1605,7 +1906,14 @@ function _wireTabEvents(body) {
|
||||
const rawRepo = _stripHfUrl(dlInput.value);
|
||||
if (!rawRepo) return;
|
||||
const ollamaName = _ollamaName(rawRepo);
|
||||
const { repo, include: autoInclude } = ollamaName ? { repo: ollamaName, include: null } : _splitRepoTag(rawRepo);
|
||||
// Prefer the deep-file split (org/repo/file.gguf → repo + exact
|
||||
// include) over the tag split (org/repo:tag → glob include), and
|
||||
// both over the plain repo case. Ollama names still take priority
|
||||
// since they go through a different backend.
|
||||
const _fileSplit = !ollamaName ? _splitRepoFile(rawRepo) : null;
|
||||
const { repo, include: autoInclude } = ollamaName
|
||||
? { repo: ollamaName, include: null }
|
||||
: (_fileSplit || _splitRepoTag(rawRepo));
|
||||
// HuggingFace repo IDs must be `org/model`. A bare model name would 404
|
||||
// at snapshot_download time with a raw traceback, so reject it up front.
|
||||
// Ollama names (single-segment with a tag) skip this check — they go
|
||||
|
||||
+303
-37
@@ -10,6 +10,7 @@ import { providerLogo } from './providers.js';
|
||||
import { modelColor } from './chatRenderer.js';
|
||||
import { bindMenuDismiss, dismissOrRemove } from './escMenuStack.js';
|
||||
import { openCookbookDependencies } from './cookbook-diagnosis.js';
|
||||
import { _hwfitCache } from './cookbook-hwfit.js';
|
||||
|
||||
// Shared state/functions injected by init()
|
||||
let _envState;
|
||||
@@ -495,6 +496,7 @@ function _rerenderCachedModels() {
|
||||
item.classList.remove('doclib-card-expanded');
|
||||
item.style.flexDirection = '';
|
||||
item.style.alignItems = '';
|
||||
item.style.maxHeight = '';
|
||||
list.style.minHeight = '';
|
||||
list.style.maxHeight = '';
|
||||
return;
|
||||
@@ -508,6 +510,7 @@ function _rerenderCachedModels() {
|
||||
c.classList.remove('doclib-card-expanded');
|
||||
c.style.flexDirection = '';
|
||||
c.style.alignItems = '';
|
||||
c.style.maxHeight = '';
|
||||
});
|
||||
|
||||
const shortName = repo.split('/').pop();
|
||||
@@ -620,13 +623,31 @@ function _rerenderCachedModels() {
|
||||
// stays as the source-of-truth so every existing change handler
|
||||
// (updateBackendVisibility, runtime readiness, command builder)
|
||||
// still fires via dispatchEvent('change') on selection.
|
||||
panelHtml += `<label>${_l('Backend','Inference engine: vLLM, SGLang, llama.cpp, Ollama, or Diffusers')}<div class="hwfit-backend-picker" data-backend-picker style="position:relative;width:100%;"><select class="hwfit-sf hwfit-backend-source" data-field="backend" style="display:none;">${backendOpts}</select><button type="button" class="hwfit-backend-btn" data-backend-btn aria-haspopup="listbox" aria-expanded="false" style="display:flex;align-items:center;gap:6px;width:100%;height:28px;padding:0 8px;background:var(--bg);color:var(--fg);border:1px solid var(--border);border-radius:4px;font:inherit;font-size:11px;cursor:pointer;text-align:left;"><span class="hwfit-backend-btn-icon" data-backend-icon-slot aria-hidden="true" style="display:inline-flex;align-items:center;justify-content:center;width:16px;height:16px;color:var(--accent, var(--red));flex-shrink:0;"></span><span class="hwfit-backend-btn-label" data-backend-label style="flex:1;min-width:0;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;"></span><svg width="10" height="10" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true" style="opacity:0.6;flex-shrink:0;"><polyline points="6 9 12 15 18 9"/></svg></button><div class="hwfit-backend-menu" data-backend-menu role="listbox" hidden style="position:absolute;top:calc(100% + 4px);left:0;right:0;z-index:100;background:var(--panel, var(--bg));border:1px solid var(--border);border-radius:6px;box-shadow:0 6px 20px rgba(0,0,0,0.22);padding:4px;"></div></div></label>`;
|
||||
panelHtml += `<label>${_l('Backend','Inference engine: vLLM, SGLang, llama.cpp, Ollama, or Diffusers')}<div class="hwfit-backend-picker" data-backend-picker style="position:relative;width:100%;"><select class="hwfit-sf hwfit-backend-source" data-field="backend" style="display:none;">${backendOpts}</select><button type="button" class="hwfit-backend-btn" data-backend-btn aria-haspopup="listbox" aria-expanded="false" style="display:flex;align-items:center;gap:6px;width:100%;height:28px;padding:0 8px;background:var(--bg);color:var(--fg);border:1px solid var(--border);border-radius:4px;font:inherit;font-size:11px;cursor:pointer;text-align:left;position:relative;top:-3px;"><span class="hwfit-backend-btn-icon" data-backend-icon-slot aria-hidden="true" style="display:inline-flex;align-items:center;justify-content:center;width:16px;height:16px;color:var(--accent, var(--red));flex-shrink:0;"></span><span class="hwfit-backend-btn-label" data-backend-label style="flex:1;min-width:0;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;"></span><svg width="10" height="10" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true" style="opacity:0.6;flex-shrink:0;"><polyline points="6 9 12 15 18 9"/></svg></button><div class="hwfit-backend-menu" data-backend-menu role="listbox" hidden style="position:absolute;top:calc(100% + 4px);left:0;right:0;z-index:100;background:var(--panel, var(--bg));border:1px solid var(--border);border-radius:6px;box-shadow:0 6px 20px rgba(0,0,0,0.22);padding:4px;"></div></div></label>`;
|
||||
panelHtml += `<input type="hidden" class="hwfit-sf" data-field="host" value="${esc(_es.remoteHost || '')}" />`;
|
||||
// Inference mode pill (llama.cpp only) — lives directly to the
|
||||
// RIGHT of Backend in Row 1 so the engine and the GPU/CPU choice
|
||||
// are read together. .hwfit-backend-llamacpp visibility class
|
||||
// hides it when the user switches to vLLM/SGLang/Ollama.
|
||||
{
|
||||
// CPU used to be the unconditional default — it works on every host
// without GPU/wheel matching hassle and avoids "click Launch → silent
// CPU fallback because the wheel is CPU-only" surprises that ate hours
// of debugging. It is now only the fallback: the default is chosen
// from the hwfit scan below.
|
||||
// Layout: CPU on left, GPU on right → mode-right triggers when
|
||||
// GPU is selected so the sliding pill animates rightward.
|
||||
// Default to GPU mode when hwfit detected a GPU backend on the
|
||||
// current target — CPU as a global default sent the user down a
|
||||
// 35GB-model-on-CPU rabbit hole (-ngl 0, no flash-attn, no GPU
|
||||
// offload). Falls back to CPU only when hwfit detected no GPU
|
||||
// (cpu_x86 / generic / unscanned) or the cache is stale.
|
||||
const _hwBackend = String(_hwfitCache?.system?.backend || '').toLowerCase();
|
||||
const _hwScanMatch = String(_hwfitCache?._scannedHost || '') === String(_envState.remoteHost || '');
|
||||
const _llamaModeDefault = (_hwScanMatch && ['cuda', 'rocm', 'vulkan', 'metal', 'mps', 'apple'].includes(_hwBackend)) ? 'gpu' : 'cpu';
|
||||
const _llamaMode = sv('llama_mode', _llamaModeDefault);
|
||||
panelHtml += `<label class="hwfit-backend-llamacpp">${_l('Inference','CPU (default) = -ngl 0 (works anywhere). GPU = -ngl 99 (offload all layers; needs CUDA/ROCm/Vulkan-built llama-cpp).')}<div class="mode-toggle${_llamaMode === 'gpu' ? ' mode-right' : ''}" data-llama-mode-toggle style="display:flex;width:100%;height:30px;position:relative;top:2px;"><button type="button" class="mode-toggle-btn${_llamaMode === 'cpu' ? ' active' : ''}" data-llama-mode="cpu" aria-pressed="${_llamaMode === 'cpu'}" style="flex:1;"><span style="position:relative;top:-5px;">CPU</span></button><button type="button" class="mode-toggle-btn${_llamaMode === 'gpu' ? ' active' : ''}" data-llama-mode="gpu" aria-pressed="${_llamaMode === 'gpu'}" style="flex:1;"><span style="position:relative;top:-5px;">GPU</span></button></div><input type="hidden" class="hwfit-sf" data-field="llama_mode" value="${esc(_llamaMode)}" /></label>`;
|
||||
}
|
||||
panelHtml += `<label>${_l('venv','Path to Python venv or conda env activate script')}<input type="text" class="hwfit-sf hwfit-sf-wide" data-field="venv" value="${esc(sv('venv', _es.envPath || _srvVenv || ''))}" placeholder="~/venv" /></label>`;
|
||||
// Dtype lives in Row 1 (next to venv) — it's the first knob people
|
||||
// change when matching the model to the box, so it earns top-row
|
||||
// real estate over Row 2's launch-tuning controls.
|
||||
panelHtml += `<label>${_l('Dtype','Data type for weights. auto picks best for GPU')}<select class="hwfit-sf" data-field="dtype">${dtypeOpts}</select></label>`;
|
||||
const defaultPort = defaultBackend === 'ollama' ? '11434' : _nextAvailablePort();
|
||||
panelHtml += `<label>${_l('Port','HTTP port for the API server')}<input type="text" class="hwfit-sf" data-field="port" value="${esc(sv('port', defaultPort))}" /></label>`;
|
||||
const _activeGpus = (defaultGpus || '').split(',').map(s => s.trim()).filter(Boolean);
|
||||
@@ -642,7 +663,7 @@ function _rerenderCachedModels() {
|
||||
// separates the GPU chiclets from the GPU Mem field that follows
|
||||
// (asked-for breathing room; 4px on either side felt cramped on
|
||||
// the GPU-Mem boundary).
|
||||
const _gpusLabelHtml = `<label class="hwfit-gpus-label" style="margin:0 8px 0 4px;">${_l('GPUs','Toggle which GPUs to use')}<div class="cookbook-gpu-group">${_gpuBtnsHtml}</div><input type="hidden" class="hwfit-sf" data-field="gpus" value="${esc(defaultGpus)}" /></label>`;
|
||||
const _gpusLabelHtml = `<label class="hwfit-gpus-label cookbook-llama-gpu-only" style="margin:0 8px 0 4px;">${_l('GPUs','Toggle which GPUs to use')}<div class="cookbook-gpu-group">${_gpuBtnsHtml}</div><input type="hidden" class="hwfit-sf" data-field="gpus" value="${esc(defaultGpus)}" /></label>`;
|
||||
// Save / saved-configs split button — sits at the right end of Row 1.
|
||||
panelHtml += _slotsHtml;
|
||||
panelHtml += `</div>`;
|
||||
@@ -664,10 +685,12 @@ function _rerenderCachedModels() {
|
||||
// (Swap, KV Cache, Attention backend, Env vars, llama.cpp batch/ubatch)
|
||||
// moved to the Advanced fold below to keep this row scannable.
|
||||
panelHtml += `<div class="hwfit-serve-row hwfit-serve-row-core hwfit-backend-vllm hwfit-backend-sglang hwfit-backend-llamacpp hwfit-backend-ollama">`;
|
||||
// Order: TP → Context → Max Seqs → GPUs → GPU Mem.
|
||||
// Dtype moved up to Row 1. GPUs moved here next to GPU Mem so the
|
||||
// "which devices + how much of them" decisions sit adjacent. Max
|
||||
// Seqs follows Context per the "request-shape" cluster.
|
||||
// Order: Dtype → TP → Context → Max Seqs → GPUs → GPU Mem.
|
||||
// Dtype moved down from Row 1 to make space for the Inference pill
|
||||
// (llama.cpp GPU/CPU toggle, llamacpp-only). GPUs lives next to
|
||||
// GPU Mem so "which devices + how much" sit adjacent. Max Seqs
|
||||
// follows Context per the "request-shape" cluster.
|
||||
panelHtml += `<label>${_l('Dtype','Data type for weights. auto picks best for GPU')}<select class="hwfit-sf" data-field="dtype">${dtypeOpts}</select></label>`;
|
||||
panelHtml += `<label class="hwfit-backend-vllm hwfit-backend-sglang">${_l('TP','Tensor Parallelism — split model across N GPUs')}<select class="hwfit-sf" data-field="tp">${tpOpts}</select></label>`;
|
||||
// ctx resets to the model's max on every panel open (the real ctx slider
|
||||
// lives in the Scan/Download toolbar — see cookbook.js .hwfit-ctx-control).
|
||||
@@ -711,12 +734,6 @@ function _rerenderCachedModels() {
|
||||
// container — left an empty trailing column gap on wide modals).
|
||||
panelHtml += `<label class="hwfit-backend-vllm hwfit-backend-sglang" style="grid-column:1 / -1;">${_l('Env','Extra KEY=VALUE env-var pairs prepended to the launch (space-separated). Example: CUDACXX=$VIRTUAL_ENV/lib/python3.10/site-packages/nvidia/cuda_nvcc/bin/nvcc — points flashinfer at the venv-bundled nvcc when the system one is too old for your GPU.')}<input type="text" class="hwfit-sf" data-field="extra_env" value="${esc(sv('extra_env',''))}" placeholder="CUDACXX=/path/to/nvcc NCCL_P2P_DISABLE=1" style="width:100%;" /></label>`;
|
||||
panelHtml += `</div>`;
|
||||
// Advanced llama.cpp row (Batch / UBatch — moved out of Core for the
|
||||
// same "rarely touched" reason as the vLLM extras above).
|
||||
panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
|
||||
panelHtml += `<label class="hwfit-backend-llamacpp">${_l('Batch','llama.cpp prompt batch size. Leave blank for llama.cpp default.')}<input type="text" class="hwfit-sf" data-field="llama_batch_size" value="${esc(sv('llama_batch_size', ''))}" placeholder="2048" /></label>`;
|
||||
panelHtml += `<label class="hwfit-backend-llamacpp">${_l('UBatch','llama.cpp physical micro-batch size. Leave blank for llama.cpp default.')}<input type="text" class="hwfit-sf" data-field="llama_ubatch_size" value="${esc(sv('llama_ubatch_size', ''))}" placeholder="512" /></label>`;
|
||||
panelHtml += `</div>`;
|
||||
// Row 2b: Diffusers settings
|
||||
const diffDtypeOpts = ['bfloat16','float16','float32'].map(d => `<option value="${d}"${sv('diff_dtype','bfloat16')===d?' selected':''}>${d}</option>`).join('');
|
||||
const deviceMapOpts = ['balanced','auto','sequential'].map(d => `<option value="${d}"${sv('diff_device_map','balanced')===d?' selected':''}>${d}</option>`).join('');
|
||||
@@ -740,13 +757,19 @@ function _rerenderCachedModels() {
|
||||
panelHtml += `<div class="hwfit-serve-checks hwfit-backend-vllm hwfit-backend-sglang">`;
|
||||
panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="trust_remote"${sv('trust_remote',false)?' checked':''} /> Trust Remote Code${_h('Allow model to run custom code from HuggingFace')}</label>`;
|
||||
panelHtml += `<label class="hwfit-sf-cb hwfit-backend-vllm"><input type="checkbox" class="hwfit-sf" data-field="auto_tool"${sv('auto_tool',false)?' checked':''} /> Auto Tool Choice${_h('Enable function/tool calling for agent mode')}</label>`;
|
||||
if (_rp_name) panelHtml += `<label class="hwfit-sf-cb hwfit-backend-vllm"><input type="checkbox" class="hwfit-sf" data-field="reasoning_parser" data-parser="${_rp_name}" /> Reasoning Parser <span class="hwfit-parser-tag">${_rp_name}</span></label>`;
|
||||
// Always-render the Reasoning Parser, Expert Parallel, and MoE Env
|
||||
// checkboxes — the model-family detection above is a hint, not a
|
||||
// hard gate. User asked to keep these visible regardless so that
|
||||
// a borderline-undetected MoE/reasoning model can still toggle
|
||||
// them without dropping back to the raw command box.
|
||||
panelHtml += `<label class="hwfit-sf-cb hwfit-backend-vllm"><input type="checkbox" class="hwfit-sf" data-field="reasoning_parser" data-parser="${_rp_name || ''}" /> Reasoning Parser${_rp_name ? ` <span class="hwfit-parser-tag">${_rp_name}</span>` : ''}${_h('Splits <think> tokens into a separate channel. The tag (when shown) is the auto-detected parser; edit the command if you need a different one.')}</label>`;
|
||||
panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="enforce_eager"${sv('enforce_eager',false)?' checked':''} /> Enforce Eager${_h('Disable CUDA graphs. Slower but uses less memory')}</label>`;
|
||||
panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="prefix_cache"${sv('prefix_cache',false)?' checked':''} /> Prefix Caching${_h('Cache shared prompt prefixes across requests')}</label>`;
|
||||
// Inline the previously-second vLLM checks row so Expert Parallel /
|
||||
// Speculative / MoE Env sit next to Prefix Caching with no gap. All
|
||||
// three are vLLM-only — class-gated so they hide on SGLang.
|
||||
if (_opts2_row3.flags.includes('--enable-expert-parallel')) panelHtml += `<label class="hwfit-sf-cb hwfit-backend-vllm"><input type="checkbox" class="hwfit-sf" data-field="expert_parallel" /> Expert Parallel</label>`;
|
||||
// three are vLLM-only — class-gated so they hide on SGLang. Always
|
||||
// render so the user can flip them on for any MoE model.
|
||||
panelHtml += `<label class="hwfit-sf-cb hwfit-backend-vllm"><input type="checkbox" class="hwfit-sf" data-field="expert_parallel" /> Expert Parallel${_h('MoE: shard expert layers across GPUs. Helps for MiniMax M-series, Qwen3 A3B/A10B/A22B MoE, DeepSeek V3+/R1. Ignored / wasteful on dense models.')}</label>`;
|
||||
{
|
||||
const _specDef = _opts2_row3.spec || { method: 'mtp', tokens: 3 };
|
||||
const _specMethod = sv('spec_method', _specDef.method);
|
||||
@@ -757,27 +780,39 @@ function _rerenderCachedModels() {
|
||||
`<option value="${m}"${m === _specMethod ? ' selected' : ''}>${m}</option>`).join('');
|
||||
panelHtml += `<label class="hwfit-sf-cb hwfit-backend-vllm hwfit-spec-group"><input type="checkbox" class="hwfit-sf" data-field="speculative" /> Speculative <select class="hwfit-sf hwfit-spec-method" data-field="spec_method" title="vLLM --speculative-config method">${_specOpts}</select><input type="number" class="hwfit-sf hwfit-spec-tokens hwfit-spec-tokens-bare" data-field="spec_tokens" value="${esc(_specTokens)}" min="1" max="10" title="num_speculative_tokens" style="width:44px;" /><span class="hwfit-help-chip hwfit-help-chip-inline" title="MTP / speculative decoding is supported on a few model families only — turn it on when the model card explicitly recommends it. On supported models it can boost inference throughput up to ~3×; on unsupported models it will either be ignored or fail to launch." style="margin-left:6px;">?</span></label>`;
|
||||
}
|
||||
if (_opts2_row3.envVars.length) panelHtml += `<label class="hwfit-sf-cb hwfit-backend-vllm"><input type="checkbox" class="hwfit-sf" data-field="moe_env" /> MoE Env Vars</label>`;
|
||||
// Always-render MoE Env Vars — the env vars dict is empty for
|
||||
// most dense models (toggle is a no-op then), but for MoE families
|
||||
// the user can still flip it on without re-fitting model detection.
|
||||
panelHtml += `<label class="hwfit-sf-cb hwfit-backend-vllm"><input type="checkbox" class="hwfit-sf" data-field="moe_env" /> MoE Env Vars${_h('Adds MoE-specific env vars to the launch command: VLLM_USE_DEEP_GEMM=0, VLLM_USE_FLASHINFER_MOE_FP16=1, OMP_NUM_THREADS=4. Helpful on MoE models like Qwen3 A3B/A10B, MiniMax, DeepSeek V3+; ignored on dense models.')}</label>`;
|
||||
panelHtml += `</div>`;
|
||||
// Row 2c: llama.cpp fit/perf flags (set by Auto profiles, editable by hand)
|
||||
// ── llama.cpp Advanced — grouped by purpose ──
|
||||
// Three clean field rows + one checkbox row, all selects/inputs the
|
||||
// same 28px height (no per-field `top:-Npx` nudges). Groups follow
|
||||
// user mental model: (1) where it runs on GPU, (2) how memory is
|
||||
// shaped, (3) how requests are batched, (4) on/off toggles.
|
||||
const _kvOpts = ['', 'q4_0', 'q8_0', 'f16'].map(k => `<option value="${k}"${sv('cache_type','')===k?' selected':''}>${k||'default'}</option>`).join('');
|
||||
const llamaFitOpts = ['', 'off', 'on'].map(d => `<option value="${d}"${sv('llama_fit','')===d?' selected':''}>${d||'default'}</option>`).join('');
|
||||
const llamaSplitModeOpts = ['', 'layer', 'tensor', 'row', 'none'].map(d => `<option value="${d}"${sv('llama_split_mode','')===d?' selected':''}>${d||'default'}</option>`).join('');
|
||||
|
||||
// Group 1 — GPU placement (GPU-only, hides in CPU mode)
|
||||
panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp cookbook-llama-gpu-only">`;
|
||||
panelHtml += `<label>${_l('Split Mode','llama.cpp GPU placement. layer = default; tensor splits weights and KV across GPUs.')}<select class="hwfit-sf" data-field="llama_split_mode">${llamaSplitModeOpts}</select></label>`;
|
||||
panelHtml += `<label>${_l('Tensor Split','GPU proportions, e.g. 50,50 across two GPUs. Blank = auto.')}<input type="text" class="hwfit-sf" data-field="llama_tensor_split" value="${esc(sv('llama_tensor_split', ''))}" placeholder="auto" /></label>`;
|
||||
panelHtml += `<label>${_l('Main GPU','--main-gpu index inside the visible GPU set. Useful for split mode none/row.')}<input type="text" class="hwfit-sf" data-field="llama_main_gpu" value="${esc(sv('llama_main_gpu', ''))}" placeholder="auto" /></label>`;
|
||||
panelHtml += `</div>`;
|
||||
|
||||
// Group 2 — Memory tuning (KV cache + MoE-on-CPU + Fit policy)
|
||||
panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
|
||||
panelHtml += `<label>${_l('CPU MoE','n-cpu-moe: number of MoE expert layers to run on CPU when the model is bigger than VRAM. 0 = all on GPU. Set automatically by the Auto profiles below.')}<input type="text" class="hwfit-sf" data-field="n_cpu_moe" value="${esc(sv('n_cpu_moe',''))}" placeholder="0" style="width:54px;position:relative;top:-8px;" /></label>`;
|
||||
panelHtml += `<label>${_l('KV Cache','cache-type-k/v: quantize the KV cache. q4_0 = smallest (more context), q8_0 = sharp long-context, f16 = full. Blank = llama.cpp default.')}<select class="hwfit-sf" data-field="cache_type">${_kvOpts}</select></label>`;
|
||||
panelHtml += `<label class="hwfit-sf-cb" style="align-self:end;"><input type="checkbox" class="hwfit-sf" data-field="flash_attn"${sv('flash_attn',false)?' checked':''} /> Flash Attn${_h('--flash-attn on: faster attention + needed for quantized KV cache.')}</label>`;
|
||||
panelHtml += `<label class="hwfit-sf-cb" style="align-self:end;"><input type="checkbox" class="hwfit-sf" data-field="vision"${sv('vision',false)?' checked':''} /> Vision${_h('Serve with the vision encoder so the model can read images. Auto-finds an mmproj-*.gguf next to the model (download one into the model folder). Adds ~1 GB VRAM + a small per-image cost.')}</label>`;
|
||||
panelHtml += `<label>${_l('KV Cache','cache-type-k/v: quantize the KV cache. q4_0 = smallest (more context), q8_0 = long-context, f16 = full.')}<select class="hwfit-sf" data-field="cache_type">${_kvOpts}</select></label>`;
|
||||
panelHtml += `<label class="cookbook-llama-gpu-only">${_l('CPU MoE','n-cpu-moe: number of MoE expert layers to run on CPU when the model is bigger than VRAM. 0 = all on GPU.')}<input type="text" class="hwfit-sf" data-field="n_cpu_moe" value="${esc(sv('n_cpu_moe',''))}" placeholder="0" /></label>`;
|
||||
panelHtml += `<label>${_l('Fit','llama.cpp --fit. Leave default unless you need explicit off/on behavior for a preset.')}<select class="hwfit-sf" data-field="llama_fit">${llamaFitOpts}</select></label>`;
|
||||
panelHtml += `</div>`;
|
||||
// Row 2d: native llama-server placement/runtime controls. These are
|
||||
// explicit overrides for known-good advanced presets; blank keeps
|
||||
// llama.cpp/profile defaults.
|
||||
|
||||
// Group 3 — Request batching (Batch / UBatch / Parallel)
|
||||
panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
|
||||
panelHtml += `<label>${_l('Split Mode','llama.cpp GPU placement. layer is the usual default; tensor splits weights and KV across GPUs.')}<select class="hwfit-sf" data-field="llama_split_mode" style="position:relative;top:-8px;">${llamaSplitModeOpts}</select></label>`;
|
||||
panelHtml += `<label>${_l('Tensor Split','GPU proportions for llama.cpp, e.g. 50,50 across two visible GPUs. Leave blank for auto.')}<input type="text" class="hwfit-sf" data-field="llama_tensor_split" value="${esc(sv('llama_tensor_split', ''))}" placeholder="50,50" /></label>`;
|
||||
panelHtml += `<label>${_l('Main GPU','llama.cpp --main-gpu index inside the visible GPU set. Mostly useful for split mode none/row.')}<input type="text" class="hwfit-sf" data-field="llama_main_gpu" value="${esc(sv('llama_main_gpu', ''))}" placeholder="auto" /></label>`;
|
||||
panelHtml += `<label>${_l('Parallel','llama.cpp parallel slots. Leave blank for llama.cpp default; 1 matches single-lane presets.')}<input type="text" class="hwfit-sf" data-field="llama_parallel" value="${esc(sv('llama_parallel', ''))}" placeholder="1" /></label>`;
|
||||
panelHtml += `<label>${_l('Batch','llama.cpp prompt batch size. Blank = default.')}<input type="text" class="hwfit-sf" data-field="llama_batch_size" value="${esc(sv('llama_batch_size', ''))}" placeholder="2048" /></label>`;
|
||||
panelHtml += `<label>${_l('UBatch','llama.cpp physical micro-batch size. Blank = default.')}<input type="text" class="hwfit-sf" data-field="llama_ubatch_size" value="${esc(sv('llama_ubatch_size', ''))}" placeholder="512" /></label>`;
|
||||
panelHtml += `<label>${_l('Parallel','llama.cpp parallel slots. Blank = default; 1 matches single-lane presets.')}<input type="text" class="hwfit-sf" data-field="llama_parallel" value="${esc(sv('llama_parallel', ''))}" placeholder="1" /></label>`;
|
||||
panelHtml += `</div>`;
|
||||
// Auto-profile chips row removed — visual fit with the rest of the
|
||||
// serve panel was off, and the manual ctx/n_cpu_moe/cache controls
|
||||
@@ -791,12 +826,19 @@ function _rerenderCachedModels() {
|
||||
panelHtml += `<span style="opacity:0.7;">GPU memory:</span>`;
|
||||
panelHtml += `<span class="hwfit-vram-readout" style="opacity:0.5;">checking…</span>`;
|
||||
panelHtml += `</div>`;
|
||||
// Row 3a: Checkboxes (llama.cpp-only)
|
||||
// Group 4 — llama.cpp toggles. Single row of checkboxes, GPU-only
|
||||
// ones (Flash Attn, Unified Memory, Allow CPU overflow, Vision) hide
|
||||
// automatically in CPU mode. Order: perf-critical → safety → I/O →
|
||||
// niche. MTP Spec sits last because it owns its own numstep widget
|
||||
// and is the widest item.
|
||||
panelHtml += `<div class="hwfit-serve-checks hwfit-backend-llamacpp">`;
|
||||
panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="unified_mem"${sv('unified_mem',false)?' checked':''} /> Unified Memory${_h('For AMD APUs / Strix Halo: exports GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 so llama.cpp can address the full BIOS VRAM carveout instead of the default ~28 GB cap. No-op on discrete GPUs.')}</label>`;
|
||||
panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="llama_no_mmap"${sv('llama_no_mmap',false)?' checked':''} /> No mmap${_h('Adds --no-mmap for native llama-server. Useful for some high-context/local-storage setups, but not a universal default.')}</label>`;
|
||||
panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="llama_no_warmup"${sv('llama_no_warmup',false)?' checked':''} /> Skip warmup${_h('Adds --no-warmup. Can reduce startup memory spikes for tight launches, but llama.cpp defaults to warming up.')}</label>`;
|
||||
panelHtml += `<label class="hwfit-sf-cb hwfit-spec-group"><input type="checkbox" class="hwfit-sf" data-field="llama_speculative_mtp"${sv('llama_speculative_mtp',false)?' checked':''} /> MTP Spec${_h('llama.cpp native MTP speculative decoding: --spec-type draft-mtp. Requires a GGUF with MTP heads and a recent llama-server build.')} <span class="hwfit-numstep"><button type="button" class="hwfit-numstep-btn" data-step="-1" tabindex="-1" aria-label="Decrease">‹</button><input type="number" class="hwfit-sf hwfit-spec-tokens" data-field="llama_spec_tokens" value="${esc(sv('llama_spec_tokens', '3'))}" min="1" max="10" title="--spec-draft-n-max" /><button type="button" class="hwfit-numstep-btn" data-step="1" tabindex="-1" aria-label="Increase">›</button></span></label>`;
|
||||
panelHtml += `<label class="hwfit-sf-cb cookbook-llama-gpu-only"><input type="checkbox" class="hwfit-sf" data-field="flash_attn"${sv('flash_attn',false)?' checked':''} /> Flash Attn${_h('--flash-attn on: faster attention + needed for quantized KV cache. Auto by default.')}</label>`;
|
||||
panelHtml += `<label class="hwfit-sf-cb cookbook-llama-gpu-only"><input type="checkbox" class="hwfit-sf" data-field="unified_mem"${sv('unified_mem',false)?' checked':''} /> Unified Memory${_h('For AMD APUs / Strix Halo: exports GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 so llama.cpp can address the full BIOS VRAM carveout instead of the default ~28 GB cap. No-op on discrete GPUs.')}</label>`;
|
||||
panelHtml += `<label class="hwfit-sf-cb cookbook-llama-gpu-only"><input type="checkbox" class="hwfit-sf" data-field="llama_cpu_overflow"${sv('llama_cpu_overflow',false)?' checked':''} /> Allow CPU overflow${_h('OFF (default): cookbook blocks launches that would overflow GPU VRAM. ON: layers/KV cache that do not fit get pushed to CPU (slow).')}</label>`;
|
||||
panelHtml += `<label class="hwfit-sf-cb cookbook-llama-gpu-only"><input type="checkbox" class="hwfit-sf" data-field="vision"${sv('vision',false)?' checked':''} /> Vision${_h('Serve with the vision encoder so the model can read images. Auto-finds an mmproj-*.gguf next to the model. Adds ~1 GB VRAM.')}</label>`;
|
||||
panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="llama_no_mmap"${sv('llama_no_mmap',false)?' checked':''} /> No mmap${_h('Adds --no-mmap. Useful for some high-context/local-storage setups.')}</label>`;
|
||||
panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="llama_no_warmup"${sv('llama_no_warmup',false)?' checked':''} /> Skip warmup${_h('Adds --no-warmup. Reduces startup memory spikes; llama.cpp defaults to warming up.')}</label>`;
|
||||
panelHtml += `<label class="hwfit-sf-cb hwfit-spec-group"><input type="checkbox" class="hwfit-sf" data-field="llama_speculative_mtp"${sv('llama_speculative_mtp',false)?' checked':''} /> MTP Spec${_h('llama.cpp native MTP speculative decoding: --spec-type draft-mtp. Requires a GGUF with MTP heads.')} <span class="hwfit-numstep"><button type="button" class="hwfit-numstep-btn" data-step="-1" tabindex="-1" aria-label="Decrease">‹</button><input type="number" class="hwfit-sf hwfit-spec-tokens" data-field="llama_spec_tokens" value="${esc(sv('llama_spec_tokens', '3'))}" min="1" max="10" title="--spec-draft-n-max" /><button type="button" class="hwfit-numstep-btn" data-step="1" tabindex="-1" aria-label="Increase">›</button></span></label>`;
|
||||
panelHtml += `</div>`;
|
||||
// Row 3b: Checkboxes (diffusers)
|
||||
panelHtml += `<div class="hwfit-serve-checks hwfit-backend-diffusers">`;
|
||||
@@ -859,6 +901,21 @@ function _rerenderCachedModels() {
|
||||
const panel = item.querySelector('.hwfit-serve-panel');
|
||||
// Scroll the serve panel into view within its nearest scrollable ancestor
|
||||
requestAnimationFrame(() => panel.scrollIntoView({ block: 'nearest', behavior: 'smooth' }));
|
||||
// Firefox-mobile fallback: the CSS that grows the cached-list and
|
||||
// expanded card uses :has(.doclib-card-expanded), which Firefox
|
||||
// mobile doesn't support — so the panel stays collapsed and the
|
||||
// form is unusable. Pin explicit px heights here. On Chromium/
|
||||
// WebKit the !important CSS still wins, so this is a no-op there.
|
||||
// (See project_skills_expand_firefox memory note.)
|
||||
requestAnimationFrame(() => {
|
||||
try {
|
||||
const _itemH = Math.max(item.scrollHeight, item.getBoundingClientRect().height);
|
||||
if (_itemH > 0) item.style.maxHeight = _itemH + 'px';
|
||||
const _listH = Math.max(list.scrollHeight, list.getBoundingClientRect().height);
|
||||
if (_listH > 0) list.style.maxHeight = _listH + 'px';
|
||||
list.style.minHeight = _listH + 'px';
|
||||
} catch {}
|
||||
});
|
||||
|
||||
// Build command preview
|
||||
function updateCmd() {
|
||||
@@ -1859,6 +1916,49 @@ function _rerenderCachedModels() {
|
||||
updateCmd();
|
||||
});
|
||||
});
|
||||
// llama.cpp GPU/CPU mode-toggle pill wiring. Clicking GPU or CPU
|
||||
// flips the .active classes + .mode-right marker (so the sliding
|
||||
// pill matches Agent/Chat), updates the hidden data-field input,
|
||||
// and fires a change event so the existing field-change handler
|
||||
// rebuilds the serve cmd (sets -ngl 99 vs -ngl 0).
|
||||
panel.querySelectorAll('[data-llama-mode-toggle]').forEach(group => {
|
||||
group.querySelectorAll('.mode-toggle-btn').forEach(btn => {
|
||||
btn.addEventListener('click', (e) => {
|
||||
e.preventDefault(); e.stopPropagation();
|
||||
const want = btn.dataset.llamaMode;
|
||||
if (!want) return;
|
||||
group.querySelectorAll('.mode-toggle-btn').forEach(b => {
|
||||
const isActive = b.dataset.llamaMode === want;
|
||||
b.classList.toggle('active', isActive);
|
||||
b.setAttribute('aria-pressed', isActive ? 'true' : 'false');
|
||||
});
|
||||
group.classList.toggle('mode-right', want === 'gpu');
|
||||
const hidden = group.parentElement.querySelector('[data-field="llama_mode"]');
|
||||
if (hidden) {
|
||||
hidden.value = want;
|
||||
hidden.dispatchEvent(new Event('change', { bubbles: true }));
|
||||
}
|
||||
// Hide every GPU-only control (chiclets, Tensor Split,
|
||||
// Split Mode, Main GPU, Flash Attn, Unified Memory, etc.)
|
||||
// in CPU mode — `-ngl 0` ignores them and showing them
|
||||
// implies they matter.
|
||||
panel.classList.toggle('cookbook-llama-cpu-mode', want === 'cpu');
|
||||
panel.querySelectorAll('.cookbook-llama-gpu-only').forEach(el => {
|
||||
el.style.display = (want === 'cpu') ? 'none' : '';
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
// Apply the CPU-mode visibility on first render too, so a saved
|
||||
// preset that loaded with llama_mode=cpu hides GPU controls
|
||||
// immediately instead of flashing them then disappearing.
|
||||
{
|
||||
const _saved = panel.querySelector('[data-field="llama_mode"]')?.value || 'gpu';
|
||||
if (_saved === 'cpu') {
|
||||
panel.classList.add('cookbook-llama-cpu-mode');
|
||||
panel.querySelectorAll('.cookbook-llama-gpu-only').forEach(el => { el.style.display = 'none'; });
|
||||
}
|
||||
}
|
||||
// Themed +/- buttons next to spec_tokens — step the adjacent number input.
|
||||
panel.querySelectorAll('.hwfit-numstep-btn').forEach(btn => {
|
||||
btn.addEventListener('click', (e) => {
|
||||
@@ -2025,6 +2125,140 @@ function _rerenderCachedModels() {
|
||||
});
|
||||
return;
|
||||
}
|
||||
// llama.cpp VRAM-fit preflight. Catches the silent-CPU-fallback
|
||||
// trap: when the model + KV cache exceed the selected GPUs' free
|
||||
// VRAM, llama-cpp-python doesn't error — it pushes layers/KV to
|
||||
// CPU and inference crawls at sub-1 tok/s. The gate is on by default; can
|
||||
// be bypassed per-launch via the dialog's "Allow CPU overflow"
|
||||
// action, OR persistently by ticking the same-named checkbox.
|
||||
if (serveState.backend === 'llamacpp'
|
||||
&& String(serveState.llama_mode || 'gpu') !== 'cpu'
|
||||
&& !serveState.llama_cpu_overflow) {
|
||||
try {
|
||||
const _ctx = Math.max(1, parseInt(serveState.ctx, 10) || 8192);
|
||||
// Model size on disk — close enough for GPU footprint of a GGUF.
|
||||
const _modelBytes = Number(m?.size_bytes || 0) || Math.round((Number(m?.size_gb || 0)) * 1024 * 1024 * 1024);
|
||||
const _modelGb = _modelBytes / (1024 ** 3);
|
||||
// KV cache heuristic. ~0.7MB / token / 7.5GB-of-model at fp16
|
||||
// KV, scaled linearly by model size. Imperfect but covers
|
||||
// the common 7B–70B range within ~20% — good enough to catch
|
||||
// overflow before it silently happens.
|
||||
const _kvGbPerToken = _modelGb > 0 ? (_modelGb / 7.5) * 0.0007 : 0.0007;
|
||||
const _kvGb = _ctx * _kvGbPerToken;
|
||||
const _needGb = _modelGb + _kvGb;
|
||||
const _selStr = (serveState.gpus || '').trim();
|
||||
const _selIdx = _selStr ? _selStr.split(',').map(s => parseInt(s.trim(), 10)).filter(n => Number.isFinite(n)) : [0];
|
||||
// Fetch FRESH GPU data per-launch — the hwfit cache may be
|
||||
// stale or for a different host (e.g. user switched server
|
||||
// picker without scanning), which used to silently skip the
|
||||
// preflight and let the launch silently fall to CPU.
|
||||
let _hwGpus = [];
|
||||
try {
|
||||
const _gh = (_envState.remoteHost || '').trim();
|
||||
const _gp = new URLSearchParams();
|
||||
if (_gh) {
|
||||
_gp.set('host', _gh);
|
||||
const _sp = (_serverByVal?.(_envState.remoteServerKey || _gh) || {}).port;
|
||||
if (_sp) _gp.set('ssh_port', _sp);
|
||||
}
|
||||
const _gr = await fetch('/api/cookbook/gpus' + (_gp.toString() ? '?' + _gp : ''), { credentials: 'same-origin' });
|
||||
if (_gr.ok) {
|
||||
const _gd = await _gr.json();
|
||||
_hwGpus = Array.isArray(_gd) ? _gd : (_gd.gpus || []);
|
||||
}
|
||||
} catch {}
|
||||
const _freeFor = (idx) => {
|
||||
const g = _hwGpus[idx];
|
||||
const mb = g?.free_mb;
|
||||
return Number.isFinite(mb) ? mb / 1024 : 0;
|
||||
};
|
||||
const _selFreeGb = _selIdx.reduce((s, i) => s + _freeFor(i), 0);
|
||||
// Skip the gate when we don't have any free-VRAM data (probe
|
||||
// failed) — better to let the launch try than silently refuse
|
||||
// on a missing data point.
|
||||
if (_selFreeGb > 0 && _needGb > _selFreeGb && _modelGb > 0) {
|
||||
// Suggest the smallest set of additional GPUs whose free
|
||||
// VRAM closes the gap. Greedy by largest-free-first.
|
||||
const _candidates = _hwGpus
|
||||
.map((g, i) => ({ i, free: _freeFor(i) }))
|
||||
.filter(x => !_selIdx.includes(x.i) && x.free > 0)
|
||||
.sort((a, b) => b.free - a.free);
|
||||
const _addGpus = [];
|
||||
let _runFree = _selFreeGb;
|
||||
for (const c of _candidates) {
|
||||
_addGpus.push(c.i); _runFree += c.free;
|
||||
if (_runFree >= _needGb) break;
|
||||
}
|
||||
const _canAddGpu = _runFree >= _needGb && _addGpus.length > 0;
|
||||
// Recommend ctx that just-fits on current selection.
|
||||
const _recCtxRaw = Math.floor((_selFreeGb - _modelGb) / _kvGbPerToken);
|
||||
const _recCtx = Math.max(1024, Math.floor(_recCtxRaw / 1024) * 1024);
|
||||
// Custom modal — styledConfirm only takes 2 buttons; this
|
||||
// surface needs up to 4 actions (Reduce / Add GPUs / Allow / Cancel).
|
||||
const _action = await new Promise(resolve => {
|
||||
const ov = document.createElement('div');
|
||||
ov.className = 'modal';
|
||||
ov.style.cssText = 'display:flex;align-items:center;justify-content:center;z-index:10050;position:fixed;inset:0;background:rgba(0,0,0,0.4);';
|
||||
const _btnRow = [];
|
||||
if (_recCtx > 1024 && _recCtx < _ctx) {
|
||||
_btnRow.push(`<button data-vram-action="reduce" class="confirm-btn confirm-btn-primary" style="width:100%;">Reduce ctx to ${_recCtx.toLocaleString()}</button>`);
|
||||
}
|
||||
if (_canAddGpu) {
|
||||
_btnRow.push(`<button data-vram-action="add_gpus" class="confirm-btn confirm-btn-primary" style="width:100%;">Add GPU${_addGpus.length > 1 ? 's' : ''} ${_addGpus.join(', ')}</button>`);
|
||||
}
|
||||
_btnRow.push(`<button data-vram-action="allow_cpu" class="confirm-btn confirm-btn-secondary" style="width:100%;">Allow CPU overflow (slow)</button>`);
|
||||
_btnRow.push(`<button data-vram-action="cancel" class="confirm-btn confirm-btn-secondary" style="width:100%;">Cancel</button>`);
|
||||
ov.innerHTML = '<div class="modal-content" style="max-width:480px;">'
|
||||
+ '<div class="modal-header"><h4>Will not fit on selected GPU' + (_selIdx.length > 1 ? 's' : '') + '</h4></div>'
|
||||
+ '<div class="modal-body" style="font-size:12px;line-height:1.5;">'
|
||||
+ '<p>Model + KV cache would overflow VRAM on the selected GPU' + (_selIdx.length > 1 ? 's' : '') + '. llama-cpp-python will silently spill to CPU → very slow inference.</p>'
|
||||
+ '<ul style="opacity:0.75;padding-left:18px;">'
|
||||
+ '<li>Model: ~' + _modelGb.toFixed(1) + ' GB</li>'
|
||||
+ '<li>KV cache (ctx ' + _ctx.toLocaleString() + '): ~' + _kvGb.toFixed(1) + ' GB</li>'
|
||||
+ '<li>Total needed: ~' + _needGb.toFixed(1) + ' GB</li>'
|
||||
+ '<li>Free on GPU ' + _selIdx.join(', ') + ': ~' + _selFreeGb.toFixed(1) + ' GB</li>'
|
||||
+ '</ul>'
|
||||
+ '</div>'
|
||||
+ '<div class="modal-footer" style="flex-direction:column;gap:6px;align-items:stretch;">' + _btnRow.join('') + '</div>'
|
||||
+ '</div>';
|
||||
document.body.appendChild(ov);
|
||||
ov.addEventListener('click', (e) => {
|
||||
const b = e.target.closest('[data-vram-action]');
|
||||
if (b) { ov.remove(); resolve(b.dataset.vramAction); }
|
||||
else if (e.target === ov) { ov.remove(); resolve('cancel'); }
|
||||
});
|
||||
});
|
||||
if (_action === 'cancel' || !_action) { _restoreLaunchBtn(); return; }
|
||||
if (_action === 'reduce') {
|
||||
const _ctxEl = panel.querySelector('[data-field="ctx"]');
|
||||
if (_ctxEl) {
|
||||
_ctxEl.value = String(_recCtx);
|
||||
serveState.ctx = String(_recCtx);
|
||||
_ctxEl.dispatchEvent(new Event('change', { bubbles: true }));
|
||||
}
|
||||
} else if (_action === 'add_gpus') {
|
||||
for (const i of _addGpus) {
|
||||
const _b = panel.querySelector(`.cookbook-gpu-btn[data-gpu="${i}"]`);
|
||||
if (_b && !_b.classList.contains('active')) _b.click();
|
||||
}
|
||||
const _gpusEl = panel.querySelector('[data-field="gpus"]');
|
||||
if (_gpusEl) serveState.gpus = _gpusEl.value;
|
||||
} else if (_action === 'allow_cpu') {
|
||||
const _ov = panel.querySelector('[data-field="llama_cpu_overflow"]');
|
||||
if (_ov) {
|
||||
_ov.checked = true;
|
||||
_ov.dispatchEvent(new Event('change', { bubbles: true }));
|
||||
}
|
||||
serveState.llama_cpu_overflow = true;
|
||||
}
|
||||
// After mutation, rebuild the serve cmd preview so the
|
||||
// launched cmd matches what the user just chose.
|
||||
try { updateCmd(); } catch {}
|
||||
}
|
||||
} catch (_e) {
|
||||
// Preflight is best-effort — never block on its own failure.
|
||||
}
|
||||
}
|
||||
// Pre-launch GPU probe — common failure pattern: vLLM/SGLang launched
|
||||
// on a host where no GPU is visible (driver missing, $CUDA_VISIBLE_DEVICES
|
||||
// unset, container without --gpus). Catch it BEFORE the user spends
|
||||
@@ -2151,6 +2385,38 @@ function _rerenderCachedModels() {
|
||||
if (venvVal) { _envState.env = 'venv'; _envState.envPath = venvVal; }
|
||||
else if (_srvEnvPath) { _envState.env = (_srvEnv === 'conda' ? 'conda' : 'venv'); _envState.envPath = _srvEnvPath; }
|
||||
if (gpusVal) _envState.gpus = gpusVal;
|
||||
// Preflight: launching a GPU engine (llama.cpp / vLLM / SGLang / diffusers)
|
||||
// against the local-in-container target on a host whose hwfit
|
||||
// scan reports no GPU backend. That falls through to a CPU build
|
||||
// / CPU inference path and is usually NOT what the user wants —
|
||||
// they typically have a host-side GPU (AMD/Vulkan, NVIDIA on a
|
||||
// different box) that the container can't see. Surface this so
|
||||
// the user can pick the host as a remote target instead, or
|
||||
// confirm they really meant CPU.
|
||||
try {
|
||||
const _isLocalInContainer = !serveHost; // empty serveHost == cookbook container's local
|
||||
const _wantsGpu = ['llamacpp', 'vllm', 'sglang', 'diffusers'].includes(serveState.backend);
|
||||
const _detectedBackend = String(_hwfitCache?.system?.backend || '').toLowerCase();
|
||||
const _gpuBackends = ['cuda', 'rocm', 'vulkan', 'metal', 'mps', 'apple'];
|
||||
if (_isLocalInContainer && _wantsGpu && _detectedBackend && !_gpuBackends.includes(_detectedBackend)) {
|
||||
const _proceed = await window.styledConfirm(
|
||||
`The local (in-container) target has no GPU backend detected (hwfit reports: "${_detectedBackend || 'none'}"). ${serveState.backend.toUpperCase()} will run on CPU only and may be unusably slow.\n\nIf this machine has a GPU on the host, add the host as a server in Settings and target that instead. Otherwise launch anyway for CPU inference.`,
|
||||
{
|
||||
title: 'No GPU on local target',
|
||||
confirmText: 'Launch anyway (CPU)',
|
||||
cancelText: 'Cancel',
|
||||
danger: true,
|
||||
},
|
||||
);
|
||||
if (!_proceed) {
|
||||
if (typeof _restoreLaunchBtn === 'function') _restoreLaunchBtn();
|
||||
_envState.env = origEnv;
|
||||
_envState.envPath = origEnvPath;
|
||||
_envState.gpus = origGpus;
|
||||
return;
|
||||
}
|
||||
}
|
||||
} catch { /* preflight is best-effort */ }
|
||||
try {
|
||||
await _withSpinner(_launchBtn, async () => {
|
||||
// Pass the exact form values so the running task can be re-opened
|
||||
|
||||
+16
-4
@@ -15930,6 +15930,17 @@ body:not(.email-doc-split-active) #email-lib-modal.email-lib-fullscreen:not(.mod
|
||||
flex: 0 0 auto !important;
|
||||
height: auto !important;
|
||||
}
|
||||
/* Launch tab's cached-list normally has `flex: 1 1 0; min-height: 0`
|
||||
(so it fills the modal on desktop). On mobile the parent now has
|
||||
`height: auto`, which collapses `flex: 1 1 0` to ZERO PX —
|
||||
models render but the list area is invisible because the flex
|
||||
basis is 0 and there's no free space to grow into. Switch to
|
||||
content-sized flex so the list grows with its children. */
|
||||
#cookbook-modal .cookbook-group[data-backend-group="Serve"] > .admin-card > .hwfit-cached-list,
|
||||
#cookbook-modal .cookbook-group[data-backend-group="Serve"] > .admin-card > #hwfit-cached-list {
|
||||
flex: 0 0 auto !important;
|
||||
overflow: visible !important;
|
||||
}
|
||||
}
|
||||
#cookbook-modal .hwfit-cached-list {
|
||||
flex-shrink: 0;
|
||||
@@ -18560,7 +18571,7 @@ body.gallery-selecting .gallery-dl-btn,
|
||||
label and center it vertically so the descenders don't clip. */
|
||||
#hwfit-cache-select {
|
||||
min-width: 58px;
|
||||
height: 32px;
|
||||
height: 28px;
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
@@ -19316,7 +19327,7 @@ body.gallery-selecting .gallery-dl-btn,
|
||||
margin-bottom: 4px;
|
||||
}
|
||||
.cookbook-slot-btn {
|
||||
min-width: 22px; height: 22px;
|
||||
min-width: 22px; height: 28px;
|
||||
padding: 0 6px;
|
||||
font-size: 10px; font-weight: 600;
|
||||
border: 1px solid var(--border);
|
||||
@@ -19733,11 +19744,12 @@ body.gallery-selecting .gallery-dl-btn,
|
||||
font-size: 12px;
|
||||
padding: 0 6px;
|
||||
height: 28px;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
.hwfit-sf[data-field="backend"],
|
||||
.hwfit-sf[data-field="dtype"],
|
||||
.hwfit-sf[data-field="tp"] {
|
||||
height: 32px;
|
||||
height: 28px;
|
||||
box-sizing: border-box;
|
||||
width: 100%;
|
||||
}
|
||||
@@ -23569,7 +23581,7 @@ details.hwfit-serve-advanced > .hwfit-serve-checks:last-of-type {
|
||||
width: 51px;
|
||||
}
|
||||
#serve-search {
|
||||
height: 32px;
|
||||
height: 28px;
|
||||
}
|
||||
#cookbook-dl-btn {
|
||||
position: relative;
|
||||
|
||||
Reference in New Issue
Block a user