Cookbook UI: backend-aware env vars, always-show MoE/EP/Reasoning toggles, GPU default, Firefox-mobile expand

Frontend half of the backend-detection + per-OS install command work, plus a pile of mobile/UX fixes: Backend awareness: - _gpuEnvPrefix() picks CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES / nothing based on detected hwfit backend + scanned-host match (so a stale ajax scan does not leak CUDA env vars into a kierkegaard Vulkan launch). Replaces 6 hardcoded CUDA_VISIBLE_DEVICES sites. - GGML_CUDA_ENABLE_UNIFIED_MEMORY only emitted when backend is actually CUDA (was leaking onto Vulkan/ROCm via saved presets). Per-target install command: - Dep rows render a single mono command box + Copy button when the server resolved pkg.install_cmd_for_target. Reused in the build-deps install failure toast so the toast and the row show the same line. - Diagnosis patterns split cmake/g++/git out of the generic llama-cpp-python catch-all so a missing-cmake failure surfaces a cmake-specific message + per-distro Copy buttons. Form toggles always visible: - Reasoning Parser, Expert Parallel, MoE Env Vars no longer gated on model-family detection. Detection still hints (parser tag shown when matched); toggle works with sensible defaults otherwise. MiniMax M- series added to MoE family detector so the auto-fill is right. Mobile + GPU default: - Launch tab cached-list flex collapsed to 0px on mobile because the desktop `flex: 1 1 0` had no parent height to grow into. Override to `flex: 0 0 auto` in the cookbook mobile @media block. - doclib-card expand on mobile (Firefox no :has() support) pins explicit px heights so the launch form actually appears. - llama_mode defaults to gpu when hwfit detected cuda/rocm/vulkan/ metal on the current target, instead of always cpu (which was forcing -ngl 0 on first-open and burning 35GB models on CPU).
2026-06-30 00:22:10 -04:00 · 2026-06-19 00:33:37 +00:00
parent f01465e87f
commit ee6fd8ffe8
5 changed files with 706 additions and 76 deletions
@@ -578,7 +578,9 @@ export async function _hwfitFetch(fresh = false) {
  const _cached = fresh ? null : _readScanCache(_sig);
  const wp = spinnerModule.createWhirlpool(18);
  if (_cached) {
-    _hwfitCache = _cached;
+    // Tag the restored cache with its host too (scan-sig keys cache per
+    // host, so a hit here is always for the current remoteHost).
+    _hwfitCache = { ..._cached, _scannedHost: remoteHost || '' };
    _hwfitRenderHw(hw, _cached.system);
    if (!remoteHost && _cached.system && _cached.system.platform) {
      _envState.platform = _cached.system.platform;
@@ -750,7 +752,11 @@ export async function _hwfitFetch(fresh = false) {
        : _olRows;
      data.models = (data.models || []).concat(_olFiltered);
    }
-    _hwfitCache = data;
+    // Tag the cache with the host this scan was for, so downstream
+    // code (_gpuEnvVarName, backend-aware command builders) can avoid
+    // trusting a stale scan when the user switches the server picker
+    // to a different target without re-running hwfit.
+    _hwfitCache = { ...data, _scannedHost: remoteHost || '' };
    _hwfitRenderHw(hw, data.system);
    // Propagate local platform from hardware probe so _isWindows(task) works
    // for local tasks (menu items, shell commands, etc.).
@@ -1679,7 +1685,7 @@ export function _expandModelRow(row, modelData) {
      } else if (runBackend === 'llamacpp') {
        const dir = `"$HOME/.cache/huggingface/hub/models--${modelData.name.replace(/\//g, '--')}/snapshots"`;
        const ggufPath = `$({ find ${dir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${dir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`;
-        cmd = `MODEL_FILE=${ggufPath} && { [ -n "$MODEL_FILE" ] && [ -f "$MODEL_FILE" ]; } || { echo "ERROR: No GGUF found on this host. Download a GGUF quant or switch backend."; exit 1; } && llama-server --model "$MODEL_FILE" --host 0.0.0.0 --port 8080 -ngl 99 -c ${maxCtx} || python3 -m llama_cpp.server --model "$MODEL_FILE" --host 0.0.0.0 --port 8080 --n_gpu_layers 99 --n_ctx ${maxCtx}`;
+        cmd = `llama-server --model "${ggufPath}" --host 0.0.0.0 --port 8080 -ngl 99 -c ${maxCtx} --flash-attn auto`;
      } else {
        cmd = `vllm serve ${modelData.name} --host 0.0.0.0 --port ${port}`;
        cmd += ` --tensor-parallel-size ${tp}`;