mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-30 00:22:10 -04:00
Cookbook UI: backend-aware env vars, always-show MoE/EP/Reasoning toggles, GPU default, Firefox-mobile expand
Frontend half of the backend-detection + per-OS install command work, plus a pile of mobile/UX fixes:

Backend awareness:
- _gpuEnvPrefix() picks CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES / nothing based on detected hwfit backend + scanned-host match (so a stale ajax scan does not leak CUDA env vars into a kierkegaard Vulkan launch). Replaces 6 hardcoded CUDA_VISIBLE_DEVICES sites.
- GGML_CUDA_ENABLE_UNIFIED_MEMORY only emitted when backend is actually CUDA (was leaking onto Vulkan/ROCm via saved presets).

Per-target install command:
- Dep rows render a single mono command box + Copy button when the server resolved pkg.install_cmd_for_target. Reused in the build-deps install failure toast so the toast and the row show the same line.
- Diagnosis patterns split cmake/g++/git out of the generic llama-cpp-python catch-all so a missing-cmake failure surfaces a cmake-specific message + per-distro Copy buttons.

Form toggles always visible:
- Reasoning Parser, Expert Parallel, MoE Env Vars no longer gated on model-family detection. Detection still hints (parser tag shown when matched); toggle works with sensible defaults otherwise. MiniMax M-series added to MoE family detector so the auto-fill is right.

Mobile + GPU default:
- Launch tab cached-list flex collapsed to 0px on mobile because the desktop `flex: 1 1 0` had no parent height to grow into. Override to `flex: 0 0 auto` in the cookbook mobile @media block.
- doclib-card expand on mobile (Firefox no :has() support) pins explicit px heights so the launch form actually appears.
- llama_mode defaults to gpu when hwfit detected cuda/rocm/vulkan/metal on the current target, instead of always cpu (which was forcing -ngl 0 on first-open and burning 35GB models on CPU).
This commit is contained in:
@@ -578,7 +578,9 @@ export async function _hwfitFetch(fresh = false) {
|
||||
const _cached = fresh ? null : _readScanCache(_sig);
|
||||
const wp = spinnerModule.createWhirlpool(18);
|
||||
if (_cached) {
|
||||
_hwfitCache = _cached;
|
||||
// Tag the restored cache with its host too (scan-sig keys cache per
|
||||
// host, so a hit here is always for the current remoteHost).
|
||||
_hwfitCache = { ..._cached, _scannedHost: remoteHost || '' };
|
||||
_hwfitRenderHw(hw, _cached.system);
|
||||
if (!remoteHost && _cached.system && _cached.system.platform) {
|
||||
_envState.platform = _cached.system.platform;
|
||||
@@ -750,7 +752,11 @@ export async function _hwfitFetch(fresh = false) {
|
||||
: _olRows;
|
||||
data.models = (data.models || []).concat(_olFiltered);
|
||||
}
|
||||
_hwfitCache = data;
|
||||
// Tag the cache with the host this scan was for, so downstream
|
||||
// code (_gpuEnvVarName, backend-aware command builders) can avoid
|
||||
// trusting a stale scan when the user switches the server picker
|
||||
// to a different target without re-running hwfit.
|
||||
_hwfitCache = { ...data, _scannedHost: remoteHost || '' };
|
||||
_hwfitRenderHw(hw, data.system);
|
||||
// Propagate local platform from hardware probe so _isWindows(task) works
|
||||
// for local tasks (menu items, shell commands, etc.).
|
||||
@@ -1679,7 +1685,7 @@ export function _expandModelRow(row, modelData) {
|
||||
} else if (runBackend === 'llamacpp') {
|
||||
const dir = `"$HOME/.cache/huggingface/hub/models--${modelData.name.replace(/\//g, '--')}/snapshots"`;
|
||||
const ggufPath = `$({ find ${dir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${dir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`;
|
||||
cmd = `MODEL_FILE=${ggufPath} && { [ -n "$MODEL_FILE" ] && [ -f "$MODEL_FILE" ]; } || { echo "ERROR: No GGUF found on this host. Download a GGUF quant or switch backend."; exit 1; } && llama-server --model "$MODEL_FILE" --host 0.0.0.0 --port 8080 -ngl 99 -c ${maxCtx} || python3 -m llama_cpp.server --model "$MODEL_FILE" --host 0.0.0.0 --port 8080 --n_gpu_layers 99 --n_ctx ${maxCtx}`;
|
||||
cmd = `llama-server --model "${ggufPath}" --host 0.0.0.0 --port 8080 -ngl 99 -c ${maxCtx} --flash-attn auto`;
|
||||
} else {
|
||||
cmd = `vllm serve ${modelData.name} --host 0.0.0.0 --port ${port}`;
|
||||
cmd += ` --tensor-parallel-size ${tp}`;
|
||||
|
||||
Reference in New Issue
Block a user