diff --git a/routes/cookbook_helpers.py b/routes/cookbook_helpers.py index 51f019edb..836b26c8a 100644 --- a/routes/cookbook_helpers.py +++ b/routes/cookbook_helpers.py @@ -561,7 +561,7 @@ def _bash_squote(v: str) -> str: # Allow-list of binaries permitted as the leading token of `req.cmd` for /api/model/serve. # Anything else is rejected before the cmd is interpolated into a tmux/PowerShell wrapper. _SERVE_CMD_ALLOWLIST = { - "vllm", "llama-server", "llama_server", "llama.cpp", "ollama", + "vllm", "llama-server", "llama-server.exe", "llama_server", "llama.cpp", "ollama", "python", "python3", "sglang", "lmdeploy", "node", "npx", diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py index f57ecf6e5..a0fb672d7 100644 --- a/routes/cookbook_routes.py +++ b/routes/cookbook_routes.py @@ -73,6 +73,9 @@ def setup_cookbook_routes() -> APIRouter: return "stored" return f"{value[:4]}...{value[-4:]}" + def _client_host_platform() -> str: + return "windows" if IS_WINDOWS else "" + def _decrypt_secret(value: str | None) -> str: if not value: return "" @@ -245,11 +248,15 @@ def setup_cookbook_routes() -> APIRouter: """Return cookbook state without raw secrets for browser clients.""" _strip_task_secrets(state) env = state.get("env") if isinstance(state, dict) else None + if isinstance(state, dict) and not isinstance(env, dict): + env = {} + state["env"] = env if isinstance(env, dict): token = _decrypt_secret(env.get("hfToken")) env.pop("hfToken", None) env["hfTokenConfigured"] = bool(token) env["hfTokenMasked"] = _mask_secret(token) + env["hostPlatform"] = _client_host_platform() return state def _state_for_storage(state, on_disk=None): @@ -268,6 +275,7 @@ def setup_cookbook_routes() -> APIRouter: env.pop("hfToken", None) env.pop("hfTokenMasked", None) env.pop("hfTokenConfigured", None) + env.pop("hostPlatform", None) return state def _load_stored_hf_token() -> str: @@ -1479,6 +1487,10 @@ def setup_cookbook_routes() -> APIRouter: # shell resolves the bundled python3/hf, mirroring the download flow. if not remote: runner_lines.append(_local_tooling_path_export(sys.executable)) + if local_windows: + # Detached Git Bash runs do not always inherit recently edited + # user PATH entries from the already-running Odysseus process. + runner_lines.append('export PATH="$HOME/bin:$HOME/llama.cpp/build-cuda/bin/Release:$HOME/llama.cpp/build/bin/Release:$HOME/llama.cpp/build/bin/Debug:$HOME/llama.cpp/build/bin:$PATH"') runner_lines.append("export FLASHINFER_DISABLE_VERSION_CHECK=1") if req.hf_token: runner_lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'") @@ -1493,7 +1505,8 @@ def setup_cookbook_routes() -> APIRouter: runner_lines.append(_HF_TOKEN_STATUS_SNIPPET) handled_ollama_serve = False # Auto-install inference engine if missing - if "llama_cpp" in req.cmd or "llama-server" in req.cmd: + local_windows_llama_cmd = local_windows and ("llama_cpp" in req.cmd or "llama-server" in req.cmd) + if ("llama_cpp" in req.cmd or "llama-server" in req.cmd) and not local_windows_llama_cmd: # Prefer the NATIVE llama-server binary — its minja templating # renders modern GGUF chat templates that the Python bindings' # Jinja2 rejects (do_tojson ensure_ascii). Build it once from @@ -2396,8 +2409,8 @@ def setup_cookbook_routes() -> APIRouter: try: return _state_for_client(json.loads(_cookbook_state_path.read_text(encoding="utf-8"))) except Exception: - return {} - return {} + return _state_for_client({}) + return _state_for_client({}) @router.post("/api/cookbook/state") async def save_cookbook_state(request: Request): diff --git a/static/js/cookbook.js b/static/js/cookbook.js index fca21b57e..8821e1bc6 100644 --- a/static/js/cookbook.js +++ b/static/js/cookbook.js @@ -76,7 +76,7 @@ function _platformIcon(platform) { return ''; } -export let _envState = { env: 'none', envPath: '', hfToken: '', hfTokenConfigured: false, hfTokenMasked: '', gpus: '', remoteHost: '', servers: [], modelPaths: [], platform: '', defaultServer: '' }; +export let _envState = { env: 'none', envPath: '', hfToken: '', hfTokenConfigured: false, hfTokenMasked: '', gpus: '', remoteHost: '', servers: [], modelPaths: [], platform: '', hostPlatform: '', defaultServer: '' }; let _lastCacheHostVal = null; let _cookbookOpeningSpinners = []; export function _lastCacheHost() { return _lastCacheHostVal; } @@ -213,8 +213,13 @@ function _getPort(hostOrTask) { /** Get platform for a given host (or task object). Returns 'windows', 'termux', 'linux', or '' */ export function _getPlatform(hostOrTask) { - if (!hostOrTask) return _envState.platform || ''; - if (typeof hostOrTask === 'object') return hostOrTask.platform || _getPlatform(hostOrTask.remoteServerKey || hostOrTask.remoteHost); + if (hostOrTask === 'local') return _envState.hostPlatform || ''; + if (!hostOrTask) return _envState.remoteHost ? (_envState.platform || '') : (_envState.hostPlatform || ''); + if (typeof hostOrTask === 'object') { + const taskHost = hostOrTask.remoteServerKey || hostOrTask.remoteHost || ''; + if (!taskHost || taskHost === 'local') return _envState.hostPlatform || ''; + return hostOrTask.platform || _getPlatform(taskHost); + } const selected = hostOrTask === _envState.remoteHost ? _selectedServer() : null; const srv = selected || _serverByVal(hostOrTask); return srv?.platform || ''; @@ -638,7 +643,12 @@ export function _buildServeCmd(f, modelName, backend) { // GPU list — read from gpus (button strip); fall back to gpu_id for // backward-compat with older saved presets that pre-date the removal. const gpuId = (f.gpus || f.gpu_id || '').toString().trim(); - const py = _isWindows() ? 'python' : 'python3'; + const _targetHost = Object.prototype.hasOwnProperty.call(f, 'host') + ? String(f.host || '').trim() + : String(_envState.remoteHost || '').trim(); + const _isWin = _targetHost ? _isWindows(_targetHost) : _isWindows('local'); + const _localWindows = _isWin && !_targetHost; + const py = _isWin ? 'python' : 'python3'; // CPU-only serve (-ngl 0): drop the GPU-only flags, otherwise the command // mixes "zero GPU layers" with CUDA unified-memory + flash-attn and fails to // start (issue #1291). Only affects the ngl=0 path; GPU serving is unchanged. @@ -660,19 +670,19 @@ export function _buildServeCmd(f, modelName, backend) { // with misleading prefixes. const _sb = String(_hwfitCache?.system?.backend || '').toLowerCase(); const _hwfitHost = String(_hwfitCache?._scannedHost || ''); - const _curHost = String(_envState.remoteHost || ''); + const _curHost = _targetHost; const _isCudaTarget = (_sb === 'cuda') && (_hwfitHost === _curHost); const lcPrefix = (() => { let p = ''; - if (f.unified_mem && !_cpuOnly && !_isWindows() && _isCudaTarget) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `; - // No GPU env var in CPU mode — `-ngl 0` already disables offload + if (f.unified_mem && !_cpuOnly && (!_isWin || _localWindows) && _isCudaTarget) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `; + // No GPU env var in CPU mode - `-ngl 0` already disables offload // so CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES would be misleading // clutter ("why is CUDA pinned for a CPU run?"). - if (!_isWindows() && !_cpuOnly) p += _gpuEnvPrefix(gpuId); + if ((!_isWin || _localWindows) && !_cpuOnly) p += _gpuEnvPrefix(gpuId); return p; })(); - if (f.unified_mem && !_cpuOnly && _isWindows() && _isCudaTarget) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `; - if (_isWindows() && !_cpuOnly) cmd += _gpuEnvPrefix(gpuId, true); + if (f.unified_mem && !_cpuOnly && _isWin && !_localWindows && _isCudaTarget) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `; + if (_isWin && !_localWindows && !_cpuOnly) cmd += _gpuEnvPrefix(gpuId, true); const needsGgufPrelude = /^\$\(\{\s*find\s/.test(String(ggufPath || '')); const modelArg = needsGgufPrelude ? '"$MODEL_FILE"' : `"${ggufPath}"`; // Prefer native llama-server. The backend bootstrap resolves/builds the @@ -744,11 +754,16 @@ export function _buildServeCmd(f, modelName, backend) { // llama-cpp-python takes the projector via --clip_model_path. _lcpExtra += ` --clip_model_path "${f._mmproj_path}"`; } - if (_isWindows()) { - const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}${_lcpExtra}`; + const _lcServer = `${lcPrefix}llama-server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} -ngl ${f.ngl || '99'} -c ${f.ctx || '8192'}${_lcExtra}`; + const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}${_lcpExtra}`; + if (_localWindows) { + // Local Windows serve is launched through Git Bash, so use the native + // llama-server shape and let PATH resolve the CUDA Release wrapper. + cmd += _lcServer; + } else if (_isWin) { cmd += _lcpServer; } else { - cmd += `${lcPrefix}llama-server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} -ngl ${f.ngl || '99'} -c ${f.ctx || '8192'}${_lcExtra}`; + cmd += _lcServer; } if (needsGgufPrelude) { cmd = `MODEL_FILE=${ggufPath} && { [ -n "$MODEL_FILE" ] && [ -f "$MODEL_FILE" ]; } || { echo "ERROR: No GGUF found on this host"; exit 1; } && ${cmd}`; @@ -2612,13 +2627,14 @@ function _renderRecipes() { const isLocal = !s.host || s.host.toLowerCase() === 'local'; if (isLocal) { s.host = ''; + s.platform = _envState.hostPlatform || ''; if (_localSeen) return false; _localSeen = true; } return true; }); if (!_localSeen) { - _es.servers.unshift({ host: '', env: _es.env || 'none', envPath: _es.envPath || '', modelDir: '~/.cache/huggingface/hub' }); + _es.servers.unshift({ host: '', env: _es.env || 'none', envPath: _es.envPath || '', modelDir: '~/.cache/huggingface/hub', platform: _envState.hostPlatform || '' }); } if (_es.remoteHost && !_es.servers.some(s => s.host === _es.remoteHost)) { _es.servers.push({ host: _es.remoteHost, env: _es.env || 'none', envPath: _es.envPath || '', modelDir: '~/.cache/huggingface/hub' }); diff --git a/static/js/cookbookRunning.js b/static/js/cookbookRunning.js index 8e106f533..a64205c4d 100644 --- a/static/js/cookbookRunning.js +++ b/static/js/cookbookRunning.js @@ -781,6 +781,7 @@ function _stripStateSecrets(state) { const safe = { ...state }; if (safe.env && typeof safe.env === 'object') { const { hfToken, ...env } = safe.env; + delete env.hostPlatform; safe.env = env; } if (Array.isArray(safe.tasks)) safe.tasks = safe.tasks.map(_redactTaskForStorage); @@ -1673,7 +1674,7 @@ export async function _launchServeTask(shortName, repo, cmd, fields, hostOverrid || _envState.servers.find(s => s.host === _host) || {}; const _serverMetaKey = _targetKey || (_hsrv && _serverKey ? _serverKey(_hsrv) : '') || (_host || 'local'); const _serverMetaName = targetMeta?.serverName || _hsrv.name || (_host ? _host : 'Local'); - const _hplatform = _host ? (_hsrv.platform || '') : (_envState.platform || ''); + const _hplatform = _host ? (_hsrv.platform || '') : (_envState.hostPlatform || ''); const _replaceTaskId = fields?._replaceTaskId || ''; if (_replaceTaskId) { try { @@ -1688,7 +1689,6 @@ export async function _launchServeTask(shortName, repo, cmd, fields, hostOverrid } } catch {} } - // Replace any serve already targeting this same host:port — you can't run two // servers on one port, so re-serving (or retrying) should stop & remove the // old one instead of leaving a dead duplicate behind. (The retry buttons diff --git a/static/js/cookbookServe.js b/static/js/cookbookServe.js index 6f7b53057..253ba7483 100644 --- a/static/js/cookbookServe.js +++ b/static/js/cookbookServe.js @@ -527,7 +527,7 @@ function _selectedServeTarget(panel) { env: server?.env || '', port: host ? (server?.port || _getPort(host) || '') : '', venv, - platform: server?.platform || _envState.platform || '', + platform: host ? (server?.platform || '') : (_envState.hostPlatform || ''), label, }; } @@ -658,6 +658,12 @@ function _selectedGgufSizeGb(model, relPath) { return bytes / (1024 ** 3); } +function _projectorGgufFiles(model) { + return _ggufFilesForModel(model) + .filter(f => (f.role || '') === 'projector' || /(^|\/)mmproj[^/]*\.gguf$/i.test(f.rel_path || f.name || '')) + .sort((a, b) => String(a.rel_path || a.name || '').localeCompare(String(b.rel_path || b.name || ''))); +} + function _ggufFileLabel(file) { const base = (file.name || file.rel_path || '').split('/').pop(); const size = _formatGgufSize(file.size_bytes); @@ -1198,6 +1204,7 @@ function _rerenderCachedModels() { panelHtml += `