fix(cookbook): treat local Windows as Windows for serve commands (#3975)

* fix(cookbook): prefer native llama-server on local Windows

* fix(cookbook): harden local llama-server launch commands

* fix(cookbook): build serve commands for selected target
This commit is contained in:
Dividesbyzer0
2026-06-26 08:13:01 -04:00
committed by GitHub
parent de12d4734a
commit f5200ec45b
7 changed files with 190 additions and 36 deletions
+1 -1
View File
@@ -561,7 +561,7 @@ def _bash_squote(v: str) -> str:
# Allow-list of binaries permitted as the leading token of `req.cmd` for /api/model/serve. # Allow-list of binaries permitted as the leading token of `req.cmd` for /api/model/serve.
# Anything else is rejected before the cmd is interpolated into a tmux/PowerShell wrapper. # Anything else is rejected before the cmd is interpolated into a tmux/PowerShell wrapper.
_SERVE_CMD_ALLOWLIST = { _SERVE_CMD_ALLOWLIST = {
"vllm", "llama-server", "llama_server", "llama.cpp", "ollama", "vllm", "llama-server", "llama-server.exe", "llama_server", "llama.cpp", "ollama",
"python", "python3", "python", "python3",
"sglang", "lmdeploy", "sglang", "lmdeploy",
"node", "npx", "node", "npx",
+16 -3
View File
@@ -73,6 +73,9 @@ def setup_cookbook_routes() -> APIRouter:
return "stored" return "stored"
return f"{value[:4]}...{value[-4:]}" return f"{value[:4]}...{value[-4:]}"
def _client_host_platform() -> str:
    """Return "windows" when IS_WINDOWS is truthy, otherwise an empty string."""
    if IS_WINDOWS:
        return "windows"
    return ""
def _decrypt_secret(value: str | None) -> str: def _decrypt_secret(value: str | None) -> str:
if not value: if not value:
return "" return ""
@@ -245,11 +248,15 @@ def setup_cookbook_routes() -> APIRouter:
"""Return cookbook state without raw secrets for browser clients.""" """Return cookbook state without raw secrets for browser clients."""
_strip_task_secrets(state) _strip_task_secrets(state)
env = state.get("env") if isinstance(state, dict) else None env = state.get("env") if isinstance(state, dict) else None
if isinstance(state, dict) and not isinstance(env, dict):
env = {}
state["env"] = env
if isinstance(env, dict): if isinstance(env, dict):
token = _decrypt_secret(env.get("hfToken")) token = _decrypt_secret(env.get("hfToken"))
env.pop("hfToken", None) env.pop("hfToken", None)
env["hfTokenConfigured"] = bool(token) env["hfTokenConfigured"] = bool(token)
env["hfTokenMasked"] = _mask_secret(token) env["hfTokenMasked"] = _mask_secret(token)
env["hostPlatform"] = _client_host_platform()
return state return state
def _state_for_storage(state, on_disk=None): def _state_for_storage(state, on_disk=None):
@@ -268,6 +275,7 @@ def setup_cookbook_routes() -> APIRouter:
env.pop("hfToken", None) env.pop("hfToken", None)
env.pop("hfTokenMasked", None) env.pop("hfTokenMasked", None)
env.pop("hfTokenConfigured", None) env.pop("hfTokenConfigured", None)
env.pop("hostPlatform", None)
return state return state
def _load_stored_hf_token() -> str: def _load_stored_hf_token() -> str:
@@ -1479,6 +1487,10 @@ def setup_cookbook_routes() -> APIRouter:
# shell resolves the bundled python3/hf, mirroring the download flow. # shell resolves the bundled python3/hf, mirroring the download flow.
if not remote: if not remote:
runner_lines.append(_local_tooling_path_export(sys.executable)) runner_lines.append(_local_tooling_path_export(sys.executable))
if local_windows:
# Detached Git Bash runs do not always inherit recently edited
# user PATH entries from the already-running Odysseus process.
runner_lines.append('export PATH="$HOME/bin:$HOME/llama.cpp/build-cuda/bin/Release:$HOME/llama.cpp/build/bin/Release:$HOME/llama.cpp/build/bin/Debug:$HOME/llama.cpp/build/bin:$PATH"')
runner_lines.append("export FLASHINFER_DISABLE_VERSION_CHECK=1") runner_lines.append("export FLASHINFER_DISABLE_VERSION_CHECK=1")
if req.hf_token: if req.hf_token:
runner_lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'") runner_lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'")
@@ -1493,7 +1505,8 @@ def setup_cookbook_routes() -> APIRouter:
runner_lines.append(_HF_TOKEN_STATUS_SNIPPET) runner_lines.append(_HF_TOKEN_STATUS_SNIPPET)
handled_ollama_serve = False handled_ollama_serve = False
# Auto-install inference engine if missing # Auto-install inference engine if missing
if "llama_cpp" in req.cmd or "llama-server" in req.cmd: local_windows_llama_cmd = local_windows and ("llama_cpp" in req.cmd or "llama-server" in req.cmd)
if ("llama_cpp" in req.cmd or "llama-server" in req.cmd) and not local_windows_llama_cmd:
# Prefer the NATIVE llama-server binary — its minja templating # Prefer the NATIVE llama-server binary — its minja templating
# renders modern GGUF chat templates that the Python bindings' # renders modern GGUF chat templates that the Python bindings'
# Jinja2 rejects (do_tojson ensure_ascii). Build it once from # Jinja2 rejects (do_tojson ensure_ascii). Build it once from
@@ -2396,8 +2409,8 @@ def setup_cookbook_routes() -> APIRouter:
try: try:
return _state_for_client(json.loads(_cookbook_state_path.read_text(encoding="utf-8"))) return _state_for_client(json.loads(_cookbook_state_path.read_text(encoding="utf-8")))
except Exception: except Exception:
return {} return _state_for_client({})
return {} return _state_for_client({})
@router.post("/api/cookbook/state") @router.post("/api/cookbook/state")
async def save_cookbook_state(request: Request): async def save_cookbook_state(request: Request):
+30 -14
View File
@@ -76,7 +76,7 @@ function _platformIcon(platform) {
return ''; return '';
} }
export let _envState = { env: 'none', envPath: '', hfToken: '', hfTokenConfigured: false, hfTokenMasked: '', gpus: '', remoteHost: '', servers: [], modelPaths: [], platform: '', defaultServer: '' }; export let _envState = { env: 'none', envPath: '', hfToken: '', hfTokenConfigured: false, hfTokenMasked: '', gpus: '', remoteHost: '', servers: [], modelPaths: [], platform: '', hostPlatform: '', defaultServer: '' };
let _lastCacheHostVal = null; let _lastCacheHostVal = null;
let _cookbookOpeningSpinners = []; let _cookbookOpeningSpinners = [];
export function _lastCacheHost() { return _lastCacheHostVal; } export function _lastCacheHost() { return _lastCacheHostVal; }
@@ -213,8 +213,13 @@ function _getPort(hostOrTask) {
/** Get platform for a given host (or task object). Returns 'windows', 'termux', 'linux', or '' */ /** Get platform for a given host (or task object). Returns 'windows', 'termux', 'linux', or '' */
export function _getPlatform(hostOrTask) { export function _getPlatform(hostOrTask) {
if (!hostOrTask) return _envState.platform || ''; if (hostOrTask === 'local') return _envState.hostPlatform || '';
if (typeof hostOrTask === 'object') return hostOrTask.platform || _getPlatform(hostOrTask.remoteServerKey || hostOrTask.remoteHost); if (!hostOrTask) return _envState.remoteHost ? (_envState.platform || '') : (_envState.hostPlatform || '');
if (typeof hostOrTask === 'object') {
const taskHost = hostOrTask.remoteServerKey || hostOrTask.remoteHost || '';
if (!taskHost || taskHost === 'local') return _envState.hostPlatform || '';
return hostOrTask.platform || _getPlatform(taskHost);
}
const selected = hostOrTask === _envState.remoteHost ? _selectedServer() : null; const selected = hostOrTask === _envState.remoteHost ? _selectedServer() : null;
const srv = selected || _serverByVal(hostOrTask); const srv = selected || _serverByVal(hostOrTask);
return srv?.platform || ''; return srv?.platform || '';
@@ -638,7 +643,12 @@ export function _buildServeCmd(f, modelName, backend) {
// GPU list — read from gpus (button strip); fall back to gpu_id for // GPU list — read from gpus (button strip); fall back to gpu_id for
// backward-compat with older saved presets that pre-date the removal. // backward-compat with older saved presets that pre-date the removal.
const gpuId = (f.gpus || f.gpu_id || '').toString().trim(); const gpuId = (f.gpus || f.gpu_id || '').toString().trim();
const py = _isWindows() ? 'python' : 'python3'; const _targetHost = Object.prototype.hasOwnProperty.call(f, 'host')
? String(f.host || '').trim()
: String(_envState.remoteHost || '').trim();
const _isWin = _targetHost ? _isWindows(_targetHost) : _isWindows('local');
const _localWindows = _isWin && !_targetHost;
const py = _isWin ? 'python' : 'python3';
// CPU-only serve (-ngl 0): drop the GPU-only flags, otherwise the command // CPU-only serve (-ngl 0): drop the GPU-only flags, otherwise the command
// mixes "zero GPU layers" with CUDA unified-memory + flash-attn and fails to // mixes "zero GPU layers" with CUDA unified-memory + flash-attn and fails to
// start (issue #1291). Only affects the ngl=0 path; GPU serving is unchanged. // start (issue #1291). Only affects the ngl=0 path; GPU serving is unchanged.
@@ -660,19 +670,19 @@ export function _buildServeCmd(f, modelName, backend) {
// with misleading prefixes. // with misleading prefixes.
const _sb = String(_hwfitCache?.system?.backend || '').toLowerCase(); const _sb = String(_hwfitCache?.system?.backend || '').toLowerCase();
const _hwfitHost = String(_hwfitCache?._scannedHost || ''); const _hwfitHost = String(_hwfitCache?._scannedHost || '');
const _curHost = String(_envState.remoteHost || ''); const _curHost = _targetHost;
const _isCudaTarget = (_sb === 'cuda') && (_hwfitHost === _curHost); const _isCudaTarget = (_sb === 'cuda') && (_hwfitHost === _curHost);
const lcPrefix = (() => { const lcPrefix = (() => {
let p = ''; let p = '';
if (f.unified_mem && !_cpuOnly && !_isWindows() && _isCudaTarget) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `; if (f.unified_mem && !_cpuOnly && (!_isWin || _localWindows) && _isCudaTarget) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
// No GPU env var in CPU mode `-ngl 0` already disables offload // No GPU env var in CPU mode - `-ngl 0` already disables offload
// so CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES would be misleading // so CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES would be misleading
// clutter ("why is CUDA pinned for a CPU run?"). // clutter ("why is CUDA pinned for a CPU run?").
if (!_isWindows() && !_cpuOnly) p += _gpuEnvPrefix(gpuId); if ((!_isWin || _localWindows) && !_cpuOnly) p += _gpuEnvPrefix(gpuId);
return p; return p;
})(); })();
if (f.unified_mem && !_cpuOnly && _isWindows() && _isCudaTarget) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `; if (f.unified_mem && !_cpuOnly && _isWin && !_localWindows && _isCudaTarget) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
if (_isWindows() && !_cpuOnly) cmd += _gpuEnvPrefix(gpuId, true); if (_isWin && !_localWindows && !_cpuOnly) cmd += _gpuEnvPrefix(gpuId, true);
const needsGgufPrelude = /^\$\(\{\s*find\s/.test(String(ggufPath || '')); const needsGgufPrelude = /^\$\(\{\s*find\s/.test(String(ggufPath || ''));
const modelArg = needsGgufPrelude ? '"$MODEL_FILE"' : `"${ggufPath}"`; const modelArg = needsGgufPrelude ? '"$MODEL_FILE"' : `"${ggufPath}"`;
// Prefer native llama-server. The backend bootstrap resolves/builds the // Prefer native llama-server. The backend bootstrap resolves/builds the
@@ -744,11 +754,16 @@ export function _buildServeCmd(f, modelName, backend) {
// llama-cpp-python takes the projector via --clip_model_path. // llama-cpp-python takes the projector via --clip_model_path.
_lcpExtra += ` --clip_model_path "${f._mmproj_path}"`; _lcpExtra += ` --clip_model_path "${f._mmproj_path}"`;
} }
if (_isWindows()) { const _lcServer = `${lcPrefix}llama-server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} -ngl ${f.ngl || '99'} -c ${f.ctx || '8192'}${_lcExtra}`;
const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}${_lcpExtra}`; const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}${_lcpExtra}`;
if (_localWindows) {
// Local Windows serve is launched through Git Bash, so use the native
// llama-server shape and let PATH resolve the CUDA Release wrapper.
cmd += _lcServer;
} else if (_isWin) {
cmd += _lcpServer; cmd += _lcpServer;
} else { } else {
cmd += `${lcPrefix}llama-server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} -ngl ${f.ngl || '99'} -c ${f.ctx || '8192'}${_lcExtra}`; cmd += _lcServer;
} }
if (needsGgufPrelude) { if (needsGgufPrelude) {
cmd = `MODEL_FILE=${ggufPath} && { [ -n "$MODEL_FILE" ] && [ -f "$MODEL_FILE" ]; } || { echo "ERROR: No GGUF found on this host"; exit 1; } && ${cmd}`; cmd = `MODEL_FILE=${ggufPath} && { [ -n "$MODEL_FILE" ] && [ -f "$MODEL_FILE" ]; } || { echo "ERROR: No GGUF found on this host"; exit 1; } && ${cmd}`;
@@ -2612,13 +2627,14 @@ function _renderRecipes() {
const isLocal = !s.host || s.host.toLowerCase() === 'local'; const isLocal = !s.host || s.host.toLowerCase() === 'local';
if (isLocal) { if (isLocal) {
s.host = ''; s.host = '';
s.platform = _envState.hostPlatform || '';
if (_localSeen) return false; if (_localSeen) return false;
_localSeen = true; _localSeen = true;
} }
return true; return true;
}); });
if (!_localSeen) { if (!_localSeen) {
_es.servers.unshift({ host: '', env: _es.env || 'none', envPath: _es.envPath || '', modelDir: '~/.cache/huggingface/hub' }); _es.servers.unshift({ host: '', env: _es.env || 'none', envPath: _es.envPath || '', modelDir: '~/.cache/huggingface/hub', platform: _envState.hostPlatform || '' });
} }
if (_es.remoteHost && !_es.servers.some(s => s.host === _es.remoteHost)) { if (_es.remoteHost && !_es.servers.some(s => s.host === _es.remoteHost)) {
_es.servers.push({ host: _es.remoteHost, env: _es.env || 'none', envPath: _es.envPath || '', modelDir: '~/.cache/huggingface/hub' }); _es.servers.push({ host: _es.remoteHost, env: _es.env || 'none', envPath: _es.envPath || '', modelDir: '~/.cache/huggingface/hub' });
+2 -2
View File
@@ -781,6 +781,7 @@ function _stripStateSecrets(state) {
const safe = { ...state }; const safe = { ...state };
if (safe.env && typeof safe.env === 'object') { if (safe.env && typeof safe.env === 'object') {
const { hfToken, ...env } = safe.env; const { hfToken, ...env } = safe.env;
delete env.hostPlatform;
safe.env = env; safe.env = env;
} }
if (Array.isArray(safe.tasks)) safe.tasks = safe.tasks.map(_redactTaskForStorage); if (Array.isArray(safe.tasks)) safe.tasks = safe.tasks.map(_redactTaskForStorage);
@@ -1673,7 +1674,7 @@ export async function _launchServeTask(shortName, repo, cmd, fields, hostOverrid
|| _envState.servers.find(s => s.host === _host) || {}; || _envState.servers.find(s => s.host === _host) || {};
const _serverMetaKey = _targetKey || (_hsrv && _serverKey ? _serverKey(_hsrv) : '') || (_host || 'local'); const _serverMetaKey = _targetKey || (_hsrv && _serverKey ? _serverKey(_hsrv) : '') || (_host || 'local');
const _serverMetaName = targetMeta?.serverName || _hsrv.name || (_host ? _host : 'Local'); const _serverMetaName = targetMeta?.serverName || _hsrv.name || (_host ? _host : 'Local');
const _hplatform = _host ? (_hsrv.platform || '') : (_envState.platform || ''); const _hplatform = _host ? (_hsrv.platform || '') : (_envState.hostPlatform || '');
const _replaceTaskId = fields?._replaceTaskId || ''; const _replaceTaskId = fields?._replaceTaskId || '';
if (_replaceTaskId) { if (_replaceTaskId) {
try { try {
@@ -1688,7 +1689,6 @@ export async function _launchServeTask(shortName, repo, cmd, fields, hostOverrid
} }
} catch {} } catch {}
} }
// Replace any serve already targeting this same host:port — you can't run two // Replace any serve already targeting this same host:port — you can't run two
// servers on one port, so re-serving (or retrying) should stop & remove the // servers on one port, so re-serving (or retrying) should stop & remove the
// old one instead of leaving a dead duplicate behind. (The retry buttons // old one instead of leaving a dead duplicate behind. (The retry buttons
+27 -7
View File
@@ -527,7 +527,7 @@ function _selectedServeTarget(panel) {
env: server?.env || '', env: server?.env || '',
port: host ? (server?.port || _getPort(host) || '') : '', port: host ? (server?.port || _getPort(host) || '') : '',
venv, venv,
platform: server?.platform || _envState.platform || '', platform: host ? (server?.platform || '') : (_envState.hostPlatform || ''),
label, label,
}; };
} }
@@ -658,6 +658,12 @@ function _selectedGgufSizeGb(model, relPath) {
return bytes / (1024 ** 3); return bytes / (1024 ** 3);
} }
/**
 * List the projector (mmproj) GGUF files found for a model.
 *
 * A file qualifies when its role is 'projector', or when the final segment of
 * its relative path (falling back to its name) starts with "mmproj" and ends
 * with ".gguf" (case-insensitive). The result is ordered with localeCompare on
 * the relative path, falling back to the name.
 */
function _projectorGgufFiles(model) {
  const sortKey = (file) => String(file.rel_path || file.name || '');
  const looksLikeProjector = (file) => {
    if ((file.role || '') === 'projector') return true;
    return /(^|\/)mmproj[^/]*\.gguf$/i.test(file.rel_path || file.name || '');
  };
  const projectors = _ggufFilesForModel(model).filter(looksLikeProjector);
  return projectors.sort((left, right) => sortKey(left).localeCompare(sortKey(right)));
}
function _ggufFileLabel(file) { function _ggufFileLabel(file) {
const base = (file.name || file.rel_path || '').split('/').pop(); const base = (file.name || file.rel_path || '').split('/').pop();
const size = _formatGgufSize(file.size_bytes); const size = _formatGgufSize(file.size_bytes);
@@ -1198,6 +1204,7 @@ function _rerenderCachedModels() {
panelHtml += `<div class="hwfit-serve-warn" style="margin:0 0 8px;padding:6px 10px;border-radius:5px;font-size:11px;background:color-mix(in srgb, var(--color-warning, #f0ad4e) 14%, transparent);border:1px solid color-mix(in srgb, var(--color-warning, #f0ad4e) 40%, transparent);color:var(--color-warning, #f0ad4e);display:flex;gap:6px;align-items:flex-start;line-height:1.4;"><span aria-hidden="true">⚠</span><span>${_warnText}</span></div>`; panelHtml += `<div class="hwfit-serve-warn" style="margin:0 0 8px;padding:6px 10px;border-radius:5px;font-size:11px;background:color-mix(in srgb, var(--color-warning, #f0ad4e) 14%, transparent);border:1px solid color-mix(in srgb, var(--color-warning, #f0ad4e) 40%, transparent);color:var(--color-warning, #f0ad4e);display:flex;gap:6px;align-items:flex-start;line-height:1.4;"><span aria-hidden="true">⚠</span><span>${_warnText}</span></div>`;
} }
panelHtml += `<div class="hwfit-serve-preset-row">${_slotsHtml}</div>`; panelHtml += `<div class="hwfit-serve-preset-row">${_slotsHtml}</div>`;
panelHtml += `<div class="hwfit-serve-vision-warn" style="display:none;margin:0 0 8px;padding:6px 10px;border-radius:5px;font-size:11px;background:color-mix(in srgb, var(--color-warning, #f0ad4e) 14%, transparent);border:1px solid color-mix(in srgb, var(--color-warning, #f0ad4e) 40%, transparent);color:var(--color-warning, #f0ad4e);gap:6px;align-items:flex-start;line-height:1.4;"><span aria-hidden="true">⚠</span><span>Vision is enabled, but no mmproj GGUF projector was found in the cached model scan. Download an mmproj-*.gguf for this model, then refresh the cached model list before launching.</span></div>`;
// Row 1: Engine + Server + Env // Row 1: Engine + Server + Env
panelHtml += `<div class="hwfit-serve-row">`; panelHtml += `<div class="hwfit-serve-row">`;
const backendOpts = _backendChoices.map(([v,l]) => `<option value="${v}"${defaultBackend===v?' selected':''}>${l}</option>`).join(''); const backendOpts = _backendChoices.map(([v,l]) => `<option value="${v}"${defaultBackend===v?' selected':''}>${l}</option>`).join('');
@@ -1524,6 +1531,11 @@ function _rerenderCachedModels() {
if (el.type === 'checkbox') f[el.dataset.field] = el.checked; if (el.type === 'checkbox') f[el.dataset.field] = el.checked;
else f[el.dataset.field] = el.value; else f[el.dataset.field] = el.value;
}); });
const buildTarget = _selectedServeTarget(panel);
f.host = buildTarget.host || '';
f.platform = buildTarget.platform || '';
const hostField = panel.querySelector('[data-field="host"]');
if (hostField) hostField.value = f.host;
const backend = f.backend || 'vllm'; const backend = f.backend || 'vllm';
const serveModel = (f.model_path || '').trim() || (m.is_local_dir && m.path ? `${m.path}/${repo}` : repo); const serveModel = (f.model_path || '').trim() || (m.is_local_dir && m.path ? `${m.path}/${repo}` : repo);
if (backend === 'llamacpp') { if (backend === 'llamacpp') {
@@ -1543,11 +1555,11 @@ function _rerenderCachedModels() {
: m.is_local_dir && m.path : m.is_local_dir && m.path
? `$({ find ${_ldir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${_ldir} -name '*.gguf' 2>/dev/null | sort; } | head -1)` ? `$({ find ${_ldir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${_ldir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`
: `$({ find ${dir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${dir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`; : `$({ find ${dir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${dir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`;
// Vision: auto-find the mmproj (CLIP/projector) file in the same dir. // Vision: use the scanned projector (CLIP/mmproj) file when present.
// Resolved at runtime so the toggle just works if an mmproj-*.gguf is // Keeping this as a printf path avoids generating a command substitution
// present (downloaded alongside the model). Empty if none → cmd omits it. // that the backend serve-command validator must reject as unsafe.
const _vsearchdir = (m.is_local_dir && m.path) ? _ldir : dir; const selectedProjector = _projectorGgufFiles(m)[0];
f._mmproj_path = `$(find ${_vsearchdir} -iname 'mmproj*.gguf' 2>/dev/null | sort | head -1)`; f._mmproj_path = selectedProjector ? _selectedGgufExpr(m, repo, selectedProjector.rel_path) : '';
} }
if (f.reasoning_parser) { if (f.reasoning_parser) {
const _rpEl2 = panel.querySelector('[data-field="reasoning_parser"]'); const _rpEl2 = panel.querySelector('[data-field="reasoning_parser"]');
@@ -1563,6 +1575,10 @@ function _rerenderCachedModels() {
} }
let cmd = _buildServeCmd(f, serveModel, backend); let cmd = _buildServeCmd(f, serveModel, backend);
if (f.extra && f.extra.trim()) cmd += ' ' + f.extra.trim(); if (f.extra && f.extra.trim()) cmd += ' ' + f.extra.trim();
const missingVisionProjector = backend === 'llamacpp' && !!f.vision && !f._mmproj_path;
panel._visionMissingProjector = missingVisionProjector;
const _visionWarn = panel.querySelector('.hwfit-serve-vision-warn');
if (_visionWarn) _visionWarn.style.display = missingVisionProjector ? 'flex' : 'none';
const _ce2 = panel.querySelector('.hwfit-serve-cmd'); _ce2.value = _formatServeCmdPreview(cmd); _ce2.style.height = 'auto'; _ce2.style.height = _ce2.scrollHeight + 'px'; const _ce2 = panel.querySelector('.hwfit-serve-cmd'); _ce2.value = _formatServeCmdPreview(cmd); _ce2.style.height = 'auto'; _ce2.style.height = _ce2.scrollHeight + 'px';
panel._cmd = cmd; panel._cmd = cmd;
panel._host = f.host || ''; panel._host = f.host || '';
@@ -2938,12 +2954,16 @@ function _rerenderCachedModels() {
}); });
serveState.backend = serveState.backend || (_detectBackend(m).backend) || 'vllm'; serveState.backend = serveState.backend || (_detectBackend(m).backend) || 'vllm';
const launchTarget = _selectedServeTarget(panel); const launchTarget = _selectedServeTarget(panel);
if (serveState.backend === 'llamacpp' && serveState.vision && !/(?:^|\s)(?:--mmproj|--clip_model_path)\b/.test(launchCmd)) {
_restoreLaunchBtn();
uiModule.showToast('Vision is checked, but no mmproj projector is in the launch command. Refresh cached models after downloading mmproj, or add --mmproj manually.', 8000);
return;
}
if (serveState.backend === 'diffusers' && _remoteWindowsDiffusersUnsupported(launchTarget)) { if (serveState.backend === 'diffusers' && _remoteWindowsDiffusersUnsupported(launchTarget)) {
_restoreLaunchBtn(); _restoreLaunchBtn();
uiModule.showToast('Diffusers serving is not supported on remote Windows servers yet. Use local Windows or a Linux server.', 9000); uiModule.showToast('Diffusers serving is not supported on remote Windows servers yet. Use local Windows or a Linux server.', 9000);
return; return;
} }
// Pre-launch: check our own task list for a serve already running // Pre-launch: check our own task list for a serve already running
// on this host. Offer to stop+launch as the default action — the // on this host. Offer to stop+launch as the default action — the
// SSH-based port probe below is more thorough but it can miss // SSH-based port probe below is more thorough but it can miss
+101 -3
View File
@@ -1,4 +1,4 @@
"""Regression guard for issue #1291 CPU-only serve still emitted GPU-only flags. """Regression guard for issue #1291 - CPU-only serve still emitted GPU-only flags.
The llama.cpp serve command builder (static/js/cookbook.js) added The llama.cpp serve command builder (static/js/cookbook.js) added
`--flash-attn on` and exported `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` from `--flash-attn on` and exported `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` from
@@ -16,8 +16,8 @@ from pathlib import Path
SRC = Path(__file__).resolve().parent.parent / "static/js/cookbook.js" SRC = Path(__file__).resolve().parent.parent / "static/js/cookbook.js"
SERVE_SRC = Path(__file__).resolve().parent.parent / "static/js/cookbookServe.js" SERVE_SRC = Path(__file__).resolve().parent.parent / "static/js/cookbookServe.js"
ROUTES_SRC = Path(__file__).resolve().parent.parent / "routes/cookbook_routes.py" ROOT = SRC.parent.parent.parent
ROUTES_SRC = ROOT / "routes/cookbook_routes.py"
def test_cpu_only_drops_gpu_only_flags(): def test_cpu_only_drops_gpu_only_flags():
text = SRC.read_text(encoding="utf-8") text = SRC.read_text(encoding="utf-8")
@@ -84,3 +84,101 @@ def test_vllm_route_strips_swap_space_when_runtime_rejects_it():
assert "print(shlex.join(parts[:serve_i + 1] + [\"--help\"]))" in text assert "print(shlex.join(parts[:serve_i + 1] + [\"--help\"]))" in text
assert "eval \"$ODYSSEUS_VLLM_HELP_CMD\" 2>&1 | grep -q -- \"--swap-space\"" in text assert "eval \"$ODYSSEUS_VLLM_HELP_CMD\" 2>&1 | grep -q -- \"--swap-space\"" in text
assert "eval \"$ODYSSEUS_SERVE_CMD\"" in text assert "eval \"$ODYSSEUS_SERVE_CMD\"" in text
def test_local_windows_platform_comes_from_backend_host_state():
    """Source-text guard: the local platform is driven by ``hostPlatform``.

    Reads cookbook.js, the cookbook routes module and cookbookRunning.js and
    checks that the snippets which publish, consume and strip ``hostPlatform``
    are present, and that the older fallback snippets are absent.
    """
    cookbook_js = SRC.read_text(encoding="utf-8")
    routes_py = ROUTES_SRC.read_text(encoding="utf-8")
    running_js = (SRC.parent / "cookbookRunning.js").read_text(encoding="utf-8")

    required_in_cookbook = (
        "hostPlatform",
        "hostOrTask === 'local'",
        "if (hostOrTask === 'local') return _envState.hostPlatform || '';",
        "s.platform = _envState.hostPlatform || '';",
        "platform: _envState.hostPlatform || ''",
    )
    forbidden_in_cookbook = (
        "navigator.platform",
        "return _envState.hostPlatform || _envState.platform || ''",
        "s.platform = _envState.hostPlatform || _envState.platform || '';",
        "platform: _envState.hostPlatform || _envState.platform || ''",
    )
    required_in_routes = (
        'return "windows" if IS_WINDOWS else ""',
        'env["hostPlatform"] = _client_host_platform()',
        "return _state_for_client({})",
        'env.pop("hostPlatform", None)',
    )

    for snippet in required_in_cookbook:
        assert snippet in cookbook_js
    for snippet in forbidden_in_cookbook:
        assert snippet not in cookbook_js
    for snippet in required_in_routes:
        assert snippet in routes_py
    assert "delete env.hostPlatform;" in running_js
def test_local_serve_payload_ignores_stale_env_platform():
    """Source-text guard: local serve payloads read ``hostPlatform``, not ``platform``.

    For both cookbookServe.js and cookbookRunning.js the new expression must be
    present and the previous ``_envState.platform`` expression must be gone.
    """
    serve_js = SERVE_SRC.read_text(encoding="utf-8")
    running_js = (SRC.parent / "cookbookRunning.js").read_text(encoding="utf-8")

    # (file text, snippet that must exist, snippet that must not exist)
    expectations = (
        (
            serve_js,
            "platform: host ? (server?.platform || '') : (_envState.hostPlatform || ''),",
            "platform: server?.platform || _envState.platform || ''",
        ),
        (
            running_js,
            "const _hplatform = _host ? (_hsrv.platform || '') : (_envState.hostPlatform || '');",
            "const _hplatform = _host ? (_hsrv.platform || '') : (_envState.platform || '');",
        ),
    )
    for source_text, wanted, stale in expectations:
        assert wanted in source_text
        assert stale not in source_text
def test_local_windows_llamacpp_prefers_native_llama_server():
    """Source-text guard for the native llama-server branch on local Windows.

    Checks cookbook.js for the target-host / local-Windows snippets of the serve
    command builder, and cookbook_helpers.py for the ``llama-server.exe`` entry.
    """
    cookbook_js = SRC.read_text(encoding="utf-8")
    helpers_py = (ROOT / "routes/cookbook_helpers.py").read_text(encoding="utf-8")

    required_in_cookbook = (
        "Object.prototype.hasOwnProperty.call(f, 'host')",
        "const _isWin = _targetHost ? _isWindows(_targetHost) : _isWindows('local');",
        "const _localWindows = _isWin && !_targetHost;",
        "const _curHost = _targetHost;",
        "const gpuId = (f.gpus || f.gpu_id || '').toString().trim();",
        "const _lcServer = `${lcPrefix}llama-server --model",
        "if (_localWindows) {",
        "cmd += _lcServer;",
    )
    for snippet in required_in_cookbook:
        assert snippet in cookbook_js

    # The earlier definition keyed off the globally selected remote host.
    assert "const _localWindows = _isWin && !_envState.remoteHost;" not in cookbook_js
    assert '"llama-server.exe"' in helpers_py
def test_serve_command_preview_uses_selected_target_host():
    """Source-text guard: the preview builder copies host/platform from the selected target."""
    serve_js = SERVE_SRC.read_text(encoding="utf-8")
    expected_snippets = (
        "const buildTarget = _selectedServeTarget(panel);",
        "f.host = buildTarget.host || '';",
        "f.platform = buildTarget.platform || '';",
        "const hostField = panel.querySelector('[data-field=\"host\"]');",
        "if (hostField) hostField.value = f.host;",
    )
    for snippet in expected_snippets:
        assert snippet in serve_js
def test_local_windows_llama_server_skips_source_bootstrap():
    """Source-text guard: the routes module gates the llama bootstrap on ``local_windows_llama_cmd``."""
    routes_py = ROUTES_SRC.read_text(encoding="utf-8")
    flag_assignment = (
        'local_windows_llama_cmd = local_windows and ("llama_cpp" in req.cmd or "llama-server" in req.cmd)'
    )
    guarded_branch = (
        'if ("llama_cpp" in req.cmd or "llama-server" in req.cmd) and not local_windows_llama_cmd:'
    )
    assert flag_assignment in routes_py
    assert guarded_branch in routes_py
def test_local_windows_llama_server_path_includes_user_wrapper_and_cuda_builds():
    """Source-text guard: the routes module exports the expected PATH under ``if local_windows:``."""
    routes_py = (ROOT / "routes/cookbook_routes.py").read_text(encoding="utf-8")

    # Entries in the order they must appear in the exported PATH.
    path_entries = (
        "$HOME/bin",
        "$HOME/llama.cpp/build-cuda/bin/Release",
        "$HOME/llama.cpp/build/bin/Release",
        "$HOME/llama.cpp/build/bin/Debug",
        "$HOME/llama.cpp/build/bin",
        "$PATH",
    )
    expected_export = 'export PATH="' + ":".join(path_entries) + '"'

    assert 'if local_windows:' in routes_py
    assert expected_export in routes_py
def test_serve_panel_keeps_row_markup_and_launch_cmd_assignment_executable():
    """Source-text guard: two statements in cookbookServe.js stay on their own lines.

    The file is read with CRLF normalised to LF. The row markup must follow its
    ``// Row 1`` comment on a new line, and ``panel._cmd = cmd;`` must start on a
    new line, rather than either being fused onto the preceding line.
    """
    raw_js = SERVE_SRC.read_text(encoding="utf-8")
    serve_js = raw_js.replace("\r\n", "\n")

    fused_forms = (
        '// Row 1: Engine + Server + Env panelHtml +=',
        "px'; panel._cmd = cmd;",
    )
    split_forms = (
        '// Row 1: Engine + Server + Env\n panelHtml += `<div class="hwfit-serve-row">`;',
        "px';\n panel._cmd = cmd;",
    )
    for snippet in fused_forms:
        assert snippet not in serve_js
    for snippet in split_forms:
        assert snippet in serve_js
def test_llamacpp_vision_uses_scanned_projector_instead_of_runtime_find():
    """Source-text guard: cookbookServe.js picks the projector from the scan.

    Checks for the projector helper, the missing-projector warning and the
    launch-time check, and that the runtime ``find`` for mmproj files is gone.
    """
    serve_js = SERVE_SRC.read_text(encoding="utf-8")
    expected_snippets = (
        "function _projectorGgufFiles(model)",
        "const selectedProjector = _projectorGgufFiles(m)[0];",
        "f._mmproj_path = selectedProjector ? _selectedGgufExpr(m, repo, selectedProjector.rel_path) : '';",
        "const missingVisionProjector = backend === 'llamacpp' && !!f.vision && !f._mmproj_path;",
        "hwfit-serve-vision-warn",
        "!/(?:^|\\s)(?:--mmproj|--clip_model_path)\\b/.test(launchCmd)",
        "no mmproj projector is in the launch command",
    )
    for snippet in expected_snippets:
        assert snippet in serve_js
    assert "find ${_vsearchdir} -iname 'mmproj*.gguf'" not in serve_js
+13 -6
View File
@@ -419,8 +419,6 @@ def test_pip_install_attempt_failure_propagates_real_exit_code():
"""Run the generated snippet against a deliberately broken pip install """Run the generated snippet against a deliberately broken pip install
to confirm the subshell exits with pip's non-zero status.""" to confirm the subshell exits with pip's non-zero status."""
snippet = _pip_install_attempt("python3 -m pip install __nonexistent_package_12345__") snippet = _pip_install_attempt("python3 -m pip install __nonexistent_package_12345__")
if sys.platform == "win32":
snippet = snippet.replace("$", "\\$")
result = subprocess.run( result = subprocess.run(
["bash", "-c", snippet], ["bash", "-c", snippet],
capture_output=True, capture_output=True,
@@ -433,8 +431,6 @@ def test_pip_install_attempt_failure_propagates_real_exit_code():
def test_pip_install_attempt_success_exits_zero(): def test_pip_install_attempt_success_exits_zero():
"""When pip succeeds, the subshell should exit 0.""" """When pip succeeds, the subshell should exit 0."""
snippet = _pip_install_attempt("python3 -c 'pass'") snippet = _pip_install_attempt("python3 -c 'pass'")
if sys.platform == "win32":
snippet = snippet.replace("$", "\\$")
result = subprocess.run( result = subprocess.run(
["bash", "-c", snippet], ["bash", "-c", snippet],
capture_output=True, capture_output=True,
@@ -447,8 +443,6 @@ def test_pip_install_attempt_success_exits_zero():
def test_pip_install_attempt_surfaces_stderr_on_failure(): def test_pip_install_attempt_surfaces_stderr_on_failure():
"""On failure, the last 5 lines of pip output should appear in stdout.""" """On failure, the last 5 lines of pip output should appear in stdout."""
snippet = _pip_install_attempt("python3 -m pip install __nonexistent_package_12345__") snippet = _pip_install_attempt("python3 -m pip install __nonexistent_package_12345__")
if sys.platform == "win32":
snippet = snippet.replace("$", "\\$")
result = subprocess.run( result = subprocess.run(
["bash", "-c", snippet], ["bash", "-c", snippet],
capture_output=True, capture_output=True,
@@ -557,6 +551,19 @@ def test_validate_serve_cmd_accepts_windows_printf_format():
assert _validate_serve_cmd(cmd) == cmd assert _validate_serve_cmd(cmd) == cmd
def test_validate_serve_cmd_accepts_llama_mmproj_printf_format():
    """``_validate_serve_cmd`` returns a llama-server command unchanged when both
    ``--model`` and ``--mmproj`` are quoted ``$(printf %s ...)`` substitutions."""
    snapshot_dir = "/.cache/huggingface/hub/models--unsloth--Qwen3.6-35B-A3B-GGUF/snapshots/abc"

    def printf_path(filename):
        # Produces: "$(printf %s ${HOME}'<snapshot_dir>/<filename>')"
        return "\"$(printf %s ${HOME}'" + snapshot_dir + "/" + filename + "')\""

    cmd = " ".join(
        [
            "CUDA_VISIBLE_DEVICES=0 llama-server",
            "--model",
            printf_path("Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
            "--host 0.0.0.0 --port 8000 -ngl 99 -c 20000",
            "--cache-type-k q4_0 --cache-type-v q4_0",
            "--mmproj",
            printf_path("mmproj-BF16.gguf"),
            "--image-max-tokens 1024",
        ]
    )
    assert _validate_serve_cmd(cmd) == cmd
def test_normalize_llama_cpp_python_cache_types_for_stale_client_cmd(): def test_normalize_llama_cpp_python_cache_types_for_stale_client_cmd():
cmd = ( cmd = (
"python -m llama_cpp.server --model model.gguf --host 0.0.0.0 --port 8000 " "python -m llama_cpp.server --model model.gguf --host 0.0.0.0 --port 8000 "