fix(cookbook): treat local Windows as Windows for serve commands (#3975)

* fix(cookbook): prefer native llama-server on local Windows

* fix(cookbook): harden local llama-server launch commands

* fix(cookbook): build serve commands for selected target
This commit is contained in:
Dividesbyzer0
2026-06-26 08:13:01 -04:00
committed by GitHub
parent de12d4734a
commit f5200ec45b
7 changed files with 190 additions and 36 deletions
+30 -14
View File
@@ -76,7 +76,7 @@ function _platformIcon(platform) {
return '';
}
export let _envState = { env: 'none', envPath: '', hfToken: '', hfTokenConfigured: false, hfTokenMasked: '', gpus: '', remoteHost: '', servers: [], modelPaths: [], platform: '', defaultServer: '' };
export let _envState = { env: 'none', envPath: '', hfToken: '', hfTokenConfigured: false, hfTokenMasked: '', gpus: '', remoteHost: '', servers: [], modelPaths: [], platform: '', hostPlatform: '', defaultServer: '' };
let _lastCacheHostVal = null;
let _cookbookOpeningSpinners = [];
export function _lastCacheHost() { return _lastCacheHostVal; }
@@ -213,8 +213,13 @@ function _getPort(hostOrTask) {
/** Get platform for a given host (or task object). Returns 'windows', 'termux', 'linux', or '' */
export function _getPlatform(hostOrTask) {
if (!hostOrTask) return _envState.platform || '';
if (typeof hostOrTask === 'object') return hostOrTask.platform || _getPlatform(hostOrTask.remoteServerKey || hostOrTask.remoteHost);
if (hostOrTask === 'local') return _envState.hostPlatform || '';
if (!hostOrTask) return _envState.remoteHost ? (_envState.platform || '') : (_envState.hostPlatform || '');
if (typeof hostOrTask === 'object') {
const taskHost = hostOrTask.remoteServerKey || hostOrTask.remoteHost || '';
if (!taskHost || taskHost === 'local') return _envState.hostPlatform || '';
return hostOrTask.platform || _getPlatform(taskHost);
}
const selected = hostOrTask === _envState.remoteHost ? _selectedServer() : null;
const srv = selected || _serverByVal(hostOrTask);
return srv?.platform || '';
@@ -638,7 +643,12 @@ export function _buildServeCmd(f, modelName, backend) {
// GPU list — read from gpus (button strip); fall back to gpu_id for
// backward-compat with older saved presets that pre-date the removal.
const gpuId = (f.gpus || f.gpu_id || '').toString().trim();
const py = _isWindows() ? 'python' : 'python3';
const _targetHost = Object.prototype.hasOwnProperty.call(f, 'host')
? String(f.host || '').trim()
: String(_envState.remoteHost || '').trim();
const _isWin = _targetHost ? _isWindows(_targetHost) : _isWindows('local');
const _localWindows = _isWin && !_targetHost;
const py = _isWin ? 'python' : 'python3';
// CPU-only serve (-ngl 0): drop the GPU-only flags, otherwise the command
// mixes "zero GPU layers" with CUDA unified-memory + flash-attn and fails to
// start (issue #1291). Only affects the ngl=0 path; GPU serving is unchanged.
@@ -660,19 +670,19 @@ export function _buildServeCmd(f, modelName, backend) {
// with misleading prefixes.
const _sb = String(_hwfitCache?.system?.backend || '').toLowerCase();
const _hwfitHost = String(_hwfitCache?._scannedHost || '');
const _curHost = String(_envState.remoteHost || '');
const _curHost = _targetHost;
const _isCudaTarget = (_sb === 'cuda') && (_hwfitHost === _curHost);
const lcPrefix = (() => {
let p = '';
if (f.unified_mem && !_cpuOnly && !_isWindows() && _isCudaTarget) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
// No GPU env var in CPU mode `-ngl 0` already disables offload
if (f.unified_mem && !_cpuOnly && (!_isWin || _localWindows) && _isCudaTarget) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
// No GPU env var in CPU mode - `-ngl 0` already disables offload
// so CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES would be misleading
// clutter ("why is CUDA pinned for a CPU run?").
if (!_isWindows() && !_cpuOnly) p += _gpuEnvPrefix(gpuId);
if ((!_isWin || _localWindows) && !_cpuOnly) p += _gpuEnvPrefix(gpuId);
return p;
})();
if (f.unified_mem && !_cpuOnly && _isWindows() && _isCudaTarget) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
if (_isWindows() && !_cpuOnly) cmd += _gpuEnvPrefix(gpuId, true);
if (f.unified_mem && !_cpuOnly && _isWin && !_localWindows && _isCudaTarget) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
if (_isWin && !_localWindows && !_cpuOnly) cmd += _gpuEnvPrefix(gpuId, true);
const needsGgufPrelude = /^\$\(\{\s*find\s/.test(String(ggufPath || ''));
const modelArg = needsGgufPrelude ? '"$MODEL_FILE"' : `"${ggufPath}"`;
// Prefer native llama-server. The backend bootstrap resolves/builds the
@@ -744,11 +754,16 @@ export function _buildServeCmd(f, modelName, backend) {
// llama-cpp-python takes the projector via --clip_model_path.
_lcpExtra += ` --clip_model_path "${f._mmproj_path}"`;
}
if (_isWindows()) {
const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}${_lcpExtra}`;
const _lcServer = `${lcPrefix}llama-server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} -ngl ${f.ngl || '99'} -c ${f.ctx || '8192'}${_lcExtra}`;
const _lcpServer = `${lcPrefix}${py} -m llama_cpp.server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} --n_gpu_layers ${f.ngl || '99'} --n_ctx ${f.ctx || '8192'}${_lcpExtra}`;
if (_localWindows) {
// Local Windows serve is launched through Git Bash, so use the native
// llama-server shape and let PATH resolve the CUDA Release wrapper.
cmd += _lcServer;
} else if (_isWin) {
cmd += _lcpServer;
} else {
cmd += `${lcPrefix}llama-server --model ${modelArg} --host 0.0.0.0 --port ${f.port || '8080'} -ngl ${f.ngl || '99'} -c ${f.ctx || '8192'}${_lcExtra}`;
cmd += _lcServer;
}
if (needsGgufPrelude) {
cmd = `MODEL_FILE=${ggufPath} && { [ -n "$MODEL_FILE" ] && [ -f "$MODEL_FILE" ]; } || { echo "ERROR: No GGUF found on this host"; exit 1; } && ${cmd}`;
@@ -2612,13 +2627,14 @@ function _renderRecipes() {
const isLocal = !s.host || s.host.toLowerCase() === 'local';
if (isLocal) {
s.host = '';
s.platform = _envState.hostPlatform || '';
if (_localSeen) return false;
_localSeen = true;
}
return true;
});
if (!_localSeen) {
_es.servers.unshift({ host: '', env: _es.env || 'none', envPath: _es.envPath || '', modelDir: '~/.cache/huggingface/hub' });
_es.servers.unshift({ host: '', env: _es.env || 'none', envPath: _es.envPath || '', modelDir: '~/.cache/huggingface/hub', platform: _envState.hostPlatform || '' });
}
if (_es.remoteHost && !_es.servers.some(s => s.host === _es.remoteHost)) {
_es.servers.push({ host: _es.remoteHost, env: _es.env || 'none', envPath: _es.envPath || '', modelDir: '~/.cache/huggingface/hub' });