Drop GPU-only flags from the CPU-only (-ngl 0) serve command (#1433)

A CPU-only llama.cpp serve config still emitted --flash-attn on and exported GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 (these are independent toggles, often left enabled by an Auto profile), so the generated command mixed "zero GPU layers" with CUDA unified-memory and flash-attn settings and failed to start (issue #1291). Gate both on a _cpuOnly check (ngl == 0). GPU serving is unchanged — the gate only affects the ngl=0 path.

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-16 17:55:26 -04:00 · 2026-06-03 03:26:15 +08:00
parent 5c6bd0fc2b
commit 0e6cbd8315
2 changed files with 37 additions and 3 deletions
@@ -401,13 +401,17 @@ export function _buildServeCmd(f, modelName, backend) {
    const ggufPath = f._gguf_path || 'model.gguf';
    const gpuId = f.gpu_id?.trim() || '';
    const py = _isWindows() ? 'python' : 'python3';
+    // CPU-only serve (-ngl 0): drop the GPU-only flags, otherwise the command
+    // mixes "zero GPU layers" with CUDA unified-memory + flash-attn and fails to
+    // start (issue #1291). Only affects the ngl=0 path; GPU serving is unchanged.
+    const _cpuOnly = String(f.ngl).trim() === '0';
    const lcPrefix = (() => {
      let p = '';
-      if (f.unified_mem && !_isWindows()) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
+      if (f.unified_mem && !_cpuOnly && !_isWindows()) p += `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 `;
      if (gpuId && !_isWindows()) p += `CUDA_VISIBLE_DEVICES=${gpuId} `;
      return p;
    })();
-    if (f.unified_mem && _isWindows()) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
+    if (f.unified_mem && !_cpuOnly && _isWindows()) cmd += `$env:GGML_CUDA_ENABLE_UNIFIED_MEMORY="1"; `;
    if (gpuId && _isWindows()) cmd += `$env:CUDA_VISIBLE_DEVICES="${gpuId}"; `;
    if (!_isWindows()) {
      // Resolve GGUF path once, fail loudly if nothing matched (prevents
@@ -439,7 +443,7 @@ export function _buildServeCmd(f, modelName, backend) {
      _lcExtra += ` --n-cpu-moe ${_ncm}`;
      _lcpExtra += ` --n_cpu_moe ${_ncm}`;   // llama-cpp-python uses underscores
    }
-    if (f.flash_attn) {
+    if (f.flash_attn && !_cpuOnly) {
      _lcExtra += ' --flash-attn on';
      _lcpExtra += ' --flash_attn true';
    }