fix(cookbook): normalize llama-cpp-python cache types

Map llama-cpp-python --type_k/--type_v cache names to integer enum values after serve-command validation while preserving native llama-server flags.
2026-06-17 10:15:27 -04:00 · 2026-06-15 02:12:18 -04:00
parent 268bc1d1a6
commit ec4f91afdd
3 changed files with 79 additions and 0 deletions
@@ -573,6 +573,36 @@ _GGUF_PRELUDE_RE = re.compile(
 _OLLAMA_HOST_ASSIGNMENT_RE = re.compile(r"(?:^|\s)OLLAMA_HOST=([^\s]+)")
 _OLLAMA_BIND_RE = re.compile(r"^\[([^\]]+)\]:(\d+)$|^([^:]+):(\d+)$")
 _OLLAMA_BIND_HOST_RE = re.compile(r"^[A-Za-z0-9._:-]+$")
+# llama.cpp type names (lowercase) mapped to the integer value of the matching
+# ggml type enum member. Values are kept as strings because they are spliced
+# straight back into the serve command line in place of the name.
+_LLAMA_CPP_PYTHON_GGML_TYPES = {
+    "f32": "0",
+    "f16": "1",
+    "q4_0": "2",
+    "q4_1": "3",
+    "q5_0": "6",
+    "q5_1": "7",
+    "q8_0": "8",
+    "q8_1": "9",
+    "q2_k": "10",
+    "q3_k": "11",
+    "q4_k": "12",
+    "q5_k": "13",
+    "q6_k": "14",
+    "q8_k": "15",
+    "iq2_xxs": "16",
+    "iq2_xs": "17",
+    "iq3_xxs": "18",
+    "iq1_s": "19",
+    "iq4_nl": "20",
+    "iq3_s": "21",
+    "iq2_s": "22",
+    "iq4_xs": "23",
+    # bf16 is one of the KV cache types llama.cpp accepts; without this entry
+    # a `--type_k bf16` / `--type_v bf16` value would be left as a name.
+    "bf16": "30",
+    "mxfp4": "39",
+    "nvfp4": "40",
+    "q1_0": "41",
+}
+# Matches `--type_k` / `--type_v` followed by whitespace or `=` and an
+# optionally quoted alphanumeric/underscore value. The `(?P=quote)`
+# backreference requires the closing quote (if any) to equal the opening one.
+# Named groups let the caller rebuild the flag with only the value replaced.
+_LLAMA_CPP_PYTHON_TYPE_FLAG_RE = re.compile(
+    r"(?P<flag>--type_[kv])(?P<sep>\s+|=)(?P<quote>['\"]?)(?P<value>[A-Za-z0-9_]+)(?P=quote)"
+)


 def _ollama_bind_from_cmd(cmd: str | None, *, default_host: str = "127.0.0.1") -> tuple[str, str]:
@@ -604,6 +634,22 @@ def _ollama_bind_from_cmd(cmd: str | None, *, default_host: str = "127.0.0.1") -
    return f"[{host}]" if bracketed_host else host, port


+def _normalize_llama_cpp_python_cache_types(cmd: str | None) -> str | None:
+    """Rewrite `--type_k`/`--type_v` type names as llama-cpp-python enum integers.
+
+    A command that is empty/None or never mentions ``llama_cpp.server`` is
+    returned untouched. Within a matching command, each flag value is looked
+    up case-insensitively in the type table; values with no entry (for
+    example ones that are already numeric) are left exactly as written. The
+    original separator and quoting around the value are preserved.
+    """
+    if not (cmd and "llama_cpp.server" in cmd):
+        return cmd
+
+    def _swap(m: re.Match[str]) -> str:
+        enum_value = _LLAMA_CPP_PYTHON_GGML_TYPES.get(m.group("value").lower())
+        if not enum_value:
+            # Unknown name: keep the matched text byte-for-byte.
+            return m.group(0)
+        flag, sep, quote = m.group("flag", "sep", "quote")
+        return f"{flag}{sep}{quote}{enum_value}{quote}"
+
+    return _LLAMA_CPP_PYTHON_TYPE_FLAG_RE.sub(_swap, cmd)
+
+
 def _check_serve_binary(seg: str) -> None:
    """Validate that a single command segment starts with an allowlisted binary
    (after skipping leading env-var assignments like `CUDA_VISIBLE_DEVICES=0`)."""
@@ -46,6 +46,7 @@ from routes.cookbook_helpers import (
    _diagnose_serve_output, run_ssh_command_async,
    _ollama_bind_from_cmd, _pip_install_fallback_chain, _pip_install_no_cache,
    _user_shell_path_bootstrap, _venv_safe_local_pip_install_cmd,
+    _normalize_llama_cpp_python_cache_types,
    ModelDownloadRequest, ServeRequest,
 )

@@ -1211,6 +1212,7 @@ def setup_cookbook_routes() -> APIRouter:
        # many downstream `"engine" in req.cmd` membership checks can't hit
        # `TypeError: argument of type 'NoneType'` (a 500 instead of a clean 400).
        req.cmd = _validate_serve_cmd(req.cmd) or ""
+        req.cmd = _normalize_llama_cpp_python_cache_types(req.cmd) or ""
        req.cmd = _venv_safe_local_pip_install_cmd(
            req.cmd,
            local=not bool(req.remote_host),
@@ -2,6 +2,7 @@ import json
 import os
 import subprocess
 import sys
+from pathlib import Path

 import pytest
 from fastapi import HTTPException
@@ -21,6 +22,7 @@ from routes.cookbook_helpers import (
    _safe_env_prefix,
    _user_shell_path_bootstrap,
    _venv_safe_local_pip_install_cmd,
+    _normalize_llama_cpp_python_cache_types,
    _validate_gpus,
    _validate_local_dir,
    _validate_repo_id,
@@ -549,6 +551,35 @@ def test_validate_serve_cmd_accepts_windows_printf_format():
    assert _validate_serve_cmd(cmd) == cmd


+def test_normalize_llama_cpp_python_cache_types_for_stale_client_cmd():
+    """Space-separated cache type names are replaced by their enum integers."""
+    stale_cmd = (
+        "python -m llama_cpp.server --model model.gguf --host 0.0.0.0 --port 8000 "
+        "--type_k q4_0 --type_v q4_0"
+    )
+
+    result = _normalize_llama_cpp_python_cache_types(stale_cmd)
+
+    assert result.endswith("--type_k 2 --type_v 2")
+
+
+def test_normalize_llama_cpp_python_cache_types_preserves_native_cache_flags():
+    """Native `--cache-type-*` flags stay as names; only `--type_*` values change."""
+    fallback_cmd = (
+        "llama-server --model model.gguf --cache-type-k q4_0 --cache-type-v q4_0 "
+        "|| python3 -m llama_cpp.server --model model.gguf --type_k=q8_0 --type_v='f16'"
+    )
+    expected_fragments = (
+        "--cache-type-k q4_0 --cache-type-v q4_0",
+        "--type_k=8",
+        "--type_v='1'",
+    )
+
+    normalized = _normalize_llama_cpp_python_cache_types(fallback_cmd)
+
+    for fragment in expected_fragments:
+        assert fragment in normalized
+
+
+def test_model_serve_normalizes_llama_cpp_python_cache_types_after_validation():
+    """Check, from the route module's source text, that normalization follows validation.
+
+    This inspects ``routes/cookbook_routes.py`` as text rather than calling the
+    route, so it is sensitive to the exact spelling of both assignments.
+    """
+    src = (Path(__file__).resolve().parents[1] / "routes" / "cookbook_routes.py").read_text(encoding="utf-8")
+
+    assert "req.cmd = _validate_serve_cmd(req.cmd) or \"\"" in src
+    assert "req.cmd = _normalize_llama_cpp_python_cache_types(req.cmd) or \"\"" in src
+    # Compare first-occurrence offsets: the validation call must appear earlier
+    # in the file than the normalization call.
+    assert src.index("_validate_serve_cmd(req.cmd)") < src.index("_normalize_llama_cpp_python_cache_types(req.cmd)")
+
+
 def test_ollama_serve_defaults_to_loopback_bind():
    assert _ollama_bind_from_cmd("ollama serve") == ("127.0.0.1", "11434")
    assert _ollama_bind_from_cmd("ollama run qwen2.5:0.5b") == ("127.0.0.1", "11434")