fix(llm): stop sending llama.cpp slot-affinity fields to cloud providers (#3945)

* fix(llm): stop sending llama.cpp slot-affinity fields to cloud providers _apply_local_cache_affinity adds session_id + cache_prompt for llama.cpp KV-cache slot affinity (#2927), gated on _is_self_hosted_openai_compatible, which treated any unknown OpenAI-compatible host as self-hosted. Strict cloud providers added as custom endpoints (Mistral at api.mistral.ai) reject unknown body fields, so every request failed with 422 extra_forbidden. Self-hosted now also requires the endpoint to resolve as local via model_context.is_local_endpoint: loopback/private/tailscale host, or endpoint kind explicitly configured as "local" (the escape hatch for tunneled self-hosted servers). is_local_endpoint is promoted to a public name since llm_core now shares it. Fixes #3793 * test(llm): sweep cloud OpenAI-compatible hosts in affinity gating Parametrized cases adapted from #3839 (credit: Shabablinchikow): deepseek, x.ai, together, fireworks, and the Gemini OpenAI-compat endpoint must all stay free of the llama.cpp extras, not just the Mistral host from #3793. * fix(llm): narrow the Tailscale range to 100.64.0.0/10 in is_local_endpoint Review finding on #3945: _PRIVATE_PREFIXES carried a bare "100." prefix, treating all of 100.0.0.0/8 as local while Tailscale only uses the CGNAT block 100.64.0.0/10. Public 100.x hosts (e.g. AWS ranges outside the block) were classified local and still received the llama.cpp extras this PR exists to keep away from strict providers. Match the narrowed classification routes/model_routes.py already uses, with boundary tests just below, inside, and just above the range.
2026-06-17 02:05:22 -04:00 · 2026-06-11 17:51:03 +02:00
parent f941db29d3
commit 263d41c58a
5 changed files with 142 additions and 24 deletions
@@ -457,15 +457,25 @@ def _detect_provider(url: str) -> str:
 def _is_self_hosted_openai_compatible(url: str) -> bool:
    """True for custom/local OpenAI-compatible servers (llama.cpp, LM Studio,
-    vLLM, text-generation-webui, etc.) as opposed to api.openai.com itself.
+    vLLM, text-generation-webui, etc.) as opposed to cloud APIs.
    Used to gate llama.cpp-server-specific payload extras (``session_id``,
-    ``cache_prompt``) — sending unrecognized top-level fields to OpenAI's
+    ``cache_prompt``) used for KV-cache slot affinity (issue #2927). Strict
-    actual API returns a 400 ("Unrecognized request argument"), but
+    cloud providers reject unrecognized top-level fields (api.openai.com
-    self-hosted servers generally ignore unknown fields and many (notably
+    returns 400, Mistral returns 422 "extra_forbidden", issue #3793), and any
-    llama.cpp's server) use them for KV-cache slot affinity (issue #2927).
+    unknown OpenAI-compatible host used to be treated as self-hosted, so those
    fields leaked to every strict provider added as a custom endpoint.
    A server only counts as self-hosted when it also resolves as local:
    loopback/private/tailscale host, or the endpoint explicitly configured
    with kind "local". A self-hosted server exposed via a public hostname
    loses the affinity hint unless its endpoint kind is set to "local" -
    a lost perf hint, versus a hard 4xx on every request the other way.
    """
-    return _detect_provider(url) == "openai" and not _host_match(url, "openai.com")
+    if _detect_provider(url) != "openai" or _host_match(url, "openai.com"):
        return False
    from src.model_context import is_local_endpoint
    return is_local_endpoint(url)
 def _apply_local_cache_affinity(payload: Dict, url: str, session_id: Optional[str]) -> None:
@@ -5,6 +5,7 @@ Query and cache model context window sizes from OpenAI-compatible APIs.
 Provides token estimation for context usage tracking.
 """
 import ipaddress
 import logging
 import sys
 from typing import Dict, List, Optional, Tuple
@@ -19,7 +20,20 @@ _LOCAL_HOSTS = {"localhost", "127.0.0.1", "0.0.0.0", "::1", "host.docker.interna
 _PRIVATE_PREFIXES = ("10.", "172.16.", "172.17.", "172.18.", "172.19.",
                     "172.20.", "172.21.", "172.22.", "172.23.", "172.24.",
                     "172.25.", "172.26.", "172.27.", "172.28.", "172.29.",
-                     "172.30.", "172.31.", "192.168.", "100.")
+                     "172.30.", "172.31.", "192.168.")
 # Tailscale uses the CGNAT range 100.64.0.0/10, NOT all of 100.0.0.0/8.
 # A bare "100." prefix would classify public addresses (e.g. AWS ranges
 # under 100.x outside the CGNAT block) as local; routes/model_routes.py
 # already narrows this the same way for endpoint classification.
 _TAILSCALE_CGNAT = ipaddress.ip_network("100.64.0.0/10")
 def _in_tailscale_range(host: str) -> bool:
    try:
        return ipaddress.ip_address(host) in _TAILSCALE_CGNAT
    except ValueError:
        return False
 def _normalize_base_for_compare(url: str) -> str:
@@ -64,7 +78,7 @@ def _configured_endpoint_kind(url: str) -> Optional[str]:
        return None
-def _is_local_endpoint(url: str) -> bool:
+def is_local_endpoint(url: str) -> bool:
    """Check if URL points to a local/private/tailscale address."""
    kind = _configured_endpoint_kind(url)
    if kind in ("api", "proxy"):
@@ -73,7 +87,7 @@ def _is_local_endpoint(url: str) -> bool:
        return True
    try:
        host = urlparse(url).hostname or ""
-        return host in _LOCAL_HOSTS or host.startswith(_PRIVATE_PREFIXES)
+        return host in _LOCAL_HOSTS or host.startswith(_PRIVATE_PREFIXES) or _in_tailscale_range(host)
    except Exception:
        return False
@@ -219,7 +233,7 @@ def get_context_length(endpoint_url: str, model: str) -> int:
    Falls back to DEFAULT_CONTEXT if unavailable.
    """
    configured_kind = _configured_endpoint_kind(endpoint_url)
-    is_local = _is_local_endpoint(endpoint_url)
+    is_local = is_local_endpoint(endpoint_url)
    # Key on (endpoint_url, model): the same model id can be served by two
    # different remote endpoints with different real context windows (e.g. a
    # capped proxy vs. the full provider), so caching by model id alone would
@@ -273,7 +287,7 @@ def _query_context_length(endpoint_url: str, model: str) -> int:
        return DEFAULT_CONTEXT
    # Try llama.cpp /slots endpoint first — reports actual serving context
-    if _is_local_endpoint(endpoint_url):
+    if is_local_endpoint(endpoint_url):
        try:
            base = endpoint_url.split("/v1")[0] if "/v1" in endpoint_url else endpoint_url.rsplit("/", 1)[0]
            r = httpx.get(f"{base}/slots", timeout=REQUEST_TIMEOUT)
@@ -337,7 +351,7 @@ def _query_context_length(endpoint_url: str, model: str) -> int:
    # For local/self-hosted endpoints, trust the API value (user set --max-model-len)
    # For cloud APIs, use the larger value (API can report low defaults)
    if api_ctx and known:
-        _is_local = _is_local_endpoint(endpoint_url)
+        _is_local = is_local_endpoint(endpoint_url)
        if _is_local and api_ctx < known:
            logger.info(f"Local endpoint reports {api_ctx} for {model} (known max: {known}) — using API value")
            return api_ctx
@@ -0,0 +1,94 @@
 """llama.cpp slot-affinity fields must never reach cloud providers (#3793).
 _apply_local_cache_affinity adds session_id + cache_prompt to outgoing
 payloads for KV-cache slot affinity (#2927). The old gate treated any unknown
 OpenAI-compatible host as self-hosted, so strict cloud APIs added as custom
 endpoints (Mistral at api.mistral.ai) received the extra fields and rejected
 every request with 422 extra_forbidden. Self-hosted now also requires the
 endpoint to resolve as local: loopback/private/tailscale host, or endpoint
 kind explicitly configured as "local".
 """
 import pytest
 import src.llm_core as llm_core
 import src.model_context as model_context
 def _affinity_fields(url, monkeypatch, kind=None):
    monkeypatch.setattr(model_context, "_configured_endpoint_kind", lambda _u: kind)
    payload = {}
    llm_core._apply_local_cache_affinity(payload, url, "sess-123")
    return payload
 def test_mistral_cloud_api_gets_no_affinity_fields(monkeypatch):
    # The #3793 repro: Mistral rejects unknown body fields with 422.
    payload = _affinity_fields("https://api.mistral.ai/v1", monkeypatch)
    assert payload == {}
 def test_openai_api_gets_no_affinity_fields(monkeypatch):
    payload = _affinity_fields("https://api.openai.com/v1", monkeypatch)
    assert payload == {}
 def test_unknown_public_host_gets_no_affinity_fields(monkeypatch):
    # Any strict cloud provider added as a custom endpoint, not just Mistral.
    payload = _affinity_fields("https://llm.example-cloud.com/v1", monkeypatch)
    assert payload == {}
 def test_localhost_server_gets_affinity_fields(monkeypatch):
    payload = _affinity_fields("http://localhost:8080/v1", monkeypatch)
    assert payload == {"session_id": "sess-123", "cache_prompt": True}
 def test_private_lan_server_gets_affinity_fields(monkeypatch):
    payload = _affinity_fields("http://192.168.1.50:8000/v1", monkeypatch)
    assert payload == {"session_id": "sess-123", "cache_prompt": True}
 def test_public_host_with_local_kind_override_gets_affinity_fields(monkeypatch):
    # Escape hatch: a self-hosted llama.cpp exposed via a tunnel keeps the
    # slot-affinity hint when its endpoint kind is configured as "local".
    payload = _affinity_fields("https://my-llama.example.com/v1", monkeypatch, kind="local")
    assert payload == {"session_id": "sess-123", "cache_prompt": True}
 def test_no_session_id_is_a_noop(monkeypatch):
    monkeypatch.setattr(model_context, "_configured_endpoint_kind", lambda _u: None)
    payload = {}
    llm_core._apply_local_cache_affinity(payload, "http://localhost:8080/v1", None)
    assert payload == {}
 # Cloud-host sweep absorbed from #3839 (credit: Shabablinchikow) - every cloud
 # API that falls through provider detection to the OpenAI-compatible default
 # must stay clean, not just the Mistral host from the original report.
@pytest.mark.parametrize("url", [
    "https://api.mistral.ai/v1/chat/completions",
    "https://api.deepseek.com/v1/chat/completions",
    "https://api.x.ai/v1/chat/completions",
    "https://api.together.xyz/v1/chat/completions",
    "https://api.fireworks.ai/inference/v1/chat/completions",
    "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",
 ])
 def test_cloud_openai_compatible_hosts_get_no_affinity_fields(monkeypatch, url):
    assert _affinity_fields(url, monkeypatch) == {}
 # Tailscale CGNAT boundaries (review finding on #3945): only 100.64.0.0/10 is
 # Tailscale; the rest of 100.0.0.0/8 contains public ranges, and a strict
 # provider addressed by one must not receive the llama.cpp extras.
 def test_host_just_below_cgnat_gets_no_affinity_fields(monkeypatch):
    assert _affinity_fields("http://100.63.255.255/v1", monkeypatch) == {}
 def test_host_just_above_cgnat_gets_no_affinity_fields(monkeypatch):
    assert _affinity_fields("http://100.128.0.1/v1", monkeypatch) == {}
@pytest.mark.parametrize("host", ["100.64.0.1", "100.100.50.2", "100.127.255.254"])
 def test_hosts_inside_cgnat_get_affinity_fields(monkeypatch, host):
    payload = _affinity_fields(f"http://{host}:8080/v1", monkeypatch)
    assert payload == {"session_id": "sess-123", "cache_prompt": True}
@@ -11,7 +11,7 @@ import src.model_context as mc
 def _setup(monkeypatch, windows):
    """windows: {endpoint_url: context_length}. Force the remote path."""
-    monkeypatch.setattr(mc, "_is_local_endpoint", lambda url: False)
+    monkeypatch.setattr(mc, "is_local_endpoint", lambda url: False)
    monkeypatch.setattr(mc, "_configured_endpoint_kind", lambda url: "api")
    monkeypatch.setattr(mc, "_query_context_length", lambda url, model: windows[url])
    mc._context_cache.clear()
@@ -6,7 +6,7 @@ import types
 import pytest
 import src.model_context as model_context
-from src.model_context import _is_local_endpoint, estimate_tokens, _lookup_known
+from src.model_context import is_local_endpoint, estimate_tokens, _lookup_known
 class _Column:
@@ -56,20 +56,20 @@ def _install_endpoint_db(monkeypatch, rows):
 class TestIsLocalEndpoint:
    def test_localhost(self):
-        assert _is_local_endpoint("http://localhost:5000/v1/chat/completions") is True
+        assert is_local_endpoint("http://localhost:5000/v1/chat/completions") is True
    def test_loopback_ipv4(self):
-        assert _is_local_endpoint("http://127.0.0.1:8080/v1/chat/completions") is True
+        assert is_local_endpoint("http://127.0.0.1:8080/v1/chat/completions") is True
    def test_private_192_168(self):
-        assert _is_local_endpoint("http://192.168.1.1:11434/v1/chat/completions") is True
+        assert is_local_endpoint("http://192.168.1.1:11434/v1/chat/completions") is True
    def test_private_10(self):
-        assert _is_local_endpoint("http://10.0.0.5:8000/v1/chat/completions") is True
+        assert is_local_endpoint("http://10.0.0.5:8000/v1/chat/completions") is True
    def test_tailscale_100(self):
        # 100.64.0.0/10 is the CGNAT range Tailscale uses.
-        assert _is_local_endpoint("http://100.64.0.1:5000/v1/chat/completions") is True
+        assert is_local_endpoint("http://100.64.0.1:5000/v1/chat/completions") is True
    def test_configured_tailscale_proxy_is_remote(self, monkeypatch):
        _install_endpoint_db(monkeypatch, [
@@ -81,19 +81,19 @@ class TestIsLocalEndpoint:
            )
        ])
-        assert _is_local_endpoint("http://100.117.136.97:34521/v1/chat/completions") is False
+        assert is_local_endpoint("http://100.117.136.97:34521/v1/chat/completions") is False
    def test_openai_is_remote(self):
-        assert _is_local_endpoint("https://api.openai.com/v1/chat/completions") is False
+        assert is_local_endpoint("https://api.openai.com/v1/chat/completions") is False
    def test_anthropic_is_remote(self):
-        assert _is_local_endpoint("https://api.anthropic.com/v1/messages") is False
+        assert is_local_endpoint("https://api.anthropic.com/v1/messages") is False
    def test_empty_url(self):
-        assert _is_local_endpoint("") is False
+        assert is_local_endpoint("") is False
    def test_malformed_url(self):
-        assert _is_local_endpoint("not-a-url") is False
+        assert is_local_endpoint("not-a-url") is False
 class TestEstimateTokens: