fix(llm): stop sending llama.cpp slot-affinity fields to cloud providers (#3945)

* fix(llm): stop sending llama.cpp slot-affinity fields to cloud providers

_apply_local_cache_affinity adds session_id + cache_prompt for llama.cpp
KV-cache slot affinity (#2927), gated on _is_self_hosted_openai_compatible,
which treated any unknown OpenAI-compatible host as self-hosted. Strict
cloud providers added as custom endpoints (Mistral at api.mistral.ai)
reject unknown body fields, so every request failed with 422
extra_forbidden. Self-hosted now also requires the endpoint to resolve as
local via model_context.is_local_endpoint: loopback/private/tailscale
host, or endpoint kind explicitly configured as "local" (the escape hatch
for tunneled self-hosted servers). is_local_endpoint is promoted to a
public name since llm_core now shares it.

Fixes #3793

* test(llm): sweep cloud OpenAI-compatible hosts in affinity gating

Parametrized cases adapted from #3839 (credit: Shabablinchikow): deepseek,
x.ai, together, fireworks, and the Gemini OpenAI-compat endpoint must all
stay free of the llama.cpp extras, not just the Mistral host from #3793.

* fix(llm): narrow the Tailscale range to 100.64.0.0/10 in is_local_endpoint

Review finding on #3945: _PRIVATE_PREFIXES carried a bare "100." prefix,
treating all of 100.0.0.0/8 as local while Tailscale only uses the CGNAT
block 100.64.0.0/10. Public 100.x hosts (e.g. AWS ranges outside the
block) were classified local and still received the llama.cpp extras
this PR exists to keep away from strict providers. Match the narrowed
classification routes/model_routes.py already uses, with boundary tests
just below, inside, and just above the range.
This commit is contained in:
Kenny Van de Maele
2026-06-11 17:51:03 +02:00
committed by GitHub
parent f941db29d3
commit 263d41c58a
5 changed files with 142 additions and 24 deletions
+16 -6
View File
@@ -457,15 +457,25 @@ def _detect_provider(url: str) -> str:
def _is_self_hosted_openai_compatible(url: str) -> bool:
"""True for custom/local OpenAI-compatible servers (llama.cpp, LM Studio,
vLLM, text-generation-webui, etc.) as opposed to api.openai.com itself.
vLLM, text-generation-webui, etc.) as opposed to cloud APIs.
Used to gate llama.cpp-server-specific payload extras (``session_id``,
``cache_prompt``) — sending unrecognized top-level fields to OpenAI's
actual API returns a 400 ("Unrecognized request argument"), but
self-hosted servers generally ignore unknown fields and many (notably
llama.cpp's server) use them for KV-cache slot affinity (issue #2927).
``cache_prompt``) used for KV-cache slot affinity (issue #2927). Strict
cloud providers reject unrecognized top-level fields (api.openai.com
returns 400, Mistral returns 422 "extra_forbidden", issue #3793), and any
unknown OpenAI-compatible host used to be treated as self-hosted, so those
fields leaked to every strict provider added as a custom endpoint.
A server only counts as self-hosted when it also resolves as local:
loopback/private/tailscale host, or the endpoint explicitly configured
with kind "local". A self-hosted server exposed via a public hostname
loses the affinity hint unless its endpoint kind is set to "local" -
a lost perf hint, versus a hard 4xx on every request the other way.
"""
return _detect_provider(url) == "openai" and not _host_match(url, "openai.com")
if _detect_provider(url) != "openai" or _host_match(url, "openai.com"):
return False
from src.model_context import is_local_endpoint
return is_local_endpoint(url)
def _apply_local_cache_affinity(payload: Dict, url: str, session_id: Optional[str]) -> None:
+20 -6
View File
@@ -5,6 +5,7 @@ Query and cache model context window sizes from OpenAI-compatible APIs.
Provides token estimation for context usage tracking.
"""
import ipaddress
import logging
import sys
from typing import Dict, List, Optional, Tuple
@@ -19,7 +20,20 @@ _LOCAL_HOSTS = {"localhost", "127.0.0.1", "0.0.0.0", "::1", "host.docker.interna
_PRIVATE_PREFIXES = ("10.", "172.16.", "172.17.", "172.18.", "172.19.",
"172.20.", "172.21.", "172.22.", "172.23.", "172.24.",
"172.25.", "172.26.", "172.27.", "172.28.", "172.29.",
"172.30.", "172.31.", "192.168.", "100.")
"172.30.", "172.31.", "192.168.")
# Tailscale uses the CGNAT range 100.64.0.0/10, NOT all of 100.0.0.0/8.
# A bare "100." prefix would classify public addresses (e.g. AWS ranges
# under 100.x outside the CGNAT block) as local; routes/model_routes.py
# already narrows this the same way for endpoint classification.
_TAILSCALE_CGNAT = ipaddress.ip_network("100.64.0.0/10")
def _in_tailscale_range(host: str) -> bool:
try:
return ipaddress.ip_address(host) in _TAILSCALE_CGNAT
except ValueError:
return False
def _normalize_base_for_compare(url: str) -> str:
@@ -64,7 +78,7 @@ def _configured_endpoint_kind(url: str) -> Optional[str]:
return None
def _is_local_endpoint(url: str) -> bool:
def is_local_endpoint(url: str) -> bool:
"""Check if URL points to a local/private/tailscale address."""
kind = _configured_endpoint_kind(url)
if kind in ("api", "proxy"):
@@ -73,7 +87,7 @@ def _is_local_endpoint(url: str) -> bool:
return True
try:
host = urlparse(url).hostname or ""
return host in _LOCAL_HOSTS or host.startswith(_PRIVATE_PREFIXES)
return host in _LOCAL_HOSTS or host.startswith(_PRIVATE_PREFIXES) or _in_tailscale_range(host)
except Exception:
return False
@@ -219,7 +233,7 @@ def get_context_length(endpoint_url: str, model: str) -> int:
Falls back to DEFAULT_CONTEXT if unavailable.
"""
configured_kind = _configured_endpoint_kind(endpoint_url)
is_local = _is_local_endpoint(endpoint_url)
is_local = is_local_endpoint(endpoint_url)
# Key on (endpoint_url, model): the same model id can be served by two
# different remote endpoints with different real context windows (e.g. a
# capped proxy vs. the full provider), so caching by model id alone would
@@ -273,7 +287,7 @@ def _query_context_length(endpoint_url: str, model: str) -> int:
return DEFAULT_CONTEXT
# Try llama.cpp /slots endpoint first — reports actual serving context
if _is_local_endpoint(endpoint_url):
if is_local_endpoint(endpoint_url):
try:
base = endpoint_url.split("/v1")[0] if "/v1" in endpoint_url else endpoint_url.rsplit("/", 1)[0]
r = httpx.get(f"{base}/slots", timeout=REQUEST_TIMEOUT)
@@ -337,7 +351,7 @@ def _query_context_length(endpoint_url: str, model: str) -> int:
# For local/self-hosted endpoints, trust the API value (user set --max-model-len)
# For cloud APIs, use the larger value (API can report low defaults)
if api_ctx and known:
_is_local = _is_local_endpoint(endpoint_url)
_is_local = is_local_endpoint(endpoint_url)
if _is_local and api_ctx < known:
logger.info(f"Local endpoint reports {api_ctx} for {model} (known max: {known}) — using API value")
return api_ctx
+94
View File
@@ -0,0 +1,94 @@
"""llama.cpp slot-affinity fields must never reach cloud providers (#3793).
_apply_local_cache_affinity adds session_id + cache_prompt to outgoing
payloads for KV-cache slot affinity (#2927). The old gate treated any unknown
OpenAI-compatible host as self-hosted, so strict cloud APIs added as custom
endpoints (Mistral at api.mistral.ai) received the extra fields and rejected
every request with 422 extra_forbidden. Self-hosted now also requires the
endpoint to resolve as local: loopback/private/tailscale host, or endpoint
kind explicitly configured as "local".
"""
import pytest
import src.llm_core as llm_core
import src.model_context as model_context
def _affinity_fields(url, monkeypatch, kind=None):
monkeypatch.setattr(model_context, "_configured_endpoint_kind", lambda _u: kind)
payload = {}
llm_core._apply_local_cache_affinity(payload, url, "sess-123")
return payload
def test_mistral_cloud_api_gets_no_affinity_fields(monkeypatch):
# The #3793 repro: Mistral rejects unknown body fields with 422.
payload = _affinity_fields("https://api.mistral.ai/v1", monkeypatch)
assert payload == {}
def test_openai_api_gets_no_affinity_fields(monkeypatch):
payload = _affinity_fields("https://api.openai.com/v1", monkeypatch)
assert payload == {}
def test_unknown_public_host_gets_no_affinity_fields(monkeypatch):
# Any strict cloud provider added as a custom endpoint, not just Mistral.
payload = _affinity_fields("https://llm.example-cloud.com/v1", monkeypatch)
assert payload == {}
def test_localhost_server_gets_affinity_fields(monkeypatch):
payload = _affinity_fields("http://localhost:8080/v1", monkeypatch)
assert payload == {"session_id": "sess-123", "cache_prompt": True}
def test_private_lan_server_gets_affinity_fields(monkeypatch):
payload = _affinity_fields("http://192.168.1.50:8000/v1", monkeypatch)
assert payload == {"session_id": "sess-123", "cache_prompt": True}
def test_public_host_with_local_kind_override_gets_affinity_fields(monkeypatch):
# Escape hatch: a self-hosted llama.cpp exposed via a tunnel keeps the
# slot-affinity hint when its endpoint kind is configured as "local".
payload = _affinity_fields("https://my-llama.example.com/v1", monkeypatch, kind="local")
assert payload == {"session_id": "sess-123", "cache_prompt": True}
def test_no_session_id_is_a_noop(monkeypatch):
monkeypatch.setattr(model_context, "_configured_endpoint_kind", lambda _u: None)
payload = {}
llm_core._apply_local_cache_affinity(payload, "http://localhost:8080/v1", None)
assert payload == {}
# Cloud-host sweep absorbed from #3839 (credit: Shabablinchikow) - every cloud
# API that falls through provider detection to the OpenAI-compatible default
# must stay clean, not just the Mistral host from the original report.
@pytest.mark.parametrize("url", [
"https://api.mistral.ai/v1/chat/completions",
"https://api.deepseek.com/v1/chat/completions",
"https://api.x.ai/v1/chat/completions",
"https://api.together.xyz/v1/chat/completions",
"https://api.fireworks.ai/inference/v1/chat/completions",
"https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",
])
def test_cloud_openai_compatible_hosts_get_no_affinity_fields(monkeypatch, url):
assert _affinity_fields(url, monkeypatch) == {}
# Tailscale CGNAT boundaries (review finding on #3945): only 100.64.0.0/10 is
# Tailscale; the rest of 100.0.0.0/8 contains public ranges, and a strict
# provider addressed by one must not receive the llama.cpp extras.
def test_host_just_below_cgnat_gets_no_affinity_fields(monkeypatch):
assert _affinity_fields("http://100.63.255.255/v1", monkeypatch) == {}
def test_host_just_above_cgnat_gets_no_affinity_fields(monkeypatch):
assert _affinity_fields("http://100.128.0.1/v1", monkeypatch) == {}
@pytest.mark.parametrize("host", ["100.64.0.1", "100.100.50.2", "100.127.255.254"])
def test_hosts_inside_cgnat_get_affinity_fields(monkeypatch, host):
payload = _affinity_fields(f"http://{host}:8080/v1", monkeypatch)
assert payload == {"session_id": "sess-123", "cache_prompt": True}
+1 -1
View File
@@ -11,7 +11,7 @@ import src.model_context as mc
def _setup(monkeypatch, windows):
"""windows: {endpoint_url: context_length}. Force the remote path."""
monkeypatch.setattr(mc, "_is_local_endpoint", lambda url: False)
monkeypatch.setattr(mc, "is_local_endpoint", lambda url: False)
monkeypatch.setattr(mc, "_configured_endpoint_kind", lambda url: "api")
monkeypatch.setattr(mc, "_query_context_length", lambda url, model: windows[url])
mc._context_cache.clear()
+11 -11
View File
@@ -6,7 +6,7 @@ import types
import pytest
import src.model_context as model_context
from src.model_context import _is_local_endpoint, estimate_tokens, _lookup_known
from src.model_context import is_local_endpoint, estimate_tokens, _lookup_known
class _Column:
@@ -56,20 +56,20 @@ def _install_endpoint_db(monkeypatch, rows):
class TestIsLocalEndpoint:
def test_localhost(self):
assert _is_local_endpoint("http://localhost:5000/v1/chat/completions") is True
assert is_local_endpoint("http://localhost:5000/v1/chat/completions") is True
def test_loopback_ipv4(self):
assert _is_local_endpoint("http://127.0.0.1:8080/v1/chat/completions") is True
assert is_local_endpoint("http://127.0.0.1:8080/v1/chat/completions") is True
def test_private_192_168(self):
assert _is_local_endpoint("http://192.168.1.1:11434/v1/chat/completions") is True
assert is_local_endpoint("http://192.168.1.1:11434/v1/chat/completions") is True
def test_private_10(self):
assert _is_local_endpoint("http://10.0.0.5:8000/v1/chat/completions") is True
assert is_local_endpoint("http://10.0.0.5:8000/v1/chat/completions") is True
def test_tailscale_100(self):
# 100.64.0.0/10 is the CGNAT range Tailscale uses.
assert _is_local_endpoint("http://100.64.0.1:5000/v1/chat/completions") is True
assert is_local_endpoint("http://100.64.0.1:5000/v1/chat/completions") is True
def test_configured_tailscale_proxy_is_remote(self, monkeypatch):
_install_endpoint_db(monkeypatch, [
@@ -81,19 +81,19 @@ class TestIsLocalEndpoint:
)
])
assert _is_local_endpoint("http://100.117.136.97:34521/v1/chat/completions") is False
assert is_local_endpoint("http://100.117.136.97:34521/v1/chat/completions") is False
def test_openai_is_remote(self):
assert _is_local_endpoint("https://api.openai.com/v1/chat/completions") is False
assert is_local_endpoint("https://api.openai.com/v1/chat/completions") is False
def test_anthropic_is_remote(self):
assert _is_local_endpoint("https://api.anthropic.com/v1/messages") is False
assert is_local_endpoint("https://api.anthropic.com/v1/messages") is False
def test_empty_url(self):
assert _is_local_endpoint("") is False
assert is_local_endpoint("") is False
def test_malformed_url(self):
assert _is_local_endpoint("not-a-url") is False
assert is_local_endpoint("not-a-url") is False
class TestEstimateTokens: