fix(llm): stop sending llama.cpp slot-affinity fields to cloud providers (#3945)

* fix(llm): stop sending llama.cpp slot-affinity fields to cloud providers

_apply_local_cache_affinity adds session_id + cache_prompt for llama.cpp
KV-cache slot affinity (#2927), gated on _is_self_hosted_openai_compatible,
which treated any unknown OpenAI-compatible host as self-hosted. Strict
cloud providers added as custom endpoints (Mistral at api.mistral.ai)
reject unknown body fields, so every request failed with 422
extra_forbidden. Self-hosted now also requires the endpoint to resolve as
local via model_context.is_local_endpoint: loopback/private/tailscale
host, or endpoint kind explicitly configured as "local" (the escape hatch
for tunneled self-hosted servers). is_local_endpoint is promoted to a
public name since llm_core now shares it.

Fixes #3793

* test(llm): sweep cloud OpenAI-compatible hosts in affinity gating

Parametrized cases adapted from #3839 (credit: Shabablinchikow): deepseek,
x.ai, together, fireworks, and the Gemini OpenAI-compat endpoint must all
stay free of the llama.cpp extras, not just the Mistral host from #3793.

* fix(llm): narrow the Tailscale range to 100.64.0.0/10 in is_local_endpoint

Review finding on #3945: _PRIVATE_PREFIXES carried a bare "100." prefix,
treating all of 100.0.0.0/8 as local while Tailscale only uses the CGNAT
block 100.64.0.0/10. Public 100.x hosts (e.g. AWS ranges outside the
block) were classified local and still received the llama.cpp extras
this PR exists to keep away from strict providers. Match the narrowed
classification routes/model_routes.py already uses, with boundary tests
just below, inside, and just above the range.
This commit is contained in:
Kenny Van de Maele
2026-06-11 17:51:03 +02:00
committed by GitHub
parent f941db29d3
commit 263d41c58a
5 changed files with 142 additions and 24 deletions
+16 -6
View File
@@ -457,15 +457,25 @@ def _detect_provider(url: str) -> str:
def _is_self_hosted_openai_compatible(url: str) -> bool: def _is_self_hosted_openai_compatible(url: str) -> bool:
"""True for custom/local OpenAI-compatible servers (llama.cpp, LM Studio, """True for custom/local OpenAI-compatible servers (llama.cpp, LM Studio,
vLLM, text-generation-webui, etc.) as opposed to api.openai.com itself. vLLM, text-generation-webui, etc.) as opposed to cloud APIs.
Used to gate llama.cpp-server-specific payload extras (``session_id``, Used to gate llama.cpp-server-specific payload extras (``session_id``,
``cache_prompt``) — sending unrecognized top-level fields to OpenAI's ``cache_prompt``) used for KV-cache slot affinity (issue #2927). Strict
actual API returns a 400 ("Unrecognized request argument"), but cloud providers reject unrecognized top-level fields (api.openai.com
self-hosted servers generally ignore unknown fields and many (notably returns 400, Mistral returns 422 "extra_forbidden", issue #3793), and any
llama.cpp's server) use them for KV-cache slot affinity (issue #2927). unknown OpenAI-compatible host used to be treated as self-hosted, so those
fields leaked to every strict provider added as a custom endpoint.
A server only counts as self-hosted when it also resolves as local:
loopback/private/tailscale host, or the endpoint explicitly configured
with kind "local". A self-hosted server exposed via a public hostname
loses the affinity hint unless its endpoint kind is set to "local" -
a lost perf hint, versus a hard 4xx on every request the other way.
""" """
return _detect_provider(url) == "openai" and not _host_match(url, "openai.com") if _detect_provider(url) != "openai" or _host_match(url, "openai.com"):
return False
from src.model_context import is_local_endpoint
return is_local_endpoint(url)
def _apply_local_cache_affinity(payload: Dict, url: str, session_id: Optional[str]) -> None: def _apply_local_cache_affinity(payload: Dict, url: str, session_id: Optional[str]) -> None:
+20 -6
View File
@@ -5,6 +5,7 @@ Query and cache model context window sizes from OpenAI-compatible APIs.
Provides token estimation for context usage tracking. Provides token estimation for context usage tracking.
""" """
import ipaddress
import logging import logging
import sys import sys
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple
@@ -19,7 +20,20 @@ _LOCAL_HOSTS = {"localhost", "127.0.0.1", "0.0.0.0", "::1", "host.docker.interna
_PRIVATE_PREFIXES = ("10.", "172.16.", "172.17.", "172.18.", "172.19.", _PRIVATE_PREFIXES = ("10.", "172.16.", "172.17.", "172.18.", "172.19.",
"172.20.", "172.21.", "172.22.", "172.23.", "172.24.", "172.20.", "172.21.", "172.22.", "172.23.", "172.24.",
"172.25.", "172.26.", "172.27.", "172.28.", "172.29.", "172.25.", "172.26.", "172.27.", "172.28.", "172.29.",
"172.30.", "172.31.", "192.168.", "100.") "172.30.", "172.31.", "192.168.")
# Tailscale uses the CGNAT range 100.64.0.0/10, NOT all of 100.0.0.0/8.
# A bare "100." prefix would classify public addresses (e.g. AWS ranges
# under 100.x outside the CGNAT block) as local; routes/model_routes.py
# already narrows this the same way for endpoint classification.
_TAILSCALE_CGNAT = ipaddress.ip_network("100.64.0.0/10")
def _in_tailscale_range(host: str) -> bool:
try:
return ipaddress.ip_address(host) in _TAILSCALE_CGNAT
except ValueError:
return False
def _normalize_base_for_compare(url: str) -> str: def _normalize_base_for_compare(url: str) -> str:
@@ -64,7 +78,7 @@ def _configured_endpoint_kind(url: str) -> Optional[str]:
return None return None
def _is_local_endpoint(url: str) -> bool: def is_local_endpoint(url: str) -> bool:
"""Check if URL points to a local/private/tailscale address.""" """Check if URL points to a local/private/tailscale address."""
kind = _configured_endpoint_kind(url) kind = _configured_endpoint_kind(url)
if kind in ("api", "proxy"): if kind in ("api", "proxy"):
@@ -73,7 +87,7 @@ def _is_local_endpoint(url: str) -> bool:
return True return True
try: try:
host = urlparse(url).hostname or "" host = urlparse(url).hostname or ""
return host in _LOCAL_HOSTS or host.startswith(_PRIVATE_PREFIXES) return host in _LOCAL_HOSTS or host.startswith(_PRIVATE_PREFIXES) or _in_tailscale_range(host)
except Exception: except Exception:
return False return False
@@ -219,7 +233,7 @@ def get_context_length(endpoint_url: str, model: str) -> int:
Falls back to DEFAULT_CONTEXT if unavailable. Falls back to DEFAULT_CONTEXT if unavailable.
""" """
configured_kind = _configured_endpoint_kind(endpoint_url) configured_kind = _configured_endpoint_kind(endpoint_url)
is_local = _is_local_endpoint(endpoint_url) is_local = is_local_endpoint(endpoint_url)
# Key on (endpoint_url, model): the same model id can be served by two # Key on (endpoint_url, model): the same model id can be served by two
# different remote endpoints with different real context windows (e.g. a # different remote endpoints with different real context windows (e.g. a
# capped proxy vs. the full provider), so caching by model id alone would # capped proxy vs. the full provider), so caching by model id alone would
@@ -273,7 +287,7 @@ def _query_context_length(endpoint_url: str, model: str) -> int:
return DEFAULT_CONTEXT return DEFAULT_CONTEXT
# Try llama.cpp /slots endpoint first — reports actual serving context # Try llama.cpp /slots endpoint first — reports actual serving context
if _is_local_endpoint(endpoint_url): if is_local_endpoint(endpoint_url):
try: try:
base = endpoint_url.split("/v1")[0] if "/v1" in endpoint_url else endpoint_url.rsplit("/", 1)[0] base = endpoint_url.split("/v1")[0] if "/v1" in endpoint_url else endpoint_url.rsplit("/", 1)[0]
r = httpx.get(f"{base}/slots", timeout=REQUEST_TIMEOUT) r = httpx.get(f"{base}/slots", timeout=REQUEST_TIMEOUT)
@@ -337,7 +351,7 @@ def _query_context_length(endpoint_url: str, model: str) -> int:
# For local/self-hosted endpoints, trust the API value (user set --max-model-len) # For local/self-hosted endpoints, trust the API value (user set --max-model-len)
# For cloud APIs, use the larger value (API can report low defaults) # For cloud APIs, use the larger value (API can report low defaults)
if api_ctx and known: if api_ctx and known:
_is_local = _is_local_endpoint(endpoint_url) _is_local = is_local_endpoint(endpoint_url)
if _is_local and api_ctx < known: if _is_local and api_ctx < known:
logger.info(f"Local endpoint reports {api_ctx} for {model} (known max: {known}) — using API value") logger.info(f"Local endpoint reports {api_ctx} for {model} (known max: {known}) — using API value")
return api_ctx return api_ctx
+94
View File
@@ -0,0 +1,94 @@
"""llama.cpp slot-affinity fields must never reach cloud providers (#3793).
_apply_local_cache_affinity adds session_id + cache_prompt to outgoing
payloads for KV-cache slot affinity (#2927). The old gate treated any unknown
OpenAI-compatible host as self-hosted, so strict cloud APIs added as custom
endpoints (Mistral at api.mistral.ai) received the extra fields and rejected
every request with 422 extra_forbidden. Self-hosted now also requires the
endpoint to resolve as local: loopback/private/tailscale host, or endpoint
kind explicitly configured as "local".
"""
import pytest
import src.llm_core as llm_core
import src.model_context as model_context
def _affinity_fields(url, monkeypatch, kind=None):
monkeypatch.setattr(model_context, "_configured_endpoint_kind", lambda _u: kind)
payload = {}
llm_core._apply_local_cache_affinity(payload, url, "sess-123")
return payload
def test_mistral_cloud_api_gets_no_affinity_fields(monkeypatch):
# The #3793 repro: Mistral rejects unknown body fields with 422.
payload = _affinity_fields("https://api.mistral.ai/v1", monkeypatch)
assert payload == {}
def test_openai_api_gets_no_affinity_fields(monkeypatch):
payload = _affinity_fields("https://api.openai.com/v1", monkeypatch)
assert payload == {}
def test_unknown_public_host_gets_no_affinity_fields(monkeypatch):
# Any strict cloud provider added as a custom endpoint, not just Mistral.
payload = _affinity_fields("https://llm.example-cloud.com/v1", monkeypatch)
assert payload == {}
def test_localhost_server_gets_affinity_fields(monkeypatch):
payload = _affinity_fields("http://localhost:8080/v1", monkeypatch)
assert payload == {"session_id": "sess-123", "cache_prompt": True}
def test_private_lan_server_gets_affinity_fields(monkeypatch):
payload = _affinity_fields("http://192.168.1.50:8000/v1", monkeypatch)
assert payload == {"session_id": "sess-123", "cache_prompt": True}
def test_public_host_with_local_kind_override_gets_affinity_fields(monkeypatch):
# Escape hatch: a self-hosted llama.cpp exposed via a tunnel keeps the
# slot-affinity hint when its endpoint kind is configured as "local".
payload = _affinity_fields("https://my-llama.example.com/v1", monkeypatch, kind="local")
assert payload == {"session_id": "sess-123", "cache_prompt": True}
def test_no_session_id_is_a_noop(monkeypatch):
monkeypatch.setattr(model_context, "_configured_endpoint_kind", lambda _u: None)
payload = {}
llm_core._apply_local_cache_affinity(payload, "http://localhost:8080/v1", None)
assert payload == {}
# Cloud-host sweep absorbed from #3839 (credit: Shabablinchikow) - every cloud
# API that falls through provider detection to the OpenAI-compatible default
# must stay clean, not just the Mistral host from the original report.
@pytest.mark.parametrize("url", [
"https://api.mistral.ai/v1/chat/completions",
"https://api.deepseek.com/v1/chat/completions",
"https://api.x.ai/v1/chat/completions",
"https://api.together.xyz/v1/chat/completions",
"https://api.fireworks.ai/inference/v1/chat/completions",
"https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",
])
def test_cloud_openai_compatible_hosts_get_no_affinity_fields(monkeypatch, url):
assert _affinity_fields(url, monkeypatch) == {}
# Tailscale CGNAT boundaries (review finding on #3945): only 100.64.0.0/10 is
# Tailscale; the rest of 100.0.0.0/8 contains public ranges, and a strict
# provider addressed by one must not receive the llama.cpp extras.
def test_host_just_below_cgnat_gets_no_affinity_fields(monkeypatch):
assert _affinity_fields("http://100.63.255.255/v1", monkeypatch) == {}
def test_host_just_above_cgnat_gets_no_affinity_fields(monkeypatch):
assert _affinity_fields("http://100.128.0.1/v1", monkeypatch) == {}
@pytest.mark.parametrize("host", ["100.64.0.1", "100.100.50.2", "100.127.255.254"])
def test_hosts_inside_cgnat_get_affinity_fields(monkeypatch, host):
payload = _affinity_fields(f"http://{host}:8080/v1", monkeypatch)
assert payload == {"session_id": "sess-123", "cache_prompt": True}
+1 -1
View File
@@ -11,7 +11,7 @@ import src.model_context as mc
def _setup(monkeypatch, windows): def _setup(monkeypatch, windows):
"""windows: {endpoint_url: context_length}. Force the remote path.""" """windows: {endpoint_url: context_length}. Force the remote path."""
monkeypatch.setattr(mc, "_is_local_endpoint", lambda url: False) monkeypatch.setattr(mc, "is_local_endpoint", lambda url: False)
monkeypatch.setattr(mc, "_configured_endpoint_kind", lambda url: "api") monkeypatch.setattr(mc, "_configured_endpoint_kind", lambda url: "api")
monkeypatch.setattr(mc, "_query_context_length", lambda url, model: windows[url]) monkeypatch.setattr(mc, "_query_context_length", lambda url, model: windows[url])
mc._context_cache.clear() mc._context_cache.clear()
+11 -11
View File
@@ -6,7 +6,7 @@ import types
import pytest import pytest
import src.model_context as model_context import src.model_context as model_context
from src.model_context import _is_local_endpoint, estimate_tokens, _lookup_known from src.model_context import is_local_endpoint, estimate_tokens, _lookup_known
class _Column: class _Column:
@@ -56,20 +56,20 @@ def _install_endpoint_db(monkeypatch, rows):
class TestIsLocalEndpoint: class TestIsLocalEndpoint:
def test_localhost(self): def test_localhost(self):
assert _is_local_endpoint("http://localhost:5000/v1/chat/completions") is True assert is_local_endpoint("http://localhost:5000/v1/chat/completions") is True
def test_loopback_ipv4(self): def test_loopback_ipv4(self):
assert _is_local_endpoint("http://127.0.0.1:8080/v1/chat/completions") is True assert is_local_endpoint("http://127.0.0.1:8080/v1/chat/completions") is True
def test_private_192_168(self): def test_private_192_168(self):
assert _is_local_endpoint("http://192.168.1.1:11434/v1/chat/completions") is True assert is_local_endpoint("http://192.168.1.1:11434/v1/chat/completions") is True
def test_private_10(self): def test_private_10(self):
assert _is_local_endpoint("http://10.0.0.5:8000/v1/chat/completions") is True assert is_local_endpoint("http://10.0.0.5:8000/v1/chat/completions") is True
def test_tailscale_100(self): def test_tailscale_100(self):
# 100.64.0.0/10 is the CGNAT range Tailscale uses. # 100.64.0.0/10 is the CGNAT range Tailscale uses.
assert _is_local_endpoint("http://100.64.0.1:5000/v1/chat/completions") is True assert is_local_endpoint("http://100.64.0.1:5000/v1/chat/completions") is True
def test_configured_tailscale_proxy_is_remote(self, monkeypatch): def test_configured_tailscale_proxy_is_remote(self, monkeypatch):
_install_endpoint_db(monkeypatch, [ _install_endpoint_db(monkeypatch, [
@@ -81,19 +81,19 @@ class TestIsLocalEndpoint:
) )
]) ])
assert _is_local_endpoint("http://100.117.136.97:34521/v1/chat/completions") is False assert is_local_endpoint("http://100.117.136.97:34521/v1/chat/completions") is False
def test_openai_is_remote(self): def test_openai_is_remote(self):
assert _is_local_endpoint("https://api.openai.com/v1/chat/completions") is False assert is_local_endpoint("https://api.openai.com/v1/chat/completions") is False
def test_anthropic_is_remote(self): def test_anthropic_is_remote(self):
assert _is_local_endpoint("https://api.anthropic.com/v1/messages") is False assert is_local_endpoint("https://api.anthropic.com/v1/messages") is False
def test_empty_url(self): def test_empty_url(self):
assert _is_local_endpoint("") is False assert is_local_endpoint("") is False
def test_malformed_url(self): def test_malformed_url(self):
assert _is_local_endpoint("not-a-url") is False assert is_local_endpoint("not-a-url") is False
class TestEstimateTokens: class TestEstimateTokens: