diff --git a/src/llm_core.py b/src/llm_core.py index 89c153809..88061c9ea 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -457,15 +457,25 @@ def _detect_provider(url: str) -> str: def _is_self_hosted_openai_compatible(url: str) -> bool: """True for custom/local OpenAI-compatible servers (llama.cpp, LM Studio, - vLLM, text-generation-webui, etc.) as opposed to api.openai.com itself. + vLLM, text-generation-webui, etc.) as opposed to cloud APIs. Used to gate llama.cpp-server-specific payload extras (``session_id``, - ``cache_prompt``) — sending unrecognized top-level fields to OpenAI's - actual API returns a 400 ("Unrecognized request argument"), but - self-hosted servers generally ignore unknown fields and many (notably - llama.cpp's server) use them for KV-cache slot affinity (issue #2927). + ``cache_prompt``) used for KV-cache slot affinity (issue #2927). Strict + cloud providers reject unrecognized top-level fields (api.openai.com + returns 400, Mistral returns 422 "extra_forbidden", issue #3793), and any + unknown OpenAI-compatible host used to be treated as self-hosted, so those + fields leaked to every strict provider added as a custom endpoint. + + A server only counts as self-hosted when it also resolves as local: + loopback/private/tailscale host, or the endpoint explicitly configured + with kind "local". A self-hosted server exposed via a public hostname + loses the affinity hint unless its endpoint kind is set to "local" - + a lost perf hint, versus a hard 4xx on every request the other way. """ - return _detect_provider(url) == "openai" and not _host_match(url, "openai.com") + if _detect_provider(url) != "openai" or _host_match(url, "openai.com"): + return False + from src.model_context import is_local_endpoint + return is_local_endpoint(url) def _apply_local_cache_affinity(payload: Dict, url: str, session_id: Optional[str]) -> None: diff --git a/src/model_context.py b/src/model_context.py index a2ce9f638..0b04b20cc 100644 --- a/src/model_context.py +++ b/src/model_context.py @@ -5,6 +5,7 @@ Query and cache model context window sizes from OpenAI-compatible APIs. Provides token estimation for context usage tracking. """ +import ipaddress import logging import sys from typing import Dict, List, Optional, Tuple @@ -19,7 +20,20 @@ _LOCAL_HOSTS = {"localhost", "127.0.0.1", "0.0.0.0", "::1", "host.docker.interna _PRIVATE_PREFIXES = ("10.", "172.16.", "172.17.", "172.18.", "172.19.", "172.20.", "172.21.", "172.22.", "172.23.", "172.24.", "172.25.", "172.26.", "172.27.", "172.28.", "172.29.", - "172.30.", "172.31.", "192.168.", "100.") + "172.30.", "172.31.", "192.168.") + +# Tailscale uses the CGNAT range 100.64.0.0/10, NOT all of 100.0.0.0/8. +# A bare "100." prefix would classify public addresses (e.g. AWS ranges +# under 100.x outside the CGNAT block) as local; routes/model_routes.py +# already narrows this the same way for endpoint classification. +_TAILSCALE_CGNAT = ipaddress.ip_network("100.64.0.0/10") + + +def _in_tailscale_range(host: str) -> bool: + try: + return ipaddress.ip_address(host) in _TAILSCALE_CGNAT + except ValueError: + return False def _normalize_base_for_compare(url: str) -> str: @@ -64,7 +78,7 @@ def _configured_endpoint_kind(url: str) -> Optional[str]: return None -def _is_local_endpoint(url: str) -> bool: +def is_local_endpoint(url: str) -> bool: """Check if URL points to a local/private/tailscale address.""" kind = _configured_endpoint_kind(url) if kind in ("api", "proxy"): @@ -73,7 +87,7 @@ def _is_local_endpoint(url: str) -> bool: return True try: host = urlparse(url).hostname or "" - return host in _LOCAL_HOSTS or host.startswith(_PRIVATE_PREFIXES) + return host in _LOCAL_HOSTS or host.startswith(_PRIVATE_PREFIXES) or _in_tailscale_range(host) except Exception: return False @@ -219,7 +233,7 @@ def get_context_length(endpoint_url: str, model: str) -> int: Falls back to DEFAULT_CONTEXT if unavailable. """ configured_kind = _configured_endpoint_kind(endpoint_url) - is_local = _is_local_endpoint(endpoint_url) + is_local = is_local_endpoint(endpoint_url) # Key on (endpoint_url, model): the same model id can be served by two # different remote endpoints with different real context windows (e.g. a # capped proxy vs. the full provider), so caching by model id alone would @@ -273,7 +287,7 @@ def _query_context_length(endpoint_url: str, model: str) -> int: return DEFAULT_CONTEXT # Try llama.cpp /slots endpoint first — reports actual serving context - if _is_local_endpoint(endpoint_url): + if is_local_endpoint(endpoint_url): try: base = endpoint_url.split("/v1")[0] if "/v1" in endpoint_url else endpoint_url.rsplit("/", 1)[0] r = httpx.get(f"{base}/slots", timeout=REQUEST_TIMEOUT) @@ -337,7 +351,7 @@ def _query_context_length(endpoint_url: str, model: str) -> int: # For local/self-hosted endpoints, trust the API value (user set --max-model-len) # For cloud APIs, use the larger value (API can report low defaults) if api_ctx and known: - _is_local = _is_local_endpoint(endpoint_url) + _is_local = is_local_endpoint(endpoint_url) if _is_local and api_ctx < known: logger.info(f"Local endpoint reports {api_ctx} for {model} (known max: {known}) — using API value") return api_ctx diff --git a/tests/test_cache_affinity_local_only.py b/tests/test_cache_affinity_local_only.py new file mode 100644 index 000000000..3fe8a10cc --- /dev/null +++ b/tests/test_cache_affinity_local_only.py @@ -0,0 +1,94 @@ +"""llama.cpp slot-affinity fields must never reach cloud providers (#3793). + +_apply_local_cache_affinity adds session_id + cache_prompt to outgoing +payloads for KV-cache slot affinity (#2927). The old gate treated any unknown +OpenAI-compatible host as self-hosted, so strict cloud APIs added as custom +endpoints (Mistral at api.mistral.ai) received the extra fields and rejected +every request with 422 extra_forbidden. Self-hosted now also requires the +endpoint to resolve as local: loopback/private/tailscale host, or endpoint +kind explicitly configured as "local". +""" +import pytest + +import src.llm_core as llm_core +import src.model_context as model_context + + +def _affinity_fields(url, monkeypatch, kind=None): + monkeypatch.setattr(model_context, "_configured_endpoint_kind", lambda _u: kind) + payload = {} + llm_core._apply_local_cache_affinity(payload, url, "sess-123") + return payload + + +def test_mistral_cloud_api_gets_no_affinity_fields(monkeypatch): + # The #3793 repro: Mistral rejects unknown body fields with 422. + payload = _affinity_fields("https://api.mistral.ai/v1", monkeypatch) + assert payload == {} + + +def test_openai_api_gets_no_affinity_fields(monkeypatch): + payload = _affinity_fields("https://api.openai.com/v1", monkeypatch) + assert payload == {} + + +def test_unknown_public_host_gets_no_affinity_fields(monkeypatch): + # Any strict cloud provider added as a custom endpoint, not just Mistral. + payload = _affinity_fields("https://llm.example-cloud.com/v1", monkeypatch) + assert payload == {} + + +def test_localhost_server_gets_affinity_fields(monkeypatch): + payload = _affinity_fields("http://localhost:8080/v1", monkeypatch) + assert payload == {"session_id": "sess-123", "cache_prompt": True} + + +def test_private_lan_server_gets_affinity_fields(monkeypatch): + payload = _affinity_fields("http://192.168.1.50:8000/v1", monkeypatch) + assert payload == {"session_id": "sess-123", "cache_prompt": True} + + +def test_public_host_with_local_kind_override_gets_affinity_fields(monkeypatch): + # Escape hatch: a self-hosted llama.cpp exposed via a tunnel keeps the + # slot-affinity hint when its endpoint kind is configured as "local". + payload = _affinity_fields("https://my-llama.example.com/v1", monkeypatch, kind="local") + assert payload == {"session_id": "sess-123", "cache_prompt": True} + + +def test_no_session_id_is_a_noop(monkeypatch): + monkeypatch.setattr(model_context, "_configured_endpoint_kind", lambda _u: None) + payload = {} + llm_core._apply_local_cache_affinity(payload, "http://localhost:8080/v1", None) + assert payload == {} + + +# Cloud-host sweep absorbed from #3839 (credit: Shabablinchikow) - every cloud +# API that falls through provider detection to the OpenAI-compatible default +# must stay clean, not just the Mistral host from the original report. +@pytest.mark.parametrize("url", [ + "https://api.mistral.ai/v1/chat/completions", + "https://api.deepseek.com/v1/chat/completions", + "https://api.x.ai/v1/chat/completions", + "https://api.together.xyz/v1/chat/completions", + "https://api.fireworks.ai/inference/v1/chat/completions", + "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions", +]) +def test_cloud_openai_compatible_hosts_get_no_affinity_fields(monkeypatch, url): + assert _affinity_fields(url, monkeypatch) == {} + + +# Tailscale CGNAT boundaries (review finding on #3945): only 100.64.0.0/10 is +# Tailscale; the rest of 100.0.0.0/8 contains public ranges, and a strict +# provider addressed by one must not receive the llama.cpp extras. +def test_host_just_below_cgnat_gets_no_affinity_fields(monkeypatch): + assert _affinity_fields("http://100.63.255.255/v1", monkeypatch) == {} + + +def test_host_just_above_cgnat_gets_no_affinity_fields(monkeypatch): + assert _affinity_fields("http://100.128.0.1/v1", monkeypatch) == {} + + +@pytest.mark.parametrize("host", ["100.64.0.1", "100.100.50.2", "100.127.255.254"]) +def test_hosts_inside_cgnat_get_affinity_fields(monkeypatch, host): + payload = _affinity_fields(f"http://{host}:8080/v1", monkeypatch) + assert payload == {"session_id": "sess-123", "cache_prompt": True} diff --git a/tests/test_context_cache_per_endpoint.py b/tests/test_context_cache_per_endpoint.py index 3bffd7bad..efabea46a 100644 --- a/tests/test_context_cache_per_endpoint.py +++ b/tests/test_context_cache_per_endpoint.py @@ -11,7 +11,7 @@ import src.model_context as mc def _setup(monkeypatch, windows): """windows: {endpoint_url: context_length}. Force the remote path.""" - monkeypatch.setattr(mc, "_is_local_endpoint", lambda url: False) + monkeypatch.setattr(mc, "is_local_endpoint", lambda url: False) monkeypatch.setattr(mc, "_configured_endpoint_kind", lambda url: "api") monkeypatch.setattr(mc, "_query_context_length", lambda url, model: windows[url]) mc._context_cache.clear() diff --git a/tests/test_model_context.py b/tests/test_model_context.py index 31a105c93..ba6556a44 100644 --- a/tests/test_model_context.py +++ b/tests/test_model_context.py @@ -6,7 +6,7 @@ import types import pytest import src.model_context as model_context -from src.model_context import _is_local_endpoint, estimate_tokens, _lookup_known +from src.model_context import is_local_endpoint, estimate_tokens, _lookup_known class _Column: @@ -56,20 +56,20 @@ def _install_endpoint_db(monkeypatch, rows): class TestIsLocalEndpoint: def test_localhost(self): - assert _is_local_endpoint("http://localhost:5000/v1/chat/completions") is True + assert is_local_endpoint("http://localhost:5000/v1/chat/completions") is True def test_loopback_ipv4(self): - assert _is_local_endpoint("http://127.0.0.1:8080/v1/chat/completions") is True + assert is_local_endpoint("http://127.0.0.1:8080/v1/chat/completions") is True def test_private_192_168(self): - assert _is_local_endpoint("http://192.168.1.1:11434/v1/chat/completions") is True + assert is_local_endpoint("http://192.168.1.1:11434/v1/chat/completions") is True def test_private_10(self): - assert _is_local_endpoint("http://10.0.0.5:8000/v1/chat/completions") is True + assert is_local_endpoint("http://10.0.0.5:8000/v1/chat/completions") is True def test_tailscale_100(self): # 100.64.0.0/10 is the CGNAT range Tailscale uses. - assert _is_local_endpoint("http://100.64.0.1:5000/v1/chat/completions") is True + assert is_local_endpoint("http://100.64.0.1:5000/v1/chat/completions") is True def test_configured_tailscale_proxy_is_remote(self, monkeypatch): _install_endpoint_db(monkeypatch, [ @@ -81,19 +81,19 @@ class TestIsLocalEndpoint: ) ]) - assert _is_local_endpoint("http://100.117.136.97:34521/v1/chat/completions") is False + assert is_local_endpoint("http://100.117.136.97:34521/v1/chat/completions") is False def test_openai_is_remote(self): - assert _is_local_endpoint("https://api.openai.com/v1/chat/completions") is False + assert is_local_endpoint("https://api.openai.com/v1/chat/completions") is False def test_anthropic_is_remote(self): - assert _is_local_endpoint("https://api.anthropic.com/v1/messages") is False + assert is_local_endpoint("https://api.anthropic.com/v1/messages") is False def test_empty_url(self): - assert _is_local_endpoint("") is False + assert is_local_endpoint("") is False def test_malformed_url(self): - assert _is_local_endpoint("not-a-url") is False + assert is_local_endpoint("not-a-url") is False class TestEstimateTokens: