mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 02:05:22 -04:00
fix(llm): stop sending llama.cpp slot-affinity fields to cloud providers (#3945)
* fix(llm): stop sending llama.cpp slot-affinity fields to cloud providers

  _apply_local_cache_affinity adds session_id + cache_prompt for llama.cpp KV-cache slot affinity (#2927), gated on _is_self_hosted_openai_compatible, which treated any unknown OpenAI-compatible host as self-hosted. Strict cloud providers added as custom endpoints (Mistral at api.mistral.ai) reject unknown body fields, so every request failed with 422 extra_forbidden.

  Self-hosted now also requires the endpoint to resolve as local via model_context.is_local_endpoint: loopback/private/tailscale host, or endpoint kind explicitly configured as "local" (the escape hatch for tunneled self-hosted servers). is_local_endpoint is promoted to a public name since llm_core now shares it.

  Fixes #3793

* test(llm): sweep cloud OpenAI-compatible hosts in affinity gating

  Parametrized cases adapted from #3839 (credit: Shabablinchikow): deepseek, x.ai, together, fireworks, and the Gemini OpenAI-compat endpoint must all stay free of the llama.cpp extras, not just the Mistral host from #3793.

* fix(llm): narrow the Tailscale range to 100.64.0.0/10 in is_local_endpoint

  Review finding on #3945: _PRIVATE_PREFIXES carried a bare "100." prefix, treating all of 100.0.0.0/8 as local while Tailscale only uses the CGNAT block 100.64.0.0/10. Public 100.x hosts (e.g. AWS ranges outside the block) were classified local and still received the llama.cpp extras this PR exists to keep away from strict providers. Match the narrowed classification routes/model_routes.py already uses, with boundary tests just below, inside, and just above the range.
This commit is contained in:
committed by
GitHub
parent
f941db29d3
commit
263d41c58a
+16
-6
@@ -457,15 +457,25 @@ def _detect_provider(url: str) -> str:
|
|||||||
|
|
||||||
def _is_self_hosted_openai_compatible(url: str) -> bool:
    """True for custom/local OpenAI-compatible servers (llama.cpp, LM Studio,
    vLLM, text-generation-webui, etc.) as opposed to cloud APIs.

    Used to gate llama.cpp-server-specific payload extras (``session_id``,
    ``cache_prompt``) used for KV-cache slot affinity (issue #2927). Strict
    cloud providers reject unrecognized top-level fields (api.openai.com
    returns 400, Mistral returns 422 "extra_forbidden", issue #3793), and any
    unknown OpenAI-compatible host used to be treated as self-hosted, so those
    fields leaked to every strict provider added as a custom endpoint.

    A server only counts as self-hosted when it also resolves as local:
    loopback/private/tailscale host, or the endpoint explicitly configured
    with kind "local". A self-hosted server exposed via a public hostname
    loses the affinity hint unless its endpoint kind is set to "local" -
    a lost perf hint, versus a hard 4xx on every request the other way.
    """
    # Anything that is not detected as the "openai" provider, or whose host
    # matches openai.com, is never self-hosted.
    if _detect_provider(url) != "openai" or _host_match(url, "openai.com"):
        return False
    # Function-scope import, as in the commit; presumably avoids an import
    # cycle between llm_core and model_context - confirm before hoisting.
    from src.model_context import is_local_endpoint
    return is_local_endpoint(url)
|
||||||
|
|
||||||
|
|
||||||
def _apply_local_cache_affinity(payload: Dict, url: str, session_id: Optional[str]) -> None:
|
def _apply_local_cache_affinity(payload: Dict, url: str, session_id: Optional[str]) -> None:
|
||||||
|
|||||||
+20
-6
@@ -5,6 +5,7 @@ Query and cache model context window sizes from OpenAI-compatible APIs.
|
|||||||
Provides token estimation for context usage tracking.
|
Provides token estimation for context usage tracking.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import ipaddress
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
from typing import Dict, List, Optional, Tuple
|
from typing import Dict, List, Optional, Tuple
|
||||||
@@ -19,7 +20,20 @@ _LOCAL_HOSTS = {"localhost", "127.0.0.1", "0.0.0.0", "::1", "host.docker.interna
|
|||||||
_PRIVATE_PREFIXES = ("10.", "172.16.", "172.17.", "172.18.", "172.19.",
|
_PRIVATE_PREFIXES = ("10.", "172.16.", "172.17.", "172.18.", "172.19.",
|
||||||
"172.20.", "172.21.", "172.22.", "172.23.", "172.24.",
|
"172.20.", "172.21.", "172.22.", "172.23.", "172.24.",
|
||||||
"172.25.", "172.26.", "172.27.", "172.28.", "172.29.",
|
"172.25.", "172.26.", "172.27.", "172.28.", "172.29.",
|
||||||
"172.30.", "172.31.", "192.168.", "100.")
|
"172.30.", "172.31.", "192.168.")
|
||||||
|
|
||||||
|
# Tailscale uses the CGNAT range 100.64.0.0/10, NOT all of 100.0.0.0/8.
|
||||||
|
# A bare "100." prefix would classify public addresses (e.g. AWS ranges
|
||||||
|
# under 100.x outside the CGNAT block) as local; routes/model_routes.py
|
||||||
|
# already narrows this the same way for endpoint classification.
|
||||||
|
_TAILSCALE_CGNAT = ipaddress.ip_network("100.64.0.0/10")
|
||||||
|
|
||||||
|
|
||||||
|
def _in_tailscale_range(host: str) -> bool:
|
||||||
|
try:
|
||||||
|
return ipaddress.ip_address(host) in _TAILSCALE_CGNAT
|
||||||
|
except ValueError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _normalize_base_for_compare(url: str) -> str:
|
def _normalize_base_for_compare(url: str) -> str:
|
||||||
@@ -64,7 +78,7 @@ def _configured_endpoint_kind(url: str) -> Optional[str]:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _is_local_endpoint(url: str) -> bool:
|
def is_local_endpoint(url: str) -> bool:
|
||||||
"""Check if URL points to a local/private/tailscale address."""
|
"""Check if URL points to a local/private/tailscale address."""
|
||||||
kind = _configured_endpoint_kind(url)
|
kind = _configured_endpoint_kind(url)
|
||||||
if kind in ("api", "proxy"):
|
if kind in ("api", "proxy"):
|
||||||
@@ -73,7 +87,7 @@ def _is_local_endpoint(url: str) -> bool:
|
|||||||
return True
|
return True
|
||||||
try:
|
try:
|
||||||
host = urlparse(url).hostname or ""
|
host = urlparse(url).hostname or ""
|
||||||
return host in _LOCAL_HOSTS or host.startswith(_PRIVATE_PREFIXES)
|
return host in _LOCAL_HOSTS or host.startswith(_PRIVATE_PREFIXES) or _in_tailscale_range(host)
|
||||||
except Exception:
|
except Exception:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@@ -219,7 +233,7 @@ def get_context_length(endpoint_url: str, model: str) -> int:
|
|||||||
Falls back to DEFAULT_CONTEXT if unavailable.
|
Falls back to DEFAULT_CONTEXT if unavailable.
|
||||||
"""
|
"""
|
||||||
configured_kind = _configured_endpoint_kind(endpoint_url)
|
configured_kind = _configured_endpoint_kind(endpoint_url)
|
||||||
is_local = _is_local_endpoint(endpoint_url)
|
is_local = is_local_endpoint(endpoint_url)
|
||||||
# Key on (endpoint_url, model): the same model id can be served by two
|
# Key on (endpoint_url, model): the same model id can be served by two
|
||||||
# different remote endpoints with different real context windows (e.g. a
|
# different remote endpoints with different real context windows (e.g. a
|
||||||
# capped proxy vs. the full provider), so caching by model id alone would
|
# capped proxy vs. the full provider), so caching by model id alone would
|
||||||
@@ -273,7 +287,7 @@ def _query_context_length(endpoint_url: str, model: str) -> int:
|
|||||||
return DEFAULT_CONTEXT
|
return DEFAULT_CONTEXT
|
||||||
|
|
||||||
# Try llama.cpp /slots endpoint first — reports actual serving context
|
# Try llama.cpp /slots endpoint first — reports actual serving context
|
||||||
if _is_local_endpoint(endpoint_url):
|
if is_local_endpoint(endpoint_url):
|
||||||
try:
|
try:
|
||||||
base = endpoint_url.split("/v1")[0] if "/v1" in endpoint_url else endpoint_url.rsplit("/", 1)[0]
|
base = endpoint_url.split("/v1")[0] if "/v1" in endpoint_url else endpoint_url.rsplit("/", 1)[0]
|
||||||
r = httpx.get(f"{base}/slots", timeout=REQUEST_TIMEOUT)
|
r = httpx.get(f"{base}/slots", timeout=REQUEST_TIMEOUT)
|
||||||
@@ -337,7 +351,7 @@ def _query_context_length(endpoint_url: str, model: str) -> int:
|
|||||||
# For local/self-hosted endpoints, trust the API value (user set --max-model-len)
|
# For local/self-hosted endpoints, trust the API value (user set --max-model-len)
|
||||||
# For cloud APIs, use the larger value (API can report low defaults)
|
# For cloud APIs, use the larger value (API can report low defaults)
|
||||||
if api_ctx and known:
|
if api_ctx and known:
|
||||||
_is_local = _is_local_endpoint(endpoint_url)
|
_is_local = is_local_endpoint(endpoint_url)
|
||||||
if _is_local and api_ctx < known:
|
if _is_local and api_ctx < known:
|
||||||
logger.info(f"Local endpoint reports {api_ctx} for {model} (known max: {known}) — using API value")
|
logger.info(f"Local endpoint reports {api_ctx} for {model} (known max: {known}) — using API value")
|
||||||
return api_ctx
|
return api_ctx
|
||||||
|
|||||||
@@ -0,0 +1,94 @@
|
|||||||
|
"""llama.cpp slot-affinity fields must never reach cloud providers (#3793).
|
||||||
|
|
||||||
|
_apply_local_cache_affinity adds session_id + cache_prompt to outgoing
|
||||||
|
payloads for KV-cache slot affinity (#2927). The old gate treated any unknown
|
||||||
|
OpenAI-compatible host as self-hosted, so strict cloud APIs added as custom
|
||||||
|
endpoints (Mistral at api.mistral.ai) received the extra fields and rejected
|
||||||
|
every request with 422 extra_forbidden. Self-hosted now also requires the
|
||||||
|
endpoint to resolve as local: loopback/private/tailscale host, or endpoint
|
||||||
|
kind explicitly configured as "local".
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import src.llm_core as llm_core
|
||||||
|
import src.model_context as model_context
|
||||||
|
|
||||||
|
|
||||||
|
def _affinity_fields(url, monkeypatch, kind=None):
    """Run affinity gating for ``url`` and return the resulting payload.

    ``model_context._configured_endpoint_kind`` is patched to always return
    ``kind``, then ``llm_core._apply_local_cache_affinity`` is applied to an
    empty payload with the fixed session id ``"sess-123"``.
    """
    monkeypatch.setattr(model_context, "_configured_endpoint_kind", lambda _u: kind)
    payload = {}
    llm_core._apply_local_cache_affinity(payload, url, "sess-123")
    return payload
|
||||||
|
|
||||||
|
|
||||||
|
def test_mistral_cloud_api_gets_no_affinity_fields(monkeypatch):
    """The Mistral cloud host must receive an untouched payload."""
    # The #3793 repro: Mistral rejects unknown body fields with 422.
    payload = _affinity_fields("https://api.mistral.ai/v1", monkeypatch)
    assert payload == {}
|
||||||
|
|
||||||
|
|
||||||
|
def test_openai_api_gets_no_affinity_fields(monkeypatch):
    """api.openai.com must receive an untouched payload."""
    payload = _affinity_fields("https://api.openai.com/v1", monkeypatch)
    assert payload == {}
|
||||||
|
|
||||||
|
|
||||||
|
def test_unknown_public_host_gets_no_affinity_fields(monkeypatch):
    """An unrecognized public host with no configured kind stays clean."""
    # Any strict cloud provider added as a custom endpoint, not just Mistral.
    payload = _affinity_fields("https://llm.example-cloud.com/v1", monkeypatch)
    assert payload == {}
|
||||||
|
|
||||||
|
|
||||||
|
def test_localhost_server_gets_affinity_fields(monkeypatch):
    """A localhost server receives both slot-affinity fields."""
    payload = _affinity_fields("http://localhost:8080/v1", monkeypatch)
    assert payload == {"session_id": "sess-123", "cache_prompt": True}
|
||||||
|
|
||||||
|
|
||||||
|
def test_private_lan_server_gets_affinity_fields(monkeypatch):
    """A 192.168.x LAN server receives both slot-affinity fields."""
    payload = _affinity_fields("http://192.168.1.50:8000/v1", monkeypatch)
    assert payload == {"session_id": "sess-123", "cache_prompt": True}
|
||||||
|
|
||||||
|
|
||||||
|
def test_public_host_with_local_kind_override_gets_affinity_fields(monkeypatch):
    """A public hostname configured with kind "local" keeps the fields."""
    # Escape hatch: a self-hosted llama.cpp exposed via a tunnel keeps the
    # slot-affinity hint when its endpoint kind is configured as "local".
    payload = _affinity_fields("https://my-llama.example.com/v1", monkeypatch, kind="local")
    assert payload == {"session_id": "sess-123", "cache_prompt": True}
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_session_id_is_a_noop(monkeypatch):
    """With ``session_id=None`` even a localhost payload is left empty."""
    monkeypatch.setattr(model_context, "_configured_endpoint_kind", lambda _u: None)
    payload = {}
    llm_core._apply_local_cache_affinity(payload, "http://localhost:8080/v1", None)
    assert payload == {}
|
||||||
|
|
||||||
|
|
||||||
|
# Cloud-host sweep absorbed from #3839 (credit: Shabablinchikow) - every cloud
# API that falls through provider detection to the OpenAI-compatible default
# must stay clean, not just the Mistral host from the original report.
@pytest.mark.parametrize("url", [
    "https://api.mistral.ai/v1/chat/completions",
    "https://api.deepseek.com/v1/chat/completions",
    "https://api.x.ai/v1/chat/completions",
    "https://api.together.xyz/v1/chat/completions",
    "https://api.fireworks.ai/inference/v1/chat/completions",
    "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",
])
def test_cloud_openai_compatible_hosts_get_no_affinity_fields(monkeypatch, url):
    """Each listed cloud endpoint must receive an untouched payload."""
    assert _affinity_fields(url, monkeypatch) == {}
|
||||||
|
|
||||||
|
|
||||||
|
# Tailscale CGNAT boundaries (review finding on #3945): only 100.64.0.0/10 is
# Tailscale; the rest of 100.0.0.0/8 contains public ranges, and a strict
# provider addressed by one must not receive the llama.cpp extras.
def test_host_just_below_cgnat_gets_no_affinity_fields(monkeypatch):
    """100.63.255.255 is the last address below the block: stays clean."""
    assert _affinity_fields("http://100.63.255.255/v1", monkeypatch) == {}
|
||||||
|
|
||||||
|
|
||||||
|
def test_host_just_above_cgnat_gets_no_affinity_fields(monkeypatch):
    """100.128.0.1 lies just above 100.64.0.0/10: stays clean."""
    assert _affinity_fields("http://100.128.0.1/v1", monkeypatch) == {}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("host", ["100.64.0.1", "100.100.50.2", "100.127.255.254"])
def test_hosts_inside_cgnat_get_affinity_fields(monkeypatch, host):
    """Hosts inside 100.64.0.0/10 receive both slot-affinity fields."""
    payload = _affinity_fields(f"http://{host}:8080/v1", monkeypatch)
    assert payload == {"session_id": "sess-123", "cache_prompt": True}
|
||||||
@@ -11,7 +11,7 @@ import src.model_context as mc
|
|||||||
|
|
||||||
def _setup(monkeypatch, windows):
|
def _setup(monkeypatch, windows):
|
||||||
"""windows: {endpoint_url: context_length}. Force the remote path."""
|
"""windows: {endpoint_url: context_length}. Force the remote path."""
|
||||||
monkeypatch.setattr(mc, "_is_local_endpoint", lambda url: False)
|
monkeypatch.setattr(mc, "is_local_endpoint", lambda url: False)
|
||||||
monkeypatch.setattr(mc, "_configured_endpoint_kind", lambda url: "api")
|
monkeypatch.setattr(mc, "_configured_endpoint_kind", lambda url: "api")
|
||||||
monkeypatch.setattr(mc, "_query_context_length", lambda url, model: windows[url])
|
monkeypatch.setattr(mc, "_query_context_length", lambda url, model: windows[url])
|
||||||
mc._context_cache.clear()
|
mc._context_cache.clear()
|
||||||
|
|||||||
+11
-11
@@ -6,7 +6,7 @@ import types
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import src.model_context as model_context
|
import src.model_context as model_context
|
||||||
from src.model_context import _is_local_endpoint, estimate_tokens, _lookup_known
|
from src.model_context import is_local_endpoint, estimate_tokens, _lookup_known
|
||||||
|
|
||||||
|
|
||||||
class _Column:
|
class _Column:
|
||||||
@@ -56,20 +56,20 @@ def _install_endpoint_db(monkeypatch, rows):
|
|||||||
|
|
||||||
class TestIsLocalEndpoint:
|
class TestIsLocalEndpoint:
|
||||||
def test_localhost(self):
|
def test_localhost(self):
|
||||||
assert _is_local_endpoint("http://localhost:5000/v1/chat/completions") is True
|
assert is_local_endpoint("http://localhost:5000/v1/chat/completions") is True
|
||||||
|
|
||||||
def test_loopback_ipv4(self):
|
def test_loopback_ipv4(self):
|
||||||
assert _is_local_endpoint("http://127.0.0.1:8080/v1/chat/completions") is True
|
assert is_local_endpoint("http://127.0.0.1:8080/v1/chat/completions") is True
|
||||||
|
|
||||||
def test_private_192_168(self):
|
def test_private_192_168(self):
|
||||||
assert _is_local_endpoint("http://192.168.1.1:11434/v1/chat/completions") is True
|
assert is_local_endpoint("http://192.168.1.1:11434/v1/chat/completions") is True
|
||||||
|
|
||||||
def test_private_10(self):
|
def test_private_10(self):
|
||||||
assert _is_local_endpoint("http://10.0.0.5:8000/v1/chat/completions") is True
|
assert is_local_endpoint("http://10.0.0.5:8000/v1/chat/completions") is True
|
||||||
|
|
||||||
def test_tailscale_100(self):
|
def test_tailscale_100(self):
|
||||||
# 100.64.0.0/10 is the CGNAT range Tailscale uses.
|
# 100.64.0.0/10 is the CGNAT range Tailscale uses.
|
||||||
assert _is_local_endpoint("http://100.64.0.1:5000/v1/chat/completions") is True
|
assert is_local_endpoint("http://100.64.0.1:5000/v1/chat/completions") is True
|
||||||
|
|
||||||
def test_configured_tailscale_proxy_is_remote(self, monkeypatch):
|
def test_configured_tailscale_proxy_is_remote(self, monkeypatch):
|
||||||
_install_endpoint_db(monkeypatch, [
|
_install_endpoint_db(monkeypatch, [
|
||||||
@@ -81,19 +81,19 @@ class TestIsLocalEndpoint:
|
|||||||
)
|
)
|
||||||
])
|
])
|
||||||
|
|
||||||
assert _is_local_endpoint("http://100.117.136.97:34521/v1/chat/completions") is False
|
assert is_local_endpoint("http://100.117.136.97:34521/v1/chat/completions") is False
|
||||||
|
|
||||||
def test_openai_is_remote(self):
|
def test_openai_is_remote(self):
|
||||||
assert _is_local_endpoint("https://api.openai.com/v1/chat/completions") is False
|
assert is_local_endpoint("https://api.openai.com/v1/chat/completions") is False
|
||||||
|
|
||||||
def test_anthropic_is_remote(self):
|
def test_anthropic_is_remote(self):
|
||||||
assert _is_local_endpoint("https://api.anthropic.com/v1/messages") is False
|
assert is_local_endpoint("https://api.anthropic.com/v1/messages") is False
|
||||||
|
|
||||||
def test_empty_url(self):
|
def test_empty_url(self):
|
||||||
assert _is_local_endpoint("") is False
|
assert is_local_endpoint("") is False
|
||||||
|
|
||||||
def test_malformed_url(self):
|
def test_malformed_url(self):
|
||||||
assert _is_local_endpoint("not-a-url") is False
|
assert is_local_endpoint("not-a-url") is False
|
||||||
|
|
||||||
|
|
||||||
class TestEstimateTokens:
|
class TestEstimateTokens:
|
||||||
|
|||||||
Reference in New Issue
Block a user