mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 01:35:36 -04:00
263d41c58a
* fix(llm): stop sending llama.cpp slot-affinity fields to cloud providers _apply_local_cache_affinity adds session_id + cache_prompt for llama.cpp KV-cache slot affinity (#2927), gated on _is_self_hosted_openai_compatible, which treated any unknown OpenAI-compatible host as self-hosted. Strict cloud providers added as custom endpoints (Mistral at api.mistral.ai) reject unknown body fields, so every request failed with 422 extra_forbidden. Self-hosted now also requires the endpoint to resolve as local via model_context.is_local_endpoint: loopback/private/tailscale host, or endpoint kind explicitly configured as "local" (the escape hatch for tunneled self-hosted servers). is_local_endpoint is promoted to a public name since llm_core now shares it. Fixes #3793 * test(llm): sweep cloud OpenAI-compatible hosts in affinity gating Parametrized cases adapted from #3839 (credit: Shabablinchikow): deepseek, x.ai, together, fireworks, and the Gemini OpenAI-compat endpoint must all stay free of the llama.cpp extras, not just the Mistral host from #3793. * fix(llm): narrow the Tailscale range to 100.64.0.0/10 in is_local_endpoint Review finding on #3945: _PRIVATE_PREFIXES carried a bare "100." prefix, treating all of 100.0.0.0/8 as local while Tailscale only uses the CGNAT block 100.64.0.0/10. Public 100.x hosts (e.g. AWS ranges outside the block) were classified local and still received the llama.cpp extras this PR exists to keep away from strict providers. Match the narrowed classification routes/model_routes.py already uses, with boundary tests just below, inside, and just above the range.
409 lines
14 KiB
Python
409 lines
14 KiB
Python
"""
|
|
model_context.py
|
|
|
|
Query and cache model context window sizes from OpenAI-compatible APIs.
|
|
Provides token estimation for context usage tracking.
|
|
"""
|
|
|
|
import ipaddress
|
|
import logging
|
|
import sys
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
from urllib.parse import urlparse
|
|
|
|
import httpx
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_LOCAL_HOSTS = {"localhost", "127.0.0.1", "0.0.0.0", "::1", "host.docker.internal"}
|
|
_PRIVATE_PREFIXES = ("10.", "172.16.", "172.17.", "172.18.", "172.19.",
|
|
"172.20.", "172.21.", "172.22.", "172.23.", "172.24.",
|
|
"172.25.", "172.26.", "172.27.", "172.28.", "172.29.",
|
|
"172.30.", "172.31.", "192.168.")
|
|
|
|
# Tailscale uses the CGNAT range 100.64.0.0/10, NOT all of 100.0.0.0/8.
|
|
# A bare "100." prefix would classify public addresses (e.g. AWS ranges
|
|
# under 100.x outside the CGNAT block) as local; routes/model_routes.py
|
|
# already narrows this the same way for endpoint classification.
|
|
_TAILSCALE_CGNAT = ipaddress.ip_network("100.64.0.0/10")
|
|
|
|
|
|
def _in_tailscale_range(host: str) -> bool:
|
|
try:
|
|
return ipaddress.ip_address(host) in _TAILSCALE_CGNAT
|
|
except ValueError:
|
|
return False
|
|
|
|
|
|
def _normalize_base_for_compare(url: str) -> str:
|
|
url = (url or "").strip().rstrip("/")
|
|
for suffix in ("/chat/completions", "/models", "/completions", "/v1/messages"):
|
|
if url.endswith(suffix):
|
|
url = url[: -len(suffix)].rstrip("/")
|
|
return url
|
|
|
|
|
|
def _configured_endpoint_kind(url: str) -> Optional[str]:
|
|
"""Return configured endpoint kind for a chat/base URL when available."""
|
|
target = _normalize_base_for_compare(url)
|
|
if not target:
|
|
return None
|
|
if "core.database" not in sys.modules:
|
|
return None
|
|
try:
|
|
from core.database import SessionLocal, ModelEndpoint
|
|
db = SessionLocal()
|
|
try:
|
|
rows = db.query(ModelEndpoint).filter(ModelEndpoint.is_enabled == True).all()
|
|
for ep in rows:
|
|
base = _normalize_base_for_compare(getattr(ep, "base_url", "") or "")
|
|
if not base:
|
|
continue
|
|
if target != base and not target.startswith(base + "/"):
|
|
continue
|
|
kind = (getattr(ep, "endpoint_kind", None) or "auto").strip().lower()
|
|
if kind in ("local", "api", "proxy"):
|
|
return kind
|
|
if getattr(ep, "api_key", None):
|
|
parsed = urlparse(base)
|
|
host = (parsed.hostname or "").lower()
|
|
path = (parsed.path or "").rstrip("/")
|
|
if parsed.port != 11434 and "ollama" not in host and (path.endswith("/v1") or "/openai" in path):
|
|
return "proxy"
|
|
return "auto"
|
|
finally:
|
|
db.close()
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def is_local_endpoint(url: str) -> bool:
|
|
"""Check if URL points to a local/private/tailscale address."""
|
|
kind = _configured_endpoint_kind(url)
|
|
if kind in ("api", "proxy"):
|
|
return False
|
|
if kind == "local":
|
|
return True
|
|
try:
|
|
host = urlparse(url).hostname or ""
|
|
return host in _LOCAL_HOSTS or host.startswith(_PRIVATE_PREFIXES) or _in_tailscale_range(host)
|
|
except Exception:
|
|
return False
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Constants
|
|
# ---------------------------------------------------------------------------
|
|
DEFAULT_CONTEXT = 128000
|
|
REQUEST_TIMEOUT = 5
|
|
|
|
# Known context windows for major API models (used as fallback when /models
|
|
# endpoint doesn't report context_length).
|
|
# Substring matching — use the shortest unique prefix so variants get caught.
|
|
KNOWN_CONTEXT_WINDOWS = {
|
|
# --- Anthropic ---
|
|
'claude-sonnet-4-5': 200000,
|
|
'claude-sonnet-4-6': 200000,
|
|
'claude-sonnet-4': 200000,
|
|
'claude-opus-4': 200000,
|
|
'claude-haiku-4': 200000,
|
|
'claude-haiku-3-5': 200000,
|
|
'claude-3-5-sonnet': 200000,
|
|
'claude-3-5-haiku': 200000,
|
|
'claude-3-opus': 200000,
|
|
'claude-3-sonnet': 200000,
|
|
'claude-3-haiku': 200000,
|
|
|
|
# --- OpenAI ---
|
|
'gpt-5': 400000,
|
|
'gpt-4.1': 1047576,
|
|
'gpt-4.1-mini': 1047576,
|
|
'gpt-4.1-nano': 1047576,
|
|
'gpt-4o': 128000,
|
|
'gpt-4o-mini': 128000,
|
|
'gpt-4-turbo': 128000,
|
|
'gpt-4': 8192,
|
|
'gpt-3.5-turbo': 16385,
|
|
'o1': 200000,
|
|
'o1-mini': 128000,
|
|
'o1-pro': 200000,
|
|
'o3': 200000,
|
|
'o3-mini': 200000,
|
|
'o4-mini': 200000,
|
|
|
|
# --- DeepSeek ---
|
|
'deepseek-chat': 64000,
|
|
'deepseek-coder': 64000,
|
|
'deepseek-reasoner': 64000,
|
|
'deepseek-r1': 64000,
|
|
'deepseek-v3': 64000,
|
|
'deepseek-v2': 64000,
|
|
|
|
# --- Google ---
|
|
'gemini-2.5-pro': 1048576,
|
|
'gemini-2.5-flash': 1048576,
|
|
'gemini-2.0-flash': 1048576,
|
|
'gemini-1.5-pro': 1048576,
|
|
'gemini-1.5-flash': 1048576,
|
|
'gemma-4': 262144,
|
|
'gemma-3': 128000,
|
|
'gemma-2': 8192,
|
|
|
|
# --- Mistral ---
|
|
'mistral-large': 128000,
|
|
'mistral-medium': 32000,
|
|
'mistral-small': 32000,
|
|
'mistral-nemo': 128000,
|
|
'mistral-7b': 32000,
|
|
'mixtral': 32000,
|
|
'codestral': 32000,
|
|
'pixtral': 128000,
|
|
|
|
# --- xAI ---
|
|
'grok-4': 131072,
|
|
'grok-3': 131072,
|
|
'grok-2': 131072,
|
|
|
|
# --- Meta / Llama ---
|
|
'llama-4': 1048576,
|
|
'llama-3.3': 131072,
|
|
'llama-3.2': 131072,
|
|
'llama-3.1': 131072,
|
|
'llama-3': 131072,
|
|
|
|
# --- Qwen ---
|
|
'qwen3': 131072,
|
|
'qwen2.5': 131072,
|
|
'qwen2': 32768,
|
|
'qwq': 32768,
|
|
|
|
# --- Cohere ---
|
|
'command-r-plus': 128000,
|
|
'command-r': 128000,
|
|
'command-a': 256000,
|
|
|
|
# --- Perplexity ---
|
|
'sonar-pro': 200000,
|
|
'sonar': 128000,
|
|
|
|
# --- MiniMax ---
|
|
'minimax': 1000000,
|
|
|
|
# --- Moonshot / Kimi ---
|
|
'moonshot': 128000,
|
|
'kimi': 128000,
|
|
|
|
# --- Microsoft ---
|
|
'phi-4': 16000,
|
|
'phi-3': 128000,
|
|
|
|
# --- Nvidia ---
|
|
'nemotron': 131072,
|
|
|
|
# --- Yi ---
|
|
'yi-large': 32768,
|
|
'yi-1.5': 16384,
|
|
|
|
# --- 01.ai ---
|
|
'yi-lightning': 16384,
|
|
|
|
# --- Nous ---
|
|
'hermes': 131072,
|
|
'nous-hermes': 131072,
|
|
|
|
# --- Open community ---
|
|
'dolphin': 32768,
|
|
'mythomax': 4096,
|
|
'wizard': 32768,
|
|
'openchat': 8192,
|
|
'solar': 32768,
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Cache
|
|
# ---------------------------------------------------------------------------
|
|
_context_cache: Dict[Tuple[str, str], int] = {}
|
|
|
|
|
|
def get_context_length(endpoint_url: str, model: str) -> int:
|
|
"""Get the context window size for a model.
|
|
|
|
Queries /v1/models on the endpoint and looks for context_length
|
|
or context_window fields. Caches result per (endpoint, model).
|
|
Falls back to DEFAULT_CONTEXT if unavailable.
|
|
"""
|
|
configured_kind = _configured_endpoint_kind(endpoint_url)
|
|
is_local = is_local_endpoint(endpoint_url)
|
|
# Key on (endpoint_url, model): the same model id can be served by two
|
|
# different remote endpoints with different real context windows (e.g. a
|
|
# capped proxy vs. the full provider), so caching by model id alone would
|
|
# serve one endpoint's window for the other (issue #2603).
|
|
cache_key = (endpoint_url, model)
|
|
if not is_local and cache_key in _context_cache:
|
|
return _context_cache[cache_key]
|
|
|
|
ctx = _query_context_length(endpoint_url, model)
|
|
# Only cache non-default values to allow retry on next request.
|
|
# Local endpoints can restart with a different --max-model-len while keeping
|
|
# the same model id, so always re-query them instead of serving stale cache.
|
|
if not is_local and (ctx != DEFAULT_CONTEXT or configured_kind in ("api", "proxy")):
|
|
_context_cache[cache_key] = ctx
|
|
logger.info(f"Context length for {model}: {ctx}")
|
|
return ctx
|
|
|
|
|
|
def _lookup_known(model: str) -> Optional[int]:
|
|
"""Check known context windows by substring match.
|
|
|
|
Picks the LONGEST matching key so a short key never shadows a more specific
|
|
one. Without this, 'o1' (200k) precedes 'o1-mini' (128k) in the table and a
|
|
first-match return would report o1-mini's window as 200k.
|
|
"""
|
|
name = model.lower()
|
|
basename = name.split("/")[-1] if "/" in name else name
|
|
basename = basename.split(":")[0] # strip :free, :extended etc.
|
|
best_key: Optional[str] = None
|
|
best_ctx: Optional[int] = None
|
|
for key, ctx in KNOWN_CONTEXT_WINDOWS.items():
|
|
if key in basename or key in name:
|
|
if best_key is None or len(key) > len(best_key):
|
|
best_key, best_ctx = key, ctx
|
|
return best_ctx
|
|
|
|
|
|
def _query_context_length(endpoint_url: str, model: str) -> int:
|
|
"""Query the model API for context length."""
|
|
known = _lookup_known(model)
|
|
api_ctx = None
|
|
configured_kind = _configured_endpoint_kind(endpoint_url)
|
|
|
|
# Large OpenAI-compatible proxies can make /models expensive. If the
|
|
# endpoint is explicitly configured as API/proxy, prefer known context
|
|
# metadata (or the default) over downloading the full catalog.
|
|
if configured_kind in ("api", "proxy"):
|
|
if known:
|
|
logger.info(f"Using known context window for {model}: {known}")
|
|
return known
|
|
return DEFAULT_CONTEXT
|
|
|
|
# Try llama.cpp /slots endpoint first — reports actual serving context
|
|
if is_local_endpoint(endpoint_url):
|
|
try:
|
|
base = endpoint_url.split("/v1")[0] if "/v1" in endpoint_url else endpoint_url.rsplit("/", 1)[0]
|
|
r = httpx.get(f"{base}/slots", timeout=REQUEST_TIMEOUT)
|
|
if r.is_success:
|
|
slots = r.json()
|
|
if isinstance(slots, list) and slots:
|
|
n_ctx = slots[0].get("n_ctx")
|
|
if n_ctx and isinstance(n_ctx, int) and n_ctx > 0:
|
|
logger.info(f"llama.cpp /slots reports n_ctx={n_ctx} for {model}")
|
|
return n_ctx
|
|
except Exception:
|
|
pass
|
|
|
|
# GitHub Copilot's /models requires auth + X-GitHub-Api-Version headers that
|
|
# aren't available here; an unauthenticated probe just 400s. All Copilot
|
|
# picker models are major API models covered by the known-context table, so
|
|
# rely on that instead of a doomed network call.
|
|
from src.copilot import is_copilot_base
|
|
if is_copilot_base(endpoint_url):
|
|
if known:
|
|
logger.info(f"Using known context window for {model}: {known}")
|
|
return known or DEFAULT_CONTEXT
|
|
|
|
from src.endpoint_resolver import build_models_url
|
|
|
|
models_url = build_models_url(endpoint_url)
|
|
try:
|
|
r = httpx.get(models_url, timeout=REQUEST_TIMEOUT)
|
|
if r.is_success:
|
|
data = r.json()
|
|
models_list = data.get("data") or []
|
|
|
|
for m in models_list:
|
|
mid = m.get("id", "")
|
|
if mid == model or mid.split("/")[-1] == model.split("/")[-1]:
|
|
for field in (
|
|
"context_length",
|
|
"context_window",
|
|
"max_model_len",
|
|
"max_context_length",
|
|
"max_seq_len",
|
|
):
|
|
val = m.get(field)
|
|
if val and isinstance(val, (int, float)) and val > 0:
|
|
api_ctx = int(val)
|
|
break
|
|
|
|
if not api_ctx:
|
|
meta = m.get("meta") or m.get("model_extra") or {}
|
|
if isinstance(meta, dict):
|
|
# n_ctx is the actual serving context (set via -c flag in llama.cpp)
|
|
for field in ("n_ctx", "context_length", "context_window", "max_model_len"):
|
|
val = meta.get(field)
|
|
if val and isinstance(val, (int, float)) and val > 0:
|
|
api_ctx = int(val)
|
|
break
|
|
break
|
|
except Exception as e:
|
|
logger.debug(f"Failed to query context length for {model}: {e}")
|
|
|
|
# For local/self-hosted endpoints, trust the API value (user set --max-model-len)
|
|
# For cloud APIs, use the larger value (API can report low defaults)
|
|
if api_ctx and known:
|
|
_is_local = is_local_endpoint(endpoint_url)
|
|
if _is_local and api_ctx < known:
|
|
logger.info(f"Local endpoint reports {api_ctx} for {model} (known max: {known}) — using API value")
|
|
return api_ctx
|
|
result = max(api_ctx, known)
|
|
if api_ctx < known:
|
|
logger.info(f"API reported {api_ctx} for {model}, using known {known} instead")
|
|
return result
|
|
if api_ctx:
|
|
return api_ctx
|
|
if known:
|
|
logger.info(f"Using known context window for {model}: {known}")
|
|
return known
|
|
|
|
return DEFAULT_CONTEXT
|
|
|
|
|
|
def estimate_tokens(messages: List[Dict]) -> int:
|
|
"""Rough token estimate for a list of messages.
|
|
|
|
Uses chars * 0.3 which is closer to real BPE tokenizer output
|
|
than the commonly-cited chars/4 (which underestimates by ~20-30%).
|
|
Also adds ~4 tokens per message for role/formatting overhead, and counts
|
|
assistant tool_calls (name + arguments) — a tool-only turn carries
|
|
content=None with the real payload in tool_calls, so ignoring them made the
|
|
estimate (and the compaction/trim gates that rely on it) blind to large
|
|
tool arguments.
|
|
"""
|
|
total = 0
|
|
for msg in messages:
|
|
total += 4 # per-message overhead (role, separators)
|
|
content = msg.get("content", "")
|
|
if isinstance(content, str):
|
|
total += int(len(content) * 0.3)
|
|
elif isinstance(content, list):
|
|
for item in content:
|
|
if isinstance(item, dict) and item.get("type") == "text":
|
|
total += int(len(item.get("text", "")) * 0.3)
|
|
# Tool calls carry real payload too: a tool-only assistant turn is stored
|
|
# with content=None and the actual args (e.g. a create_document body) in
|
|
# tool_calls[].function.arguments. Ignoring them made large tool arguments
|
|
# read as ~0 tokens, so the compaction/trim gates missed genuine overflow.
|
|
tool_calls = msg.get("tool_calls")
|
|
if isinstance(tool_calls, list):
|
|
for tc in tool_calls:
|
|
if not isinstance(tc, dict):
|
|
continue
|
|
fn = tc.get("function") if isinstance(tc.get("function"), dict) else tc
|
|
name = fn.get("name", "") or ""
|
|
args = fn.get("arguments", "") or ""
|
|
if not isinstance(args, str):
|
|
args = str(args) # some shapes store arguments as a dict
|
|
total += 4 # per tool-call overhead (id, type, wrapper)
|
|
total += int((len(str(name)) + len(args)) * 0.3)
|
|
return total
|