mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 02:05:22 -04:00
Refresh local model context after restart
Co-authored-by: Kevin <120500656+oooindefatigable@users.noreply.github.com>
This commit is contained in:
@@ -169,12 +169,15 @@ def get_context_length(endpoint_url: str, model: str) -> int:
|
|||||||
or context_window fields. Caches result per model ID.
|
or context_window fields. Caches result per model ID.
|
||||||
Falls back to DEFAULT_CONTEXT if unavailable.
|
Falls back to DEFAULT_CONTEXT if unavailable.
|
||||||
"""
|
"""
|
||||||
if model in _context_cache:
|
is_local = _is_local_endpoint(endpoint_url)
|
||||||
|
if not is_local and model in _context_cache:
|
||||||
return _context_cache[model]
|
return _context_cache[model]
|
||||||
|
|
||||||
ctx = _query_context_length(endpoint_url, model)
|
ctx = _query_context_length(endpoint_url, model)
|
||||||
# Only cache non-default values to allow retry on next request
|
# Only cache non-default values to allow retry on next request.
|
||||||
if ctx != DEFAULT_CONTEXT:
|
# Local endpoints can restart with a different --max-model-len while keeping
|
||||||
|
# the same model id, so always re-query them instead of serving stale cache.
|
||||||
|
if not is_local and ctx != DEFAULT_CONTEXT:
|
||||||
_context_cache[model] = ctx
|
_context_cache[model] = ctx
|
||||||
logger.info(f"Context length for {model}: {ctx}")
|
logger.info(f"Context length for {model}: {ctx}")
|
||||||
return ctx
|
return ctx
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
import src.model_context as model_context
|
||||||
from src.model_context import _is_local_endpoint, estimate_tokens, _lookup_known
|
from src.model_context import _is_local_endpoint, estimate_tokens, _lookup_known
|
||||||
|
|
||||||
|
|
||||||
@@ -107,3 +108,46 @@ class TestLookupKnown:
|
|||||||
"""Models with :free or :extended suffixes should still match."""
|
"""Models with :free or :extended suffixes should still match."""
|
||||||
result = _lookup_known("deepseek-r1:free")
|
result = _lookup_known("deepseek-r1:free")
|
||||||
assert result == 64000
|
assert result == 64000
|
||||||
|
|
||||||
|
|
||||||
|
class TestGetContextLength:
    """Caching behaviour of ``model_context.get_context_length``.

    A loopback endpoint is expected to be queried on every call, while a
    remote endpoint is expected to be answered from the cache after the
    first successful query.
    """

    def setup_method(self):
        """Start every test from an empty context cache."""
        model_context._context_cache.clear()

    def teardown_method(self):
        """Empty the cache again so entries written here cannot leak into other tests."""
        model_context._context_cache.clear()

    def test_local_endpoint_requeries_same_model_after_restart(self, monkeypatch):
        """Two calls against a loopback endpoint must both hit the query helper."""
        calls = []

        def fake_query(endpoint_url, model):
            # First answer is the value before the simulated restart, every
            # later answer is the value after it.
            calls.append((endpoint_url, model))
            return 8192 if len(calls) == 1 else 27000

        monkeypatch.setattr(model_context, "_query_context_length", fake_query)

        endpoint = "http://127.0.0.1:8000/v1/chat/completions"
        model = "Qwen/Qwen3-14B"

        first = model_context.get_context_length(endpoint, model)
        second = model_context.get_context_length(endpoint, model)

        assert first == 8192
        assert second == 27000
        assert len(calls) == 2

    def test_remote_endpoint_keeps_cached_context(self, monkeypatch):
        """A second call against a remote endpoint must be served from the cache."""
        calls = []

        def fake_query(endpoint_url, model):
            # The second value must never be observed: reaching it would mean
            # the cache was bypassed.
            calls.append((endpoint_url, model))
            return 200000 if len(calls) == 1 else 12345

        monkeypatch.setattr(model_context, "_query_context_length", fake_query)

        endpoint = "https://api.openai.com/v1/chat/completions"
        model = "gpt-5"

        first = model_context.get_context_length(endpoint, model)
        second = model_context.get_context_length(endpoint, model)

        assert first == 200000
        assert second == 200000
        assert len(calls) == 1
|
||||||
|
|||||||
Reference in New Issue
Block a user