diff --git a/src/agent_loop.py b/src/agent_loop.py index 45570a90f..110bb6185 100644 --- a/src/agent_loop.py +++ b/src/agent_loop.py @@ -2013,30 +2013,34 @@ async def stream_agent_loop( _t3 = time.time() try: from src.context_compactor import trim_for_context - from src.context_budget import compute_input_token_budget, DEFAULT_HARD_MAX - from src.settings import is_setting_overridden + from src.context_budget import compute_input_token_budget, DEFAULT_HARD_MAX, DEFAULT_BUDGET, budget_is_explicit as _budget_is_explicit + from src.model_context import budget_context_for_model - soft_budget = int(get_setting("agent_input_token_budget", 6000) or 0) + soft_budget = int(get_setting("agent_input_token_budget", DEFAULT_BUDGET) or 0) if soft_budget > 0: before_trim_tokens = estimate_tokens(messages) reserve_tokens = min(max(max_tokens or 1024, 512), 2048) - # Honour the configurable ceiling for the auto-derived budget path. - # No-op when the user has an explicit `agent_input_token_budget` - # (that branch ignores hard_max). Falls back to DEFAULT_HARD_MAX - # on missing/malformed values so misconfig can't zero the budget. + # Ceiling for the auto-derived budget (no effect on an explicit budget; + # see #1230). Falls back to DEFAULT_HARD_MAX on missing/malformed values + # so misconfig can't zero the budget. try: hard_max = int(get_setting("agent_input_token_hard_max", DEFAULT_HARD_MAX) or DEFAULT_HARD_MAX) except (TypeError, ValueError): hard_max = DEFAULT_HARD_MAX if hard_max <= 0: hard_max = DEFAULT_HARD_MAX - # Scale the default budget to the model's context window so long-context - # models aren't silently capped at 6000; an explicit user setting is - # still honoured (clamped to the window). (#1170) + # Default value = auto sentinel (scale to the window); any other value = + # explicit cap. Value-based, not presence-based, because the save path + # materializes defaults so a persisted default must still read as auto (#4121). + budget_is_explicit = _budget_is_explicit(soft_budget) + # Scale only off a window we actually discovered, bound to the value it + # proves (else 0) — not the passed-in context_length, which can be stale + # or unset for some callers (#4122 review). + ctx_for_budget = budget_context_for_model(endpoint_url, model, fallback=context_length) effective_budget = compute_input_token_budget( soft_budget, - context_length, - is_setting_overridden("agent_input_token_budget"), + ctx_for_budget, + budget_is_explicit, hard_max=hard_max, ) trimmed_messages = trim_for_context( diff --git a/src/context_budget.py b/src/context_budget.py index d331ffac4..de4789e28 100644 --- a/src/context_budget.py +++ b/src/context_budget.py @@ -31,16 +31,22 @@ def compute_input_token_budget( Args: configured: the value read from settings (may be the default). - context_length: the model's discovered context window (0/unknown if none). - explicit: True if the user explicitly set ``agent_input_token_budget``. + context_length: the model's discovered context window. Pass 0 when the + window is unknown / only a bare fallback — auto-scaling then stays + conservative instead of trusting an unproven window (review on #4122). + explicit: True if the user set a NON-default budget. The default value is + the "auto" sentinel (scale to the window); any other value is an + explicit cap. (A deliberately-chosen default can't be distinguished + from a materialized default by value, so the default reads as auto.) Rules: - Explicit user budget is honoured exactly, only clamped to the model's - window when that window is known (never send more than the model holds). - - Otherwise (default), scale to ``headroom`` of the context window, capped - at ``hard_max`` — so long-context models use their capacity. - - When the window is unknown, fall back to the configured/default value - (preserving the previous behaviour). + window when that window is known (the user's deliberate choice wins; + ``hard_max`` is an auto-budget ceiling only — see #1230). + - Otherwise (auto), scale to ``headroom`` of the context window, capped at + ``hard_max`` — so long-context models use their capacity. + - When the window is unknown (context_length <= 0), use the conservative + ``default`` budget and do NOT scale off the fallback. """ configured = int(configured or 0) context_length = int(context_length or 0) @@ -53,3 +59,17 @@ def compute_input_token_budget( return max(1, min(scaled, hard_max)) return configured if configured > 0 else default + + +def budget_is_explicit(configured: int, *, default: int = DEFAULT_BUDGET) -> bool: + """Whether a configured agent_input_token_budget is a deliberate explicit cap. + + The default value is the "auto" sentinel (scale to the model's window), so only + a NON-default positive value counts as explicit. This keys off the VALUE, not + settings *presence* — the settings-save path materializes every default into + settings.json, so a persisted default must still read as auto (the regression + #4121 / #1230 are about). Centralised here so the materialized-default contract + is unit-testable and can't silently regress to a presence check. + """ + configured = int(configured or 0) + return configured > 0 and configured != default diff --git a/src/model_context.py b/src/model_context.py index 0b04b20cc..d87168cca 100644 --- a/src/model_context.py +++ b/src/model_context.py @@ -222,16 +222,12 @@ KNOWN_CONTEXT_WINDOWS = { # --------------------------------------------------------------------------- # Cache # --------------------------------------------------------------------------- -_context_cache: Dict[Tuple[str, str], int] = {} +_context_cache: Dict[Tuple[str, str], Tuple[int, bool]] = {} -def get_context_length(endpoint_url: str, model: str) -> int: - """Get the context window size for a model. - - Queries /v1/models on the endpoint and looks for context_length - or context_window fields. Caches result per (endpoint, model). - Falls back to DEFAULT_CONTEXT if unavailable. - """ +def _get_context_length_cached(endpoint_url: str, model: str) -> Tuple[int, bool]: + """Return (context_length, known). ``known`` is False only when the value is a + bare DEFAULT_CONTEXT fallback (no endpoint report and not in the known table).""" configured_kind = _configured_endpoint_kind(endpoint_url) is_local = is_local_endpoint(endpoint_url) # Key on (endpoint_url, model): the same model id can be served by two @@ -242,14 +238,50 @@ def get_context_length(endpoint_url: str, model: str) -> int: if not is_local and cache_key in _context_cache: return _context_cache[cache_key] - ctx = _query_context_length(endpoint_url, model) + ctx, known = _query_context_length(endpoint_url, model) # Only cache non-default values to allow retry on next request. # Local endpoints can restart with a different --max-model-len while keeping # the same model id, so always re-query them instead of serving stale cache. if not is_local and (ctx != DEFAULT_CONTEXT or configured_kind in ("api", "proxy")): - _context_cache[cache_key] = ctx + _context_cache[cache_key] = (ctx, known) logger.info(f"Context length for {model}: {ctx}") - return ctx + return ctx, known + + +def get_context_length(endpoint_url: str, model: str) -> int: + """Get the context window size for a model. + + Queries /v1/models on the endpoint and looks for context_length + or context_window fields. Caches result per (endpoint, model). + Falls back to DEFAULT_CONTEXT if unavailable. + """ + return _get_context_length_cached(endpoint_url, model)[0] + + +def get_context_length_known(endpoint_url: str, model: str) -> Tuple[int, bool]: + """Like ``get_context_length`` but also returns whether the window was actually + discovered (endpoint-reported or in the known-models table) rather than the bare + DEFAULT_CONTEXT fallback. Callers that *scale* a budget off the window must not + trust an unknown value — a fallback 128K isn't proof the model holds 128K + (review on #4122).""" + return _get_context_length_cached(endpoint_url, model) + + +def budget_context_for_model(endpoint_url: str, model: str, *, fallback: int = 0) -> int: + """Context window to scale the agent input budget against. + + Returns the *freshly discovered* window when it was actually proven + (endpoint-reported / known table), else 0 so auto-scaling stays conservative. + Crucially this binds the ``known`` flag to the value it proves — callers must + not pair this flag with a context length from a *different* lookup (a stale + local re-query, or a caller that didn't pass one), which would budget off an + unproven number (review on #4122). On probe error, returns ``fallback`` (the + caller's best-known value) to preserve prior behaviour.""" + try: + ctx, known = get_context_length_known(endpoint_url, model) + return ctx if known else 0 + except Exception: + return fallback def _lookup_known(model: str) -> Optional[int]: @@ -271,8 +303,9 @@ def _lookup_known(model: str) -> Optional[int]: return best_ctx -def _query_context_length(endpoint_url: str, model: str) -> int: - """Query the model API for context length.""" +def _query_context_length(endpoint_url: str, model: str) -> Tuple[int, bool]: + """Query the model API for context length. Returns (context_length, known) where + ``known`` is False only for the bare DEFAULT_CONTEXT fallback.""" known = _lookup_known(model) api_ctx = None configured_kind = _configured_endpoint_kind(endpoint_url) @@ -283,8 +316,8 @@ def _query_context_length(endpoint_url: str, model: str) -> int: if configured_kind in ("api", "proxy"): if known: logger.info(f"Using known context window for {model}: {known}") - return known - return DEFAULT_CONTEXT + return known, True + return DEFAULT_CONTEXT, False # Try llama.cpp /slots endpoint first — reports actual serving context if is_local_endpoint(endpoint_url): @@ -297,7 +330,7 @@ def _query_context_length(endpoint_url: str, model: str) -> int: n_ctx = slots[0].get("n_ctx") if n_ctx and isinstance(n_ctx, int) and n_ctx > 0: logger.info(f"llama.cpp /slots reports n_ctx={n_ctx} for {model}") - return n_ctx + return n_ctx, True except Exception: pass @@ -309,7 +342,8 @@ def _query_context_length(endpoint_url: str, model: str) -> int: if is_copilot_base(endpoint_url): if known: logger.info(f"Using known context window for {model}: {known}") - return known or DEFAULT_CONTEXT + return known, True + return DEFAULT_CONTEXT, False from src.endpoint_resolver import build_models_url @@ -354,18 +388,18 @@ def _query_context_length(endpoint_url: str, model: str) -> int: _is_local = is_local_endpoint(endpoint_url) if _is_local and api_ctx < known: logger.info(f"Local endpoint reports {api_ctx} for {model} (known max: {known}) — using API value") - return api_ctx + return api_ctx, True result = max(api_ctx, known) if api_ctx < known: logger.info(f"API reported {api_ctx} for {model}, using known {known} instead") - return result + return result, True if api_ctx: - return api_ctx + return api_ctx, True if known: logger.info(f"Using known context window for {model}: {known}") - return known + return known, True - return DEFAULT_CONTEXT + return DEFAULT_CONTEXT, False def estimate_tokens(messages: List[Dict]) -> int: diff --git a/src/settings.py b/src/settings.py index f305355dc..39c65088d 100644 --- a/src/settings.py +++ b/src/settings.py @@ -101,14 +101,22 @@ DEFAULT_SETTINGS = { "research_run_timeout_seconds": 1800, "agent_max_tool_calls": 0, "agent_max_rounds": 20, # per-message agent step cap (clamped 1..200) + # Soft input-token budget for the agent loop. The DEFAULT value (6000) is the + # "auto" sentinel: it means "scale the budget to the model's context window" + # (#1230) — so long-context models aren't capped at 6000. Set ANY OTHER value + # to enforce an explicit cap (clamped to the window only — hard_max does not + # apply to explicit budgets, #1230); set 0 to disable soft-trimming. The + # default is treated as auto because the settings-save path materializes + # defaults, so a persisted 6000 can't be told apart from a deliberate 6000 — + # to pin a budget near the default, use a nearby value (e.g. 5999). "agent_input_token_budget": 6000, - # Ceiling on the *auto-derived* input budget that #1230 introduced. Has - # no effect when `agent_input_token_budget` is explicitly set (the user's - # value is honoured regardless). Default matches - # `src.context_budget.DEFAULT_HARD_MAX`; lower this for cost-paranoid - # setups, raise it on premium APIs with very large windows that you + # Ceiling on the *auto-derived* input budget; a configurable setting since #1273 + # (the merged #1230 left it a module constant). No effect on an explicit budget + # — a deliberate value is honoured (#1230). Default matches + # `src.context_budget.DEFAULT_HARD_MAX`; lower this for + # cost-paranoid setups, raise it on premium APIs with very large windows you # want to actually use (e.g. 900_000 to fill a 1M-context model). See - # `compute_input_token_budget` in src/context_budget.py. + # `compute_input_token_budget`. "agent_input_token_hard_max": 200_000, "agent_stream_timeout_seconds": 300, # Extra directory roots that read_file / write_file may access, in @@ -223,8 +231,10 @@ def is_setting_overridden(key: str) -> bool: ``load_settings`` merges DEFAULT_SETTINGS with the saved file, so a value equal to its default is indistinguishable from "never set" via get_setting. - Callers that need to treat an explicit user choice differently from the - default (e.g. adaptive budgets) use this to read the raw saved file. + Callers that must distinguish an explicit user choice from a default read + the raw saved file via this. (Note: a materialized default is also "present", + so value-sensitive callers should compare against the default — see + ``context_budget.budget_is_explicit``.) """ try: with open(SETTINGS_FILE, "r", encoding="utf-8") as f: diff --git a/tests/test_budget_auto_sentinel.py b/tests/test_budget_auto_sentinel.py new file mode 100644 index 000000000..ccd127e8e --- /dev/null +++ b/tests/test_budget_auto_sentinel.py @@ -0,0 +1,111 @@ +"""Agent input-token budget contract (review on #4122). + +- The DEFAULT value is the AUTO sentinel: it scales to the model's context window. + Any non-default value is an explicit cap. A materialized default 6000 can't be + told apart from a deliberate 6000 (the settings-save path persists defaults), so + the default reads as auto — pin a cap with a nearby value (e.g. 5999). +- Auto-scaling only trusts a DISCOVERED context window; a bare DEFAULT_CONTEXT + fallback stays conservative instead of scaling off an unproven window. +""" + +import json +from unittest.mock import patch + +import src.settings as settings +import src.model_context as mc +from src.context_budget import compute_input_token_budget, DEFAULT_BUDGET, budget_is_explicit + + +def test_default_value_is_the_auto_sentinel(): + # The settings default equals DEFAULT_BUDGET, so the agent loop (which compares + # the configured value to DEFAULT_BUDGET) treats the default as "auto". + assert settings.DEFAULT_SETTINGS["agent_input_token_budget"] == DEFAULT_BUDGET + + +def test_saving_an_unrelated_setting_does_not_re_cap_the_budget(tmp_path, monkeypatch): + """End-to-end regression (WGlynn, #4121): changing ANY setting makes the + settings-save path persist the merged dict, which materializes the budget + default into settings.json. The budget must still AUTO-SCALE — it must not be + re-read as an explicit 6000 cap. This locks the exact reopening shut. + """ + settings_file = tmp_path / "settings.json" + monkeypatch.setattr(settings, "SETTINGS_FILE", str(settings_file)) + settings._settings_cache = None + + # Simulate a real settings save: a handler loads the merged dict (defaults + + # saved) and persists it after the user changes one *unrelated* setting. + merged = settings.load_settings() + merged["search_result_count"] = 9 # unrelated user change + settings.save_settings(merged) + settings._settings_cache = None + + # The budget default is now physically materialized into the file... + raw = json.loads(settings_file.read_text()) + assert raw["agent_input_token_budget"] == DEFAULT_BUDGET + assert raw["search_result_count"] == 9 + + # ...yet it must read as AUTO (value == default), not an explicit cap — even + # though is_setting_overridden would report True for it now. + assert settings.is_setting_overridden("agent_input_token_budget") is True + soft = int(settings.get_setting("agent_input_token_budget", DEFAULT_BUDGET) or 0) + assert budget_is_explicit(soft) is False + # And the effective budget scales to the window rather than capping at 6000. + assert compute_input_token_budget(soft, 131072, explicit=budget_is_explicit(soft)) == int(131072 * 0.85) + + +def test_auto_scales_on_a_known_window(): + assert compute_input_token_budget(DEFAULT_BUDGET, 131072, explicit=False) == int(131072 * 0.85) + + +def test_auto_stays_conservative_on_unknown_window(): + # P2 #2: the budget block passes context_length=0 when the window is only a + # fallback, so auto-scaling must NOT inflate to the unproven window. + assert compute_input_token_budget(DEFAULT_BUDGET, 0, explicit=False) == DEFAULT_BUDGET + + +def test_nondefault_value_is_an_explicit_cap(): + assert compute_input_token_budget(20000, 131072, explicit=True) == 20000 # honoured + assert compute_input_token_budget(200000, 32000, explicit=True) == 32000 # clamped to window + + +def test_get_context_length_known_surfaces_endpoint_proven_vs_fallback(): + mc._context_cache.clear() + with patch.object(mc, "_query_context_length", return_value=(131072, True)): + assert mc.get_context_length_known("http://proven/v1", "m1") == (131072, True) + mc._context_cache.clear() + with patch.object(mc, "_query_context_length", return_value=(mc.DEFAULT_CONTEXT, False)): + ctx, known = mc.get_context_length_known("http://unknown/v1", "m2") + assert ctx == mc.DEFAULT_CONTEXT and known is False + # get_context_length keeps its plain-int contract for existing callers + mc._context_cache.clear() + with patch.object(mc, "_query_context_length", return_value=(64000, True)): + assert mc.get_context_length("http://proven/v1", "m3") == 64000 + + +def test_budget_context_binds_known_flag_to_its_own_value(): + """Regression (RaresKeY, #4122): scale the budget off the value the `known` + flag actually proves — never a stale/missing context_length from a different + lookup. Covers the local-restaleness case (fresh proven value beats a stale + fallback) and the no-arg-caller case (discovers a long window despite fallback=0). + """ + # unknown / bare fallback -> 0 (don't scale off an unproven window) + with patch.object(mc, "get_context_length_known", return_value=(128000, False)): + assert mc.budget_context_for_model("u", "m", fallback=128000) == 0 + # known -> the freshly-proven value, NOT the (stale) fallback the caller passed + with patch.object(mc, "get_context_length_known", return_value=(4096, True)): + assert mc.budget_context_for_model("u", "m", fallback=128000) == 4096 + # no-arg caller (fallback=0) still gets the discovered long window + with patch.object(mc, "get_context_length_known", return_value=(131072, True)): + assert mc.budget_context_for_model("u", "m", fallback=0) == 131072 + # probe error -> caller's fallback (prior behaviour) + with patch.object(mc, "get_context_length_known", side_effect=RuntimeError): + assert mc.budget_context_for_model("u", "m", fallback=4096) == 4096 + + +def test_no_arg_caller_scales_from_discovered_window_not_6000(): + """End-to-end of the fix: a caller that passes no context_length (scheduled + tasks, teacher escalation, ...) but whose endpoint reports 131072 now scales to + ~111k instead of being capped at the conservative 6000.""" + with patch.object(mc, "get_context_length_known", return_value=(131072, True)): + ctx = mc.budget_context_for_model("u", "m", fallback=0) + assert compute_input_token_budget(DEFAULT_BUDGET, ctx, explicit=False) == int(131072 * 0.85) diff --git a/tests/test_context_budget.py b/tests/test_context_budget.py index 2c97b4780..eec8d046e 100644 --- a/tests/test_context_budget.py +++ b/tests/test_context_budget.py @@ -47,11 +47,11 @@ def test_is_setting_overridden_reads_raw_saved_file(tmp_path, monkeypatch): # --------------------------------------------------------------------------- -# Configurable hard_max — completes the reviewer requirement from #1190 that -# was carried over but not implemented in #1230: the ceiling on the auto- -# derived path should be a setting, not a hidden constant. Without this, -# admins on premium APIs with very large windows (1M+ context) can only -# raise the ceiling by editing src/context_budget.py. +# Configurable hard_max — the ceiling on the auto-derived path is a setting +# (`agent_input_token_hard_max`), not a hidden constant. History: a reviewer +# required it on #1190, the merged #1230 shipped without it, and #1273 added it. +# This test pins the function-level override (the `hard_max` parameter); without +# a raisable ceiling, admins on 1M+ context APIs would be stuck at the 200K default. # --------------------------------------------------------------------------- def test_custom_hard_max_overrides_default_in_auto_branch(): diff --git a/tests/test_context_cache_per_endpoint.py b/tests/test_context_cache_per_endpoint.py index efabea46a..c96c605a6 100644 --- a/tests/test_context_cache_per_endpoint.py +++ b/tests/test_context_cache_per_endpoint.py @@ -13,7 +13,7 @@ def _setup(monkeypatch, windows): """windows: {endpoint_url: context_length}. Force the remote path.""" monkeypatch.setattr(mc, "is_local_endpoint", lambda url: False) monkeypatch.setattr(mc, "_configured_endpoint_kind", lambda url: "api") - monkeypatch.setattr(mc, "_query_context_length", lambda url, model: windows[url]) + monkeypatch.setattr(mc, "_query_context_length", lambda url, model: (windows[url], True)) mc._context_cache.clear() @@ -34,6 +34,6 @@ def test_cache_hit_still_works_per_endpoint(monkeypatch): # Both endpoints are now cached under their own key; flip the underlying # query to prove subsequent reads come from the per-endpoint cache, not a re-query. - monkeypatch.setattr(mc, "_query_context_length", lambda url, model: 999) + monkeypatch.setattr(mc, "_query_context_length", lambda url, model: (999, True)) assert mc.get_context_length(a, "shared-model") == 8000 assert mc.get_context_length(b, "shared-model") == 200000 diff --git a/tests/test_llama_server_models_url.py b/tests/test_llama_server_models_url.py index 36c49714a..45f55d429 100644 --- a/tests/test_llama_server_models_url.py +++ b/tests/test_llama_server_models_url.py @@ -51,7 +51,7 @@ def test_model_context_queries_models_for_v1_base(monkeypatch): monkeypatch.setattr(model_context.httpx, "get", fake_get) - assert model_context._query_context_length("http://127.0.0.1:8080/v1", "qwen3") == 32768 + assert model_context._query_context_length("http://127.0.0.1:8080/v1", "qwen3") == (32768, True) assert seen == [ "http://127.0.0.1:8080/slots", "http://127.0.0.1:8080/v1/models", diff --git a/tests/test_model_context.py b/tests/test_model_context.py index ba6556a44..606b1be7a 100644 --- a/tests/test_model_context.py +++ b/tests/test_model_context.py @@ -192,7 +192,7 @@ class TestGetContextLength: def fake_query(endpoint_url, model): calls.append((endpoint_url, model)) - return 8192 if len(calls) == 1 else 27000 + return (8192, True) if len(calls) == 1 else (27000, True) monkeypatch.setattr(model_context, "_query_context_length", fake_query) @@ -211,7 +211,7 @@ class TestGetContextLength: def fake_query(endpoint_url, model): calls.append((endpoint_url, model)) - return 200000 if len(calls) == 1 else 12345 + return (200000, True) if len(calls) == 1 else (12345, True) monkeypatch.setattr(model_context, "_query_context_length", fake_query)