fix(model-context): count tool_calls in estimate_tokens so compaction sees real size (#2751)

2026-06-16 09:45:24 -04:00 · 2026-06-05 13:56:54 +00:00
parent 8354948a1c
commit 6973c5427c
2 changed files with 68 additions and 1 deletions
@@ -357,7 +357,11 @@ def estimate_tokens(messages: List[Dict]) -> int:

    Uses chars * 0.3 which is closer to real BPE tokenizer output
    than the commonly-cited chars/4 (which underestimates by ~20-30%).
-    Also adds ~4 tokens per message for role/formatting overhead.
+    Also adds ~4 tokens per message for role/formatting overhead, and counts
+    assistant tool_calls (name + arguments) — a tool-only turn carries
+    content=None with the real payload in tool_calls, so ignoring them made the
+    estimate (and the compaction/trim gates that rely on it) blind to large
+    tool arguments.
    """
    total = 0
    for msg in messages:
@@ -369,4 +373,20 @@ def estimate_tokens(messages: List[Dict]) -> int:
            for item in content:
                if isinstance(item, dict) and item.get("type") == "text":
                    total += int(len(item.get("text", "")) * 0.3)
+        # Tool calls carry real payload too: a tool-only assistant turn is stored
+        # with content=None and the actual args (e.g. a create_document body) in
+        # tool_calls[].function.arguments. Ignoring them made large tool arguments
+        # read as ~0 tokens, so the compaction/trim gates missed genuine overflow.
+        tool_calls = msg.get("tool_calls")
+        if isinstance(tool_calls, list):
+            for tc in tool_calls:
+                if not isinstance(tc, dict):
+                    continue
+                fn = tc.get("function") if isinstance(tc.get("function"), dict) else tc
+                name = fn.get("name", "") or ""
+                args = fn.get("arguments", "") or ""
+                if not isinstance(args, str):
+                    args = str(args)  # some shapes store arguments as a dict
+                total += 4  # per tool-call overhead (id, type, wrapper)
+                total += int((len(str(name)) + len(args)) * 0.3)
    return total
@@ -0,0 +1,47 @@
+"""Issue #2748 — estimate_tokens must count assistant tool_calls (name + arguments).
+
+A tool-only assistant turn is stored with content=None and the real payload (e.g.
+a large create_document body) in tool_calls[].function.arguments. Before this fix
+estimate_tokens ignored tool_calls, so such a turn counted as ~4 tokens and the
+compaction/trim gates that rely on estimate_tokens silently missed real context
+overflow, letting the upstream call 400 with 'context length exceeded'.
+"""
+
+from src.model_context import estimate_tokens
+
+
+def test_tool_call_arguments_are_counted():
+    big = "x" * 40000  # ~ a large create_document body
+    msg = {
+        "role": "assistant",
+        "content": None,
+        "tool_calls": [
+            {"id": "c1", "type": "function",
+             "function": {"name": "create_document", "arguments": big}},
+        ],
+    }
+    est = estimate_tokens([msg])
+    # ~40k chars * 0.3 ≈ 12000, vs the old ~4 that ignored tool_calls entirely.
+    assert est > 10000, est
+
+
+def test_content_only_message_is_unchanged():
+    # No tool_calls -> identical to the previous behaviour (content*0.3 + overhead).
+    msg = {"role": "user", "content": "x" * 100}
+    assert estimate_tokens([msg]) == 4 + int(100 * 0.3)
+
+
+def test_dict_arguments_are_handled():
+    # Some shapes store arguments as a dict rather than a JSON string.
+    msg = {
+        "role": "assistant",
+        "content": None,
+        "tool_calls": [{"function": {"name": "f", "arguments": {"path": "x" * 1000}}}],
+    }
+    assert estimate_tokens([msg]) > 200
+
+
+def test_empty_and_malformed_tool_calls_are_safe():
+    # tool_calls=None and non-dict entries must not raise and must not inflate.
+    assert estimate_tokens([{"role": "assistant", "content": "hi", "tool_calls": None}]) == 4 + int(2 * 0.3)
+    assert estimate_tokens([{"role": "assistant", "content": None, "tool_calls": ["bad", 5]}]) == 4