fix(model-context): count tool_calls in estimate_tokens so compaction sees real size (#2751)

2026-06-17 10:15:27 -04:00 · 2026-06-05 13:56:54 +00:00
parent 8354948a1c
commit 6973c5427c
2 changed files with 68 additions and 1 deletions
@@ -357,7 +357,11 @@ def estimate_tokens(messages: List[Dict]) -> int:
    Uses chars * 0.3 which is closer to real BPE tokenizer output
    than the commonly-cited chars/4 (which underestimates by ~20-30%).
-    Also adds ~4 tokens per message for role/formatting overhead.
+    Also adds ~4 tokens per message for role/formatting overhead, and counts
    assistant tool_calls (name + arguments) — a tool-only turn carries
    content=None with the real payload in tool_calls, so ignoring them made the
    estimate (and the compaction/trim gates that rely on it) blind to large
    tool arguments.
    """
    total = 0
    for msg in messages:
@@ -369,4 +373,20 @@ def estimate_tokens(messages: List[Dict]) -> int:
            for item in content:
                if isinstance(item, dict) and item.get("type") == "text":
                    total += int(len(item.get("text", "")) * 0.3)
        # Tool calls carry real payload too: a tool-only assistant turn is stored
        # with content=None and the actual args (e.g. a create_document body) in
        # tool_calls[].function.arguments. Ignoring them made large tool arguments
        # read as ~0 tokens, so the compaction/trim gates missed genuine overflow.
        tool_calls = msg.get("tool_calls")
        if isinstance(tool_calls, list):
            for tc in tool_calls:
                if not isinstance(tc, dict):
                    continue
                fn = tc.get("function") if isinstance(tc.get("function"), dict) else tc
                name = fn.get("name", "") or ""
                args = fn.get("arguments", "") or ""
                if not isinstance(args, str):
                    args = str(args)  # some shapes store arguments as a dict
                total += 4  # per tool-call overhead (id, type, wrapper)
                total += int((len(str(name)) + len(args)) * 0.3)
    return total
@@ -0,0 +1,47 @@
 """Issue #2748 — estimate_tokens must count assistant tool_calls (name + arguments).
 A tool-only assistant turn is stored with content=None and the real payload (e.g.
 a large create_document body) in tool_calls[].function.arguments. Before this fix
 estimate_tokens ignored tool_calls, so such a turn counted as ~4 tokens and the
 compaction/trim gates that rely on estimate_tokens silently missed real context
 overflow, letting the upstream call 400 with 'context length exceeded'.
 """
 from src.model_context import estimate_tokens
 def test_tool_call_arguments_are_counted():
    big = "x" * 40000  # ~ a large create_document body
    msg = {
        "role": "assistant",
        "content": None,
        "tool_calls": [
            {"id": "c1", "type": "function",
             "function": {"name": "create_document", "arguments": big}},
        ],
    }
    est = estimate_tokens([msg])
    # ~40k chars * 0.3 ≈ 12000, vs the old ~4 that ignored tool_calls entirely.
    assert est > 10000, est
 def test_content_only_message_is_unchanged():
    # No tool_calls -> identical to the previous behaviour (content*0.3 + overhead).
    msg = {"role": "user", "content": "x" * 100}
    assert estimate_tokens([msg]) == 4 + int(100 * 0.3)
 def test_dict_arguments_are_handled():
    # Some shapes store arguments as a dict rather than a JSON string.
    msg = {
        "role": "assistant",
        "content": None,
        "tool_calls": [{"function": {"name": "f", "arguments": {"path": "x" * 1000}}}],
    }
    assert estimate_tokens([msg]) > 200
 def test_empty_and_malformed_tool_calls_are_safe():
    # tool_calls=None and non-dict entries must not raise and must not inflate.
    assert estimate_tokens([{"role": "assistant", "content": "hi", "tool_calls": None}]) == 4 + int(2 * 0.3)
    assert estimate_tokens([{"role": "assistant", "content": None, "tool_calls": ["bad", 5]}]) == 4