From 6973c5427c07c36493bd031da414f6e72a7a32cf Mon Sep 17 00:00:00 2001 From: nubs Date: Fri, 5 Jun 2026 13:56:54 +0000 Subject: [PATCH] fix(model-context): count tool_calls in estimate_tokens so compaction sees real size (#2751) --- src/model_context.py | 22 ++++++++++- tests/test_estimate_tokens_tool_calls.py | 47 ++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 tests/test_estimate_tokens_tool_calls.py diff --git a/src/model_context.py b/src/model_context.py index 3a445fe7b..c71d76fcf 100644 --- a/src/model_context.py +++ b/src/model_context.py @@ -357,7 +357,11 @@ def estimate_tokens(messages: List[Dict]) -> int: Uses chars * 0.3 which is closer to real BPE tokenizer output than the commonly-cited chars/4 (which underestimates by ~20-30%). - Also adds ~4 tokens per message for role/formatting overhead. + Also adds ~4 tokens per message for role/formatting overhead, and counts + assistant tool_calls (name + arguments) — a tool-only turn carries + content=None with the real payload in tool_calls, so ignoring them made the + estimate (and the compaction/trim gates that rely on it) blind to large + tool arguments. """ total = 0 for msg in messages: @@ -369,4 +373,20 @@ def estimate_tokens(messages: List[Dict]) -> int: for item in content: if isinstance(item, dict) and item.get("type") == "text": total += int(len(item.get("text", "")) * 0.3) + # Tool calls carry real payload too: a tool-only assistant turn is stored + # with content=None and the actual args (e.g. a create_document body) in + # tool_calls[].function.arguments. Ignoring them made large tool arguments + # read as ~0 tokens, so the compaction/trim gates missed genuine overflow. + tool_calls = msg.get("tool_calls") + if isinstance(tool_calls, list): + for tc in tool_calls: + if not isinstance(tc, dict): + continue + fn = tc.get("function") if isinstance(tc.get("function"), dict) else tc + name = fn.get("name", "") or "" + args = fn.get("arguments", "") or "" + if not isinstance(args, str): + args = str(args) # some shapes store arguments as a dict + total += 4 # per tool-call overhead (id, type, wrapper) + total += int((len(str(name)) + len(args)) * 0.3) return total diff --git a/tests/test_estimate_tokens_tool_calls.py b/tests/test_estimate_tokens_tool_calls.py new file mode 100644 index 000000000..39c890f5b --- /dev/null +++ b/tests/test_estimate_tokens_tool_calls.py @@ -0,0 +1,47 @@ +"""Issue #2748 — estimate_tokens must count assistant tool_calls (name + arguments). + +A tool-only assistant turn is stored with content=None and the real payload (e.g. +a large create_document body) in tool_calls[].function.arguments. Before this fix +estimate_tokens ignored tool_calls, so such a turn counted as ~4 tokens and the +compaction/trim gates that rely on estimate_tokens silently missed real context +overflow, letting the upstream call 400 with 'context length exceeded'. +""" + +from src.model_context import estimate_tokens + + +def test_tool_call_arguments_are_counted(): + big = "x" * 40000 # ~ a large create_document body + msg = { + "role": "assistant", + "content": None, + "tool_calls": [ + {"id": "c1", "type": "function", + "function": {"name": "create_document", "arguments": big}}, + ], + } + est = estimate_tokens([msg]) + # ~40k chars * 0.3 ≈ 12000, vs the old ~4 that ignored tool_calls entirely. + assert est > 10000, est + + +def test_content_only_message_is_unchanged(): + # No tool_calls -> identical to the previous behaviour (content*0.3 + overhead). + msg = {"role": "user", "content": "x" * 100} + assert estimate_tokens([msg]) == 4 + int(100 * 0.3) + + +def test_dict_arguments_are_handled(): + # Some shapes store arguments as a dict rather than a JSON string. + msg = { + "role": "assistant", + "content": None, + "tool_calls": [{"function": {"name": "f", "arguments": {"path": "x" * 1000}}}], + } + assert estimate_tokens([msg]) > 200 + + +def test_empty_and_malformed_tool_calls_are_safe(): + # tool_calls=None and non-dict entries must not raise and must not inflate. + assert estimate_tokens([{"role": "assistant", "content": "hi", "tool_calls": None}]) == 4 + int(2 * 0.3) + assert estimate_tokens([{"role": "assistant", "content": None, "tool_calls": ["bad", 5]}]) == 4