From 6973c5427c07c36493bd031da414f6e72a7a32cf Mon Sep 17 00:00:00 2001
From: nubs <nubs@nubs.site>
Date: Fri, 5 Jun 2026 13:56:54 +0000
Subject: [PATCH] fix(model-context): count tool_calls in estimate_tokens so
 compaction sees real size (#2751)

---
 src/model_context.py                     | 22 ++++++++++-
 tests/test_estimate_tokens_tool_calls.py | 47 ++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_estimate_tokens_tool_calls.py

diff --git a/src/model_context.py b/src/model_context.py
index 3a445fe7b..c71d76fcf 100644
--- a/src/model_context.py
+++ b/src/model_context.py
@@ -357,7 +357,11 @@ def estimate_tokens(messages: List[Dict]) -> int:
 
     Uses chars * 0.3 which is closer to real BPE tokenizer output
     than the commonly-cited chars/4 (which underestimates by ~20-30%).
-    Also adds ~4 tokens per message for role/formatting overhead.
+    Also adds ~4 tokens per message for role/formatting overhead. Each entry in
+    tool_calls adds ~4 tokens of overhead plus (name + arguments) chars * 0.3:
+    a tool-only turn carries content=None with the real payload in tool_calls,
+    so without this the estimate (and the compaction/trim gates that rely on
+    it) would be blind to large tool arguments.
     """
     total = 0
     for msg in messages:
@@ -369,4 +373,20 @@ def estimate_tokens(messages: List[Dict]) -> int:
             for item in content:
                 if isinstance(item, dict) and item.get("type") == "text":
                     total += int(len(item.get("text", "")) * 0.3)
+        # Tool calls carry real payload too: a tool-only assistant turn is stored
+        # with content=None and the actual args (e.g. a create_document body) in
+        # tool_calls[].function.arguments. Skipping them would make large tool
+        # arguments read as ~0 tokens and hide overflow from compaction/trim gates.
+        tool_calls = msg.get("tool_calls")
+        if isinstance(tool_calls, list):
+            for tc in tool_calls:
+                if not isinstance(tc, dict):
+                    continue
+                fn = tc.get("function") if isinstance(tc.get("function"), dict) else tc
+                name = fn.get("name", "") or ""
+                args = fn.get("arguments", "") or ""
+                if not isinstance(args, str):
+                    args = str(args)  # some shapes store arguments as a dict
+                total += 4  # per tool-call overhead (id, type, wrapper)
+                total += int((len(str(name)) + len(args)) * 0.3)
     return total
diff --git a/tests/test_estimate_tokens_tool_calls.py b/tests/test_estimate_tokens_tool_calls.py
new file mode 100644
index 000000000..39c890f5b
--- /dev/null
+++ b/tests/test_estimate_tokens_tool_calls.py
@@ -0,0 +1,47 @@
+"""Issue #2748 — estimate_tokens must count assistant tool_calls (name + arguments).
+
+A tool-only assistant turn is stored with content=None and the real payload (e.g.
+a large create_document body) in tool_calls[].function.arguments. Before this fix
+estimate_tokens ignored tool_calls, so such a turn counted as ~4 tokens and the
+compaction/trim gates that rely on estimate_tokens silently missed real context
+overflow, letting the upstream call 400 with 'context length exceeded'.
+"""
+
+from src.model_context import estimate_tokens
+
+
+def test_tool_call_arguments_are_counted():
+    big = "x" * 40000  # ~ a large create_document body
+    msg = {
+        "role": "assistant",
+        "content": None,
+        "tool_calls": [
+            {"id": "c1", "type": "function",
+             "function": {"name": "create_document", "arguments": big}},
+        ],
+    }
+    est = estimate_tokens([msg])
+    # ~40k chars * 0.3 ≈ 12000, vs the old ~4 that ignored tool_calls entirely.
+    assert est > 10000, est
+
+
+def test_content_only_message_is_unchanged():
+    # No tool_calls -> identical to the previous behaviour (content*0.3 + overhead).
+    msg = {"role": "user", "content": "x" * 100}
+    assert estimate_tokens([msg]) == 4 + int(100 * 0.3)
+
+
+def test_dict_arguments_are_handled():
+    # Some shapes store arguments as a dict rather than a JSON string.
+    msg = {
+        "role": "assistant",
+        "content": None,
+        "tool_calls": [{"function": {"name": "f", "arguments": {"path": "x" * 1000}}}],
+    }
+    assert estimate_tokens([msg]) > 200
+
+
+def test_empty_and_malformed_tool_calls_are_safe():
+    # tool_calls=None and non-dict entries must not raise and must add nothing beyond the per-message overhead and content.
+    assert estimate_tokens([{"role": "assistant", "content": "hi", "tool_calls": None}]) == 4 + int(2 * 0.3)
+    assert estimate_tokens([{"role": "assistant", "content": None, "tool_calls": ["bad", 5]}]) == 4