fix(agent): don't abort legitimate tool batches as runaway loops (#3183)

The loop-breaker's runaway backstop counted per-tool-type call totals and tripped whenever any tool was used >=15 times — treating 15+ DISTINCT calls to one tool as a stuck loop. A real batch (e.g. "add these 18 birthdays to my calendar" emits 18 distinct manage_calendar create_event calls in one round) got flagged "calling manage_calendar over and over", the calls were discarded (next round tools_sent=0), and 0 events were created. Count IDENTICAL repeated call signatures instead (same tool AND args), via a small, unit-testable _detect_runaway_call() helper. Genuine batches pass; a model truly stuck repeating one call still trips the backstop. Adds a regression test. Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-16 01:35:36 -04:00 · 2026-06-07 17:16:17 +03:00
parent bdf4ec8b24
commit adbcb3763f
2 changed files with 83 additions and 4 deletions
@@ -1437,6 +1437,18 @@ def build_active_plan_note(approved_plan: str) -> str:
    )


+def _detect_runaway_call(call_freq, threshold=15):
+    """Tool name of a call signature repeated >= ``threshold`` times — a real
+    runaway loop. Counts IDENTICAL repeated calls (same tool AND args), so a
+    legitimate batch of distinct calls to one tool (e.g. creating 18 calendar
+    events at once) is NOT flagged. Returns ``None`` when nothing is runaway.
+
+    ``call_freq`` is a Counter keyed by ``"{tool_type}:{content[:120]}"``.
+    """
+    sig = next((s for s, n in call_freq.items() if n >= threshold), None)
+    return sig.split(":", 1)[0] if sig else None
+
+
 async def stream_agent_loop(
    endpoint_url: str,
    model: str,
@@ -1774,7 +1786,10 @@ async def stream_agent_loop(
    # signatures + consecutive no-text tool rounds to bail early.
    _recent_call_sigs = collections.deque(maxlen=6)
    _stuck_rounds = 0
-    _tool_type_counts: collections.Counter = collections.Counter()
+    # Frequency of each exact call signature (tool + args), for the runaway
+    # backstop. Counting identical repeats — not distinct same-tool calls —
+    # lets a legit batch (e.g. 18 calendar events at once) through.
+    _call_freq: collections.Counter = collections.Counter()
    _THINK_RE = re.compile(r'<think>.*?</think>', re.DOTALL | re.IGNORECASE)
    _force_answer = False  # set by loop-breaker → next round runs with NO tools
    # Supervisor: how many times we've nudged the model after it announced
@@ -2221,7 +2236,7 @@ async def stream_agent_loop(
        _is_repeat = _sig in _recent_call_sigs
        _recent_call_sigs.append(_sig)
        for _b in tool_blocks:
-            _tool_type_counts[_b.tool_type] += 1
+            _call_freq[f"{_b.tool_type}:{(_b.content or '').strip()[:120]}"] += 1
        # "Real" answer text = round text minus <think> blocks. Empty-think
        # rounds (just "<think>\n\n</think>" + a tool call) must not read as
        # progress, so strip think before checking.
@@ -2232,9 +2247,12 @@ async def stream_agent_loop(
            _stuck_rounds += 1
        else:
            _stuck_rounds = 0
-        _runaway = next((t for t, n in _tool_type_counts.items() if n >= 15), None)
+        # Runaway = the SAME exact call repeated an absurd number of times.
+        # Distinct calls to one tool (a real batch) are legitimate work, so we
+        # count identical call signatures, not raw per-tool-type totals.
+        _runaway = _detect_runaway_call(_call_freq)
        if _stuck_rounds >= 4 or _runaway:
-            reason = (f"calling {_runaway} over and over" if _runaway
+            reason = (f"calling {_runaway} with identical arguments over and over" if _runaway
                      else "repeating the same tool calls without new progress")
            logger.warning(f"[agent] loop-breaker tripped on round {round_num} ({reason}); sig={_sig[:80]!r}")
            # The model has been executing tools, so its results are already