diff --git a/src/agent_loop.py b/src/agent_loop.py index 7a626fb7d..b358f6a00 100644 --- a/src/agent_loop.py +++ b/src/agent_loop.py @@ -1437,6 +1437,18 @@ def build_active_plan_note(approved_plan: str) -> str: ) +def _detect_runaway_call(call_freq, threshold=15): + """Tool name of a call signature repeated >= ``threshold`` times — a real + runaway loop. Counts IDENTICAL repeated calls (same tool AND args), so a + legitimate batch of distinct calls to one tool (e.g. creating 18 calendar + events at once) is NOT flagged. Returns ``None`` when nothing is runaway. + + ``call_freq`` is a Counter keyed by ``"{tool_type}:{content[:120]}"``. + """ + sig = next((s for s, n in call_freq.items() if n >= threshold), None) + return sig.split(":", 1)[0] if sig else None + + async def stream_agent_loop( endpoint_url: str, model: str, @@ -1774,7 +1786,10 @@ async def stream_agent_loop( # signatures + consecutive no-text tool rounds to bail early. _recent_call_sigs = collections.deque(maxlen=6) _stuck_rounds = 0 - _tool_type_counts: collections.Counter = collections.Counter() + # Frequency of each exact call signature (tool + args), for the runaway + # backstop. Counting identical repeats — not distinct same-tool calls — + # lets a legit batch (e.g. 18 calendar events at once) through. + _call_freq: collections.Counter = collections.Counter() _THINK_RE = re.compile(r'.*?', re.DOTALL | re.IGNORECASE) _force_answer = False # set by loop-breaker → next round runs with NO tools # Supervisor: how many times we've nudged the model after it announced @@ -2221,7 +2236,7 @@ async def stream_agent_loop( _is_repeat = _sig in _recent_call_sigs _recent_call_sigs.append(_sig) for _b in tool_blocks: - _tool_type_counts[_b.tool_type] += 1 + _call_freq[f"{_b.tool_type}:{(_b.content or '').strip()[:120]}"] += 1 # "Real" answer text = round text minus blocks. Empty-think # rounds (just "\n\n" + a tool call) must not read as # progress, so strip think before checking. 
@@ -2232,9 +2247,12 @@ async def stream_agent_loop( _stuck_rounds += 1 else: _stuck_rounds = 0 - _runaway = next((t for t, n in _tool_type_counts.items() if n >= 15), None) + # Runaway = the SAME exact call repeated an absurd number of times. + # Distinct calls to one tool (a real batch) are legitimate work, so we + # count identical call signatures, not raw per-tool-type totals. + _runaway = _detect_runaway_call(_call_freq) if _stuck_rounds >= 4 or _runaway: - reason = (f"calling {_runaway} over and over" if _runaway + reason = (f"calling {_runaway} with identical arguments over and over" if _runaway else "repeating the same tool calls without new progress") logger.warning(f"[agent] loop-breaker tripped on round {round_num} ({reason}); sig={_sig[:80]!r}") # The model has been executing tools, so its results are already diff --git a/tests/test_loop_breaker_runaway.py b/tests/test_loop_breaker_runaway.py new file mode 100644 index 000000000..dbea4d31f --- /dev/null +++ b/tests/test_loop_breaker_runaway.py @@ -0,0 +1,61 @@ +"""Regression test for the agent loop-breaker's runaway backstop. + +A legitimate batch of DISTINCT tool calls (e.g. creating 18 calendar events at +once) must not be flagged as a runaway loop. Only the SAME exact call repeated +an absurd number of times is a real runaway. Previously the backstop counted +per-tool-type totals, so any batch of >=15 distinct calls to one tool was +aborted and the calls were silently discarded. +""" +import sys +import collections +from unittest.mock import MagicMock + +# Mock heavy deps so importing src.agent_loop doesn't load the full app stack. 
# Modules replaced with MagicMock stand-ins before src.agent_loop is imported.
_MOCKED = [
    'sqlalchemy', 'sqlalchemy.orm', 'sqlalchemy.ext', 'sqlalchemy.ext.declarative',
    'sqlalchemy.ext.hybrid', 'sqlalchemy.sql', 'sqlalchemy.sql.expression',
    'src.database', 'src.agent_tools', 'core.models', 'core.database',
]
for _name in _MOCKED:
    # setdefault leaves any module that is already loaded untouched.
    sys.modules.setdefault(_name, MagicMock())

# Deliberately imported after the stand-ins are registered.
from src.agent_loop import _detect_runaway_call


def _freq(sigs):
    """Tally how many times each call signature occurs in ``sigs``."""
    # Feeding a generator keeps per-element counting for any iterable.
    return collections.Counter(sig for sig in sigs)


def test_distinct_batch_is_not_runaway():
    """18 distinct create_event calls (the "add 18 birthdays" case) pass."""
    birthdays = []
    for n in range(18):
        birthdays.append(
            f'manage_calendar:{{"action":"create_event","summary":"Birthday {n}"}}'
        )
    verdict = _detect_runaway_call(_freq(birthdays))
    assert verdict is None


def test_many_distinct_same_tool_is_not_runaway():
    """Thirty different calls to one tool are not a runaway loop."""
    echoes = [f'bash:echo {i}' for i in range(30)]
    verdict = _detect_runaway_call(_freq(echoes))
    assert verdict is None


def test_identical_call_repeated_is_runaway():
    """The same signature seen 15 times reports its tool name."""
    repeated = ['manage_calendar:{"action":"list_events"}'] * 15
    verdict = _detect_runaway_call(_freq(repeated))
    assert verdict == 'manage_calendar'


def test_below_threshold_is_not_runaway():
    """Fourteen identical calls stay under the default threshold."""
    repeated = ['bash:ls'] * 14
    verdict = _detect_runaway_call(_freq(repeated))
    assert verdict is None


def test_threshold_is_configurable():
    """A custom threshold decides whether five repeats are flagged."""
    tally = _freq(['web_search:python'] * 5)
    assert _detect_runaway_call(tally, threshold=5) == 'web_search'
    assert _detect_runaway_call(tally, threshold=6) is None


def test_empty_is_not_runaway():
    """An empty tally reports nothing."""
    empty_tally = collections.Counter()
    assert _detect_runaway_call(empty_tally) is None