diff --git a/src/agent_loop.py b/src/agent_loop.py index 587f5105f..4283b489e 100644 --- a/src/agent_loop.py +++ b/src/agent_loop.py @@ -1111,7 +1111,7 @@ def _build_base_prompt( -def _resolve_tool_blocks(round_response: str, native_tool_calls: list, round_num: int): +def _resolve_tool_blocks(round_response: str, native_tool_calls: list, round_num: int, is_api_model: bool = False): """Choose native function calls or fenced code block parsing. Returns (tool_blocks, used_native).""" used_native = False if native_tool_calls: @@ -1128,7 +1128,21 @@ def _resolve_tool_blocks(round_response: str, native_tool_calls: list, round_num if tool_blocks: used_native = True if not used_native: - tool_blocks = parse_tool_blocks(round_response) + # Native function-calling models (GPT/Claude/Grok/Qwen3/DeepSeek-V, etc.) + # have a reliable structured channel for real tool invocations. When such + # a model emits no native tool_calls, any ```bash/```python/```json fence + # in its prose is virtually always an illustrative example for the user + # (e.g. "here's the command you'd run"), not an attempted tool call — + # executing it causes accidental runs and clarification loops (#3222). + # + # Gate ONLY that fenced-block pattern for native models, not the whole + # parser: explicit [TOOL_CALL]///DSML markup that + # leaks into content as text is never illustrative — it's a real call + # the model couldn't emit on its structured channel (e.g. DeepSeek-V + # falling back to DSML). Dropping the whole parser would silently lose + # those too. Non-native / textual-only models keep every pattern, + # fenced blocks included, since that's their *only* tool channel. + tool_blocks = parse_tool_blocks(round_response, skip_fenced=is_api_model) if tool_blocks: logger.info(f"Agent round {round_num}: {len(tool_blocks)} fenced tool block(s) detected") @@ -2053,7 +2067,7 @@ async def stream_agent_loop( yield chunk # Intercept [DONE] — don't forward until all rounds finish - tool_blocks, used_native = _resolve_tool_blocks(round_response, native_tool_calls, round_num) + tool_blocks, used_native = _resolve_tool_blocks(round_response, native_tool_calls, round_num, is_api_model=_is_api_model) # Force-answer round: we told the model to STOP calling tools and # answer. If it ignored that and emitted a (possibly DSML) tool @@ -2132,7 +2146,12 @@ async def stream_agent_loop( # Save cleaned round text for history persistence # Keep blocks so they render in the thinking section on reload - cleaned_round = strip_tool_blocks(round_response).strip() + # Mirror the same fenced-pattern gate used to resolve tool_blocks above: + # an illustrative fence that wasn't executed (because this is a native + # model with no real native_tool_calls) must not be stripped from the + # persisted text either — otherwise it streams once and then disappears + # on reload (#3222 follow-up). + cleaned_round = strip_tool_blocks(round_response, skip_fenced=(_is_api_model and not used_native)).strip() round_texts.append(cleaned_round) if not tool_blocks: diff --git a/src/tool_parsing.py b/src/tool_parsing.py index 4d2d8e66b..3f296c2e6 100644 --- a/src/tool_parsing.py +++ b/src/tool_parsing.py @@ -427,7 +427,7 @@ def _parse_tool_code_block(raw: str) -> Optional[ToolBlock]: return None -def parse_tool_blocks(text: str) -> List[ToolBlock]: +def parse_tool_blocks(text: str, skip_fenced: bool = False) -> List[ToolBlock]: """Extract executable tool blocks from LLM response text. Supports multiple formats: @@ -436,6 +436,17 @@ def parse_tool_blocks(text: str) -> List[ToolBlock]: 3. XML-style / blocks 4. blocks (MiniMax-M2.5 style) 5. DeepSeek DSML markup (normalized to first) + + `skip_fenced`: when True, Pattern 1 (fenced ```bash/```python/```json code + blocks) is not matched at all. Native function-calling models (GPT/Claude/ + Grok/Qwen3/DeepSeek-V, etc.) commonly write illustrative fenced examples in + prose; for those models we trust the structured tool_calls channel for real + invocations and treat a bare fence as display text rather than an action + (issue #3222). Patterns 2-5 — explicit [TOOL_CALL]///DSML + markup that leaked into content as text — stay fully active regardless, + since that markup is never an illustrative example and dropping it would + silently lose real calls (e.g. DeepSeek-V falling back to DSML when it + can't emit structured tool_calls). """ blocks = [] @@ -443,30 +454,31 @@ def parse_tool_blocks(text: str) -> List[ToolBlock]: # XML patterns below catch it. text = _normalize_dsml(text) - # Pattern 1: fenced code blocks - for m in _TOOL_BLOCK_RE.finditer(text): - tag = m.group(1).lower() - content = m.group(2).strip() - if not content: - continue - # If a code block's content is an XML call (some models wrap - # tool calls in ```python or ```xml fences), parse the invoke instead. - if ' XML call (some models wrap + # tool calls in ```python or ```xml fences), parse the invoke instead. + if ' markup, not literal code. Whether or + # not any call converted, never fall through to append the raw XML as + # a python/bash block — e.g. a hyphenated/namespaced tool name that + # _XML_INVOKE_RE's \w+ can't match would otherwise be executed as code. + continue + if tag in ("python", "bash"): + block = _parse_misfenced_web_lookup(content) if block: blocks.append(block) - # This fenced block is markup, not literal code. Whether or - # not any call converted, never fall through to append the raw XML as - # a python/bash block — e.g. a hyphenated/namespaced tool name that - # _XML_INVOKE_RE's \w+ can't match would otherwise be executed as code. - continue - if tag in ("python", "bash"): - block = _parse_misfenced_web_lookup(content) - if block: - blocks.append(block) - continue - blocks.append(ToolBlock(tag, content)) + continue + blocks.append(ToolBlock(tag, content)) # Pattern 2: [TOOL_CALL] blocks (only if no fenced blocks found) if not blocks: @@ -500,12 +512,23 @@ def parse_tool_blocks(text: str) -> List[ToolBlock]: return blocks -def strip_tool_blocks(text: str) -> str: - """Remove executable tool blocks from text for clean display.""" +def strip_tool_blocks(text: str, skip_fenced: bool = False) -> str: + """Remove executable tool blocks from text for clean display. + + `skip_fenced`: when True, fenced ```bash/```python/```json code blocks + (Pattern 1) are left intact instead of being stripped. This must mirror + whatever `skip_fenced` value `parse_tool_blocks` was called with for the + same response: if a fence wasn't executed as a tool call (because it's an + illustrative example from a native function-calling model), it shouldn't + vanish from the persisted/displayed text either — otherwise the example + streams once and then disappears on reload (issue #3222 follow-up). + Patterns 2-5 + DSML markup are always stripped, since that markup should + never reach the user regardless of whether it converted to a tool call. + """ # Normalize DSML first so its markup gets stripped by the # / removers below instead of leaking to the user. text = _normalize_dsml(text) - cleaned = _TOOL_BLOCK_RE.sub('', text) + cleaned = text if skip_fenced else _TOOL_BLOCK_RE.sub('', text) cleaned = _TOOL_CALL_RE.sub('', cleaned) cleaned = _XML_TOOL_CALL_RE.sub('', cleaned) cleaned = _TOOL_CODE_RE.sub('', cleaned) diff --git a/tests/test_fenced_example_not_executed_for_native_models.py b/tests/test_fenced_example_not_executed_for_native_models.py new file mode 100644 index 000000000..2b69ebc5b --- /dev/null +++ b/tests/test_fenced_example_not_executed_for_native_models.py @@ -0,0 +1,291 @@ +"""Issue #3222 — native function-calling models (GPT/Claude/Grok/Qwen3/DeepSeek-V, +etc.) must not have ordinary illustrative Markdown fences in their prose +(```bash, ```python, ```json examples written for the user to read) executed +as real tool calls just because the textual fallback parser matches them. + +`_resolve_tool_blocks` in src/agent_loop.py picks native `tool_calls` when the +model emits them, and otherwise used to fall back unconditionally to +`parse_tool_blocks(round_response)` (the fenced-block textual parser). For a +native model that produced no real tool_calls — e.g. a "guide-only" turn where +the model writes an example command for the user to copy — that fallback used +to treat the example fence as an executable action, causing accidental command +execution and multi-round loops. + +The fix: for native function-calling models (`_is_api_model=True`) that emitted +no native tool_calls, skip the textual fenced-block fallback entirely — these +models have a reliable structured channel and a bare fence in their prose is +display text, not an attempted call. Non-native / textual-only models keep the +fallback unchanged, since fenced blocks are their *only* tool channel. + +These tests drive the real `stream_agent_loop` (not just source-text regex +assertions) end-to-end with a mocked LLM stream, and assert on whether +`execute_tool_block` actually gets invoked. +""" +import asyncio +import json + +import src.agent_loop as al + + +def _collect(gen): + async def _run(): + return [c async for c in gen] + return asyncio.run(_run()) + + +def _types(chunks): + out = [] + for c in chunks: + if c.startswith("data: ") and not c.startswith("data: [DONE]"): + try: + out.append(json.loads(c[6:])) + except Exception: + pass + return out + + +def _patch_common(monkeypatch, exec_calls): + # Skip RAG/tool-index, MCP, and settings lookups; keep the real loop body, + # _resolve_tool_blocks, and parse_tool_blocks intact. + monkeypatch.setattr(al, "get_setting", lambda key, default=None: default, raising=False) + monkeypatch.setattr(al, "get_mcp_manager", lambda: None, raising=False) + monkeypatch.setattr(al, "estimate_tokens", lambda *a, **k: 10, raising=False) + + async def _fake_exec(block, *a, **k): + exec_calls.append(block) + return ("bash", {"output": "ok", "exit_code": 0}) + monkeypatch.setattr(al, "execute_tool_block", _fake_exec, raising=False) + + +def _run_loop(monkeypatch, model, deltas, native_calls=None, max_rounds=2, endpoint_url=None): + """Drive stream_agent_loop with a fake LLM stream. + + `deltas` is a list of text chunks streamed for round 1 (and reused for any + further round). `native_calls`, if given, is emitted as a native + `tool_calls` event alongside the round-1 text. + """ + call_count = {"n": 0} + + async def _fake_stream(_candidates, messages, **kwargs): + call_count["n"] += 1 + if call_count["n"] == 1: + for d in deltas: + yield f'data: {json.dumps({"delta": d})}\n\n' + if native_calls: + yield f'data: {json.dumps({"type": "tool_calls", "calls": native_calls})}\n\n' + yield "data: [DONE]\n\n" + else: + # Subsequent rounds: just answer plainly so the loop terminates. + yield f'data: {json.dumps({"delta": "All done, here is your answer."})}\n\n' + yield "data: [DONE]\n\n" + + monkeypatch.setattr(al, "stream_llm_with_fallback", _fake_stream, raising=False) + + gen = al.stream_agent_loop( + endpoint_url or "https://api.openai.com/v1", model, + [{"role": "user", "content": "Do not run anything yet, just show me an example."}], + max_rounds=max_rounds, + relevant_tools={"bash"}, + ) + return _types(_collect(gen)) + + +# --------------------------------------------------------------------------- +# 1. Native model, illustrative ```bash fence, NO native tool_calls +# -> must NOT be executed. +# --------------------------------------------------------------------------- +def test_native_model_illustrative_bash_fence_not_executed(monkeypatch): + exec_calls = [] + _patch_common(monkeypatch, exec_calls) + guide_only = ( + "Here is the command you would run locally:\n\n" + "```bash\nnpm run plan:articles\n```\n\n" + "Just paste that into your terminal — I'm not running it for you." + ) + events = _run_loop(monkeypatch, "gpt-4o", [guide_only]) + assert exec_calls == [], f"illustrative fence should not be executed, but got: {exec_calls}" + # No tool-call/action events should be emitted for this round either. + assert not any(e.get("type") == "tool_call" for e in events), events + + +# --------------------------------------------------------------------------- +# 2. Native model that DOES emit a real native tool_calls entry +# -> that call IS resolved/executed normally (untouched native path). +# --------------------------------------------------------------------------- +def test_native_model_real_native_tool_call_is_executed(monkeypatch): + exec_calls = [] + _patch_common(monkeypatch, exec_calls) + native_calls = [{"name": "bash", "arguments": json.dumps({"command": "echo hi"})}] + events = _run_loop( + monkeypatch, "gpt-4o", + ["Sure, let me check that for you."], + native_calls=native_calls, + max_rounds=2, + ) + assert len(exec_calls) == 1, f"expected the native tool call to execute, got: {exec_calls}" + assert exec_calls[0].tool_type == "bash" + assert "echo hi" in exec_calls[0].content + + +# --------------------------------------------------------------------------- +# 3. Non-native / textual-only model using the legitimate fenced format it +# depends on -> still correctly parsed and executed (regression check). +# --------------------------------------------------------------------------- +def test_non_native_model_fenced_tool_call_still_executed(monkeypatch): + exec_calls = [] + _patch_common(monkeypatch, exec_calls) + # Neither this model name nor this endpoint host match any of the + # native-capable keyword/host checks, so _is_api_model resolves to False + # and the model must rely on the textual fenced-block convention to + # invoke tools at all. + events = _run_loop( + monkeypatch, "llama-2-7b-chat", + ["```bash\necho hi\n```"], + max_rounds=2, + endpoint_url="http://192.168.1.50:8000/v1", + ) + assert len(exec_calls) == 1, f"non-native model's fenced tool call should still execute: {exec_calls}" + assert exec_calls[0].tool_type == "bash" + assert "echo hi" in exec_calls[0].content + + +# --------------------------------------------------------------------------- +# 4. The exact illustrative-fence shape from issue #3222's repro (```bash + +# ```json guide-only examples) run through the real resolution path for a +# native model -> confirm zero tool actions resolved. +# --------------------------------------------------------------------------- +def test_issue_3222_repro_guide_only_response_resolves_no_tool_actions(monkeypatch): + exec_calls = [] + _patch_common(monkeypatch, exec_calls) + repro = ( + "Here is the command you would run locally:\n\n" + "```bash\nnpm run plan:articles\n```\n\n" + "And here is an example config shape:\n\n" + "```json\n" + "{\n" + ' "script": "npm run plan:articles",\n' + ' "mode": "guide-only"\n' + "}\n" + "```\n" + ) + events = _run_loop(monkeypatch, "grok-4", [repro]) + assert exec_calls == [], f"guide-only example fences must resolve to zero tool actions: {exec_calls}" + + +# --------------------------------------------------------------------------- +# Direct unit coverage of _resolve_tool_blocks itself (the real seam the fix +# lives in), complementing the end-to-end checks above. +# --------------------------------------------------------------------------- +def test_resolve_tool_blocks_skips_textual_fallback_for_native_models_with_no_native_calls(): + guide_only = "```bash\nnpm run plan:articles\n```\n```json\n{\"a\": 1}\n```" + blocks, used_native = al._resolve_tool_blocks(guide_only, [], round_num=1, is_api_model=True) + assert blocks == [] + assert used_native is False + + +def test_resolve_tool_blocks_keeps_textual_fallback_for_non_native_models(): + text = "```bash\necho hi\n```" + blocks, used_native = al._resolve_tool_blocks(text, [], round_num=1, is_api_model=False) + assert len(blocks) == 1 + assert blocks[0].tool_type == "bash" + assert used_native is False + + +def test_resolve_tool_blocks_native_path_untouched_when_native_calls_present(): + native_calls = [{"name": "bash", "arguments": json.dumps({"command": "echo hi"})}] + blocks, used_native = al._resolve_tool_blocks("some prose", native_calls, round_num=1, is_api_model=True) + assert used_native is True + assert len(blocks) == 1 + assert blocks[0].tool_type == "bash" + + +# --------------------------------------------------------------------------- +# Booyaka101's review on #3356: short-circuiting the *whole* parser for native +# models (`tool_blocks = [] if is_api_model else parse_tool_blocks(...)`) also +# silently dropped explicit [TOOL_CALL]///DSML markup that +# leaked into content as text — a real regression for e.g. DeepSeek-V falling +# back to DSML when it can't emit structured tool_calls. The fix gates ONLY +# the fenced-code pattern (via `skip_fenced=`) so Patterns 2-5 stay active. +# --------------------------------------------------------------------------- +from src.tool_parsing import parse_tool_blocks, strip_tool_blocks # noqa: E402 + + +def test_skip_fenced_still_recovers_xml_invoke_markup(): + leaked = ( + "Sure, I'll look that up.\n" + 'latest python release' + ) + blocks = parse_tool_blocks(leaked, skip_fenced=True) + assert len(blocks) == 1 + assert blocks[0].tool_type == "web_search" + assert "latest python release" in blocks[0].content + + +def test_skip_fenced_still_recovers_dsml_markup(): + dsml = ( + "Let me search for that.\n" + "<||DSML||tool_calls>" + '<||DSML||invoke name="web_search">' + '<||DSML||parameter name="query" string="true">latest python release' + "" + "" + ) + blocks = parse_tool_blocks(dsml, skip_fenced=True) + assert len(blocks) == 1 + assert blocks[0].tool_type == "web_search" + assert "latest python release" in blocks[0].content + + +def test_skip_fenced_ignores_only_the_fenced_pattern(): + text = "```bash\nnpm run plan:articles\n```" + assert parse_tool_blocks(text, skip_fenced=True) == [] + assert len(parse_tool_blocks(text, skip_fenced=False)) == 1 + + +def test_resolve_tool_blocks_recovers_invoke_markup_for_native_model_with_no_native_calls(): + """End-to-end: a native model (is_api_model=True) that emitted no + structured tool_calls but leaked an call into its text content + must still have that real call recovered — not dropped alongside the + fenced-example gating.""" + leaked = ( + "I'll search for that now.\n" + 'odysseus changelog' + ) + blocks, used_native = al._resolve_tool_blocks(leaked, [], round_num=1, is_api_model=True) + assert used_native is False + assert len(blocks) == 1 + assert blocks[0].tool_type == "web_search" + assert "odysseus changelog" in blocks[0].content + + +# --------------------------------------------------------------------------- +# strip_tool_blocks must mirror the same fenced-pattern gate so persisted text +# matches what was (not) executed: an illustrative fence that wasn't run for a +# native model shouldn't vanish from saved/reloaded history either — otherwise +# it streams once and then disappears on reload (Booyaka101's point #2). +# --------------------------------------------------------------------------- +def test_strip_tool_blocks_preserves_fence_when_skip_fenced(): + text = "Here's an example:\n\n```bash\nnpm run plan:articles\n```\n\nJust copy that." + cleaned = strip_tool_blocks(text, skip_fenced=True) + assert "```bash" in cleaned + assert "npm run plan:articles" in cleaned + + +def test_strip_tool_blocks_still_strips_fence_by_default(): + text = "Here's an example:\n\n```bash\nnpm run plan:articles\n```\n\nJust copy that." + cleaned = strip_tool_blocks(text, skip_fenced=False) + assert "```bash" not in cleaned + assert "npm run plan:articles" not in cleaned + + +def test_strip_tool_blocks_always_strips_invoke_and_dsml_regardless_of_skip_fenced(): + leaked = ( + "Searching now.\n" + 'q' + "\nDone." + ) + for skip in (True, False): + cleaned = strip_tool_blocks(leaked, skip_fenced=skip) + assert "