mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 09:45:24 -04:00
fix(agent): stop treating illustrative Markdown fences as tool calls for native function-calling models (#3356)
* fix(agent): stop executing illustrative Markdown fences as tool calls for native function-calling models _resolve_tool_blocks fell back to the textual parse_tool_blocks() fenced-block parser whenever a model produced no native tool_calls, regardless of whether that model has a reliable native function-calling channel. Native models (GPT/Claude/Grok/Qwen3/DeepSeek-V, etc. - _is_api_model true) commonly write illustrative ```bash/```python/```json examples in guide-only prose; the fallback parser matched these and executed them as real commands, sometimes looping for several rounds as the model tried to clarify with more examples (#3222). Restrict the textual fenced-block fallback to non-native models, which rely on it as their only tool-invocation channel. Native models are trusted to use their structured tool_calls channel for real invocations; when they don't emit one, a bare fence in their response is prose, not an action. The native tool_calls path itself is untouched. This sits one layer below #3088's guide-only policy enforcement: that PR blocks tool exposure/execution on explicit no-tools requests, while this fixes the parser so ordinary illustrative fences are never misread as calls in the first place, on any turn. * fix(agent): gate only the fenced-example pattern for native models, preserve DSML/invoke recovery and persistence _resolve_tool_blocks previously short-circuited the entire textual parser (tool_blocks = [] if is_api_model else parse_tool_blocks(...)) for native function-calling models with no native tool_calls. That also dropped Patterns 2-5 (explicit [TOOL_CALL]/<invoke>/<tool_code>/DSML markup leaked into content as text), which are real calls a model couldn't emit on its structured channel (e.g. DeepSeek-V falling back to DSML), not illustrative examples. parse_tool_blocks/strip_tool_blocks now take a skip_fenced flag that gates ONLY Pattern 1 (the fenced ```bash/```python/```json block matcher). _resolve_tool_blocks passes skip_fenced=is_api_model so fenced examples stop being executed for native models while [TOOL_CALL]/<invoke>/<tool_code>/DSML stay fully active and recoverable. cleaned_round mirrors the same gate when persisting round text, so an illustrative fence that wasn't executed isn't stripped from saved/reloaded history either (it was streaming once and then disappearing on reload).
This commit is contained in:
+23
-4
@@ -1111,7 +1111,7 @@ def _build_base_prompt(
|
||||
|
||||
|
||||
|
||||
def _resolve_tool_blocks(round_response: str, native_tool_calls: list, round_num: int):
|
||||
def _resolve_tool_blocks(round_response: str, native_tool_calls: list, round_num: int, is_api_model: bool = False):
|
||||
"""Choose native function calls or fenced code block parsing. Returns (tool_blocks, used_native)."""
|
||||
used_native = False
|
||||
if native_tool_calls:
|
||||
@@ -1128,7 +1128,21 @@ def _resolve_tool_blocks(round_response: str, native_tool_calls: list, round_num
|
||||
if tool_blocks:
|
||||
used_native = True
|
||||
if not used_native:
|
||||
tool_blocks = parse_tool_blocks(round_response)
|
||||
# Native function-calling models (GPT/Claude/Grok/Qwen3/DeepSeek-V, etc.)
|
||||
# have a reliable structured channel for real tool invocations. When such
|
||||
# a model emits no native tool_calls, any ```bash/```python/```json fence
|
||||
# in its prose is virtually always an illustrative example for the user
|
||||
# (e.g. "here's the command you'd run"), not an attempted tool call —
|
||||
# executing it causes accidental runs and clarification loops (#3222).
|
||||
#
|
||||
# Gate ONLY that fenced-block pattern for native models, not the whole
|
||||
# parser: explicit [TOOL_CALL]/<invoke>/<tool_code>/DSML markup that
|
||||
# leaks into content as text is never illustrative — it's a real call
|
||||
# the model couldn't emit on its structured channel (e.g. DeepSeek-V
|
||||
# falling back to DSML). Dropping the whole parser would silently lose
|
||||
# those too. Non-native / textual-only models keep every pattern,
|
||||
# fenced blocks included, since that's their *only* tool channel.
|
||||
tool_blocks = parse_tool_blocks(round_response, skip_fenced=is_api_model)
|
||||
if tool_blocks:
|
||||
logger.info(f"Agent round {round_num}: {len(tool_blocks)} fenced tool block(s) detected")
|
||||
|
||||
@@ -2053,7 +2067,7 @@ async def stream_agent_loop(
|
||||
yield chunk
|
||||
# Intercept [DONE] — don't forward until all rounds finish
|
||||
|
||||
tool_blocks, used_native = _resolve_tool_blocks(round_response, native_tool_calls, round_num)
|
||||
tool_blocks, used_native = _resolve_tool_blocks(round_response, native_tool_calls, round_num, is_api_model=_is_api_model)
|
||||
|
||||
# Force-answer round: we told the model to STOP calling tools and
|
||||
# answer. If it ignored that and emitted a (possibly DSML) tool
|
||||
@@ -2132,7 +2146,12 @@ async def stream_agent_loop(
|
||||
|
||||
# Save cleaned round text for history persistence
|
||||
# Keep <think> blocks so they render in the thinking section on reload
|
||||
cleaned_round = strip_tool_blocks(round_response).strip()
|
||||
# Mirror the same fenced-pattern gate used to resolve tool_blocks above:
|
||||
# an illustrative fence that wasn't executed (because this is a native
|
||||
# model with no real native_tool_calls) must not be stripped from the
|
||||
# persisted text either — otherwise it streams once and then disappears
|
||||
# on reload (#3222 follow-up).
|
||||
cleaned_round = strip_tool_blocks(round_response, skip_fenced=(_is_api_model and not used_native)).strip()
|
||||
round_texts.append(cleaned_round)
|
||||
|
||||
if not tool_blocks:
|
||||
|
||||
+49
-26
@@ -427,7 +427,7 @@ def _parse_tool_code_block(raw: str) -> Optional[ToolBlock]:
|
||||
return None
|
||||
|
||||
|
||||
def parse_tool_blocks(text: str) -> List[ToolBlock]:
|
||||
def parse_tool_blocks(text: str, skip_fenced: bool = False) -> List[ToolBlock]:
|
||||
"""Extract executable tool blocks from LLM response text.
|
||||
|
||||
Supports multiple formats:
|
||||
@@ -436,6 +436,17 @@ def parse_tool_blocks(text: str) -> List[ToolBlock]:
|
||||
3. XML-style <tool_call>/<invoke> blocks
|
||||
4. <tool_code> blocks (MiniMax-M2.5 style)
|
||||
5. DeepSeek DSML markup (normalized to <invoke> first)
|
||||
|
||||
`skip_fenced`: when True, Pattern 1 (fenced ```bash/```python/```json code
|
||||
blocks) is not matched at all. Native function-calling models (GPT/Claude/
|
||||
Grok/Qwen3/DeepSeek-V, etc.) commonly write illustrative fenced examples in
|
||||
prose; for those models we trust the structured tool_calls channel for real
|
||||
invocations and treat a bare fence as display text rather than an action
|
||||
(issue #3222). Patterns 2-5 — explicit [TOOL_CALL]/<invoke>/<tool_code>/DSML
|
||||
markup that leaked into content as text — stay fully active regardless,
|
||||
since that markup is never an illustrative example and dropping it would
|
||||
silently lose real calls (e.g. DeepSeek-V falling back to DSML when it
|
||||
can't emit structured tool_calls).
|
||||
"""
|
||||
blocks = []
|
||||
|
||||
@@ -443,30 +454,31 @@ def parse_tool_blocks(text: str) -> List[ToolBlock]:
|
||||
# XML patterns below catch it.
|
||||
text = _normalize_dsml(text)
|
||||
|
||||
# Pattern 1: fenced code blocks
|
||||
for m in _TOOL_BLOCK_RE.finditer(text):
|
||||
tag = m.group(1).lower()
|
||||
content = m.group(2).strip()
|
||||
if not content:
|
||||
continue
|
||||
# If a code block's content is an <invoke> XML call (some models wrap
|
||||
# tool calls in ```python or ```xml fences), parse the invoke instead.
|
||||
if '<invoke' in content:
|
||||
for inv in _XML_INVOKE_RE.finditer(content):
|
||||
block = _parse_xml_invoke(inv)
|
||||
# Pattern 1: fenced code blocks (skipped when `skip_fenced` — see docstring).
|
||||
if not skip_fenced:
|
||||
for m in _TOOL_BLOCK_RE.finditer(text):
|
||||
tag = m.group(1).lower()
|
||||
content = m.group(2).strip()
|
||||
if not content:
|
||||
continue
|
||||
# If a code block's content is an <invoke> XML call (some models wrap
|
||||
# tool calls in ```python or ```xml fences), parse the invoke instead.
|
||||
if '<invoke' in content:
|
||||
for inv in _XML_INVOKE_RE.finditer(content):
|
||||
block = _parse_xml_invoke(inv)
|
||||
if block:
|
||||
blocks.append(block)
|
||||
# This fenced block is <invoke> markup, not literal code. Whether or
|
||||
# not any call converted, never fall through to append the raw XML as
|
||||
# a python/bash block — e.g. a hyphenated/namespaced tool name that
|
||||
# _XML_INVOKE_RE's \w+ can't match would otherwise be executed as code.
|
||||
continue
|
||||
if tag in ("python", "bash"):
|
||||
block = _parse_misfenced_web_lookup(content)
|
||||
if block:
|
||||
blocks.append(block)
|
||||
# This fenced block is <invoke> markup, not literal code. Whether or
|
||||
# not any call converted, never fall through to append the raw XML as
|
||||
# a python/bash block — e.g. a hyphenated/namespaced tool name that
|
||||
# _XML_INVOKE_RE's \w+ can't match would otherwise be executed as code.
|
||||
continue
|
||||
if tag in ("python", "bash"):
|
||||
block = _parse_misfenced_web_lookup(content)
|
||||
if block:
|
||||
blocks.append(block)
|
||||
continue
|
||||
blocks.append(ToolBlock(tag, content))
|
||||
continue
|
||||
blocks.append(ToolBlock(tag, content))
|
||||
|
||||
# Pattern 2: [TOOL_CALL] blocks (only if no fenced blocks found)
|
||||
if not blocks:
|
||||
@@ -500,12 +512,23 @@ def parse_tool_blocks(text: str) -> List[ToolBlock]:
|
||||
return blocks
|
||||
|
||||
|
||||
def strip_tool_blocks(text: str) -> str:
|
||||
"""Remove executable tool blocks from text for clean display."""
|
||||
def strip_tool_blocks(text: str, skip_fenced: bool = False) -> str:
|
||||
"""Remove executable tool blocks from text for clean display.
|
||||
|
||||
`skip_fenced`: when True, fenced ```bash/```python/```json code blocks
|
||||
(Pattern 1) are left intact instead of being stripped. This must mirror
|
||||
whatever `skip_fenced` value `parse_tool_blocks` was called with for the
|
||||
same response: if a fence wasn't executed as a tool call (because it's an
|
||||
illustrative example from a native function-calling model), it shouldn't
|
||||
vanish from the persisted/displayed text either — otherwise the example
|
||||
streams once and then disappears on reload (issue #3222 follow-up).
|
||||
Patterns 2-5 + DSML markup are always stripped, since that markup should
|
||||
never reach the user regardless of whether it converted to a tool call.
|
||||
"""
|
||||
# Normalize DSML first so its markup gets stripped by the <invoke>
|
||||
# / <tool_call> removers below instead of leaking to the user.
|
||||
text = _normalize_dsml(text)
|
||||
cleaned = _TOOL_BLOCK_RE.sub('', text)
|
||||
cleaned = text if skip_fenced else _TOOL_BLOCK_RE.sub('', text)
|
||||
cleaned = _TOOL_CALL_RE.sub('', cleaned)
|
||||
cleaned = _XML_TOOL_CALL_RE.sub('', cleaned)
|
||||
cleaned = _TOOL_CODE_RE.sub('', cleaned)
|
||||
|
||||
@@ -0,0 +1,291 @@
|
||||
"""Issue #3222 — native function-calling models (GPT/Claude/Grok/Qwen3/DeepSeek-V,
|
||||
etc.) must not have ordinary illustrative Markdown fences in their prose
|
||||
(```bash, ```python, ```json examples written for the user to read) executed
|
||||
as real tool calls just because the textual fallback parser matches them.
|
||||
|
||||
`_resolve_tool_blocks` in src/agent_loop.py picks native `tool_calls` when the
|
||||
model emits them, and otherwise used to fall back unconditionally to
|
||||
`parse_tool_blocks(round_response)` (the fenced-block textual parser). For a
|
||||
native model that produced no real tool_calls — e.g. a "guide-only" turn where
|
||||
the model writes an example command for the user to copy — that fallback used
|
||||
to treat the example fence as an executable action, causing accidental command
|
||||
execution and multi-round loops.
|
||||
|
||||
The fix: for native function-calling models (`_is_api_model=True`) that emitted
|
||||
no native tool_calls, skip the textual fenced-block fallback entirely — these
|
||||
models have a reliable structured channel and a bare fence in their prose is
|
||||
display text, not an attempted call. Non-native / textual-only models keep the
|
||||
fallback unchanged, since fenced blocks are their *only* tool channel.
|
||||
|
||||
These tests drive the real `stream_agent_loop` (not just source-text regex
|
||||
assertions) end-to-end with a mocked LLM stream, and assert on whether
|
||||
`execute_tool_block` actually gets invoked.
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
import src.agent_loop as al
|
||||
|
||||
|
||||
def _collect(gen):
|
||||
async def _run():
|
||||
return [c async for c in gen]
|
||||
return asyncio.run(_run())
|
||||
|
||||
|
||||
def _types(chunks):
|
||||
out = []
|
||||
for c in chunks:
|
||||
if c.startswith("data: ") and not c.startswith("data: [DONE]"):
|
||||
try:
|
||||
out.append(json.loads(c[6:]))
|
||||
except Exception:
|
||||
pass
|
||||
return out
|
||||
|
||||
|
||||
def _patch_common(monkeypatch, exec_calls):
|
||||
# Skip RAG/tool-index, MCP, and settings lookups; keep the real loop body,
|
||||
# _resolve_tool_blocks, and parse_tool_blocks intact.
|
||||
monkeypatch.setattr(al, "get_setting", lambda key, default=None: default, raising=False)
|
||||
monkeypatch.setattr(al, "get_mcp_manager", lambda: None, raising=False)
|
||||
monkeypatch.setattr(al, "estimate_tokens", lambda *a, **k: 10, raising=False)
|
||||
|
||||
async def _fake_exec(block, *a, **k):
|
||||
exec_calls.append(block)
|
||||
return ("bash", {"output": "ok", "exit_code": 0})
|
||||
monkeypatch.setattr(al, "execute_tool_block", _fake_exec, raising=False)
|
||||
|
||||
|
||||
def _run_loop(monkeypatch, model, deltas, native_calls=None, max_rounds=2, endpoint_url=None):
|
||||
"""Drive stream_agent_loop with a fake LLM stream.
|
||||
|
||||
`deltas` is a list of text chunks streamed for round 1 (and reused for any
|
||||
further round). `native_calls`, if given, is emitted as a native
|
||||
`tool_calls` event alongside the round-1 text.
|
||||
"""
|
||||
call_count = {"n": 0}
|
||||
|
||||
async def _fake_stream(_candidates, messages, **kwargs):
|
||||
call_count["n"] += 1
|
||||
if call_count["n"] == 1:
|
||||
for d in deltas:
|
||||
yield f'data: {json.dumps({"delta": d})}\n\n'
|
||||
if native_calls:
|
||||
yield f'data: {json.dumps({"type": "tool_calls", "calls": native_calls})}\n\n'
|
||||
yield "data: [DONE]\n\n"
|
||||
else:
|
||||
# Subsequent rounds: just answer plainly so the loop terminates.
|
||||
yield f'data: {json.dumps({"delta": "All done, here is your answer."})}\n\n'
|
||||
yield "data: [DONE]\n\n"
|
||||
|
||||
monkeypatch.setattr(al, "stream_llm_with_fallback", _fake_stream, raising=False)
|
||||
|
||||
gen = al.stream_agent_loop(
|
||||
endpoint_url or "https://api.openai.com/v1", model,
|
||||
[{"role": "user", "content": "Do not run anything yet, just show me an example."}],
|
||||
max_rounds=max_rounds,
|
||||
relevant_tools={"bash"},
|
||||
)
|
||||
return _types(_collect(gen))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 1. Native model, illustrative ```bash fence, NO native tool_calls
|
||||
# -> must NOT be executed.
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_native_model_illustrative_bash_fence_not_executed(monkeypatch):
|
||||
exec_calls = []
|
||||
_patch_common(monkeypatch, exec_calls)
|
||||
guide_only = (
|
||||
"Here is the command you would run locally:\n\n"
|
||||
"```bash\nnpm run plan:articles\n```\n\n"
|
||||
"Just paste that into your terminal — I'm not running it for you."
|
||||
)
|
||||
events = _run_loop(monkeypatch, "gpt-4o", [guide_only])
|
||||
assert exec_calls == [], f"illustrative fence should not be executed, but got: {exec_calls}"
|
||||
# No tool-call/action events should be emitted for this round either.
|
||||
assert not any(e.get("type") == "tool_call" for e in events), events
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 2. Native model that DOES emit a real native tool_calls entry
|
||||
# -> that call IS resolved/executed normally (untouched native path).
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_native_model_real_native_tool_call_is_executed(monkeypatch):
|
||||
exec_calls = []
|
||||
_patch_common(monkeypatch, exec_calls)
|
||||
native_calls = [{"name": "bash", "arguments": json.dumps({"command": "echo hi"})}]
|
||||
events = _run_loop(
|
||||
monkeypatch, "gpt-4o",
|
||||
["Sure, let me check that for you."],
|
||||
native_calls=native_calls,
|
||||
max_rounds=2,
|
||||
)
|
||||
assert len(exec_calls) == 1, f"expected the native tool call to execute, got: {exec_calls}"
|
||||
assert exec_calls[0].tool_type == "bash"
|
||||
assert "echo hi" in exec_calls[0].content
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 3. Non-native / textual-only model using the legitimate fenced format it
|
||||
# depends on -> still correctly parsed and executed (regression check).
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_non_native_model_fenced_tool_call_still_executed(monkeypatch):
|
||||
exec_calls = []
|
||||
_patch_common(monkeypatch, exec_calls)
|
||||
# Neither this model name nor this endpoint host match any of the
|
||||
# native-capable keyword/host checks, so _is_api_model resolves to False
|
||||
# and the model must rely on the textual fenced-block convention to
|
||||
# invoke tools at all.
|
||||
events = _run_loop(
|
||||
monkeypatch, "llama-2-7b-chat",
|
||||
["```bash\necho hi\n```"],
|
||||
max_rounds=2,
|
||||
endpoint_url="http://192.168.1.50:8000/v1",
|
||||
)
|
||||
assert len(exec_calls) == 1, f"non-native model's fenced tool call should still execute: {exec_calls}"
|
||||
assert exec_calls[0].tool_type == "bash"
|
||||
assert "echo hi" in exec_calls[0].content
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 4. The exact illustrative-fence shape from issue #3222's repro (```bash +
|
||||
# ```json guide-only examples) run through the real resolution path for a
|
||||
# native model -> confirm zero tool actions resolved.
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_issue_3222_repro_guide_only_response_resolves_no_tool_actions(monkeypatch):
|
||||
exec_calls = []
|
||||
_patch_common(monkeypatch, exec_calls)
|
||||
repro = (
|
||||
"Here is the command you would run locally:\n\n"
|
||||
"```bash\nnpm run plan:articles\n```\n\n"
|
||||
"And here is an example config shape:\n\n"
|
||||
"```json\n"
|
||||
"{\n"
|
||||
' "script": "npm run plan:articles",\n'
|
||||
' "mode": "guide-only"\n'
|
||||
"}\n"
|
||||
"```\n"
|
||||
)
|
||||
events = _run_loop(monkeypatch, "grok-4", [repro])
|
||||
assert exec_calls == [], f"guide-only example fences must resolve to zero tool actions: {exec_calls}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Direct unit coverage of _resolve_tool_blocks itself (the real seam the fix
|
||||
# lives in), complementing the end-to-end checks above.
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_resolve_tool_blocks_skips_textual_fallback_for_native_models_with_no_native_calls():
|
||||
guide_only = "```bash\nnpm run plan:articles\n```\n```json\n{\"a\": 1}\n```"
|
||||
blocks, used_native = al._resolve_tool_blocks(guide_only, [], round_num=1, is_api_model=True)
|
||||
assert blocks == []
|
||||
assert used_native is False
|
||||
|
||||
|
||||
def test_resolve_tool_blocks_keeps_textual_fallback_for_non_native_models():
|
||||
text = "```bash\necho hi\n```"
|
||||
blocks, used_native = al._resolve_tool_blocks(text, [], round_num=1, is_api_model=False)
|
||||
assert len(blocks) == 1
|
||||
assert blocks[0].tool_type == "bash"
|
||||
assert used_native is False
|
||||
|
||||
|
||||
def test_resolve_tool_blocks_native_path_untouched_when_native_calls_present():
|
||||
native_calls = [{"name": "bash", "arguments": json.dumps({"command": "echo hi"})}]
|
||||
blocks, used_native = al._resolve_tool_blocks("some prose", native_calls, round_num=1, is_api_model=True)
|
||||
assert used_native is True
|
||||
assert len(blocks) == 1
|
||||
assert blocks[0].tool_type == "bash"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Booyaka101's review on #3356: short-circuiting the *whole* parser for native
|
||||
# models (`tool_blocks = [] if is_api_model else parse_tool_blocks(...)`) also
|
||||
# silently dropped explicit [TOOL_CALL]/<invoke>/<tool_code>/DSML markup that
|
||||
# leaked into content as text — a real regression for e.g. DeepSeek-V falling
|
||||
# back to DSML when it can't emit structured tool_calls. The fix gates ONLY
|
||||
# the fenced-code pattern (via `skip_fenced=`) so Patterns 2-5 stay active.
|
||||
# ---------------------------------------------------------------------------
|
||||
from src.tool_parsing import parse_tool_blocks, strip_tool_blocks # noqa: E402
|
||||
|
||||
|
||||
def test_skip_fenced_still_recovers_xml_invoke_markup():
|
||||
leaked = (
|
||||
"Sure, I'll look that up.\n"
|
||||
'<invoke name="web_search"><parameter name="query">latest python release</parameter></invoke>'
|
||||
)
|
||||
blocks = parse_tool_blocks(leaked, skip_fenced=True)
|
||||
assert len(blocks) == 1
|
||||
assert blocks[0].tool_type == "web_search"
|
||||
assert "latest python release" in blocks[0].content
|
||||
|
||||
|
||||
def test_skip_fenced_still_recovers_dsml_markup():
|
||||
dsml = (
|
||||
"Let me search for that.\n"
|
||||
"<||DSML||tool_calls>"
|
||||
'<||DSML||invoke name="web_search">'
|
||||
'<||DSML||parameter name="query" string="true">latest python release</||DSML||parameter>'
|
||||
"</||DSML||invoke>"
|
||||
"</||DSML||tool_calls>"
|
||||
)
|
||||
blocks = parse_tool_blocks(dsml, skip_fenced=True)
|
||||
assert len(blocks) == 1
|
||||
assert blocks[0].tool_type == "web_search"
|
||||
assert "latest python release" in blocks[0].content
|
||||
|
||||
|
||||
def test_skip_fenced_ignores_only_the_fenced_pattern():
|
||||
text = "```bash\nnpm run plan:articles\n```"
|
||||
assert parse_tool_blocks(text, skip_fenced=True) == []
|
||||
assert len(parse_tool_blocks(text, skip_fenced=False)) == 1
|
||||
|
||||
|
||||
def test_resolve_tool_blocks_recovers_invoke_markup_for_native_model_with_no_native_calls():
|
||||
"""End-to-end: a native model (is_api_model=True) that emitted no
|
||||
structured tool_calls but leaked an <invoke> call into its text content
|
||||
must still have that real call recovered — not dropped alongside the
|
||||
fenced-example gating."""
|
||||
leaked = (
|
||||
"I'll search for that now.\n"
|
||||
'<invoke name="web_search"><parameter name="query">odysseus changelog</parameter></invoke>'
|
||||
)
|
||||
blocks, used_native = al._resolve_tool_blocks(leaked, [], round_num=1, is_api_model=True)
|
||||
assert used_native is False
|
||||
assert len(blocks) == 1
|
||||
assert blocks[0].tool_type == "web_search"
|
||||
assert "odysseus changelog" in blocks[0].content
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# strip_tool_blocks must mirror the same fenced-pattern gate so persisted text
|
||||
# matches what was (not) executed: an illustrative fence that wasn't run for a
|
||||
# native model shouldn't vanish from saved/reloaded history either — otherwise
|
||||
# it streams once and then disappears on reload (Booyaka101's point #2).
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_strip_tool_blocks_preserves_fence_when_skip_fenced():
|
||||
text = "Here's an example:\n\n```bash\nnpm run plan:articles\n```\n\nJust copy that."
|
||||
cleaned = strip_tool_blocks(text, skip_fenced=True)
|
||||
assert "```bash" in cleaned
|
||||
assert "npm run plan:articles" in cleaned
|
||||
|
||||
|
||||
def test_strip_tool_blocks_still_strips_fence_by_default():
|
||||
text = "Here's an example:\n\n```bash\nnpm run plan:articles\n```\n\nJust copy that."
|
||||
cleaned = strip_tool_blocks(text, skip_fenced=False)
|
||||
assert "```bash" not in cleaned
|
||||
assert "npm run plan:articles" not in cleaned
|
||||
|
||||
|
||||
def test_strip_tool_blocks_always_strips_invoke_and_dsml_regardless_of_skip_fenced():
|
||||
leaked = (
|
||||
"Searching now.\n"
|
||||
'<invoke name="web_search"><parameter name="query">q</parameter></invoke>'
|
||||
"\nDone."
|
||||
)
|
||||
for skip in (True, False):
|
||||
cleaned = strip_tool_blocks(leaked, skip_fenced=skip)
|
||||
assert "<invoke" not in cleaned
|
||||
assert "Searching now." in cleaned
|
||||
assert "Done." in cleaned
|
||||
Reference in New Issue
Block a user