From 70d806019bc7fee7c18266423270b062e76e3303 Mon Sep 17 00:00:00 2001 From: Afonso Coutinho Date: Sat, 27 Jun 2026 19:31:17 +0100 Subject: [PATCH] fix: tool results misthreaded to the wrong tool_call_id when a native call fails to convert (#1917) * fix: tool results misthreaded when a native call fails to convert * Unpack the third converted_calls return from _resolve_tool_blocks in the fenced-example tests --- src/agent_loop.py | 13 +++++-- ..._example_not_executed_for_native_models.py | 8 ++-- tests/test_native_tool_result_threading.py | 39 +++++++++++++++++++ 3 files changed, 53 insertions(+), 7 deletions(-) create mode 100644 tests/test_native_tool_result_threading.py diff --git a/src/agent_loop.py b/src/agent_loop.py index 24d27ef6d..be80c316a 100644 --- a/src/agent_loop.py +++ b/src/agent_loop.py @@ -1648,6 +1648,7 @@ def _build_base_prompt( def _resolve_tool_blocks(round_response: str, native_tool_calls: list, round_num: int, is_api_model: bool = False): """Choose native function calls or fenced code block parsing. Returns (tool_blocks, used_native).""" used_native = False + converted_calls = [] # native calls that converted, ALIGNED with tool_blocks if native_tool_calls: tool_blocks = [] for tc in native_tool_calls: @@ -1656,6 +1657,7 @@ def _resolve_tool_blocks(round_response: str, native_tool_calls: list, round_num block = function_call_to_tool_block(tc_name, tc_args) if block: tool_blocks.append(block) + converted_calls.append(tc) logger.info(f" -> converted: {tc_name} -> {block.tool_type}") else: logger.warning(f" -> FAILED to convert native call: {tc_name} args={tc_args[:200]}") @@ -1685,7 +1687,7 @@ def _resolve_tool_blocks(round_response: str, native_tool_calls: list, round_num f"{len(native_tool_calls)} native calls, " f"{len(tool_blocks)} tool blocks. Preview: {resp_preview}") - return tool_blocks, used_native + return tool_blocks, used_native, converted_calls def _append_tool_results( @@ -2868,7 +2870,7 @@ async def stream_agent_loop( _round_first_event_logged, _round_first_token_logged, ) - tool_blocks, used_native = _resolve_tool_blocks( + tool_blocks, used_native, converted_calls = _resolve_tool_blocks( round_response, native_tool_calls, round_num, @@ -3500,7 +3502,12 @@ async def stream_agent_loop( break # Feed results back to LLM for next round - _append_tool_results(messages, round_response, native_tool_calls, + # Pass the CONVERTED calls (aligned 1:1 with tool_result_texts), not the + # raw native_tool_calls: a call that failed to convert is dropped from + # tool_blocks but stayed in native_tool_calls, so indexing results by + # native position mis-attached each result to the wrong tool_call_id + # (and left the real call answered empty). + _append_tool_results(messages, round_response, converted_calls, tool_results, tool_result_texts, used_native, round_num, round_reasoning=round_reasoning) diff --git a/tests/test_fenced_example_not_executed_for_native_models.py b/tests/test_fenced_example_not_executed_for_native_models.py index 9cac7ab8d..e6dccae77 100644 --- a/tests/test_fenced_example_not_executed_for_native_models.py +++ b/tests/test_fenced_example_not_executed_for_native_models.py @@ -178,14 +178,14 @@ def test_issue_3222_repro_guide_only_response_resolves_no_tool_actions(monkeypat # --------------------------------------------------------------------------- def test_resolve_tool_blocks_skips_textual_fallback_for_native_models_with_no_native_calls(): guide_only = "```bash\nnpm run plan:articles\n```\n```json\n{\"a\": 1}\n```" - blocks, used_native = al._resolve_tool_blocks(guide_only, [], round_num=1, is_api_model=True) + blocks, used_native, _ = al._resolve_tool_blocks(guide_only, [], round_num=1, is_api_model=True) assert blocks == [] assert used_native is False def test_resolve_tool_blocks_keeps_textual_fallback_for_non_native_models(): text = "```bash\necho hi\n```" - blocks, used_native = al._resolve_tool_blocks(text, [], round_num=1, is_api_model=False) + blocks, used_native, _ = al._resolve_tool_blocks(text, [], round_num=1, is_api_model=False) assert len(blocks) == 1 assert blocks[0].tool_type == "bash" assert used_native is False @@ -193,7 +193,7 @@ def test_resolve_tool_blocks_keeps_textual_fallback_for_non_native_models(): def test_resolve_tool_blocks_native_path_untouched_when_native_calls_present(): native_calls = [{"name": "bash", "arguments": json.dumps({"command": "echo hi"})}] - blocks, used_native = al._resolve_tool_blocks("some prose", native_calls, round_num=1, is_api_model=True) + blocks, used_native, _ = al._resolve_tool_blocks("some prose", native_calls, round_num=1, is_api_model=True) assert used_native is True assert len(blocks) == 1 assert blocks[0].tool_type == "bash" @@ -305,7 +305,7 @@ def test_resolve_tool_blocks_recovers_invoke_markup_for_native_model_with_no_nat "I'll search for that now.\n" 'odysseus changelog' ) - blocks, used_native = al._resolve_tool_blocks(leaked, [], round_num=1, is_api_model=True) + blocks, used_native, _ = al._resolve_tool_blocks(leaked, [], round_num=1, is_api_model=True) assert used_native is False assert len(blocks) == 1 assert blocks[0].tool_type == "web_search" diff --git a/tests/test_native_tool_result_threading.py b/tests/test_native_tool_result_threading.py new file mode 100644 index 000000000..c351087db --- /dev/null +++ b/tests/test_native_tool_result_threading.py @@ -0,0 +1,39 @@ +"""Native tool-call results must be threaded by CONVERTED-call position. + +When an OpenAI/Anthropic model emits several tool_calls in one round and one +fails to convert (hallucinated name or bad-JSON args), it is dropped from +tool_blocks (so it produces no result) but used to stay in native_tool_calls. +_append_tool_results indexed tool_result_texts by native-call position, so the +surviving result was attached to the wrong tool_call_id and the real call was +answered with an empty string. _resolve_tool_blocks now returns the converted +calls aligned 1:1 with tool_blocks/tool_result_texts, and that aligned list is +what is threaded back. +""" +import src.agent_loop as al + + +def test_resolve_returns_converted_calls_aligned(): + native = [ + {"name": "bogus_unknown_tool", "arguments": "{}", "id": "A"}, + {"name": "web_search", "arguments": '{"query": "hello"}', "id": "B"}, + ] + tool_blocks, used_native, converted = al._resolve_tool_blocks("", native, 1) + assert used_native is True + assert len(tool_blocks) == 1 # only web_search converted + assert [c["name"] for c in converted] == ["web_search"] + assert len(converted) == len(tool_blocks) # aligned 1:1 + + +def test_append_threads_result_to_correct_tool_call_id(): + messages = [] + converted = [{"id": "B", "name": "web_search", "arguments": "{}"}] + al._append_tool_results( + messages, "some response", converted, + ["RESULT"], ["RESULT"], True, 1, + ) + tool_msgs = [m for m in messages if m.get("role") == "tool"] + assert len(tool_msgs) == 1 + assert tool_msgs[0]["tool_call_id"] == "B" + assert tool_msgs[0]["content"] == "RESULT" + asst = next(m for m in messages if m.get("role") == "assistant") + assert [tc["id"] for tc in asst["tool_calls"]] == ["B"]