From 70d806019bc7fee7c18266423270b062e76e3303 Mon Sep 17 00:00:00 2001
From: Afonso Coutinho <afonso@omelhorsite.pt>
Date: Sat, 27 Jun 2026 19:31:17 +0100
Subject: [PATCH] fix: tool results misthreaded to the wrong tool_call_id when
 a native call fails to convert (#1917)

* fix: tool results misthreaded when a native call fails to convert

* Unpack the third converted_calls return from _resolve_tool_blocks in the fenced-example tests
---
 src/agent_loop.py                             | 13 +++++--
 ..._example_not_executed_for_native_models.py |  8 ++--
 tests/test_native_tool_result_threading.py    | 39 +++++++++++++++++++
 3 files changed, 53 insertions(+), 7 deletions(-)
 create mode 100644 tests/test_native_tool_result_threading.py

diff --git a/src/agent_loop.py b/src/agent_loop.py
index 24d27ef6d..be80c316a 100644
--- a/src/agent_loop.py
+++ b/src/agent_loop.py
@@ -1648,6 +1648,7 @@ def _build_base_prompt(
 def _resolve_tool_blocks(round_response: str, native_tool_calls: list, round_num: int, is_api_model: bool = False):
     """Choose native function calls or fenced code block parsing. Returns (tool_blocks, used_native)."""
     used_native = False
+    converted_calls = []  # native calls that converted, ALIGNED with tool_blocks
     if native_tool_calls:
         tool_blocks = []
         for tc in native_tool_calls:
@@ -1656,6 +1657,7 @@ def _resolve_tool_blocks(round_response: str, native_tool_calls: list, round_num
             block = function_call_to_tool_block(tc_name, tc_args)
             if block:
                 tool_blocks.append(block)
+                converted_calls.append(tc)
                 logger.info(f"  -> converted: {tc_name} -> {block.tool_type}")
             else:
                 logger.warning(f"  -> FAILED to convert native call: {tc_name} args={tc_args[:200]}")
@@ -1685,7 +1687,7 @@ def _resolve_tool_blocks(round_response: str, native_tool_calls: list, round_num
                 f"{len(native_tool_calls)} native calls, "
                 f"{len(tool_blocks)} tool blocks. Preview: {resp_preview}")
 
-    return tool_blocks, used_native
+    return tool_blocks, used_native, converted_calls
 
 
 def _append_tool_results(
@@ -2868,7 +2870,7 @@ async def stream_agent_loop(
             _round_first_event_logged,
             _round_first_token_logged,
         )
-        tool_blocks, used_native = _resolve_tool_blocks(
+        tool_blocks, used_native, converted_calls = _resolve_tool_blocks(
             round_response,
             native_tool_calls,
             round_num,
@@ -3500,7 +3502,12 @@ async def stream_agent_loop(
             break
 
         # Feed results back to LLM for next round
-        _append_tool_results(messages, round_response, native_tool_calls,
+        # Pass the CONVERTED calls (aligned 1:1 with tool_result_texts), not the
+        # raw native_tool_calls: a call that failed to convert is dropped from
+        # tool_blocks but stayed in native_tool_calls, so indexing results by
+        # native position mis-attached each result to the wrong tool_call_id
+        # (and left the real call answered empty).
+        _append_tool_results(messages, round_response, converted_calls,
                              tool_results, tool_result_texts, used_native, round_num,
                              round_reasoning=round_reasoning)
 
diff --git a/tests/test_fenced_example_not_executed_for_native_models.py b/tests/test_fenced_example_not_executed_for_native_models.py
index 9cac7ab8d..e6dccae77 100644
--- a/tests/test_fenced_example_not_executed_for_native_models.py
+++ b/tests/test_fenced_example_not_executed_for_native_models.py
@@ -178,14 +178,14 @@ def test_issue_3222_repro_guide_only_response_resolves_no_tool_actions(monkeypat
 # ---------------------------------------------------------------------------
 def test_resolve_tool_blocks_skips_textual_fallback_for_native_models_with_no_native_calls():
     guide_only = "```bash\nnpm run plan:articles\n```\n```json\n{\"a\": 1}\n```"
-    blocks, used_native = al._resolve_tool_blocks(guide_only, [], round_num=1, is_api_model=True)
+    blocks, used_native, _ = al._resolve_tool_blocks(guide_only, [], round_num=1, is_api_model=True)
     assert blocks == []
     assert used_native is False
 
 
 def test_resolve_tool_blocks_keeps_textual_fallback_for_non_native_models():
     text = "```bash\necho hi\n```"
-    blocks, used_native = al._resolve_tool_blocks(text, [], round_num=1, is_api_model=False)
+    blocks, used_native, _ = al._resolve_tool_blocks(text, [], round_num=1, is_api_model=False)
     assert len(blocks) == 1
     assert blocks[0].tool_type == "bash"
     assert used_native is False
@@ -193,7 +193,7 @@ def test_resolve_tool_blocks_keeps_textual_fallback_for_non_native_models():
 
 def test_resolve_tool_blocks_native_path_untouched_when_native_calls_present():
     native_calls = [{"name": "bash", "arguments": json.dumps({"command": "echo hi"})}]
-    blocks, used_native = al._resolve_tool_blocks("some prose", native_calls, round_num=1, is_api_model=True)
+    blocks, used_native, _ = al._resolve_tool_blocks("some prose", native_calls, round_num=1, is_api_model=True)
     assert used_native is True
     assert len(blocks) == 1
     assert blocks[0].tool_type == "bash"
@@ -305,7 +305,7 @@ def test_resolve_tool_blocks_recovers_invoke_markup_for_native_model_with_no_nat
         "I'll search for that now.\n"
         '<invoke name="web_search"><parameter name="query">odysseus changelog</parameter></invoke>'
     )
-    blocks, used_native = al._resolve_tool_blocks(leaked, [], round_num=1, is_api_model=True)
+    blocks, used_native, _ = al._resolve_tool_blocks(leaked, [], round_num=1, is_api_model=True)
     assert used_native is False
     assert len(blocks) == 1
     assert blocks[0].tool_type == "web_search"
diff --git a/tests/test_native_tool_result_threading.py b/tests/test_native_tool_result_threading.py
new file mode 100644
index 000000000..c351087db
--- /dev/null
+++ b/tests/test_native_tool_result_threading.py
@@ -0,0 +1,39 @@
+"""Native tool-call results must be threaded by CONVERTED-call position.
+
+When an OpenAI/Anthropic model emits several tool_calls in one round and one
+fails to convert (hallucinated name or bad-JSON args), it is dropped from
+tool_blocks (so it produces no result) but used to stay in native_tool_calls.
+_append_tool_results indexed tool_result_texts by native-call position, so the
+surviving result was attached to the wrong tool_call_id and the real call was
+answered with an empty string. _resolve_tool_blocks now returns the converted
+calls aligned 1:1 with tool_blocks/tool_result_texts, and that aligned list is
+what is threaded back.
+"""
+import src.agent_loop as al
+
+
+def test_resolve_returns_converted_calls_aligned():
+    native = [
+        {"name": "bogus_unknown_tool", "arguments": "{}", "id": "A"},
+        {"name": "web_search", "arguments": '{"query": "hello"}', "id": "B"},
+    ]
+    tool_blocks, used_native, converted = al._resolve_tool_blocks("", native, 1)
+    assert used_native is True
+    assert len(tool_blocks) == 1           # only web_search converted
+    assert [c["name"] for c in converted] == ["web_search"]
+    assert len(converted) == len(tool_blocks)  # aligned 1:1
+
+
+def test_append_threads_result_to_correct_tool_call_id():
+    messages = []
+    converted = [{"id": "B", "name": "web_search", "arguments": "{}"}]
+    al._append_tool_results(
+        messages, "some response", converted,
+        ["RESULT"], ["RESULT"], True, 1,
+    )
+    tool_msgs = [m for m in messages if m.get("role") == "tool"]
+    assert len(tool_msgs) == 1
+    assert tool_msgs[0]["tool_call_id"] == "B"
+    assert tool_msgs[0]["content"] == "RESULT"
+    asst = next(m for m in messages if m.get("role") == "assistant")
+    assert [tc["id"] for tc in asst["tool_calls"]] == ["B"]