From e8175c95351f9ad062d44b8e14535b403910576d Mon Sep 17 00:00:00 2001 From: Michael <52305679+michaelxer@users.noreply.github.com> Date: Tue, 23 Jun 2026 15:32:57 +0700 Subject: [PATCH] fix: Images cannot be seen by model that is vision capable (#4726) * fix: Images cannot be seen by model that is vision capable * fix: skip http(s) image_url for Ollama (images[] is base64-only) --------- Co-authored-by: michaelxer --- src/llm_core.py | 111 ++++++++++++++++++++++++-------- tests/test_ollama_multimodal.py | 109 +++++++++++++++++++++++++++++++ 2 files changed, 194 insertions(+), 26 deletions(-) create mode 100644 tests/test_ollama_multimodal.py diff --git a/src/llm_core.py b/src/llm_core.py index 2178f894f..9faf8d362 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -345,43 +345,102 @@ def _normalize_ollama_url(url: str) -> str: return base.rstrip("/") + "/chat" -def _ollama_normalize_tool_messages(messages: List[Dict]) -> List[Dict]: +def _ollama_normalize_messages(messages: List[Dict]) -> List[Dict]: """Adapt Odysseus' canonical OpenAI-style messages to native Ollama /api/chat. - Odysseus carries assistant tool calls in the OpenAI shape, where - `function.arguments` is a JSON *string*. Native Ollama expects it to be a - JSON *object*; given the string it fails the whole request with HTTP 400 - "Value looks like object, but can't find closing '}' symbol", which aborts - every follow-up (tool-result) round. Parse the arguments back into an object - here, on a shallow copy, leaving non-tool messages untouched. The opaque - Gemini `extra_content` (thought_signature) is dropped — it is meaningless to - Ollama and only matters when the conversation is replayed to Gemini. + Two shape mismatches silently break requests: + + 1. Tool calls: Odysseus carries `function.arguments` as a JSON *string*. + Native Ollama expects a JSON *object* and rejects the string form with + HTTP 400 ("Value looks like object, but can't find closing '}' symbol"), + aborting every follow-up (tool-result) round. Parse the arguments back + into an object here, on a shallow copy, leaving non-tool messages + untouched. The opaque Gemini `extra_content` (thought_signature) is + dropped — it is meaningless to Ollama and only matters when the + conversation is replayed to Gemini. + + 2. Images (issue #4723): Odysseus carries multimodal user content as an + OpenAI-style list ``[{type: "text", ...}, {type: "image_url", + image_url: {url: "data:image/...;base64,XXX"}}, ...]``. Native Ollama + does not accept a list for ``content`` — it wants ``content`` as a + string plus a separate ``images`` array of raw base64 strings (no + ``data:`` prefix). Without this conversion the image blocks pass + through untouched, the vision-capable model never sees the picture, + and the user gets "I can't see any image" even though the request + succeeded. """ out: List[Dict] = [] for m in messages or []: - tcs = m.get("tool_calls") if isinstance(m, dict) else None - if not tcs: + if not isinstance(m, dict): out.append(m) continue - new_calls = [] - for tc in tcs: - fn = tc.get("function") or {} - args = fn.get("arguments") - if isinstance(args, str): - try: - args = json.loads(args) if args.strip() else {} - except (json.JSONDecodeError, TypeError): - args = {} - call: Dict = {"function": {"name": fn.get("name", ""), "arguments": args or {}}} - if tc.get("id"): - call["id"] = tc["id"] - new_calls.append(call) + nm = dict(m) - nm["tool_calls"] = new_calls + + # 1. Tool-call argument strings -> objects. + tcs = nm.get("tool_calls") + if tcs: + new_calls = [] + for tc in tcs: + fn = tc.get("function") or {} + args = fn.get("arguments") + if isinstance(args, str): + try: + args = json.loads(args) if args.strip() else {} + except (json.JSONDecodeError, TypeError): + args = {} + call: Dict = {"function": {"name": fn.get("name", ""), "arguments": args or {}}} + if tc.get("id"): + call["id"] = tc["id"] + new_calls.append(call) + nm["tool_calls"] = new_calls + + # 2. Multimodal content list -> native content string + images array. + content = nm.get("content") + if isinstance(content, list): + text_parts: List[str] = [] + images: List[str] = list(nm.get("images") or []) + for block in content: + if not isinstance(block, dict): + continue + btype = block.get("type") + if btype == "text": + t = block.get("text") + if t: + text_parts.append(str(t)) + elif btype == "image_url": + url = (block.get("image_url") or {}).get("url", "") + if not url: + continue + if url.startswith("data:"): + # Strip the ``data:[...];base64,`` prefix — native + # Ollama wants only the base64 bytes. + _, _, b64 = url.partition(",") + if b64: + images.append(b64) + else: + # Native Ollama images[] is base64-only; it does + # not fetch HTTP URLs. Skip unsupported schemes + # rather than sending a non-base64 string that the + # model silently ignores. + logger.warning( + "Skipping non-data image_url (Ollama images[] " + "requires base64): %s", + url[:80], + ) + nm["content"] = "\n".join(text_parts).strip() + if images: + nm["images"] = images + out.append(nm) return out +# Backward-compatible alias for callers/tests that imported the older name +# (it only handled tool messages originally — issue #4723 broadened scope). +_ollama_normalize_tool_messages = _ollama_normalize_messages + + def _build_ollama_payload( model: str, messages: List[Dict], @@ -404,7 +463,7 @@ def _build_ollama_payload( """ payload: Dict = { "model": model, - "messages": _ollama_normalize_tool_messages(messages), + "messages": _ollama_normalize_messages(messages), "stream": stream, } options: Dict = {} diff --git a/tests/test_ollama_multimodal.py b/tests/test_ollama_multimodal.py new file mode 100644 index 000000000..3b473e1cb --- /dev/null +++ b/tests/test_ollama_multimodal.py @@ -0,0 +1,109 @@ +"""Regression tests for Ollama-native multimodal image routing (issue #4723). + +Odysseus builds user messages in OpenAI style:: + + {"role": "user", "content": [ + {"type": "text", "text": "..."}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAA"}}, + ]} + +Native Ollama ``/api/chat`` does **not** accept a list for ``content``. It +expects ``content`` to be a string and images carried separately on +``images`` (a list of raw base64 strings, no ``data:`` prefix). Without +this conversion the image block silently never reaches the vision model — +the model reports "I can't see the image" even though it is vision-capable +and the request succeeded. +""" +from src import llm_core + + +def _multimodal_msg(): + return { + "role": "user", + "content": [ + {"type": "text", "text": "What is in this picture?"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}}, + {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,BBBB"}}, + ], + } + + +def test_ollama_payload_converts_openai_image_blocks_to_native_images_array(): + payload = llm_core._build_ollama_payload( + "gemma4:e4b", [_multimodal_msg()], temperature=0.0, max_tokens=0, + ) + msg = payload["messages"][0] + # Content must be a string, not a list — native Ollama rejects lists. + assert isinstance(msg["content"], str) + assert "What is in this picture?" in msg["content"] + # Base64 data extracted into the native images array (no data: prefix). + assert msg["images"] == ["AAAA", "BBBB"] + + +def test_ollama_payload_skips_http_image_url(): + """Non-data-URI image_url values are skipped with a warning because + native Ollama images[] accepts base64 only.""" + msg = { + "role": "user", + "content": [ + {"type": "text", "text": "Look"}, + {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}}, + ], + } + payload = llm_core._build_ollama_payload("gemma4:e4b", [msg], temperature=0.0, max_tokens=0) + out = payload["messages"][0] + assert out["content"] == "Look" + # HTTP URL is NOT added to images — Ollama cannot fetch it. + assert "images" not in out + + +def test_ollama_payload_preserves_native_images_array(): + """If the caller already used Ollama's native shape, leave it alone.""" + msg = { + "role": "user", + "content": "Describe", + "images": ["XXXX"], + } + payload = llm_core._build_ollama_payload("gemma4:e4b", [msg], temperature=0.0, max_tokens=0) + out = payload["messages"][0] + assert out["content"] == "Describe" + assert out["images"] == ["XXXX"] + + +def test_ollama_payload_merges_native_and_openai_images(): + """A message that carries both native ``images`` and OpenAI ``image_url`` + blocks (e.g. assembled by different code paths) must produce one combined + list rather than drop either half.""" + msg = { + "role": "user", + "content": [ + {"type": "text", "text": "Hi"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,OPENAI"}}, + ], + "images": ["NATIVE"], + } + payload = llm_core._build_ollama_payload("gemma4:e4b", [msg], temperature=0.0, max_tokens=0) + out = payload["messages"][0] + assert out["content"] == "Hi" + assert out["images"] == ["NATIVE", "OPENAI"] + + +def test_ollama_payload_text_only_message_untouched(): + msgs = [{"role": "user", "content": "hello"}] + payload = llm_core._build_ollama_payload("gemma4:e4b", msgs, temperature=0.0, max_tokens=0) + assert payload["messages"][0] == {"role": "user", "content": "hello"} + + +def test_ollama_payload_string_content_with_only_image_block(): + """A message whose content list has only image_url blocks (no text part) + still yields a non-empty content string so native Ollama accepts it.""" + msg = { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,QQ=="}}, + ], + } + payload = llm_core._build_ollama_payload("gemma4:e4b", [msg], temperature=0.0, max_tokens=0) + out = payload["messages"][0] + assert isinstance(out["content"], str) + assert out["images"] == ["QQ=="]