fix: Images cannot be seen by model that is vision capable (#4726)

* fix: Images cannot be seen by model that is vision capable * fix: skip http(s) image_url for Ollama (images[] is base64-only) --------- Co-authored-by: michaelxer <michaelxer@users.noreply.github.com>
2026-06-23 13:15:29 -04:00 · 2026-06-23 15:32:57 +07:00
parent bd9149f79a
commit e8175c9535
2 changed files with 194 additions and 26 deletions
@@ -345,43 +345,102 @@ def _normalize_ollama_url(url: str) -> str:
    return base.rstrip("/") + "/chat"


-def _ollama_normalize_tool_messages(messages: List[Dict]) -> List[Dict]:
+def _ollama_normalize_messages(messages: List[Dict]) -> List[Dict]:
    """Adapt Odysseus' canonical OpenAI-style messages to native Ollama /api/chat.

-    Odysseus carries assistant tool calls in the OpenAI shape, where
-    `function.arguments` is a JSON *string*. Native Ollama expects it to be a
-    JSON *object*; given the string it fails the whole request with HTTP 400
-    "Value looks like object, but can't find closing '}' symbol", which aborts
-    every follow-up (tool-result) round. Parse the arguments back into an object
-    here, on a shallow copy, leaving non-tool messages untouched. The opaque
-    Gemini `extra_content` (thought_signature) is dropped — it is meaningless to
-    Ollama and only matters when the conversation is replayed to Gemini.
+    Two shape mismatches silently break requests:
+
+    1. Tool calls: Odysseus carries `function.arguments` as a JSON *string*.
+       Native Ollama expects a JSON *object* and rejects the string form with
+       HTTP 400 ("Value looks like object, but can't find closing '}' symbol"),
+       aborting every follow-up (tool-result) round. Parse the arguments back
+       into an object here, on a shallow copy, leaving non-tool messages
+       untouched. The opaque Gemini `extra_content` (thought_signature) is
+       dropped — it is meaningless to Ollama and only matters when the
+       conversation is replayed to Gemini.
+
+    2. Images (issue #4723): Odysseus carries multimodal user content as an
+       OpenAI-style list ``[{type: "text", ...}, {type: "image_url",
+       image_url: {url: "data:image/...;base64,XXX"}}, ...]``. Native Ollama
+       does not accept a list for ``content`` — it wants ``content`` as a
+       string plus a separate ``images`` array of raw base64 strings (no
+       ``data:`` prefix). Without this conversion the image blocks pass
+       through untouched, the vision-capable model never sees the picture,
+       and the user gets "I can't see any image" even though the request
+       succeeded.
    """
    out: List[Dict] = []
    for m in messages or []:
-        tcs = m.get("tool_calls") if isinstance(m, dict) else None
-        if not tcs:
+        if not isinstance(m, dict):
            out.append(m)
            continue
-        new_calls = []
-        for tc in tcs:
-            fn = tc.get("function") or {}
-            args = fn.get("arguments")
-            if isinstance(args, str):
-                try:
-                    args = json.loads(args) if args.strip() else {}
-                except (json.JSONDecodeError, TypeError):
-                    args = {}
-            call: Dict = {"function": {"name": fn.get("name", ""), "arguments": args or {}}}
-            if tc.get("id"):
-                call["id"] = tc["id"]
-            new_calls.append(call)
+
        nm = dict(m)
-        nm["tool_calls"] = new_calls
+
+        # 1. Tool-call argument strings -> objects.
+        tcs = nm.get("tool_calls")
+        if tcs:
+            new_calls = []
+            for tc in tcs:
+                fn = tc.get("function") or {}
+                args = fn.get("arguments")
+                if isinstance(args, str):
+                    try:
+                        args = json.loads(args) if args.strip() else {}
+                    except (json.JSONDecodeError, TypeError):
+                        args = {}
+                call: Dict = {"function": {"name": fn.get("name", ""), "arguments": args or {}}}
+                if tc.get("id"):
+                    call["id"] = tc["id"]
+                new_calls.append(call)
+            nm["tool_calls"] = new_calls
+
+        # 2. Multimodal content list -> native content string + images array.
+        content = nm.get("content")
+        if isinstance(content, list):
+            text_parts: List[str] = []
+            images: List[str] = list(nm.get("images") or [])
+            for block in content:
+                if not isinstance(block, dict):
+                    continue
+                btype = block.get("type")
+                if btype == "text":
+                    t = block.get("text")
+                    if t:
+                        text_parts.append(str(t))
+                elif btype == "image_url":
+                    url = (block.get("image_url") or {}).get("url", "")
+                    if not url:
+                        continue
+                    if url.startswith("data:"):
+                        # Strip the ``data:[...];base64,`` prefix — native
+                        # Ollama wants only the base64 bytes.
+                        _, _, b64 = url.partition(",")
+                        if b64:
+                            images.append(b64)
+                    else:
+                        # Native Ollama images[] is base64-only; it does
+                        # not fetch HTTP URLs.  Skip unsupported schemes
+                        # rather than sending a non-base64 string that the
+                        # model silently ignores.
+                        logger.warning(
+                            "Skipping non-data image_url (Ollama images[] "
+                            "requires base64): %s",
+                            url[:80],
+                        )
+            nm["content"] = "\n".join(text_parts).strip()
+            if images:
+                nm["images"] = images
+
        out.append(nm)
    return out


+# Backward-compatible alias for callers/tests that imported the older name
+# (it only handled tool messages originally — issue #4723 broadened scope).
+_ollama_normalize_tool_messages = _ollama_normalize_messages
+
+
 def _build_ollama_payload(
    model: str,
    messages: List[Dict],
@@ -404,7 +463,7 @@ def _build_ollama_payload(
    """
    payload: Dict = {
        "model": model,
-        "messages": _ollama_normalize_tool_messages(messages),
+        "messages": _ollama_normalize_messages(messages),
        "stream": stream,
    }
    options: Dict = {}
@@ -0,0 +1,109 @@
+"""Regression tests for Ollama-native multimodal image routing (issue #4723).
+
+Odysseus builds user messages in OpenAI style::
+
+    {"role": "user", "content": [
+        {"type": "text", "text": "..."},
+        {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAA"}},
+    ]}
+
+Native Ollama ``/api/chat`` does **not** accept a list for ``content``. It
+expects ``content`` to be a string and images carried separately on
+``images`` (a list of raw base64 strings, no ``data:`` prefix). Without
+this conversion the image block silently never reaches the vision model —
+the model reports "I can't see the image" even though it is vision-capable
+and the request succeeded.
+"""
+from src import llm_core
+
+
+def _multimodal_msg():
+    return {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "What is in this picture?"},
+            {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}},
+            {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,BBBB"}},
+        ],
+    }
+
+
+def test_ollama_payload_converts_openai_image_blocks_to_native_images_array():
+    payload = llm_core._build_ollama_payload(
+        "gemma4:e4b", [_multimodal_msg()], temperature=0.0, max_tokens=0,
+    )
+    msg = payload["messages"][0]
+    # Content must be a string, not a list — native Ollama rejects lists.
+    assert isinstance(msg["content"], str)
+    assert "What is in this picture?" in msg["content"]
+    # Base64 data extracted into the native images array (no data: prefix).
+    assert msg["images"] == ["AAAA", "BBBB"]
+
+
+def test_ollama_payload_skips_http_image_url():
+    """Non-data-URI image_url values are skipped with a warning because
+    native Ollama images[] accepts base64 only."""
+    msg = {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Look"},
+            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
+        ],
+    }
+    payload = llm_core._build_ollama_payload("gemma4:e4b", [msg], temperature=0.0, max_tokens=0)
+    out = payload["messages"][0]
+    assert out["content"] == "Look"
+    # HTTP URL is NOT added to images — Ollama cannot fetch it.
+    assert "images" not in out
+
+
+def test_ollama_payload_preserves_native_images_array():
+    """If the caller already used Ollama's native shape, leave it alone."""
+    msg = {
+        "role": "user",
+        "content": "Describe",
+        "images": ["XXXX"],
+    }
+    payload = llm_core._build_ollama_payload("gemma4:e4b", [msg], temperature=0.0, max_tokens=0)
+    out = payload["messages"][0]
+    assert out["content"] == "Describe"
+    assert out["images"] == ["XXXX"]
+
+
+def test_ollama_payload_merges_native_and_openai_images():
+    """A message that carries both native ``images`` and OpenAI ``image_url``
+    blocks (e.g. assembled by different code paths) must produce one combined
+    list rather than drop either half."""
+    msg = {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Hi"},
+            {"type": "image_url", "image_url": {"url": "data:image/png;base64,OPENAI"}},
+        ],
+        "images": ["NATIVE"],
+    }
+    payload = llm_core._build_ollama_payload("gemma4:e4b", [msg], temperature=0.0, max_tokens=0)
+    out = payload["messages"][0]
+    assert out["content"] == "Hi"
+    assert out["images"] == ["NATIVE", "OPENAI"]
+
+
+def test_ollama_payload_text_only_message_untouched():
+    msgs = [{"role": "user", "content": "hello"}]
+    payload = llm_core._build_ollama_payload("gemma4:e4b", msgs, temperature=0.0, max_tokens=0)
+    assert payload["messages"][0] == {"role": "user", "content": "hello"}
+
+
+def test_ollama_payload_string_content_with_only_image_block():
+    """A message whose content list has only image_url blocks (no text part)
+    still yields a non-empty content string so native Ollama accepts it."""
+    msg = {
+        "role": "user",
+        "content": [
+            {"type": "image_url", "image_url": {"url": "data:image/png;base64,QQ=="}},
+        ],
+    }
+    payload = llm_core._build_ollama_payload("gemma4:e4b", [msg], temperature=0.0, max_tokens=0)
+    out = payload["messages"][0]
+    assert isinstance(out["content"], str)
+    assert out["images"] == ["QQ=="]