mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-23 13:15:29 -04:00
fix: Images cannot be seen by model that is vision capable (#4726)
* fix: Images cannot be seen by model that is vision capable * fix: skip http(s) image_url for Ollama (images[] is base64-only) --------- Co-authored-by: michaelxer <michaelxer@users.noreply.github.com>
This commit is contained in:
+85
-26
@@ -345,43 +345,102 @@ def _normalize_ollama_url(url: str) -> str:
|
||||
return base.rstrip("/") + "/chat"
|
||||
|
||||
|
||||
def _ollama_normalize_tool_messages(messages: List[Dict]) -> List[Dict]:
|
||||
def _ollama_normalize_messages(messages: List[Dict]) -> List[Dict]:
|
||||
"""Adapt Odysseus' canonical OpenAI-style messages to native Ollama /api/chat.
|
||||
|
||||
Odysseus carries assistant tool calls in the OpenAI shape, where
|
||||
`function.arguments` is a JSON *string*. Native Ollama expects it to be a
|
||||
JSON *object*; given the string it fails the whole request with HTTP 400
|
||||
"Value looks like object, but can't find closing '}' symbol", which aborts
|
||||
every follow-up (tool-result) round. Parse the arguments back into an object
|
||||
here, on a shallow copy, leaving non-tool messages untouched. The opaque
|
||||
Gemini `extra_content` (thought_signature) is dropped — it is meaningless to
|
||||
Ollama and only matters when the conversation is replayed to Gemini.
|
||||
Two shape mismatches silently break requests:
|
||||
|
||||
1. Tool calls: Odysseus carries `function.arguments` as a JSON *string*.
|
||||
Native Ollama expects a JSON *object* and rejects the string form with
|
||||
HTTP 400 ("Value looks like object, but can't find closing '}' symbol"),
|
||||
aborting every follow-up (tool-result) round. Parse the arguments back
|
||||
into an object here, on a shallow copy, leaving non-tool messages
|
||||
untouched. The opaque Gemini `extra_content` (thought_signature) is
|
||||
dropped — it is meaningless to Ollama and only matters when the
|
||||
conversation is replayed to Gemini.
|
||||
|
||||
2. Images (issue #4723): Odysseus carries multimodal user content as an
|
||||
OpenAI-style list ``[{type: "text", ...}, {type: "image_url",
|
||||
image_url: {url: "data:image/...;base64,XXX"}}, ...]``. Native Ollama
|
||||
does not accept a list for ``content`` — it wants ``content`` as a
|
||||
string plus a separate ``images`` array of raw base64 strings (no
|
||||
``data:`` prefix). Without this conversion the image blocks pass
|
||||
through untouched, the vision-capable model never sees the picture,
|
||||
and the user gets "I can't see any image" even though the request
|
||||
succeeded.
|
||||
"""
|
||||
out: List[Dict] = []
|
||||
for m in messages or []:
|
||||
tcs = m.get("tool_calls") if isinstance(m, dict) else None
|
||||
if not tcs:
|
||||
if not isinstance(m, dict):
|
||||
out.append(m)
|
||||
continue
|
||||
new_calls = []
|
||||
for tc in tcs:
|
||||
fn = tc.get("function") or {}
|
||||
args = fn.get("arguments")
|
||||
if isinstance(args, str):
|
||||
try:
|
||||
args = json.loads(args) if args.strip() else {}
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
args = {}
|
||||
call: Dict = {"function": {"name": fn.get("name", ""), "arguments": args or {}}}
|
||||
if tc.get("id"):
|
||||
call["id"] = tc["id"]
|
||||
new_calls.append(call)
|
||||
|
||||
nm = dict(m)
|
||||
nm["tool_calls"] = new_calls
|
||||
|
||||
# 1. Tool-call argument strings -> objects.
|
||||
tcs = nm.get("tool_calls")
|
||||
if tcs:
|
||||
new_calls = []
|
||||
for tc in tcs:
|
||||
fn = tc.get("function") or {}
|
||||
args = fn.get("arguments")
|
||||
if isinstance(args, str):
|
||||
try:
|
||||
args = json.loads(args) if args.strip() else {}
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
args = {}
|
||||
call: Dict = {"function": {"name": fn.get("name", ""), "arguments": args or {}}}
|
||||
if tc.get("id"):
|
||||
call["id"] = tc["id"]
|
||||
new_calls.append(call)
|
||||
nm["tool_calls"] = new_calls
|
||||
|
||||
# 2. Multimodal content list -> native content string + images array.
|
||||
content = nm.get("content")
|
||||
if isinstance(content, list):
|
||||
text_parts: List[str] = []
|
||||
images: List[str] = list(nm.get("images") or [])
|
||||
for block in content:
|
||||
if not isinstance(block, dict):
|
||||
continue
|
||||
btype = block.get("type")
|
||||
if btype == "text":
|
||||
t = block.get("text")
|
||||
if t:
|
||||
text_parts.append(str(t))
|
||||
elif btype == "image_url":
|
||||
url = (block.get("image_url") or {}).get("url", "")
|
||||
if not url:
|
||||
continue
|
||||
if url.startswith("data:"):
|
||||
# Strip the ``data:[...];base64,`` prefix — native
|
||||
# Ollama wants only the base64 bytes.
|
||||
_, _, b64 = url.partition(",")
|
||||
if b64:
|
||||
images.append(b64)
|
||||
else:
|
||||
# Native Ollama images[] is base64-only; it does
|
||||
# not fetch HTTP URLs. Skip unsupported schemes
|
||||
# rather than sending a non-base64 string that the
|
||||
# model silently ignores.
|
||||
logger.warning(
|
||||
"Skipping non-data image_url (Ollama images[] "
|
||||
"requires base64): %s",
|
||||
url[:80],
|
||||
)
|
||||
nm["content"] = "\n".join(text_parts).strip()
|
||||
if images:
|
||||
nm["images"] = images
|
||||
|
||||
out.append(nm)
|
||||
return out
|
||||
|
||||
|
||||
# Backward-compatible alias for callers/tests that imported the older name
|
||||
# (it only handled tool messages originally — issue #4723 broadened scope).
|
||||
_ollama_normalize_tool_messages = _ollama_normalize_messages
|
||||
|
||||
|
||||
def _build_ollama_payload(
|
||||
model: str,
|
||||
messages: List[Dict],
|
||||
@@ -404,7 +463,7 @@ def _build_ollama_payload(
|
||||
"""
|
||||
payload: Dict = {
|
||||
"model": model,
|
||||
"messages": _ollama_normalize_tool_messages(messages),
|
||||
"messages": _ollama_normalize_messages(messages),
|
||||
"stream": stream,
|
||||
}
|
||||
options: Dict = {}
|
||||
|
||||
@@ -0,0 +1,109 @@
|
||||
"""Regression tests for Ollama-native multimodal image routing (issue #4723).
|
||||
|
||||
Odysseus builds user messages in OpenAI style::
|
||||
|
||||
{"role": "user", "content": [
|
||||
{"type": "text", "text": "..."},
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,AAA"}},
|
||||
]}
|
||||
|
||||
Native Ollama ``/api/chat`` does **not** accept a list for ``content``. It
|
||||
expects ``content`` to be a string and images carried separately on
|
||||
``images`` (a list of raw base64 strings, no ``data:`` prefix). Without
|
||||
this conversion the image block silently never reaches the vision model —
|
||||
the model reports "I can't see the image" even though it is vision-capable
|
||||
and the request succeeded.
|
||||
"""
|
||||
from src import llm_core
|
||||
|
||||
|
||||
def _multimodal_msg():
|
||||
return {
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "What is in this picture?"},
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}},
|
||||
{"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,BBBB"}},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def test_ollama_payload_converts_openai_image_blocks_to_native_images_array():
|
||||
payload = llm_core._build_ollama_payload(
|
||||
"gemma4:e4b", [_multimodal_msg()], temperature=0.0, max_tokens=0,
|
||||
)
|
||||
msg = payload["messages"][0]
|
||||
# Content must be a string, not a list — native Ollama rejects lists.
|
||||
assert isinstance(msg["content"], str)
|
||||
assert "What is in this picture?" in msg["content"]
|
||||
# Base64 data extracted into the native images array (no data: prefix).
|
||||
assert msg["images"] == ["AAAA", "BBBB"]
|
||||
|
||||
|
||||
def test_ollama_payload_skips_http_image_url():
|
||||
"""Non-data-URI image_url values are skipped with a warning because
|
||||
native Ollama images[] accepts base64 only."""
|
||||
msg = {
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Look"},
|
||||
{"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
|
||||
],
|
||||
}
|
||||
payload = llm_core._build_ollama_payload("gemma4:e4b", [msg], temperature=0.0, max_tokens=0)
|
||||
out = payload["messages"][0]
|
||||
assert out["content"] == "Look"
|
||||
# HTTP URL is NOT added to images — Ollama cannot fetch it.
|
||||
assert "images" not in out
|
||||
|
||||
|
||||
def test_ollama_payload_preserves_native_images_array():
|
||||
"""If the caller already used Ollama's native shape, leave it alone."""
|
||||
msg = {
|
||||
"role": "user",
|
||||
"content": "Describe",
|
||||
"images": ["XXXX"],
|
||||
}
|
||||
payload = llm_core._build_ollama_payload("gemma4:e4b", [msg], temperature=0.0, max_tokens=0)
|
||||
out = payload["messages"][0]
|
||||
assert out["content"] == "Describe"
|
||||
assert out["images"] == ["XXXX"]
|
||||
|
||||
|
||||
def test_ollama_payload_merges_native_and_openai_images():
|
||||
"""A message that carries both native ``images`` and OpenAI ``image_url``
|
||||
blocks (e.g. assembled by different code paths) must produce one combined
|
||||
list rather than drop either half."""
|
||||
msg = {
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Hi"},
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,OPENAI"}},
|
||||
],
|
||||
"images": ["NATIVE"],
|
||||
}
|
||||
payload = llm_core._build_ollama_payload("gemma4:e4b", [msg], temperature=0.0, max_tokens=0)
|
||||
out = payload["messages"][0]
|
||||
assert out["content"] == "Hi"
|
||||
assert out["images"] == ["NATIVE", "OPENAI"]
|
||||
|
||||
|
||||
def test_ollama_payload_text_only_message_untouched():
|
||||
msgs = [{"role": "user", "content": "hello"}]
|
||||
payload = llm_core._build_ollama_payload("gemma4:e4b", msgs, temperature=0.0, max_tokens=0)
|
||||
assert payload["messages"][0] == {"role": "user", "content": "hello"}
|
||||
|
||||
|
||||
def test_ollama_payload_string_content_with_only_image_block():
|
||||
"""A message whose content list has only image_url blocks (no text part)
|
||||
still yields a non-empty content string so native Ollama accepts it."""
|
||||
msg = {
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,QQ=="}},
|
||||
],
|
||||
}
|
||||
payload = llm_core._build_ollama_payload("gemma4:e4b", [msg], temperature=0.0, max_tokens=0)
|
||||
out = payload["messages"][0]
|
||||
assert isinstance(out["content"], str)
|
||||
assert out["images"] == ["QQ=="]
|
||||
Reference in New Issue
Block a user