mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
Recognize local vision models so their images aren't dropped (#185)
An image attachment only got through if the model name was on a short built-in list. Anything else was treated as text-only and the image was quietly dropped, so the model never saw it. That left out a lot of the smaller vision models you can run locally (moondream was the one I hit). Pulled the check into is_vision_model() in chat_helpers, broadened it to cover those, and added a test. Models that already worked are unaffected. Fixes #124.
This commit is contained in:
committed by
GitHub
parent
32e7cec362
commit
91d3511580
+2
-12
@@ -15,7 +15,7 @@ from src.constants import (
|
||||
UPLOAD_DIR,
|
||||
)
|
||||
from core.models import ChatMessage
|
||||
from src.chat_helpers import extract_urls
|
||||
from src.chat_helpers import extract_urls, is_vision_model
|
||||
from src.document_processor import build_user_content, analyze_image_with_vl_result
|
||||
from src.youtube_handler import (
|
||||
is_youtube_url,
|
||||
@@ -147,17 +147,7 @@ class ChatHandler:
|
||||
# Analyze images — skip if vision disabled, or if main model is vision-capable
|
||||
from src.settings import get_setting
|
||||
vision_enabled = get_setting("vision_enabled", True)
|
||||
VISION_KEYWORDS = [
|
||||
"gpt-4o", "gpt-4.1", "gpt-4.5", "gpt-4-turbo", "gpt-4-vision",
|
||||
"claude-sonnet", "claude-opus", "claude-haiku",
|
||||
"gemini", "llava", "pixtral", "qwen2-vl", "qwen-vl", "qwen3-vl", "qwen3vl", "minicpm",
|
||||
]
|
||||
main_model = (sess.model or "").lower()
|
||||
main_is_vision = any(kw in main_model for kw in VISION_KEYWORDS)
|
||||
# Also match models with "vl" in the name (e.g. Qwen3VL, InternVL, any *-VL-*)
|
||||
if not main_is_vision:
|
||||
import re
|
||||
main_is_vision = bool(re.search(r'\dvl|vl\d|[-_]vl[-_.\d]|vl-', main_model))
|
||||
main_is_vision = is_vision_model(sess.model or "")
|
||||
|
||||
# Read uploads DB once and index by id (was read twice + linear-scanned per attachment)
|
||||
files_by_id: Dict[str, Dict] = {}
|
||||
|
||||
@@ -23,6 +23,36 @@ def extract_urls(text: str) -> List[str]:
|
||||
return cleaned_urls
|
||||
|
||||
|
||||
# Model-name substrings that signal native image input. A missed match here
# silently drops the image from the chat request (it gets swapped for a text
# caption), so the model never sees it. Keep this broad, especially for local
# models (Ollama/llama.cpp) that ship under many names. See issue #124.
_VISION_MODEL_KEYWORDS = (
    # hosted
    "gpt-4o", "gpt-4.1", "gpt-4.5", "gpt-4-turbo", "gpt-4-vision",
    # "claude-<tier>" covers tier-first ids (e.g. "claude-sonnet-4");
    # "claude-3" covers the older version-first ids (e.g.
    # "claude-3-5-sonnet-20241022"), which no tier-first keyword matches.
    "claude-sonnet", "claude-opus", "claude-haiku", "claude-3", "gemini",
    # open / local
    "vision", "llava", "bakllava", "moondream", "pixtral", "minicpm",
    "internvl", "cogvlm", "qwen-vl", "qwen2-vl", "qwen3-vl", "qwen3vl",
)

# Catches the "*-VL-*" / "*VL*" family not covered by a literal keyword above
# (e.g. Qwen2.5-VL and various tags): a standalone "vl" token, plus "vlm".
# The lookarounds only reject ASCII lowercase letters, so digits and
# punctuation next to "vl" still match (e.g. "qwen2.5vl"). The pattern is
# case-sensitive and must be applied to an already-lowercased name.
_VISION_VL_RE = re.compile(r'(?<![a-z])vl(?![a-z])|vlm')


def is_vision_model(model_name: str) -> bool:
    """Best-effort check of whether a model can natively accept images.

    Decides whether image attachments get passed through to the model or
    swapped for a separate caption. Err toward True, since a false negative
    drops the image entirely. See issue #124.

    Args:
        model_name: Model identifier to classify. Matching is
            case-insensitive. ``None`` and the empty string are tolerated
            and classified as not vision-capable.

    Returns:
        True if the lowercased name contains one of the known vision
        keywords or a "vl" / "vlm" marker, otherwise False.
    """
    # Normalise once: the keywords and the regex are both lowercase-only.
    m = (model_name or "").lower()
    if any(kw in m for kw in _VISION_MODEL_KEYWORDS):
        return True
    return bool(_VISION_VL_RE.search(m))
|
||||
|
||||
|
||||
def validate_message(message: str) -> str:
|
||||
"""Validate message input."""
|
||||
if not message:
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
"""Tests for is_vision_model (issue #124).
|
||||
|
||||
Local vision models served through Ollama/llama.cpp show up under many
|
||||
names. If one isn't recognized as vision-capable, the image attachment is
|
||||
stripped from the request before it reaches the model, so it silently never
|
||||
sees the picture.
|
||||
"""
|
||||
from src.chat_helpers import is_vision_model
|
||||
|
||||
|
||||
def test_recognizes_local_and_hosted_vision_models():
    """Every listed vision-capable name, local or hosted, must be detected."""
    # the ones #124 missed
    previously_missed = (
        "moondream", "moondream:latest",
        "llama3.2-vision:11b", "granite3.2-vision",
        "qwen2.5-vl:7b", "qwen2.5vl", "internvl2.5", "cogvlm",
    )
    # already worked, keep them working
    already_recognized = (
        "llava", "llava:7b", "bakllava", "minicpm-v",
        "gpt-4o", "claude-sonnet-4", "gemini-2.0-flash", "pixtral-12b",
    )
    for name in previously_missed + already_recognized:
        assert is_vision_model(name), f"{name!r} should be detected as vision-capable"
|
||||
|
||||
|
||||
def test_text_only_models_not_flagged():
    """Text-only model names, and the empty name, must not be flagged."""
    text_only_names = (
        "qwen2.5:3b",
        "mistral",
        "llama3.1:8b",
        "deepseek-r1",
        "phi3",
        "vicuna",
        "",
    )
    for name in text_only_names:
        flagged = is_vision_model(name)
        assert not flagged, f"{name!r} should not be flagged as vision"
|
||||
|
||||
|
||||
def test_none_is_safe():
    """Passing None must yield exactly False rather than raising."""
    verdict = is_vision_model(None)
    assert verdict is False
|
||||
Reference in New Issue
Block a user