mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-15 17:25:26 -04:00
fix(llm): suppress thinking mode for qwen3/gemma4 on Ollama /v1 endpoint (#3228)
* fix(llm): suppress thinking for qwen3/gemma4 on Ollama /v1 compat endpoint When using qwen3, QwQ, gemma4, or other thinking models via Ollama's OpenAI-compatible /v1 endpoint, the model routes all output into its <think>...</think> reasoning block. Since Odysseus strips thinking content from round_response and only accumulates native tool_calls, this produces a round with 0 chars, 0 native calls, 0 tool blocks — the agent appears to silently do nothing. Root cause: Odysseus classifies the /v1 endpoint as provider="openai" (not "ollama"), so the payload is built as a standard OpenAI payload without any Ollama-specific options. Ollama's /v1 endpoint accepts "think": false as a top-level parameter to suppress extended thinking, but this was never sent. Fix: - Add _is_ollama_openai_compat_url() to detect local Ollama /v1 URLs - Inject "think": false in both stream_llm and llm_call_async for thinking models (qwen3, QwQ, gemma4, DeepSeek-R1, etc.) on this endpoint Verified with qwen3:14b on Ollama 0.24: with think=False the model correctly emits native tool_calls in a single streaming chunk and the agent executes bash/file/web tools as expected. * fix(llm): extend _is_ollama_openai_compat_url to match localhost on any port Per reviewer feedback on PR #3228: 1. Generalize host detection to mirror _is_ollama_native_url: match any localhost/127.0.0.1/0.0.0.0/::1 host (not just port 11434) so that custom OLLAMA_HOST ports and container remaps are also covered. 2. Add tests/test_llm_core_ollama_thinking.py covering: - _is_ollama_openai_compat_url for all positive/negative URL cases including IPv6, non-default port, native /api path, and real OpenAI - Payload injection: think:false set for Ollama /v1 thinking model, not set for non-thinking model, not set for real OpenAI endpoint, and set for localhost on a non-default port (the new case)
This commit is contained in:
@@ -276,6 +276,24 @@ def _is_ollama_native_url(url: str) -> bool:
|
||||
return local_ollama_host and (path == "" or path == "/api" or path.startswith("/api/"))
|
||||
|
||||
|
||||
def _is_ollama_openai_compat_url(url: str) -> bool:
|
||||
"""Return True for local Ollama's OpenAI-compatible /v1 surface.
|
||||
|
||||
Mirrors the host detection used by ``_is_ollama_native_url`` so that the
|
||||
two helpers stay in lockstep: a localhost Ollama on a non-default port
|
||||
(custom ``OLLAMA_HOST``, reverse proxy, container port remap) is treated
|
||||
the same way here as it is on the native ``/api`` path.
|
||||
"""
|
||||
try:
|
||||
parsed = urlparse(url or "")
|
||||
except Exception:
|
||||
return False
|
||||
host = parsed.hostname or ""
|
||||
path = (parsed.path or "").rstrip("/")
|
||||
local_ollama_host = host in {"localhost", "127.0.0.1", "0.0.0.0", "::1"} or parsed.port == 11434
|
||||
return local_ollama_host and (path == "/v1" or path.startswith("/v1/"))
|
||||
|
||||
|
||||
def _ollama_api_root(url: str) -> str:
|
||||
"""Return a native Ollama API root such as https://ollama.com/api."""
|
||||
url = (url or "").strip().rstrip("/")
|
||||
@@ -1344,6 +1362,9 @@ async def llm_call_async(
|
||||
if max_tokens and max_tokens > 0:
|
||||
tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens"
|
||||
payload[tok_key] = max_tokens
|
||||
# Suppress thinking for qwen3/gemma4 on Ollama /v1 — same as stream_llm.
|
||||
if _is_ollama_openai_compat_url(url) and _supports_thinking(model):
|
||||
payload["think"] = False
|
||||
|
||||
if _is_host_dead(target_url):
|
||||
raise HTTPException(503, f"Upstream {_host_key(target_url)} marked unreachable (cooldown active)")
|
||||
@@ -1461,6 +1482,11 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
|
||||
payload[tok_key] = max_tokens
|
||||
if tools:
|
||||
payload["tools"] = tools
|
||||
# For Ollama's OpenAI-compat /v1 endpoint with thinking models (qwen3,
|
||||
# gemma4, etc.), suppress thinking so tool calls aren't swallowed inside
|
||||
# <think> blocks. Ollama /v1 accepts "think": false as a top-level param.
|
||||
if _is_ollama_openai_compat_url(url) and _supports_thinking(model):
|
||||
payload["think"] = False
|
||||
h = _provider_headers(provider, headers)
|
||||
if provider == "copilot":
|
||||
from src.copilot import apply_request_headers
|
||||
|
||||
@@ -0,0 +1,165 @@
|
||||
"""Tests for Ollama /v1 thinking-suppression helpers.
|
||||
|
||||
Covers:
|
||||
- _is_ollama_openai_compat_url: URL classification (local host + /v1 path)
|
||||
- think: false is injected into the payload for Ollama /v1 thinking models
|
||||
- think: false is NOT injected for non-thinking models or non-Ollama /v1 endpoints
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
from src import llm_core
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fake HTTP client — captures the outgoing payload without network I/O
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class _FakeResp:
|
||||
status_code = 200
|
||||
|
||||
async def aiter_lines(self):
|
||||
# Yield a minimal done event so stream_llm exits cleanly
|
||||
yield json.dumps({"choices": [{"delta": {"content": "ok"}, "finish_reason": "stop"}]})
|
||||
yield "data: [DONE]"
|
||||
|
||||
async def aread(self):
|
||||
return b""
|
||||
|
||||
|
||||
class _FakeStreamCtx:
|
||||
def __init__(self, captured):
|
||||
self._captured = captured
|
||||
|
||||
async def __aenter__(self):
|
||||
return _FakeResp()
|
||||
|
||||
async def __aexit__(self, *a):
|
||||
return False
|
||||
|
||||
|
||||
class _FakeClient:
|
||||
"""Minimal stand-in for httpx.AsyncClient that captures request payload."""
|
||||
|
||||
def __init__(self):
|
||||
self.captured_payload = {}
|
||||
|
||||
def stream(self, method, url, **kw):
|
||||
self.captured_payload = kw.get("json") or {}
|
||||
return _FakeStreamCtx(self.captured_payload)
|
||||
|
||||
|
||||
def _capture_payload(monkeypatch, url, model):
    """Drive ``llm_core.stream_llm`` once and return the JSON body it sent.

    The HTTP client and several ``llm_core`` helpers are replaced with stubs
    via ``monkeypatch`` so the call performs no network I/O.
    """
    client = _FakeClient()

    # Attribute name on llm_core -> replacement callable.
    stubs = {
        "_get_http_client": lambda: client,
        "_is_host_dead": lambda u: False,
        "note_model_activity": lambda *a, **k: None,
        "_clear_host_dead": lambda *a, **k: None,
        "get_context_length": lambda u, m: 32768,
    }
    for attr_name, replacement in stubs.items():
        monkeypatch.setattr(llm_core, attr_name, replacement)

    async def _drain():
        # Exhaust the generator; the chunks themselves are not inspected.
        chunks = []
        messages = [{"role": "user", "content": "hi"}]
        async for chunk in llm_core.stream_llm(url, model, messages):
            chunks.append(chunk)
        return chunks

    asyncio.run(_drain())
    return client.captured_payload
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _is_ollama_openai_compat_url — pure function, no I/O
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestIsOllamaOpenAICompatUrl:
    """Pin down which URLs the think-suppression gate recognises."""

    @staticmethod
    def _classify(url):
        # Looked up at call time so each test hits the current llm_core attribute.
        return llm_core._is_ollama_openai_compat_url(url)

    # --- URLs that must be accepted -------------------------------------

    def test_default_port_v1_root(self):
        assert self._classify("http://127.0.0.1:11434/v1")

    def test_default_port_chat_completions(self):
        assert self._classify("http://127.0.0.1:11434/v1/chat/completions")

    def test_localhost_default_port(self):
        assert self._classify("http://localhost:11434/v1")

    def test_localhost_default_port_with_path(self):
        assert self._classify("http://localhost:11434/v1/chat/completions")

    def test_loopback_ipv6(self):
        # RFC 3986 requires square brackets around an IPv6 literal in a URL.
        assert self._classify("http://[::1]:11434/v1")

    def test_any_local_non_default_port(self):
        """Localhost on a non-default port (custom OLLAMA_HOST) must also match."""
        assert self._classify("http://127.0.0.1:11435/v1")

    def test_localhost_non_default_port(self):
        assert self._classify("http://localhost:8080/v1/chat/completions")

    def test_zero_dot_zero_host(self):
        assert self._classify("http://0.0.0.0:11434/v1")

    # --- URLs that must be rejected -------------------------------------

    def test_openai_api_v1(self):
        """Real OpenAI endpoint must never match, even though path is /v1."""
        assert not self._classify("https://api.openai.com/v1")

    def test_openai_chat_completions(self):
        assert not self._classify("https://api.openai.com/v1/chat/completions")

    def test_ollama_native_api_path(self):
        """The native /api path is a different surface and must not match /v1."""
        assert not self._classify("http://localhost:11434/api")

    def test_ollama_native_api_chat(self):
        assert not self._classify("http://localhost:11434/api/chat")

    def test_remote_openrouter(self):
        assert not self._classify("https://openrouter.ai/api/v1")

    def test_empty_string(self):
        assert not self._classify("")

    def test_none_like_empty(self):
        assert not self._classify(None)  # type: ignore[arg-type]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Payload injection — think: false only when both conditions hold
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestThinkSuppression:
    """Check whether ``think: false`` appears in the payload stream_llm sends."""

    def test_think_false_for_ollama_v1_thinking_model(self, monkeypatch):
        """think:false must be set for qwen3 on Ollama /v1."""
        endpoint = "http://127.0.0.1:11434/v1/chat/completions"
        sent = _capture_payload(monkeypatch, endpoint, "qwen3:14b")
        assert sent.get("think") is False

    def test_no_think_for_ollama_v1_non_thinking_model(self, monkeypatch):
        """think must NOT be set for a plain (non-thinking) model on Ollama /v1."""
        endpoint = "http://127.0.0.1:11434/v1/chat/completions"
        sent = _capture_payload(monkeypatch, endpoint, "llama3.2:3b")
        assert "think" not in sent

    def test_no_think_for_openai_endpoint_with_thinking_model_name(self, monkeypatch):
        """think must NOT leak to a real OpenAI endpoint even if the model name
        matches a thinking pattern — the URL guard is what matters."""
        endpoint = "https://api.openai.com/v1/chat/completions"
        sent = _capture_payload(monkeypatch, endpoint, "qwen3:14b")
        assert "think" not in sent

    def test_think_false_for_non_default_port_thinking_model(self, monkeypatch):
        """Custom-port localhost Ollama (e.g. OLLAMA_HOST=0.0.0.0:11435) must
        also receive think:false — this is the regression guarded by the
        host-set check added in this fix."""
        endpoint = "http://127.0.0.1:11435/v1/chat/completions"
        sent = _capture_payload(monkeypatch, endpoint, "qwen3:14b")
        assert sent.get("think") is False
|
||||
Reference in New Issue
Block a user