fix(llm): suppress thinking mode for qwen3/gemma4 on Ollama /v1 endpoint (#3228)

* fix(llm): suppress thinking for qwen3/gemma4 on Ollama /v1 compat endpoint

When using qwen3, QwQ, gemma4, or other thinking models via Ollama's
OpenAI-compatible /v1 endpoint, the model routes all output into its
<think>...</think> reasoning block. Since Odysseus strips thinking
content from round_response and only accumulates native tool_calls,
this produces a round with 0 chars, 0 native calls, 0 tool blocks —
the agent appears to silently do nothing.

Root cause: Odysseus classifies the /v1 endpoint as provider="openai"
(not "ollama"), so the payload is built as a standard OpenAI payload
without any Ollama-specific options. Ollama's /v1 endpoint accepts
"think": false as a top-level parameter to suppress extended thinking,
but this was never sent.

Fix:
- Add _is_ollama_openai_compat_url() to detect local Ollama /v1 URLs
- Inject "think": false in both stream_llm and llm_call_async for
  thinking models (qwen3, QwQ, gemma4, DeepSeek-R1, etc.) on this
  endpoint

Verified with qwen3:14b on Ollama 0.24: with think=False the model
correctly emits native tool_calls in a single streaming chunk and
the agent executes bash/file/web tools as expected.

* fix(llm): extend _is_ollama_openai_compat_url to match localhost on any port

Per reviewer feedback on PR #3228:

1. Generalize host detection to mirror _is_ollama_native_url: match any
   localhost/127.0.0.1/0.0.0.0/::1 host (not just port 11434) so that
   custom OLLAMA_HOST ports and container remaps are also covered.

2. Add tests/test_llm_core_ollama_thinking.py covering:
   - _is_ollama_openai_compat_url for all positive/negative URL cases
     including IPv6, non-default port, native /api path, and real OpenAI
   - Payload injection: think:false set for Ollama /v1 thinking model,
     not set for non-thinking model, not set for real OpenAI endpoint,
     and set for localhost on a non-default port (the new case)
This commit is contained in:
onemorethan0
2026-06-09 00:35:15 -05:00
committed by GitHub
parent 637a34515d
commit 8ae2b5f58c
2 changed files with 191 additions and 0 deletions
+26
View File
@@ -276,6 +276,24 @@ def _is_ollama_native_url(url: str) -> bool:
return local_ollama_host and (path == "" or path == "/api" or path.startswith("/api/"))
def _is_ollama_openai_compat_url(url: str) -> bool:
    """Return True when *url* targets Ollama's OpenAI-compatible ``/v1`` surface.

    A URL qualifies when both of the following hold:

    * its host is ``localhost``, ``127.0.0.1``, ``0.0.0.0`` or ``::1`` (any
      port, so a custom ``OLLAMA_HOST`` port, reverse proxy or container port
      remap is covered), **or** its port is Ollama's default ``11434`` on any
      host; and
    * its path is ``/v1`` or lies below ``/v1/`` (a trailing slash is ignored).

    The host rule is intended to stay in lockstep with the one used by
    ``_is_ollama_native_url`` for the native ``/api`` path.

    Malformed input -- ``None``, an empty string, an unparsable URL, or a URL
    whose port is non-numeric or out of range -- is classified as ``False``
    instead of raising.
    """
    try:
        parsed = urlparse(url or "")
        host = parsed.hostname or ""
        path = (parsed.path or "").rstrip("/")
        # ``parsed.port`` raises ValueError for a non-numeric or out-of-range
        # port, so it must be read inside the guarded region. The ``or``
        # short-circuits, so the port is only consulted when the host is not
        # already recognised as a local address.
        local_ollama_host = (
            host in {"localhost", "127.0.0.1", "0.0.0.0", "::1"}
            or parsed.port == 11434
        )
    except Exception:
        # Best-effort classifier: anything that cannot be parsed is simply
        # "not an Ollama /v1 endpoint".
        return False
    return local_ollama_host and (path == "/v1" or path.startswith("/v1/"))
def _ollama_api_root(url: str) -> str:
"""Return a native Ollama API root such as https://ollama.com/api."""
url = (url or "").strip().rstrip("/")
@@ -1344,6 +1362,9 @@ async def llm_call_async(
if max_tokens and max_tokens > 0:
tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens"
payload[tok_key] = max_tokens
# Suppress thinking for qwen3/gemma4 on Ollama /v1 — same as stream_llm.
if _is_ollama_openai_compat_url(url) and _supports_thinking(model):
payload["think"] = False
if _is_host_dead(target_url):
raise HTTPException(503, f"Upstream {_host_key(target_url)} marked unreachable (cooldown active)")
@@ -1461,6 +1482,11 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
payload[tok_key] = max_tokens
if tools:
payload["tools"] = tools
# For Ollama's OpenAI-compat /v1 endpoint with thinking models (qwen3,
# gemma4, etc.), suppress thinking so tool calls aren't swallowed inside
# <think> blocks. Ollama /v1 accepts "think": false as a top-level param.
if _is_ollama_openai_compat_url(url) and _supports_thinking(model):
payload["think"] = False
h = _provider_headers(provider, headers)
if provider == "copilot":
from src.copilot import apply_request_headers
+165
View File
@@ -0,0 +1,165 @@
"""Tests for Ollama /v1 thinking-suppression helpers.
Covers:
- _is_ollama_openai_compat_url: URL classification (local host + /v1 path)
- think: false is injected into the payload for Ollama /v1 thinking models
- think: false is NOT injected for non-thinking models or non-Ollama /v1 endpoints
"""
import asyncio
import json
from src import llm_core
# ---------------------------------------------------------------------------
# Fake HTTP client — captures the outgoing payload without network I/O
# ---------------------------------------------------------------------------
class _FakeResp:
    """Canned streaming response returned by the fake HTTP client.

    Reports HTTP 200, streams two fixed lines, and has an empty body.
    """

    status_code = 200

    async def aiter_lines(self):
        """Yield one JSON chunk line followed by the ``[DONE]`` sentinel line."""
        # NOTE(review): the first line carries no "data: " SSE prefix, unlike
        # the sentinel -- confirm whether stream_llm is expected to parse it.
        chunk = {"choices": [{"delta": {"content": "ok"}, "finish_reason": "stop"}]}
        for line in (json.dumps(chunk), "data: [DONE]"):
            yield line

    async def aread(self):
        """Return an empty response body."""
        return b""
class _FakeStreamCtx:
    """Async context manager standing in for the object ``client.stream()`` returns."""

    def __init__(self, captured):
        # Keeps a reference to the payload recorded by the fake client.
        self._captured = captured

    async def __aenter__(self):
        """Enter the context and hand back a fresh canned response."""
        response = _FakeResp()
        return response

    async def __aexit__(self, *exc_info):
        """Leave the context; returning False never suppresses an exception."""
        return False
class _FakeClient:
    """Minimal stand-in for httpx.AsyncClient that records the request payload."""

    def __init__(self):
        # Filled in by stream(); stays empty until a request is issued.
        self.captured_payload = {}

    def stream(self, method, url, **request_kwargs):
        """Record the ``json`` keyword argument and return a fake stream context.

        A missing or falsy ``json`` argument is recorded as an empty dict.
        """
        body = request_kwargs.get("json")
        self.captured_payload = body if body else {}
        return _FakeStreamCtx(self.captured_payload)
def _capture_payload(monkeypatch, url, model):
    """Drive ``stream_llm`` against a fake client and return the payload it sent.

    ``monkeypatch`` is pytest's fixture; ``url`` and ``model`` are forwarded to
    ``llm_core.stream_llm`` together with a single user message.
    """
    client = _FakeClient()
    # Replace the collaborators stream_llm touches so that no network I/O or
    # host-health bookkeeping happens during the test.
    stubs = {
        "_get_http_client": lambda: client,
        "_is_host_dead": lambda u: False,
        "note_model_activity": lambda *a, **k: None,
        "_clear_host_dead": lambda *a, **k: None,
        "get_context_length": lambda u, m: 32768,
    }
    for attr_name, replacement in stubs.items():
        monkeypatch.setattr(llm_core, attr_name, replacement)

    async def _drain():
        # Exhaust the generator; the chunks themselves are not inspected.
        chunks = []
        async for chunk in llm_core.stream_llm(
            url, model, [{"role": "user", "content": "hi"}],
        ):
            chunks.append(chunk)
        return chunks

    asyncio.run(_drain())
    return client.captured_payload
# ---------------------------------------------------------------------------
# _is_ollama_openai_compat_url — pure function, no I/O
# ---------------------------------------------------------------------------
class TestIsOllamaOpenAICompatUrl:
    """Unit tests for the URL classifier that gates think-suppression."""

    @staticmethod
    def _matches(url):
        """Call the classifier under test, looked up at call time."""
        return llm_core._is_ollama_openai_compat_url(url)

    # ----- URLs that must be recognised as Ollama /v1 -----

    def test_default_port_v1_root(self):
        assert self._matches("http://127.0.0.1:11434/v1")

    def test_default_port_chat_completions(self):
        assert self._matches("http://127.0.0.1:11434/v1/chat/completions")

    def test_localhost_default_port(self):
        assert self._matches("http://localhost:11434/v1")

    def test_localhost_default_port_with_path(self):
        assert self._matches("http://localhost:11434/v1/chat/completions")

    def test_loopback_ipv6(self):
        # RFC 3986 requires square brackets around an IPv6 literal in a URL.
        assert self._matches("http://[::1]:11434/v1")

    def test_any_local_non_default_port(self):
        """A loopback host on a custom port (custom OLLAMA_HOST) still matches."""
        assert self._matches("http://127.0.0.1:11435/v1")

    def test_localhost_non_default_port(self):
        assert self._matches("http://localhost:8080/v1/chat/completions")

    def test_zero_dot_zero_host(self):
        assert self._matches("http://0.0.0.0:11434/v1")

    # ----- URLs that must be rejected -----

    def test_openai_api_v1(self):
        """The real OpenAI endpoint never matches, despite its /v1 path."""
        assert not self._matches("https://api.openai.com/v1")

    def test_openai_chat_completions(self):
        assert not self._matches("https://api.openai.com/v1/chat/completions")

    def test_ollama_native_api_path(self):
        """The native /api path is a separate surface and is not /v1."""
        assert not self._matches("http://localhost:11434/api")

    def test_ollama_native_api_chat(self):
        assert not self._matches("http://localhost:11434/api/chat")

    def test_remote_openrouter(self):
        assert not self._matches("https://openrouter.ai/api/v1")

    def test_empty_string(self):
        assert not self._matches("")

    def test_none_like_empty(self):
        assert not self._matches(None)  # type: ignore[arg-type]
# ---------------------------------------------------------------------------
# Payload injection — think: false only when both conditions hold
# ---------------------------------------------------------------------------
class TestThinkSuppression:
    """Check whether ``think: false`` appears in the outgoing HTTP payload."""

    def test_think_false_for_ollama_v1_thinking_model(self, monkeypatch):
        """qwen3 on Ollama /v1 must be sent with think:false."""
        endpoint = "http://127.0.0.1:11434/v1/chat/completions"
        sent = _capture_payload(monkeypatch, url=endpoint, model="qwen3:14b")
        assert sent.get("think") is False

    def test_no_think_for_ollama_v1_non_thinking_model(self, monkeypatch):
        """A plain (non-thinking) model on Ollama /v1 must not get a think key."""
        endpoint = "http://127.0.0.1:11434/v1/chat/completions"
        sent = _capture_payload(monkeypatch, url=endpoint, model="llama3.2:3b")
        assert "think" not in sent

    def test_no_think_for_openai_endpoint_with_thinking_model_name(self, monkeypatch):
        """A real OpenAI endpoint must not receive think, even when the model
        name matches a thinking pattern -- the URL guard decides."""
        endpoint = "https://api.openai.com/v1/chat/completions"
        sent = _capture_payload(monkeypatch, url=endpoint, model="qwen3:14b")
        assert "think" not in sent

    def test_think_false_for_non_default_port_thinking_model(self, monkeypatch):
        """A loopback Ollama on a custom port (e.g. OLLAMA_HOST=0.0.0.0:11435)
        must also receive think:false; this guards the host-set check."""
        endpoint = "http://127.0.0.1:11435/v1/chat/completions"
        sent = _capture_payload(monkeypatch, url=endpoint, model="qwen3:14b")
        assert sent.get("think") is False