fix(llm): suppress thinking mode for qwen3/gemma4 on Ollama /v1 endpoint (#3228)

* fix(llm): suppress thinking for qwen3/gemma4 on Ollama /v1 compat endpoint When using qwen3, QwQ, gemma4, or other thinking models via Ollama's OpenAI-compatible /v1 endpoint, the model routes all output into its <think>...</think> reasoning block. Since Odysseus strips thinking content from round_response and only accumulates native tool_calls, this produces a round with 0 chars, 0 native calls, 0 tool blocks — the agent appears to silently do nothing. Root cause: Odysseus classifies the /v1 endpoint as provider="openai" (not "ollama"), so the payload is built as a standard OpenAI payload without any Ollama-specific options. Ollama's /v1 endpoint accepts "think": false as a top-level parameter to suppress extended thinking, but this was never sent. Fix: - Add _is_ollama_openai_compat_url() to detect local Ollama /v1 URLs - Inject "think": false in both stream_llm and llm_call_async for thinking models (qwen3, QwQ, gemma4, DeepSeek-R1, etc.) on this endpoint Verified with qwen3:14b on Ollama 0.24: with think=False the model correctly emits native tool_calls in a single streaming chunk and the agent executes bash/file/web tools as expected. * fix(llm): extend _is_ollama_openai_compat_url to match localhost on any port Per reviewer feedback on PR #3228: 1. Generalize host detection to mirror _is_ollama_native_url: match any localhost/127.0.0.1/0.0.0.0/::1 host (not just port 11434) so that custom OLLAMA_HOST ports and container remaps are also covered. 2. Add tests/test_llm_core_ollama_thinking.py covering: - _is_ollama_openai_compat_url for all positive/negative URL cases including IPv6, non-default port, native /api path, and real OpenAI - Payload injection: think:false set for Ollama /v1 thinking model, not set for non-thinking model, not set for real OpenAI endpoint, and set for localhost on a non-default port (the new case)
2026-06-15 17:25:26 -04:00 · 2026-06-09 00:35:15 -05:00
parent 637a34515d
commit 8ae2b5f58c
2 changed files with 191 additions and 0 deletions
@@ -276,6 +276,24 @@ def _is_ollama_native_url(url: str) -> bool:
    return local_ollama_host and (path == "" or path == "/api" or path.startswith("/api/"))


+def _is_ollama_openai_compat_url(url: str) -> bool:
+    """Return True for local Ollama's OpenAI-compatible /v1 surface.
+
+    Mirrors the host detection used by ``_is_ollama_native_url`` so that the
+    two helpers stay in lockstep: a localhost Ollama on a non-default port
+    (custom ``OLLAMA_HOST``, reverse proxy, container port remap) is treated
+    the same way here as it is on the native ``/api`` path.
+    """
+    try:
+        parsed = urlparse(url or "")
+    except Exception:
+        return False
+    host = parsed.hostname or ""
+    path = (parsed.path or "").rstrip("/")
+    local_ollama_host = host in {"localhost", "127.0.0.1", "0.0.0.0", "::1"} or parsed.port == 11434
+    return local_ollama_host and (path == "/v1" or path.startswith("/v1/"))
+
+
 def _ollama_api_root(url: str) -> str:
    """Return a native Ollama API root such as https://ollama.com/api."""
    url = (url or "").strip().rstrip("/")
@@ -1344,6 +1362,9 @@ async def llm_call_async(
        if max_tokens and max_tokens > 0:
            tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens"
            payload[tok_key] = max_tokens
+        # Suppress thinking for qwen3/gemma4 on Ollama /v1 — same as stream_llm.
+        if _is_ollama_openai_compat_url(url) and _supports_thinking(model):
+            payload["think"] = False

    if _is_host_dead(target_url):
        raise HTTPException(503, f"Upstream {_host_key(target_url)} marked unreachable (cooldown active)")
@@ -1461,6 +1482,11 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
            payload[tok_key] = max_tokens
        if tools:
            payload["tools"] = tools
+        # For Ollama's OpenAI-compat /v1 endpoint with thinking models (qwen3,
+        # gemma4, etc.), suppress thinking so tool calls aren't swallowed inside
+        # <think> blocks. Ollama /v1 accepts "think": false as a top-level param.
+        if _is_ollama_openai_compat_url(url) and _supports_thinking(model):
+            payload["think"] = False
        h = _provider_headers(provider, headers)
        if provider == "copilot":
            from src.copilot import apply_request_headers
@@ -0,0 +1,165 @@
+"""Tests for Ollama /v1 thinking-suppression helpers.
+
+Covers:
+- _is_ollama_openai_compat_url: URL classification (local host + /v1 path)
+- think: false is injected into the payload for Ollama /v1 thinking models
+- think: false is NOT injected for non-thinking models or non-Ollama /v1 endpoints
+"""
+import asyncio
+import json
+
+from src import llm_core
+
+
+# ---------------------------------------------------------------------------
+# Fake HTTP client — captures the outgoing payload without network I/O
+# ---------------------------------------------------------------------------
+
+class _FakeResp:
+    status_code = 200
+
+    async def aiter_lines(self):
+        # Yield a minimal done event so stream_llm exits cleanly
+        yield json.dumps({"choices": [{"delta": {"content": "ok"}, "finish_reason": "stop"}]})
+        yield "data: [DONE]"
+
+    async def aread(self):
+        return b""
+
+
+class _FakeStreamCtx:
+    def __init__(self, captured):
+        self._captured = captured
+
+    async def __aenter__(self):
+        return _FakeResp()
+
+    async def __aexit__(self, *a):
+        return False
+
+
+class _FakeClient:
+    """Minimal stand-in for httpx.AsyncClient that captures request payload."""
+
+    def __init__(self):
+        self.captured_payload = {}
+
+    def stream(self, method, url, **kw):
+        self.captured_payload = kw.get("json") or {}
+        return _FakeStreamCtx(self.captured_payload)
+
+
+def _capture_payload(monkeypatch, url, model):
+    """Run stream_llm, intercept the HTTP payload, and return it."""
+    client = _FakeClient()
+    monkeypatch.setattr(llm_core, "_get_http_client", lambda: client)
+    monkeypatch.setattr(llm_core, "_is_host_dead", lambda u: False)
+    monkeypatch.setattr(llm_core, "note_model_activity", lambda *a, **k: None)
+    monkeypatch.setattr(llm_core, "_clear_host_dead", lambda *a, **k: None)
+    monkeypatch.setattr(llm_core, "get_context_length", lambda u, m: 32768)
+
+    async def run():
+        return [c async for c in llm_core.stream_llm(
+            url, model, [{"role": "user", "content": "hi"}],
+        )]
+
+    asyncio.run(run())
+    return client.captured_payload
+
+
+# ---------------------------------------------------------------------------
+# _is_ollama_openai_compat_url — pure function, no I/O
+# ---------------------------------------------------------------------------
+
+class TestIsOllamaOpenAICompatUrl:
+    """Unit tests for the URL classifier that gates think-suppression."""
+
+    # Positive cases — should be True
+    def test_default_port_v1_root(self):
+        assert llm_core._is_ollama_openai_compat_url("http://127.0.0.1:11434/v1")
+
+    def test_default_port_chat_completions(self):
+        assert llm_core._is_ollama_openai_compat_url("http://127.0.0.1:11434/v1/chat/completions")
+
+    def test_localhost_default_port(self):
+        assert llm_core._is_ollama_openai_compat_url("http://localhost:11434/v1")
+
+    def test_localhost_default_port_with_path(self):
+        assert llm_core._is_ollama_openai_compat_url("http://localhost:11434/v1/chat/completions")
+
+    def test_loopback_ipv6(self):
+        # IPv6 addresses in URLs require square brackets per RFC 3986
+        assert llm_core._is_ollama_openai_compat_url("http://[::1]:11434/v1")
+
+    def test_any_local_non_default_port(self):
+        """Localhost on a non-default port (custom OLLAMA_HOST) must also match."""
+        assert llm_core._is_ollama_openai_compat_url("http://127.0.0.1:11435/v1")
+
+    def test_localhost_non_default_port(self):
+        assert llm_core._is_ollama_openai_compat_url("http://localhost:8080/v1/chat/completions")
+
+    def test_zero_dot_zero_host(self):
+        assert llm_core._is_ollama_openai_compat_url("http://0.0.0.0:11434/v1")
+
+    # Negative cases — should be False
+    def test_openai_api_v1(self):
+        """Real OpenAI endpoint must never match, even though path is /v1."""
+        assert not llm_core._is_ollama_openai_compat_url("https://api.openai.com/v1")
+
+    def test_openai_chat_completions(self):
+        assert not llm_core._is_ollama_openai_compat_url("https://api.openai.com/v1/chat/completions")
+
+    def test_ollama_native_api_path(self):
+        """The native /api path is a different surface and must not match /v1."""
+        assert not llm_core._is_ollama_openai_compat_url("http://localhost:11434/api")
+
+    def test_ollama_native_api_chat(self):
+        assert not llm_core._is_ollama_openai_compat_url("http://localhost:11434/api/chat")
+
+    def test_remote_openrouter(self):
+        assert not llm_core._is_ollama_openai_compat_url("https://openrouter.ai/api/v1")
+
+    def test_empty_string(self):
+        assert not llm_core._is_ollama_openai_compat_url("")
+
+    def test_none_like_empty(self):
+        assert not llm_core._is_ollama_openai_compat_url(None)  # type: ignore[arg-type]
+
+
+# ---------------------------------------------------------------------------
+# Payload injection — think: false only when both conditions hold
+# ---------------------------------------------------------------------------
+
+class TestThinkSuppression:
+    """Assert think:false is present/absent in the outgoing HTTP payload."""
+
+    def test_think_false_for_ollama_v1_thinking_model(self, monkeypatch):
+        """think:false must be set for qwen3 on Ollama /v1."""
+        payload = _capture_payload(
+            monkeypatch, "http://127.0.0.1:11434/v1/chat/completions", "qwen3:14b"
+        )
+        assert payload.get("think") is False
+
+    def test_no_think_for_ollama_v1_non_thinking_model(self, monkeypatch):
+        """think must NOT be set for a plain (non-thinking) model on Ollama /v1."""
+        payload = _capture_payload(
+            monkeypatch, "http://127.0.0.1:11434/v1/chat/completions", "llama3.2:3b"
+        )
+        assert "think" not in payload
+
+    def test_no_think_for_openai_endpoint_with_thinking_model_name(self, monkeypatch):
+        """think must NOT leak to a real OpenAI endpoint even if the model name
+        matches a thinking pattern — the URL guard is what matters."""
+        payload = _capture_payload(
+            monkeypatch, "https://api.openai.com/v1/chat/completions", "qwen3:14b"
+        )
+        assert "think" not in payload
+
+    def test_think_false_for_non_default_port_thinking_model(self, monkeypatch):
+        """Custom-port localhost Ollama (e.g. OLLAMA_HOST=0.0.0.0:11435) must
+        also receive think:false — this is the regression guarded by the
+        host-set check added in this fix."""
+        payload = _capture_payload(
+            monkeypatch, "http://127.0.0.1:11435/v1/chat/completions", "qwen3:14b"
+        )
+        assert payload.get("think") is False