From 8ae2b5f58c02782f6bc9648d6675580b95d50b02 Mon Sep 17 00:00:00 2001
From: onemorethan0 <167813633+onemorethan0@users.noreply.github.com>
Date: Tue, 9 Jun 2026 00:35:15 -0500
Subject: [PATCH] fix(llm): suppress thinking mode for qwen3/gemma4 on Ollama
/v1 endpoint (#3228)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* fix(llm): suppress thinking for qwen3/gemma4 on Ollama /v1 compat endpoint
When using qwen3, QwQ, gemma4, or other thinking models via Ollama's
OpenAI-compatible /v1 endpoint, the model routes all output into its
<think>...</think> reasoning block. Since Odysseus strips thinking
content from round_response and only accumulates native tool_calls,
this produces a round with 0 chars, 0 native calls, 0 tool blocks —
the agent appears to silently do nothing.
Root cause: Odysseus classifies the /v1 endpoint as provider="openai"
(not "ollama"), so the payload is built as a standard OpenAI payload
without any Ollama-specific options. Ollama's /v1 endpoint accepts
"think": false as a top-level parameter to suppress extended thinking,
but this was never sent.
Fix:
- Add _is_ollama_openai_compat_url() to detect local Ollama /v1 URLs
- Inject "think": false in both stream_llm and llm_call_async for
thinking models (qwen3, QwQ, gemma4, DeepSeek-R1, etc.) on this
endpoint
Verified with qwen3:14b on Ollama 0.24: with think=False the model
correctly emits native tool_calls in a single streaming chunk and
the agent executes bash/file/web tools as expected.
* fix(llm): extend _is_ollama_openai_compat_url to match localhost on any port
Per reviewer feedback on PR #3228:
1. Generalize host detection to mirror _is_ollama_native_url: match any
localhost/127.0.0.1/0.0.0.0/::1 host (not just port 11434) so that
custom OLLAMA_HOST ports and container remaps are also covered.
2. Add tests/test_llm_core_ollama_thinking.py covering:
- _is_ollama_openai_compat_url for all positive/negative URL cases
including IPv6, non-default port, native /api path, and real OpenAI
- Payload injection: think:false set for Ollama /v1 thinking model,
not set for non-thinking model, not set for real OpenAI endpoint,
and set for localhost on a non-default port (the new case)
---
src/llm_core.py | 26 ++++
tests/test_llm_core_ollama_thinking.py | 165 +++++++++++++++++++++++++
2 files changed, 191 insertions(+)
create mode 100644 tests/test_llm_core_ollama_thinking.py
diff --git a/src/llm_core.py b/src/llm_core.py
index 9ed499c61..07b149ebe 100644
--- a/src/llm_core.py
+++ b/src/llm_core.py
@@ -276,6 +276,24 @@ def _is_ollama_native_url(url: str) -> bool:
return local_ollama_host and (path == "" or path == "/api" or path.startswith("/api/"))
+def _is_ollama_openai_compat_url(url: str) -> bool:
+    """Return True for an Ollama OpenAI-compatible ``/v1`` URL.
+
+    Mirrors the host detection used by ``_is_ollama_native_url`` so that the
+    two helpers stay in lockstep: a localhost Ollama on a non-default port
+    (custom ``OLLAMA_HOST``, reverse proxy, container port remap) is treated
+    the same way here as it is on the native ``/api`` path.
+
+    A URL matches when its path is ``/v1`` or anything below it AND either
+    the host is one of ``localhost``/``127.0.0.1``/``0.0.0.0``/``::1`` (any
+    port) or the port is 11434 (any host).
+
+    Empty, ``None`` and unparseable URLs -- including ones with a
+    non-numeric or out-of-range port -- yield ``False`` instead of raising.
+    """
+    try:
+        parsed = urlparse(url or "")
+        host = parsed.hostname or ""
+        # ``.port`` is evaluated lazily and raises ValueError for a malformed
+        # port, so it has to be read inside the guarded region.
+        port = parsed.port
+    except Exception:
+        return False
+    path = (parsed.path or "").rstrip("/")
+    local_ollama_host = host in {"localhost", "127.0.0.1", "0.0.0.0", "::1"} or port == 11434
+    return local_ollama_host and (path == "/v1" or path.startswith("/v1/"))
+
+
def _ollama_api_root(url: str) -> str:
"""Return a native Ollama API root such as https://ollama.com/api."""
url = (url or "").strip().rstrip("/")
@@ -1344,6 +1362,9 @@ async def llm_call_async(
if max_tokens and max_tokens > 0:
tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens"
payload[tok_key] = max_tokens
+ # Suppress thinking for qwen3/gemma4 on Ollama /v1 — same as stream_llm.
+ if _is_ollama_openai_compat_url(url) and _supports_thinking(model):
+ payload["think"] = False
if _is_host_dead(target_url):
raise HTTPException(503, f"Upstream {_host_key(target_url)} marked unreachable (cooldown active)")
@@ -1461,6 +1482,11 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
payload[tok_key] = max_tokens
if tools:
payload["tools"] = tools
+    # For Ollama's OpenAI-compat /v1 endpoint with thinking models (qwen3,
+    # gemma4, etc.), suppress thinking so tool calls aren't swallowed inside
+    # <think> blocks. Ollama /v1 accepts "think": false as a top-level param.
+ if _is_ollama_openai_compat_url(url) and _supports_thinking(model):
+ payload["think"] = False
h = _provider_headers(provider, headers)
if provider == "copilot":
from src.copilot import apply_request_headers
diff --git a/tests/test_llm_core_ollama_thinking.py b/tests/test_llm_core_ollama_thinking.py
new file mode 100644
index 000000000..de706edb7
--- /dev/null
+++ b/tests/test_llm_core_ollama_thinking.py
@@ -0,0 +1,165 @@
+"""Tests for Ollama /v1 thinking-suppression helpers.
+
+Covers:
+- _is_ollama_openai_compat_url: URL classification (local host + /v1 path)
+- think: false is injected into the payload for Ollama /v1 thinking models
+- think: false is NOT injected for non-thinking models or non-Ollama /v1 endpoints
+"""
+import asyncio
+import json
+
+from src import llm_core
+
+
+# ---------------------------------------------------------------------------
+# Fake HTTP client — captures the outgoing payload without network I/O
+# ---------------------------------------------------------------------------
+
+class _FakeResp:
+    """Canned HTTP 200 streaming response that replaces a real upstream reply."""
+
+    status_code = 200
+
+    async def aiter_lines(self):
+        """Yield one serialized completion chunk, then the ``[DONE]`` sentinel."""
+        # Minimal pair of lines so the stream consumer can finish cleanly.
+        chunk = {"choices": [{"delta": {"content": "ok"}, "finish_reason": "stop"}]}
+        for line in (json.dumps(chunk), "data: [DONE]"):
+            yield line
+
+    async def aread(self):
+        """Return an empty response body."""
+        return bytes()
+
+
+class _FakeStreamCtx:
+    """Async context manager that hands back a ``_FakeResp`` on entry."""
+
+    def __init__(self, captured):
+        # Kept for inspection only; nothing in this class reads it back.
+        self._captured = captured
+
+    async def __aenter__(self):
+        response = _FakeResp()
+        return response
+
+    async def __aexit__(self, *exc_info):
+        # Falsy result: an exception raised inside ``async with`` propagates.
+        return False
+
+
+class _FakeClient:
+    """Minimal stand-in for httpx.AsyncClient that records the ``json`` body
+    passed to ``stream`` instead of sending anything."""
+
+    def __init__(self):
+        self.captured_payload = {}
+
+    def stream(self, method, url, **kwargs):
+        # A missing or empty ``json`` keyword is recorded as an empty dict.
+        body = kwargs.get("json")
+        self.captured_payload = body if body else {}
+        return _FakeStreamCtx(self.captured_payload)
+
+
+def _capture_payload(monkeypatch, url, model):
+    """Drive ``stream_llm`` against a fake client and return the JSON payload
+    it attempted to send.
+
+    The listed ``llm_core`` attributes are replaced with stubs for the
+    duration of the test so the call performs no network I/O.
+    """
+    fake_client = _FakeClient()
+    stubs = {
+        "_get_http_client": lambda: fake_client,
+        "_is_host_dead": lambda u: False,
+        "note_model_activity": lambda *a, **k: None,
+        "_clear_host_dead": lambda *a, **k: None,
+        "get_context_length": lambda u, m: 32768,
+    }
+    for attr_name, stub in stubs.items():
+        monkeypatch.setattr(llm_core, attr_name, stub)
+
+    async def _drain():
+        # Exhaust the async generator; the yielded chunks are not inspected.
+        chunks = []
+        async for chunk in llm_core.stream_llm(
+            url, model, [{"role": "user", "content": "hi"}],
+        ):
+            chunks.append(chunk)
+        return chunks
+
+    asyncio.run(_drain())
+    return fake_client.captured_payload
+
+
+# ---------------------------------------------------------------------------
+# _is_ollama_openai_compat_url — pure function, no I/O
+# ---------------------------------------------------------------------------
+
+class TestIsOllamaOpenAICompatUrl:
+    """Checks the URL classifier that decides whether think-suppression applies."""
+
+    # --- URLs that must be classified as Ollama /v1 -----------------------
+    def test_default_port_v1_root(self):
+        url = "http://127.0.0.1:11434/v1"
+        assert llm_core._is_ollama_openai_compat_url(url)
+
+    def test_default_port_chat_completions(self):
+        url = "http://127.0.0.1:11434/v1/chat/completions"
+        assert llm_core._is_ollama_openai_compat_url(url)
+
+    def test_localhost_default_port(self):
+        url = "http://localhost:11434/v1"
+        assert llm_core._is_ollama_openai_compat_url(url)
+
+    def test_localhost_default_port_with_path(self):
+        url = "http://localhost:11434/v1/chat/completions"
+        assert llm_core._is_ollama_openai_compat_url(url)
+
+    def test_loopback_ipv6(self):
+        # RFC 3986 requires square brackets around an IPv6 literal in a URL.
+        url = "http://[::1]:11434/v1"
+        assert llm_core._is_ollama_openai_compat_url(url)
+
+    def test_any_local_non_default_port(self):
+        """A loopback host on a non-default port (custom OLLAMA_HOST) matches too."""
+        url = "http://127.0.0.1:11435/v1"
+        assert llm_core._is_ollama_openai_compat_url(url)
+
+    def test_localhost_non_default_port(self):
+        url = "http://localhost:8080/v1/chat/completions"
+        assert llm_core._is_ollama_openai_compat_url(url)
+
+    def test_zero_dot_zero_host(self):
+        url = "http://0.0.0.0:11434/v1"
+        assert llm_core._is_ollama_openai_compat_url(url)
+
+    # --- URLs that must NOT be classified as Ollama /v1 -------------------
+    def test_openai_api_v1(self):
+        """The real OpenAI endpoint never matches, despite its /v1 path."""
+        url = "https://api.openai.com/v1"
+        assert not llm_core._is_ollama_openai_compat_url(url)
+
+    def test_openai_chat_completions(self):
+        url = "https://api.openai.com/v1/chat/completions"
+        assert not llm_core._is_ollama_openai_compat_url(url)
+
+    def test_ollama_native_api_path(self):
+        """The native /api surface is distinct from /v1 and must not match."""
+        url = "http://localhost:11434/api"
+        assert not llm_core._is_ollama_openai_compat_url(url)
+
+    def test_ollama_native_api_chat(self):
+        url = "http://localhost:11434/api/chat"
+        assert not llm_core._is_ollama_openai_compat_url(url)
+
+    def test_remote_openrouter(self):
+        url = "https://openrouter.ai/api/v1"
+        assert not llm_core._is_ollama_openai_compat_url(url)
+
+    def test_empty_string(self):
+        url = ""
+        assert not llm_core._is_ollama_openai_compat_url(url)
+
+    def test_none_like_empty(self):
+        url = None
+        assert not llm_core._is_ollama_openai_compat_url(url)  # type: ignore[arg-type]
+
+
+# ---------------------------------------------------------------------------
+# Payload injection — think: false only when both conditions hold
+# ---------------------------------------------------------------------------
+
+class TestThinkSuppression:
+    """Checks whether ``think: false`` appears in the payload ``stream_llm`` sends."""
+
+    def test_think_false_for_ollama_v1_thinking_model(self, monkeypatch):
+        """qwen3 on Ollama /v1 must be sent think:false."""
+        endpoint = "http://127.0.0.1:11434/v1/chat/completions"
+        model = "qwen3:14b"
+        payload = _capture_payload(monkeypatch, endpoint, model)
+        assert payload.get("think") is False
+
+    def test_no_think_for_ollama_v1_non_thinking_model(self, monkeypatch):
+        """A plain (non-thinking) model on Ollama /v1 must not get a think key."""
+        endpoint = "http://127.0.0.1:11434/v1/chat/completions"
+        model = "llama3.2:3b"
+        payload = _capture_payload(monkeypatch, endpoint, model)
+        assert "think" not in payload
+
+    def test_no_think_for_openai_endpoint_with_thinking_model_name(self, monkeypatch):
+        """The think key must not leak to a real OpenAI endpoint even when the
+        model name matches a thinking pattern -- the URL guard decides."""
+        endpoint = "https://api.openai.com/v1/chat/completions"
+        model = "qwen3:14b"
+        payload = _capture_payload(monkeypatch, endpoint, model)
+        assert "think" not in payload
+
+    def test_think_false_for_non_default_port_thinking_model(self, monkeypatch):
+        """A custom-port localhost Ollama (e.g. OLLAMA_HOST=0.0.0.0:11435) must
+        also receive think:false -- the regression guarded by the host-set
+        check added in this fix."""
+        endpoint = "http://127.0.0.1:11435/v1/chat/completions"
+        model = "qwen3:14b"
+        payload = _capture_payload(monkeypatch, endpoint, model)
+        assert payload.get("think") is False