diff --git a/src/llm_core.py b/src/llm_core.py
index 9ed499c61..07b149ebe 100644
--- a/src/llm_core.py
+++ b/src/llm_core.py
@@ -276,6 +276,29 @@ def _is_ollama_native_url(url: str) -> bool:
     return local_ollama_host and (path == "" or path == "/api" or path.startswith("/api/"))
 
 
+def _is_ollama_openai_compat_url(url: str) -> bool:
+    """Return True for local Ollama's OpenAI-compatible /v1 surface.
+
+    Mirrors the host detection used by ``_is_ollama_native_url`` so that the
+    two helpers stay in lockstep: a localhost Ollama on a non-default port
+    (custom ``OLLAMA_HOST``, reverse proxy, container port remap) is treated
+    the same way here as it is on the native ``/api`` path.
+
+    Never raises: any parsing failure, including the ``ValueError`` that
+    ``parsed.port`` raises for an invalid port, yields False.
+    """
+    try:
+        parsed = urlparse(url or "")
+        host = parsed.hostname or ""
+        path = (parsed.path or "").rstrip("/")
+        # ``parsed.port`` raises ValueError on a non-numeric or out-of-range
+        # port, so it must be evaluated inside the guard, not after it.
+        local_ollama_host = host in {"localhost", "127.0.0.1", "0.0.0.0", "::1"} or parsed.port == 11434
+    except Exception:
+        return False
+    return local_ollama_host and (path == "/v1" or path.startswith("/v1/"))
+
+
 def _ollama_api_root(url: str) -> str:
     """Return a native Ollama API root such as https://ollama.com/api."""
     url = (url or "").strip().rstrip("/")
@@ -1344,6 +1367,9 @@ async def llm_call_async(
     if max_tokens and max_tokens > 0:
         tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens"
         payload[tok_key] = max_tokens
+    # Suppress thinking for qwen3/gemma4 on Ollama /v1 — same as stream_llm.
+    if _is_ollama_openai_compat_url(url) and _supports_thinking(model):
+        payload["think"] = False
 
     if _is_host_dead(target_url):
         raise HTTPException(503, f"Upstream {_host_key(target_url)} marked unreachable (cooldown active)")
@@ -1461,6 +1487,11 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
         payload[tok_key] = max_tokens
     if tools:
         payload["tools"] = tools
+    # For Ollama's OpenAI-compat /v1 endpoint with thinking models (qwen3,
+    # gemma4, etc.), suppress thinking so tool calls aren't swallowed inside
+    # thinking blocks. Ollama /v1 accepts "think": false as a top-level param.
+    if _is_ollama_openai_compat_url(url) and _supports_thinking(model):
+        payload["think"] = False
     h = _provider_headers(provider, headers)
     if provider == "copilot":
         from src.copilot import apply_request_headers
diff --git a/tests/test_llm_core_ollama_thinking.py b/tests/test_llm_core_ollama_thinking.py
new file mode 100644
index 000000000..de706edb7
--- /dev/null
+++ b/tests/test_llm_core_ollama_thinking.py
@@ -0,0 +1,170 @@
+"""Tests for Ollama /v1 thinking-suppression helpers.
+
+Covers:
+- _is_ollama_openai_compat_url: URL classification (local host + /v1 path)
+- think: false is injected into the payload for Ollama /v1 thinking models
+- think: false is NOT injected for non-thinking models or non-Ollama /v1 endpoints
+"""
+import asyncio
+import json
+
+from src import llm_core
+
+
+# ---------------------------------------------------------------------------
+# Fake HTTP client — captures the outgoing payload without network I/O
+# ---------------------------------------------------------------------------
+
+class _FakeResp:
+    status_code = 200
+
+    async def aiter_lines(self):
+        # Yield a minimal done event so stream_llm exits cleanly
+        yield json.dumps({"choices": [{"delta": {"content": "ok"}, "finish_reason": "stop"}]})
+        yield "data: [DONE]"
+
+    async def aread(self):
+        return b""
+
+
+class _FakeStreamCtx:
+    def __init__(self, captured):
+        self._captured = captured
+
+    async def __aenter__(self):
+        return _FakeResp()
+
+    async def __aexit__(self, *a):
+        return False
+
+
+class _FakeClient:
+    """Minimal stand-in for httpx.AsyncClient that captures request payload."""
+
+    def __init__(self):
+        self.captured_payload = {}
+
+    def stream(self, method, url, **kw):
+        self.captured_payload = kw.get("json") or {}
+        return _FakeStreamCtx(self.captured_payload)
+
+
+def _capture_payload(monkeypatch, url, model):
+    """Run stream_llm, intercept the HTTP payload, and return it."""
+    client = _FakeClient()
+    monkeypatch.setattr(llm_core, "_get_http_client", lambda: client)
+    monkeypatch.setattr(llm_core, "_is_host_dead", lambda u: False)
+    monkeypatch.setattr(llm_core, "note_model_activity", lambda *a, **k: None)
+    monkeypatch.setattr(llm_core, "_clear_host_dead", lambda *a, **k: None)
+    monkeypatch.setattr(llm_core, "get_context_length", lambda u, m: 32768)
+
+    async def run():
+        return [c async for c in llm_core.stream_llm(
+            url, model, [{"role": "user", "content": "hi"}],
+        )]
+
+    asyncio.run(run())
+    return client.captured_payload
+
+
+# ---------------------------------------------------------------------------
+# _is_ollama_openai_compat_url — pure function, no I/O
+# ---------------------------------------------------------------------------
+
+class TestIsOllamaOpenAICompatUrl:
+    """Unit tests for the URL classifier that gates think-suppression."""
+
+    # Positive cases — should be True
+    def test_default_port_v1_root(self):
+        assert llm_core._is_ollama_openai_compat_url("http://127.0.0.1:11434/v1")
+
+    def test_default_port_chat_completions(self):
+        assert llm_core._is_ollama_openai_compat_url("http://127.0.0.1:11434/v1/chat/completions")
+
+    def test_localhost_default_port(self):
+        assert llm_core._is_ollama_openai_compat_url("http://localhost:11434/v1")
+
+    def test_localhost_default_port_with_path(self):
+        assert llm_core._is_ollama_openai_compat_url("http://localhost:11434/v1/chat/completions")
+
+    def test_loopback_ipv6(self):
+        # IPv6 addresses in URLs require square brackets per RFC 3986
+        assert llm_core._is_ollama_openai_compat_url("http://[::1]:11434/v1")
+
+    def test_any_local_non_default_port(self):
+        """Localhost on a non-default port (custom OLLAMA_HOST) must also match."""
+        assert llm_core._is_ollama_openai_compat_url("http://127.0.0.1:11435/v1")
+
+    def test_localhost_non_default_port(self):
+        assert llm_core._is_ollama_openai_compat_url("http://localhost:8080/v1/chat/completions")
+
+    def test_zero_dot_zero_host(self):
+        assert llm_core._is_ollama_openai_compat_url("http://0.0.0.0:11434/v1")
+
+    # Negative cases — should be False
+    def test_openai_api_v1(self):
+        """Real OpenAI endpoint must never match, even though path is /v1."""
+        assert not llm_core._is_ollama_openai_compat_url("https://api.openai.com/v1")
+
+    def test_openai_chat_completions(self):
+        assert not llm_core._is_ollama_openai_compat_url("https://api.openai.com/v1/chat/completions")
+
+    def test_ollama_native_api_path(self):
+        """The native /api path is a different surface and must not match /v1."""
+        assert not llm_core._is_ollama_openai_compat_url("http://localhost:11434/api")
+
+    def test_ollama_native_api_chat(self):
+        assert not llm_core._is_ollama_openai_compat_url("http://localhost:11434/api/chat")
+
+    def test_remote_openrouter(self):
+        assert not llm_core._is_ollama_openai_compat_url("https://openrouter.ai/api/v1")
+
+    def test_empty_string(self):
+        assert not llm_core._is_ollama_openai_compat_url("")
+
+    def test_none_like_empty(self):
+        assert not llm_core._is_ollama_openai_compat_url(None)  # type: ignore[arg-type]
+
+    def test_invalid_port_is_false_not_error(self):
+        """A non-numeric or out-of-range port must classify as False, not raise."""
+        assert not llm_core._is_ollama_openai_compat_url("http://example.com:99999/v1")
+        assert not llm_core._is_ollama_openai_compat_url("http://example.com:abc/v1")
+
+
+# ---------------------------------------------------------------------------
+# Payload injection — think: false only when both conditions hold
+# ---------------------------------------------------------------------------
+
+class TestThinkSuppression:
+    """Assert think:false is present/absent in the outgoing HTTP payload."""
+
+    def test_think_false_for_ollama_v1_thinking_model(self, monkeypatch):
+        """think:false must be set for qwen3 on Ollama /v1."""
+        payload = _capture_payload(
+            monkeypatch, "http://127.0.0.1:11434/v1/chat/completions", "qwen3:14b"
+        )
+        assert payload.get("think") is False
+
+    def test_no_think_for_ollama_v1_non_thinking_model(self, monkeypatch):
+        """think must NOT be set for a plain (non-thinking) model on Ollama /v1."""
+        payload = _capture_payload(
+            monkeypatch, "http://127.0.0.1:11434/v1/chat/completions", "llama3.2:3b"
+        )
+        assert "think" not in payload
+
+    def test_no_think_for_openai_endpoint_with_thinking_model_name(self, monkeypatch):
+        """think must NOT leak to a real OpenAI endpoint even if the model name
+        matches a thinking pattern — the URL guard is what matters."""
+        payload = _capture_payload(
+            monkeypatch, "https://api.openai.com/v1/chat/completions", "qwen3:14b"
+        )
+        assert "think" not in payload
+
+    def test_think_false_for_non_default_port_thinking_model(self, monkeypatch):
+        """Custom-port localhost Ollama (e.g. OLLAMA_HOST=0.0.0.0:11435) must
+        also receive think:false — this is the regression guarded by the
+        host-set check added in this fix."""
+        payload = _capture_payload(
+            monkeypatch, "http://127.0.0.1:11435/v1/chat/completions", "qwen3:14b"
+        )
+        assert payload.get("think") is False