mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 09:45:24 -04:00
fix(llm): suppress thinking mode for qwen3/gemma4 on Ollama /v1 endpoint (#3228)
* fix(llm): suppress thinking for qwen3/gemma4 on Ollama /v1 compat endpoint

  When using qwen3, QwQ, gemma4, or other thinking models via Ollama's OpenAI-compatible /v1 endpoint, the model routes all output into its <think>...</think> reasoning block. Since Odysseus strips thinking content from round_response and only accumulates native tool_calls, this produces a round with 0 chars, 0 native calls, 0 tool blocks — the agent appears to silently do nothing.

  Root cause: Odysseus classifies the /v1 endpoint as provider="openai" (not "ollama"), so the payload is built as a standard OpenAI payload without any Ollama-specific options. Ollama's /v1 endpoint accepts "think": false as a top-level parameter to suppress extended thinking, but this was never sent.

  Fix:
  - Add _is_ollama_openai_compat_url() to detect local Ollama /v1 URLs
  - Inject "think": false in both stream_llm and llm_call_async for thinking models (qwen3, QwQ, gemma4, DeepSeek-R1, etc.) on this endpoint

  Verified with qwen3:14b on Ollama 0.24: with think=False the model correctly emits native tool_calls in a single streaming chunk and the agent executes bash/file/web tools as expected.

* fix(llm): extend _is_ollama_openai_compat_url to match localhost on any port

  Per reviewer feedback on PR #3228:

  1. Generalize host detection to mirror _is_ollama_native_url: match any localhost/127.0.0.1/0.0.0.0/::1 host (not just port 11434) so that custom OLLAMA_HOST ports and container remaps are also covered.
  2. Add tests/test_llm_core_ollama_thinking.py covering:
     - _is_ollama_openai_compat_url for all positive/negative URL cases including IPv6, non-default port, native /api path, and real OpenAI
     - Payload injection: think:false set for Ollama /v1 thinking model, not set for non-thinking model, not set for real OpenAI endpoint, and set for localhost on a non-default port (the new case)
This commit is contained in:
@@ -276,6 +276,24 @@ def _is_ollama_native_url(url: str) -> bool:
|
||||
return local_ollama_host and (path == "" or path == "/api" or path.startswith("/api/"))
|
||||
|
||||
|
||||
def _is_ollama_openai_compat_url(url: str) -> bool:
|
||||
"""Return True for local Ollama's OpenAI-compatible /v1 surface.
|
||||
|
||||
Mirrors the host detection used by ``_is_ollama_native_url`` so that the
|
||||
two helpers stay in lockstep: a localhost Ollama on a non-default port
|
||||
(custom ``OLLAMA_HOST``, reverse proxy, container port remap) is treated
|
||||
the same way here as it is on the native ``/api`` path.
|
||||
"""
|
||||
try:
|
||||
parsed = urlparse(url or "")
|
||||
except Exception:
|
||||
return False
|
||||
host = parsed.hostname or ""
|
||||
path = (parsed.path or "").rstrip("/")
|
||||
local_ollama_host = host in {"localhost", "127.0.0.1", "0.0.0.0", "::1"} or parsed.port == 11434
|
||||
return local_ollama_host and (path == "/v1" or path.startswith("/v1/"))
|
||||
|
||||
|
||||
def _ollama_api_root(url: str) -> str:
|
||||
"""Return a native Ollama API root such as https://ollama.com/api."""
|
||||
url = (url or "").strip().rstrip("/")
|
||||
@@ -1344,6 +1362,9 @@ async def llm_call_async(
|
||||
if max_tokens and max_tokens > 0:
|
||||
tok_key = "max_completion_tokens" if _uses_max_completion_tokens(model) else "max_tokens"
|
||||
payload[tok_key] = max_tokens
|
||||
# Suppress thinking for qwen3/gemma4 on Ollama /v1 — same as stream_llm.
|
||||
if _is_ollama_openai_compat_url(url) and _supports_thinking(model):
|
||||
payload["think"] = False
|
||||
|
||||
if _is_host_dead(target_url):
|
||||
raise HTTPException(503, f"Upstream {_host_key(target_url)} marked unreachable (cooldown active)")
|
||||
@@ -1461,6 +1482,11 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
|
||||
payload[tok_key] = max_tokens
|
||||
if tools:
|
||||
payload["tools"] = tools
|
||||
# For Ollama's OpenAI-compat /v1 endpoint with thinking models (qwen3,
|
||||
# gemma4, etc.), suppress thinking so tool calls aren't swallowed inside
|
||||
# <think> blocks. Ollama /v1 accepts "think": false as a top-level param.
|
||||
if _is_ollama_openai_compat_url(url) and _supports_thinking(model):
|
||||
payload["think"] = False
|
||||
h = _provider_headers(provider, headers)
|
||||
if provider == "copilot":
|
||||
from src.copilot import apply_request_headers
|
||||
|
||||
Reference in New Issue
Block a user