fix: surface reasoning_content when content is empty (thinking models) (#1233)

Thinking models served via llama.cpp without --reasoning-format none (e.g. Qwen3, DeepSeek-R1) route all tokens into reasoning_content and return content="". Two call paths were silently broken:

- llm_call / llm_call_async (non-streaming): the hard-keyed lookup data["choices"][0]["message"]["content"] raises KeyError or returns an empty string, discarding the entire response.
- stream_agent_loop end-of-round fallback: when full_response is empty but round_reasoning has content, the existing code replaced the response with the generic empty-response error message, discarding all reasoning tokens that were correctly accumulated during streaming.

Fix: in both non-streaming paths, use msg.get("content") or msg.get("reasoning_content") or "". In the streaming fallback, use round_reasoning as the final response (it was already streamed as thinking chunks, so nothing is re-emitted) before falling through to the error path.
2026-06-16 09:45:24 -04:00 · 2026-06-02 22:11:24 +05:30
parent 257f7ee7b2
commit 7504fedb17
3 changed files with 176 additions and 6 deletions
@@ -1314,6 +1314,30 @@ async def _run_verifier_subagent(
    return [r.strip() for r in reasons.split(";") if r.strip()]


+def _empty_response_fallback(
+    full_response: str,
+    round_reasoning: str,
+    tool_events: list,
+) -> tuple:
+    """Resolve the final response for the end-of-loop empty-response guard.
+
+    full_response is kept if it has non-whitespace text or tool_events is
+    non-empty.  Else a non-blank round_reasoning is used with no chunk: a
+    thinking model may leave content="" and route every token to
+    reasoning_content, already streamed as {thinking:true} chunks.  If both
+    are blank, an error message and its SSE "delta" chunk are returned.
+
+    Returns:
+        (final_response, chunk); chunk is the SSE string to yield, or None.
+    """
+    if full_response.strip() or tool_events:
+        return full_response, None
+    if round_reasoning.strip():
+        return round_reasoning, None
+    _error_msg = "The model returned an empty response. Please try again or switch to a different model."
+    return _error_msg, f'data: {json.dumps({"delta": _error_msg})}\n\n'
+
+
 async def stream_agent_loop(
    endpoint_url: str,
    model: str,
@@ -2225,10 +2249,11 @@ async def stream_agent_loop(

    # If the response is completely empty and no tools were executed,
    # yield a fallback message so the user is not left hanging.
-    if not full_response.strip() and not tool_events:
-        _error_msg = "The model returned an empty response. Please try again or switch to a different model."
-        yield f'data: {json.dumps({"delta": _error_msg})}\n\n'
-        full_response = _error_msg
+    full_response, _fallback_chunk = _empty_response_fallback(
+        full_response, round_reasoning, tool_events
+    )
+    if _fallback_chunk:
+        yield _fallback_chunk

    # --- Final metrics ---
    total_duration = time.time() - total_start
@@ -860,7 +860,8 @@ def llm_call(url: str, model: str, messages: List[Dict], temperature: float = LL
        elif provider == "ollama":
            response = _parse_ollama_response(data)
        else:
-            response = data["choices"][0]["message"]["content"]
+            msg = data["choices"][0]["message"]
+            response = msg.get("content") or msg.get("reasoning_content") or ""
        _set_cached_response(cache_key, response)
        return response
    except Exception:
@@ -997,7 +998,8 @@ async def llm_call_async(
                elif provider == "ollama":
                    response = _parse_ollama_response(data)
                else:
-                    response = data["choices"][0]["message"]["content"]
+                    msg = data["choices"][0]["message"]
+                    response = msg.get("content") or msg.get("reasoning_content") or ""
                _set_cached_response(cache_key, response)
                return response
            except Exception: