Keep reasoning (thinking) tokens out of the saved chat reply (#856)

Streamed deltas flagged thinking:true (reasoning-model traces) were being folded
into full_response and persisted as part of the assistant message, so saved
replies were polluted with the model's chain-of-thought. Forward those deltas to
the client (for a live thinking indicator) but exclude them from the accumulated
saved reply, in both the chat and research-stream paths. This mirrors the existing
rewrite path's handling.
This commit is contained in:
Mahdi Salmanzade
2026-06-02 06:17:41 +04:00
committed by GitHub
parent 1007703223
commit 4a84a895a0
+13 -4
View File
@@ -708,8 +708,13 @@ def setup_chat_routes(
try: try:
data = json.loads(chunk[6:]) data = json.loads(chunk[6:])
if "delta" in data: if "delta" in data:
full_response += data["delta"] # Reasoning tokens arrive flagged thinking:true.
_stream_set(session, partial=full_response) # Forward them so the client can show a thinking
# indicator, but don't fold them into the saved
# reply (mirrors the rewrite path below).
if not data.get("thinking"):
full_response += data["delta"]
_stream_set(session, partial=full_response)
yield chunk yield chunk
elif data.get("type") == "usage": elif data.get("type") == "usage":
last_metrics = data.get("data", {}) last_metrics = data.get("data", {})
@@ -805,8 +810,12 @@ def setup_chat_routes(
try: try:
data = json.loads(chunk[6:]) data = json.loads(chunk[6:])
if "delta" in data: if "delta" in data:
full_response += data["delta"] # Reasoning tokens arrive flagged thinking:true.
_stream_set(session, partial=full_response) # Forward them for the live indicator, but keep
# them out of the saved reply (same as chat mode).
if not data.get("thinking"):
full_response += data["delta"]
_stream_set(session, partial=full_response)
yield chunk yield chunk
elif data.get("type") == "web_sources": elif data.get("type") == "web_sources":
web_sources = data.get("data", []) web_sources = data.get("data", [])