Keep reasoning (thinking) tokens out of the saved chat reply (#856)

Streamed deltas flagged `thinking: true` (reasoning-model traces) were being appended to `full_response` and persisted as part of the assistant message, so saved replies were polluted with the model's chain-of-thought. These deltas are still forwarded to the client (so it can show a live thinking indicator), but they are now excluded from the accumulated reply that gets saved and no longer trigger a `_stream_set` partial update. The change applies to both the chat and research-stream paths and mirrors the existing rewrite path's handling.
2026-06-17 10:15:27 -04:00 · 2026-06-02 06:17:41 +04:00
parent 1007703223
commit 4a84a895a0
1 changed files with 13 additions and 4 deletions
@@ -708,8 +708,13 @@ def setup_chat_routes(
                            try:
                                data = json.loads(chunk[6:])
                                if "delta" in data:
-                                    full_response += data["delta"]
+                                    # Reasoning tokens arrive flagged thinking:true.
-                                    _stream_set(session, partial=full_response)
+                                    # Forward them so the client can show a thinking
                                    # indicator, but don't fold them into the saved
                                    # reply (mirrors the rewrite path below).
                                    if not data.get("thinking"):
                                        full_response += data["delta"]
                                        _stream_set(session, partial=full_response)
                                    yield chunk
                                elif data.get("type") == "usage":
                                    last_metrics = data.get("data", {})
@@ -805,8 +810,12 @@ def setup_chat_routes(
                            try:
                                data = json.loads(chunk[6:])
                                if "delta" in data:
-                                    full_response += data["delta"]
+                                    # Reasoning tokens arrive flagged thinking:true.
-                                    _stream_set(session, partial=full_response)
+                                    # Forward them for the live indicator, but keep
                                    # them out of the saved reply (same as chat mode).
                                    if not data.get("thinking"):
                                        full_response += data["delta"]
                                        _stream_set(session, partial=full_response)
                                    yield chunk
                                elif data.get("type") == "web_sources":
                                    web_sources = data.get("data", [])