mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
Support vLLM 0.20.2 / NIM reasoning-parser output end-to-end (surface + agent context + render) (#602)
* fix(stream): read 'reasoning' SSE field for vLLM 0.20.2 / NIM

  vLLM 0.20.2 / NVIDIA NIM emit reasoning-parser output in the `reasoning` delta field; older builds use `reasoning_content`. stream_llm() read only the latter, so reasoning from models like Nemotron-3-Nano (--reasoning-parser) was silently dropped and never rendered. Accept either field.

  Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(agent): keep reasoning_content only on the latest assistant turn

  The agent loop echoed each round's reasoning back as `reasoning_content` on every assistant turn, assuming vendors ignore it. Nemotron's chat template re-injects ALL prior reasoning_content as <think> blocks, and the loop is trimmed only once (before it starts) — so reasoning accumulated unbounded across rounds, bloating context and feeding the model its own prior reasoning, which reinforced repetition/looping. Strip reasoning_content from earlier assistant turns so only the most recent round carries it (still satisfies DeepSeek's thinking-mode follow-up requirement).

  Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(agent-ui): wrap each round's reasoning in its own <think> block

  The streamed think-tag wrapper gated on whole-message substring checks (accumulated.includes('<think>')), which only ever wrapped ONE reasoning block per message. A multi-round agent response has a reasoning phase per round, so once round 1 closed its <think>...</think>, rounds 2+ reasoning was emitted unwrapped and leaked into the visible answer. Replace the substring checks with a stateful open/close flag that toggles per think/answer cycle, so each round's reasoning gets its own collapsible block. Single-turn chat is unchanged (one open, one close).

  Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* test(stream): reasoning/reasoning_content delta surfaces as thinking chunk

  Covers @pewdiepie-archdaemon's requested regression: a streamed {reasoning: ...} delta emits a thinking chunk while {content: ...} streams as normal content; plus the older reasoning_content field for backward compat. Mirrors the #591 scenario.

  Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+13
-1
@@ -1101,8 +1101,20 @@ def _append_tool_results(
|
|||||||
`round_reasoning` (DeepSeek / vLLM reasoning-parser deltas) is echoed
|
`round_reasoning` (DeepSeek / vLLM reasoning-parser deltas) is echoed
|
||||||
back via `reasoning_content` on the assistant message — DeepSeek's API
|
back via `reasoning_content` on the assistant message — DeepSeek's API
|
||||||
rejects follow-up requests in thinking mode that don't include the
|
rejects follow-up requests in thinking mode that don't include the
|
||||||
prior reasoning. Other vendors ignore the extra field.
|
prior reasoning.
|
||||||
|
|
||||||
|
NOTE: it is NOT universally ignored. Nemotron's chat template re-injects
|
||||||
|
EVERY prior `reasoning_content` as a <think> block, and this agent loop is
|
||||||
|
trimmed only once (before the loop), so across rounds the reasoning piles
|
||||||
|
up unbounded — bloating context and feeding the model its own prior
|
||||||
|
reasoning, which reinforces repetition/looping. So keep reasoning_content
|
||||||
|
on the MOST RECENT assistant turn only: enough for DeepSeek continuity,
|
||||||
|
without the per-round accumulation.
|
||||||
"""
|
"""
|
||||||
|
# Strip reasoning_content from earlier assistant turns; only the newest keeps it.
|
||||||
|
for _m in messages:
|
||||||
|
if _m.get("role") == "assistant":
|
||||||
|
_m.pop("reasoning_content", None)
|
||||||
if used_native and native_tool_calls:
|
if used_native and native_tool_calls:
|
||||||
assistant_msg = {"role": "assistant"}
|
assistant_msg = {"role": "assistant"}
|
||||||
# When the model emitted ONLY tool calls (no prose), content must be
|
# When the model emitted ONLY tool calls (no prose), content must be
|
||||||
|
|||||||
+2
-2
@@ -1127,8 +1127,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
|
|||||||
delta = j["choices"][0].get("delta") or {}
|
delta = j["choices"][0].get("delta") or {}
|
||||||
if isinstance(delta, dict):
|
if isinstance(delta, dict):
|
||||||
# Text content
|
# Text content
|
||||||
# Reasoning tokens (VLLM --reasoning-parser, e.g. Qwen3/DeepSeek-R1)
|
# Reasoning tokens (VLLM --reasoning-parser, e.g. Qwen3/DeepSeek-R1, Nemotron). vLLM 0.20.2 / NIM emit the field as `reasoning`; older builds use `reasoning_content`. Accept either.
|
||||||
reasoning = delta.get("reasoning_content") or ""
|
reasoning = delta.get("reasoning_content") or delta.get("reasoning") or ""
|
||||||
if reasoning:
|
if reasoning:
|
||||||
yield f'data: {json.dumps({"delta": reasoning, "thinking": True})}\n\n'
|
yield f'data: {json.dumps({"delta": reasoning, "thinking": True})}\n\n'
|
||||||
content = delta.get("content") or ""
|
content = delta.get("content") or ""
|
||||||
|
|||||||
+11
-4
@@ -512,6 +512,10 @@ import createResearchSynapse from './researchSynapse.js';
|
|||||||
|
|
||||||
// Declare accumulated outside try block so it's accessible in catch
|
// Declare accumulated outside try block so it's accessible in catch
|
||||||
let accumulated = '';
|
let accumulated = '';
|
||||||
|
// Are we currently inside an unclosed <think> block? Toggled per think/answer
|
||||||
|
// cycle so a multi-round agent response (one reasoning phase PER round) wraps each
|
||||||
|
// round's reasoning in its own <think>…</think> instead of leaking rounds 2+ as text.
|
||||||
|
let _thinkOpen = false;
|
||||||
let holder = null;
|
let holder = null;
|
||||||
let finalMeta = null;
|
let finalMeta = null;
|
||||||
let finalModelName = null;
|
let finalModelName = null;
|
||||||
@@ -1357,12 +1361,15 @@ import createResearchSynapse from './researchSynapse.js';
|
|||||||
if (_threadAbove && _threadAbove.classList.contains('agent-thread') && !_threadAbove.classList.contains('has-bottom')) {
|
if (_threadAbove && _threadAbove.classList.contains('agent-thread') && !_threadAbove.classList.contains('has-bottom')) {
|
||||||
_threadAbove.classList.add('has-bottom');
|
_threadAbove.classList.add('has-bottom');
|
||||||
}
|
}
|
||||||
// VLLM reasoning tokens: wrap in <think> tags for the thinking UI
|
// VLLM reasoning tokens: wrap in <think> tags for the thinking UI.
|
||||||
|
// Stateful open/close (not a whole-message substring check) so each round
|
||||||
|
// of a multi-round agent response gets its own <think>…</think> — otherwise
|
||||||
|
// only round 1 is wrapped and rounds 2+ reasoning leaks into the answer.
|
||||||
let _delta = json.delta;
|
let _delta = json.delta;
|
||||||
if (json.thinking) {
|
if (json.thinking) {
|
||||||
if (!accumulated.includes('<think>')) _delta = '<think>' + _delta;
|
if (!_thinkOpen) { _delta = '<think>' + _delta; _thinkOpen = true; }
|
||||||
} else if (accumulated.includes('<think>') && !accumulated.includes('</think>')) {
|
} else if (_thinkOpen) {
|
||||||
_delta = '</think>' + _delta;
|
_delta = '</think>' + _delta; _thinkOpen = false;
|
||||||
}
|
}
|
||||||
const wasEmpty = !accumulated;
|
const wasEmpty = !accumulated;
|
||||||
accumulated += _delta;
|
accumulated += _delta;
|
||||||
|
|||||||
@@ -0,0 +1,98 @@
|
|||||||
|
"""Regression: a streamed `reasoning` delta (vLLM 0.20.2 / NIM / Ollama) must surface
|
||||||
|
as a thinking chunk, while a `content` delta still streams as normal content. Also
|
||||||
|
covers the older `reasoning_content` field name for backward compatibility.
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
|
||||||
|
from src import llm_core
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeResp:
|
||||||
|
status_code = 200
|
||||||
|
|
||||||
|
def __init__(self, lines):
|
||||||
|
self._lines = lines
|
||||||
|
|
||||||
|
async def aiter_lines(self):
|
||||||
|
for ln in self._lines:
|
||||||
|
yield ln
|
||||||
|
|
||||||
|
async def aread(self): # only used on non-200; present for safety
|
||||||
|
return b""
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeStreamCtx:
|
||||||
|
def __init__(self, lines):
|
||||||
|
self._lines = lines
|
||||||
|
|
||||||
|
async def __aenter__(self):
|
||||||
|
return _FakeResp(self._lines)
|
||||||
|
|
||||||
|
async def __aexit__(self, *exc):
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeClient:
|
||||||
|
def __init__(self, lines):
|
||||||
|
self._lines = lines
|
||||||
|
|
||||||
|
def stream(self, *args, **kwargs):
|
||||||
|
return _FakeStreamCtx(self._lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _run_stream(model, lines, monkeypatch):
|
||||||
|
"""Drive stream_llm against a faked upstream and return parsed SSE payloads."""
|
||||||
|
monkeypatch.setattr(llm_core, "_get_http_client", lambda: _FakeClient(lines))
|
||||||
|
|
||||||
|
async def _go():
|
||||||
|
out = []
|
||||||
|
async for chunk in llm_core.stream_llm(
|
||||||
|
"http://nim-nano:8000/v1/chat/completions",
|
||||||
|
model,
|
||||||
|
[{"role": "user", "content": "hi"}],
|
||||||
|
):
|
||||||
|
out.append(chunk)
|
||||||
|
return out
|
||||||
|
|
||||||
|
parsed = []
|
||||||
|
for chunk in asyncio.run(_go()):
|
||||||
|
for raw in chunk.splitlines():
|
||||||
|
raw = raw.strip()
|
||||||
|
if raw.startswith("data:"):
|
||||||
|
payload = raw[5:].strip()
|
||||||
|
if payload.startswith("{"):
|
||||||
|
try:
|
||||||
|
parsed.append(json.loads(payload))
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
return [p for p in parsed if "delta" in p]
|
||||||
|
|
||||||
|
|
||||||
|
def test_reasoning_field_emits_thinking_chunk(monkeypatch):
|
||||||
|
deltas = _run_stream(
|
||||||
|
"nvidia/nemotron-3-nano",
|
||||||
|
[
|
||||||
|
'data: {"choices":[{"delta":{"reasoning":"weighing options"}}]}',
|
||||||
|
'data: {"choices":[{"delta":{"content":"Hello"}}]}',
|
||||||
|
"data: [DONE]",
|
||||||
|
],
|
||||||
|
monkeypatch,
|
||||||
|
)
|
||||||
|
assert any(d.get("thinking") and "weighing options" in d["delta"] for d in deltas), deltas
|
||||||
|
assert any((not d.get("thinking")) and d["delta"] == "Hello" for d in deltas), deltas
|
||||||
|
|
||||||
|
|
||||||
|
def test_reasoning_content_field_still_supported(monkeypatch):
|
||||||
|
# Older builds emit `reasoning_content`; it must still surface as thinking.
|
||||||
|
deltas = _run_stream(
|
||||||
|
"some-thinking-model",
|
||||||
|
[
|
||||||
|
'data: {"choices":[{"delta":{"reasoning_content":"older field"}}]}',
|
||||||
|
'data: {"choices":[{"delta":{"content":"Answer"}}]}',
|
||||||
|
"data: [DONE]",
|
||||||
|
],
|
||||||
|
monkeypatch,
|
||||||
|
)
|
||||||
|
assert any(d.get("thinking") and "older field" in d["delta"] for d in deltas), deltas
|
||||||
|
assert any((not d.get("thinking")) and d["delta"] == "Answer" for d in deltas), deltas
|
||||||
Reference in New Issue
Block a user