mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 17:55:26 -04:00
fix: token usage dropped when it rides on a non-empty finish delta (#1703)
This commit is contained in:
+13
-1
@@ -1323,7 +1323,19 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
|
||||
# Usage chunk (from stream_options)
|
||||
_choices = j.get("choices") or []
|
||||
_delta0 = _choices[0].get("delta") if _choices else None
|
||||
- if "usage" in j and _delta0 in (None, {}, {"content": None}):
|
||||
# Capture usage whenever the chunk carries it and
|
||||
# the delta has no actual output. Some gateways /
|
||||
# local servers attach usage to the FINAL delta,
|
||||
# which also carries role/finish_reason (so it is
|
||||
# not exactly None/{}/{"content": None}); gating on
|
||||
# those exact shapes discarded their token counts.
|
||||
_delta_has_output = isinstance(_delta0, dict) and (
|
||||
_delta0.get("content")
|
||||
or _delta0.get("reasoning_content")
|
||||
or _delta0.get("reasoning")
|
||||
or _delta0.get("tool_calls")
|
||||
)
|
||||
if "usage" in j and not _delta_has_output:
|
||||
u = j["usage"]
|
||||
_usage_data = {"input_tokens": u.get("prompt_tokens", 0), "output_tokens": u.get("completion_tokens", 0)}
|
||||
# llama.cpp puts a `timings` block alongside `usage` with the
|
||||
|
||||
Reference in New Issue
Block a user