fix: token usage dropped when it rides on a non-empty finish delta (#1703)

This commit is contained in:
Afonso Coutinho
2026-06-03 05:36:57 +01:00
committed by GitHub
parent 578f56ab92
commit 3da4edb442
2 changed files with 116 additions and 1 deletion
+13 -1
View File
@@ -1323,7 +1323,19 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
# Usage chunk (from stream_options)
_choices = j.get("choices") or []
_delta0 = _choices[0].get("delta") if _choices else None
- if "usage" in j and _delta0 in (None, {}, {"content": None}):
+ # Capture usage whenever the chunk carries it and
+ # the delta has no actual output. Some gateways /
+ # local servers attach usage to the FINAL delta,
+ # which also carries role/finish_reason (so it is
+ # not exactly None/{}/{"content": None}); gating on
+ # those exact shapes discarded their token counts.
+ _delta_has_output = isinstance(_delta0, dict) and (
+ _delta0.get("content")
+ or _delta0.get("reasoning_content")
+ or _delta0.get("reasoning")
+ or _delta0.get("tool_calls")
+ )
+ if "usage" in j and not _delta_has_output:
u = j["usage"]
_usage_data = {"input_tokens": u.get("prompt_tokens", 0), "output_tokens": u.get("completion_tokens", 0)}
# llama.cpp puts a `timings` block alongside `usage` with the