mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 09:45:24 -04:00
Chat metrics: surface backend generation speed
* Chat metrics: show backend's true generation t/s, not tokens÷wall-clock
The per-message tokens/sec read low and felt wrong because it was computed as
output_tokens / total_duration, where total_duration is wall-clock including
prefill, tool calls, and network — not pure decode time. llama.cpp already
reports the correct gen speed in its stream (timings.predicted_per_second), but
it was being dropped.
- llm_core.py: when parsing the OpenAI-compatible usage chunk, also read the
sibling `timings` block llama.cpp includes — pass predicted_per_second through
as gen_tps and prompt_per_second as prefill_tps on the usage event.
- agent_loop.py: capture backend_gen_tps/backend_prefill_tps from usage events;
in _compute_final_metrics prefer backend_gen_tps over the wall-clock division
when present (fall back to the computed tokens/wall-clock figure for cloud APIs that omit timings). Tag the
result with tps_source ("backend" vs "computed") and surface prefill_tps.
Result: the displayed t/s now matches the model's real decode speed and is
stable regardless of prompt length (a long prefill no longer deflates it).
Checks: py_compile passes; verified extraction against a real llama.cpp final
chunk (gen 79 t/s surfaced vs the deflated wall-clock figure shown before).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
* Chat metrics: surface true t/s on the direct-chat path too
Follow-up to the gen-tps work: the non-agent direct-chat stream path in
chat_routes turned the raw `usage` event straight into a metrics event but only
copied token counts — it never set tokens_per_second or response_time. So simple
(non-tool) replies showed "Speed: n/a" / "Time: undefineds" (the unset value rendered as "undefined" plus the "s" unit) and the chip fell
back to a bare token count ("27 tok") instead of t/s.
Map the usage event's gen_tps (llama.cpp timings.predicted_per_second, added in
the prior commit) into tokens_per_second here too, tag tps_source=backend, and
set response_time from wall-clock for the stats popup.
Checks: py_compile passes; verified llama.cpp emits usage+timings on the final
stream chunk (gen ~90 t/s) that this path consumes.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
* Tests: backend gen/prefill t/s passthrough and preference
Cover the two pieces of the true-t/s metric so it can be reviewed on its own:
- stream_llm surfaces llama.cpp's timings.predicted_per_second /
prompt_per_second as gen_tps / prefill_tps on the usage event (captured
llama.cpp final-chunk fixture), and omits them when the backend reports no
timings.
- _compute_final_metrics prefers backend_gen_tps over output/wall-clock,
tags tps_source ("backend" vs "computed"), and surfaces prefill_tps.
Reuses the fake-client stream harness from test_llm_core_streaming.py.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---------
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+28
-1
@@ -1176,6 +1176,8 @@ def _compute_final_metrics(
|
||||
model: str = "",
|
||||
last_round_input_tokens: int = 0,
|
||||
prep_timings: Optional[Dict[str, float]] = None,
|
||||
backend_gen_tps: float = 0,
|
||||
backend_prefill_tps: float = 0,
|
||||
) -> dict:
|
||||
"""Compute token counts, TPS, and build the final metrics dict."""
|
||||
if has_real_usage:
|
||||
@@ -1188,7 +1190,15 @@ def _compute_final_metrics(
|
||||
input_content += msg["content"] + "\n"
|
||||
input_tokens = len(input_content) // 4
|
||||
output_tokens = len(full_response) // 4
|
||||
-tps = output_tokens / total_duration if total_duration > 0 else 0
+# Prefer the backend's true generation speed (llama.cpp
+# timings.predicted_per_second) — pure decode, no prefill/tool/network time.
+# Fall back to tokens/wall-clock only when the backend didn't report it
+# (e.g. cloud APIs without timings); that figure reads low because
+# total_duration includes prefill + agent overhead.
+if backend_gen_tps and backend_gen_tps > 0:
+    tps = backend_gen_tps
+else:
+    tps = output_tokens / total_duration if total_duration > 0 else 0
|
||||
# Use last round's input tokens for context % (peak usage) when available
|
||||
ctx_tokens = last_round_input_tokens if last_round_input_tokens > 0 else input_tokens
|
||||
ctx_pct = min(round((ctx_tokens / context_length) * 100, 1), 100.0) if context_length else 0
|
||||
@@ -1199,12 +1209,17 @@ def _compute_final_metrics(
|
||||
"input_tokens": input_tokens,
|
||||
"output_tokens": output_tokens,
|
||||
"tokens_per_second": round(tps, 2),
|
||||
# True decode speed when the backend reported it; "computed" = the
|
||||
# tokens/wall-clock fallback (reads low — includes prefill/overhead).
|
||||
"tps_source": "backend" if (backend_gen_tps and backend_gen_tps > 0) else "computed",
|
||||
"total_tokens": input_tokens + output_tokens,
|
||||
"context_length": context_length,
|
||||
"context_percent": ctx_pct,
|
||||
"usage_source": "real" if has_real_usage else "estimated",
|
||||
"model": model,
|
||||
}
|
||||
if backend_prefill_tps and backend_prefill_tps > 0:
|
||||
metrics["prefill_tps"] = round(backend_prefill_tps, 2)
|
||||
if prep_timings:
|
||||
prep_total = round(sum(prep_timings.values()), 3)
|
||||
metrics["agent_prep_time"] = prep_total
|
||||
@@ -1506,6 +1521,8 @@ async def stream_agent_loop(
|
||||
real_output_tokens = 0
|
||||
last_round_input_tokens = 0 # Last round's input tokens (for context % peak)
|
||||
has_real_usage = False
|
||||
backend_gen_tps = 0 # backend-reported true gen speed (llama.cpp timings)
|
||||
backend_prefill_tps = 0 # backend-reported prefill speed
|
||||
total_tool_calls = 0 # for budget enforcement
|
||||
|
||||
# Loop-breaker state. Small models (e.g. deepseek-v4-flash) can get
|
||||
@@ -1655,6 +1672,14 @@ async def stream_agent_loop(
|
||||
real_output_tokens += u.get("output_tokens", 0)
|
||||
last_round_input_tokens = round_input
|
||||
has_real_usage = True
|
||||
# Backend-reported TRUE generation speed (llama.cpp
|
||||
# timings.predicted_per_second) — pure decode, excludes
|
||||
# prefill/network. Preferred over tokens/wall-clock, which
|
||||
# reads low. Keep the last round's value (the gen phase).
|
||||
if u.get("gen_tps"):
|
||||
backend_gen_tps = u["gen_tps"]
|
||||
if u.get("prefill_tps"):
|
||||
backend_prefill_tps = u["prefill_tps"]
|
||||
elif data.get("type") == "fallback":
|
||||
# The selected model failed and another answered; surface
|
||||
# the notice so a misconfigured provider isn't masked.
|
||||
@@ -2181,6 +2206,8 @@ async def stream_agent_loop(
|
||||
has_real_usage, tool_events, round_texts, model=model,
|
||||
last_round_input_tokens=last_round_input_tokens,
|
||||
prep_timings=prep_timings,
|
||||
backend_gen_tps=backend_gen_tps,
|
||||
backend_prefill_tps=backend_prefill_tps,
|
||||
)
|
||||
yield f"data: {json.dumps({'type': 'metrics', 'data': metrics})}\n\n"
|
||||
|
||||
|
||||
+13
-1
@@ -1183,7 +1183,19 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
|
||||
_delta0 = _choices[0].get("delta") if _choices else None
|
||||
if "usage" in j and _delta0 in (None, {}, {"content": None}):
|
||||
u = j["usage"]
|
||||
-yield f'data: {json.dumps({"type": "usage", "data": {"input_tokens": u.get("prompt_tokens", 0), "output_tokens": u.get("completion_tokens", 0)}})}\n\n'
+_usage_data = {"input_tokens": u.get("prompt_tokens", 0), "output_tokens": u.get("completion_tokens", 0)}
+# llama.cpp puts a `timings` block alongside `usage` with the
+# TRUE generation speed (predicted_per_second) — pure decode,
+# excluding prefill/network. Pass it through so the UI shows the
+# real gen t/s instead of recomputing tokens/wall-clock (which
+# includes prefill and reads ~20-40% low). Prefill speed too.
+_tm = j.get("timings")
+if isinstance(_tm, dict):
+    if _tm.get("predicted_per_second"):
+        _usage_data["gen_tps"] = round(_tm["predicted_per_second"], 2)
+    if _tm.get("prompt_per_second"):
+        _usage_data["prefill_tps"] = round(_tm["prompt_per_second"], 2)
+yield f'data: {json.dumps({"type": "usage", "data": _usage_data})}\n\n'
|
||||
elif "choices" in j:
|
||||
delta = j["choices"][0].get("delta") or {}
|
||||
if isinstance(delta, dict):
|
||||
|
||||
Reference in New Issue
Block a user