fix(chat): show requested and actual reply models

Show requested and actual reply models in chat labels when fallback or provider routing changes the responding model.
This commit is contained in:
Mohammed Riaz
2026-06-06 14:30:16 +04:00
committed by GitHub
parent 2e37d72155
commit 6ccd4500d7
8 changed files with 285 additions and 38 deletions
+10 -1
View File
@@ -1741,6 +1741,8 @@ async def stream_agent_loop(
has_real_usage = False
backend_gen_tps = 0 # backend-reported true gen speed (llama.cpp timings)
backend_prefill_tps = 0 # backend-reported prefill speed
requested_model = model
actual_model = model
total_tool_calls = 0 # for budget enforcement
# Loop-breaker state. Small models (e.g. deepseek-v4-flash) can get
@@ -1913,6 +1915,7 @@ async def stream_agent_loop(
logger.info(f"Agent round {round_num}: received {len(native_tool_calls)} native tool call(s)")
elif data.get("type") == "usage":
u = data.get("data", {})
actual_model = u.get("model") or actual_model
round_input = u.get("input_tokens", 0)
real_input_tokens += round_input
real_output_tokens += u.get("output_tokens", 0)
@@ -1929,9 +1932,14 @@ async def stream_agent_loop(
elif data.get("type") == "fallback":
# The selected model failed and another answered; surface
# the notice so a misconfigured provider isn't masked.
actual_model = data.get("answered_by") or actual_model
logger.warning(f"[agent] round {round_num} fell back: "
f"{data.get('selected_model')} -> {data.get('answered_by')}")
yield chunk
elif data.get("type") == "model_actual":
actual_model = data.get("model") or actual_model
data["requested_model"] = requested_model
yield f"data: {json.dumps(data)}\n\n"
elif "delta" in data:
if not first_token_received:
time_to_first_token = time.time() - total_start
@@ -2562,12 +2570,13 @@ async def stream_agent_loop(
metrics = _compute_final_metrics(
messages, full_response, total_duration, time_to_first_token,
context_length, real_input_tokens, real_output_tokens,
has_real_usage, tool_events, round_texts, model=model,
has_real_usage, tool_events, round_texts, model=actual_model,
last_round_input_tokens=last_round_input_tokens,
prep_timings=prep_timings,
backend_gen_tps=backend_gen_tps,
backend_prefill_tps=backend_prefill_tps,
)
metrics["requested_model"] = requested_model
yield f"data: {json.dumps({'type': 'metrics', 'data': metrics})}\n\n"
# Teacher-escalation: inline takeover visible in the chat stream.
+25
View File
@@ -167,6 +167,9 @@ def _stream_delta_event(text: str, *, thinking: bool = False) -> str:
def _model_activity_key(url: str, model: str) -> str:
return f"{(url or '').strip()}|{(model or '').strip()}"
def _same_model_identity(left: str, right: str) -> bool:
return (left or "").strip().lower() == (right or "").strip().lower()
def note_model_activity(url: str, model: str):
"""Record that a real upstream request used this endpoint/model."""
if not url or not model:
@@ -1493,6 +1496,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
_think_open_stripped = False # opening <think> tag already removed
_harmony_router = _HarmonyStreamRouter()
_harmony_active = False # sticky: gpt-oss harmony <|channel|> stream detected
_actual_model = ""
_actual_model_announced = False
def _emit_tool_calls():
"""Build the tool_calls event string if any were accumulated."""
@@ -1549,6 +1554,15 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
if data.strip():
if data.startswith("{"):
j = json.loads(data)
chunk_model = j.get("model")
if isinstance(chunk_model, str) and chunk_model.strip():
_actual_model = chunk_model.strip()
if (
not _actual_model_announced
and not _same_model_identity(_actual_model, model)
):
_actual_model_announced = True
yield f'data: {json.dumps({"type": "model_actual", "requested_model": model, "model": _actual_model})}\n\n'
# Usage chunk (from stream_options)
_choices = j.get("choices") or []
_delta0 = _choices[0].get("delta") if (_choices and _choices[0] is not None) else None
@@ -1579,6 +1593,10 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
_usage_data["gen_tps"] = round(_tm["predicted_per_second"], 2)
if _tm.get("prompt_per_second"):
_usage_data["prefill_tps"] = round(_tm["prompt_per_second"], 2)
if _actual_model:
_usage_data["model"] = _actual_model
if not _same_model_identity(_actual_model, model):
_usage_data["requested_model"] = model
yield f'data: {json.dumps({"type": "usage", "data": _usage_data})}\n\n'
elif "choices" in j:
_c0 = (j["choices"] or [None])[0]
@@ -1791,6 +1809,13 @@ async def stream_llm_with_fallback(candidates, messages, **kwargs):
continue
# Any data chunk other than the terminal [DONE] means real output.
if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
try:
event_data = json.loads(chunk[6:])
except Exception:
event_data = {}
if event_data.get("type") == "model_actual":
yield chunk
continue
# First real output from a NON-primary candidate: tell the client
# the selected model failed and another answered. Without this the
# fallback is invisible — a misconfigured provider looks like it