mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-19 19:25:27 -04:00
fix(chat): show requested and actual reply models
Show requested and actual reply models in chat labels when fallback or provider routing changes the responding model.
This commit is contained in:
@@ -167,6 +167,9 @@ def _stream_delta_event(text: str, *, thinking: bool = False) -> str:
|
||||
def _model_activity_key(url: str, model: str) -> str:
|
||||
return f"{(url or '').strip()}|{(model or '').strip()}"
|
||||
|
||||
def _same_model_identity(left: str, right: str) -> bool:
|
||||
return (left or "").strip().lower() == (right or "").strip().lower()
|
||||
|
||||
def note_model_activity(url: str, model: str):
|
||||
"""Record that a real upstream request used this endpoint/model."""
|
||||
if not url or not model:
|
||||
@@ -1493,6 +1496,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
|
||||
_think_open_stripped = False # opening <think> tag already removed
|
||||
_harmony_router = _HarmonyStreamRouter()
|
||||
_harmony_active = False # sticky: gpt-oss harmony <|channel|> stream detected
|
||||
_actual_model = ""
|
||||
_actual_model_announced = False
|
||||
|
||||
def _emit_tool_calls():
|
||||
"""Build the tool_calls event string if any were accumulated."""
|
||||
@@ -1549,6 +1554,15 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
|
||||
if data.strip():
|
||||
if data.startswith("{"):
|
||||
j = json.loads(data)
|
||||
chunk_model = j.get("model")
|
||||
if isinstance(chunk_model, str) and chunk_model.strip():
|
||||
_actual_model = chunk_model.strip()
|
||||
if (
|
||||
not _actual_model_announced
|
||||
and not _same_model_identity(_actual_model, model)
|
||||
):
|
||||
_actual_model_announced = True
|
||||
yield f'data: {json.dumps({"type": "model_actual", "requested_model": model, "model": _actual_model})}\n\n'
|
||||
# Usage chunk (from stream_options)
|
||||
_choices = j.get("choices") or []
|
||||
_delta0 = _choices[0].get("delta") if (_choices and _choices[0] is not None) else None
|
||||
@@ -1579,6 +1593,10 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
|
||||
_usage_data["gen_tps"] = round(_tm["predicted_per_second"], 2)
|
||||
if _tm.get("prompt_per_second"):
|
||||
_usage_data["prefill_tps"] = round(_tm["prompt_per_second"], 2)
|
||||
if _actual_model:
|
||||
_usage_data["model"] = _actual_model
|
||||
if not _same_model_identity(_actual_model, model):
|
||||
_usage_data["requested_model"] = model
|
||||
yield f'data: {json.dumps({"type": "usage", "data": _usage_data})}\n\n'
|
||||
elif "choices" in j:
|
||||
_c0 = (j["choices"] or [None])[0]
|
||||
@@ -1791,6 +1809,13 @@ async def stream_llm_with_fallback(candidates, messages, **kwargs):
|
||||
continue
|
||||
# Any data chunk other than the terminal [DONE] means real output.
|
||||
if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
|
||||
try:
|
||||
event_data = json.loads(chunk[6:])
|
||||
except Exception:
|
||||
event_data = {}
|
||||
if event_data.get("type") == "model_actual":
|
||||
yield chunk
|
||||
continue
|
||||
# First real output from a NON-primary candidate: tell the client
|
||||
# the selected model failed and another answered. Without this the
|
||||
# fallback is invisible — a misconfigured provider looks like it
|
||||
|
||||
Reference in New Issue
Block a user