fix(chat): show requested and actual reply models

Show both the requested model and the model that actually replied in chat labels when fallback or provider routing changes the responding model.
2026-06-22 04:35:29 -04:00 · 2026-06-06 14:30:16 +04:00
parent 2e37d72155
commit 6ccd4500d7
8 changed files with 285 additions and 38 deletions
@@ -1,7 +1,12 @@
 import pytest
 from fastapi import HTTPException

-from routes.chat_helpers import _enforce_chat_privileges, clean_thinking_for_save, needs_auto_name
+from routes.chat_helpers import (
+    _enforce_chat_privileges,
+    clean_thinking_for_save,
+    needs_auto_name,
+    save_assistant_response,
+)


 class _AuthManager:
@@ -64,6 +69,15 @@ def test_allowed_models_nonempty_list_still_restricts_without_new_flag(monkeypat
        )


+class _FakeSession:
+    def __init__(self, model="selected-model"):
+        self.model = model
+        self.history = []
+
+    def add_message(self, message):
+        self.history.append(message)
+
+
@pytest.mark.parametrize("name,expected", [
    # 24h format (the bug this PR fixes)
    ("deepseek-v4-flash 14:05:33", True),
@@ -130,3 +144,19 @@ def test_clean_thinking_for_save_extracts_thought_tag():

    assert content == "Final answer."
    assert metadata["thinking"] == "internal reasoning"
+
+
+def test_save_assistant_response_preserves_actual_and_requested_model():
+    sess = _FakeSession("selected-model")
+
+    save_assistant_response(
+        sess,
+        session_manager=None,
+        session_id="s1",
+        full_response="hello",
+        last_metrics={"model": "actual-model", "input_tokens": 1, "output_tokens": 2},
+        incognito=True,
+    )
+
+    assert sess.history[-1].metadata["requested_model"] == "selected-model"
+    assert sess.history[-1].metadata["model"] == "actual-model"
@@ -82,6 +82,32 @@ def _usage_event(monkeypatch, lines):
    return asyncio.run(run())


+def _stream_events(monkeypatch, lines):
+    """Drive stream_llm and return all JSON data events."""
+    monkeypatch.setattr(llm_core, "_get_http_client", lambda: _FakeClient(lines))
+    monkeypatch.setattr(llm_core, "_is_host_dead", lambda u: False)
+    monkeypatch.setattr(llm_core, "note_model_activity", lambda *a, **k: None)
+    monkeypatch.setattr(llm_core, "_clear_host_dead", lambda *a, **k: None)
+
+    async def run():
+        events = []
+        async for chunk in llm_core.stream_llm(
+            "http://127.0.0.1:8081/v1/chat/completions",
+            "openrouter/auto",
+            [{"role": "user", "content": "hi"}],
+        ):
+            for ln in chunk.split("\n"):
+                ln = ln.strip()
+                if ln.startswith("data: ") and ln[6:] != "[DONE]":
+                    try:
+                        events.append(json.loads(ln[6:]))
+                    except ValueError:
+                        pass
+        return events
+
+    return asyncio.run(run())
+
+
 # A real llama.cpp final chunk carries `usage` (delta empty / choices []) with a
 # sibling `timings` block. The decode speed here (78.91) is far above the
 # wall-clock figure the old code would have shown.
@@ -127,6 +153,31 @@ def test_stream_llm_omits_tps_when_backend_has_no_timings(monkeypatch):
    assert "prefill_tps" not in usage


+def test_stream_llm_surfaces_provider_resolved_model(monkeypatch):
+    events = _stream_events(monkeypatch, [
+        'data: ' + json.dumps({
+            "model": "meta-llama/llama-3.3-70b-instruct:free",
+            "choices": [{"index": 0, "delta": {"content": "Hi"}}],
+        }),
+        'data: ' + json.dumps({
+            "model": "meta-llama/llama-3.3-70b-instruct:free",
+            "choices": [],
+            "usage": {"prompt_tokens": 8, "completion_tokens": 5},
+        }),
+        "data: [DONE]",
+    ])
+
+    actual = [e for e in events if e.get("type") == "model_actual"]
+    assert actual == [{
+        "type": "model_actual",
+        "requested_model": "openrouter/auto",
+        "model": "meta-llama/llama-3.3-70b-instruct:free",
+    }]
+    usage = [e["data"] for e in events if e.get("type") == "usage"][0]
+    assert usage["requested_model"] == "openrouter/auto"
+    assert usage["model"] == "meta-llama/llama-3.3-70b-instruct:free"
+
+
 # --- _compute_final_metrics preference logic --------------------------------

 def _metrics(**overrides):