fix(chat): show requested and actual reply models

Show requested and actual reply models in chat labels when fallback or provider routing changes the responding model.
2026-06-16 01:35:36 -04:00 · 2026-06-06 14:30:16 +04:00
parent 2e37d72155
commit 6ccd4500d7
8 changed files with 285 additions and 38 deletions
@@ -774,7 +774,19 @@ def save_assistant_response(
 ):
    """Add assistant response to session history. In incognito mode, keeps in-memory context but skips DB persistence."""
    md = dict(last_metrics) if last_metrics else {}
-    md["model"] = sess.model
+    def _model_value(value) -> str:
        if value is None:
            return ""
        if not isinstance(value, str):
            value = str(value)
        return value.strip()
    requested_model = _model_value(md.get("requested_model") or md.get("selected_model") or getattr(sess, "model", ""))
    actual_model = _model_value(md.get("model") or md.get("actual_model") or requested_model)
    if requested_model:
        md["requested_model"] = requested_model
    if actual_model:
        md["model"] = actual_model
    if character_name:
        md["character_name"] = character_name
    if web_sources:
@@ -893,6 +893,8 @@ def setup_chat_routes(
            elif chat_mode == "chat":
                _chat_start = time.time()
                _answered_by = None  # set if the selected model failed and a fallback answered
                _requested_model = sess.model
                _actual_model = None
                # ── Chat mode: call stream_llm directly, NO tools, NO document access ──
                try:
                    _chat_candidates = [(sess.endpoint_url, sess.model, sess.headers)] + _fallback_candidates
@@ -925,10 +927,18 @@ def setup_chat_routes(
                                    # Selected model failed; a fallback answered.
                                    # Forward the notice and remember the real model.
                                    _answered_by = data.get("answered_by") or _answered_by
                                    _actual_model = _actual_model or _answered_by
                                    data["selected_model"] = data.get("selected_model") or _requested_model
                                    yield chunk
                                elif data.get("type") == "model_actual":
                                    _actual_model = data.get("model") or _actual_model
                                    data["requested_model"] = _requested_model
                                    yield f'data: {json.dumps(data)}\n\n'
                                elif data.get("type") == "usage":
                                    last_metrics = data.get("data", {})
-                                    last_metrics["model"] = _answered_by or sess.model
+                                    _reported_model = last_metrics.get("model")
                                    last_metrics["requested_model"] = _requested_model
                                    last_metrics["model"] = _reported_model or _actual_model or _answered_by or _requested_model
                                    if ctx.context_length and last_metrics.get("input_tokens"):
                                        pct = min(round((last_metrics["input_tokens"] / ctx.context_length) * 100, 1), 100.0)
                                        last_metrics["context_percent"] = pct
@@ -965,7 +975,8 @@ def setup_chat_routes(
                                    "tokens_per_second": _tps,
                                    "context_percent": _ctx_pct,
                                    "context_length": ctx.context_length,
-                                    "model": sess.model,
+                                    "model": _actual_model or _answered_by or _requested_model,
                                    "requested_model": _requested_model,
                                    "usage_source": "estimated",
                                }
                                yield f'data: {json.dumps({"type": "metrics", "data": last_metrics})}\n\n'
@@ -994,7 +1005,14 @@ def setup_chat_routes(
                except (asyncio.CancelledError, GeneratorExit):
                    if full_response:
                        logger.info("Client disconnected mid-stream (chat mode) for session %s, saving partial (%d chars)", session, len(full_response))
-                        _stopped_content, _stopped_md = clean_thinking_for_save(full_response, {"stopped": True, "model": sess.model})
+                        _stopped_content, _stopped_md = clean_thinking_for_save(
                            full_response,
                            {
                                "stopped": True,
                                "model": _actual_model or _answered_by or _requested_model,
                                "requested_model": _requested_model,
                            },
                        )
                        sess.add_message(ChatMessage("assistant", _stopped_content, metadata=_stopped_md))
                        if not incognito:
                            session_manager.save_sessions()
@@ -1006,6 +1024,8 @@ def setup_chat_routes(
                _agent_rounds = 0
                _agent_tool_calls = 0
                _answered_by = None  # set if the selected model failed and a fallback answered
                _requested_model = sess.model
                _actual_model = None
                try:
                    from src.settings import get_setting
                    from src.agent_tools import MAX_AGENT_ROUNDS as _DEFAULT_ROUNDS
@@ -1071,10 +1091,18 @@ def setup_chat_routes(
                                    # model so metrics reflect it, not the masked
                                    # selected model.
                                    _answered_by = data.get("answered_by") or _answered_by
                                    _actual_model = _actual_model or _answered_by
                                    data["selected_model"] = data.get("selected_model") or _requested_model
                                    yield chunk
                                elif data.get("type") == "model_actual":
                                    _actual_model = data.get("model") or _actual_model
                                    data["requested_model"] = _requested_model
                                    yield f'data: {json.dumps(data)}\n\n'
                                elif data.get("type") == "metrics":
                                    last_metrics = data.get("data", {})
-                                    last_metrics["model"] = _answered_by or sess.model
+                                    _reported_model = last_metrics.get("model")
                                    last_metrics["requested_model"] = last_metrics.get("requested_model") or _requested_model
                                    last_metrics["model"] = _reported_model or _actual_model or _answered_by or _requested_model
                                    yield f'data: {json.dumps({"type": "metrics", "data": last_metrics})}\n\n'
                            except json.JSONDecodeError:
                                yield chunk
@@ -1115,7 +1143,14 @@ def setup_chat_routes(
                    try:
                        if full_response:
                            logger.info("Client disconnected mid-stream for session %s, saving partial response (%d chars)", session, len(full_response))
-                            _stopped_content2, _stopped_md2 = clean_thinking_for_save(full_response, {"stopped": True, "model": sess.model})
+                            _stopped_content2, _stopped_md2 = clean_thinking_for_save(
                                full_response,
                                {
                                    "stopped": True,
                                    "model": _actual_model or _answered_by or _requested_model,
                                    "requested_model": _requested_model,
                                },
                            )
                            sess.add_message(ChatMessage("assistant", _stopped_content2, metadata=_stopped_md2))
                            if not incognito:
                                session_manager.save_sessions()
@@ -1741,6 +1741,8 @@ async def stream_agent_loop(
    has_real_usage = False
    backend_gen_tps = 0      # backend-reported true gen speed (llama.cpp timings)
    backend_prefill_tps = 0  # backend-reported prefill speed
    requested_model = model
    actual_model = model
    total_tool_calls = 0  # for budget enforcement
    # Loop-breaker state. Small models (e.g. deepseek-v4-flash) can get
@@ -1913,6 +1915,7 @@ async def stream_agent_loop(
                        logger.info(f"Agent round {round_num}: received {len(native_tool_calls)} native tool call(s)")
                    elif data.get("type") == "usage":
                        u = data.get("data", {})
                        actual_model = u.get("model") or actual_model
                        round_input = u.get("input_tokens", 0)
                        real_input_tokens += round_input
                        real_output_tokens += u.get("output_tokens", 0)
@@ -1929,9 +1932,14 @@ async def stream_agent_loop(
                    elif data.get("type") == "fallback":
                        # The selected model failed and another answered; surface
                        # the notice so a misconfigured provider isn't masked.
                        actual_model = data.get("answered_by") or actual_model
                        logger.warning(f"[agent] round {round_num} fell back: "
                                       f"{data.get('selected_model')} -> {data.get('answered_by')}")
                        yield chunk
                    elif data.get("type") == "model_actual":
                        actual_model = data.get("model") or actual_model
                        data["requested_model"] = requested_model
                        yield f"data: {json.dumps(data)}\n\n"
                    elif "delta" in data:
                        if not first_token_received:
                            time_to_first_token = time.time() - total_start
@@ -2562,12 +2570,13 @@ async def stream_agent_loop(
    metrics = _compute_final_metrics(
        messages, full_response, total_duration, time_to_first_token,
        context_length, real_input_tokens, real_output_tokens,
-        has_real_usage, tool_events, round_texts, model=model,
+        has_real_usage, tool_events, round_texts, model=actual_model,
        last_round_input_tokens=last_round_input_tokens,
        prep_timings=prep_timings,
        backend_gen_tps=backend_gen_tps,
        backend_prefill_tps=backend_prefill_tps,
    )
    metrics["requested_model"] = requested_model
    yield f"data: {json.dumps({'type': 'metrics', 'data': metrics})}\n\n"
    # Teacher-escalation: inline takeover visible in the chat stream.
@@ -167,6 +167,9 @@ def _stream_delta_event(text: str, *, thinking: bool = False) -> str:
 def _model_activity_key(url: str, model: str) -> str:
    return f"{(url or '').strip()}|{(model or '').strip()}"
 def _same_model_identity(left: str, right: str) -> bool:
    return (left or "").strip().lower() == (right or "").strip().lower()
 def note_model_activity(url: str, model: str):
    """Record that a real upstream request used this endpoint/model."""
    if not url or not model:
@@ -1493,6 +1496,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
    _think_open_stripped = False  # opening <think> tag already removed
    _harmony_router = _HarmonyStreamRouter()
    _harmony_active = False       # sticky: gpt-oss harmony <|channel|> stream detected
    _actual_model = ""
    _actual_model_announced = False
    def _emit_tool_calls():
        """Build the tool_calls event string if any were accumulated."""
@@ -1549,6 +1554,15 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
                        if data.strip():
                            if data.startswith("{"):
                                j = json.loads(data)
                                chunk_model = j.get("model")
                                if isinstance(chunk_model, str) and chunk_model.strip():
                                    _actual_model = chunk_model.strip()
                                    if (
                                        not _actual_model_announced
                                        and not _same_model_identity(_actual_model, model)
                                    ):
                                        _actual_model_announced = True
                                        yield f'data: {json.dumps({"type": "model_actual", "requested_model": model, "model": _actual_model})}\n\n'
                                # Usage chunk (from stream_options)
                                _choices = j.get("choices") or []
                                _delta0 = _choices[0].get("delta") if (_choices and _choices[0] is not None) else None
@@ -1579,6 +1593,10 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
                                            _usage_data["gen_tps"] = round(_tm["predicted_per_second"], 2)
                                        if _tm.get("prompt_per_second"):
                                            _usage_data["prefill_tps"] = round(_tm["prompt_per_second"], 2)
                                    if _actual_model:
                                        _usage_data["model"] = _actual_model
                                        if not _same_model_identity(_actual_model, model):
                                            _usage_data["requested_model"] = model
                                    yield f'data: {json.dumps({"type": "usage", "data": _usage_data})}\n\n'
                                elif "choices" in j:
                                    _c0 = (j["choices"] or [None])[0]
@@ -1791,6 +1809,13 @@ async def stream_llm_with_fallback(candidates, messages, **kwargs):
                continue
            # Any data chunk other than the terminal [DONE] means real output.
            if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
                try:
                    event_data = json.loads(chunk[6:])
                except Exception:
                    event_data = {}
                if event_data.get("type") == "model_actual":
                    yield chunk
                    continue
                # First real output from a NON-primary candidate: tell the client
                # the selected model failed and another answered. Without this the
                # fallback is invisible — a misconfigured provider looks like it
@@ -53,7 +53,27 @@ import { createStreamRenderer } from './streamingRenderer.js';
  // shortModel and modelColor are now in chatRenderer.js
  var _shortModel = chatRenderer.shortModel;
  var _modelRouteLabel = chatRenderer.modelRouteLabel;
  var _sameModelName = chatRenderer.sameModelName;
  var _applyModelColor = chatRenderer.applyModelColor;
  /**
   * Rewrite a message's role label so it shows the requested and actual model.
   *
   * @param {Element|null} roleEl - The `.role` element to update; no-op when falsy.
   * @param {string} requestedModel - Model the user selected (falls back to actualModel).
   * @param {string} actualModel - Model that actually answered (falls back to requestedModel).
   * @param {Object} [opts] - Optional settings read by this function:
   *   `suffix` (appended in parentheses), `characterName` (replaces the whole
   *   label, suffix included) and `reason` (appended to the tooltip).
   */
  function _setRoleModelLabel(roleEl, requestedModel, actualModel, opts) {
    if (!roleEl) return;
    opts = opts || {};
    // Grab the timestamp node first: it is re-appended at the end, after the
    // label text has been overwritten.
    const tsSpan = roleEl.querySelector('.role-timestamp');
    // Each side defaults to the other so a single known model still yields a label.
    const req = requestedModel || actualModel || '';
    const actual = actualModel || requestedModel || '';
    let label = _modelRouteLabel(req, actual);
    if (opts.suffix) label += ' (' + opts.suffix + ')';
    // A character name takes over the label entirely (the suffix is dropped too).
    if (opts.characterName) label = opts.characterName;
    roleEl.textContent = label + ' ';
    // Colour by the model that answered, not the one that was requested.
    _applyModelColor(roleEl, actual || req);
    if (req && actual && !_sameModelName(req, actual)) {
      // Tooltip carries the full, unshortened names plus the optional reason.
      roleEl.title = req + ' -> ' + actual + (opts.reason ? ': ' + opts.reason : '');
    } else if (!opts.reason) {
      // Models match and no reason was given: clear the tooltip. When a reason
      // is present the existing title is left untouched.
      roleEl.removeAttribute('title');
    }
    if (tsSpan) roleEl.appendChild(tsSpan);
  }
  // Per-session research tracking (supports concurrent research across sessions)
  const _researchingStreamIds = new Set();
  let _researchTimerEl = null, _researchTimerInterval = null;
@@ -556,7 +576,6 @@ import { createStreamRenderer } from './streamingRenderer.js';
    let _thinkOpen = false;
    let holder = null;
    let finalMeta = null;
    let finalModelName = null;
    let spinner = null;
    let timedOut = false;
    let processingProbeTimer = null;
@@ -892,11 +911,13 @@ import { createStreamRenderer } from './streamingRenderer.js';
        loadingText = 'Processing request...';
      }
-      var roleLabel = _shortModel(modelName);
+      var roleLabel = _modelRouteLabel(modelName, modelName);
      var _charNameInit = presetsModule.getCharacterName ? presetsModule.getCharacterName() : '';
      if (_charNameInit) roleLabel = _charNameInit;
      const roleTs = new Date().toLocaleTimeString([], {hour: '2-digit', minute:'2-digit'});
      holder.innerHTML = `<div class="role">${uiModule.esc(roleLabel)} <span class="role-timestamp">${roleTs}</span></div><div class="body"></div>`;
      holder._requestedModel = modelName;
      holder._actualModel = modelName;
      _applyModelColor(holder.querySelector('.role'), modelName);
      holder.style.position = 'relative';
@@ -1807,21 +1828,16 @@ import { createStreamRenderer } from './streamingRenderer.js';
                if (!_isBg && holder) {
                  const roleEl = holder.querySelector('.role');
                  if (roleEl) {
-                    const tsSpan = roleEl.querySelector('.role-timestamp');
+                    holder._requestedModel = json.requested_model || json.model || holder._requestedModel;
-                    var _modelLabel = _shortModel(json.model);
+                    holder._actualModel = json.model || holder._actualModel || holder._requestedModel;
-                    if (json.suffix) {
+                    if (json.suffix) holder._roleSuffix = json.suffix;
                      _modelLabel += ' (' + json.suffix + ')';
                      holder._roleSuffix = json.suffix;
                    }
                    // Prepend character name if sent by server or set locally
                    var _charName = json.character_name || (presetsModule.getCharacterName ? presetsModule.getCharacterName() : '');
-                    if (_charName) {
+                    if (_charName) holder._characterName = _charName;
-                      _modelLabel = _charName;
+                    _setRoleModelLabel(roleEl, holder._requestedModel, holder._actualModel, {
-                      holder._characterName = _charName;
+                      suffix: holder._roleSuffix,
-                    }
+                      characterName: holder._characterName,
-                    roleEl.textContent = _modelLabel + ' ';
+                    });
                    _applyModelColor(roleEl, json.model);
                    if (tsSpan) roleEl.appendChild(tsSpan);
                  }
                }
              } else if (json.type === 'fallback') {
@@ -1841,6 +1857,14 @@ import { createStreamRenderer } from './streamingRenderer.js';
                        (json.reason ? ': ' + json.reason : '') + ' — answered by ' + (json.answered_by || '');
                      _applyModelColor(_rEl, json.answered_by);
                      if (_tsS) _rEl.appendChild(_tsS);
                      holder._requestedModel = json.selected_model || holder._requestedModel || modelName;
                      const _hasResolvedActual = holder._actualModel && !_sameModelName(holder._actualModel, holder._requestedModel);
                      holder._actualModel = _hasResolvedActual ? holder._actualModel : (json.answered_by || holder._actualModel || holder._requestedModel);
                      _setRoleModelLabel(_rEl, holder._requestedModel, holder._actualModel, {
                        suffix: holder._roleSuffix,
                        characterName: holder._characterName,
                        reason: json.reason,
                      });
                    }
                  }
                }
@@ -1882,6 +1906,15 @@ import { createStreamRenderer } from './streamingRenderer.js';
                  _chatBox.appendChild(note);
                  try { note.scrollIntoView({ block: 'end', behavior: 'smooth' }); } catch (_) { uiModule.scrollHistory && uiModule.scrollHistory(); }
                }
              } else if (json.type === 'model_actual') {
                if (!_isBg && holder) {
                  holder._requestedModel = json.requested_model || holder._requestedModel || modelName;
                  holder._actualModel = json.model || holder._actualModel || holder._requestedModel;
                  _setRoleModelLabel(holder.querySelector('.role'), holder._requestedModel, holder._actualModel, {
                    suffix: holder._roleSuffix,
                    characterName: holder._characterName,
                  });
                }
              } else if (json.type === 'attachments') {
                if (_isBg) continue;
                // Update user bubble — replace file chips with image previews
@@ -1959,6 +1992,10 @@ import { createStreamRenderer } from './streamingRenderer.js';
                }
              } else if (json.type === 'metrics') {
                metrics = json.data;
                if (!_isBg && holder && metrics) {
                  holder._requestedModel = metrics.requested_model || holder._requestedModel || modelName;
                  holder._actualModel = metrics.model || holder._actualModel || holder._requestedModel;
                }
                if (_isBg) {
                  var bgM = _backgroundStreams.get(streamSessionId);
                  if (bgM) bgM.metrics = json.data;
@@ -2441,8 +2478,10 @@ import { createStreamRenderer } from './streamingRenderer.js';
                const newRole = document.createElement('div');
                newRole.className = 'role';
                const metaS = sessionModule.getSessions().find(s => s.id === streamSessionId);
-                newRole.textContent = _shortModel(metaS?.model) || '';
+                const _roundRequested = holder?._requestedModel || metaS?.model;
-                _applyModelColor(newRole, metaS?.model);
+                const _roundActual = holder?._actualModel || _roundRequested;
                newRole.textContent = _modelRouteLabel(_roundRequested, _roundActual) || '';
                _applyModelColor(newRole, _roundActual);
                newWrap.appendChild(newRole);
                const newBody = document.createElement('div');
                newBody.className = 'body';
@@ -2548,18 +2587,16 @@ import { createStreamRenderer } from './streamingRenderer.js';
      const _isBgFinal = (sessionModule.getCurrentSessionId() !== streamSessionId) || _backgroundStreams.has(streamSessionId);
      if (!_isBgFinal) {
        finalMeta = sessionModule.getSessions().find(s => s.id === sessionModule.getCurrentSessionId());
-        finalModelName = _shortModel(metrics?.model || finalMeta?.model);
+        const _finalActualModel = metrics?.model || holder._actualModel || finalMeta?.model;
-        // Preserve suffix (e.g. "Research") if set by model_info event
+        const _finalRequestedModel = metrics?.requested_model || holder._requestedModel || finalMeta?.model || _finalActualModel;
        if (holder._roleSuffix) finalModelName += ' (' + holder._roleSuffix + ')';
        // Prepend character name if set
        var _charNameFinal = presetsModule.getCharacterName ? presetsModule.getCharacterName() : '';
        if (_charNameFinal) finalModelName = _charNameFinal;
        const roleEl = holder.querySelector('.role');
        if (roleEl) {
-          const tsSpan = roleEl.querySelector('.role-timestamp');
+          _setRoleModelLabel(roleEl, _finalRequestedModel, _finalActualModel, {
-          roleEl.textContent = finalModelName + ' ';
+            suffix: holder._roleSuffix,
-          _applyModelColor(roleEl, metrics?.model || finalMeta?.model);
+            characterName: _charNameFinal || holder._characterName,
-          if (tsSpan) roleEl.appendChild(tsSpan);
+          });
        }
        holder.dataset.raw = accumulated;
@@ -537,6 +537,39 @@ export function shortModel(name) {
  return short;
 }
 /**
  * Coerce a model identifier to a trimmed string.
  * `null` and `undefined` become the empty string.
  */
 function modelValue(name) {
  return name == null ? '' : String(name).trim();
 }
 /**
  * Report whether two model identifiers name the same model.
  * Names match when they are equal ignoring case, either in full or after
  * shortening with shortModel(). An empty or missing name never matches.
  */
 export function sameModelName(left, right) {
  const first = modelValue(left);
  const second = modelValue(right);
  if (first === '' || second === '') return false;
  if (first.toLowerCase() === second.toLowerCase()) return true;
  return shortModel(first).toLowerCase() === shortModel(second).toLowerCase();
 }
 /**
  * Build the role label for a reply.
  * Returns "requested -> actual" (both shortened) when the answering model
  * differs from the requested one, otherwise just the shortened model name.
  */
 export function modelRouteLabel(requestedModel, actualModel) {
  const wanted = modelValue(requestedModel);
  // With no actual model reported, assume the requested one answered.
  const served = modelValue(actualModel) || wanted;
  const rerouted = wanted !== '' && !sameModelName(wanted, served);
  if (rerouted) {
    return shortModel(wanted) + ' -> ' + shortModel(served);
  }
  return shortModel(served || wanted);
 }
 /**
  * Resolve the { requestedModel, actualModel } pair for a stored reply.
  * Message metadata wins (model / actual_model and requested_model /
  * selected_model); whichever side is missing is filled from the other.
  * Only when the metadata names no model at all is `modelName` used for both.
  */
 export function replyModelPair(modelName, metadata) {
  const meta = metadata || {};
  const metaActual = modelValue(meta.model || meta.actual_model);
  const metaRequested = modelValue(meta.requested_model || meta.selected_model);
  if (!metaActual && !metaRequested) {
    const sessionModel = modelValue(modelName);
    return { requestedModel: sessionModel, actualModel: sessionModel };
  }
  // At least one side is known here, so each can fall back to the other.
  const actualModel = metaActual || metaRequested;
  const requestedModel = metaRequested || actualModel;
  return { requestedModel, actualModel };
 }
 /**
 * Generate a consistent HSL color for a model name.
 * Returns an hsl() string. The hue is derived from a string hash,
@@ -577,7 +610,11 @@ export function applyModelColor(roleEl, modelName) {
  }
  // Replace generic dot with provider logo if available
  const logo = providerLogo(modelName);
-  if (logo && !roleEl.querySelector('.role-provider-logo')) {
+  const existingLogo = roleEl.querySelector('.role-provider-logo');
  if (!logo) {
    if (existingLogo) existingLogo.remove();
    roleEl.classList.remove('has-logo');
  } else if (!existingLogo) {
    const span = document.createElement('span');
    span.className = 'role-provider-logo';
    span.innerHTML = logo;
@@ -1933,8 +1970,12 @@ export function addMessage(role, content, modelName, metadata) {
          wrap.className = 'msg msg-ai' + (r > 0 ? ' msg-continuation' : '');
          const roleEl = document.createElement('div');
          roleEl.className = 'role';
-          const contModel = modelName || metadata?.model;
+          const pair = replyModelPair(modelName, metadata);
-          roleEl.textContent = shortModel(contModel);
+          const contModel = pair.actualModel || pair.requestedModel;
          roleEl.textContent = modelRouteLabel(pair.requestedModel, contModel);
          if (pair.requestedModel && contModel && !sameModelName(pair.requestedModel, contModel)) {
            roleEl.title = pair.requestedModel + ' -> ' + contModel;
          }
          applyModelColor(roleEl, contModel);
          if (r === 0) roleEl.appendChild(roleTimestamp(metadata?.timestamp));
          wrap.appendChild(roleEl);
@@ -2057,8 +2098,9 @@ export function addMessage(role, content, modelName, metadata) {
    r.className = 'role';
    const isSlash = metadata?.source === 'slash';
    const isCompacted = metadata?.compacted;
-    const resolvedModel = modelName || metadata?.model;
+    const replyModels = replyModelPair(modelName, metadata);
-    var _roleText = role === 'user' ? 'You' : (isSlash || isCompacted) ? 'Odysseus' : shortModel(resolvedModel);
+    const resolvedModel = replyModels.actualModel || replyModels.requestedModel;
    var _roleText = role === 'user' ? 'You' : (isSlash || isCompacted) ? 'Odysseus' : modelRouteLabel(replyModels.requestedModel, resolvedModel);
    if (role === 'assistant' && (metadata?.research || metadata?.research_clarification)) {
      _roleText += ' (Research)';
    }
@@ -2069,6 +2111,9 @@ export function addMessage(role, content, modelName, metadata) {
    }
    r.textContent = _roleText;
    if (role !== 'user') {
      if (!isSlash && !isCompacted && replyModels.requestedModel && resolvedModel && !sameModelName(replyModels.requestedModel, resolvedModel)) {
        r.title = replyModels.requestedModel + ' -> ' + resolvedModel;
      }
      if (!isSlash && !isCompacted) applyModelColor(r, resolvedModel);
      r.appendChild(roleTimestamp(metadata?.timestamp));
    }
@@ -2335,6 +2380,9 @@ export function addMessage(role, content, modelName, metadata) {
 const chatRenderer = {
  shortModel,
  sameModelName,
  modelRouteLabel,
  replyModelPair,
  modelColor,
  applyModelColor,
  getModelCost,
@@ -1,7 +1,12 @@
 import pytest
 from fastapi import HTTPException
-from routes.chat_helpers import _enforce_chat_privileges, clean_thinking_for_save, needs_auto_name
+from routes.chat_helpers import (
    _enforce_chat_privileges,
    clean_thinking_for_save,
    needs_auto_name,
    save_assistant_response,
 )
 class _AuthManager:
@@ -64,6 +69,15 @@ def test_allowed_models_nonempty_list_still_restricts_without_new_flag(monkeypat
        )
 class _FakeSession:
    def __init__(self, model="selected-model"):
        self.model = model
        self.history = []
    def add_message(self, message):
        self.history.append(message)
@pytest.mark.parametrize("name,expected", [
    # 24h format (the bug this PR fixes)
    ("deepseek-v4-flash 14:05:33", True),
@@ -130,3 +144,19 @@ def test_clean_thinking_for_save_extracts_thought_tag():
    assert content == "Final answer."
    assert metadata["thinking"] == "internal reasoning"
 def test_save_assistant_response_preserves_actual_and_requested_model():
    """Saved metadata keeps the reported model and records the session's model as requested."""
    session = _FakeSession("selected-model")
    reported_metrics = {"model": "actual-model", "input_tokens": 1, "output_tokens": 2}
    save_assistant_response(
        session,
        session_manager=None,
        session_id="s1",
        full_response="hello",
        last_metrics=reported_metrics,
        incognito=True,
    )
    saved_metadata = session.history[-1].metadata
    assert saved_metadata["requested_model"] == "selected-model"
    assert saved_metadata["model"] == "actual-model"
@@ -82,6 +82,32 @@ def _usage_event(monkeypatch, lines):
    return asyncio.run(run())
 def _stream_events(monkeypatch, lines):
    """Run ``llm_core.stream_llm`` against canned SSE *lines* and return the decoded events.

    The HTTP client and host-health/activity hooks on ``llm_core`` are
    replaced with stubs. Every ``data: `` line except ``[DONE]`` is parsed as
    JSON; lines that fail to parse are skipped.
    """
    stubs = {
        "_get_http_client": lambda: _FakeClient(lines),
        "_is_host_dead": lambda u: False,
        "note_model_activity": lambda *a, **k: None,
        "_clear_host_dead": lambda *a, **k: None,
    }
    for attr_name, stub in stubs.items():
        monkeypatch.setattr(llm_core, attr_name, stub)
    async def _collect():
        collected = []
        stream = llm_core.stream_llm(
            "http://127.0.0.1:8081/v1/chat/completions",
            "openrouter/auto",
            [{"role": "user", "content": "hi"}],
        )
        async for chunk in stream:
            for raw_line in chunk.split("\n"):
                line = raw_line.strip()
                if not line.startswith("data: "):
                    continue
                payload = line[6:]
                if payload == "[DONE]":
                    continue
                try:
                    collected.append(json.loads(payload))
                except ValueError:
                    # Non-JSON payloads are not events; ignore them.
                    continue
        return collected
    return asyncio.run(_collect())
 # A real llama.cpp final chunk carries `usage` (delta empty / choices []) with a
 # sibling `timings` block. The decode speed here (78.91) is far above the
 # wall-clock figure the old code would have shown.
@@ -127,6 +153,31 @@ def test_stream_llm_omits_tps_when_backend_has_no_timings(monkeypatch):
    assert "prefill_tps" not in usage
 def test_stream_llm_surfaces_provider_resolved_model(monkeypatch):
    """A provider-reported model that differs from the request is announced once and echoed in usage."""
    content_chunk = {
        "model": "meta-llama/llama-3.3-70b-instruct:free",
        "choices": [{"index": 0, "delta": {"content": "Hi"}}],
    }
    usage_chunk = {
        "model": "meta-llama/llama-3.3-70b-instruct:free",
        "choices": [],
        "usage": {"prompt_tokens": 8, "completion_tokens": 5},
    }
    sse_lines = ["data: " + json.dumps(payload) for payload in (content_chunk, usage_chunk)]
    sse_lines.append("data: [DONE]")
    events = _stream_events(monkeypatch, sse_lines)
    announcements = [event for event in events if event.get("type") == "model_actual"]
    expected_announcement = {
        "type": "model_actual",
        "requested_model": "openrouter/auto",
        "model": "meta-llama/llama-3.3-70b-instruct:free",
    }
    assert announcements == [expected_announcement]
    usage = [event["data"] for event in events if event.get("type") == "usage"][0]
    assert usage["requested_model"] == "openrouter/auto"
    assert usage["model"] == "meta-llama/llama-3.3-70b-instruct:free"
 # --- _compute_final_metrics preference logic --------------------------------
 def _metrics(**overrides):