From 6ccd4500d7057c8afbb52f2ce75ec8e7adfcb1d7 Mon Sep 17 00:00:00 2001 From: Mohammed Riaz <158828174+mohd-riaz@users.noreply.github.com> Date: Sat, 6 Jun 2026 14:30:16 +0400 Subject: [PATCH] fix(chat): show requested and actual reply models Show requested and actual reply models in chat labels when fallback or provider routing changes the responding model. --- routes/chat_helpers.py | 14 +++++- routes/chat_routes.py | 45 +++++++++++++++++--- src/agent_loop.py | 11 ++++- src/llm_core.py | 25 +++++++++++ static/js/chat.js | 87 +++++++++++++++++++++++++++----------- static/js/chatRenderer.js | 58 ++++++++++++++++++++++--- tests/test_chat_helpers.py | 32 +++++++++++++- tests/test_chat_metrics.py | 51 ++++++++++++++++++++++ 8 files changed, 285 insertions(+), 38 deletions(-) diff --git a/routes/chat_helpers.py b/routes/chat_helpers.py index c62d3452d..e83c2f36a 100644 --- a/routes/chat_helpers.py +++ b/routes/chat_helpers.py @@ -774,7 +774,19 @@ def save_assistant_response( ): """Add assistant response to session history. In incognito mode, keeps in-memory context but skips DB persistence.""" md = dict(last_metrics) if last_metrics else {} - md["model"] = sess.model + def _model_value(value) -> str: + if value is None: + return "" + if not isinstance(value, str): + value = str(value) + return value.strip() + + requested_model = _model_value(md.get("requested_model") or md.get("selected_model") or getattr(sess, "model", "")) + actual_model = _model_value(md.get("model") or md.get("actual_model") or requested_model) + if requested_model: + md["requested_model"] = requested_model + if actual_model: + md["model"] = actual_model if character_name: md["character_name"] = character_name if web_sources: diff --git a/routes/chat_routes.py b/routes/chat_routes.py index f6ab0977d..9554e243f 100644 --- a/routes/chat_routes.py +++ b/routes/chat_routes.py @@ -893,6 +893,8 @@ def setup_chat_routes( elif chat_mode == "chat": _chat_start = time.time() _answered_by = None # set if the selected model failed and a fallback answered + _requested_model = sess.model + _actual_model = None # ── Chat mode: call stream_llm directly, NO tools, NO document access ── try: _chat_candidates = [(sess.endpoint_url, sess.model, sess.headers)] + _fallback_candidates @@ -925,10 +927,18 @@ def setup_chat_routes( # Selected model failed; a fallback answered. # Forward the notice and remember the real model. _answered_by = data.get("answered_by") or _answered_by + _actual_model = _actual_model or _answered_by + data["selected_model"] = data.get("selected_model") or _requested_model yield chunk + elif data.get("type") == "model_actual": + _actual_model = data.get("model") or _actual_model + data["requested_model"] = _requested_model + yield f'data: {json.dumps(data)}\n\n' elif data.get("type") == "usage": last_metrics = data.get("data", {}) - last_metrics["model"] = _answered_by or sess.model + _reported_model = last_metrics.get("model") + last_metrics["requested_model"] = _requested_model + last_metrics["model"] = _reported_model or _actual_model or _answered_by or _requested_model if ctx.context_length and last_metrics.get("input_tokens"): pct = min(round((last_metrics["input_tokens"] / ctx.context_length) * 100, 1), 100.0) last_metrics["context_percent"] = pct @@ -965,7 +975,8 @@ def setup_chat_routes( "tokens_per_second": _tps, "context_percent": _ctx_pct, "context_length": ctx.context_length, - "model": sess.model, + "model": _actual_model or _answered_by or _requested_model, + "requested_model": _requested_model, "usage_source": "estimated", } yield f'data: {json.dumps({"type": "metrics", "data": last_metrics})}\n\n' @@ -994,7 +1005,14 @@ def setup_chat_routes( except (asyncio.CancelledError, GeneratorExit): if full_response: logger.info("Client disconnected mid-stream (chat mode) for session %s, saving partial (%d chars)", session, len(full_response)) - _stopped_content, _stopped_md = clean_thinking_for_save(full_response, {"stopped": True, "model": sess.model}) + _stopped_content, _stopped_md = clean_thinking_for_save( + full_response, + { + "stopped": True, + "model": _actual_model or _answered_by or _requested_model, + "requested_model": _requested_model, + }, + ) sess.add_message(ChatMessage("assistant", _stopped_content, metadata=_stopped_md)) if not incognito: session_manager.save_sessions() @@ -1006,6 +1024,8 @@ def setup_chat_routes( _agent_rounds = 0 _agent_tool_calls = 0 _answered_by = None # set if the selected model failed and a fallback answered + _requested_model = sess.model + _actual_model = None try: from src.settings import get_setting from src.agent_tools import MAX_AGENT_ROUNDS as _DEFAULT_ROUNDS @@ -1071,10 +1091,18 @@ def setup_chat_routes( # model so metrics reflect it, not the masked # selected model. _answered_by = data.get("answered_by") or _answered_by + _actual_model = _actual_model or _answered_by + data["selected_model"] = data.get("selected_model") or _requested_model yield chunk + elif data.get("type") == "model_actual": + _actual_model = data.get("model") or _actual_model + data["requested_model"] = _requested_model + yield f'data: {json.dumps(data)}\n\n' elif data.get("type") == "metrics": last_metrics = data.get("data", {}) - last_metrics["model"] = _answered_by or sess.model + _reported_model = last_metrics.get("model") + last_metrics["requested_model"] = last_metrics.get("requested_model") or _requested_model + last_metrics["model"] = _reported_model or _actual_model or _answered_by or _requested_model yield f'data: {json.dumps({"type": "metrics", "data": last_metrics})}\n\n' except json.JSONDecodeError: yield chunk @@ -1115,7 +1143,14 @@ def setup_chat_routes( try: if full_response: logger.info("Client disconnected mid-stream for session %s, saving partial response (%d chars)", session, len(full_response)) - _stopped_content2, _stopped_md2 = clean_thinking_for_save(full_response, {"stopped": True, "model": sess.model}) + _stopped_content2, _stopped_md2 = clean_thinking_for_save( + full_response, + { + "stopped": True, + "model": _actual_model or _answered_by or _requested_model, + "requested_model": _requested_model, + }, + ) sess.add_message(ChatMessage("assistant", _stopped_content2, metadata=_stopped_md2)) if not incognito: session_manager.save_sessions() diff --git a/src/agent_loop.py b/src/agent_loop.py index 1f70ca2a5..ae13d9abb 100644 --- a/src/agent_loop.py +++ b/src/agent_loop.py @@ -1741,6 +1741,8 @@ async def stream_agent_loop( has_real_usage = False backend_gen_tps = 0 # backend-reported true gen speed (llama.cpp timings) backend_prefill_tps = 0 # backend-reported prefill speed + requested_model = model + actual_model = model total_tool_calls = 0 # for budget enforcement # Loop-breaker state. Small models (e.g. deepseek-v4-flash) can get @@ -1913,6 +1915,7 @@ async def stream_agent_loop( logger.info(f"Agent round {round_num}: received {len(native_tool_calls)} native tool call(s)") elif data.get("type") == "usage": u = data.get("data", {}) + actual_model = u.get("model") or actual_model round_input = u.get("input_tokens", 0) real_input_tokens += round_input real_output_tokens += u.get("output_tokens", 0) @@ -1929,9 +1932,14 @@ async def stream_agent_loop( elif data.get("type") == "fallback": # The selected model failed and another answered; surface # the notice so a misconfigured provider isn't masked. + actual_model = data.get("answered_by") or actual_model logger.warning(f"[agent] round {round_num} fell back: " f"{data.get('selected_model')} -> {data.get('answered_by')}") yield chunk + elif data.get("type") == "model_actual": + actual_model = data.get("model") or actual_model + data["requested_model"] = requested_model + yield f"data: {json.dumps(data)}\n\n" elif "delta" in data: if not first_token_received: time_to_first_token = time.time() - total_start @@ -2562,12 +2570,13 @@ async def stream_agent_loop( metrics = _compute_final_metrics( messages, full_response, total_duration, time_to_first_token, context_length, real_input_tokens, real_output_tokens, - has_real_usage, tool_events, round_texts, model=model, + has_real_usage, tool_events, round_texts, model=actual_model, last_round_input_tokens=last_round_input_tokens, prep_timings=prep_timings, backend_gen_tps=backend_gen_tps, backend_prefill_tps=backend_prefill_tps, ) + metrics["requested_model"] = requested_model yield f"data: {json.dumps({'type': 'metrics', 'data': metrics})}\n\n" # Teacher-escalation: inline takeover visible in the chat stream. diff --git a/src/llm_core.py b/src/llm_core.py index 6a4c81e52..9123a1b4a 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -167,6 +167,9 @@ def _stream_delta_event(text: str, *, thinking: bool = False) -> str: def _model_activity_key(url: str, model: str) -> str: return f"{(url or '').strip()}|{(model or '').strip()}" +def _same_model_identity(left: str, right: str) -> bool: + return (left or "").strip().lower() == (right or "").strip().lower() + def note_model_activity(url: str, model: str): """Record that a real upstream request used this endpoint/model.""" if not url or not model: @@ -1493,6 +1496,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl _think_open_stripped = False # opening tag already removed _harmony_router = _HarmonyStreamRouter() _harmony_active = False # sticky: gpt-oss harmony <|channel|> stream detected + _actual_model = "" + _actual_model_announced = False def _emit_tool_calls(): """Build the tool_calls event string if any were accumulated.""" @@ -1549,6 +1554,15 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl if data.strip(): if data.startswith("{"): j = json.loads(data) + chunk_model = j.get("model") + if isinstance(chunk_model, str) and chunk_model.strip(): + _actual_model = chunk_model.strip() + if ( + not _actual_model_announced + and not _same_model_identity(_actual_model, model) + ): + _actual_model_announced = True + yield f'data: {json.dumps({"type": "model_actual", "requested_model": model, "model": _actual_model})}\n\n' # Usage chunk (from stream_options) _choices = j.get("choices") or [] _delta0 = _choices[0].get("delta") if (_choices and _choices[0] is not None) else None @@ -1579,6 +1593,10 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl _usage_data["gen_tps"] = round(_tm["predicted_per_second"], 2) if _tm.get("prompt_per_second"): _usage_data["prefill_tps"] = round(_tm["prompt_per_second"], 2) + if _actual_model: + _usage_data["model"] = _actual_model + if not _same_model_identity(_actual_model, model): + _usage_data["requested_model"] = model yield f'data: {json.dumps({"type": "usage", "data": _usage_data})}\n\n' elif "choices" in j: _c0 = (j["choices"] or [None])[0] @@ -1791,6 +1809,13 @@ async def stream_llm_with_fallback(candidates, messages, **kwargs): continue # Any data chunk other than the terminal [DONE] means real output. if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"): + try: + event_data = json.loads(chunk[6:]) + except Exception: + event_data = {} + if event_data.get("type") == "model_actual": + yield chunk + continue # First real output from a NON-primary candidate: tell the client # the selected model failed and another answered. Without this the # fallback is invisible — a misconfigured provider looks like it diff --git a/static/js/chat.js b/static/js/chat.js index 604f8c609..1b2185c36 100644 --- a/static/js/chat.js +++ b/static/js/chat.js @@ -53,7 +53,27 @@ import { createStreamRenderer } from './streamingRenderer.js'; // shortModel and modelColor are now in chatRenderer.js var _shortModel = chatRenderer.shortModel; + var _modelRouteLabel = chatRenderer.modelRouteLabel; + var _sameModelName = chatRenderer.sameModelName; var _applyModelColor = chatRenderer.applyModelColor; + function _setRoleModelLabel(roleEl, requestedModel, actualModel, opts) { + if (!roleEl) return; + opts = opts || {}; + const tsSpan = roleEl.querySelector('.role-timestamp'); + const req = requestedModel || actualModel || ''; + const actual = actualModel || requestedModel || ''; + let label = _modelRouteLabel(req, actual); + if (opts.suffix) label += ' (' + opts.suffix + ')'; + if (opts.characterName) label = opts.characterName; + roleEl.textContent = label + ' '; + _applyModelColor(roleEl, actual || req); + if (req && actual && !_sameModelName(req, actual)) { + roleEl.title = req + ' -> ' + actual + (opts.reason ? ': ' + opts.reason : ''); + } else if (!opts.reason) { + roleEl.removeAttribute('title'); + } + if (tsSpan) roleEl.appendChild(tsSpan); + } // Per-session research tracking (supports concurrent research across sessions) const _researchingStreamIds = new Set(); let _researchTimerEl = null, _researchTimerInterval = null; @@ -556,7 +576,6 @@ import { createStreamRenderer } from './streamingRenderer.js'; let _thinkOpen = false; let holder = null; let finalMeta = null; - let finalModelName = null; let spinner = null; let timedOut = false; let processingProbeTimer = null; @@ -892,11 +911,13 @@ import { createStreamRenderer } from './streamingRenderer.js'; loadingText = 'Processing request...'; } - var roleLabel = _shortModel(modelName); + var roleLabel = _modelRouteLabel(modelName, modelName); var _charNameInit = presetsModule.getCharacterName ? presetsModule.getCharacterName() : ''; if (_charNameInit) roleLabel = _charNameInit; const roleTs = new Date().toLocaleTimeString([], {hour: '2-digit', minute:'2-digit'}); holder.innerHTML = `
${uiModule.esc(roleLabel)} ${roleTs}
`; + holder._requestedModel = modelName; + holder._actualModel = modelName; _applyModelColor(holder.querySelector('.role'), modelName); holder.style.position = 'relative'; @@ -1807,21 +1828,16 @@ import { createStreamRenderer } from './streamingRenderer.js'; if (!_isBg && holder) { const roleEl = holder.querySelector('.role'); if (roleEl) { - const tsSpan = roleEl.querySelector('.role-timestamp'); - var _modelLabel = _shortModel(json.model); - if (json.suffix) { - _modelLabel += ' (' + json.suffix + ')'; - holder._roleSuffix = json.suffix; - } + holder._requestedModel = json.requested_model || json.model || holder._requestedModel; + holder._actualModel = json.model || holder._actualModel || holder._requestedModel; + if (json.suffix) holder._roleSuffix = json.suffix; // Prepend character name if sent by server or set locally var _charName = json.character_name || (presetsModule.getCharacterName ? presetsModule.getCharacterName() : ''); - if (_charName) { - _modelLabel = _charName; - holder._characterName = _charName; - } - roleEl.textContent = _modelLabel + ' '; - _applyModelColor(roleEl, json.model); - if (tsSpan) roleEl.appendChild(tsSpan); + if (_charName) holder._characterName = _charName; + _setRoleModelLabel(roleEl, holder._requestedModel, holder._actualModel, { + suffix: holder._roleSuffix, + characterName: holder._characterName, + }); } } } else if (json.type === 'fallback') { @@ -1841,6 +1857,14 @@ import { createStreamRenderer } from './streamingRenderer.js'; (json.reason ? ': ' + json.reason : '') + ' — answered by ' + (json.answered_by || ''); _applyModelColor(_rEl, json.answered_by); if (_tsS) _rEl.appendChild(_tsS); + holder._requestedModel = json.selected_model || holder._requestedModel || modelName; + const _hasResolvedActual = holder._actualModel && !_sameModelName(holder._actualModel, holder._requestedModel); + holder._actualModel = _hasResolvedActual ? holder._actualModel : (json.answered_by || holder._actualModel || holder._requestedModel); + _setRoleModelLabel(_rEl, holder._requestedModel, holder._actualModel, { + suffix: holder._roleSuffix, + characterName: holder._characterName, + reason: json.reason, + }); } } } @@ -1882,6 +1906,15 @@ import { createStreamRenderer } from './streamingRenderer.js'; _chatBox.appendChild(note); try { note.scrollIntoView({ block: 'end', behavior: 'smooth' }); } catch (_) { uiModule.scrollHistory && uiModule.scrollHistory(); } } + } else if (json.type === 'model_actual') { + if (!_isBg && holder) { + holder._requestedModel = json.requested_model || holder._requestedModel || modelName; + holder._actualModel = json.model || holder._actualModel || holder._requestedModel; + _setRoleModelLabel(holder.querySelector('.role'), holder._requestedModel, holder._actualModel, { + suffix: holder._roleSuffix, + characterName: holder._characterName, + }); + } } else if (json.type === 'attachments') { if (_isBg) continue; // Update user bubble — replace file chips with image previews @@ -1959,6 +1992,10 @@ import { createStreamRenderer } from './streamingRenderer.js'; } } else if (json.type === 'metrics') { metrics = json.data; + if (!_isBg && holder && metrics) { + holder._requestedModel = metrics.requested_model || holder._requestedModel || modelName; + holder._actualModel = metrics.model || holder._actualModel || holder._requestedModel; + } if (_isBg) { var bgM = _backgroundStreams.get(streamSessionId); if (bgM) bgM.metrics = json.data; @@ -2441,8 +2478,10 @@ import { createStreamRenderer } from './streamingRenderer.js'; const newRole = document.createElement('div'); newRole.className = 'role'; const metaS = sessionModule.getSessions().find(s => s.id === streamSessionId); - newRole.textContent = _shortModel(metaS?.model) || ''; - _applyModelColor(newRole, metaS?.model); + const _roundRequested = holder?._requestedModel || metaS?.model; + const _roundActual = holder?._actualModel || _roundRequested; + newRole.textContent = _modelRouteLabel(_roundRequested, _roundActual) || ''; + _applyModelColor(newRole, _roundActual); newWrap.appendChild(newRole); const newBody = document.createElement('div'); newBody.className = 'body'; @@ -2548,18 +2587,16 @@ import { createStreamRenderer } from './streamingRenderer.js'; const _isBgFinal = (sessionModule.getCurrentSessionId() !== streamSessionId) || _backgroundStreams.has(streamSessionId); if (!_isBgFinal) { finalMeta = sessionModule.getSessions().find(s => s.id === sessionModule.getCurrentSessionId()); - finalModelName = _shortModel(metrics?.model || finalMeta?.model); - // Preserve suffix (e.g. "Research") if set by model_info event - if (holder._roleSuffix) finalModelName += ' (' + holder._roleSuffix + ')'; + const _finalActualModel = metrics?.model || holder._actualModel || finalMeta?.model; + const _finalRequestedModel = metrics?.requested_model || holder._requestedModel || finalMeta?.model || _finalActualModel; // Prepend character name if set var _charNameFinal = presetsModule.getCharacterName ? presetsModule.getCharacterName() : ''; - if (_charNameFinal) finalModelName = _charNameFinal; const roleEl = holder.querySelector('.role'); if (roleEl) { - const tsSpan = roleEl.querySelector('.role-timestamp'); - roleEl.textContent = finalModelName + ' '; - _applyModelColor(roleEl, metrics?.model || finalMeta?.model); - if (tsSpan) roleEl.appendChild(tsSpan); + _setRoleModelLabel(roleEl, _finalRequestedModel, _finalActualModel, { + suffix: holder._roleSuffix, + characterName: _charNameFinal || holder._characterName, + }); } holder.dataset.raw = accumulated; diff --git a/static/js/chatRenderer.js b/static/js/chatRenderer.js index e8aa9de5c..8b648d634 100644 --- a/static/js/chatRenderer.js +++ b/static/js/chatRenderer.js @@ -537,6 +537,39 @@ export function shortModel(name) { return short; } +function modelValue(name) { + if (name == null) return ''; + return String(name).trim(); +} + +export function sameModelName(left, right) { + const a = modelValue(left); + const b = modelValue(right); + if (!a || !b) return false; + return a.toLowerCase() === b.toLowerCase() + || shortModel(a).toLowerCase() === shortModel(b).toLowerCase(); +} + +export function modelRouteLabel(requestedModel, actualModel) { + const requested = modelValue(requestedModel); + const actual = modelValue(actualModel) || requested; + if (!requested || sameModelName(requested, actual)) return shortModel(actual || requested); + return shortModel(requested) + ' -> ' + shortModel(actual); +} + +export function replyModelPair(modelName, metadata) { + const meta = metadata || {}; + const actualFromMeta = modelValue(meta.model || meta.actual_model); + const requestedFromMeta = modelValue(meta.requested_model || meta.selected_model); + if (actualFromMeta || requestedFromMeta) { + const actual = actualFromMeta || requestedFromMeta || modelValue(modelName); + const requested = requestedFromMeta || actual; + return { requestedModel: requested, actualModel: actual }; + } + const fallback = modelValue(modelName); + return { requestedModel: fallback, actualModel: fallback }; +} + /** * Generate a consistent HSL color for a model name. * Returns an hsl() string. The hue is derived from a string hash, @@ -577,7 +610,11 @@ export function applyModelColor(roleEl, modelName) { } // Replace generic dot with provider logo if available const logo = providerLogo(modelName); - if (logo && !roleEl.querySelector('.role-provider-logo')) { + const existingLogo = roleEl.querySelector('.role-provider-logo'); + if (!logo) { + if (existingLogo) existingLogo.remove(); + roleEl.classList.remove('has-logo'); + } else if (!existingLogo) { const span = document.createElement('span'); span.className = 'role-provider-logo'; span.innerHTML = logo; @@ -1933,8 +1970,12 @@ export function addMessage(role, content, modelName, metadata) { wrap.className = 'msg msg-ai' + (r > 0 ? ' msg-continuation' : ''); const roleEl = document.createElement('div'); roleEl.className = 'role'; - const contModel = modelName || metadata?.model; - roleEl.textContent = shortModel(contModel); + const pair = replyModelPair(modelName, metadata); + const contModel = pair.actualModel || pair.requestedModel; + roleEl.textContent = modelRouteLabel(pair.requestedModel, contModel); + if (pair.requestedModel && contModel && !sameModelName(pair.requestedModel, contModel)) { + roleEl.title = pair.requestedModel + ' -> ' + contModel; + } applyModelColor(roleEl, contModel); if (r === 0) roleEl.appendChild(roleTimestamp(metadata?.timestamp)); wrap.appendChild(roleEl); @@ -2057,8 +2098,9 @@ export function addMessage(role, content, modelName, metadata) { r.className = 'role'; const isSlash = metadata?.source === 'slash'; const isCompacted = metadata?.compacted; - const resolvedModel = modelName || metadata?.model; - var _roleText = role === 'user' ? 'You' : (isSlash || isCompacted) ? 'Odysseus' : shortModel(resolvedModel); + const replyModels = replyModelPair(modelName, metadata); + const resolvedModel = replyModels.actualModel || replyModels.requestedModel; + var _roleText = role === 'user' ? 'You' : (isSlash || isCompacted) ? 'Odysseus' : modelRouteLabel(replyModels.requestedModel, resolvedModel); if (role === 'assistant' && (metadata?.research || metadata?.research_clarification)) { _roleText += ' (Research)'; } @@ -2069,6 +2111,9 @@ export function addMessage(role, content, modelName, metadata) { } r.textContent = _roleText; if (role !== 'user') { + if (!isSlash && !isCompacted && replyModels.requestedModel && resolvedModel && !sameModelName(replyModels.requestedModel, resolvedModel)) { + r.title = replyModels.requestedModel + ' -> ' + resolvedModel; + } if (!isSlash && !isCompacted) applyModelColor(r, resolvedModel); r.appendChild(roleTimestamp(metadata?.timestamp)); } @@ -2335,6 +2380,9 @@ export function addMessage(role, content, modelName, metadata) { const chatRenderer = { shortModel, + sameModelName, + modelRouteLabel, + replyModelPair, modelColor, applyModelColor, getModelCost, diff --git a/tests/test_chat_helpers.py b/tests/test_chat_helpers.py index ba23e43ce..1c2b060ed 100644 --- a/tests/test_chat_helpers.py +++ b/tests/test_chat_helpers.py @@ -1,7 +1,12 @@ import pytest from fastapi import HTTPException -from routes.chat_helpers import _enforce_chat_privileges, clean_thinking_for_save, needs_auto_name +from routes.chat_helpers import ( + _enforce_chat_privileges, + clean_thinking_for_save, + needs_auto_name, + save_assistant_response, +) class _AuthManager: @@ -64,6 +69,15 @@ def test_allowed_models_nonempty_list_still_restricts_without_new_flag(monkeypat ) +class _FakeSession: + def __init__(self, model="selected-model"): + self.model = model + self.history = [] + + def add_message(self, message): + self.history.append(message) + + @pytest.mark.parametrize("name,expected", [ # 24h format (the bug this PR fixes) ("deepseek-v4-flash 14:05:33", True), @@ -130,3 +144,19 @@ def test_clean_thinking_for_save_extracts_thought_tag(): assert content == "Final answer." assert metadata["thinking"] == "internal reasoning" + + +def test_save_assistant_response_preserves_actual_and_requested_model(): + sess = _FakeSession("selected-model") + + save_assistant_response( + sess, + session_manager=None, + session_id="s1", + full_response="hello", + last_metrics={"model": "actual-model", "input_tokens": 1, "output_tokens": 2}, + incognito=True, + ) + + assert sess.history[-1].metadata["requested_model"] == "selected-model" + assert sess.history[-1].metadata["model"] == "actual-model" diff --git a/tests/test_chat_metrics.py b/tests/test_chat_metrics.py index 9a218fa2e..13d5421c6 100644 --- a/tests/test_chat_metrics.py +++ b/tests/test_chat_metrics.py @@ -82,6 +82,32 @@ def _usage_event(monkeypatch, lines): return asyncio.run(run()) +def _stream_events(monkeypatch, lines): + """Drive stream_llm and return all JSON data events.""" + monkeypatch.setattr(llm_core, "_get_http_client", lambda: _FakeClient(lines)) + monkeypatch.setattr(llm_core, "_is_host_dead", lambda u: False) + monkeypatch.setattr(llm_core, "note_model_activity", lambda *a, **k: None) + monkeypatch.setattr(llm_core, "_clear_host_dead", lambda *a, **k: None) + + async def run(): + events = [] + async for chunk in llm_core.stream_llm( + "http://127.0.0.1:8081/v1/chat/completions", + "openrouter/auto", + [{"role": "user", "content": "hi"}], + ): + for ln in chunk.split("\n"): + ln = ln.strip() + if ln.startswith("data: ") and ln[6:] != "[DONE]": + try: + events.append(json.loads(ln[6:])) + except ValueError: + pass + return events + + return asyncio.run(run()) + + # A real llama.cpp final chunk carries `usage` (delta empty / choices []) with a # sibling `timings` block. The decode speed here (78.91) is far above the # wall-clock figure the old code would have shown. @@ -127,6 +153,31 @@ def test_stream_llm_omits_tps_when_backend_has_no_timings(monkeypatch): assert "prefill_tps" not in usage +def test_stream_llm_surfaces_provider_resolved_model(monkeypatch): + events = _stream_events(monkeypatch, [ + 'data: ' + json.dumps({ + "model": "meta-llama/llama-3.3-70b-instruct:free", + "choices": [{"index": 0, "delta": {"content": "Hi"}}], + }), + 'data: ' + json.dumps({ + "model": "meta-llama/llama-3.3-70b-instruct:free", + "choices": [], + "usage": {"prompt_tokens": 8, "completion_tokens": 5}, + }), + "data: [DONE]", + ]) + + actual = [e for e in events if e.get("type") == "model_actual"] + assert actual == [{ + "type": "model_actual", + "requested_model": "openrouter/auto", + "model": "meta-llama/llama-3.3-70b-instruct:free", + }] + usage = [e["data"] for e in events if e.get("type") == "usage"][0] + assert usage["requested_model"] == "openrouter/auto" + assert usage["model"] == "meta-llama/llama-3.3-70b-instruct:free" + + # --- _compute_final_metrics preference logic -------------------------------- def _metrics(**overrides):