mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 01:35:36 -04:00
fix(chat): show requested and actual reply models
Show requested and actual reply models in chat labels when fallback or provider routing changes the responding model.
This commit is contained in:
+13
-1
@@ -774,7 +774,19 @@ def save_assistant_response(
|
|||||||
):
|
):
|
||||||
"""Add assistant response to session history. In incognito mode, keeps in-memory context but skips DB persistence."""
|
"""Add assistant response to session history. In incognito mode, keeps in-memory context but skips DB persistence."""
|
||||||
md = dict(last_metrics) if last_metrics else {}
|
md = dict(last_metrics) if last_metrics else {}
|
||||||
md["model"] = sess.model
|
def _model_value(value) -> str:
|
||||||
|
if value is None:
|
||||||
|
return ""
|
||||||
|
if not isinstance(value, str):
|
||||||
|
value = str(value)
|
||||||
|
return value.strip()
|
||||||
|
|
||||||
|
requested_model = _model_value(md.get("requested_model") or md.get("selected_model") or getattr(sess, "model", ""))
|
||||||
|
actual_model = _model_value(md.get("model") or md.get("actual_model") or requested_model)
|
||||||
|
if requested_model:
|
||||||
|
md["requested_model"] = requested_model
|
||||||
|
if actual_model:
|
||||||
|
md["model"] = actual_model
|
||||||
if character_name:
|
if character_name:
|
||||||
md["character_name"] = character_name
|
md["character_name"] = character_name
|
||||||
if web_sources:
|
if web_sources:
|
||||||
|
|||||||
+40
-5
@@ -893,6 +893,8 @@ def setup_chat_routes(
|
|||||||
elif chat_mode == "chat":
|
elif chat_mode == "chat":
|
||||||
_chat_start = time.time()
|
_chat_start = time.time()
|
||||||
_answered_by = None # set if the selected model failed and a fallback answered
|
_answered_by = None # set if the selected model failed and a fallback answered
|
||||||
|
_requested_model = sess.model
|
||||||
|
_actual_model = None
|
||||||
# ── Chat mode: call stream_llm directly, NO tools, NO document access ──
|
# ── Chat mode: call stream_llm directly, NO tools, NO document access ──
|
||||||
try:
|
try:
|
||||||
_chat_candidates = [(sess.endpoint_url, sess.model, sess.headers)] + _fallback_candidates
|
_chat_candidates = [(sess.endpoint_url, sess.model, sess.headers)] + _fallback_candidates
|
||||||
@@ -925,10 +927,18 @@ def setup_chat_routes(
|
|||||||
# Selected model failed; a fallback answered.
|
# Selected model failed; a fallback answered.
|
||||||
# Forward the notice and remember the real model.
|
# Forward the notice and remember the real model.
|
||||||
_answered_by = data.get("answered_by") or _answered_by
|
_answered_by = data.get("answered_by") or _answered_by
|
||||||
|
_actual_model = _actual_model or _answered_by
|
||||||
|
data["selected_model"] = data.get("selected_model") or _requested_model
|
||||||
yield chunk
|
yield chunk
|
||||||
|
elif data.get("type") == "model_actual":
|
||||||
|
_actual_model = data.get("model") or _actual_model
|
||||||
|
data["requested_model"] = _requested_model
|
||||||
|
yield f'data: {json.dumps(data)}\n\n'
|
||||||
elif data.get("type") == "usage":
|
elif data.get("type") == "usage":
|
||||||
last_metrics = data.get("data", {})
|
last_metrics = data.get("data", {})
|
||||||
last_metrics["model"] = _answered_by or sess.model
|
_reported_model = last_metrics.get("model")
|
||||||
|
last_metrics["requested_model"] = _requested_model
|
||||||
|
last_metrics["model"] = _reported_model or _actual_model or _answered_by or _requested_model
|
||||||
if ctx.context_length and last_metrics.get("input_tokens"):
|
if ctx.context_length and last_metrics.get("input_tokens"):
|
||||||
pct = min(round((last_metrics["input_tokens"] / ctx.context_length) * 100, 1), 100.0)
|
pct = min(round((last_metrics["input_tokens"] / ctx.context_length) * 100, 1), 100.0)
|
||||||
last_metrics["context_percent"] = pct
|
last_metrics["context_percent"] = pct
|
||||||
@@ -965,7 +975,8 @@ def setup_chat_routes(
|
|||||||
"tokens_per_second": _tps,
|
"tokens_per_second": _tps,
|
||||||
"context_percent": _ctx_pct,
|
"context_percent": _ctx_pct,
|
||||||
"context_length": ctx.context_length,
|
"context_length": ctx.context_length,
|
||||||
"model": sess.model,
|
"model": _actual_model or _answered_by or _requested_model,
|
||||||
|
"requested_model": _requested_model,
|
||||||
"usage_source": "estimated",
|
"usage_source": "estimated",
|
||||||
}
|
}
|
||||||
yield f'data: {json.dumps({"type": "metrics", "data": last_metrics})}\n\n'
|
yield f'data: {json.dumps({"type": "metrics", "data": last_metrics})}\n\n'
|
||||||
@@ -994,7 +1005,14 @@ def setup_chat_routes(
|
|||||||
except (asyncio.CancelledError, GeneratorExit):
|
except (asyncio.CancelledError, GeneratorExit):
|
||||||
if full_response:
|
if full_response:
|
||||||
logger.info("Client disconnected mid-stream (chat mode) for session %s, saving partial (%d chars)", session, len(full_response))
|
logger.info("Client disconnected mid-stream (chat mode) for session %s, saving partial (%d chars)", session, len(full_response))
|
||||||
_stopped_content, _stopped_md = clean_thinking_for_save(full_response, {"stopped": True, "model": sess.model})
|
_stopped_content, _stopped_md = clean_thinking_for_save(
|
||||||
|
full_response,
|
||||||
|
{
|
||||||
|
"stopped": True,
|
||||||
|
"model": _actual_model or _answered_by or _requested_model,
|
||||||
|
"requested_model": _requested_model,
|
||||||
|
},
|
||||||
|
)
|
||||||
sess.add_message(ChatMessage("assistant", _stopped_content, metadata=_stopped_md))
|
sess.add_message(ChatMessage("assistant", _stopped_content, metadata=_stopped_md))
|
||||||
if not incognito:
|
if not incognito:
|
||||||
session_manager.save_sessions()
|
session_manager.save_sessions()
|
||||||
@@ -1006,6 +1024,8 @@ def setup_chat_routes(
|
|||||||
_agent_rounds = 0
|
_agent_rounds = 0
|
||||||
_agent_tool_calls = 0
|
_agent_tool_calls = 0
|
||||||
_answered_by = None # set if the selected model failed and a fallback answered
|
_answered_by = None # set if the selected model failed and a fallback answered
|
||||||
|
_requested_model = sess.model
|
||||||
|
_actual_model = None
|
||||||
try:
|
try:
|
||||||
from src.settings import get_setting
|
from src.settings import get_setting
|
||||||
from src.agent_tools import MAX_AGENT_ROUNDS as _DEFAULT_ROUNDS
|
from src.agent_tools import MAX_AGENT_ROUNDS as _DEFAULT_ROUNDS
|
||||||
@@ -1071,10 +1091,18 @@ def setup_chat_routes(
|
|||||||
# model so metrics reflect it, not the masked
|
# model so metrics reflect it, not the masked
|
||||||
# selected model.
|
# selected model.
|
||||||
_answered_by = data.get("answered_by") or _answered_by
|
_answered_by = data.get("answered_by") or _answered_by
|
||||||
|
_actual_model = _actual_model or _answered_by
|
||||||
|
data["selected_model"] = data.get("selected_model") or _requested_model
|
||||||
yield chunk
|
yield chunk
|
||||||
|
elif data.get("type") == "model_actual":
|
||||||
|
_actual_model = data.get("model") or _actual_model
|
||||||
|
data["requested_model"] = _requested_model
|
||||||
|
yield f'data: {json.dumps(data)}\n\n'
|
||||||
elif data.get("type") == "metrics":
|
elif data.get("type") == "metrics":
|
||||||
last_metrics = data.get("data", {})
|
last_metrics = data.get("data", {})
|
||||||
last_metrics["model"] = _answered_by or sess.model
|
_reported_model = last_metrics.get("model")
|
||||||
|
last_metrics["requested_model"] = last_metrics.get("requested_model") or _requested_model
|
||||||
|
last_metrics["model"] = _reported_model or _actual_model or _answered_by or _requested_model
|
||||||
yield f'data: {json.dumps({"type": "metrics", "data": last_metrics})}\n\n'
|
yield f'data: {json.dumps({"type": "metrics", "data": last_metrics})}\n\n'
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
yield chunk
|
yield chunk
|
||||||
@@ -1115,7 +1143,14 @@ def setup_chat_routes(
|
|||||||
try:
|
try:
|
||||||
if full_response:
|
if full_response:
|
||||||
logger.info("Client disconnected mid-stream for session %s, saving partial response (%d chars)", session, len(full_response))
|
logger.info("Client disconnected mid-stream for session %s, saving partial response (%d chars)", session, len(full_response))
|
||||||
_stopped_content2, _stopped_md2 = clean_thinking_for_save(full_response, {"stopped": True, "model": sess.model})
|
_stopped_content2, _stopped_md2 = clean_thinking_for_save(
|
||||||
|
full_response,
|
||||||
|
{
|
||||||
|
"stopped": True,
|
||||||
|
"model": _actual_model or _answered_by or _requested_model,
|
||||||
|
"requested_model": _requested_model,
|
||||||
|
},
|
||||||
|
)
|
||||||
sess.add_message(ChatMessage("assistant", _stopped_content2, metadata=_stopped_md2))
|
sess.add_message(ChatMessage("assistant", _stopped_content2, metadata=_stopped_md2))
|
||||||
if not incognito:
|
if not incognito:
|
||||||
session_manager.save_sessions()
|
session_manager.save_sessions()
|
||||||
|
|||||||
+10
-1
@@ -1741,6 +1741,8 @@ async def stream_agent_loop(
|
|||||||
has_real_usage = False
|
has_real_usage = False
|
||||||
backend_gen_tps = 0 # backend-reported true gen speed (llama.cpp timings)
|
backend_gen_tps = 0 # backend-reported true gen speed (llama.cpp timings)
|
||||||
backend_prefill_tps = 0 # backend-reported prefill speed
|
backend_prefill_tps = 0 # backend-reported prefill speed
|
||||||
|
requested_model = model
|
||||||
|
actual_model = model
|
||||||
total_tool_calls = 0 # for budget enforcement
|
total_tool_calls = 0 # for budget enforcement
|
||||||
|
|
||||||
# Loop-breaker state. Small models (e.g. deepseek-v4-flash) can get
|
# Loop-breaker state. Small models (e.g. deepseek-v4-flash) can get
|
||||||
@@ -1913,6 +1915,7 @@ async def stream_agent_loop(
|
|||||||
logger.info(f"Agent round {round_num}: received {len(native_tool_calls)} native tool call(s)")
|
logger.info(f"Agent round {round_num}: received {len(native_tool_calls)} native tool call(s)")
|
||||||
elif data.get("type") == "usage":
|
elif data.get("type") == "usage":
|
||||||
u = data.get("data", {})
|
u = data.get("data", {})
|
||||||
|
actual_model = u.get("model") or actual_model
|
||||||
round_input = u.get("input_tokens", 0)
|
round_input = u.get("input_tokens", 0)
|
||||||
real_input_tokens += round_input
|
real_input_tokens += round_input
|
||||||
real_output_tokens += u.get("output_tokens", 0)
|
real_output_tokens += u.get("output_tokens", 0)
|
||||||
@@ -1929,9 +1932,14 @@ async def stream_agent_loop(
|
|||||||
elif data.get("type") == "fallback":
|
elif data.get("type") == "fallback":
|
||||||
# The selected model failed and another answered; surface
|
# The selected model failed and another answered; surface
|
||||||
# the notice so a misconfigured provider isn't masked.
|
# the notice so a misconfigured provider isn't masked.
|
||||||
|
actual_model = data.get("answered_by") or actual_model
|
||||||
logger.warning(f"[agent] round {round_num} fell back: "
|
logger.warning(f"[agent] round {round_num} fell back: "
|
||||||
f"{data.get('selected_model')} -> {data.get('answered_by')}")
|
f"{data.get('selected_model')} -> {data.get('answered_by')}")
|
||||||
yield chunk
|
yield chunk
|
||||||
|
elif data.get("type") == "model_actual":
|
||||||
|
actual_model = data.get("model") or actual_model
|
||||||
|
data["requested_model"] = requested_model
|
||||||
|
yield f"data: {json.dumps(data)}\n\n"
|
||||||
elif "delta" in data:
|
elif "delta" in data:
|
||||||
if not first_token_received:
|
if not first_token_received:
|
||||||
time_to_first_token = time.time() - total_start
|
time_to_first_token = time.time() - total_start
|
||||||
@@ -2562,12 +2570,13 @@ async def stream_agent_loop(
|
|||||||
metrics = _compute_final_metrics(
|
metrics = _compute_final_metrics(
|
||||||
messages, full_response, total_duration, time_to_first_token,
|
messages, full_response, total_duration, time_to_first_token,
|
||||||
context_length, real_input_tokens, real_output_tokens,
|
context_length, real_input_tokens, real_output_tokens,
|
||||||
has_real_usage, tool_events, round_texts, model=model,
|
has_real_usage, tool_events, round_texts, model=actual_model,
|
||||||
last_round_input_tokens=last_round_input_tokens,
|
last_round_input_tokens=last_round_input_tokens,
|
||||||
prep_timings=prep_timings,
|
prep_timings=prep_timings,
|
||||||
backend_gen_tps=backend_gen_tps,
|
backend_gen_tps=backend_gen_tps,
|
||||||
backend_prefill_tps=backend_prefill_tps,
|
backend_prefill_tps=backend_prefill_tps,
|
||||||
)
|
)
|
||||||
|
metrics["requested_model"] = requested_model
|
||||||
yield f"data: {json.dumps({'type': 'metrics', 'data': metrics})}\n\n"
|
yield f"data: {json.dumps({'type': 'metrics', 'data': metrics})}\n\n"
|
||||||
|
|
||||||
# Teacher-escalation: inline takeover visible in the chat stream.
|
# Teacher-escalation: inline takeover visible in the chat stream.
|
||||||
|
|||||||
@@ -167,6 +167,9 @@ def _stream_delta_event(text: str, *, thinking: bool = False) -> str:
|
|||||||
def _model_activity_key(url: str, model: str) -> str:
|
def _model_activity_key(url: str, model: str) -> str:
|
||||||
return f"{(url or '').strip()}|{(model or '').strip()}"
|
return f"{(url or '').strip()}|{(model or '').strip()}"
|
||||||
|
|
||||||
|
def _same_model_identity(left: str, right: str) -> bool:
|
||||||
|
return (left or "").strip().lower() == (right or "").strip().lower()
|
||||||
|
|
||||||
def note_model_activity(url: str, model: str):
|
def note_model_activity(url: str, model: str):
|
||||||
"""Record that a real upstream request used this endpoint/model."""
|
"""Record that a real upstream request used this endpoint/model."""
|
||||||
if not url or not model:
|
if not url or not model:
|
||||||
@@ -1493,6 +1496,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
|
|||||||
_think_open_stripped = False # opening <think> tag already removed
|
_think_open_stripped = False # opening <think> tag already removed
|
||||||
_harmony_router = _HarmonyStreamRouter()
|
_harmony_router = _HarmonyStreamRouter()
|
||||||
_harmony_active = False # sticky: gpt-oss harmony <|channel|> stream detected
|
_harmony_active = False # sticky: gpt-oss harmony <|channel|> stream detected
|
||||||
|
_actual_model = ""
|
||||||
|
_actual_model_announced = False
|
||||||
|
|
||||||
def _emit_tool_calls():
|
def _emit_tool_calls():
|
||||||
"""Build the tool_calls event string if any were accumulated."""
|
"""Build the tool_calls event string if any were accumulated."""
|
||||||
@@ -1549,6 +1554,15 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
|
|||||||
if data.strip():
|
if data.strip():
|
||||||
if data.startswith("{"):
|
if data.startswith("{"):
|
||||||
j = json.loads(data)
|
j = json.loads(data)
|
||||||
|
chunk_model = j.get("model")
|
||||||
|
if isinstance(chunk_model, str) and chunk_model.strip():
|
||||||
|
_actual_model = chunk_model.strip()
|
||||||
|
if (
|
||||||
|
not _actual_model_announced
|
||||||
|
and not _same_model_identity(_actual_model, model)
|
||||||
|
):
|
||||||
|
_actual_model_announced = True
|
||||||
|
yield f'data: {json.dumps({"type": "model_actual", "requested_model": model, "model": _actual_model})}\n\n'
|
||||||
# Usage chunk (from stream_options)
|
# Usage chunk (from stream_options)
|
||||||
_choices = j.get("choices") or []
|
_choices = j.get("choices") or []
|
||||||
_delta0 = _choices[0].get("delta") if (_choices and _choices[0] is not None) else None
|
_delta0 = _choices[0].get("delta") if (_choices and _choices[0] is not None) else None
|
||||||
@@ -1579,6 +1593,10 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
|
|||||||
_usage_data["gen_tps"] = round(_tm["predicted_per_second"], 2)
|
_usage_data["gen_tps"] = round(_tm["predicted_per_second"], 2)
|
||||||
if _tm.get("prompt_per_second"):
|
if _tm.get("prompt_per_second"):
|
||||||
_usage_data["prefill_tps"] = round(_tm["prompt_per_second"], 2)
|
_usage_data["prefill_tps"] = round(_tm["prompt_per_second"], 2)
|
||||||
|
if _actual_model:
|
||||||
|
_usage_data["model"] = _actual_model
|
||||||
|
if not _same_model_identity(_actual_model, model):
|
||||||
|
_usage_data["requested_model"] = model
|
||||||
yield f'data: {json.dumps({"type": "usage", "data": _usage_data})}\n\n'
|
yield f'data: {json.dumps({"type": "usage", "data": _usage_data})}\n\n'
|
||||||
elif "choices" in j:
|
elif "choices" in j:
|
||||||
_c0 = (j["choices"] or [None])[0]
|
_c0 = (j["choices"] or [None])[0]
|
||||||
@@ -1791,6 +1809,13 @@ async def stream_llm_with_fallback(candidates, messages, **kwargs):
|
|||||||
continue
|
continue
|
||||||
# Any data chunk other than the terminal [DONE] means real output.
|
# Any data chunk other than the terminal [DONE] means real output.
|
||||||
if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
|
if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
|
||||||
|
try:
|
||||||
|
event_data = json.loads(chunk[6:])
|
||||||
|
except Exception:
|
||||||
|
event_data = {}
|
||||||
|
if event_data.get("type") == "model_actual":
|
||||||
|
yield chunk
|
||||||
|
continue
|
||||||
# First real output from a NON-primary candidate: tell the client
|
# First real output from a NON-primary candidate: tell the client
|
||||||
# the selected model failed and another answered. Without this the
|
# the selected model failed and another answered. Without this the
|
||||||
# fallback is invisible — a misconfigured provider looks like it
|
# fallback is invisible — a misconfigured provider looks like it
|
||||||
|
|||||||
+62
-25
@@ -53,7 +53,27 @@ import { createStreamRenderer } from './streamingRenderer.js';
|
|||||||
|
|
||||||
// shortModel and modelColor are now in chatRenderer.js
|
// shortModel and modelColor are now in chatRenderer.js
|
||||||
var _shortModel = chatRenderer.shortModel;
|
var _shortModel = chatRenderer.shortModel;
|
||||||
|
var _modelRouteLabel = chatRenderer.modelRouteLabel;
|
||||||
|
var _sameModelName = chatRenderer.sameModelName;
|
||||||
var _applyModelColor = chatRenderer.applyModelColor;
|
var _applyModelColor = chatRenderer.applyModelColor;
|
||||||
|
/**
 * Render the model label of a message's role element.
 *
 * The label text comes from `_modelRouteLabel(requested, actual)`, optionally
 * followed by " (<suffix>)"; a character name, when given, replaces the whole
 * label. When the requested and actual models differ, the element's `title`
 * is set to "<requested> -> <actual>[: <reason>]"; otherwise the title is
 * removed unless a reason was supplied.
 *
 * @param {Element|null} roleEl          Role element to update; no-op when falsy.
 * @param {string} requestedModel        Model the user asked for (may be empty).
 * @param {string} actualModel           Model that answered (may be empty).
 * @param {{suffix?: string, characterName?: string, reason?: string}} [opts]
 */
function _setRoleModelLabel(roleEl, requestedModel, actualModel, opts) {
    if (!roleEl) return;
    const options = opts || {};

    // Grab the timestamp span first: assigning textContent below drops every
    // child node, and the span is re-attached at the end.
    const timestampEl = roleEl.querySelector('.role-timestamp');

    // Each side falls back to the other so a single known model fills both.
    const requestedName = requestedModel || actualModel || '';
    const actualName = actualModel || requestedModel || '';

    let text = _modelRouteLabel(requestedName, actualName);
    if (options.suffix) {
        text += ' (' + options.suffix + ')';
    }
    if (options.characterName) {
        text = options.characterName;
    }
    roleEl.textContent = text + ' ';
    _applyModelColor(roleEl, actualName || requestedName);

    const rerouted = Boolean(requestedName && actualName)
        && !_sameModelName(requestedName, actualName);
    if (rerouted) {
        const why = options.reason ? ': ' + options.reason : '';
        roleEl.title = requestedName + ' -> ' + actualName + why;
    } else if (!options.reason) {
        roleEl.removeAttribute('title');
    }

    if (timestampEl) {
        roleEl.appendChild(timestampEl);
    }
}
|
||||||
// Per-session research tracking (supports concurrent research across sessions)
|
// Per-session research tracking (supports concurrent research across sessions)
|
||||||
const _researchingStreamIds = new Set();
|
const _researchingStreamIds = new Set();
|
||||||
let _researchTimerEl = null, _researchTimerInterval = null;
|
let _researchTimerEl = null, _researchTimerInterval = null;
|
||||||
@@ -556,7 +576,6 @@ import { createStreamRenderer } from './streamingRenderer.js';
|
|||||||
let _thinkOpen = false;
|
let _thinkOpen = false;
|
||||||
let holder = null;
|
let holder = null;
|
||||||
let finalMeta = null;
|
let finalMeta = null;
|
||||||
let finalModelName = null;
|
|
||||||
let spinner = null;
|
let spinner = null;
|
||||||
let timedOut = false;
|
let timedOut = false;
|
||||||
let processingProbeTimer = null;
|
let processingProbeTimer = null;
|
||||||
@@ -892,11 +911,13 @@ import { createStreamRenderer } from './streamingRenderer.js';
|
|||||||
loadingText = 'Processing request...';
|
loadingText = 'Processing request...';
|
||||||
}
|
}
|
||||||
|
|
||||||
var roleLabel = _shortModel(modelName);
|
var roleLabel = _modelRouteLabel(modelName, modelName);
|
||||||
var _charNameInit = presetsModule.getCharacterName ? presetsModule.getCharacterName() : '';
|
var _charNameInit = presetsModule.getCharacterName ? presetsModule.getCharacterName() : '';
|
||||||
if (_charNameInit) roleLabel = _charNameInit;
|
if (_charNameInit) roleLabel = _charNameInit;
|
||||||
const roleTs = new Date().toLocaleTimeString([], {hour: '2-digit', minute:'2-digit'});
|
const roleTs = new Date().toLocaleTimeString([], {hour: '2-digit', minute:'2-digit'});
|
||||||
holder.innerHTML = `<div class="role">${uiModule.esc(roleLabel)} <span class="role-timestamp">${roleTs}</span></div><div class="body"></div>`;
|
holder.innerHTML = `<div class="role">${uiModule.esc(roleLabel)} <span class="role-timestamp">${roleTs}</span></div><div class="body"></div>`;
|
||||||
|
holder._requestedModel = modelName;
|
||||||
|
holder._actualModel = modelName;
|
||||||
_applyModelColor(holder.querySelector('.role'), modelName);
|
_applyModelColor(holder.querySelector('.role'), modelName);
|
||||||
holder.style.position = 'relative';
|
holder.style.position = 'relative';
|
||||||
|
|
||||||
@@ -1807,21 +1828,16 @@ import { createStreamRenderer } from './streamingRenderer.js';
|
|||||||
if (!_isBg && holder) {
|
if (!_isBg && holder) {
|
||||||
const roleEl = holder.querySelector('.role');
|
const roleEl = holder.querySelector('.role');
|
||||||
if (roleEl) {
|
if (roleEl) {
|
||||||
const tsSpan = roleEl.querySelector('.role-timestamp');
|
holder._requestedModel = json.requested_model || json.model || holder._requestedModel;
|
||||||
var _modelLabel = _shortModel(json.model);
|
holder._actualModel = json.model || holder._actualModel || holder._requestedModel;
|
||||||
if (json.suffix) {
|
if (json.suffix) holder._roleSuffix = json.suffix;
|
||||||
_modelLabel += ' (' + json.suffix + ')';
|
|
||||||
holder._roleSuffix = json.suffix;
|
|
||||||
}
|
|
||||||
// Prepend character name if sent by server or set locally
|
// Prepend character name if sent by server or set locally
|
||||||
var _charName = json.character_name || (presetsModule.getCharacterName ? presetsModule.getCharacterName() : '');
|
var _charName = json.character_name || (presetsModule.getCharacterName ? presetsModule.getCharacterName() : '');
|
||||||
if (_charName) {
|
if (_charName) holder._characterName = _charName;
|
||||||
_modelLabel = _charName;
|
_setRoleModelLabel(roleEl, holder._requestedModel, holder._actualModel, {
|
||||||
holder._characterName = _charName;
|
suffix: holder._roleSuffix,
|
||||||
}
|
characterName: holder._characterName,
|
||||||
roleEl.textContent = _modelLabel + ' ';
|
});
|
||||||
_applyModelColor(roleEl, json.model);
|
|
||||||
if (tsSpan) roleEl.appendChild(tsSpan);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (json.type === 'fallback') {
|
} else if (json.type === 'fallback') {
|
||||||
@@ -1841,6 +1857,14 @@ import { createStreamRenderer } from './streamingRenderer.js';
|
|||||||
(json.reason ? ': ' + json.reason : '') + ' — answered by ' + (json.answered_by || '');
|
(json.reason ? ': ' + json.reason : '') + ' — answered by ' + (json.answered_by || '');
|
||||||
_applyModelColor(_rEl, json.answered_by);
|
_applyModelColor(_rEl, json.answered_by);
|
||||||
if (_tsS) _rEl.appendChild(_tsS);
|
if (_tsS) _rEl.appendChild(_tsS);
|
||||||
|
holder._requestedModel = json.selected_model || holder._requestedModel || modelName;
|
||||||
|
const _hasResolvedActual = holder._actualModel && !_sameModelName(holder._actualModel, holder._requestedModel);
|
||||||
|
holder._actualModel = _hasResolvedActual ? holder._actualModel : (json.answered_by || holder._actualModel || holder._requestedModel);
|
||||||
|
_setRoleModelLabel(_rEl, holder._requestedModel, holder._actualModel, {
|
||||||
|
suffix: holder._roleSuffix,
|
||||||
|
characterName: holder._characterName,
|
||||||
|
reason: json.reason,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1882,6 +1906,15 @@ import { createStreamRenderer } from './streamingRenderer.js';
|
|||||||
_chatBox.appendChild(note);
|
_chatBox.appendChild(note);
|
||||||
try { note.scrollIntoView({ block: 'end', behavior: 'smooth' }); } catch (_) { uiModule.scrollHistory && uiModule.scrollHistory(); }
|
try { note.scrollIntoView({ block: 'end', behavior: 'smooth' }); } catch (_) { uiModule.scrollHistory && uiModule.scrollHistory(); }
|
||||||
}
|
}
|
||||||
|
} else if (json.type === 'model_actual') {
|
||||||
|
if (!_isBg && holder) {
|
||||||
|
holder._requestedModel = json.requested_model || holder._requestedModel || modelName;
|
||||||
|
holder._actualModel = json.model || holder._actualModel || holder._requestedModel;
|
||||||
|
_setRoleModelLabel(holder.querySelector('.role'), holder._requestedModel, holder._actualModel, {
|
||||||
|
suffix: holder._roleSuffix,
|
||||||
|
characterName: holder._characterName,
|
||||||
|
});
|
||||||
|
}
|
||||||
} else if (json.type === 'attachments') {
|
} else if (json.type === 'attachments') {
|
||||||
if (_isBg) continue;
|
if (_isBg) continue;
|
||||||
// Update user bubble — replace file chips with image previews
|
// Update user bubble — replace file chips with image previews
|
||||||
@@ -1959,6 +1992,10 @@ import { createStreamRenderer } from './streamingRenderer.js';
|
|||||||
}
|
}
|
||||||
} else if (json.type === 'metrics') {
|
} else if (json.type === 'metrics') {
|
||||||
metrics = json.data;
|
metrics = json.data;
|
||||||
|
if (!_isBg && holder && metrics) {
|
||||||
|
holder._requestedModel = metrics.requested_model || holder._requestedModel || modelName;
|
||||||
|
holder._actualModel = metrics.model || holder._actualModel || holder._requestedModel;
|
||||||
|
}
|
||||||
if (_isBg) {
|
if (_isBg) {
|
||||||
var bgM = _backgroundStreams.get(streamSessionId);
|
var bgM = _backgroundStreams.get(streamSessionId);
|
||||||
if (bgM) bgM.metrics = json.data;
|
if (bgM) bgM.metrics = json.data;
|
||||||
@@ -2441,8 +2478,10 @@ import { createStreamRenderer } from './streamingRenderer.js';
|
|||||||
const newRole = document.createElement('div');
|
const newRole = document.createElement('div');
|
||||||
newRole.className = 'role';
|
newRole.className = 'role';
|
||||||
const metaS = sessionModule.getSessions().find(s => s.id === streamSessionId);
|
const metaS = sessionModule.getSessions().find(s => s.id === streamSessionId);
|
||||||
newRole.textContent = _shortModel(metaS?.model) || '';
|
const _roundRequested = holder?._requestedModel || metaS?.model;
|
||||||
_applyModelColor(newRole, metaS?.model);
|
const _roundActual = holder?._actualModel || _roundRequested;
|
||||||
|
newRole.textContent = _modelRouteLabel(_roundRequested, _roundActual) || '';
|
||||||
|
_applyModelColor(newRole, _roundActual);
|
||||||
newWrap.appendChild(newRole);
|
newWrap.appendChild(newRole);
|
||||||
const newBody = document.createElement('div');
|
const newBody = document.createElement('div');
|
||||||
newBody.className = 'body';
|
newBody.className = 'body';
|
||||||
@@ -2548,18 +2587,16 @@ import { createStreamRenderer } from './streamingRenderer.js';
|
|||||||
const _isBgFinal = (sessionModule.getCurrentSessionId() !== streamSessionId) || _backgroundStreams.has(streamSessionId);
|
const _isBgFinal = (sessionModule.getCurrentSessionId() !== streamSessionId) || _backgroundStreams.has(streamSessionId);
|
||||||
if (!_isBgFinal) {
|
if (!_isBgFinal) {
|
||||||
finalMeta = sessionModule.getSessions().find(s => s.id === sessionModule.getCurrentSessionId());
|
finalMeta = sessionModule.getSessions().find(s => s.id === sessionModule.getCurrentSessionId());
|
||||||
finalModelName = _shortModel(metrics?.model || finalMeta?.model);
|
const _finalActualModel = metrics?.model || holder._actualModel || finalMeta?.model;
|
||||||
// Preserve suffix (e.g. "Research") if set by model_info event
|
const _finalRequestedModel = metrics?.requested_model || holder._requestedModel || finalMeta?.model || _finalActualModel;
|
||||||
if (holder._roleSuffix) finalModelName += ' (' + holder._roleSuffix + ')';
|
|
||||||
// Prepend character name if set
|
// Prepend character name if set
|
||||||
var _charNameFinal = presetsModule.getCharacterName ? presetsModule.getCharacterName() : '';
|
var _charNameFinal = presetsModule.getCharacterName ? presetsModule.getCharacterName() : '';
|
||||||
if (_charNameFinal) finalModelName = _charNameFinal;
|
|
||||||
const roleEl = holder.querySelector('.role');
|
const roleEl = holder.querySelector('.role');
|
||||||
if (roleEl) {
|
if (roleEl) {
|
||||||
const tsSpan = roleEl.querySelector('.role-timestamp');
|
_setRoleModelLabel(roleEl, _finalRequestedModel, _finalActualModel, {
|
||||||
roleEl.textContent = finalModelName + ' ';
|
suffix: holder._roleSuffix,
|
||||||
_applyModelColor(roleEl, metrics?.model || finalMeta?.model);
|
characterName: _charNameFinal || holder._characterName,
|
||||||
if (tsSpan) roleEl.appendChild(tsSpan);
|
});
|
||||||
}
|
}
|
||||||
holder.dataset.raw = accumulated;
|
holder.dataset.raw = accumulated;
|
||||||
|
|
||||||
|
|||||||
@@ -537,6 +537,39 @@ export function shortModel(name) {
|
|||||||
return short;
|
return short;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Normalize a model name to a trimmed string.
 * `null` and `undefined` become the empty string; any other value is passed
 * through `String()` before trimming.
 */
function modelValue(name) {
    return name == null ? '' : String(name).trim();
}
|
||||||
|
|
||||||
|
/**
 * Report whether two model names refer to the same model.
 *
 * Both names are normalized with `modelValue()`. If either is empty the
 * result is `false`. Otherwise the names match when they are equal ignoring
 * case, or when their `shortModel()` forms are equal ignoring case.
 */
export function sameModelName(left, right) {
    const first = modelValue(left);
    const second = modelValue(right);
    if (!first || !second) {
        return false;
    }
    if (first.toLowerCase() === second.toLowerCase()) {
        return true;
    }
    return shortModel(first).toLowerCase() === shortModel(second).toLowerCase();
}
|
||||||
|
|
||||||
|
/**
 * Build the display label for a reply's model routing.
 *
 * Returns "<requested> -> <actual>" (both passed through `shortModel()`) when
 * a requested model is known and differs from the actual one per
 * `sameModelName()`; otherwise returns the `shortModel()` form of the single
 * known model. A missing actual model defaults to the requested one.
 */
export function modelRouteLabel(requestedModel, actualModel) {
    const requestedName = modelValue(requestedModel);
    const actualName = modelValue(actualModel) || requestedName;
    const rerouted = Boolean(requestedName) && !sameModelName(requestedName, actualName);
    if (rerouted) {
        return shortModel(requestedName) + ' -> ' + shortModel(actualName);
    }
    return shortModel(actualName || requestedName);
}
|
||||||
|
|
||||||
|
/**
 * Resolve the requested/actual model pair for a stored reply.
 *
 * Metadata wins over `modelName`: the actual model is read from
 * `metadata.model` (then `metadata.actual_model`) and the requested model from
 * `metadata.requested_model` (then `metadata.selected_model`). When only one
 * side is present in the metadata it is used for both. When the metadata
 * carries neither, `modelName` is used for both.
 *
 * @param {*} modelName            Fallback model name for the message.
 * @param {Object|null} [metadata] Message metadata; may be null/undefined.
 * @returns {{requestedModel: string, actualModel: string}} Trimmed names
 *          (empty strings when nothing is known).
 */
export function replyModelPair(modelName, metadata) {
    const meta = metadata || {};
    // Normalize each candidate before falling back to the next one, so a
    // whitespace-only value cannot mask a usable alternative key.
    const actualFromMeta = modelValue(meta.model) || modelValue(meta.actual_model);
    const requestedFromMeta = modelValue(meta.requested_model) || modelValue(meta.selected_model);

    if (actualFromMeta || requestedFromMeta) {
        // At least one side is non-empty here, so no further fallback to
        // modelName is reachable; each side simply mirrors the other if missing.
        const actual = actualFromMeta || requestedFromMeta;
        const requested = requestedFromMeta || actual;
        return { requestedModel: requested, actualModel: actual };
    }

    const fallback = modelValue(modelName);
    return { requestedModel: fallback, actualModel: fallback };
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate a consistent HSL color for a model name.
|
* Generate a consistent HSL color for a model name.
|
||||||
* Returns an hsl() string. The hue is derived from a string hash,
|
* Returns an hsl() string. The hue is derived from a string hash,
|
||||||
@@ -577,7 +610,11 @@ export function applyModelColor(roleEl, modelName) {
|
|||||||
}
|
}
|
||||||
// Replace generic dot with provider logo if available
|
// Replace generic dot with provider logo if available
|
||||||
const logo = providerLogo(modelName);
|
const logo = providerLogo(modelName);
|
||||||
if (logo && !roleEl.querySelector('.role-provider-logo')) {
|
const existingLogo = roleEl.querySelector('.role-provider-logo');
|
||||||
|
if (!logo) {
|
||||||
|
if (existingLogo) existingLogo.remove();
|
||||||
|
roleEl.classList.remove('has-logo');
|
||||||
|
} else if (!existingLogo) {
|
||||||
const span = document.createElement('span');
|
const span = document.createElement('span');
|
||||||
span.className = 'role-provider-logo';
|
span.className = 'role-provider-logo';
|
||||||
span.innerHTML = logo;
|
span.innerHTML = logo;
|
||||||
@@ -1933,8 +1970,12 @@ export function addMessage(role, content, modelName, metadata) {
|
|||||||
wrap.className = 'msg msg-ai' + (r > 0 ? ' msg-continuation' : '');
|
wrap.className = 'msg msg-ai' + (r > 0 ? ' msg-continuation' : '');
|
||||||
const roleEl = document.createElement('div');
|
const roleEl = document.createElement('div');
|
||||||
roleEl.className = 'role';
|
roleEl.className = 'role';
|
||||||
const contModel = modelName || metadata?.model;
|
const pair = replyModelPair(modelName, metadata);
|
||||||
roleEl.textContent = shortModel(contModel);
|
const contModel = pair.actualModel || pair.requestedModel;
|
||||||
|
roleEl.textContent = modelRouteLabel(pair.requestedModel, contModel);
|
||||||
|
if (pair.requestedModel && contModel && !sameModelName(pair.requestedModel, contModel)) {
|
||||||
|
roleEl.title = pair.requestedModel + ' -> ' + contModel;
|
||||||
|
}
|
||||||
applyModelColor(roleEl, contModel);
|
applyModelColor(roleEl, contModel);
|
||||||
if (r === 0) roleEl.appendChild(roleTimestamp(metadata?.timestamp));
|
if (r === 0) roleEl.appendChild(roleTimestamp(metadata?.timestamp));
|
||||||
wrap.appendChild(roleEl);
|
wrap.appendChild(roleEl);
|
||||||
@@ -2057,8 +2098,9 @@ export function addMessage(role, content, modelName, metadata) {
|
|||||||
r.className = 'role';
|
r.className = 'role';
|
||||||
const isSlash = metadata?.source === 'slash';
|
const isSlash = metadata?.source === 'slash';
|
||||||
const isCompacted = metadata?.compacted;
|
const isCompacted = metadata?.compacted;
|
||||||
const resolvedModel = modelName || metadata?.model;
|
const replyModels = replyModelPair(modelName, metadata);
|
||||||
var _roleText = role === 'user' ? 'You' : (isSlash || isCompacted) ? 'Odysseus' : shortModel(resolvedModel);
|
const resolvedModel = replyModels.actualModel || replyModels.requestedModel;
|
||||||
|
var _roleText = role === 'user' ? 'You' : (isSlash || isCompacted) ? 'Odysseus' : modelRouteLabel(replyModels.requestedModel, resolvedModel);
|
||||||
if (role === 'assistant' && (metadata?.research || metadata?.research_clarification)) {
|
if (role === 'assistant' && (metadata?.research || metadata?.research_clarification)) {
|
||||||
_roleText += ' (Research)';
|
_roleText += ' (Research)';
|
||||||
}
|
}
|
||||||
@@ -2069,6 +2111,9 @@ export function addMessage(role, content, modelName, metadata) {
|
|||||||
}
|
}
|
||||||
r.textContent = _roleText;
|
r.textContent = _roleText;
|
||||||
if (role !== 'user') {
|
if (role !== 'user') {
|
||||||
|
if (!isSlash && !isCompacted && replyModels.requestedModel && resolvedModel && !sameModelName(replyModels.requestedModel, resolvedModel)) {
|
||||||
|
r.title = replyModels.requestedModel + ' -> ' + resolvedModel;
|
||||||
|
}
|
||||||
if (!isSlash && !isCompacted) applyModelColor(r, resolvedModel);
|
if (!isSlash && !isCompacted) applyModelColor(r, resolvedModel);
|
||||||
r.appendChild(roleTimestamp(metadata?.timestamp));
|
r.appendChild(roleTimestamp(metadata?.timestamp));
|
||||||
}
|
}
|
||||||
@@ -2335,6 +2380,9 @@ export function addMessage(role, content, modelName, metadata) {
|
|||||||
|
|
||||||
const chatRenderer = {
|
const chatRenderer = {
|
||||||
shortModel,
|
shortModel,
|
||||||
|
sameModelName,
|
||||||
|
modelRouteLabel,
|
||||||
|
replyModelPair,
|
||||||
modelColor,
|
modelColor,
|
||||||
applyModelColor,
|
applyModelColor,
|
||||||
getModelCost,
|
getModelCost,
|
||||||
|
|||||||
@@ -1,7 +1,12 @@
|
|||||||
import pytest
|
import pytest
|
||||||
from fastapi import HTTPException
|
from fastapi import HTTPException
|
||||||
|
|
||||||
from routes.chat_helpers import _enforce_chat_privileges, clean_thinking_for_save, needs_auto_name
|
from routes.chat_helpers import (
|
||||||
|
_enforce_chat_privileges,
|
||||||
|
clean_thinking_for_save,
|
||||||
|
needs_auto_name,
|
||||||
|
save_assistant_response,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class _AuthManager:
|
class _AuthManager:
|
||||||
@@ -64,6 +69,15 @@ def test_allowed_models_nonempty_list_still_restricts_without_new_flag(monkeypat
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeSession:
|
||||||
|
def __init__(self, model="selected-model"):
|
||||||
|
self.model = model
|
||||||
|
self.history = []
|
||||||
|
|
||||||
|
def add_message(self, message):
|
||||||
|
self.history.append(message)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("name,expected", [
|
@pytest.mark.parametrize("name,expected", [
|
||||||
# 24h format (the bug this PR fixes)
|
# 24h format (the bug this PR fixes)
|
||||||
("deepseek-v4-flash 14:05:33", True),
|
("deepseek-v4-flash 14:05:33", True),
|
||||||
@@ -130,3 +144,19 @@ def test_clean_thinking_for_save_extracts_thought_tag():
|
|||||||
|
|
||||||
assert content == "Final answer."
|
assert content == "Final answer."
|
||||||
assert metadata["thinking"] == "internal reasoning"
|
assert metadata["thinking"] == "internal reasoning"
|
||||||
|
|
||||||
|
|
||||||
|
def test_save_assistant_response_preserves_actual_and_requested_model():
    """Saved metadata keeps the session's model as requested and the metrics' model as actual."""
    sess = _FakeSession("selected-model")

    # incognito=True with session_manager=None: no real session manager is supplied.
    save_assistant_response(
        sess,
        session_manager=None,
        session_id="s1",
        full_response="hello",
        last_metrics={"model": "actual-model", "input_tokens": 1, "output_tokens": 2},
        incognito=True,
    )

    saved_metadata = sess.history[-1].metadata
    assert saved_metadata["requested_model"] == "selected-model"
    assert saved_metadata["model"] == "actual-model"
|
||||||
|
|||||||
@@ -82,6 +82,32 @@ def _usage_event(monkeypatch, lines):
|
|||||||
return asyncio.run(run())
|
return asyncio.run(run())
|
||||||
|
|
||||||
|
|
||||||
|
def _stream_events(monkeypatch, lines):
    """Drive stream_llm and return all JSON data events.

    The HTTP client is replaced with a ``_FakeClient`` built from *lines*, and
    the host-dead / activity hooks on ``llm_core`` are stubbed out. Every
    ``data: `` line in the yielded chunks, other than the ``[DONE]`` marker,
    is JSON-decoded; lines that are not valid JSON are skipped.
    """
    monkeypatch.setattr(llm_core, "_get_http_client", lambda: _FakeClient(lines))
    monkeypatch.setattr(llm_core, "_is_host_dead", lambda u: False)
    monkeypatch.setattr(llm_core, "note_model_activity", lambda *a, **k: None)
    monkeypatch.setattr(llm_core, "_clear_host_dead", lambda *a, **k: None)

    async def run():
        events = []
        async for chunk in llm_core.stream_llm(
            "http://127.0.0.1:8081/v1/chat/completions",
            "openrouter/auto",
            [{"role": "user", "content": "hi"}],
        ):
            for ln in chunk.split("\n"):
                ln = ln.strip()
                if not ln.startswith("data: "):
                    continue
                payload = ln[6:]  # text after the "data: " prefix
                if payload == "[DONE]":
                    continue
                try:
                    event = json.loads(payload)
                except ValueError:
                    # Non-JSON data lines are deliberately ignored.
                    continue
                events.append(event)
        return events

    return asyncio.run(run())
|
||||||
|
|
||||||
|
|
||||||
# A real llama.cpp final chunk carries `usage` (delta empty / choices []) with a
|
# A real llama.cpp final chunk carries `usage` (delta empty / choices []) with a
|
||||||
# sibling `timings` block. The decode speed here (78.91) is far above the
|
# sibling `timings` block. The decode speed here (78.91) is far above the
|
||||||
# wall-clock figure the old code would have shown.
|
# wall-clock figure the old code would have shown.
|
||||||
@@ -127,6 +153,31 @@ def test_stream_llm_omits_tps_when_backend_has_no_timings(monkeypatch):
|
|||||||
assert "prefill_tps" not in usage
|
assert "prefill_tps" not in usage
|
||||||
|
|
||||||
|
|
||||||
|
def test_stream_llm_surfaces_provider_resolved_model(monkeypatch):
    """The provider-reported model appears in a model_actual event and in usage.

    ``_stream_events`` requests ``openrouter/auto``; the fake upstream chunks
    report a different ``model``, which must be surfaced alongside the
    requested one.
    """
    resolved_model = "meta-llama/llama-3.3-70b-instruct:free"
    content_chunk = {
        "model": resolved_model,
        "choices": [{"index": 0, "delta": {"content": "Hi"}}],
    }
    usage_chunk = {
        "model": resolved_model,
        "choices": [],
        "usage": {"prompt_tokens": 8, "completion_tokens": 5},
    }

    events = _stream_events(monkeypatch, [
        'data: ' + json.dumps(content_chunk),
        'data: ' + json.dumps(usage_chunk),
        "data: [DONE]",
    ])

    # Exactly one model_actual event is expected, even though both chunks carry `model`.
    actual = [e for e in events if e.get("type") == "model_actual"]
    assert actual == [{
        "type": "model_actual",
        "requested_model": "openrouter/auto",
        "model": resolved_model,
    }]

    usage = [e["data"] for e in events if e.get("type") == "usage"][0]
    assert usage["requested_model"] == "openrouter/auto"
    assert usage["model"] == resolved_model
|
||||||
|
|
||||||
|
|
||||||
# --- _compute_final_metrics preference logic --------------------------------
|
# --- _compute_final_metrics preference logic --------------------------------
|
||||||
|
|
||||||
def _metrics(**overrides):
|
def _metrics(**overrides):
|
||||||
|
|||||||
Reference in New Issue
Block a user