Surface silent model fallback instead of masking it (#868)

When the selected model fails before producing output, `stream_llm_with_fallback`
quietly switches to the next candidate, and the reply is shown under the
originally selected model's name, so a misconfigured provider looks like it
works. (Concretely: a Bedrock gateway that rejects every Anthropic/Claude
request with a 400 appears fine because another model silently answers under
the Claude label.)

Emit a `fallback` SSE event ({selected_model, answered_by, reason}) the first
time a non-primary candidate produces output, forward it through the agent loop
and both chat-route paths, stamp the response metrics with the model that
actually answered, and show a notice + relabel the reply in the UI.

Tested: python -m pytest tests/test_llm_core_fallback.py (3 pass);
python -m py_compile src/llm_core.py src/agent_loop.py routes/chat_routes.py;
node --check static/js/chat.js.
This commit is contained in:
James Arslan
2026-06-02 04:37:25 +02:00
committed by GitHub
parent 2d6b777799
commit 6776c7d691
5 changed files with 135 additions and 2 deletions
+16 -2
View File
@@ -769,6 +769,7 @@ def setup_chat_routes(
return
elif chat_mode == "chat":
_chat_start = time.time()
+_answered_by = None # set if the selected model failed and a fallback answered
# ── Chat mode: call stream_llm directly, NO tools, NO document access ──
try:
_chat_candidates = [(sess.endpoint_url, sess.model, sess.headers)] + _fallback_candidates
@@ -797,9 +798,14 @@ def setup_chat_routes(
full_response += data["delta"]
_stream_set(session, partial=full_response)
yield chunk
+elif data.get("type") == "fallback":
+# Selected model failed; a fallback answered.
+# Forward the notice and remember the real model.
+_answered_by = data.get("answered_by") or _answered_by
+yield chunk
elif data.get("type") == "usage":
last_metrics = data.get("data", {})
-last_metrics["model"] = sess.model
+last_metrics["model"] = _answered_by or sess.model
if ctx.context_length and last_metrics.get("input_tokens"):
pct = min(round((last_metrics["input_tokens"] / ctx.context_length) * 100, 1), 100.0)
last_metrics["context_percent"] = pct
@@ -867,6 +873,7 @@ def setup_chat_routes(
# ── Agent mode: full agent loop with tools ──
_agent_rounds = 0
_agent_tool_calls = 0
+_answered_by = None # set if the selected model failed and a fallback answered
try:
from src.settings import get_setting
_tool_budget = int(get_setting("agent_max_tool_calls", 0))
@@ -911,9 +918,16 @@ def setup_chat_routes(
elif data.get("type") == "tool_start":
_agent_tool_calls += 1
yield chunk
+elif data.get("type") == "fallback":
+# Selected model failed; a fallback answered.
+# Forward the notice and remember the real
+# model so metrics reflect it, not the masked
+# selected model.
+_answered_by = data.get("answered_by") or _answered_by
+yield chunk
elif data.get("type") == "metrics":
last_metrics = data.get("data", {})
-last_metrics["model"] = sess.model
+last_metrics["model"] = _answered_by or sess.model
yield f'data: {json.dumps({"type": "metrics", "data": last_metrics})}\n\n'
except json.JSONDecodeError:
yield chunk