Don't attempt the same (url, model) route twice in the fallback chains (#1733)

The fallback helpers (llm_call_with_fallback, llm_call_async_with_fallback, stream_llm_with_fallback) build their candidate list as the primary target followed by the configured fallbacks. Callers prepend the session's live (url, model) to default_model_fallbacks, so if the user also lists their current model among the fallbacks — a common misconfiguration — the chain re-attempts the very route that just failed: a wasted round-trip (and, for the streaming path, a spurious 'fallback' notice for a switch that didn't actually happen).

Add a small _dedupe_candidates() helper that filters malformed entries and drops a later repeat of an already-seen (url, model), preserving order (first wins, keeping its headers). Apply it in all three fallback chains.

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-16 17:55:26 -04:00 · 2026-06-03 12:33:50 +08:00
parent 77614e9feb
commit 126e91e8b9
2 changed files with 66 additions and 3 deletions
@@ -874,6 +874,31 @@ def llm_call(url: str, model: str, messages: List[Dict], temperature: float = LL
        raise HTTPException(502, f"Unexpected schema from {target_url}: {str(data)[:400]}")


+def _dedupe_candidates(candidates):
+    """Drop malformed entries and later repeats of an already-seen
+    ``(url, model)`` route, preserving order (first occurrence wins).
+
+    The chain is the primary target followed by the configured fallbacks, so a
+    fallback repeating the session's current model — a common misconfiguration,
+    since callers prepend the live ``(url, model)`` to ``default_model_fallbacks``
+    — would otherwise re-attempt the route that just failed: a wasted round-trip
+    plus a spurious ``fallback`` notice for a switch that did not happen.
+    A ``None`` or empty ``candidates`` yields ``[]``. Headers are not part of
+    the key; the first tuple (with its headers) is the one kept.
+    """
+    seen = set()
+    out = []
+    for c in candidates or []:
+        if not c or not c[0] or not c[1]:  # malformed: empty entry, url or model
+            continue
+        key = (c[0], c[1])  # route identity; any further fields are ignored
+        if key in seen:
+            continue
+        seen.add(key)
+        out.append(c)
+    return out
+
+
 def llm_call_with_fallback(candidates, messages, **kwargs) -> str:
    """Sync `llm_call` with an ordered fallback chain.

@@ -882,7 +907,7 @@ def llm_call_with_fallback(candidates, messages, **kwargs) -> str:
    the next candidate. The dead-host cooldown inside `llm_call` makes repeat
    attempts at an offline primary effectively free.
    """
-    cands = [c for c in (candidates or []) if c and c[0] and c[1]]
+    cands = _dedupe_candidates(candidates)
    if not cands:
        raise HTTPException(503, "No model endpoint configured")
    last_err = None
@@ -899,7 +924,7 @@ def llm_call_with_fallback(candidates, messages, **kwargs) -> str:

 async def llm_call_async_with_fallback(candidates, messages, **kwargs) -> str:
    """Async variant of `llm_call_with_fallback` — same semantics."""
-    cands = [c for c in (candidates or []) if c and c[0] and c[1]]
+    cands = _dedupe_candidates(candidates)
    if not cands:
        raise HTTPException(503, "No model endpoint configured")
    last_err = None
@@ -1436,7 +1461,7 @@ async def stream_llm_with_fallback(candidates, messages, **kwargs):

    Yields the same SSE chunk protocol as stream_llm.
    """
-    cands = [c for c in (candidates or []) if c and c[0] and c[1]]
+    cands = _dedupe_candidates(candidates)
    if not cands:
        yield f'event: error\ndata: {json.dumps({"error": "No model endpoint configured", "status": 503})}\n\n'
        return