mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-19 03:05:24 -04:00
fix(llm): make connect timeout configurable
Use a configurable LLM_CONNECT_TIMEOUT for call and stream connect budgets instead of the previous hard-coded 3s default.
This commit is contained in:
+26
-4
@@ -7,6 +7,7 @@ import logging
|
||||
import hashlib
|
||||
import threading
|
||||
import re
|
||||
import os
|
||||
from fastapi import HTTPException
|
||||
from typing import Optional, Dict, List, Tuple
|
||||
from src.model_context import get_context_length, DEFAULT_CONTEXT
|
||||
@@ -22,6 +23,24 @@ class LLMConfig:
|
||||
MAX_RETRIES = 3
|
||||
RETRY_DELAY = 0.5
|
||||
STREAM_TIMEOUT = 300
|
||||
# TCP+TLS connect budget for a SINGLE attempt. The old hard-coded 3.0s
|
||||
# assumed LAN/Tailscale peers ('SYN in <100ms'); it is too tight for public
|
||||
# cloud endpoints (offshore APIs take ~0.5-1.5s cold, with jitter), so a
|
||||
# brief blip on the first connect of an idle chat surfaced as a 503 on the
|
||||
# streaming path (which, unlike llm_call, does not retry the connect). A
|
||||
# genuinely dead upstream stays bounded by the dead-host cooldown. Override
|
||||
# with env LLM_CONNECT_TIMEOUT (seconds).
|
||||
CONNECT_TIMEOUT = float(os.getenv('LLM_CONNECT_TIMEOUT', '10') or '10')
|
||||
|
||||
|
||||
def _call_timeout(read_timeout) -> httpx.Timeout:
|
||||
"""Per-request timeout for non-streaming LLM calls (connect from config)."""
|
||||
return httpx.Timeout(connect=LLMConfig.CONNECT_TIMEOUT, read=float(read_timeout), write=10.0, pool=5.0)
|
||||
|
||||
|
||||
def _stream_timeout(read_timeout) -> httpx.Timeout:
|
||||
"""Per-request timeout for streaming LLM calls (connect from config)."""
|
||||
return httpx.Timeout(connect=LLMConfig.CONNECT_TIMEOUT, read=float(read_timeout), write=30.0, pool=5.0)
|
||||
|
||||
|
||||
# Cache for LLM responses
|
||||
@@ -1446,7 +1465,7 @@ async def llm_call_async(
|
||||
if _is_host_dead(target_url):
|
||||
raise HTTPException(503, f"Upstream {_host_key(target_url)} marked unreachable (cooldown active)")
|
||||
|
||||
call_timeout = httpx.Timeout(connect=3.0, read=float(timeout), write=10.0, pool=5.0)
|
||||
call_timeout = _call_timeout(timeout)
|
||||
attempt = 0
|
||||
while attempt < max_retries:
|
||||
attempt += 1
|
||||
@@ -1570,9 +1589,12 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
|
||||
from src.copilot import apply_request_headers
|
||||
apply_request_headers(h, messages_copy)
|
||||
|
||||
# Short connect timeout: a reachable peer answers SYN in <100ms even on
|
||||
# Tailscale. 3s is plenty; 30s let one dead upstream wedge the UI.
|
||||
stream_timeout = httpx.Timeout(connect=3.0, read=float(timeout), write=30.0, pool=5.0)
|
||||
# Connect budget from LLMConfig.CONNECT_TIMEOUT (env LLM_CONNECT_TIMEOUT).
|
||||
# The dead-host cooldown still bounds a genuinely unreachable upstream, so a
|
||||
# wider connect budget only affects first contact and stops a brief cold
|
||||
# connect blip (offshore/public endpoints) surfacing as a 503 on this stream
|
||||
# path, which -- unlike llm_call -- does not retry the connect.
|
||||
stream_timeout = _stream_timeout(timeout)
|
||||
|
||||
if _is_host_dead(target_url):
|
||||
yield f'event: error\ndata: {json.dumps({"error": f"Upstream {_host_key(target_url)} unreachable (cooldown active)", "status": 503})}\n\n'
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
"""Regression tests for the configurable LLM connect timeout.
|
||||
|
||||
Background: chat uses the streaming path, which (unlike llm_call) does not retry
|
||||
a connect error -- it marks the host and emits a 503 immediately. With the old
|
||||
hard-coded connect=3.0s, a brief blip on the first (cold) connect of an idle
|
||||
chat to an offshore/public endpoint surfaced as an intermittent 503 that cleared
|
||||
on resend. The connect budget is now LLMConfig.CONNECT_TIMEOUT (env
|
||||
LLM_CONNECT_TIMEOUT), applied via _call_timeout/_stream_timeout helpers.
|
||||
"""
|
||||
import importlib
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from src import llm_core
|
||||
from src.llm_core import LLMConfig, _call_timeout, _stream_timeout
|
||||
|
||||
|
||||
def test_default_connect_timeout_is_widened_not_three():
|
||||
# Regression guard: must not regress to the old too-tight 3.0s default.
|
||||
assert LLMConfig.CONNECT_TIMEOUT >= 8.0
|
||||
assert LLMConfig.CONNECT_TIMEOUT != 3.0
|
||||
assert LLMConfig.CONNECT_TIMEOUT == 10.0
|
||||
|
||||
|
||||
def test_call_timeout_uses_config_connect_and_passes_read():
|
||||
t = _call_timeout(45)
|
||||
assert isinstance(t, httpx.Timeout)
|
||||
assert t.connect == LLMConfig.CONNECT_TIMEOUT
|
||||
assert t.read == 45.0
|
||||
assert t.write == 10.0
|
||||
assert t.pool == 5.0
|
||||
|
||||
|
||||
def test_stream_timeout_uses_config_connect_and_passes_read():
|
||||
t = _stream_timeout(300)
|
||||
assert isinstance(t, httpx.Timeout)
|
||||
assert t.connect == LLMConfig.CONNECT_TIMEOUT
|
||||
assert t.read == 300.0
|
||||
assert t.write == 30.0
|
||||
assert t.pool == 5.0
|
||||
|
||||
|
||||
def test_helpers_are_config_driven(monkeypatch):
|
||||
# Helpers read LLMConfig at call time, so ops can tune without code edits.
|
||||
monkeypatch.setattr(LLMConfig, "CONNECT_TIMEOUT", 4.5)
|
||||
assert _call_timeout(30).connect == 4.5
|
||||
assert _stream_timeout(30).connect == 4.5
|
||||
|
||||
|
||||
def test_env_override_is_honoured(monkeypatch):
|
||||
monkeypatch.setenv("LLM_CONNECT_TIMEOUT", "6.5")
|
||||
reloaded = importlib.reload(llm_core)
|
||||
try:
|
||||
assert reloaded.LLMConfig.CONNECT_TIMEOUT == 6.5
|
||||
finally:
|
||||
monkeypatch.delenv("LLM_CONNECT_TIMEOUT", raising=False)
|
||||
importlib.reload(llm_core) # restore module-level default for other tests
|
||||
Reference in New Issue
Block a user