fix(llm): make connect timeout configurable

Use a configurable LLM_CONNECT_TIMEOUT for call and stream connect budgets instead of the previous hard-coded 3s default.
This commit is contained in:
cirim
2026-06-15 06:11:38 +00:00
committed by GitHub
parent faf27c4a90
commit 056d1fb960
2 changed files with 83 additions and 4 deletions
+26 -4
View File
@@ -7,6 +7,7 @@ import logging
import hashlib import hashlib
import threading import threading
import re import re
import os
from fastapi import HTTPException from fastapi import HTTPException
from typing import Optional, Dict, List, Tuple from typing import Optional, Dict, List, Tuple
from src.model_context import get_context_length, DEFAULT_CONTEXT from src.model_context import get_context_length, DEFAULT_CONTEXT
@@ -22,6 +23,24 @@ class LLMConfig:
MAX_RETRIES = 3 MAX_RETRIES = 3
RETRY_DELAY = 0.5 RETRY_DELAY = 0.5
STREAM_TIMEOUT = 300 STREAM_TIMEOUT = 300
# TCP+TLS connect budget for a SINGLE attempt. The old hard-coded 3.0s
# assumed LAN/Tailscale peers ('SYN in <100ms'); it is too tight for public
# cloud endpoints (offshore APIs take ~0.5-1.5s cold, with jitter), so a
# brief blip on the first connect of an idle chat surfaced as a 503 on the
# streaming path (which, unlike llm_call, does not retry the connect). A
# genuinely dead upstream stays bounded by the dead-host cooldown. Override
# with env LLM_CONNECT_TIMEOUT (seconds).
CONNECT_TIMEOUT = float(os.getenv('LLM_CONNECT_TIMEOUT', '10') or '10')
def _call_timeout(read_timeout) -> httpx.Timeout:
"""Per-request timeout for non-streaming LLM calls (connect from config)."""
return httpx.Timeout(connect=LLMConfig.CONNECT_TIMEOUT, read=float(read_timeout), write=10.0, pool=5.0)
def _stream_timeout(read_timeout) -> httpx.Timeout:
"""Per-request timeout for streaming LLM calls (connect from config)."""
return httpx.Timeout(connect=LLMConfig.CONNECT_TIMEOUT, read=float(read_timeout), write=30.0, pool=5.0)
# Cache for LLM responses # Cache for LLM responses
@@ -1446,7 +1465,7 @@ async def llm_call_async(
if _is_host_dead(target_url): if _is_host_dead(target_url):
raise HTTPException(503, f"Upstream {_host_key(target_url)} marked unreachable (cooldown active)") raise HTTPException(503, f"Upstream {_host_key(target_url)} marked unreachable (cooldown active)")
call_timeout = httpx.Timeout(connect=3.0, read=float(timeout), write=10.0, pool=5.0) call_timeout = _call_timeout(timeout)
attempt = 0 attempt = 0
while attempt < max_retries: while attempt < max_retries:
attempt += 1 attempt += 1
@@ -1570,9 +1589,12 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
from src.copilot import apply_request_headers from src.copilot import apply_request_headers
apply_request_headers(h, messages_copy) apply_request_headers(h, messages_copy)
# Short connect timeout: a reachable peer answers SYN in <100ms even on # Connect budget from LLMConfig.CONNECT_TIMEOUT (env LLM_CONNECT_TIMEOUT).
# Tailscale. 3s is plenty; 30s let one dead upstream wedge the UI. # The dead-host cooldown still bounds a genuinely unreachable upstream, so a
stream_timeout = httpx.Timeout(connect=3.0, read=float(timeout), write=30.0, pool=5.0) # wider connect budget only affects first contact and stops a brief cold
# connect blip (offshore/public endpoints) surfacing as a 503 on this stream
# path, which -- unlike llm_call -- does not retry the connect.
stream_timeout = _stream_timeout(timeout)
if _is_host_dead(target_url): if _is_host_dead(target_url):
yield f'event: error\ndata: {json.dumps({"error": f"Upstream {_host_key(target_url)} unreachable (cooldown active)", "status": 503})}\n\n' yield f'event: error\ndata: {json.dumps({"error": f"Upstream {_host_key(target_url)} unreachable (cooldown active)", "status": 503})}\n\n'
+57
View File
@@ -0,0 +1,57 @@
"""Regression tests for the configurable LLM connect timeout.
Background: chat uses the streaming path, which (unlike llm_call) does not retry
a connect error -- it marks the host and emits a 503 immediately. With the old
hard-coded connect=3.0s, a brief blip on the first (cold) connect of an idle
chat to an offshore/public endpoint surfaced as an intermittent 503 that cleared
on resend. The connect budget is now LLMConfig.CONNECT_TIMEOUT (env
LLM_CONNECT_TIMEOUT), applied via _call_timeout/_stream_timeout helpers.
"""
import importlib
import httpx
import pytest
from src import llm_core
from src.llm_core import LLMConfig, _call_timeout, _stream_timeout
def test_default_connect_timeout_is_widened_not_three():
# Regression guard: must not regress to the old too-tight 3.0s default.
assert LLMConfig.CONNECT_TIMEOUT >= 8.0
assert LLMConfig.CONNECT_TIMEOUT != 3.0
assert LLMConfig.CONNECT_TIMEOUT == 10.0
def test_call_timeout_uses_config_connect_and_passes_read():
t = _call_timeout(45)
assert isinstance(t, httpx.Timeout)
assert t.connect == LLMConfig.CONNECT_TIMEOUT
assert t.read == 45.0
assert t.write == 10.0
assert t.pool == 5.0
def test_stream_timeout_uses_config_connect_and_passes_read():
t = _stream_timeout(300)
assert isinstance(t, httpx.Timeout)
assert t.connect == LLMConfig.CONNECT_TIMEOUT
assert t.read == 300.0
assert t.write == 30.0
assert t.pool == 5.0
def test_helpers_are_config_driven(monkeypatch):
# Helpers read LLMConfig at call time, so ops can tune without code edits.
monkeypatch.setattr(LLMConfig, "CONNECT_TIMEOUT", 4.5)
assert _call_timeout(30).connect == 4.5
assert _stream_timeout(30).connect == 4.5
def test_env_override_is_honoured(monkeypatch):
monkeypatch.setenv("LLM_CONNECT_TIMEOUT", "6.5")
reloaded = importlib.reload(llm_core)
try:
assert reloaded.LLMConfig.CONNECT_TIMEOUT == 6.5
finally:
monkeypatch.delenv("LLM_CONNECT_TIMEOUT", raising=False)
importlib.reload(llm_core) # restore module-level default for other tests