diff --git a/src/llm_core.py b/src/llm_core.py index 88061c9ea..3b7369153 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -7,6 +7,7 @@ import logging import hashlib import threading import re +import os from fastapi import HTTPException from typing import Optional, Dict, List, Tuple from src.model_context import get_context_length, DEFAULT_CONTEXT @@ -22,6 +23,24 @@ class LLMConfig: MAX_RETRIES = 3 RETRY_DELAY = 0.5 STREAM_TIMEOUT = 300 + # TCP+TLS connect budget for a SINGLE attempt. The old hard-coded 3.0s + # assumed LAN/Tailscale peers ('SYN in <100ms'); it is too tight for public + # cloud endpoints (offshore APIs take ~0.5-1.5s cold, with jitter), so a + # brief blip on the first connect of an idle chat surfaced as a 503 on the + # streaming path (which, unlike llm_call, does not retry the connect). A + # genuinely dead upstream stays bounded by the dead-host cooldown. Override + # with env LLM_CONNECT_TIMEOUT (seconds). + CONNECT_TIMEOUT = float(os.getenv('LLM_CONNECT_TIMEOUT', '10') or '10') + + +def _call_timeout(read_timeout) -> httpx.Timeout: + """Per-request timeout for non-streaming LLM calls (connect from config).""" + return httpx.Timeout(connect=LLMConfig.CONNECT_TIMEOUT, read=float(read_timeout), write=10.0, pool=5.0) + + +def _stream_timeout(read_timeout) -> httpx.Timeout: + """Per-request timeout for streaming LLM calls (connect from config).""" + return httpx.Timeout(connect=LLMConfig.CONNECT_TIMEOUT, read=float(read_timeout), write=30.0, pool=5.0) # Cache for LLM responses @@ -1446,7 +1465,7 @@ async def llm_call_async( if _is_host_dead(target_url): raise HTTPException(503, f"Upstream {_host_key(target_url)} marked unreachable (cooldown active)") - call_timeout = httpx.Timeout(connect=3.0, read=float(timeout), write=10.0, pool=5.0) + call_timeout = _call_timeout(timeout) attempt = 0 while attempt < max_retries: attempt += 1 @@ -1570,9 +1589,12 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl from src.copilot import apply_request_headers apply_request_headers(h, messages_copy) - # Short connect timeout: a reachable peer answers SYN in <100ms even on - # Tailscale. 3s is plenty; 30s let one dead upstream wedge the UI. - stream_timeout = httpx.Timeout(connect=3.0, read=float(timeout), write=30.0, pool=5.0) + # Connect budget from LLMConfig.CONNECT_TIMEOUT (env LLM_CONNECT_TIMEOUT). + # The dead-host cooldown still bounds a genuinely unreachable upstream, so a + # wider connect budget only affects first contact and stops a brief cold + # connect blip (offshore/public endpoints) surfacing as a 503 on this stream + # path, which -- unlike llm_call -- does not retry the connect. + stream_timeout = _stream_timeout(timeout) if _is_host_dead(target_url): yield f'event: error\ndata: {json.dumps({"error": f"Upstream {_host_key(target_url)} unreachable (cooldown active)", "status": 503})}\n\n' diff --git a/tests/test_llm_core_connect_timeout.py b/tests/test_llm_core_connect_timeout.py new file mode 100644 index 000000000..ef430c43e --- /dev/null +++ b/tests/test_llm_core_connect_timeout.py @@ -0,0 +1,57 @@ +"""Regression tests for the configurable LLM connect timeout. + +Background: chat uses the streaming path, which (unlike llm_call) does not retry +a connect error -- it marks the host and emits a 503 immediately. With the old +hard-coded connect=3.0s, a brief blip on the first (cold) connect of an idle +chat to an offshore/public endpoint surfaced as an intermittent 503 that cleared +on resend. The connect budget is now LLMConfig.CONNECT_TIMEOUT (env +LLM_CONNECT_TIMEOUT), applied via _call_timeout/_stream_timeout helpers. +""" +import importlib +import httpx +import pytest + +from src import llm_core +from src.llm_core import LLMConfig, _call_timeout, _stream_timeout + + +def test_default_connect_timeout_is_widened_not_three(): + # Regression guard: must not regress to the old too-tight 3.0s default. + assert LLMConfig.CONNECT_TIMEOUT >= 8.0 + assert LLMConfig.CONNECT_TIMEOUT != 3.0 + assert LLMConfig.CONNECT_TIMEOUT == 10.0 + + +def test_call_timeout_uses_config_connect_and_passes_read(): + t = _call_timeout(45) + assert isinstance(t, httpx.Timeout) + assert t.connect == LLMConfig.CONNECT_TIMEOUT + assert t.read == 45.0 + assert t.write == 10.0 + assert t.pool == 5.0 + + +def test_stream_timeout_uses_config_connect_and_passes_read(): + t = _stream_timeout(300) + assert isinstance(t, httpx.Timeout) + assert t.connect == LLMConfig.CONNECT_TIMEOUT + assert t.read == 300.0 + assert t.write == 30.0 + assert t.pool == 5.0 + + +def test_helpers_are_config_driven(monkeypatch): + # Helpers read LLMConfig at call time, so ops can tune without code edits. + monkeypatch.setattr(LLMConfig, "CONNECT_TIMEOUT", 4.5) + assert _call_timeout(30).connect == 4.5 + assert _stream_timeout(30).connect == 4.5 + + +def test_env_override_is_honoured(monkeypatch): + monkeypatch.setenv("LLM_CONNECT_TIMEOUT", "6.5") + reloaded = importlib.reload(llm_core) + try: + assert reloaded.LLMConfig.CONNECT_TIMEOUT == 6.5 + finally: + monkeypatch.delenv("LLM_CONNECT_TIMEOUT", raising=False) + importlib.reload(llm_core) # restore module-level default for other tests