diff --git a/services/search/content.py b/services/search/content.py index 39b1e2106..49d050a4f 100644 --- a/services/search/content.py +++ b/services/search/content.py @@ -15,7 +15,7 @@ from urllib.parse import urljoin, urlparse import httpx from bs4 import BeautifulSoup -from src.constants import WEB_FETCH_SOFT_MAX_BYTES, WEB_FETCH_HARD_MAX_BYTES +from src.constants import WEB_FETCH_SOFT_MAX_BYTES, WEB_FETCH_HARD_MAX_BYTES, WEB_FETCH_USER_AGENT from .analytics import RateLimitError, error_logger from .cache import ( @@ -369,7 +369,7 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0, # Fetch try: headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + "User-Agent": WEB_FETCH_USER_AGENT, "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5", # identity so the streamed size cap in _get_public_url stays honest diff --git a/services/search/providers.py b/services/search/providers.py index 89fe12a2d..d0ca1b0de 100644 --- a/services/search/providers.py +++ b/services/search/providers.py @@ -9,7 +9,7 @@ from urllib.parse import urljoin, urlparse, parse_qs import httpx from bs4 import BeautifulSoup -from src.constants import SEARXNG_INSTANCE, REQUEST_TIMEOUT +from src.constants import SEARXNG_INSTANCE, REQUEST_TIMEOUT, WEB_FETCH_USER_AGENT from .analytics import RateLimitError, error_logger from .query import build_enhanced_query @@ -138,7 +138,7 @@ def searxng_search_api(query: str, count: Optional[int] = None, categories: str count = count if count is not None else _get_result_count() instance = _get_search_instance() api_key = "" - headers = {"User-Agent": "Mozilla/5.0"} + headers = {"User-Agent": WEB_FETCH_USER_AGENT} if api_key: headers["Authorization"] = f"Bearer {api_key}" # News/fresh queries do badly in the 'general' category — it favours @@ -250,7 +250,7 @@ def searxng_search(query, 
max_results=10): """Search using SearXNG instance - parsing HTML.""" instance = _get_search_instance() api_key = "" - req_headers = {"User-Agent": "Mozilla/5.0"} + req_headers = {"User-Agent": WEB_FETCH_USER_AGENT} if api_key: req_headers["Authorization"] = f"Bearer {api_key}" try: @@ -389,7 +389,7 @@ def duckduckgo_search(query: str, count: Optional[int] = None, time_filter: Opti response = httpx.get( "https://html.duckduckgo.com/html/", params={"q": query, "kp": _safesearch_for("duckduckgo_html")}, - headers={"User-Agent": "Mozilla/5.0"}, + headers={"User-Agent": WEB_FETCH_USER_AGENT}, timeout=REQUEST_TIMEOUT, ) response.raise_for_status() diff --git a/src/constants.py b/src/constants.py index a774439a6..622f7e509 100644 --- a/src/constants.py +++ b/src/constants.py @@ -78,6 +78,13 @@ MAX_CONTEXT_MESSAGES = 90 REQUEST_TIMEOUT = 20 OPENAI_COMPAT_PATH = "/v1/chat/completions" +# Outbound UA for web_fetch / web_search scraping; common desktop UA so pages serve normal HTML. +WEB_FETCH_USER_AGENT = os.environ.get( + "WEB_FETCH_USER_AGENT", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36", +) + # Environment variables with defaults DEFAULT_HOST = os.getenv("LLM_HOST", "localhost") LLM_HOSTS = [h.strip() for h in os.getenv("LLM_HOSTS", "").split(",") if h.strip()] diff --git a/tests/test_web_user_agent_constant.py b/tests/test_web_user_agent_constant.py new file mode 100644 index 000000000..8d9e802a8 --- /dev/null +++ b/tests/test_web_user_agent_constant.py @@ -0,0 +1,18 @@ +"""The web scraping path routes its User-Agent through one constant. + +Guards the dedup: web_fetch / web_search outbound UAs go through +WEB_FETCH_USER_AGENT, so a stale or bare Mozilla string cannot be re-inlined in +the search sources. 
+"""
+from pathlib import Path
+
+_SEARCH = Path(__file__).resolve().parent.parent / "services" / "search"
+
+
+def test_search_sources_have_no_inline_mozilla_ua():
+    """Fail if any .py file under services/search contains a literal Mozilla/ UA string."""
+    assert _SEARCH.is_dir(), f"missing {_SEARCH}"  # rglob() on a missing dir is silently empty
+    sources = sorted(_SEARCH.rglob("*.py"))  # sorted so the failure message is stable
+    hits = [py for py in sources if "Mozilla/" in py.read_text(encoding="utf-8")]
+    offenders = [str(py.relative_to(_SEARCH.parent.parent)) for py in hits]
+    assert not offenders, f"inline Mozilla UA found; use WEB_FETCH_USER_AGENT: {offenders}"