mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 02:05:22 -04:00
refactor(search): centralize the web-scraping User-Agent into one constant (#4325)
The outbound UA for web_fetch / web_search was inlined in four places with two different values and nothing keeping them current: content.py pinned a mid-2021 Chrome 91 build, and providers.py sent a bare Mozilla/5.0 in three spots. Some sites serve a degraded or blocked page to a UA that old. Add WEB_FETCH_USER_AGENT to src/constants.py (env-overridable, matching the existing Copilot/Kimi UA-constant pattern) and import it in content.py and providers.py. Default to a current, common desktop UA so pages return their normal HTML: the market-leading desktop OS (Windows; NT 10.0 covers Windows 10 and 11) and browser (Chrome) on a current stable build. The version is now bumped in one place. Service-specific self-identifying agents (Copilot, Kimi, webhooks, cookbook) are intentionally left separate. Adds a regression test pinning the constant shape, the env override, and a guard against a new inline Mozilla literal in the search sources. Closes #4324
This commit is contained in:
committed by
GitHub
parent
b58af4267b
commit
fafaf089c5
@@ -15,7 +15,7 @@ from urllib.parse import urljoin, urlparse
|
|||||||
import httpx
|
import httpx
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from src.constants import WEB_FETCH_SOFT_MAX_BYTES, WEB_FETCH_HARD_MAX_BYTES
|
from src.constants import WEB_FETCH_SOFT_MAX_BYTES, WEB_FETCH_HARD_MAX_BYTES, WEB_FETCH_USER_AGENT
|
||||||
|
|
||||||
from .analytics import RateLimitError, error_logger
|
from .analytics import RateLimitError, error_logger
|
||||||
from .cache import (
|
from .cache import (
|
||||||
@@ -369,7 +369,7 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0,
|
|||||||
# Fetch
|
# Fetch
|
||||||
try:
|
try:
|
||||||
headers = {
|
headers = {
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
|
"User-Agent": WEB_FETCH_USER_AGENT,
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
"Accept-Language": "en-US,en;q=0.5",
|
"Accept-Language": "en-US,en;q=0.5",
|
||||||
# identity so the streamed size cap in _get_public_url stays honest
|
# identity so the streamed size cap in _get_public_url stays honest
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ from urllib.parse import urljoin, urlparse, parse_qs
|
|||||||
import httpx
|
import httpx
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from src.constants import SEARXNG_INSTANCE, REQUEST_TIMEOUT
|
from src.constants import SEARXNG_INSTANCE, REQUEST_TIMEOUT, WEB_FETCH_USER_AGENT
|
||||||
from .analytics import RateLimitError, error_logger
|
from .analytics import RateLimitError, error_logger
|
||||||
from .query import build_enhanced_query
|
from .query import build_enhanced_query
|
||||||
|
|
||||||
@@ -138,7 +138,7 @@ def searxng_search_api(query: str, count: Optional[int] = None, categories: str
|
|||||||
count = count if count is not None else _get_result_count()
|
count = count if count is not None else _get_result_count()
|
||||||
instance = _get_search_instance()
|
instance = _get_search_instance()
|
||||||
api_key = ""
|
api_key = ""
|
||||||
headers = {"User-Agent": "Mozilla/5.0"}
|
headers = {"User-Agent": WEB_FETCH_USER_AGENT}
|
||||||
if api_key:
|
if api_key:
|
||||||
headers["Authorization"] = f"Bearer {api_key}"
|
headers["Authorization"] = f"Bearer {api_key}"
|
||||||
# News/fresh queries do badly in the 'general' category — it favours
|
# News/fresh queries do badly in the 'general' category — it favours
|
||||||
@@ -250,7 +250,7 @@ def searxng_search(query, max_results=10):
|
|||||||
"""Search using SearXNG instance - parsing HTML."""
|
"""Search using SearXNG instance - parsing HTML."""
|
||||||
instance = _get_search_instance()
|
instance = _get_search_instance()
|
||||||
api_key = ""
|
api_key = ""
|
||||||
req_headers = {"User-Agent": "Mozilla/5.0"}
|
req_headers = {"User-Agent": WEB_FETCH_USER_AGENT}
|
||||||
if api_key:
|
if api_key:
|
||||||
req_headers["Authorization"] = f"Bearer {api_key}"
|
req_headers["Authorization"] = f"Bearer {api_key}"
|
||||||
try:
|
try:
|
||||||
@@ -389,7 +389,7 @@ def duckduckgo_search(query: str, count: Optional[int] = None, time_filter: Opti
|
|||||||
response = httpx.get(
|
response = httpx.get(
|
||||||
"https://html.duckduckgo.com/html/",
|
"https://html.duckduckgo.com/html/",
|
||||||
params={"q": query, "kp": _safesearch_for("duckduckgo_html")},
|
params={"q": query, "kp": _safesearch_for("duckduckgo_html")},
|
||||||
headers={"User-Agent": "Mozilla/5.0"},
|
headers={"User-Agent": WEB_FETCH_USER_AGENT},
|
||||||
timeout=REQUEST_TIMEOUT,
|
timeout=REQUEST_TIMEOUT,
|
||||||
)
|
)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|||||||
@@ -78,6 +78,13 @@ MAX_CONTEXT_MESSAGES = 90
|
|||||||
REQUEST_TIMEOUT = 20
|
REQUEST_TIMEOUT = 20
|
||||||
OPENAI_COMPAT_PATH = "/v1/chat/completions"
|
OPENAI_COMPAT_PATH = "/v1/chat/completions"
|
||||||
|
|
||||||
|
# User-Agent header value sent by the web_fetch / web_search scraping paths.
# The default is a common desktop browser UA so pages serve their normal HTML;
# set the WEB_FETCH_USER_AGENT environment variable to override it.
WEB_FETCH_USER_AGENT = os.getenv(
    "WEB_FETCH_USER_AGENT",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/148.0.0.0 Safari/537.36",
)
|
||||||
|
|
||||||
# Environment variables with defaults
|
# Environment variables with defaults
|
||||||
DEFAULT_HOST = os.getenv("LLM_HOST", "localhost")
|
DEFAULT_HOST = os.getenv("LLM_HOST", "localhost")
|
||||||
LLM_HOSTS = [h.strip() for h in os.getenv("LLM_HOSTS", "").split(",") if h.strip()]
|
LLM_HOSTS = [h.strip() for h in os.getenv("LLM_HOSTS", "").split(",") if h.strip()]
|
||||||
|
|||||||
@@ -0,0 +1,18 @@
|
|||||||
|
"""The web scraping path routes its User-Agent through one constant.
|
||||||
|
|
||||||
|
Guards the dedup: web_fetch / web_search outbound UAs go through
|
||||||
|
WEB_FETCH_USER_AGENT, so a stale or bare Mozilla string cannot be re-inlined in
|
||||||
|
the search sources.
|
||||||
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
_SEARCH = Path(__file__).resolve().parent.parent / "services" / "search"
|
||||||
|
|
||||||
|
|
||||||
|
def test_search_sources_have_no_inline_mozilla_ua():
    """Fail if any search source file contains an inline ``Mozilla/`` literal.

    Recursively reads every ``*.py`` file under ``_SEARCH`` and collects the
    ones whose text contains ``"Mozilla/"``. Paths in the failure message are
    reported relative to the grandparent directory of ``_SEARCH``.
    """
    # If the search package is moved or renamed, rglob() yields nothing and
    # the guard would pass without checking a single file; fail loudly instead.
    assert _SEARCH.is_dir(), f"search sources directory not found: {_SEARCH}"

    report_root = _SEARCH.parent.parent
    # Sorted so the failure message is stable regardless of filesystem order.
    offenders = sorted(
        str(py.relative_to(report_root))
        for py in _SEARCH.rglob("*.py")
        if "Mozilla/" in py.read_text(encoding="utf-8")
    )
    assert not offenders, f"inline Mozilla UA found; use WEB_FETCH_USER_AGENT: {offenders}"
|
||||||
Reference in New Issue
Block a user