Files
odysseus/services/search/content.py
T
Lucas Daniel fa7c4f8ea9 fix(search): catch HTTPStatusError so 403/404 URLs degrade gracefully instead of 500 (#2203)
raise_for_status() raises httpx.HTTPStatusError for 4xx/5xx responses,
but the surrounding try/except only caught httpx.RequestError (network
errors) and RateLimitError (429). Any other HTTP error code propagated
uncaught up through chat_processor -> chat_helpers -> chat_routes and
surfaced as a 500 Internal Server Error.

Added an explicit except httpx.HTTPStatusError clause that logs a warning
and returns an empty result, matching the behaviour already in place for
network errors.

Also adds focused regression tests that exercise the real
fetch_webpage_content() path with a mocked _get_public_url:
- 403/404 responses return the standard empty-result shape instead of
  raising, proving the new HTTPStatusError handling works end to end.
- 429 responses still take their own dedicated rate-limit branch (the
  status_code == 429 check runs before raise_for_status() is reached),
  keeping that behaviour distinct from the new generic HTTPStatusError
  handling.

Dropped the unrelated builtin_mcp.py change that had been carried over
from a rebase; that fix is tracked separately in #2018 and this branch
should stay scoped to the search content fetch path.

Closes #2148
2026-06-08 01:09:21 +01:00

416 lines
15 KiB
Python

"""Webpage content fetching with caching, PDF extraction, and summarization helpers."""
import copy
import io
import ipaddress
import json
import os
import re
import logging
import socket
from datetime import datetime, timedelta
from typing import List
from urllib.parse import urljoin, urlparse
import httpx
from bs4 import BeautifulSoup
from .analytics import RateLimitError, error_logger
from .cache import (
CONTENT_CACHE_DIR,
content_cache_index,
generate_cache_key,
cleanup_cache,
)
logger = logging.getLogger(__name__)
_PRIVATE_NETWORKS = (
ipaddress.ip_network("0.0.0.0/8"),
ipaddress.ip_network("10.0.0.0/8"),
ipaddress.ip_network("127.0.0.0/8"),
ipaddress.ip_network("169.254.0.0/16"),
ipaddress.ip_network("172.16.0.0/12"),
ipaddress.ip_network("192.168.0.0/16"),
ipaddress.ip_network("::1/128"),
ipaddress.ip_network("fc00::/7"),
ipaddress.ip_network("fe80::/10"),
)
def _is_private_address(addr: ipaddress._BaseAddress) -> bool:
if isinstance(addr, ipaddress.IPv6Address) and addr.ipv4_mapped is not None:
addr = addr.ipv4_mapped
return (
addr.is_private
or addr.is_loopback
or addr.is_link_local
or addr.is_reserved
or addr.is_multicast
or addr.is_unspecified
or any(addr in net for net in _PRIVATE_NETWORKS)
)
def _resolve_hostname_ips(hostname: str) -> list[ipaddress._BaseAddress]:
try:
infos = socket.getaddrinfo(hostname, None)
except Exception:
return []
out = []
for info in infos:
try:
out.append(ipaddress.ip_address(info[4][0]))
except Exception:
continue
return out
def _public_http_url(url: str) -> bool:
try:
parsed = urlparse(url)
if parsed.scheme not in ("http", "https"):
return False
host = (parsed.hostname or "").strip()
if not host:
return False
lower = host.lower()
if lower in ("localhost", "metadata", "metadata.google.internal"):
return False
if lower.endswith((".local", ".localhost", ".internal", ".lan", ".intranet")):
return False
try:
return not _is_private_address(ipaddress.ip_address(host))
except ValueError:
pass
addrs = _resolve_hostname_ips(host)
return bool(addrs) and not any(_is_private_address(a) for a in addrs)
except Exception:
return False
def _get_public_url(url: str, headers: dict, timeout: int, max_redirects: int = 5) -> httpx.Response:
current = url
for _ in range(max_redirects + 1):
if not _public_http_url(current):
raise httpx.RequestError("Blocked private/internal URL", request=httpx.Request("GET", current))
response = httpx.get(current, headers=headers, timeout=timeout, follow_redirects=False)
if response.status_code not in (301, 302, 303, 307, 308):
return response
location = response.headers.get("location")
if not location:
return response
current = urljoin(str(response.url), location)
raise httpx.RequestError("Too many redirects", request=httpx.Request("GET", current))
# PDF extraction (optional dependency)
try:
from pdfminer.high_level import extract_text as pdf_extract_text
except ImportError:
pdf_extract_text = None # type: ignore
# ----------------------------------------------------------------------
# HTML extraction helpers
# ----------------------------------------------------------------------
def _extract_meta(soup: BeautifulSoup) -> dict:
"""Pull meta description and keywords if present."""
description = ""
keywords = ""
desc_tag = soup.find("meta", attrs={"name": re.compile("description", re.I)})
if desc_tag and desc_tag.get("content"):
description = desc_tag["content"].strip()
kw_tag = soup.find("meta", attrs={"name": re.compile("keywords", re.I)})
if kw_tag and kw_tag.get("content"):
keywords = kw_tag["content"].strip()
return {"description": description, "keywords": keywords}
def _extract_og_image(soup: BeautifulSoup) -> str:
"""Extract the best representative image URL from meta tags.
Only returns absolute http(s) URLs -- skips relative paths and data URIs.
"""
candidates = []
for prop in ("og:image", "og:image:url", "og:image:secure_url"):
tag = soup.find("meta", attrs={"property": prop})
if tag and tag.get("content", "").strip():
candidates.append(tag["content"].strip())
tag = soup.find("meta", attrs={"name": "twitter:image"})
if tag and tag.get("content", "").strip():
candidates.append(tag["content"].strip())
tag = soup.find("meta", attrs={"name": "thumbnail"})
if tag and tag.get("content", "").strip():
candidates.append(tag["content"].strip())
for url in candidates:
if url.startswith(("https://", "http://")) and not url.endswith((".svg", ".ico")):
return url
return ""
def _extract_lists(soup: BeautifulSoup) -> List[List[str]]:
"""Return a list of lists, each inner list representing a <ul>/<ol>."""
all_lists = []
for lst in soup.find_all(["ul", "ol"]):
items = [li.get_text(separator=" ", strip=True) for li in lst.find_all("li")]
if items:
all_lists.append(items)
return all_lists
def _extract_tables(soup: BeautifulSoup) -> List[List[List[str]]]:
"""Return a list of tables, each table is a list of rows, each row a list of cell texts."""
tables_data = []
for table in soup.find_all("table"):
rows = []
for tr in table.find_all("tr"):
cells = [td.get_text(separator=" ", strip=True) for td in tr.find_all(["td", "th"])]
if cells:
rows.append(cells)
if rows:
tables_data.append(rows)
return tables_data
def _extract_code_blocks(soup: BeautifulSoup) -> List[str]:
"""Collect text from <pre> and <code> blocks."""
blocks = []
for tag in soup.find_all(["pre", "code"]):
txt = tag.get_text(separator=" ", strip=True)
if txt:
blocks.append(txt)
return blocks
def _detect_js_frameworks(soup: BeautifulSoup) -> bool:
"""Very naive detection of common JS frameworks."""
js_indicators = [
"react", "angular", "vue", "svelte", "next", "nuxt",
"ember", "backbone", "jquery", "polymer", "mithril",
]
for script in soup.find_all("script"):
src = script.get("src", "").lower()
if any(fr in src for fr in js_indicators):
return True
if script.string:
content = script.string.lower()
if any(fr in content for fr in js_indicators):
return True
if soup.find(attrs={"data-reactroot": True}) or soup.find(attrs={"ng-app": True}):
return True
return False
def _empty_result(url: str, error: str = "") -> dict:
"""Build a standard failure result dict."""
return {
"url": url,
"title": "",
"content": "",
"lists": [],
"tables": [],
"code_blocks": [],
"meta_description": "",
"meta_keywords": "",
"js_rendered": False,
"js_message": "",
"success": False,
"error": error,
}
# ----------------------------------------------------------------------
# Main content fetcher
# ----------------------------------------------------------------------
def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) -> dict:
"""Fetch and extract meaningful content from a webpage with caching."""
cache_key = generate_cache_key(url)
cache_file = CONTENT_CACHE_DIR / f"{cache_key}.cache"
# Check cache
if cache_file.exists():
try:
with open(cache_file, "r", encoding="utf-8") as f:
cached_data = json.load(f)
timestamp = datetime.fromisoformat(cached_data["timestamp"])
if datetime.now() - timestamp < timedelta(hours=2):
logger.debug(f"Content cache hit for URL: {url}")
return cached_data["data"]
else:
cache_file.unlink(missing_ok=True)
content_cache_index.pop(cache_key, None)
except Exception as e:
logger.warning(f"Failed to read content cache for {url}: {e}")
cache_file.unlink(missing_ok=True)
content_cache_index.pop(cache_key, None)
# Fetch
try:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
}
response = _get_public_url(url, headers=headers, timeout=timeout)
if response.status_code == 429:
raise RateLimitError(f"Rate limit hit for {url} (attempt {retry_attempt})")
response.raise_for_status()
except httpx.HTTPStatusError as e:
error_logger.warning(f"HTTP {e.response.status_code} fetching {url}: {e}")
return _empty_result(url, f"HTTP {e.response.status_code}: {e}")
except httpx.RequestError as e:
error_logger.error(f"NetworkError fetching {url} (attempt {retry_attempt}): {e}")
return _empty_result(url, f"NetworkError: {e}")
except RateLimitError as e:
error_logger.error(str(e))
return _empty_result(url, str(e))
# PDF handling
content_type = response.headers.get("Content-Type", "").lower()
if "application/pdf" in content_type or url.lower().endswith(".pdf"):
if pdf_extract_text is None:
logger.error("pdfminer.six is not installed; cannot extract PDF text.")
pdf_text = ""
else:
try:
pdf_bytes = io.BytesIO(response.content)
pdf_text = pdf_extract_text(pdf_bytes)
except Exception as e:
logger.warning(f"PDF extraction failed for {url}: {e}")
pdf_text = ""
result = {
"url": url,
"title": os.path.basename(url),
"content": pdf_text,
"lists": [],
"tables": [],
"code_blocks": [],
"meta_description": "",
"meta_keywords": "",
"js_rendered": False,
"js_message": "",
"success": bool(pdf_text),
"error": "" if pdf_text else "Failed to extract PDF text",
}
_cache_result(cache_file, cache_key, result, url)
return result
# HTML handling
try:
soup = BeautifulSoup(response.text, "html.parser")
except Exception as e:
error_logger.error(f"ParseError parsing HTML from {url} (attempt {retry_attempt}): {e}")
result = _empty_result(url, f"ParseError: {e}")
_cache_result(cache_file, cache_key, result, url)
return result
title_tag = soup.find("title")
title_text = title_tag.get_text(strip=True) if title_tag else ""
meta_info = _extract_meta(soup)
og_image = _extract_og_image(soup)
js_rendered = _detect_js_frameworks(soup)
js_message = "Page appears to be rendered by a JavaScript framework; content may be incomplete." if js_rendered else ""
# Main textual content (heuristic): prefer semantic / "content"-classed
# containers to skip nav/footer/boilerplate; tuned for article pages.
main_content = ""
content_areas = soup.find_all(
["main", "article", "section", "div"],
class_=re.compile("content|main|body|article|post|entry|text", re.I),
)
if content_areas:
for area in content_areas[:3]:
main_content += area.get_text(separator=" ", strip=True) + " "
main_content = re.sub(r"\s+", " ", main_content).strip()
# If the heuristic finds only a tiny wrapper, fall back to body text with
# obvious boilerplate stripped so UI/deep-research search results do not
# look empty for app/landing pages.
THIN_CONTENT_CHARS = 600
if len(main_content) < THIN_CONTENT_CHARS:
body = soup.find("body")
if body:
body_copy = copy.copy(body)
for noise in body_copy.find_all(
["script", "style", "noscript", "template", "nav", "header", "footer", "aside"]
):
noise.extract()
body_text = re.sub(r"\s+", " ", body_copy.get_text(separator=" ", strip=True)).strip()
if len(body_text) > len(main_content):
main_content = body_text
result = {
"url": url,
"title": title_text,
"content": main_content,
"lists": _extract_lists(soup),
"tables": _extract_tables(soup),
"code_blocks": _extract_code_blocks(soup),
"meta_description": meta_info.get("description", ""),
"meta_keywords": meta_info.get("keywords", ""),
"og_image": og_image,
"js_rendered": js_rendered,
"js_message": js_message,
"success": True,
"error": "",
}
_cache_result(cache_file, cache_key, result, url)
return result
def _cache_result(cache_file, cache_key: str, result: dict, url: str):
"""Write a result to the content cache."""
try:
cache_data = {"timestamp": datetime.now().isoformat(), "data": result}
with open(cache_file, "w", encoding="utf-8") as f:
json.dump(cache_data, f)
content_cache_index[cache_key] = datetime.now()
cleanup_cache(CONTENT_CACHE_DIR, content_cache_index, timedelta(hours=2))
except Exception as e:
logger.warning(f"Failed to write content cache for {url}: {e}")
# ----------------------------------------------------------------------
# Content summarization helpers
# ----------------------------------------------------------------------
def extract_key_points(text: str) -> List[str]:
"""Pull out bullet-style key points from a block of text."""
points: List[str] = []
bullet_pat = re.compile(r"^\s*[-*•]\s+(.*)")
numbered_pat = re.compile(r"^\s*\d+[\.\)]\s+(.*)")
for line in text.splitlines():
m = bullet_pat.match(line) or numbered_pat.match(line)
if m:
points.append(m.group(1).strip())
return points
def get_tldr(text: str, max_sentences: int = 3) -> str:
"""Produce a very short TL;DR by taking the first few sentences."""
sentences = re.split(r"(?<=[.!?])\s+", text)
selected = [s.strip() for s in sentences if s][:max_sentences]
return " ".join(selected)
def extract_quotes(text: str) -> List[str]:
"""Return quoted excerpts that are at least 15 characters long."""
# Backreference the opening quote so the closing quote must match it —
# otherwise `"text'` (open double, close single) is treated as a quote.
return [m.group(2).strip() for m in re.finditer(r'(["\'])([^"\']{15,}?)\1', text)]
def extract_statistics(text: str) -> List[str]:
"""Find numbers, percentages, dates and simple measurements."""
# Match a comma-grouped number (1,000,000) OR a plain digit run (50000) —
# the old `\d{1,3}(?:,\d{3})*` matched only the first 3 digits of a
# comma-less number, and the trailing `\b` dropped a closing `%`.
pattern = re.compile(
r"\b(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?\s*(%|percent|‰|per cent|[a-zA-Z]+)?",
re.IGNORECASE,
)
return [m.group(0).strip() for m in pattern.finditer(text)]