"""Webpage content fetching with caching, PDF extraction, and summarization helpers."""

import copy
import io
import ipaddress
import json
import os
import re
import logging
import socket
from datetime import datetime, timedelta
from typing import List, Optional
from urllib.parse import urljoin, urlparse

import httpx
from bs4 import BeautifulSoup

from src.constants import WEB_FETCH_SOFT_MAX_BYTES, WEB_FETCH_HARD_MAX_BYTES
from .analytics import RateLimitError, error_logger
from .cache import (
    CONTENT_CACHE_DIR,
    content_cache_index,
    generate_cache_key,
    cleanup_cache,
)

logger = logging.getLogger(__name__)

# Explicit deny-list of non-public networks. It backs up the ``ipaddress``
# property checks in _is_private_address, which do not cover every range we
# want to refuse.
_PRIVATE_NETWORKS = (
    ipaddress.ip_network("0.0.0.0/8"),
    ipaddress.ip_network("10.0.0.0/8"),
    # Shared address space (RFC 6598): not publicly routable, yet
    # ``ipaddress`` reports is_private == False for it, so it has to be
    # listed explicitly to be blocked.
    ipaddress.ip_network("100.64.0.0/10"),
    ipaddress.ip_network("127.0.0.0/8"),
    ipaddress.ip_network("169.254.0.0/16"),
    ipaddress.ip_network("172.16.0.0/12"),
    ipaddress.ip_network("192.168.0.0/16"),
    ipaddress.ip_network("::1/128"),
    ipaddress.ip_network("fc00::/7"),
    ipaddress.ip_network("fe80::/10"),
)


def _is_private_address(addr: ipaddress._BaseAddress) -> bool:
    """Return True when *addr* must not be fetched (non-public address).

    IPv4-mapped IPv6 addresses are unwrapped first so the embedded IPv4
    address is the one that gets classified.
    """
    if isinstance(addr, ipaddress.IPv6Address) and addr.ipv4_mapped is not None:
        addr = addr.ipv4_mapped
    return (
        addr.is_private
        or addr.is_loopback
        or addr.is_link_local
        or addr.is_reserved
        or addr.is_multicast
        or addr.is_unspecified
        # Membership across IP versions is simply False, so mixing IPv4 and
        # IPv6 networks in the tuple is safe.
        or any(addr in net for net in _PRIVATE_NETWORKS)
    )


def _resolve_hostname_ips(hostname: str) -> list[ipaddress._BaseAddress]:
    """Resolve *hostname* and return every address that parses as an IP.

    Returns an empty list when resolution fails for any reason; callers
    treat "no addresses" as "not allowed".
    """
    try:
        infos = socket.getaddrinfo(hostname, None)
    except Exception:
        # Deliberately broad: any resolver failure means "cannot vouch for
        # this host", which the caller turns into a refusal.
        return []
    out = []
    for info in infos:
        try:
            # info[4] is the sockaddr tuple; its first item is the address.
            out.append(ipaddress.ip_address(info[4][0]))
        except Exception:
            continue
    return out


def _public_http_url(url: str) -> bool:
    """Return True only for http(s) URLs whose host is a public address.

    Rejects non-http(s) schemes, empty hosts, well-known internal host names
    and suffixes, literal private IPs, and host names for which resolution
    yields no address or at least one private address. Any unexpected error
    results in False.

    NOTE: the host name is resolved here purely for the check; the HTTP
    request made afterwards resolves it again on its own, so a DNS answer
    that changes between the two lookups is not covered by this guard.
    """
    try:
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https"):
            return False
        host = (parsed.hostname or "").strip()
        if not host:
            return False
        lower = host.lower()
        if lower in ("localhost", "metadata", "metadata.google.internal"):
            return False
        if lower.endswith((".local", ".localhost", ".internal", ".lan", ".intranet")):
            return False
        try:
            # Literal IP in the URL: classify it directly, no DNS needed.
            return not _is_private_address(ipaddress.ip_address(host))
        except ValueError:
            pass
        addrs = _resolve_hostname_ips(host)
        # Every resolved address must be public; one private answer is enough
        # to refuse the whole host.
        return bool(addrs) and not any(_is_private_address(a) for a in addrs)
    except Exception:
        return False


class BodyTooLargeError(Exception):
    """The server declared a body larger than the hard fetch ceiling."""

    def __init__(self, url: str, declared_bytes: int):
        self.url = url
        self.declared_bytes = declared_bytes
        super().__init__(
            f"response body is {declared_bytes:,} bytes, over the "
            f"{WEB_FETCH_HARD_MAX_BYTES:,}-byte hard cap"
        )


class _CappedFetch:
    """Result of a size-capped streaming GET.

    Carries just what fetch_webpage_content needs from an httpx.Response,
    plus the cap bookkeeping: the (possibly truncated) body, whether the cap
    cut it short, and the size the server declared via Content-Length (wire
    bytes; None when absent).
    """

    __slots__ = ("status_code", "headers", "content", "truncated",
                 "declared_bytes", "encoding", "url")

    def __init__(self, status_code, headers, content, truncated,
                 declared_bytes, encoding, url):
        self.status_code = status_code
        self.headers = headers
        self.content = content
        self.truncated = truncated
        self.declared_bytes = declared_bytes
        self.encoding = encoding
        self.url = url

    @property
    def text(self) -> str:
        """Body decoded with the recorded encoding (UTF-8 when unset).

        Undecodable bytes are replaced rather than raising. If the recorded
        encoding is not a codec Python knows, UTF-8 is used instead of
        letting LookupError escape.
        """
        try:
            return self.content.decode(self.encoding or "utf-8", errors="replace")
        except LookupError:
            return self.content.decode("utf-8", errors="replace")

    def raise_for_status(self):
        """Raise httpx.HTTPStatusError when the status code is 400 or above."""
        if self.status_code >= 400:
            request = httpx.Request("GET", self.url)
            raise httpx.HTTPStatusError(
                f"HTTP {self.status_code} for {self.url}",
                request=request,
                response=httpx.Response(self.status_code, request=request),
            )


def _get_public_url(url: str, headers: dict, timeout: int, max_redirects: int = 5,
                    max_bytes: Optional[int] = None) -> "_CappedFetch":
    """Capped streaming GET with SSRF-guarded manual redirects.

    The body is streamed and buffering stops at ``max_bytes`` (default: the
    soft cap), so an oversized resource cannot be pulled into memory or the
    content cache in full. When Content-Length already declares a body over
    the hard ceiling, the fetch is refused before any body bytes are read.

    Args:
        url: Initial URL to fetch.
        headers: Request headers; may be None or empty. Accept-Encoding is
            always overridden with ``identity``.
        timeout: Timeout passed through to httpx.
        max_redirects: Number of redirects followed before giving up.
        max_bytes: Buffering cap. A falsy value (None or 0) selects the soft
            cap; the result is always clamped to the hard cap.

    Returns:
        A _CappedFetch for the final response. A redirect response without a
        Location header is returned as-is with an empty body.

    Raises:
        httpx.RequestError: The URL (or a redirect target) is not a public
            http(s) URL, the server answered with a compressed body, or the
            redirect limit was exceeded.
        BodyTooLargeError: Content-Length exceeds the hard cap.
        Errors raised by httpx while connecting or streaming are not caught
        here.
    """
    cap = min(max_bytes or WEB_FETCH_SOFT_MAX_BYTES, WEB_FETCH_HARD_MAX_BYTES)
    current = url
    for _ in range(max_redirects + 1):
        # Re-validated on every hop so a public URL cannot redirect us to an
        # internal one.
        if not _public_http_url(current):
            raise httpx.RequestError("Blocked private/internal URL",
                                     request=httpx.Request("GET", current))
        # Force identity content coding. With gzip/deflate the wire bytes
        # (and Content-Length) can be a small fraction of the decoded body, so
        # a tiny compressed response could pass the hard-cap preflight and then
        # expand past the ceiling in a single decoded chunk before the streamed
        # cap below can slice it. Identity makes Content-Length the true body
        # size and keeps each streamed chunk bounded by the network read.
        req_headers = dict(headers or {})
        req_headers["Accept-Encoding"] = "identity"
        with httpx.stream("GET", current, headers=req_headers, timeout=timeout,
                          follow_redirects=False) as response:
            if response.status_code in (301, 302, 303, 307, 308):
                location = response.headers.get("location")
                if not location:
                    return _CappedFetch(response.status_code, response.headers, b"",
                                        False, None, response.encoding,
                                        str(response.url))
                current = urljoin(str(response.url), location)
                continue
            # A server can ignore the identity request and still return a
            # compressed body; httpx.iter_bytes would then decode it, and a tiny
            # gzip can balloon into one decoded chunk far past the cap before we
            # slice. Refuse a compressed Content-Encoding so the streamed cap
            # stays a real memory bound (Content-Length is the compressed wire
            # length here, so the preflight and size metadata are unreliable too).
            enc = (response.headers.get("content-encoding") or "").strip().lower()
            if enc and enc != "identity":
                raise httpx.RequestError(
                    f"Refusing compressed response (Content-Encoding: {enc}) after "
                    "requesting identity: cannot bound decoded body size",
                    request=httpx.Request("GET", current),
                )
            declared = None
            raw_len = (response.headers.get("content-length") or "").strip()
            # str.isdigit() alone also accepts non-ASCII digit characters
            # (e.g. superscripts) that int() rejects; require ASCII so a
            # malformed header is ignored instead of raising ValueError.
            if raw_len.isascii() and raw_len.isdigit():
                declared = int(raw_len)
            # Refuse before buffering anything when the server already tells
            # us the body exceeds the absolute ceiling (Content-Length is wire
            # bytes; the decompressed body can only be larger).
            if declared is not None and declared > WEB_FETCH_HARD_MAX_BYTES:
                raise BodyTooLargeError(current, declared)
            chunks = []
            read = 0
            truncated = False
            # We requested identity above, so iter_bytes yields the raw body in
            # network-read-sized chunks (no decompression expansion); the cap
            # therefore bounds what we actually buffer.
            for chunk in response.iter_bytes():
                read += len(chunk)
                if read > cap:
                    # Keep only the part of this chunk that still fits.
                    keep = cap - (read - len(chunk))
                    if keep > 0:
                        chunks.append(chunk[:keep])
                    truncated = True
                    break
                chunks.append(chunk)
            return _CappedFetch(response.status_code, response.headers,
                                b"".join(chunks), truncated, declared,
                                response.encoding, str(response.url))
    raise httpx.RequestError("Too many redirects", request=httpx.Request("GET", current))


# PDF extraction (optional dependency)
try:
    from pdfminer.high_level import extract_text as pdf_extract_text
except ImportError:
    pdf_extract_text = None  # type: ignore


# ----------------------------------------------------------------------
# HTML extraction helpers
# ----------------------------------------------------------------------
def _extract_meta(soup: BeautifulSoup) -> dict:
    """Pull meta description and keywords if present.

    Returns a dict with the keys ``description`` and ``keywords``; each is
    an empty string when no matching tag with content is found. The name
    match is a case-insensitive substring search, and the first matching
    tag wins.
    """
    description = ""
    keywords = ""
    desc_tag = soup.find("meta", attrs={"name": re.compile("description", re.I)})
    if desc_tag and desc_tag.get("content"):
        description = desc_tag["content"].strip()
    kw_tag = soup.find("meta", attrs={"name": re.compile("keywords", re.I)})
    if kw_tag and kw_tag.get("content"):
        keywords = kw_tag["content"].strip()
    return {"description": description, "keywords": keywords}


def _extract_og_image(soup: BeautifulSoup) -> str:
    """Extract the best representative image URL from meta tags.

    Only returns absolute http(s) URLs -- skips relative paths and data URIs.
    Candidates are tried in order: Open Graph image properties, then
    ``twitter:image``, then ``thumbnail``. URLs pointing at ``.svg`` or
    ``.ico`` files are skipped; the check ignores letter case and looks at
    the URL path as well, so a query string or fragment does not hide the
    extension. Returns an empty string when nothing qualifies.
    """
    skipped_suffixes = (".svg", ".ico")
    candidates = []
    for prop in ("og:image", "og:image:url", "og:image:secure_url"):
        tag = soup.find("meta", attrs={"property": prop})
        if tag and tag.get("content", "").strip():
            candidates.append(tag["content"].strip())
    for name in ("twitter:image", "thumbnail"):
        tag = soup.find("meta", attrs={"name": name})
        if tag and tag.get("content", "").strip():
            candidates.append(tag["content"].strip())
    for url in candidates:
        lowered = url.lower()
        if not lowered.startswith(("https://", "http://")):
            continue
        try:
            path = urlparse(url).path.lower()
        except ValueError:
            # Unparseable URL: fall back to checking the raw string only.
            path = ""
        if lowered.endswith(skipped_suffixes) or path.endswith(skipped_suffixes):
            continue
        return url
    return ""