mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 01:35:36 -04:00
074a1e6eff
* fix(search): add download budgets to web_fetch with truncation notice and hard ceiling MAX_OUTPUT_CHARS only trims what the agent sees; fetch_webpage_content buffered and cached the entire response body first, so a large or hostile URL could pull arbitrarily many bytes into memory and the content cache. The fetch is now a capped streaming GET (SSRF redirect guard unchanged): a soft default budget (WEB_FETCH_SOFT_MAX_BYTES, 2 MB), a per-call override via full/max_bytes on the web_fetch tool, and a hard ceiling (WEB_FETCH_HARD_MAX_BYTES, 20 MB) that the override can never exceed. When Content-Length already declares a body over the ceiling the fetch is refused before any body bytes are buffered. Truncated results carry truncated/fetched_bytes/total_bytes, the tool output leads with a partial-content notice telling the model how to re-fetch with full=true, and the tool schema documents the flag. A truncated PDF is reported as a budget error since a cut PDF is unparseable. The effective cap is part of the content-cache key so a truncated fetch is never served to a full-budget request. Existing tests that faked httpx.get or the old _get_public_url signature are adapted to the streaming interface; behavior pins are unchanged. Fixes #3812 * fix(search): close compressed-body cap bypass and protect the partial notice Addresses RaresKeY's review on #3955: - Force Accept-Encoding: identity for the capped fetch. With gzip/deflate the wire bytes (and Content-Length) can be a fraction of the decoded body, so a tiny compressed response could pass the hard-cap preflight and then expand past the ceiling in a single decoded chunk before the streamed cap could slice it. Identity makes Content-Length the true body size and keeps each streamed chunk bounded by the network read, so the hard ceiling actually bounds memory. - Lead web_fetch output with the partial-content notice and cap the page title. 
The notice is the user-facing contract for partial fetches, but the title is untrusted, uncapped page content; placed ahead of the notice a giant title could push it past MAX_OUTPUT_CHARS and drop it. The notice now leads and the title is capped as a second guard. Adds regressions: the fetch advertises identity encoding, and a truncated result with an oversized title still surfaces the partial notice. * fix(search): reject compressed responses that ignore the identity request Requesting Accept-Encoding: identity is not enough on its own: a server can ignore it and still return Content-Encoding: gzip, and httpx.iter_bytes would decode that, so a tiny compressed body could balloon into one decoded chunk far past the hard cap before the streamed loop slices it (and Content-Length, the compressed wire length, makes the preflight and size metadata unreliable). Refuse a non-identity Content-Encoding before reading the body. Adds a regression where the server ignores the identity request and returns gzip; the fetch is refused before any body is decoded.
599 lines
24 KiB
Python
599 lines
24 KiB
Python
"""Webpage content fetching with caching, PDF extraction, and summarization helpers."""
|
|
|
|
import copy
|
|
import io
|
|
import ipaddress
|
|
import json
|
|
import os
|
|
import re
|
|
import logging
|
|
import socket
|
|
from datetime import datetime, timedelta
|
|
from typing import List
|
|
from urllib.parse import urljoin, urlparse
|
|
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
|
|
from src.constants import WEB_FETCH_SOFT_MAX_BYTES, WEB_FETCH_HARD_MAX_BYTES
|
|
|
|
from .analytics import RateLimitError, error_logger
|
|
from .cache import (
|
|
CONTENT_CACHE_DIR,
|
|
content_cache_index,
|
|
generate_cache_key,
|
|
cleanup_cache,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_PRIVATE_NETWORKS = (
|
|
ipaddress.ip_network("0.0.0.0/8"),
|
|
ipaddress.ip_network("10.0.0.0/8"),
|
|
ipaddress.ip_network("127.0.0.0/8"),
|
|
ipaddress.ip_network("169.254.0.0/16"),
|
|
ipaddress.ip_network("172.16.0.0/12"),
|
|
ipaddress.ip_network("192.168.0.0/16"),
|
|
ipaddress.ip_network("::1/128"),
|
|
ipaddress.ip_network("fc00::/7"),
|
|
ipaddress.ip_network("fe80::/10"),
|
|
)
|
|
|
|
|
|
def _is_private_address(addr: ipaddress._BaseAddress) -> bool:
|
|
if isinstance(addr, ipaddress.IPv6Address) and addr.ipv4_mapped is not None:
|
|
addr = addr.ipv4_mapped
|
|
return (
|
|
addr.is_private
|
|
or addr.is_loopback
|
|
or addr.is_link_local
|
|
or addr.is_reserved
|
|
or addr.is_multicast
|
|
or addr.is_unspecified
|
|
or any(addr in net for net in _PRIVATE_NETWORKS)
|
|
)
|
|
|
|
|
|
def _resolve_hostname_ips(hostname: str) -> list[ipaddress._BaseAddress]:
|
|
try:
|
|
infos = socket.getaddrinfo(hostname, None)
|
|
except Exception:
|
|
return []
|
|
out = []
|
|
for info in infos:
|
|
try:
|
|
out.append(ipaddress.ip_address(info[4][0]))
|
|
except Exception:
|
|
continue
|
|
return out
|
|
|
|
|
|
def _public_http_url(url: str) -> bool:
|
|
try:
|
|
parsed = urlparse(url)
|
|
if parsed.scheme not in ("http", "https"):
|
|
return False
|
|
host = (parsed.hostname or "").strip()
|
|
if not host:
|
|
return False
|
|
lower = host.lower()
|
|
if lower in ("localhost", "metadata", "metadata.google.internal"):
|
|
return False
|
|
if lower.endswith((".local", ".localhost", ".internal", ".lan", ".intranet")):
|
|
return False
|
|
try:
|
|
return not _is_private_address(ipaddress.ip_address(host))
|
|
except ValueError:
|
|
pass
|
|
addrs = _resolve_hostname_ips(host)
|
|
return bool(addrs) and not any(_is_private_address(a) for a in addrs)
|
|
except Exception:
|
|
return False
|
|
|
|
|
|
class BodyTooLargeError(Exception):
    """Raised when the declared body size is above the hard fetch ceiling.

    Attributes:
        url: The URL whose response was refused.
        declared_bytes: The size that was declared for the body.
    """

    def __init__(self, url: str, declared_bytes: int):
        message = (
            f"response body is {declared_bytes:,} bytes, over the "
            f"{WEB_FETCH_HARD_MAX_BYTES:,}-byte hard cap"
        )
        self.url = url
        self.declared_bytes = declared_bytes
        super().__init__(message)
|
|
|
|
|
|
class _CappedFetch:
|
|
"""Result of a size-capped streaming GET.
|
|
|
|
Carries just what fetch_webpage_content needs from an httpx.Response,
|
|
plus the cap bookkeeping: the (possibly truncated) body, whether the
|
|
cap cut it short, and the size the server declared via Content-Length
|
|
(wire bytes; None when absent).
|
|
"""
|
|
|
|
__slots__ = ("status_code", "headers", "content", "truncated",
|
|
"declared_bytes", "encoding", "url")
|
|
|
|
def __init__(self, status_code, headers, content, truncated,
|
|
declared_bytes, encoding, url):
|
|
self.status_code = status_code
|
|
self.headers = headers
|
|
self.content = content
|
|
self.truncated = truncated
|
|
self.declared_bytes = declared_bytes
|
|
self.encoding = encoding
|
|
self.url = url
|
|
|
|
@property
|
|
def text(self) -> str:
|
|
return self.content.decode(self.encoding or "utf-8", errors="replace")
|
|
|
|
def raise_for_status(self):
|
|
if self.status_code >= 400:
|
|
request = httpx.Request("GET", self.url)
|
|
raise httpx.HTTPStatusError(
|
|
f"HTTP {self.status_code} for {self.url}",
|
|
request=request,
|
|
response=httpx.Response(self.status_code, request=request),
|
|
)
|
|
|
|
|
|
def _get_public_url(url: str, headers: dict, timeout: int, max_redirects: int = 5,
                    max_bytes: int = None) -> "_CappedFetch":
    """Capped streaming GET with SSRF-guarded manual redirects.

    The body is streamed and buffering stops at ``max_bytes`` (default: the
    soft cap), so an oversized resource cannot be pulled into memory or the
    content cache in full. When Content-Length already declares a body over
    the hard ceiling, the fetch is refused before any body bytes are read.

    Args:
        url: Initial URL; every hop (including redirects) must pass
            ``_public_http_url``.
        headers: Request headers. Any Accept-Encoding entry is replaced with
            ``identity``.
        timeout: Passed through to ``httpx.stream``.
        max_redirects: Maximum number of redirects followed.
        max_bytes: Requested byte budget; clamped to WEB_FETCH_HARD_MAX_BYTES.
            A falsy value selects WEB_FETCH_SOFT_MAX_BYTES.

    Returns:
        A ``_CappedFetch`` holding the (possibly truncated) body. A redirect
        status without a Location header is returned as-is with an empty body.

    Raises:
        httpx.RequestError: Blocked private/internal URL, a compressed
            response despite the identity request, or too many redirects.
        BodyTooLargeError: Content-Length exceeds the hard ceiling.
    """
    cap = min(max_bytes or WEB_FETCH_SOFT_MAX_BYTES, WEB_FETCH_HARD_MAX_BYTES)
    current = url
    for _ in range(max_redirects + 1):
        if not _public_http_url(current):
            raise httpx.RequestError("Blocked private/internal URL", request=httpx.Request("GET", current))
        # Force an identity content coding. With gzip/deflate the wire bytes
        # (and Content-Length) can be a small fraction of the decoded body, so
        # a tiny compressed response could pass the hard-cap preflight and then
        # expand past the ceiling in a single decoded chunk before the streamed
        # cap below can slice it. Identity makes Content-Length the true body
        # size and keeps each streamed chunk bounded by the network read.
        # Header names are case-insensitive, so drop any caller-supplied
        # spelling first; otherwise e.g. "accept-encoding: gzip" would be sent
        # alongside the identity request.
        req_headers = {
            name: value
            for name, value in dict(headers or {}).items()
            if name.lower() != "accept-encoding"
        }
        req_headers["Accept-Encoding"] = "identity"
        with httpx.stream("GET", current, headers=req_headers, timeout=timeout,
                          follow_redirects=False) as response:
            if response.status_code in (301, 302, 303, 307, 308):
                location = response.headers.get("location")
                if not location:
                    return _CappedFetch(response.status_code, response.headers, b"",
                                        False, None, response.encoding, str(response.url))
                # Resolve relative redirects against the URL that answered; the
                # next loop iteration re-runs the public-URL guard on the target.
                current = urljoin(str(response.url), location)
                continue

            # A server can ignore the identity request and still return a
            # compressed body; httpx.iter_bytes would then decode it, and a tiny
            # gzip can balloon into one decoded chunk far past the cap before we
            # slice. Refuse a compressed Content-Encoding so the streamed cap
            # stays a real memory bound (Content-Length is the compressed wire
            # length here, so the preflight and size metadata are unreliable too).
            enc = (response.headers.get("content-encoding") or "").strip().lower()
            if enc and enc != "identity":
                raise httpx.RequestError(
                    f"Refusing compressed response (Content-Encoding: {enc}) after "
                    "requesting identity: cannot bound decoded body size",
                    request=httpx.Request("GET", current),
                )

            declared = None
            raw_len = response.headers.get("content-length")
            # str.isdigit() alone is not a safe guard for int(): it accepts
            # non-ASCII digit characters such as "²", and a very long digit run
            # can exceed the interpreter's int-conversion limit. Either would
            # raise ValueError, which callers do not catch. An unparseable
            # value is treated as "not declared"; the streamed cap below still
            # bounds what is buffered.
            if raw_len and raw_len.isascii() and raw_len.isdigit():
                try:
                    declared = int(raw_len)
                except ValueError:
                    declared = None
            # Refuse before buffering anything when the server already tells
            # us the body exceeds the absolute ceiling. Compressed responses
            # were rejected above, so Content-Length is the real body size.
            if declared is not None and declared > WEB_FETCH_HARD_MAX_BYTES:
                raise BodyTooLargeError(current, declared)

            chunks = []
            read = 0
            truncated = False
            # We requested identity above, so iter_bytes yields the raw body in
            # network-read-sized chunks (no decompression expansion); the cap
            # therefore bounds what we actually buffer.
            for chunk in response.iter_bytes():
                read += len(chunk)
                if read > cap:
                    # Keep only the part of this chunk that still fits the cap.
                    keep = cap - (read - len(chunk))
                    if keep > 0:
                        chunks.append(chunk[:keep])
                    truncated = True
                    break
                chunks.append(chunk)
            return _CappedFetch(response.status_code, response.headers,
                                b"".join(chunks), truncated, declared,
                                response.encoding, str(response.url))
    raise httpx.RequestError("Too many redirects", request=httpx.Request("GET", current))
|
|
|
|
# PDF extraction (optional dependency)
|
|
try:
|
|
from pdfminer.high_level import extract_text as pdf_extract_text
|
|
except ImportError:
|
|
pdf_extract_text = None # type: ignore
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# HTML extraction helpers
|
|
# ----------------------------------------------------------------------
|
|
def _extract_meta(soup: BeautifulSoup) -> dict:
    """Return the page's meta description and keywords (empty strings if absent)."""

    def _first_content(name_fragment: str) -> str:
        # First <meta> whose name attribute matches the fragment, case-insensitively.
        tag = soup.find("meta", attrs={"name": re.compile(name_fragment, re.I)})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    return {
        "description": _first_content("description"),
        "keywords": _first_content("keywords"),
    }
|
|
|
|
|
|
def _extract_og_image(soup: BeautifulSoup) -> str:
    """Pick a representative image URL from the page's meta tags.

    Open Graph tags are consulted first, then ``twitter:image`` and
    ``thumbnail``. Only absolute http(s) URLs are returned (relative paths and
    data URIs are skipped), and URLs ending in ``.svg``/``.ico`` are ignored.
    Returns an empty string when nothing qualifies.
    """
    lookups = [
        {"property": prop}
        for prop in ("og:image", "og:image:url", "og:image:secure_url")
    ]
    lookups.append({"name": "twitter:image"})
    lookups.append({"name": "thumbnail"})

    candidates = []
    for attrs in lookups:
        tag = soup.find("meta", attrs=attrs)
        if not tag:
            continue
        value = tag.get("content", "").strip()
        if value:
            candidates.append(value)

    usable = (
        candidate
        for candidate in candidates
        if candidate.startswith(("https://", "http://"))
        and not candidate.endswith((".svg", ".ico"))
    )
    return next(usable, "")
|
|
|
|
|
|
def _extract_lists(soup: BeautifulSoup) -> List[List[str]]:
    """Return one list of item texts per non-empty <ul>/<ol> in the document."""
    per_list = (
        [li.get_text(separator=" ", strip=True) for li in node.find_all("li")]
        for node in soup.find_all(["ul", "ol"])
    )
    return [items for items in per_list if items]
|
|
|
|
|
|
def _extract_tables(soup: BeautifulSoup) -> List[List[List[str]]]:
    """Return every non-empty table as a list of rows of cell texts."""

    def _rows_of(table):
        # One list of <td>/<th> texts per <tr>; rows without cells are dropped.
        cell_rows = (
            [cell.get_text(separator=" ", strip=True) for cell in tr.find_all(["td", "th"])]
            for tr in table.find_all("tr")
        )
        return [row for row in cell_rows if row]

    parsed = (_rows_of(table) for table in soup.find_all("table"))
    return [rows for rows in parsed if rows]
|
|
|
|
|
|
def _extract_code_blocks(soup: BeautifulSoup) -> List[str]:
    """Return the non-empty text of every <pre> and <code> element."""
    texts = (
        tag.get_text(separator=" ", strip=True)
        for tag in soup.find_all(["pre", "code"])
    )
    return [text for text in texts if text]
|
|
|
|
|
|
def _detect_js_frameworks(soup: BeautifulSoup) -> bool:
    """Naively guess whether the page relies on a common JS framework.

    True when any <script> ``src`` or inline script text contains one of the
    framework names (case-insensitive substring match), or when an element
    carries a ``data-reactroot`` or ``ng-app`` attribute.
    """
    markers = (
        "react", "angular", "vue", "svelte", "next", "nuxt",
        "ember", "backbone", "jquery", "polymer", "mithril",
    )

    def _mentions_framework(haystack: str) -> bool:
        return any(marker in haystack for marker in markers)

    for script in soup.find_all("script"):
        if _mentions_framework(script.get("src", "").lower()):
            return True
        inline = script.string
        if inline and _mentions_framework(inline.lower()):
            return True

    if soup.find(attrs={"data-reactroot": True}):
        return True
    if soup.find(attrs={"ng-app": True}):
        return True
    return False
|
|
|
|
|
|
def _empty_result(url: str, error: str = "") -> dict:
|
|
"""Build a standard failure result dict."""
|
|
return {
|
|
"url": url,
|
|
"title": "",
|
|
"content": "",
|
|
"lists": [],
|
|
"tables": [],
|
|
"code_blocks": [],
|
|
"meta_description": "",
|
|
"meta_keywords": "",
|
|
"js_rendered": False,
|
|
"js_message": "",
|
|
"success": False,
|
|
"error": error,
|
|
}
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Main content fetcher
|
|
# ----------------------------------------------------------------------
|
|
def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0,
                          max_bytes: int = None) -> dict:
    """Fetch and extract meaningful content from a webpage with caching.

    ``max_bytes`` raises the download budget per call (clamped to the hard
    cap); the default is the soft cap. When the body is cut short the result
    carries ``truncated``/``fetched_bytes``/``total_bytes`` so callers can
    tell the model the content is partial (#3812).

    Args:
        url: Page to fetch.
        timeout: Passed through to ``_get_public_url``.
        retry_attempt: Only used in log and error messages here.
        max_bytes: Requested byte budget; a falsy value selects the soft cap.

    Returns:
        A result dict. Failures use the ``_empty_result`` shape with
        ``success`` False and a descriptive ``error``. Results built after a
        successful fetch additionally carry the size fields; the HTML branch
        also carries ``og_image``.

    Caching:
        Results are read from and written to a per-URL-and-cap cache file and
        are considered fresh for two hours. Fetch-stage failures (too large,
        HTTP status, network, rate limit) and the truncated-PDF error are
        returned without being written to the cache; PDF, plain-text, HTML
        and HTML parse-error results are cached.
    """
    effective_cap = min(max_bytes or WEB_FETCH_SOFT_MAX_BYTES, WEB_FETCH_HARD_MAX_BYTES)
    # The cap is part of the cache identity: a truncated soft-cap fetch must
    # not be served to a later full-budget request for the same URL.
    cache_key = generate_cache_key(f"{url}#cap={effective_cap}")
    cache_file = CONTENT_CACHE_DIR / f"{cache_key}.cache"

    # Check cache
    if cache_file.exists():
        try:
            with open(cache_file, "r", encoding="utf-8") as f:
                cached_data = json.load(f)
            timestamp = datetime.fromisoformat(cached_data["timestamp"])
            if datetime.now() - timestamp < timedelta(hours=2):
                logger.debug(f"Content cache hit for URL: {url}")
                return cached_data["data"]
            else:
                # Stale entry: drop the file and its index record, then refetch.
                cache_file.unlink(missing_ok=True)
                content_cache_index.pop(cache_key, None)
        except Exception as e:
            # Unreadable or malformed cache entry: discard it and refetch.
            logger.warning(f"Failed to read content cache for {url}: {e}")
            cache_file.unlink(missing_ok=True)
            content_cache_index.pop(cache_key, None)

    # Fetch
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            # identity so the streamed size cap in _get_public_url stays honest
            # (a compressed body can decode to far more than Content-Length).
            "Accept-Encoding": "identity",
            "Connection": "keep-alive",
        }
        response = _get_public_url(url, headers=headers, timeout=timeout,
                                   max_bytes=effective_cap)

        # 429 is surfaced as RateLimitError rather than a generic HTTP error.
        if response.status_code == 429:
            raise RateLimitError(f"Rate limit hit for {url} (attempt {retry_attempt})")

        response.raise_for_status()
    except BodyTooLargeError as e:
        error_logger.warning(f"Refused oversized body for {url}: {e}")
        return _empty_result(url, f"TooLarge: {e}")
    except httpx.HTTPStatusError as e:
        error_logger.warning(f"HTTP {e.response.status_code} fetching {url}: {e}")
        return _empty_result(url, f"HTTP {e.response.status_code}: {e}")
    except httpx.RequestError as e:
        error_logger.error(f"NetworkError fetching {url} (attempt {retry_attempt}): {e}")
        return _empty_result(url, f"NetworkError: {e}")
    except RateLimitError as e:
        error_logger.error(str(e))
        return _empty_result(url, str(e))

    # Size bookkeeping shared by every content branch below. getattr keeps
    # plain httpx.Response stand-ins (tests) working without the cap fields.
    _size_fields = {
        "truncated": getattr(response, "truncated", False),
        "fetched_bytes": len(response.content),
        "total_bytes": getattr(response, "declared_bytes", None),
    }

    # PDF handling
    content_type = response.headers.get("Content-Type", "").lower()
    if "application/pdf" in content_type or url.lower().endswith(".pdf"):
        if _size_fields["truncated"]:
            # A PDF cut mid-stream is not parseable; unlike text there is no
            # useful partial result, so report the budget problem instead.
            _declared = _size_fields["total_bytes"]
            return _empty_result(
                url,
                f"TooLarge: PDF exceeds the {effective_cap:,}-byte fetch budget"
                + (f" (size {_declared:,} bytes)" if _declared else "")
                + "; retry with a larger budget if it fits under the hard cap",
            )
        if pdf_extract_text is None:
            logger.error("pdfminer.six is not installed; cannot extract PDF text.")
            pdf_text = ""
        else:
            try:
                pdf_bytes = io.BytesIO(response.content)
                pdf_text = pdf_extract_text(pdf_bytes)
            except Exception as e:
                # Extraction failure degrades to an empty, unsuccessful result.
                logger.warning(f"PDF extraction failed for {url}: {e}")
                pdf_text = ""
        result = {
            "url": url,
            "title": os.path.basename(url),
            "content": pdf_text,
            "lists": [],
            "tables": [],
            "code_blocks": [],
            "meta_description": "",
            "meta_keywords": "",
            "js_rendered": False,
            "js_message": "",
            "success": bool(pdf_text),
            "error": "" if pdf_text else "Failed to extract PDF text",
            **_size_fields,
        }
        _cache_result(cache_file, cache_key, result, url)
        return result

    # Plain-text / Markdown / JSON handling. Sources like
    # raw.githubusercontent.com serve Markdown as `text/plain`, JSON APIs and
    # raw config files serve `application/json`, and a lot of code and tool
    # docs live in `.md` / `.txt`. These have no HTML structure, so the HTML
    # branch below would extract nothing and report "no readable text content".
    # Return the body verbatim instead. The `is_html` guard keeps real HTML
    # (including `application/xhtml+xml`) on the parsing path; the `json` check
    # covers `application/json` and `+json` suffixes; the URL-suffix fallback
    # catches servers that mislabel text files as `application/octet-stream`.
    is_html = "html" in content_type
    is_json = "json" in content_type
    # Path portion only: the query string and fragment are cut off before the
    # suffix test.
    url_path = url.lower().split("?", 1)[0].split("#", 1)[0]
    looks_like_text_file = url_path.endswith(
        (".md", ".markdown", ".txt", ".text", ".json", ".jsonl")
    )
    if not is_html and (content_type.startswith("text/") or is_json or looks_like_text_file):
        text_body = (response.text or "").strip()
        result = {
            "url": url,
            "title": os.path.basename(url_path) or url,
            "content": text_body,
            "lists": [],
            "tables": [],
            "code_blocks": [],
            "meta_description": "",
            "meta_keywords": "",
            "js_rendered": False,
            "js_message": "",
            "success": bool(text_body),
            "error": "" if text_body else "Empty response body",
            **_size_fields,
        }
        _cache_result(cache_file, cache_key, result, url)
        return result

    # HTML handling (also the fallback for every content type not handled above)
    try:
        soup = BeautifulSoup(response.text, "html.parser")
    except Exception as e:
        error_logger.error(f"ParseError parsing HTML from {url} (attempt {retry_attempt}): {e}")
        result = _empty_result(url, f"ParseError: {e}")
        # Note: the parse failure itself is cached like any other result.
        _cache_result(cache_file, cache_key, result, url)
        return result

    title_tag = soup.find("title")
    title_text = title_tag.get_text(strip=True) if title_tag else ""
    meta_info = _extract_meta(soup)
    og_image = _extract_og_image(soup)
    js_rendered = _detect_js_frameworks(soup)
    js_message = "Page appears to be rendered by a JavaScript framework; content may be incomplete." if js_rendered else ""

    # Main textual content (heuristic): prefer semantic / "content"-classed
    # containers to skip nav/footer/boilerplate; tuned for article pages.
    main_content = ""
    content_areas = soup.find_all(
        ["main", "article", "section", "div"],
        class_=re.compile("content|main|body|article|post|entry|text", re.I),
    )
    if content_areas:
        # Only the first three matching containers contribute text.
        for area in content_areas[:3]:
            main_content += area.get_text(separator=" ", strip=True) + " "
        main_content = re.sub(r"\s+", " ", main_content).strip()

    # If the heuristic finds only a tiny wrapper, fall back to body text with
    # obvious boilerplate stripped so UI/deep-research search results do not
    # look empty for app/landing pages.
    THIN_CONTENT_CHARS = 600
    if len(main_content) < THIN_CONTENT_CHARS:
        body = soup.find("body")
        if body:
            # Strip noise from a copy so the extractors below still see the
            # original, unmodified soup.
            body_copy = copy.copy(body)
            for noise in body_copy.find_all(
                ["script", "style", "noscript", "template", "nav", "header", "footer", "aside"]
            ):
                noise.extract()
            body_text = re.sub(r"\s+", " ", body_copy.get_text(separator=" ", strip=True)).strip()
            # The fallback only wins when it yields more text than the heuristic.
            if len(body_text) > len(main_content):
                main_content = body_text

    result = {
        "url": url,
        "title": title_text,
        "content": main_content,
        "lists": _extract_lists(soup),
        "tables": _extract_tables(soup),
        "code_blocks": _extract_code_blocks(soup),
        "meta_description": meta_info.get("description", ""),
        "meta_keywords": meta_info.get("keywords", ""),
        "og_image": og_image,
        "js_rendered": js_rendered,
        "js_message": js_message,
        "success": True,
        "error": "",
        **_size_fields,
    }
    _cache_result(cache_file, cache_key, result, url)
    return result
|
|
|
|
|
|
def _cache_result(cache_file, cache_key: str, result: dict, url: str):
    """Persist ``result`` to ``cache_file`` and record it in the cache index.

    The payload is a JSON object with an ISO ``timestamp`` and the result
    under ``data``. After writing, the index entry is refreshed and
    ``cleanup_cache`` is invoked with a two-hour window. Best-effort: any
    failure is logged as a warning and swallowed.
    """
    try:
        envelope = {"timestamp": datetime.now().isoformat(), "data": result}
        with open(cache_file, "w", encoding="utf-8") as handle:
            json.dump(envelope, handle)
        content_cache_index[cache_key] = datetime.now()
        max_age = timedelta(hours=2)
        cleanup_cache(CONTENT_CACHE_DIR, content_cache_index, max_age)
    except Exception as exc:
        logger.warning(f"Failed to write content cache for {url}: {exc}")
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Content summarization helpers
|
|
# ----------------------------------------------------------------------
|
|
def extract_key_points(text: str) -> List[str]:
    """Collect the text of bullet-style and numbered lines in ``text``.

    A line counts when it starts (after optional whitespace) with ``-``,
    ``*`` or ``•``, or with digits followed by ``.`` or ``)``, and then
    whitespace. The marker is dropped and the remainder is stripped.
    """
    bullet_line = re.compile(r"^\s*[-*•]\s+(.*)")
    numbered_line = re.compile(r"^\s*\d+[\.\)]\s+(.*)")
    hits = (
        bullet_line.match(line) or numbered_line.match(line)
        for line in text.splitlines()
    )
    return [hit.group(1).strip() for hit in hits if hit]
|
|
|
|
|
|
def get_tldr(text: str, max_sentences: int = 3) -> str:
    """Return the first ``max_sentences`` sentences of ``text`` joined by spaces.

    Sentences are split on whitespace that follows ``.``, ``!`` or ``?``;
    empty pieces are ignored and each kept sentence is stripped.
    """
    pieces = filter(None, re.split(r"(?<=[.!?])\s+", text))
    leading = [piece.strip() for piece in pieces][:max_sentences]
    return " ".join(leading)
|
|
|
|
|
|
def extract_quotes(text: str) -> List[str]:
    """Return quoted excerpts that are at least 15 characters long."""
    # The backreference forces the closing quote to be the same character as
    # the opening one, so `"text'` (open double, close single) is not a quote.
    # findall yields (quote_char, body) pairs because the pattern has two groups.
    pairs = re.findall(r'(["\'])([^"\']{15,}?)\1', text)
    return [body.strip() for _quote, body in pairs]
|
|
|
|
|
|
def extract_statistics(text: str) -> List[str]:
    """Find numbers, percentages, dates and simple measurements."""
    # A number is either comma-grouped (1,000,000) or a plain digit run
    # (50000), with an optional decimal part, optionally followed by a unit:
    # a percent/per-mille marker or a single alphabetic word.
    hits = re.finditer(
        r"\b(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?\s*(%|percent|‰|per cent|[a-zA-Z]+)?",
        text,
        flags=re.IGNORECASE,
    )
    return [hit.group(0).strip() for hit in hits]
|