Add SSRF-guarded web fetch agent tool

* feat(web-fetch): add web_fetch tool to read a specific URL's content

* test(web-fetch): add SSRF coverage and fail closed on empty DNS resolution

Add explicit SSRF regression tests for the web_fetch path covering
loopback, private LAN ranges, link-local/metadata, IPv6 private/local,
redirect-into-private, and unsupported schemes. Harden _public_http_url
to fail closed when a hostname resolves to no addresses.
This commit is contained in:
Rifqi Akram
2026-06-01 14:57:28 +07:00
committed by GitHub
parent 92c2392fd6
commit 5b1e56407b
11 changed files with 192 additions and 10 deletions
+28 -6
View File
@@ -1,5 +1,6 @@
"""Webpage content fetching with caching, PDF extraction, and summarization helpers."""
import copy
import io
import ipaddress
import json
@@ -61,9 +62,12 @@ def _public_http_url(url: str) -> bool:
except ValueError:
pass
try:
return all(not _is_private_address(ip) for ip in _resolve_hostname_ips(host))
ips = _resolve_hostname_ips(host)
except OSError:
return False
# Fail closed: a hostname that resolves to nothing is treated as
# non-public (an empty all(...) would otherwise return True).
return bool(ips) and all(not _is_private_address(ip) for ip in ips)
def _get_public_url(url: str, *, headers: dict, timeout: int) -> httpx.Response:
@@ -297,7 +301,8 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) ->
js_rendered = _detect_js_frameworks(soup)
js_message = "Page appears to be rendered by a JavaScript framework; content may be incomplete." if js_rendered else ""
# Main textual content (heuristic)
# Main textual content (heuristic): prefer semantic / "content"-classed
# containers to skip nav/footer/boilerplate; tuned for article pages.
main_content = ""
content_areas = soup.find_all(
["main", "article", "section", "div"],
@@ -306,12 +311,29 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) ->
if content_areas:
for area in content_areas[:3]:
main_content += area.get_text(separator=" ", strip=True) + " "
if not main_content:
main_content = re.sub(r"\s+", " ", main_content).strip()
# The class heuristic can latch onto a small wrapper and miss the real
# content (app/landing pages, or SSR sites whose body isn't in a
# "content"-classed div, so these came back nearly empty before). When the
# heuristic returns nothing OR suspiciously little, fall back to the full
# <body>, stripping scripts/styles (so JSON/JS doesn't leak into the text)
# plus nav/header/footer/aside (boilerplate), and keep whichever yields
# more readable text.
THIN_CONTENT_CHARS = 600 # below this the heuristic likely missed the page
if len(main_content) < THIN_CONTENT_CHARS:
body = soup.find("body")
if body:
main_content = body.get_text(separator=" ", strip=True)
main_content = re.sub(r"\s+", " ", main_content).strip()
# Strip from a copy so the later list/table/code extractors still
# see the original soup unmodified.
body_copy = copy.copy(body)
for _noise in body_copy.find_all(
["script", "style", "noscript", "template", "nav", "header", "footer", "aside"]
):
_noise.extract()
body_text = re.sub(r"\s+", " ", body_copy.get_text(separator=" ", strip=True)).strip()
if len(body_text) > len(main_content):
main_content = body_text
result = {
"url": url,