Add SSRF-guarded web fetch agent tool

* feat(web-fetch): add web_fetch tool to read a specific URL's content

* test(web-fetch): add SSRF coverage and fail closed on empty DNS resolution

Add explicit SSRF regression tests for the web_fetch path covering
loopback, private LAN ranges, link-local/metadata, IPv6 private/local,
redirect-into-private, and unsupported schemes. Harden _public_http_url
to fail closed when a hostname resolves to no addresses.
This commit is contained in:
Rifqi Akram
2026-06-01 14:57:28 +07:00
committed by GitHub
parent 92c2392fd6
commit 5b1e56407b
11 changed files with 192 additions and 10 deletions
+55
View File
@@ -195,6 +195,7 @@ _MCP_TOOL_MAP = {
"read_file": ("filesystem", "read_file"),
"write_file": ("filesystem", "write_file"),
"web_search": ("web_search", "web_search"),
"web_fetch": ("web_fetch", "web_fetch"),
"generate_image": ("image_gen", "generate_image"),
}
@@ -238,6 +239,7 @@ _MCP_ARG_PARSERS: Dict[str, callable] = {
"bash": lambda c: {"command": c},
"python": lambda c: {"code": c},
"web_search": lambda c: {"query": c.split("\n")[0].strip()},
"web_fetch": lambda c: {"url": c.split("\n")[0].strip()},
"read_file": lambda c: {"path": c.split("\n")[0].strip()},
"write_file": _parse_write_file,
"generate_image": _parse_generate_image,
@@ -464,6 +466,59 @@ async def _direct_fallback(
output += "\n\n<!-- SOURCES:" + _json.dumps(sources) + " -->"
return {"output": output, "exit_code": 0}
if tool == "web_fetch":
# Lightweight single-URL fetch. Wraps the SSRF-safe fetcher used
# by deep research, so private/loopback/metadata addresses are
# already blocked there.
from src.search.content import fetch_webpage_content
raw = content.strip()
url = ""
# Accept either a JSON arg ({"url": "..."}) or a plain URL/domain.
if raw.startswith("{"):
try:
parsed = _json.loads(raw)
if isinstance(parsed, dict):
url = str(parsed.get("url") or "").strip()
except _json.JSONDecodeError:
url = ""
if not url:
# Non-JSON input: take the first line only, so a URL followed by
# commentary still parses. JSON without a usable url also lands here,
# but its first line still starts with "{" and is rejected just below.
url = raw.split("\n")[0].strip()
# Reject anything that isn't a single bare URL/domain token.
if not url or url.startswith("{") or any(c in url for c in (" ", "\t", "\n")):
return {"error": "web_fetch: provide a single URL or domain, e.g. example.com", "exit_code": 1}
low = url.lower()
if "://" in low and not low.startswith(("http://", "https://")):
return {"error": f"web_fetch: unsupported URL scheme (only http/https): {url[:80]}", "exit_code": 1}
# Accept bare domains like "example.com" by defaulting to https.
if not low.startswith(("http://", "https://")):
url = "https://" + url
loop = asyncio.get_running_loop()
try:
result = await asyncio.wait_for(
loop.run_in_executor(None, lambda: fetch_webpage_content(url, timeout=10)),
timeout=30,
)
except asyncio.TimeoutError:
return {"error": f"web_fetch: timed out fetching {url}", "exit_code": 1}
err = result.get("error")
text = (result.get("content") or "").strip()
title = result.get("title") or ""
if not text:
if err:
return {"error": f"web_fetch: {url}: {err}", "exit_code": 1}
# No extractable text: non-HTML body, or a pure client-rendered
# shell. The agent can fall back to the builtin_browser tool.
return {"error": f"web_fetch: {url}: no readable text content (not HTML, or the page needs JS/login)", "exit_code": 1}
header = (f"# {title}\n" if title else "") + f"Source: {url}\n\n"
output = header + text
if len(output) > MAX_OUTPUT_CHARS:
output = output[:MAX_OUTPUT_CHARS] + "\n\n[...truncated]"
return {"output": output, "exit_code": 0}
# manage_memory / generate_image still live as MCP servers
# (mcp_servers/{memory,image_gen}_server.py); the MCP path above
# handles them.