Add SSRF-guarded web fetch agent tool

* feat(web-fetch): add web_fetch tool to read a specific URL's content

* test(web-fetch): add SSRF coverage and fail closed on empty DNS resolution

Add explicit SSRF regression tests for the web_fetch path covering
loopback, private LAN ranges, link-local/metadata, IPv6 private/local,
redirect-into-private, and unsupported schemes. Harden _public_http_url
to fail closed when a hostname resolves to no addresses.
This commit is contained in:
Rifqi Akram
2026-06-01 14:57:28 +07:00
committed by GitHub
parent 92c2392fd6
commit 5b1e56407b
11 changed files with 192 additions and 10 deletions
+2 -1
View File
@@ -389,6 +389,7 @@ def setup_chat_routes(
disabled_tools.add("bash") disabled_tools.add("bash")
if str(allow_web_search).lower() != "true": if str(allow_web_search).lower() != "true":
disabled_tools.add("web_search") disabled_tools.add("web_search")
disabled_tools.add("web_fetch")
# Nobody/incognito mode: deny tools that would expose the user's # Nobody/incognito mode: deny tools that would expose the user's
# persistent memory, past chats, or other identity-linked data. # persistent memory, past chats, or other identity-linked data.
@@ -452,7 +453,7 @@ def setup_chat_routes(
disabled_tools.update(_compare_strip) disabled_tools.update(_compare_strip)
# In chat mode compare, disable ALL agent tools (no bash, python, file ops) # In chat mode compare, disable ALL agent tools (no bash, python, file ops)
if chat_mode == 'chat': if chat_mode == 'chat':
disabled_tools.update({"bash", "python", "read_file", "write_file", "web_search", "search_chats", "manage_tasks"}) disabled_tools.update({"bash", "python", "read_file", "write_file", "web_search", "web_fetch", "search_chats", "manage_tasks"})
async def stream_with_save() -> AsyncGenerator[str, None]: async def stream_with_save() -> AsyncGenerator[str, None]:
# _effective_mode is read-only here; closure captures it from # _effective_mode is read-only here; closure captures it from
+6
View File
@@ -199,6 +199,12 @@ Or with JSON for fresh news:
``` ```
Search the web for a SINGLE quick fact/lookup mid-task. For news / "today" / "latest" queries, pass `time_filter` ("day", "week", "month", or "year"). NOT for "research X" / "do research on X" / "look into X" requests — those mean a multi-source DEEP RESEARCH job: use `trigger_research` instead (it runs in the Deep Research sidebar and produces a full report). web_search = one quick query; trigger_research = a researched report.""", Search the web for a SINGLE quick fact/lookup mid-task. For news / "today" / "latest" queries, pass `time_filter` ("day", "week", "month", or "year"). NOT for "research X" / "do research on X" / "look into X" requests — those mean a multi-source DEEP RESEARCH job: use `trigger_research` instead (it runs in the Deep Research sidebar and produces a full report). web_search = one quick query; trigger_research = a researched report.""",
"web_fetch": """\
```web_fetch
<url or domain>
```
Fetch and read the text content of a SPECIFIC URL the user names (e.g. "check example.com", "what does this page say <url>"). A bare domain like `example.com` works (defaults to https). Use this when you already have a concrete URL. For open-ended lookups use `web_search`, and for "research X" jobs use `trigger_research`.""",
"read_file": """\ "read_file": """\
```read_file ```read_file
<file path> <file path>
+1 -1
View File
@@ -26,7 +26,7 @@ MAX_OUTPUT_CHARS = 10_000
MAX_READ_CHARS = 20_000 MAX_READ_CHARS = 20_000
# Tool types that trigger execution # Tool types that trigger execution
TOOL_TAGS = {"bash", "python", "web_search", "read_file", "write_file", TOOL_TAGS = {"bash", "python", "web_search", "web_fetch", "read_file", "write_file",
"create_document", "update_document", "edit_document", "create_document", "update_document", "edit_document",
"search_chats", "search_chats",
"chat_with_model", "create_session", "list_sessions", "chat_with_model", "create_session", "list_sessions",
+28 -6
View File
@@ -1,5 +1,6 @@
"""Webpage content fetching with caching, PDF extraction, and summarization helpers.""" """Webpage content fetching with caching, PDF extraction, and summarization helpers."""
import copy
import io import io
import ipaddress import ipaddress
import json import json
@@ -61,9 +62,12 @@ def _public_http_url(url: str) -> bool:
except ValueError: except ValueError:
pass pass
try: try:
return all(not _is_private_address(ip) for ip in _resolve_hostname_ips(host)) ips = _resolve_hostname_ips(host)
except OSError: except OSError:
return False return False
# Fail closed: a hostname that resolves to nothing is treated as
# non-public (an empty all(...) would otherwise return True).
return bool(ips) and all(not _is_private_address(ip) for ip in ips)
def _get_public_url(url: str, *, headers: dict, timeout: int) -> httpx.Response: def _get_public_url(url: str, *, headers: dict, timeout: int) -> httpx.Response:
@@ -297,7 +301,8 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) ->
js_rendered = _detect_js_frameworks(soup) js_rendered = _detect_js_frameworks(soup)
js_message = "Page appears to be rendered by a JavaScript framework; content may be incomplete." if js_rendered else "" js_message = "Page appears to be rendered by a JavaScript framework; content may be incomplete." if js_rendered else ""
# Main textual content (heuristic) # Main textual content (heuristic): prefer semantic / "content"-classed
# containers to skip nav/footer/boilerplate; tuned for article pages.
main_content = "" main_content = ""
content_areas = soup.find_all( content_areas = soup.find_all(
["main", "article", "section", "div"], ["main", "article", "section", "div"],
@@ -306,12 +311,29 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) ->
if content_areas: if content_areas:
for area in content_areas[:3]: for area in content_areas[:3]:
main_content += area.get_text(separator=" ", strip=True) + " " main_content += area.get_text(separator=" ", strip=True) + " "
if not main_content: main_content = re.sub(r"\s+", " ", main_content).strip()
# The class heuristic can latch onto a small wrapper and miss the real
# content (app/landing pages, or SSR sites whose body isn't in a
# "content"-classed div, so these came back nearly empty before). When the
# heuristic returns nothing OR suspiciously little, fall back to the full
# <body>, stripping scripts/styles (so JSON/JS doesn't leak into the text)
# plus nav/header/footer/aside (boilerplate), and keep whichever yields
# more readable text.
THIN_CONTENT_CHARS = 600 # below this the heuristic likely missed the page
if len(main_content) < THIN_CONTENT_CHARS:
body = soup.find("body") body = soup.find("body")
if body: if body:
main_content = body.get_text(separator=" ", strip=True) # Strip from a copy so the later list/table/code extractors still
# see the original soup unmodified.
main_content = re.sub(r"\s+", " ", main_content).strip() body_copy = copy.copy(body)
for _noise in body_copy.find_all(
["script", "style", "noscript", "template", "nav", "header", "footer", "aside"]
):
_noise.extract()
body_text = re.sub(r"\s+", " ", body_copy.get_text(separator=" ", strip=True)).strip()
if len(body_text) > len(main_content):
main_content = body_text
result = { result = {
"url": url, "url": url,
+1
View File
@@ -122,6 +122,7 @@ DEFAULT_SETTINGS = {
DEFAULT_FEATURES = { DEFAULT_FEATURES = {
"web_search": True, "web_search": True,
"web_fetch": True,
"deep_research": False, "deep_research": False,
"memory": True, "memory": True,
"document_editor": True, "document_editor": True,
+1 -1
View File
@@ -2059,7 +2059,7 @@ class TaskScheduler:
"manage_calendar", "manage_notes", "manage_tasks", "manage_memory", "manage_calendar", "manage_notes", "manage_tasks", "manage_memory",
"list_email_accounts", "list_emails", "read_email", "send_email", "reply_to_email", "archive_email", "list_email_accounts", "list_emails", "read_email", "send_email", "reply_to_email", "archive_email",
"mark_email_read", "delete_email", "resolve_contact", "mark_email_read", "delete_email", "resolve_contact",
"search_chats", "web_search", "read_file", "search_chats", "web_search", "web_fetch", "read_file",
"create_document", "update_document", "edit_document", "create_document", "update_document", "edit_document",
"generate_image", "trigger_research", "generate_image", "trigger_research",
"download_model", "serve_model", "list_served_models", "stop_served_model", "download_model", "serve_model", "list_served_models", "stop_served_model",
+55
View File
@@ -195,6 +195,7 @@ _MCP_TOOL_MAP = {
"read_file": ("filesystem", "read_file"), "read_file": ("filesystem", "read_file"),
"write_file": ("filesystem", "write_file"), "write_file": ("filesystem", "write_file"),
"web_search": ("web_search", "web_search"), "web_search": ("web_search", "web_search"),
"web_fetch": ("web_fetch", "web_fetch"),
"generate_image": ("image_gen", "generate_image"), "generate_image": ("image_gen", "generate_image"),
} }
@@ -238,6 +239,7 @@ _MCP_ARG_PARSERS: Dict[str, callable] = {
"bash": lambda c: {"command": c}, "bash": lambda c: {"command": c},
"python": lambda c: {"code": c}, "python": lambda c: {"code": c},
"web_search": lambda c: {"query": c.split("\n")[0].strip()}, "web_search": lambda c: {"query": c.split("\n")[0].strip()},
"web_fetch": lambda c: {"url": c.split("\n")[0].strip()},
"read_file": lambda c: {"path": c.split("\n")[0].strip()}, "read_file": lambda c: {"path": c.split("\n")[0].strip()},
"write_file": _parse_write_file, "write_file": _parse_write_file,
"generate_image": _parse_generate_image, "generate_image": _parse_generate_image,
@@ -464,6 +466,59 @@ async def _direct_fallback(
output += "\n\n<!-- SOURCES:" + _json.dumps(sources) + " -->" output += "\n\n<!-- SOURCES:" + _json.dumps(sources) + " -->"
return {"output": output, "exit_code": 0} return {"output": output, "exit_code": 0}
if tool == "web_fetch":
# Lightweight single-URL fetch. Wraps the SSRF-safe fetcher used
# by deep research, so private/loopback/metadata addresses are
# already blocked there.
from src.search.content import fetch_webpage_content
raw = content.strip()
url = ""
# Accept either a JSON arg ({"url": "..."}) or a plain URL/domain.
if raw.startswith("{"):
try:
parsed = _json.loads(raw)
if isinstance(parsed, dict):
url = str(parsed.get("url") or "").strip()
except _json.JSONDecodeError:
url = ""
if not url:
# Non-JSON (or JSON without a usable url): take the first line
# only, so a URL followed by commentary still parses.
url = raw.split("\n")[0].strip()
# Reject anything that isn't a single bare URL/domain token.
if not url or url.startswith("{") or any(c in url for c in (" ", "\t", "\n")):
return {"error": "web_fetch: provide a single URL or domain, e.g. example.com", "exit_code": 1}
low = url.lower()
if "://" in low and not low.startswith(("http://", "https://")):
return {"error": f"web_fetch: unsupported URL scheme (only http/https): {url[:80]}", "exit_code": 1}
# Accept bare domains like "example.com" by defaulting to https.
if not low.startswith(("http://", "https://")):
url = "https://" + url
loop = asyncio.get_running_loop()
try:
result = await asyncio.wait_for(
loop.run_in_executor(None, lambda: fetch_webpage_content(url, timeout=10)),
timeout=30,
)
except asyncio.TimeoutError:
return {"error": f"web_fetch: timed out fetching {url}", "exit_code": 1}
err = result.get("error")
text = (result.get("content") or "").strip()
title = result.get("title") or ""
if not text:
if err:
return {"error": f"web_fetch: {url}: {err}", "exit_code": 1}
# No extractable text: non-HTML body, or a pure client-rendered
# shell. The agent can fall back to the builtin_browser tool.
return {"error": f"web_fetch: {url}: no readable text content (not HTML, or the page needs JS/login)", "exit_code": 1}
header = (f"# {title}\n" if title else "") + f"Source: {url}\n\n"
output = header + text
if len(output) > MAX_OUTPUT_CHARS:
output = output[:MAX_OUTPUT_CHARS] + "\n\n[...truncated]"
return {"output": output, "exit_code": 0}
# manage_memory / generate_image still live as MCP servers # manage_memory / generate_image still live as MCP servers
# (mcp_servers/{memory,image_gen}_server.py); the MCP path above # (mcp_servers/{memory,image_gen}_server.py); the MCP path above
# handles them. # handles them.
+2 -1
View File
@@ -22,7 +22,7 @@ logger = logging.getLogger(__name__)
# Tools that are ALWAYS included regardless of retrieval results. # Tools that are ALWAYS included regardless of retrieval results.
# These are the most commonly needed and should never be missing. # These are the most commonly needed and should never be missing.
ALWAYS_AVAILABLE = frozenset({ ALWAYS_AVAILABLE = frozenset({
"bash", "python", "web_search", "read_file", "bash", "python", "web_search", "web_fetch", "read_file",
"api_call", # For configured integrations (Miniflux, Gitea, Linkding, etc.) "api_call", # For configured integrations (Miniflux, Gitea, Linkding, etc.)
# The two genuinely AMBIENT cookbook tools — "what's running" and # The two genuinely AMBIENT cookbook tools — "what's running" and
# "kill it" can be asked any time without prior cookbook context, # "kill it" can be asked any time without prior cookbook context,
@@ -62,6 +62,7 @@ BUILTIN_TOOL_DESCRIPTIONS: Dict[str, str] = {
"bash": "Run shell commands on the server. Install packages, check files, git operations, curl, system info, process management, networking.", "bash": "Run shell commands on the server. Install packages, check files, git operations, curl, system info, process management, networking.",
"python": "Execute Python code for computation, data processing, math, scripting, parsing, API calls. Not for writing code for the user.", "python": "Execute Python code for computation, data processing, math, scripting, parsing, API calls. Not for writing code for the user.",
"web_search": "Quick single web lookup for a fact, current event, or doc mid-task. NOT for 'research X' / 'do research on X' requests — those are deep-research jobs (use trigger_research). web_search = one query; trigger_research = a full researched report in the sidebar.", "web_search": "Quick single web lookup for a fact, current event, or doc mid-task. NOT for 'research X' / 'do research on X' requests — those are deep-research jobs (use trigger_research). web_search = one query; trigger_research = a full researched report in the sidebar.",
"web_fetch": "Fetch and read the text content of a specific URL/website the user names (e.g. 'check example.com', 'open this link'). Use when you have a concrete URL; for open-ended lookups use web_search instead.",
"read_file": "Read a file from disk and return its contents. View source code, config files, logs.", "read_file": "Read a file from disk and return its contents. View source code, config files, logs.",
"write_file": "Write content to a file on disk. Create new files, save output, update configs.", "write_file": "Write content to a file on disk. Create new files, save output, update configs.",
"create_document": "Create a new document in the editor panel. For code, articles, text content longer than 15 lines. Specify title, language, and content.", "create_document": "Create a new document in the editor panel. For code, articles, text content longer than 15 lines. Specify title, language, and content.",
+6
View File
@@ -95,6 +95,10 @@ _TOOL_NAME_MAP = {
"search": "web_search", "search": "web_search",
"web_search": "web_search", "web_search": "web_search",
"websearch": "web_search", "websearch": "web_search",
"web_fetch": "web_fetch",
"webfetch": "web_fetch",
"fetch_url": "web_fetch",
"fetch": "web_fetch",
"read": "read_file", "read": "read_file",
"read_file": "read_file", "read_file": "read_file",
"cat": "read_file", "cat": "read_file",
@@ -305,6 +309,8 @@ def _parse_tool_code_block(raw: str) -> Optional[ToolBlock]:
content = xml_params.get("code", args_body) content = xml_params.get("code", args_body)
elif mapped == "web_search": elif mapped == "web_search":
content = xml_params.get("query", args_body) content = xml_params.get("query", args_body)
elif mapped == "web_fetch":
content = xml_params.get("url", args_body)
elif mapped in ("read_file", "write_file"): elif mapped in ("read_file", "write_file"):
content = xml_params.get("path", xml_params.get("file_path", args_body)) content = xml_params.get("path", xml_params.get("file_path", args_body))
else: else:
+14
View File
@@ -64,6 +64,20 @@ FUNCTION_TOOL_SCHEMAS = [
} }
} }
}, },
{
"type": "function",
"function": {
"name": "web_fetch",
"description": "Fetch and read the text content of a specific URL the user names (e.g. 'check example.com', 'what's on this page <url>'). Use when you already have a concrete URL/domain. NOT for open-ended searches (use web_search) or 'research X' jobs (use trigger_research).",
"parameters": {
"type": "object",
"properties": {
"url": {"type": "string", "description": "The URL or domain to fetch (http/https; a bare domain like example.com is fine)"}
},
"required": ["url"]
}
}
},
{ {
"type": "function", "type": "function",
"function": { "function": {
+76
View File
@@ -546,3 +546,79 @@ def test_mcp_config_listing_is_admin_gated():
assert "def list_servers(request: Request):" in src assert "def list_servers(request: Request):" in src
assert "def list_tools(request: Request):" in src assert "def list_tools(request: Request):" in src
assert "def list_server_tools(server_id: str, request: Request):" in src assert "def list_server_tools(server_id: str, request: Request):" in src
# ── web_fetch SSRF guard (PR #111 merge gate) ───────────────────────
# web_fetch routes every request through src.search.content's
# _public_http_url / _get_public_url, the same SSRF-safe fetcher used by
# web_search and deep research. These tests pin the guard's behavior: it
# must block every private/internal address class, redirects into private
# address space, and non-http(s) schemes, so the new tool can't be turned
# into an SSRF primitive.
import ipaddress as _ipaddr
import pytest as _pytest
@_pytest.mark.parametrize(
    "url",
    [
        "http://127.0.0.1/",                 # IPv4 loopback
        "http://localhost/",                 # loopback by name
        "http://10.0.0.5/",                  # private LAN 10/8
        "http://172.16.0.1/",                # private LAN 172.16/12
        "http://192.168.1.1/",               # private LAN 192.168/16
        "http://169.254.169.254/latest/",    # link-local / cloud metadata
        "http://metadata.google.internal/",  # metadata by name
        "http://[::1]/",                     # IPv6 loopback
        "http://[fc00::1]/",                 # IPv6 unique-local (ULA)
        "http://[fe80::1]/",                 # IPv6 link-local
        "file:///etc/passwd",                # unsupported scheme
        "ftp://example.com/",                # unsupported scheme
    ],
)
def test_web_fetch_guard_blocks_private_and_bad_schemes(url):
    """Each listed URL must be rejected by ``_public_http_url``.

    The cases cover loopback, private LAN, link-local/metadata, IPv6
    local ranges, and non-http schemes.
    """
    # NOTE(review): the by-name cases (localhost, metadata.google.internal)
    # are not patched here, so they presumably go through the real
    # resolver -- confirm this is acceptable in CI.
    from src.search import content as guard_module

    verdict = guard_module._public_http_url(url)
    assert verdict is False
def test_web_fetch_guard_allows_public_ip():
    """A literal public IPv4 address must be accepted by the guard."""
    from src.search import content as guard_module

    verdict = guard_module._public_http_url("http://93.184.216.34/")
    assert verdict is True
def test_web_fetch_guard_blocks_dns_resolving_to_private(monkeypatch):
    """A hostname whose resolution yields a private address is rejected."""
    from src.search import content

    def _resolve_to_private(host):
        # Stand-in resolver: every hostname maps to a 10/8 address.
        return [_ipaddr.ip_address("10.0.0.5")]

    monkeypatch.setattr(content, "_resolve_hostname_ips", _resolve_to_private)

    verdict = content._public_http_url("https://innocent.example/")
    assert verdict is False
def test_web_fetch_guard_fails_closed_on_empty_resolution(monkeypatch):
    """A hostname that resolves to no addresses is treated as non-public."""
    from src.search import content

    def _resolve_to_nothing(host):
        # Stand-in resolver: resolution succeeds but returns no addresses.
        return []

    monkeypatch.setattr(content, "_resolve_hostname_ips", _resolve_to_nothing)

    verdict = content._public_http_url("https://innocent.example/")
    assert verdict is False
def test_web_fetch_guard_blocks_redirect_into_private(monkeypatch):
    """A public URL that 302-redirects to an internal address is refused.

    The hostname resolver is stubbed to return a public address and
    ``httpx.Client`` is replaced with a fake whose ``get`` always answers
    302 with a link-local metadata ``location``. ``_get_public_url`` must
    raise ``httpx.RequestError`` mentioning "non-public" rather than
    follow the redirect.
    """
    import httpx
    from src.search import content

    def _resolve_to_public(host):
        # The initial hop looks public, so only the redirect target can
        # trip the guard.
        return [_ipaddr.ip_address("93.184.216.34")]

    monkeypatch.setattr(content, "_resolve_hostname_ips", _resolve_to_public)

    class _Resp:
        # Minimal response stand-in exposing only these two attributes.
        status_code = 302
        headers = {"location": "http://169.254.169.254/latest/meta-data/"}

    class _FakeClient:
        # Context-manager stand-in for httpx.Client; accepts and ignores
        # any constructor arguments.
        def __init__(self, *args, **kwargs):
            pass

        def __enter__(self):
            return self

        def __exit__(self, *exc_info):
            return False

        def get(self, url):
            return _Resp()

    monkeypatch.setattr(httpx, "Client", _FakeClient)

    with _pytest.raises(httpx.RequestError) as exc:
        content._get_public_url(
            "http://public.example/start", headers={}, timeout=5
        )
    # Checked after the context manager exits, so the assertion actually runs.
    assert "non-public" in str(exc.value)