mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 17:55:26 -04:00
074a1e6eff
* fix(search): add download budgets to web_fetch with truncation notice and hard ceiling

  MAX_OUTPUT_CHARS only trims what the agent sees; fetch_webpage_content buffered and cached the entire response body first, so a large or hostile URL could pull arbitrarily many bytes into memory and the content cache.

  The fetch is now a capped streaming GET (SSRF redirect guard unchanged): a soft default budget (WEB_FETCH_SOFT_MAX_BYTES, 2 MB), a per-call override via full/max_bytes on the web_fetch tool, and a hard ceiling (WEB_FETCH_HARD_MAX_BYTES, 20 MB) that the override can never exceed. When Content-Length already declares a body over the ceiling the fetch is refused before any body bytes are buffered.

  Truncated results carry truncated/fetched_bytes/total_bytes, the tool output leads with a partial-content notice telling the model how to re-fetch with full=true, and the tool schema documents the flag. A truncated PDF is reported as a budget error since a cut PDF is unparseable. The effective cap is part of the content-cache key so a truncated fetch is never served to a full-budget request.

  Existing tests that faked httpx.get or the old _get_public_url signature are adapted to the streaming interface; behavior pins are unchanged.

  Fixes #3812

* fix(search): close compressed-body cap bypass and protect the partial notice

  Addresses RaresKeY's review on #3955:

  - Force Accept-Encoding: identity for the capped fetch. With gzip/deflate the wire bytes (and Content-Length) can be a fraction of the decoded body, so a tiny compressed response could pass the hard-cap preflight and then expand past the ceiling in a single decoded chunk before the streamed cap could slice it. Identity makes Content-Length the true body size and keeps each streamed chunk bounded by the network read, so the hard ceiling actually bounds memory.
  - Lead web_fetch output with the partial-content notice and cap the page title. The notice is the user-facing contract for partial fetches, but the title is untrusted, uncapped page content; placed ahead of the notice a giant title could push it past MAX_OUTPUT_CHARS and drop it. The notice now leads and the title is capped as a second guard.

  Adds regressions: the fetch advertises identity encoding, and a truncated result with an oversized title still surfaces the partial notice.

* fix(search): reject compressed responses that ignore the identity request

  Requesting Accept-Encoding: identity is not enough on its own: a server can ignore it and still return Content-Encoding: gzip, and httpx.iter_bytes would decode that, so a tiny compressed body could balloon into one decoded chunk far past the hard cap before the streamed loop slices it (and Content-Length, the compressed wire length, makes the preflight and size metadata unreliable). Refuse a non-identity Content-Encoding before reading the body.

  Adds a regression where the server ignores the identity request and returns gzip; the fetch is refused before any body is decoded.
133 lines
4.5 KiB
Python
133 lines
4.5 KiB
Python
"""Content extraction behavior for the canonical services.search.content module."""
|
|
|
|
import httpx
|
|
import pytest
|
|
|
|
pytest.importorskip("bs4")
|
|
|
|
from services.search import content as service_content
|
|
|
|
|
|
class _FakeResponse:
    """Minimal stand-in for a successful HTML response object.

    Provides only what is defined here: a 200 ``status_code``, an HTML
    ``Content-Type`` header, an empty ``content`` payload, the caller-supplied
    ``text``, and a ``raise_for_status()`` that never raises.
    """

    status_code = 200
    headers = {"Content-Type": "text/html; charset=utf-8"}
    content = b""

    def __init__(self, text: str) -> None:
        """Store *text* as the response body text."""
        self.text = text

    def raise_for_status(self) -> None:
        """Do nothing; this fake always represents a successful response."""
        return None
|
|
|
|
|
|
class _FakeErrorResponse:
    """Mimics an httpx.Response that fails raise_for_status with a given status code."""

    headers = {"Content-Type": "text/html; charset=utf-8"}
    content = b""
    text = ""

    def __init__(self, status_code: int) -> None:
        """Remember the HTTP status code this fake should report."""
        self.status_code = status_code

    def raise_for_status(self) -> None:
        """Always raise ``httpx.HTTPStatusError`` with this fake as the response.

        Raises:
            httpx.HTTPStatusError: unconditionally; the message is
                ``"<status_code> error"``.
        """
        # There is no real request object in these tests, so ``request`` is None.
        raise httpx.HTTPStatusError(
            f"{self.status_code} error", request=None, response=self
        )
|
|
|
|
|
|
@pytest.mark.parametrize("module", [service_content])
def test_content_fetcher_extracts_og_image_and_body_fallback(module, tmp_path, monkeypatch):
    """The fetcher should surface the og:image and keep the substantive body text.

    The fixture page has a tiny class-matched ``.content`` div next to a much
    longer ``<main>`` element, plus ``<nav>`` and ``<script>`` noise. The
    result must report the ``og:image`` URL, contain both ``<main>``
    paragraphs, and must not contain the script source.
    """
    html = """
    <html>
      <head>
        <title>Example</title>
        <meta property="og:image" content="https://example.com/cover.jpg">
      </head>
      <body>
        <nav>Navigation text should not win</nav>
        <div class="content">Tiny</div>
        <main>
          <p>This is the substantive body text that should be retained.</p>
          <p>It is much longer than the tiny class-matched wrapper.</p>
        </main>
        <script>window.secret = "not content";</script>
      </body>
    </html>
    """

    # Point the on-disk cache at a temp dir and reset the in-memory index so
    # this test does not read or write state shared with other tests.
    monkeypatch.setattr(module, "CONTENT_CACHE_DIR", tmp_path)
    module.content_cache_index.clear()
    # Replace the network fetch with a canned response; extra keyword
    # arguments passed by the fetcher are accepted and ignored.
    monkeypatch.setattr(module, "_get_public_url", lambda url, headers, timeout, **kwargs: _FakeResponse(html))

    result = module.fetch_webpage_content("https://example.com/parity-test")

    assert result["og_image"] == "https://example.com/cover.jpg"
    assert "substantive body text" in result["content"]
    assert "much longer than the tiny" in result["content"]
    assert "window.secret" not in result["content"]
|
|
|
|
|
|
@pytest.mark.parametrize("status_code", [403, 404])
def test_fetch_webpage_content_returns_empty_result_on_http_status_error(status_code, tmp_path, monkeypatch):
    """A 403/404 response should degrade to an empty result instead of raising.

    This exercises the real fetch_webpage_content() path: _get_public_url returns
    a response whose raise_for_status() raises httpx.HTTPStatusError, and the
    function must catch it and hand back the standard empty-result shape rather
    than letting the exception bubble up (which previously surfaced as a 500).
    """
    # Isolate the content cache (disk directory and in-memory index) per test.
    monkeypatch.setattr(service_content, "CONTENT_CACHE_DIR", tmp_path)
    service_content.content_cache_index.clear()
    monkeypatch.setattr(
        service_content,
        "_get_public_url",
        lambda url, headers, timeout, **kwargs: _FakeErrorResponse(status_code),
    )

    result = service_content.fetch_webpage_content(f"https://example.com/status-{status_code}")

    assert result["success"] is False
    assert result["content"] == ""
    # The error text must mention the status code that triggered the failure.
    assert str(status_code) in result["error"]
|
|
|
|
|
|
def test_fetch_webpage_content_429_takes_distinct_rate_limit_path(tmp_path, monkeypatch):
    """A 429 response must be handled by the dedicated rate-limit branch.

    The status_code == 429 check runs before raise_for_status() is ever called,
    so a 429 should be reported as a rate-limit error rather than falling through
    the generic HTTPStatusError handling added for 403/404. We assert on the
    error message to prove it took the RateLimitError path, not the HTTP-status
    empty-result path.
    """
    # Isolate the content cache (disk directory and in-memory index) per test.
    monkeypatch.setattr(service_content, "CONTENT_CACHE_DIR", tmp_path)
    service_content.content_cache_index.clear()

    # Flipped by the fake below if the code under test ever calls
    # raise_for_status(); it must still be False at the end of the test.
    raise_for_status_called = False

    class _FakeRateLimitResponse:
        """Fake 429 response that records any call to raise_for_status()."""

        status_code = 429
        headers = {"Content-Type": "text/html; charset=utf-8"}
        content = b""
        text = ""

        def raise_for_status(self):
            nonlocal raise_for_status_called
            raise_for_status_called = True

    monkeypatch.setattr(
        service_content,
        "_get_public_url",
        lambda url, headers, timeout, **kwargs: _FakeRateLimitResponse(),
    )

    result = service_content.fetch_webpage_content("https://example.com/rate-limited")

    assert result["success"] is False
    assert result["content"] == ""
    assert "Rate limit hit" in result["error"]
    assert "HTTP 429" not in result["error"]
    # The 429 short-circuit must happen before raise_for_status() is reached.
    assert raise_for_status_called is False
|