mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
fix(search): catch HTTPStatusError so 403/404 URLs degrade gracefully instead of 500 (#2203)
raise_for_status() raises httpx.HTTPStatusError for 4xx/5xx responses, but the surrounding try/except only caught httpx.RequestError (network errors) and RateLimitError (429). Any other HTTP error code propagated uncaught up through chat_processor -> chat_helpers -> chat_routes and surfaced as a 500 Internal Server Error. Added an explicit except httpx.HTTPStatusError clause that logs a warning and returns an empty result, matching the behaviour already in place for network errors. Also adds focused regression tests that exercise the real fetch_webpage_content() path with a mocked _get_public_url: - 403/404 responses return the standard empty-result shape instead of raising, proving the new HTTPStatusError handling works end to end. - 429 responses still take their own dedicated rate-limit branch (the status_code == 429 check runs before raise_for_status() is reached), keeping that behaviour distinct from the new generic HTTPStatusError handling. Dropped the unrelated builtin_mcp.py change that had been carried over from a rebase; that fix is tracked separately in #2018 and this branch should stay scoped to the search content fetch path. Closes #2148
This commit is contained in:
@@ -259,6 +259,9 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) ->
|
|||||||
raise RateLimitError(f"Rate limit hit for {url} (attempt {retry_attempt})")
|
raise RateLimitError(f"Rate limit hit for {url} (attempt {retry_attempt})")
|
||||||
|
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
except httpx.HTTPStatusError as e:
|
||||||
|
error_logger.warning(f"HTTP {e.response.status_code} fetching {url}: {e}")
|
||||||
|
return _empty_result(url, f"HTTP {e.response.status_code}: {e}")
|
||||||
except httpx.RequestError as e:
|
except httpx.RequestError as e:
|
||||||
error_logger.error(f"NetworkError fetching {url} (attempt {retry_attempt}): {e}")
|
error_logger.error(f"NetworkError fetching {url} (attempt {retry_attempt}): {e}")
|
||||||
return _empty_result(url, f"NetworkError: {e}")
|
return _empty_result(url, f"NetworkError: {e}")
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
"""Content extraction behavior for the canonical services.search.content module."""
|
"""Content extraction behavior for the canonical services.search.content module."""
|
||||||
|
|
||||||
|
import httpx
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
pytest.importorskip("bs4")
|
pytest.importorskip("bs4")
|
||||||
@@ -19,6 +20,22 @@ class _FakeResponse:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeErrorResponse:
|
||||||
|
"""Mimics an httpx.Response that fails raise_for_status with a given status code."""
|
||||||
|
|
||||||
|
headers = {"Content-Type": "text/html; charset=utf-8"}
|
||||||
|
content = b""
|
||||||
|
text = ""
|
||||||
|
|
||||||
|
def __init__(self, status_code: int):
|
||||||
|
self.status_code = status_code
|
||||||
|
|
||||||
|
def raise_for_status(self):
|
||||||
|
raise httpx.HTTPStatusError(
|
||||||
|
f"{self.status_code} error", request=None, response=self
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("module", [service_content])
|
@pytest.mark.parametrize("module", [service_content])
|
||||||
def test_content_fetcher_extracts_og_image_and_body_fallback(module, tmp_path, monkeypatch):
|
def test_content_fetcher_extracts_og_image_and_body_fallback(module, tmp_path, monkeypatch):
|
||||||
html = """
|
html = """
|
||||||
@@ -49,3 +66,67 @@ def test_content_fetcher_extracts_og_image_and_body_fallback(module, tmp_path, m
|
|||||||
assert "substantive body text" in result["content"]
|
assert "substantive body text" in result["content"]
|
||||||
assert "much longer than the tiny" in result["content"]
|
assert "much longer than the tiny" in result["content"]
|
||||||
assert "window.secret" not in result["content"]
|
assert "window.secret" not in result["content"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("status_code", [403, 404])
|
||||||
|
def test_fetch_webpage_content_returns_empty_result_on_http_status_error(status_code, tmp_path, monkeypatch):
|
||||||
|
"""A 403/404 response should degrade to an empty result instead of raising.
|
||||||
|
|
||||||
|
This exercises the real fetch_webpage_content() path: _get_public_url returns
|
||||||
|
a response whose raise_for_status() raises httpx.HTTPStatusError, and the
|
||||||
|
function must catch it and hand back the standard empty-result shape rather
|
||||||
|
than letting the exception bubble up (which previously surfaced as a 500).
|
||||||
|
"""
|
||||||
|
monkeypatch.setattr(service_content, "CONTENT_CACHE_DIR", tmp_path)
|
||||||
|
service_content.content_cache_index.clear()
|
||||||
|
monkeypatch.setattr(
|
||||||
|
service_content,
|
||||||
|
"_get_public_url",
|
||||||
|
lambda url, headers, timeout: _FakeErrorResponse(status_code),
|
||||||
|
)
|
||||||
|
|
||||||
|
result = service_content.fetch_webpage_content(f"https://example.com/status-{status_code}")
|
||||||
|
|
||||||
|
assert result["success"] is False
|
||||||
|
assert result["content"] == ""
|
||||||
|
assert str(status_code) in result["error"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_webpage_content_429_takes_distinct_rate_limit_path(tmp_path, monkeypatch):
|
||||||
|
"""A 429 response must be handled by the dedicated rate-limit branch.
|
||||||
|
|
||||||
|
The status_code == 429 check runs before raise_for_status() is ever called,
|
||||||
|
so a 429 should be reported as a rate-limit error rather than falling through
|
||||||
|
the generic HTTPStatusError handling added for 403/404. We assert on the
|
||||||
|
error message to prove it took the RateLimitError path, not the HTTP-status
|
||||||
|
empty-result path.
|
||||||
|
"""
|
||||||
|
monkeypatch.setattr(service_content, "CONTENT_CACHE_DIR", tmp_path)
|
||||||
|
service_content.content_cache_index.clear()
|
||||||
|
|
||||||
|
raise_for_status_called = False
|
||||||
|
|
||||||
|
class _FakeRateLimitResponse:
|
||||||
|
status_code = 429
|
||||||
|
headers = {"Content-Type": "text/html; charset=utf-8"}
|
||||||
|
content = b""
|
||||||
|
text = ""
|
||||||
|
|
||||||
|
def raise_for_status(self):
|
||||||
|
nonlocal raise_for_status_called
|
||||||
|
raise_for_status_called = True
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
service_content,
|
||||||
|
"_get_public_url",
|
||||||
|
lambda url, headers, timeout: _FakeRateLimitResponse(),
|
||||||
|
)
|
||||||
|
|
||||||
|
result = service_content.fetch_webpage_content("https://example.com/rate-limited")
|
||||||
|
|
||||||
|
assert result["success"] is False
|
||||||
|
assert result["content"] == ""
|
||||||
|
assert "Rate limit hit" in result["error"]
|
||||||
|
assert "HTTP 429" not in result["error"]
|
||||||
|
# The 429 short-circuit must happen before raise_for_status() is reached.
|
||||||
|
assert raise_for_status_called is False
|
||||||
|
|||||||
Reference in New Issue
Block a user