diff --git a/services/search/content.py b/services/search/content.py index ff82a7f54..2c1f5f64c 100644 --- a/services/search/content.py +++ b/services/search/content.py @@ -259,6 +259,9 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) -> raise RateLimitError(f"Rate limit hit for {url} (attempt {retry_attempt})") response.raise_for_status() + except httpx.HTTPStatusError as e: + error_logger.warning(f"HTTP {e.response.status_code} fetching {url}: {e}") + return _empty_result(url, f"HTTP {e.response.status_code}: {e}") except httpx.RequestError as e: error_logger.error(f"NetworkError fetching {url} (attempt {retry_attempt}): {e}") return _empty_result(url, f"NetworkError: {e}") diff --git a/tests/test_search_content_extraction_parity.py b/tests/test_search_content_extraction_parity.py index ae66b7064..e5b8e7bcb 100644 --- a/tests/test_search_content_extraction_parity.py +++ b/tests/test_search_content_extraction_parity.py @@ -1,5 +1,6 @@ """Content extraction behavior for the canonical services.search.content module.""" +import httpx import pytest pytest.importorskip("bs4") @@ -19,6 +20,22 @@ class _FakeResponse: return None +class _FakeErrorResponse: + """Mimics an httpx.Response that fails raise_for_status with a given status code.""" + + headers = {"Content-Type": "text/html; charset=utf-8"} + content = b"" + text = "" + + def __init__(self, status_code: int): + self.status_code = status_code + + def raise_for_status(self): + raise httpx.HTTPStatusError( + f"{self.status_code} error", request=None, response=self + ) + + @pytest.mark.parametrize("module", [service_content]) def test_content_fetcher_extracts_og_image_and_body_fallback(module, tmp_path, monkeypatch): html = """ @@ -49,3 +66,67 @@ def test_content_fetcher_extracts_og_image_and_body_fallback(module, tmp_path, m assert "substantive body text" in result["content"] assert "much longer than the tiny" in result["content"] assert "window.secret" not in result["content"] + + +@pytest.mark.parametrize("status_code", [403, 404]) +def test_fetch_webpage_content_returns_empty_result_on_http_status_error(status_code, tmp_path, monkeypatch): + """A 403/404 response should degrade to an empty result instead of raising. + + This exercises the real fetch_webpage_content() path: _get_public_url returns + a response whose raise_for_status() raises httpx.HTTPStatusError, and the + function must catch it and hand back the standard empty-result shape rather + than letting the exception bubble up (which previously surfaced as a 500). + """ + monkeypatch.setattr(service_content, "CONTENT_CACHE_DIR", tmp_path) + service_content.content_cache_index.clear() + monkeypatch.setattr( + service_content, + "_get_public_url", + lambda url, headers, timeout: _FakeErrorResponse(status_code), + ) + + result = service_content.fetch_webpage_content(f"https://example.com/status-{status_code}") + + assert result["success"] is False + assert result["content"] == "" + assert str(status_code) in result["error"] + + +def test_fetch_webpage_content_429_takes_distinct_rate_limit_path(tmp_path, monkeypatch): + """A 429 response must be handled by the dedicated rate-limit branch. + + The status_code == 429 check runs before raise_for_status() is ever called, + so a 429 should be reported as a rate-limit error rather than falling through + the generic HTTPStatusError handling added for 403/404. We assert on the + error message to prove it took the RateLimitError path, not the HTTP-status + empty-result path. + """ + monkeypatch.setattr(service_content, "CONTENT_CACHE_DIR", tmp_path) + service_content.content_cache_index.clear() + + raise_for_status_called = False + + class _FakeRateLimitResponse: + status_code = 429 + headers = {"Content-Type": "text/html; charset=utf-8"} + content = b"" + text = "" + + def raise_for_status(self): + nonlocal raise_for_status_called + raise_for_status_called = True + + monkeypatch.setattr( + service_content, + "_get_public_url", + lambda url, headers, timeout: _FakeRateLimitResponse(), + ) + + result = service_content.fetch_webpage_content("https://example.com/rate-limited") + + assert result["success"] is False + assert result["content"] == "" + assert "Rate limit hit" in result["error"] + assert "HTTP 429" not in result["error"] + # The 429 short-circuit must happen before raise_for_status() is reached. + assert raise_for_status_called is False