mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 02:05:22 -04:00
Search: align service content extraction
Co-authored-by: ghreprimand <203024559+ghreprimand@users.noreply.github.com>
This commit is contained in:
@@ -0,0 +1,52 @@
|
||||
"""Keep src.search and services.search content extraction behavior aligned."""
|
||||
|
||||
import pytest
|
||||
|
||||
pytest.importorskip("bs4")
|
||||
|
||||
from services.search import content as service_content
|
||||
from src.search import content as src_content
|
||||
|
||||
|
||||
class _FakeResponse:
|
||||
status_code = 200
|
||||
headers = {"Content-Type": "text/html; charset=utf-8"}
|
||||
content = b""
|
||||
|
||||
def __init__(self, text: str):
|
||||
self.text = text
|
||||
|
||||
def raise_for_status(self):
|
||||
return None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("module", [src_content, service_content])
|
||||
def test_content_fetcher_extracts_og_image_and_body_fallback(module, tmp_path, monkeypatch):
|
||||
html = """
|
||||
<html>
|
||||
<head>
|
||||
<title>Example</title>
|
||||
<meta property="og:image" content="https://example.com/cover.jpg">
|
||||
</head>
|
||||
<body>
|
||||
<nav>Navigation text should not win</nav>
|
||||
<div class="content">Tiny</div>
|
||||
<main>
|
||||
<p>This is the substantive body text that should be retained.</p>
|
||||
<p>It is much longer than the tiny class-matched wrapper.</p>
|
||||
</main>
|
||||
<script>window.secret = "not content";</script>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
monkeypatch.setattr(module, "CONTENT_CACHE_DIR", tmp_path)
|
||||
module.content_cache_index.clear()
|
||||
monkeypatch.setattr(module, "_get_public_url", lambda url, headers, timeout: _FakeResponse(html))
|
||||
|
||||
result = module.fetch_webpage_content("https://example.com/parity-test")
|
||||
|
||||
assert result["og_image"] == "https://example.com/cover.jpg"
|
||||
assert "substantive body text" in result["content"]
|
||||
assert "much longer than the tiny" in result["content"]
|
||||
assert "window.secret" not in result["content"]
|
||||
Reference in New Issue
Block a user