Search: align service content extraction

Co-authored-by: ghreprimand <203024559+ghreprimand@users.noreply.github.com>
This commit is contained in:
ghreprimand
2026-06-02 06:53:07 -05:00
committed by GitHub
parent c85da91964
commit aa0a9e8b5a
2 changed files with 94 additions and 5 deletions
+42 -5
View File
@@ -1,5 +1,6 @@
"""Webpage content fetching with caching, PDF extraction, and summarization helpers.""" """Webpage content fetching with caching, PDF extraction, and summarization helpers."""
import copy
import io import io
import ipaddress import ipaddress
import json import json
@@ -115,6 +116,28 @@ def _extract_meta(soup: BeautifulSoup) -> dict:
return {"description": description, "keywords": keywords} return {"description": description, "keywords": keywords}
def _extract_og_image(soup: BeautifulSoup) -> str:
    """Extract the best representative image URL from meta tags.

    Candidates are examined in priority order: the ``og:image``,
    ``og:image:url`` and ``og:image:secure_url`` ``property`` metas, then the
    ``twitter:image`` and ``thumbnail`` ``name`` metas. The first acceptable
    candidate wins.

    Only returns absolute http(s) URLs -- skips relative paths and data URIs.
    SVG and ICO images are skipped as well; the extension is matched
    case-insensitively against the URL *path*, so a query string or fragment
    (``favicon.ico?v=2``) cannot hide it.

    Args:
        soup: Parsed document whose ``<meta>`` tags are searched.

    Returns:
        The whitespace-stripped URL of the first acceptable image, or ``""``
        when no candidate qualifies.
    """
    # Function-scope import keeps this block self-contained regardless of
    # what the module already imports at file level.
    from urllib.parse import urlsplit

    skipped_extensions = (".svg", ".ico")
    meta_lookups = (
        ("property", "og:image"),
        ("property", "og:image:url"),
        ("property", "og:image:secure_url"),
        ("name", "twitter:image"),
        ("name", "thumbnail"),
    )

    for attr_name, attr_value in meta_lookups:
        tag = soup.find("meta", attrs={attr_name: attr_value})
        if not tag:
            continue
        url = tag.get("content", "").strip()
        if not url:
            continue

        try:
            parts = urlsplit(url)
        except ValueError:
            # Malformed URL (e.g. an unbalanced IPv6 bracket): not usable.
            continue

        # urlsplit lowercases the scheme, so "HTTPS://..." is accepted too.
        if parts.scheme not in ("http", "https") or not parts.netloc:
            continue

        # Check both the raw URL (everything the plain suffix test rejected
        # stays rejected) and the path alone (catches "logo.svg?v=2").
        if url.lower().endswith(skipped_extensions):
            continue
        if parts.path.lower().endswith(skipped_extensions):
            continue

        return url

    return ""
def _extract_lists(soup: BeautifulSoup) -> List[List[str]]: def _extract_lists(soup: BeautifulSoup) -> List[List[str]]:
"""Return a list of lists, each inner list representing a <ul>/<ol>.""" """Return a list of lists, each inner list representing a <ul>/<ol>."""
all_lists = [] all_lists = []
@@ -275,10 +298,12 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) ->
title_tag = soup.find("title") title_tag = soup.find("title")
title_text = title_tag.get_text(strip=True) if title_tag else "" title_text = title_tag.get_text(strip=True) if title_tag else ""
meta_info = _extract_meta(soup) meta_info = _extract_meta(soup)
og_image = _extract_og_image(soup)
js_rendered = _detect_js_frameworks(soup) js_rendered = _detect_js_frameworks(soup)
js_message = "Page appears to be rendered by a JavaScript framework; content may be incomplete." if js_rendered else "" js_message = "Page appears to be rendered by a JavaScript framework; content may be incomplete." if js_rendered else ""
# Main textual content (heuristic) # Main textual content (heuristic): prefer semantic / "content"-classed
# containers to skip nav/footer/boilerplate; tuned for article pages.
main_content = "" main_content = ""
content_areas = soup.find_all( content_areas = soup.find_all(
["main", "article", "section", "div"], ["main", "article", "section", "div"],
@@ -287,12 +312,23 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) ->
if content_areas: if content_areas:
for area in content_areas[:3]: for area in content_areas[:3]:
main_content += area.get_text(separator=" ", strip=True) + " " main_content += area.get_text(separator=" ", strip=True) + " "
if not main_content: main_content = re.sub(r"\s+", " ", main_content).strip()
# If the heuristic finds only a tiny wrapper, fall back to body text with
# obvious boilerplate stripped so UI/deep-research search results do not
# look empty for app/landing pages.
THIN_CONTENT_CHARS = 600
if len(main_content) < THIN_CONTENT_CHARS:
body = soup.find("body") body = soup.find("body")
if body: if body:
main_content = body.get_text(separator=" ", strip=True) body_copy = copy.copy(body)
for noise in body_copy.find_all(
main_content = re.sub(r"\s+", " ", main_content).strip()[:8000] ["script", "style", "noscript", "template", "nav", "header", "footer", "aside"]
):
noise.extract()
body_text = re.sub(r"\s+", " ", body_copy.get_text(separator=" ", strip=True)).strip()
if len(body_text) > len(main_content):
main_content = body_text
result = { result = {
"url": url, "url": url,
@@ -303,6 +339,7 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) ->
"code_blocks": _extract_code_blocks(soup), "code_blocks": _extract_code_blocks(soup),
"meta_description": meta_info.get("description", ""), "meta_description": meta_info.get("description", ""),
"meta_keywords": meta_info.get("keywords", ""), "meta_keywords": meta_info.get("keywords", ""),
"og_image": og_image,
"js_rendered": js_rendered, "js_rendered": js_rendered,
"js_message": js_message, "js_message": js_message,
"success": True, "success": True,
@@ -0,0 +1,52 @@
"""Keep src.search and services.search content extraction behavior aligned."""
import pytest
pytest.importorskip("bs4")
from services.search import content as service_content
from src.search import content as src_content
class _FakeResponse:
    """Minimal stand-in for an HTTP response that carries an HTML body.

    Every instance shares the class-level defaults below (a 200 status, an
    HTML content type, and an empty byte payload); only ``text`` varies.
    """

    status_code = 200
    headers = {"Content-Type": "text/html; charset=utf-8"}
    content = b""

    def __init__(self, text: str):
        # The decoded body handed back to whatever consumes the response.
        self.text = text

    def raise_for_status(self):
        """Never raise; the fake always represents a successful response."""
        return None
@pytest.mark.parametrize("module", [src_content, service_content])
def test_content_fetcher_extracts_og_image_and_body_fallback(module, tmp_path, monkeypatch):
    """Check og:image extraction and the thin-content body fallback.

    Runs once per content module so both implementations are held to the same
    expectations. The fixture page has a tiny class-matched wrapper, the real
    text inside ``<main>``, plus ``<nav>`` and ``<script>`` noise that must
    not leak into the extracted content.
    """
    html = """
    <html>
      <head>
        <title>Example</title>
        <meta property="og:image" content="https://example.com/cover.jpg">
      </head>
      <body>
        <nav>Navigation text should not win</nav>
        <div class="content">Tiny</div>
        <main>
          <p>This is the substantive body text that should be retained.</p>
          <p>It is much longer than the tiny class-matched wrapper.</p>
        </main>
        <script>window.secret = "not content";</script>
      </body>
    </html>
    """

    # Point the cache at a throwaway directory and empty the in-memory index
    # (presumably so a previously cached entry cannot be served instead of
    # the fake response -- verify against the module's cache logic).
    monkeypatch.setattr(module, "CONTENT_CACHE_DIR", tmp_path)
    module.content_cache_index.clear()
    # Replace the network fetch with a canned response carrying the fixture.
    monkeypatch.setattr(
        module,
        "_get_public_url",
        lambda url, headers, timeout: _FakeResponse(html),
    )

    result = module.fetch_webpage_content("https://example.com/parity-test")

    assert result["og_image"] == "https://example.com/cover.jpg"
    assert "substantive body text" in result["content"]
    assert "much longer than the tiny" in result["content"]
    # Boilerplate must be stripped: neither script source nor nav text may
    # appear in the extracted content.
    assert "window.secret" not in result["content"]
    assert "Navigation text should not win" not in result["content"]