mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-15 17:25:26 -04:00
fix(search): read plain-text, Markdown, and JSON URLs in fetch_webpage_content (#3809)
raw.githubusercontent.com serves Markdown as text/plain, JSON APIs and raw config files serve application/json, and a lot of code and tool documentation lives in .md/.txt. fetch_webpage_content only handled PDF and HTML, so a non-HTML body produced empty content and web_fetch reported 'no readable text content'. Add a branch that returns the body verbatim (apart from trimming surrounding whitespace) for non-HTML text/*, JSON (application/json and +json), and a .md/.markdown/.txt/.text/.json/.jsonl URL-suffix fallback for files mislabeled as application/octet-stream. HTML and PDF handling are unchanged. Fixes #3808.
This commit is contained in:
committed by
GitHub
parent
cc8ba04ea8
commit
bfac1d55d6
@@ -299,6 +299,40 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) ->
|
|||||||
_cache_result(cache_file, cache_key, result, url)
|
_cache_result(cache_file, cache_key, result, url)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
# Plain-text / Markdown / JSON handling. Sources like
|
||||||
|
# raw.githubusercontent.com serve Markdown as `text/plain`, JSON APIs and
|
||||||
|
# raw config files serve `application/json`, and a lot of code and tool
|
||||||
|
# docs live in `.md` / `.txt`. These have no HTML structure, so the HTML
|
||||||
|
# branch below would extract nothing and report "no readable text content".
|
||||||
|
# Return the body verbatim instead. The `is_html` guard keeps real HTML
|
||||||
|
# (including `application/xhtml+xml`) on the parsing path; the `json` check
|
||||||
|
# covers `application/json` and `+json` suffixes; the URL-suffix fallback
|
||||||
|
# catches servers that mislabel text files as `application/octet-stream`.
|
||||||
|
is_html = "html" in content_type
|
||||||
|
is_json = "json" in content_type
|
||||||
|
url_path = url.lower().split("?", 1)[0].split("#", 1)[0]
|
||||||
|
looks_like_text_file = url_path.endswith(
|
||||||
|
(".md", ".markdown", ".txt", ".text", ".json", ".jsonl")
|
||||||
|
)
|
||||||
|
if not is_html and (content_type.startswith("text/") or is_json or looks_like_text_file):
|
||||||
|
text_body = (response.text or "").strip()
|
||||||
|
result = {
|
||||||
|
"url": url,
|
||||||
|
"title": os.path.basename(url_path) or url,
|
||||||
|
"content": text_body,
|
||||||
|
"lists": [],
|
||||||
|
"tables": [],
|
||||||
|
"code_blocks": [],
|
||||||
|
"meta_description": "",
|
||||||
|
"meta_keywords": "",
|
||||||
|
"js_rendered": False,
|
||||||
|
"js_message": "",
|
||||||
|
"success": bool(text_body),
|
||||||
|
"error": "" if text_body else "Empty response body",
|
||||||
|
}
|
||||||
|
_cache_result(cache_file, cache_key, result, url)
|
||||||
|
return result
|
||||||
|
|
||||||
# HTML handling
|
# HTML handling
|
||||||
try:
|
try:
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
|
|||||||
@@ -0,0 +1,110 @@
|
|||||||
|
"""fetch_webpage_content must return plain-text, Markdown, and JSON bodies verbatim.
|
||||||
|
|
||||||
|
raw.githubusercontent.com serves Markdown as `text/plain`, and a lot of code
|
||||||
|
and tool documentation lives in `.md` / `.txt`. Those have no HTML structure,
|
||||||
|
so the HTML branch extracted nothing and web_fetch reported "no readable text
|
||||||
|
content". The plain-text branch returns the body as-is. HTML stays on the
|
||||||
|
parsing path.
|
||||||
|
"""
|
||||||
|
import types
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from services.search import content as content_mod
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeResponse:
    """Canned HTTP response handed back by the patched ``_get_public_url``.

    Carries only the attributes set below: ``text``, ``content``,
    ``headers`` and ``status_code``, plus a no-op ``raise_for_status``.

    Args:
        text: Response body as a string.
        content_type: Value stored under the ``Content-Type`` header.
        status_code: HTTP status to report; defaults to 200.
    """

    def __init__(self, text, content_type, status_code=200):
        self.text = text
        # Byte form of the same body, UTF-8 encoded.
        self.content = text.encode("utf-8")
        # Plain dict: lookups are case-sensitive, so the key must match exactly.
        self.headers = {"Content-Type": content_type}
        self.status_code = status_code

    def raise_for_status(self):
        """Do nothing and return ``None``, regardless of ``status_code``."""
        return None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def no_cache(monkeypatch, tmp_path):
    """Keep a test hermetic with respect to the content cache.

    Redirects ``content_mod.CONTENT_CACHE_DIR`` to pytest's per-test
    ``tmp_path`` and replaces ``content_mod._cache_result`` with a no-op.
    Both patches are undone by ``monkeypatch`` when the test finishes.
    """
    # A fresh, empty directory per test: intended to force a cache miss.
    monkeypatch.setattr(content_mod, "CONTENT_CACHE_DIR", tmp_path)
    # Discard cache writes so the test never touches disk.
    monkeypatch.setattr(content_mod, "_cache_result", lambda *a, **k: None)
|
||||||
|
|
||||||
|
|
||||||
|
def _patch_fetch(monkeypatch, text, content_type):
    """Stub ``content_mod._get_public_url`` so no network request is made.

    The replacement ignores its arguments and returns a ``_FakeResponse``
    built from ``text`` and ``content_type``.

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture.
        text: Body the fake response should carry.
        content_type: ``Content-Type`` header value for the fake response.
    """
    monkeypatch.setattr(
        content_mod,
        "_get_public_url",
        # Accepts the same call shape (url, headers, timeout) the stub is given.
        lambda url, headers=None, timeout=5: _FakeResponse(text, content_type),
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Sample Markdown body shared by the text/plain and text/markdown tests.
# It ends with a newline on purpose: those tests compare against
# ``MARKDOWN.strip()``.
MARKDOWN = "# Title\n\nSome **docs** with a [link](https://example.com).\n"
|
||||||
|
|
||||||
|
|
||||||
|
def test_markdown_text_plain_returns_body(monkeypatch, no_cache):
    """Markdown served as ``text/plain`` comes back as the stripped body."""
    _patch_fetch(monkeypatch, MARKDOWN, "text/plain; charset=utf-8")
    result = content_mod.fetch_webpage_content(
        "https://raw.githubusercontent.com/o/r/master/Documentation/Patterns.md"
    )
    assert result["success"] is True
    assert result["content"] == MARKDOWN.strip()
    # The title is the basename of the lowercased URL path, hence
    # "patterns.md" rather than "Patterns.md".
    assert result["title"] == "patterns.md"
    assert result["error"] == ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_text_markdown_content_type_returns_body(monkeypatch, no_cache):
    """A ``text/markdown`` response is returned as text even without a file suffix."""
    _patch_fetch(monkeypatch, MARKDOWN, "text/markdown")
    result = content_mod.fetch_webpage_content("https://example.com/readme")
    assert result["success"] is True
    assert result["content"] == MARKDOWN.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def test_octet_stream_with_txt_suffix_returns_body(monkeypatch, no_cache):
    """A ``.txt`` URL labelled ``application/octet-stream`` is still read as text."""
    # Some servers mislabel text files; the URL-suffix fallback still reads it.
    _patch_fetch(monkeypatch, "plain notes\nline two\n", "application/octet-stream")
    result = content_mod.fetch_webpage_content("https://example.com/notes.txt")
    assert result["success"] is True
    # Trailing newline is stripped; interior newlines are preserved.
    assert result["content"] == "plain notes\nline two"
|
||||||
|
|
||||||
|
|
||||||
|
def test_application_json_returns_body(monkeypatch, no_cache):
    """An ``application/json`` response is returned as its raw text."""
    # application/json is not text/*; it must still be returned verbatim
    # instead of being fed to the HTML parser (which yields empty content).
    body = '{"name": "odysseus", "items": [1, 2, 3]}'
    _patch_fetch(monkeypatch, body, "application/json")
    result = content_mod.fetch_webpage_content("https://api.example.com/data")
    assert result["success"] is True
    assert result["content"] == body
|
||||||
|
|
||||||
|
|
||||||
|
def test_ld_json_suffix_content_type_returns_body(monkeypatch, no_cache):
    """A ``+json`` structured-suffix type (``application/ld+json``) counts as JSON."""
    body = '{"@context": "https://schema.org"}'
    _patch_fetch(monkeypatch, body, "application/ld+json")
    result = content_mod.fetch_webpage_content("https://example.com/meta")
    assert result["success"] is True
    assert result["content"] == body
|
||||||
|
|
||||||
|
|
||||||
|
def test_json_suffix_with_octet_stream_returns_body(monkeypatch, no_cache):
    """A ``.json`` URL labelled ``application/octet-stream`` is still read as text."""
    body = '{"raw": true}'
    _patch_fetch(monkeypatch, body, "application/octet-stream")
    result = content_mod.fetch_webpage_content("https://example.com/package.json")
    assert result["success"] is True
    assert result["content"] == body
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_text_body_is_not_success(monkeypatch, no_cache):
    """A whitespace-only text body is reported as a failure with empty content."""
    _patch_fetch(monkeypatch, "   \n  ", "text/plain")
    result = content_mod.fetch_webpage_content("https://example.com/blank.txt")
    assert result["success"] is False
    assert result["content"] == ""
    # The text branch sets this exact message when the stripped body is empty.
    assert result["error"] == "Empty response body"
|
||||||
|
|
||||||
|
|
||||||
|
def test_html_still_uses_parser(monkeypatch, no_cache):
    """A ``text/html`` response is still parsed rather than returned raw."""
    # An HTML body must not be short-circuited by the text branch.
    html = "<html><head><title>Hi</title></head><body><p>Hello world body text</p></body></html>"
    _patch_fetch(monkeypatch, html, "text/html; charset=utf-8")
    result = content_mod.fetch_webpage_content("https://example.com/page")
    # The title comes from <title>, not from the URL basename ("page").
    assert result["title"] == "Hi"
    assert "Hello world body text" in result["content"]
|
||||||
Reference in New Issue
Block a user