mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 01:35:36 -04:00
fix: web search content blocks numbered by fetch completion order break citations (#1672)
This commit is contained in:
@@ -0,0 +1,61 @@
|
||||
"""[CONTENT i] blocks must map to the [i] sources list.
|
||||
|
||||
comprehensive_web_search numbers its sources list by search-result order,
|
||||
but the fetched-content blocks were numbered 1..N in fetch COMPLETION
|
||||
order (as_completed). With parallel fetching the two numberings disagree,
|
||||
so the model cites "[2]" for content that actually came from source [3].
|
||||
"""
|
||||
|
||||
import importlib
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
def core(monkeypatch):
    """Return ``services.search.core`` with its search pipeline stubbed.

    The settings lookup, result count, provider call and ranking step are
    replaced so that every test works from the same two fixed results, in
    the same fixed order: one.example is source [1], two.example is [2].
    """
    mod = importlib.import_module("services.search.core")
    results = [
        {"url": "http://one.example/a", "title": "One", "snippet": "s1"},
        {"url": "http://two.example/b", "title": "Two", "snippet": "s2"},
    ]
    monkeypatch.setattr(mod, "_get_search_settings", lambda: {"search_provider": "searxng"})
    monkeypatch.setattr(mod, "_get_result_count", lambda: 2)
    # Hand out fresh copies on every call so the code under test cannot
    # mutate the canonical result dicts shared by the tests.
    monkeypatch.setattr(mod, "_call_provider", lambda *a, **k: [dict(r) for r in results])
    # Identity ranking: keep the provider order so source indices are known.
    monkeypatch.setattr(mod, "rank_search_results", lambda q, r: r)
    return mod
|
||||
|
||||
|
||||
def _fake_fetch_delaying_first(url, timeout=8, retry_attempt=0):
|
||||
if "one.example" in url:
|
||||
# Force the FIRST source to finish fetching LAST
|
||||
time.sleep(0.4)
|
||||
return {
|
||||
"success": True,
|
||||
"url": url,
|
||||
"title": "Title for " + url,
|
||||
"content": "Content for " + url + " " + "filler " * 20,
|
||||
}
|
||||
|
||||
|
||||
def test_content_blocks_numbered_by_source_not_completion_order(core, monkeypatch):
    """[CONTENT i] numbering follows source order, not fetch completion.

    The fake fetcher delays source [1] so that source [2] completes first;
    the content blocks must still be labelled and emitted as 1 then 2.
    """
    monkeypatch.setattr(core, "fetch_webpage_content", _fake_fetch_delaying_first)
    out = core.comprehensive_web_search("test query", max_pages=2, max_workers=2)
    # Each block carries the index of the source it was fetched from...
    assert "[CONTENT 1] From: http://one.example/a" in out
    assert "[CONTENT 2] From: http://two.example/b" in out
    # ...and the blocks appear in source order in the output.
    assert out.index("[CONTENT 1]") < out.index("[CONTENT 2]")
|
||||
|
||||
|
||||
def test_redirected_fetch_keeps_its_source_index(core, monkeypatch):
    """A fetch whose final URL differs from the source keeps its index.

    Source [2] reports a different final URL (as after a redirect). Its
    content block must still be numbered 2, and the untouched source [1]
    must still be numbered 1.
    """

    def fetch(url, timeout=8, retry_attempt=0):
        # Only the second source "redirects"; the first reports its own URL.
        final = "http://final.example/landing" if "two.example" in url else url
        return {
            "success": True,
            "url": final,
            "title": "Title",
            "content": "Content for " + final + " " + "filler " * 20,
        }

    monkeypatch.setattr(core, "fetch_webpage_content", fetch)
    out = core.comprehensive_web_search("test query", max_pages=2, max_workers=2)
    # The non-redirected source must not be renumbered by its neighbour's
    # redirect.
    assert "[CONTENT 1] From: http://one.example/a" in out
    assert "[CONTENT 2] From: http://final.example/landing" in out
|
||||
Reference in New Issue
Block a user