mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-15 17:25:26 -04:00
fix(research): track analyzed URLs separately (#3125)
Co-authored-by: Alexandre Teixeira <111787685+alteixeira20@users.noreply.github.com>
This commit is contained in:
@@ -285,6 +285,7 @@ class ResearchHandler:
|
||||
query, report, stats, elapsed,
|
||||
findings=researcher.findings,
|
||||
evolving_report=researcher.evolving_report,
|
||||
analyzed_urls=getattr(researcher, "analyzed_urls", None),
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
@@ -331,7 +332,8 @@ class ResearchHandler:
|
||||
|
||||
def _format_research_report(
|
||||
self, query: str, full_report: str, stats: dict, elapsed: float,
|
||||
findings: list = None, evolving_report: str = None,
|
||||
findings: Optional[list] = None, evolving_report: Optional[str] = None,
|
||||
analyzed_urls: Optional[list] = None,
|
||||
) -> str:
|
||||
"""Format research report with sources list and expandable raw findings."""
|
||||
summary_lines = [
|
||||
@@ -342,20 +344,34 @@ class ResearchHandler:
|
||||
]
|
||||
summary_text = " | ".join(summary_lines)
|
||||
|
||||
# Build sources list with clickable links
|
||||
# Build sources list with clickable links. Keep the curated Sources
|
||||
# section filtered for citation quality, but also list every unique URL
|
||||
# the research run inspected so the "URLs Analyzed" count is auditable.
|
||||
sources_section = ""
|
||||
if findings:
|
||||
analyzed_urls_section = ""
|
||||
url_items = analyzed_urls if analyzed_urls is not None else findings
|
||||
if findings or url_items:
|
||||
seen_urls = set()
|
||||
source_lines = []
|
||||
for f in findings:
|
||||
analyzed_seen = set()
|
||||
analyzed_lines = []
|
||||
for f in findings or []:
|
||||
url = f.get("url", "")
|
||||
title = f.get("title", "") or url
|
||||
summary = f.get("summary", "") or f.get("evidence", "")
|
||||
if url and url not in seen_urls and not is_low_quality(summary):
|
||||
seen_urls.add(url)
|
||||
source_lines.append(f"- [{title}]({url})")
|
||||
for item in url_items or []:
|
||||
url = item.get("url", "")
|
||||
title = item.get("title", "") or url
|
||||
if url and url not in analyzed_seen:
|
||||
analyzed_seen.add(url)
|
||||
analyzed_lines.append(f"{len(analyzed_lines) + 1}. [{title}]({url})")
|
||||
if source_lines:
|
||||
sources_section = "\n### Sources\n\n" + "\n".join(source_lines) + "\n"
|
||||
if analyzed_lines:
|
||||
analyzed_urls_section = "\n### Analyzed URLs\n\n" + "\n".join(analyzed_lines) + "\n"
|
||||
|
||||
# Build raw findings section (individual extractions per source)
|
||||
raw_findings_section = ""
|
||||
@@ -391,6 +407,7 @@ class ResearchHandler:
|
||||
{full_report}
|
||||
|
||||
{sources_section}
|
||||
{analyzed_urls_section}
|
||||
{collected_section}
|
||||
---
|
||||
|
||||
|
||||
@@ -232,6 +232,7 @@ class DeepResearcher:
|
||||
self._start_time: float = 0
|
||||
self.queries_used: Set[str] = set()
|
||||
self.urls_fetched: Set[str] = set()
|
||||
self.analyzed_urls: List[Dict[str, str]] = []
|
||||
self.round_count: int = 0
|
||||
# Track which search providers actually returned results during the
|
||||
# run, in arrival order — surfaced in the visual report so users can
|
||||
@@ -525,6 +526,10 @@ class DeepResearcher:
|
||||
if url and url not in self.urls_fetched:
|
||||
urls_to_fetch.append(r)
|
||||
self.urls_fetched.add(url)
|
||||
self.analyzed_urls.append({
|
||||
"url": url,
|
||||
"title": r.get("title", "") or url,
|
||||
})
|
||||
if len(urls_to_fetch) >= self.max_urls_per_round * len(queries):
|
||||
break
|
||||
|
||||
|
||||
@@ -45,6 +45,20 @@ async def test_search_and_extract_respects_extraction_concurrency():
|
||||
assert researcher.max_active == 2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_search_and_extract_tracks_all_urls_selected_for_analysis():
    """Check that ``_search_and_extract`` records the selected URLs in ``analyzed_urls``.

    One query with ``max_urls_per_round=2`` is expected to yield two findings
    and two ``{"url", "title"}`` entries, in selection order.
    """
    expected_entries = [
        {"url": "https://example.test/a/0", "title": "a-0"},
        {"url": "https://example.test/a/1", "title": "a-1"},
    ]
    researcher = _ControlledResearcher(extraction_concurrency=2, max_urls_per_round=2)
    researcher._start_time = time.time()

    results = await researcher._search_and_extract(["a"], "question")

    assert len(results) == 2
    assert researcher.analyzed_urls == expected_entries
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_fetch_and_extract_uses_configured_timeout(monkeypatch):
|
||||
captured = {}
|
||||
|
||||
@@ -0,0 +1,99 @@
|
||||
from services.research.research_handler import ResearchHandler
|
||||
|
||||
|
||||
def _format_report(findings):
    """Render a report from ``findings`` only, passing no explicit analyzed-URL list."""
    # object.__new__ creates the handler without running ResearchHandler.__init__.
    bare_handler = object.__new__(ResearchHandler)
    stats = {"Rounds": 1, "Queries": 1, "URLs": len(findings)}
    return bare_handler._format_research_report(
        "test query",
        "# Report\n\nBody",
        stats,
        1.0,
        findings=findings,
    )
|
||||
|
||||
|
||||
def _format_report_with_analyzed_urls(findings, analyzed_urls):
    """Render a report from ``findings`` plus an explicit ``analyzed_urls`` list.

    The "URLs" stat is taken from the length of ``analyzed_urls``.
    """
    # object.__new__ creates the handler without running ResearchHandler.__init__.
    bare_handler = object.__new__(ResearchHandler)
    stats = {"Rounds": 1, "Queries": 1, "URLs": len(analyzed_urls)}
    return bare_handler._format_research_report(
        "test query",
        "# Report\n\nBody",
        stats,
        1.0,
        findings=findings,
        analyzed_urls=analyzed_urls,
    )
|
||||
|
||||
|
||||
def test_research_report_lists_every_analyzed_url_once():
    """With no explicit URL list, findings feed "Analyzed URLs", one entry per URL."""
    good = {
        "url": "https://example.com/good",
        "title": "Good Source",
        "summary": "Detailed useful evidence about the query.",
    }
    low_quality = {
        "url": "https://example.com/low-quality",
        "title": "Low Quality Page",
        "summary": "",
        "evidence": "",
    }
    # Same URL as `good`; it must not produce a second numbered entry.
    repeated = {
        "url": "https://example.com/good",
        "title": "Good Source Duplicate",
        "summary": "Repeated extraction from the same URL.",
    }

    report = _format_report([good, low_quality, repeated])

    assert "### Analyzed URLs" in report
    after_heading = report.split("### Analyzed URLs", 1)[1]
    analyzed_section = after_heading.split("<details>", 1)[0]
    assert "1. [Good Source](https://example.com/good)" in analyzed_section
    assert "2. [Low Quality Page](https://example.com/low-quality)" in analyzed_section
    assert analyzed_section.count("https://example.com/good") == 1
|
||||
|
||||
|
||||
def test_research_report_keeps_sources_section_curated():
    """A finding with empty summary and evidence must stay out of "Sources"."""
    good = {
        "url": "https://example.com/good",
        "title": "Good Source",
        "summary": "Detailed useful evidence about the query.",
    }
    low_quality = {
        "url": "https://example.com/low-quality",
        "title": "Low Quality Page",
        "summary": "",
        "evidence": "",
    }

    report = _format_report([good, low_quality])

    # Isolate the text between the "Sources" and "Analyzed URLs" headings.
    after_sources = report.split("### Sources", 1)[1]
    sources_section = after_sources.split("### Analyzed URLs", 1)[0]
    assert "[Good Source](https://example.com/good)" in sources_section
    assert "https://example.com/low-quality" not in sources_section
|
||||
|
||||
|
||||
def test_research_report_uses_full_analyzed_url_set_not_just_findings():
    """An explicit ``analyzed_urls`` list drives "Analyzed URLs"; "Sources" stays findings-only."""
    finding_url = "https://example.com/finding"
    fetched_only_url = "https://example.com/fetched-no-finding"
    findings = [
        {
            "url": finding_url,
            "title": "Finding Source",
            "summary": "Detailed useful evidence about the query.",
        },
    ]
    analyzed_urls = [
        {"url": finding_url, "title": "Finding Source"},
        {"url": fetched_only_url, "title": "Fetched No Finding"},
        # Repeat of the first URL; it must not be listed a second time.
        {"url": finding_url, "title": "Duplicate"},
    ]

    report = _format_report_with_analyzed_urls(findings, analyzed_urls)

    after_sources = report.split("### Sources", 1)[1]
    sources_section = after_sources.split("### Analyzed URLs", 1)[0]
    after_analyzed = report.split("### Analyzed URLs", 1)[1]
    analyzed_section = after_analyzed.split("<details>", 1)[0]
    assert "https://example.com/fetched-no-finding" not in sources_section
    assert "1. [Finding Source](https://example.com/finding)" in analyzed_section
    assert "2. [Fetched No Finding](https://example.com/fetched-no-finding)" in analyzed_section
    assert analyzed_section.count("https://example.com/finding") == 1
|
||||
Reference in New Issue
Block a user