From 725d174243093a9f9c91bedeaa187b62bb1dabf0 Mon Sep 17 00:00:00 2001 From: ooovenenoso <120500656+ooovenenoso@users.noreply.github.com> Date: Wed, 10 Jun 2026 07:08:22 -0400 Subject: [PATCH] fix(research): track analyzed URLs separately (#3125) Co-authored-by: Alexandre Teixeira <111787685+alteixeira20@users.noreply.github.com> --- services/research/research_handler.py | 25 ++++- src/deep_research.py | 5 + .../test_deep_research_extraction_controls.py | 14 +++ tests/test_research_handler_analyzed_urls.py | 99 +++++++++++++++++++ 4 files changed, 139 insertions(+), 4 deletions(-) create mode 100644 tests/test_research_handler_analyzed_urls.py diff --git a/services/research/research_handler.py b/services/research/research_handler.py index bd4c6bb15..2521f61e1 100644 --- a/services/research/research_handler.py +++ b/services/research/research_handler.py @@ -285,6 +285,7 @@ class ResearchHandler: query, report, stats, elapsed, findings=researcher.findings, evolving_report=researcher.evolving_report, + analyzed_urls=getattr(researcher, "analyzed_urls", None), ) except Exception as e: @@ -331,7 +332,8 @@ class ResearchHandler: def _format_research_report( self, query: str, full_report: str, stats: dict, elapsed: float, - findings: list = None, evolving_report: str = None, + findings: Optional[list] = None, evolving_report: Optional[str] = None, + analyzed_urls: Optional[list] = None, ) -> str: """Format research report with sources list and expandable raw findings.""" summary_lines = [ @@ -342,20 +344,34 @@ class ResearchHandler: ] summary_text = " | ".join(summary_lines) - # Build sources list with clickable links + # Build sources list with clickable links. Keep the curated Sources + # section filtered for citation quality, but also list every unique URL + # the research run inspected so the "URLs Analyzed" count is auditable. sources_section = "" - if findings: + analyzed_urls_section = "" + url_items = analyzed_urls if analyzed_urls is not None else findings + if findings or url_items: seen_urls = set() source_lines = [] - for f in findings: + analyzed_seen = set() + analyzed_lines = [] + for f in findings or []: url = f.get("url", "") title = f.get("title", "") or url summary = f.get("summary", "") or f.get("evidence", "") if url and url not in seen_urls and not is_low_quality(summary): seen_urls.add(url) source_lines.append(f"- [{title}]({url})") + for item in url_items or []: + url = item.get("url", "") + title = item.get("title", "") or url + if url and url not in analyzed_seen: + analyzed_seen.add(url) + analyzed_lines.append(f"{len(analyzed_lines) + 1}. [{title}]({url})") if source_lines: sources_section = "\n### Sources\n\n" + "\n".join(source_lines) + "\n" + if analyzed_lines: + analyzed_urls_section = "\n### Analyzed URLs\n\n" + "\n".join(analyzed_lines) + "\n" # Build raw findings section (individual extractions per source) raw_findings_section = "" @@ -391,6 +407,7 @@ class ResearchHandler: {full_report} {sources_section} +{analyzed_urls_section} {collected_section} --- diff --git a/src/deep_research.py b/src/deep_research.py index 2045d1c1f..c8ed02b11 100644 --- a/src/deep_research.py +++ b/src/deep_research.py @@ -232,6 +232,7 @@ class DeepResearcher: self._start_time: float = 0 self.queries_used: Set[str] = set() self.urls_fetched: Set[str] = set() + self.analyzed_urls: List[Dict[str, str]] = [] self.round_count: int = 0 # Track which search providers actually returned results during the # run, in arrival order — surfaced in the visual report so users can @@ -525,6 +526,10 @@ class DeepResearcher: if url and url not in self.urls_fetched: urls_to_fetch.append(r) self.urls_fetched.add(url) + self.analyzed_urls.append({ + "url": url, + "title": r.get("title", "") or url, + }) if len(urls_to_fetch) >= self.max_urls_per_round * len(queries): break diff --git a/tests/test_deep_research_extraction_controls.py b/tests/test_deep_research_extraction_controls.py index a1158e103..1cae97464 100644 --- a/tests/test_deep_research_extraction_controls.py +++ b/tests/test_deep_research_extraction_controls.py @@ -45,6 +45,20 @@ async def test_search_and_extract_respects_extraction_concurrency(): assert researcher.max_active == 2 +@pytest.mark.asyncio +async def test_search_and_extract_tracks_all_urls_selected_for_analysis(): + researcher = _ControlledResearcher(extraction_concurrency=2, max_urls_per_round=2) + researcher._start_time = time.time() + + findings = await researcher._search_and_extract(["a"], "question") + + assert len(findings) == 2 + assert researcher.analyzed_urls == [ + {"url": "https://example.test/a/0", "title": "a-0"}, + {"url": "https://example.test/a/1", "title": "a-1"}, + ] + + @pytest.mark.asyncio async def test_fetch_and_extract_uses_configured_timeout(monkeypatch): captured = {} diff --git a/tests/test_research_handler_analyzed_urls.py b/tests/test_research_handler_analyzed_urls.py new file mode 100644 index 000000000..b8328d5b5 --- /dev/null +++ b/tests/test_research_handler_analyzed_urls.py @@ -0,0 +1,99 @@ +from services.research.research_handler import ResearchHandler + + +def _format_report(findings): + handler = object.__new__(ResearchHandler) + return handler._format_research_report( + "test query", + "# Report\n\nBody", + {"Rounds": 1, "Queries": 1, "URLs": len(findings)}, + 1.0, + findings=findings, + ) + + +def _format_report_with_analyzed_urls(findings, analyzed_urls): + handler = object.__new__(ResearchHandler) + return handler._format_research_report( + "test query", + "# Report\n\nBody", + {"Rounds": 1, "Queries": 1, "URLs": len(analyzed_urls)}, + 1.0, + findings=findings, + analyzed_urls=analyzed_urls, + ) + + +def test_research_report_lists_every_analyzed_url_once(): + findings = [ + { + "url": "https://example.com/good", + "title": "Good Source", + "summary": "Detailed useful evidence about the query.", + }, + { + "url": "https://example.com/low-quality", + "title": "Low Quality Page", + "summary": "", + "evidence": "", + }, + { + "url": "https://example.com/good", + "title": "Good Source Duplicate", + "summary": "Repeated extraction from the same URL.", + }, + ] + + report = _format_report(findings) + + assert "### Analyzed URLs" in report + analyzed_section = report.split("### Analyzed URLs", 1)[1].split("
", 1)[0] + assert "1. [Good Source](https://example.com/good)" in analyzed_section + assert "2. [Low Quality Page](https://example.com/low-quality)" in analyzed_section + assert analyzed_section.count("https://example.com/good") == 1 + + +def test_research_report_keeps_sources_section_curated(): + findings = [ + { + "url": "https://example.com/good", + "title": "Good Source", + "summary": "Detailed useful evidence about the query.", + }, + { + "url": "https://example.com/low-quality", + "title": "Low Quality Page", + "summary": "", + "evidence": "", + }, + ] + + report = _format_report(findings) + + sources_section = report.split("### Sources", 1)[1].split("### Analyzed URLs", 1)[0] + assert "[Good Source](https://example.com/good)" in sources_section + assert "https://example.com/low-quality" not in sources_section + + +def test_research_report_uses_full_analyzed_url_set_not_just_findings(): + findings = [ + { + "url": "https://example.com/finding", + "title": "Finding Source", + "summary": "Detailed useful evidence about the query.", + }, + ] + analyzed_urls = [ + {"url": "https://example.com/finding", "title": "Finding Source"}, + {"url": "https://example.com/fetched-no-finding", "title": "Fetched No Finding"}, + {"url": "https://example.com/finding", "title": "Duplicate"}, + ] + + report = _format_report_with_analyzed_urls(findings, analyzed_urls) + + sources_section = report.split("### Sources", 1)[1].split("### Analyzed URLs", 1)[0] + analyzed_section = report.split("### Analyzed URLs", 1)[1].split("
", 1)[0] + assert "https://example.com/fetched-no-finding" not in sources_section + assert "1. [Finding Source](https://example.com/finding)" in analyzed_section + assert "2. [Fetched No Finding](https://example.com/fetched-no-finding)" in analyzed_section + assert analyzed_section.count("https://example.com/finding") == 1