From 725d174243093a9f9c91bedeaa187b62bb1dabf0 Mon Sep 17 00:00:00 2001
From: ooovenenoso <120500656+ooovenenoso@users.noreply.github.com>
Date: Wed, 10 Jun 2026 07:08:22 -0400
Subject: [PATCH] fix(research): track analyzed URLs separately (#3125)

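The report's "URLs Analyzed" figure could not be audited: the only URL
list rendered was the curated Sources section, which drops findings
whose summary is low quality.

DeepResearcher now records every URL it selects for fetching, with its
title, in analyzed_urls. ResearchHandler passes that list to
_format_research_report, which renders a numbered, de-duplicated
"Analyzed URLs" section after Sources. When analyzed_urls is not
supplied, the findings are used instead. Sources stays filtered.

Co-authored-by: Alexandre Teixeira <111787685+alteixeira20@users.noreply.github.com>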
---
 services/research/research_handler.py         | 25 ++++-
 src/deep_research.py                          |  5 +
 .../test_deep_research_extraction_controls.py | 14 +++
 tests/test_research_handler_analyzed_urls.py  | 99 +++++++++++++++++++
 4 files changed, 139 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_research_handler_analyzed_urls.py

diff --git a/services/research/research_handler.py b/services/research/research_handler.py
index bd4c6bb15..2521f61e1 100644
--- a/services/research/research_handler.py
+++ b/services/research/research_handler.py
@@ -285,6 +285,7 @@ class ResearchHandler:
                 query, report, stats, elapsed,
                 findings=researcher.findings,
                 evolving_report=researcher.evolving_report,
+                analyzed_urls=getattr(researcher, "analyzed_urls", None),
             )
 
         except Exception as e:
@@ -331,7 +332,8 @@ class ResearchHandler:
 
     def _format_research_report(
         self, query: str, full_report: str, stats: dict, elapsed: float,
-        findings: list = None, evolving_report: str = None,
+        findings: Optional[list] = None, evolving_report: Optional[str] = None,
+        analyzed_urls: Optional[list] = None,
     ) -> str:
         """Format research report with sources list and expandable raw findings."""
         summary_lines = [
@@ -342,20 +344,34 @@ class ResearchHandler:
         ]
         summary_text = " | ".join(summary_lines)
 
-        # Build sources list with clickable links
+        # Build sources list with clickable links. Keep the curated Sources
+        # section filtered for citation quality, but also list every unique URL
+        # the research run inspected so the "URLs Analyzed" count is auditable.
         sources_section = ""
-        if findings:
+        analyzed_urls_section = ""
+        url_items = analyzed_urls if analyzed_urls is not None else findings
+        if findings or url_items:
             seen_urls = set()
             source_lines = []
-            for f in findings:
+            analyzed_seen = set()
+            analyzed_lines = []
+            for f in findings or []:
                 url = f.get("url", "")
                 title = f.get("title", "") or url
                 summary = f.get("summary", "") or f.get("evidence", "")
                 if url and url not in seen_urls and not is_low_quality(summary):
                     seen_urls.add(url)
                     source_lines.append(f"- [{title}]({url})")
+            for item in url_items or []:
+                url = item.get("url", "")
+                title = item.get("title", "") or url
+                if url and url not in analyzed_seen:
+                    analyzed_seen.add(url)
+                    analyzed_lines.append(f"{len(analyzed_lines) + 1}. [{title}]({url})")
             if source_lines:
                 sources_section = "\n### Sources\n\n" + "\n".join(source_lines) + "\n"
+            if analyzed_lines:
+                analyzed_urls_section = "\n### Analyzed URLs\n\n" + "\n".join(analyzed_lines) + "\n"
 
         # Build raw findings section (individual extractions per source)
         raw_findings_section = ""
@@ -391,6 +407,7 @@ class ResearchHandler:
 {full_report}
 
 {sources_section}
+{analyzed_urls_section}
 {collected_section}
 ---
 
diff --git a/src/deep_research.py b/src/deep_research.py
index 2045d1c1f..c8ed02b11 100644
--- a/src/deep_research.py
+++ b/src/deep_research.py
@@ -232,6 +232,7 @@ class DeepResearcher:
         self._start_time: float = 0
         self.queries_used: Set[str] = set()
         self.urls_fetched: Set[str] = set()
+        self.analyzed_urls: List[Dict[str, str]] = []
         self.round_count: int = 0
         # Track which search providers actually returned results during the
         # run, in arrival order — surfaced in the visual report so users can
@@ -525,6 +526,10 @@ class DeepResearcher:
                 if url and url not in self.urls_fetched:
                     urls_to_fetch.append(r)
                     self.urls_fetched.add(url)
+                    self.analyzed_urls.append({
+                        "url": url,
+                        "title": r.get("title", "") or url,
+                    })
                 if len(urls_to_fetch) >= self.max_urls_per_round * len(queries):
                     break
 
diff --git a/tests/test_deep_research_extraction_controls.py b/tests/test_deep_research_extraction_controls.py
index a1158e103..1cae97464 100644
--- a/tests/test_deep_research_extraction_controls.py
+++ b/tests/test_deep_research_extraction_controls.py
@@ -45,6 +45,20 @@ async def test_search_and_extract_respects_extraction_concurrency():
     assert researcher.max_active == 2
 
 
+@pytest.mark.asyncio
+async def test_search_and_extract_tracks_all_urls_selected_for_analysis():  # analyzed_urls bookkeeping
+    researcher = _ControlledResearcher(extraction_concurrency=2, max_urls_per_round=2)
+    researcher._start_time = time.time()
+
+    findings = await researcher._search_and_extract(["a"], "question")  # a single query, "a"
+
+    assert len(findings) == 2  # presumably one finding per selected URL; _ControlledResearcher is not shown here
+    assert researcher.analyzed_urls == [  # list equality pins order and rules out extra entries
+        {"url": "https://example.test/a/0", "title": "a-0"},
+        {"url": "https://example.test/a/1", "title": "a-1"},
+    ]
+
+
 @pytest.mark.asyncio
 async def test_fetch_and_extract_uses_configured_timeout(monkeypatch):
     captured = {}
diff --git a/tests/test_research_handler_analyzed_urls.py b/tests/test_research_handler_analyzed_urls.py
new file mode 100644
index 000000000..b8328d5b5
--- /dev/null
+++ b/tests/test_research_handler_analyzed_urls.py
@@ -0,0 +1,99 @@
+from services.research.research_handler import ResearchHandler
+
+
+def _format_report(findings):  # helper: render a report from findings alone
+    handler = object.__new__(ResearchHandler)  # allocate without running ResearchHandler.__init__
+    return handler._format_research_report(
+        "test query",  # query
+        "# Report\n\nBody",  # full_report
+        {"Rounds": 1, "Queries": 1, "URLs": len(findings)},  # stats; URLs is the raw count, duplicates included
+        1.0,  # elapsed
+        findings=findings,  # analyzed_urls is left at its default (None)
+    )
+
+
+def _format_report_with_analyzed_urls(findings, analyzed_urls):  # helper: render with an explicit URL list
+    handler = object.__new__(ResearchHandler)  # allocate without running ResearchHandler.__init__
+    return handler._format_research_report(
+        "test query",  # query
+        "# Report\n\nBody",  # full_report
+        {"Rounds": 1, "Queries": 1, "URLs": len(analyzed_urls)},  # stats; URLs is the raw count, duplicates included
+        1.0,  # elapsed
+        findings=findings,  # feeds the curated Sources section
+        analyzed_urls=analyzed_urls,  # feeds the Analyzed URLs section instead of findings
+    )
+
+
+def test_research_report_lists_every_analyzed_url_once():  # findings-only path: every unique URL is listed
+    findings = [
+        {
+            "url": "https://example.com/good",
+            "title": "Good Source",
+            "summary": "Detailed useful evidence about the query.",
+        },
+        {
+            "url": "https://example.com/low-quality",
+            "title": "Low Quality Page",
+            "summary": "",  # empty summary and evidence: must still show up under Analyzed URLs
+            "evidence": "",
+        },
+        {
+            "url": "https://example.com/good",  # same URL as the first entry, different title
+            "title": "Good Source Duplicate",
+            "summary": "Repeated extraction from the same URL.",
+        },
+    ]
+
+    report = _format_report(findings)  # no analyzed_urls given, so the formatter falls back to findings
+
+    assert "### Analyzed URLs" in report  # also guards the split(...)[1] below against IndexError
+    analyzed_section = report.split("### Analyzed URLs", 1)[1].split("<details>", 1)[0]
+    assert "1. [Good Source](https://example.com/good)" in analyzed_section  # first title seen for a URL wins
+    assert "2. [Low Quality Page](https://example.com/low-quality)" in analyzed_section  # numbering skips the duplicate
+    assert analyzed_section.count("https://example.com/good") == 1  # repeated URL is listed only once
+
+
+def test_research_report_keeps_sources_section_curated():  # Sources must stay quality-filtered
+    findings = [
+        {
+            "url": "https://example.com/good",
+            "title": "Good Source",
+            "summary": "Detailed useful evidence about the query.",
+        },
+        {
+            "url": "https://example.com/low-quality",
+            "title": "Low Quality Page",
+            "summary": "",  # expected to fail the is_low_quality check (its implementation is not shown here)
+            "evidence": "",
+        },
+    ]
+
+    report = _format_report(findings)
+
+    sources_section = report.split("### Sources", 1)[1].split("### Analyzed URLs", 1)[0]  # text between the headings
+    assert "[Good Source](https://example.com/good)" in sources_section
+    assert "https://example.com/low-quality" not in sources_section  # filtered out of Sources only
+
+
+def test_research_report_uses_full_analyzed_url_set_not_just_findings():  # explicit analyzed_urls path
+    findings = [
+        {
+            "url": "https://example.com/finding",
+            "title": "Finding Source",
+            "summary": "Detailed useful evidence about the query.",
+        },
+    ]
+    analyzed_urls = [
+        {"url": "https://example.com/finding", "title": "Finding Source"},
+        {"url": "https://example.com/fetched-no-finding", "title": "Fetched No Finding"},  # absent from findings
+        {"url": "https://example.com/finding", "title": "Duplicate"},  # repeated URL with a different title
+    ]
+
+    report = _format_report_with_analyzed_urls(findings, analyzed_urls)
+
+    sources_section = report.split("### Sources", 1)[1].split("### Analyzed URLs", 1)[0]  # text between the headings
+    analyzed_section = report.split("### Analyzed URLs", 1)[1].split("<details>", 1)[0]
+    assert "https://example.com/fetched-no-finding" not in sources_section  # Sources is built from findings only
+    assert "1. [Finding Source](https://example.com/finding)" in analyzed_section  # first title seen for a URL wins
+    assert "2. [Fetched No Finding](https://example.com/fetched-no-finding)" in analyzed_section
+    assert analyzed_section.count("https://example.com/finding") == 1  # the "Duplicate" entry is collapsed