fix: data integrity — deep-research result parsing + memory-extraction durability (#808)

Two independent data-integrity bugs:

- services/research/service.py: ResearchService.research() (the public deep-research API, re-exported from services/__init__) treated the handler return value as a dict (result.get("sources"), result.get("summary"), ...), but call_research_service() returns a formatted markdown STRING. EVERY successful call therefore raised "AttributeError: 'str' object has no attribute 'get'", making the API unusable for any non-error result. It now uses the string report as the summary and parses sources from the "### Sources" markdown section (section-bounded, URL-deduped), with a defensive dict branch for back-compat.

- services/memory/memory_extractor.py: extract_and_store guarded the vector-store find_similar/add calls only with the .healthy flag, which is set ONCE at init. If the embedding/ChromaDB backend degraded LATER (OOM, evicted model, remote endpoint down), those calls raised; the exception escaped the dedup loop, skipped memory_manager.save(), and was swallowed by the outer try/except, so EVERY validated fact from the session was silently lost (the function docstring promises "never raised"). It now falls back to the existing text/fuzzy dedup, so facts are still saved when the vector index is unavailable at runtime.

Tests: test_research_service.py, test_memory_extractor_vector_degraded.py.
2026-06-16 17:55:26 -04:00 · 2026-06-01 19:27:31 -07:00
parent 0e31c38be0
commit 610968f91e
4 changed files with 346 additions and 17 deletions
@@ -303,9 +303,18 @@ async def extract_and_store(
            if not fact_text or len(fact_text) < 5:
                continue

-            # Dedup: check vector similarity first (fast), then exact text match
+            # Dedup: check vector similarity first (fast), then exact text match.
+            # A runtime embedding/ChromaDB failure (backend OOM, model evicted,
+            # remote endpoint down) must not abort the whole batch — fall through
+            # to the text/fuzzy dedup below instead of losing every validated
+            # fact extracted this session. (`.healthy` is only set at init, so
+            # it does not catch failures that develop later.)
            if memory_vector and memory_vector.healthy:
-                existing_id = memory_vector.find_similar(fact_text, threshold=0.72)
+                try:
+                    existing_id = memory_vector.find_similar(fact_text, threshold=0.72)
+                except Exception as e:
+                    logger.warning(f"Memory dedup (vector) unavailable, using text fallback: {e}")
+                    existing_id = None
                if existing_id:
                    logger.debug(f"Memory dedup (vector): '{fact_text[:50]}' matches {existing_id}")
                    continue
@@ -330,9 +339,14 @@ async def extract_and_store(

            existing.append(entry)

-            # Add to vector index
+            # Add to vector index. The JSON store (saved below) is the source of
+            # truth and the keyword path can still retrieve this entry, so a vector
+            # write failure must not drop the fact or abort the remaining batch.
            if memory_vector and memory_vector.healthy:
-                memory_vector.add(entry["id"], fact_text)
+                try:
+                    memory_vector.add(entry["id"], fact_text)
+                except Exception as e:
+                    logger.warning(f"Memory vector add failed for {entry['id']}: {e}")

            added += 1

@@ -1,11 +1,16 @@
 # services/research/service.py
 """Research service — deep research with LLM-in-the-loop."""

+import re
 from dataclasses import dataclass, field
 from typing import List, Optional, Callable

 from .research_handler import ResearchHandler

+# Markdown source links emitted by ResearchHandler._format_research_report,
+# e.g. "- [Some Title](https://example.com/page)".
+_SOURCE_LINK_RE = re.compile(r"^\s*-\s*\[(?P<title>[^\]]*)\]\((?P<url>[^)]+)\)\s*$")
+

@dataclass
 class ResearchSource:
@@ -75,26 +80,70 @@ class ResearchService:

        duration = time.time() - start

-        # Parse result into structured format
-        sources = [
-            ResearchSource(
-                url=s.get("url", ""),
-                title=s.get("title", ""),
-                snippet=s.get("snippet", ""),
-                relevance=s.get("relevance", 0.0),
+        # call_research_service returns a formatted markdown report string
+        # (see ResearchHandler.call_research_service -> _format_research_report),
+        # not a dict. Treat it as such; tolerate an unexpected dict/None defensively.
+        if isinstance(result, dict):
+            sources = [
+                ResearchSource(
+                    url=s.get("url", ""),
+                    title=s.get("title", ""),
+                    snippet=s.get("snippet", ""),
+                    relevance=s.get("relevance", 0.0),
+                )
+                for s in result.get("sources", [])
+            ]
+            return ResearchResult(
+                query=topic,
+                summary=result.get("summary", result.get("answer", "")),
+                sources=sources,
+                sections=result.get("sections", []),
+                tokens_used=result.get("tokens_used", 0),
+                duration_seconds=duration,
            )
-            for s in result.get("sources", [])
-        ]

+        report = result if isinstance(result, str) else ""
        return ResearchResult(
            query=topic,
-            summary=result.get("summary", result.get("answer", "")),
-            sources=sources,
-            sections=result.get("sections", []),
-            tokens_used=result.get("tokens_used", 0),
+            summary=report,
+            sources=self._parse_sources(report),
            duration_seconds=duration,
        )

+    @staticmethod
+    def _parse_sources(report: str) -> List[ResearchSource]:
+        """Extract sources from the markdown ### Sources section of a report.
+
+        ResearchHandler emits one ``- [title](url)`` link per deduplicated
+        finding under a ``### Sources`` heading. Parse only that section so
+        inline links elsewhere in the body are not mistaken for sources.
+        """
+        if not report:
+            return []
+        sources: List[ResearchSource] = []
+        seen = set()
+        in_sources = False
+        for line in report.splitlines():
+            stripped = line.strip()
+            if stripped.startswith("###") or stripped.startswith("##"):
+                in_sources = stripped.lower().lstrip("#").strip() == "sources"
+                continue
+            if not in_sources:
+                continue
+            match = _SOURCE_LINK_RE.match(line)
+            if not match:
+                continue
+            url = match.group("url").strip()
+            if not url or url in seen:
+                continue
+            seen.add(url)
+            sources.append(
+                # snippet is required on ResearchSource; markdown source links
+                # carry no snippet, so default to empty (matches the dict path).
+                ResearchSource(url=url, title=match.group("title").strip(), snippet="")
+            )
+        return sources
+
    def start_background(
        self,
        session_id: str,