fix: data integrity — deep-research result parsing + memory-extraction durability (#808)

Two independent data-integrity bugs:

- services/research/service.py: ResearchService.research() (the public deep-research
  API, re-exported from services/__init__) treated the handler return value as a
  dict (result.get("sources"/"summary"/...)), but call_research_service() returns a
  formatted markdown STRING, so EVERY successful call raised
  "AttributeError: 'str' object has no attribute 'get'", making the API unusable
  for any non-error result. Now uses the
  string report as the summary and parses sources from the "### Sources" markdown
  section (section-bounded, URL-deduped), with a defensive dict branch for back-compat.

- services/memory/memory_extractor.py: extract_and_store guarded the vector-store
  find_similar/add calls only with the .healthy flag set ONCE at init. If the
  embedding/ChromaDB backend degraded LATER (OOM, evicted model, remote endpoint
  down), those calls raised, the exception escaped the dedup loop, skipped
  memory_manager.save(), and was swallowed by the outer try/except -> EVERY
  validated fact from the session was silently lost (the function docstring
  promises "never raised"). Now falls back to the existing text/fuzzy dedup so
  facts are still saved when the vector index is unavailable at runtime.

Tests: test_research_service.py, test_memory_extractor_vector_degraded.py.
This commit is contained in:
David Anderson
2026-06-01 19:27:31 -07:00
committed by GitHub
parent 0e31c38be0
commit 610968f91e
4 changed files with 346 additions and 17 deletions
+18 -4
View File
@@ -303,9 +303,18 @@ async def extract_and_store(
if not fact_text or len(fact_text) < 5:
continue
# Dedup: check vector similarity first (fast), then exact text match
# Dedup: check vector similarity first (fast), then exact text match.
# A runtime embedding/ChromaDB failure (backend OOM, model evicted,
# remote endpoint down) must not abort the whole batch — fall through
# to the text/fuzzy dedup below instead of losing every validated
# fact extracted this session. (`.healthy` is only set at init, so
# it does not catch failures that develop later.)
if memory_vector and memory_vector.healthy:
existing_id = memory_vector.find_similar(fact_text, threshold=0.72)
try:
existing_id = memory_vector.find_similar(fact_text, threshold=0.72)
except Exception as e:
logger.warning(f"Memory dedup (vector) unavailable, using text fallback: {e}")
existing_id = None
if existing_id:
logger.debug(f"Memory dedup (vector): '{fact_text[:50]}' matches {existing_id}")
continue
@@ -330,9 +339,14 @@ async def extract_and_store(
existing.append(entry)
# Add to vector index
# Add to vector index. The JSON store (saved below) is the source of
# truth and the keyword path can still retrieve this entry, so a vector
# write failure must not drop the fact or abort the remaining batch.
if memory_vector and memory_vector.healthy:
memory_vector.add(entry["id"], fact_text)
try:
memory_vector.add(entry["id"], fact_text)
except Exception as e:
logger.warning(f"Memory vector add failed for {entry['id']}: {e}")
added += 1
+62 -13
View File
@@ -1,11 +1,16 @@
# services/research/service.py
"""Research service — deep research with LLM-in-the-loop."""
import re
from dataclasses import dataclass, field
from typing import List, Optional, Callable
from .research_handler import ResearchHandler
# Markdown source links emitted by ResearchHandler._format_research_report,
# e.g. "- [Some Title](https://example.com/page)".
#
# Real search results routinely carry square brackets in the title
# ("[PDF] ...", "[2301.00001] ...") and one level of balanced parentheses in
# the URL (".../wiki/Foo_(bar)"). A naive "[^\]]*" / "[^)]+" pair rejects such
# lines outright, silently dropping the source, so both are accepted here.
# Every line the simpler pattern matched still matches with identical groups.
_SOURCE_LINK_RE = re.compile(
    # Leading list bullet.
    r"^\s*-\s*"
    # Title: everything up to the first "](" separator; brackets are allowed.
    r"\[(?P<title>(?:(?!\]\().)*)\]"
    # URL: characters other than ")" or a balanced "(...)" group, followed by
    # the closing ")" of the markdown link.
    r"\((?P<url>(?:[^)]|\([^()]*\))+)\)"
    # Nothing but optional whitespace may follow the link.
    r"\s*$"
)
@dataclass
class ResearchSource:
@@ -75,26 +80,70 @@ class ResearchService:
duration = time.time() - start
# Parse result into structured format
sources = [
ResearchSource(
url=s.get("url", ""),
title=s.get("title", ""),
snippet=s.get("snippet", ""),
relevance=s.get("relevance", 0.0),
# call_research_service returns a formatted markdown report string
# (see ResearchHandler.call_research_service -> _format_research_report),
# not a dict. Treat it as such; tolerate an unexpected dict/None defensively.
if isinstance(result, dict):
sources = [
ResearchSource(
url=s.get("url", ""),
title=s.get("title", ""),
snippet=s.get("snippet", ""),
relevance=s.get("relevance", 0.0),
)
for s in result.get("sources", [])
]
return ResearchResult(
query=topic,
summary=result.get("summary", result.get("answer", "")),
sources=sources,
sections=result.get("sections", []),
tokens_used=result.get("tokens_used", 0),
duration_seconds=duration,
)
for s in result.get("sources", [])
]
report = result if isinstance(result, str) else ""
return ResearchResult(
query=topic,
summary=result.get("summary", result.get("answer", "")),
sources=sources,
sections=result.get("sections", []),
tokens_used=result.get("tokens_used", 0),
summary=report,
sources=self._parse_sources(report),
duration_seconds=duration,
)
@staticmethod
def _parse_sources(report: str) -> List[ResearchSource]:
    """Collect the links listed under a report's ``Sources`` heading.

    Only bullet links that appear after a ``##``/``###`` heading whose text
    is exactly "sources" (case-insensitive) are considered; any other
    ``##``-level heading ends the section. This keeps inline links in the
    report body from being reported as sources.

    Args:
        report: Markdown report text. A falsy value yields no sources.

    Returns:
        One ``ResearchSource`` per distinct URL, in first-seen order, with
        an empty snippet (markdown source links carry no snippet text).
    """
    collected: List[ResearchSource] = []
    known_urls = set()
    inside_section = False

    for raw_line in (report.splitlines() if report else ()):
        text = raw_line.strip()

        # Any "##"-prefixed line (which also covers "###") is a heading and
        # switches the section flag on or off; it is never a source itself.
        if text.startswith("##"):
            inside_section = text.lower().lstrip("#").strip() == "sources"
            continue

        # Match against the untrimmed line; the pattern tolerates indentation.
        found = _SOURCE_LINK_RE.match(raw_line) if inside_section else None
        if found is None:
            continue

        link = found.group("url").strip()
        if link and link not in known_urls:
            known_urls.add(link)
            collected.append(
                ResearchSource(
                    url=link,
                    title=found.group("title").strip(),
                    snippet="",
                )
            )

    return collected
def start_background(
self,
session_id: str,