fix: data integrity — deep-research result parsing + memory-extraction durability (#808)

Two independent data-integrity bugs:

- services/research/service.py: ResearchService.research() (the public deep-research API, re-exported from services/__init__) treated the handler return value as a dict (result.get("sources"/"summary"/...)), but call_research_service() returns a formatted markdown STRING. Every successful call therefore raised "AttributeError: 'str' object has no attribute 'get'", making the API unusable for any non-error result. It now uses the string report as the summary and parses sources from the "### Sources" markdown section (section-bounded, URL-deduped), with a defensive dict branch for back-compat.

- services/memory/memory_extractor.py: extract_and_store guarded the vector-store find_similar/add calls only with the .healthy flag, which is set ONCE at init. If the embedding/ChromaDB backend degraded LATER (OOM, evicted model, remote endpoint down), those calls raised; the exception escaped the dedup loop, skipped memory_manager.save(), and was swallowed by the outer try/except, so EVERY validated fact from the session was silently lost (the function docstring promises "never raised"). It now falls back to the existing text/fuzzy dedup so facts are still saved when the vector index is unavailable at runtime.

Tests: test_research_service.py, test_memory_extractor_vector_degraded.py.
2026-06-16 17:55:26 -04:00 · 2026-06-01 19:27:31 -07:00
parent 0e31c38be0
commit 610968f91e
4 changed files with 346 additions and 17 deletions
@@ -0,0 +1,113 @@
+"""Regression: auto memory extraction must survive a runtime vector-store
+failure.
+
+The vector index reports `.healthy` only at init time. If the embedding
+backend dies later (OOM, model evicted, remote endpoint down), the per-fact
+`find_similar` / `add` calls raise. Before the fix these exceptions escaped the
+dedup loop, jumped past `memory_manager.save(...)`, and were swallowed by the
+function's outer try/except — so EVERY validated fact from the session was
+silently lost (the feature promises "Errors are logged, never raised", but it
+also quietly dropped all the data).
+
+After the fix a degraded vector store falls through to the text/fuzzy dedup
+path (which the code already maintains "when vector index is unavailable") and
+the facts still land in the JSON store.
+"""
+
+import asyncio
+import tempfile
+
+import src.llm_core
+import src.event_bus
+from src.memory import MemoryManager
+from services.memory.memory_extractor import extract_and_store
+
+
+class _FakeSession:
+    """Session double: fixed owner and session id plus a canned history.
+
+    The history holds two messages (one user turn, one assistant turn) so
+    that extraction proceeds.
+    """
+
+    session_id = "sess-1"
+    owner = "alice"
+
+    def get_context_messages(self):
+        """Return a freshly built two-message conversation history."""
+        user_turn = {"role": "user", "content": "Hi, a few things about me."}
+        assistant_turn = {"role": "assistant", "content": "Noted."}
+        return [user_turn, assistant_turn]
+
+
+class _BrokenVectorStore:
+    """Vector-store double that reports ``healthy`` but fails on every call.
+
+    Models a backend that looked fine at init and degraded afterwards: both
+    ``find_similar`` and ``add`` raise ``RuntimeError``.
+    """
+
+    healthy = True
+
+    def _fail(self):
+        # Single place for the simulated backend outage.
+        raise RuntimeError("embedding backend unavailable")
+
+    def find_similar(self, text, threshold=0.72):
+        """Always raise, as a dead embedding backend would on lookup."""
+        self._fail()
+
+    def add(self, memory_id, text):
+        """Always raise, as a dead embedding backend would on insert."""
+        self._fail()
+
+
+def _run(coro):
+    """Run *coro* to completion on a private event loop and return its result.
+
+    The loop is created per call and closed afterwards, so repeated calls do
+    not leave open event loops behind. The global event-loop policy is left
+    untouched.
+    """
+    loop = asyncio.new_event_loop()
+    try:
+        return loop.run_until_complete(coro)
+    finally:
+        # Close even when the coroutine raises, otherwise the loop leaks.
+        loop.close()
+
+
+def test_extraction_persists_facts_when_vector_store_fails_at_runtime(monkeypatch):
+    """Both extracted facts must be stored even though every vector op raises."""
+    llm_reply = (
+        '[{"text": "Alice lives in Lisbon", "category": "fact"}, '
+        '{"text": "Alice prefers tea over coffee", "category": "preference"}]'
+    )
+
+    async def _stub_llm(url, model, messages, **kwargs):
+        return llm_reply
+
+    def _ignore_event(*args, **kwargs):
+        return None
+
+    monkeypatch.setattr(src.llm_core, "llm_call_async", _stub_llm)
+    # fire_event touches an async event loop / disk, so swap in a no-op.
+    monkeypatch.setattr(src.event_bus, "fire_event", _ignore_event)
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        manager = MemoryManager(tmp_dir)
+        extraction = extract_and_store(
+            _FakeSession(),
+            manager,
+            _BrokenVectorStore(),
+            endpoint_url="http://x",
+            model="m",
+            headers=None,
+        )
+        _run(extraction)
+
+        # Read back while the temporary directory still exists.
+        saved_texts = set()
+        for entry in manager.load(owner="alice"):
+            saved_texts.add(entry["text"])
+
+    # The bug lost ALL of them (save() was never reached); both must survive.
+    assert "Alice lives in Lisbon" in saved_texts
+    assert "Alice prefers tea over coffee" in saved_texts
+
+
+def test_healthy_vector_store_still_dedups_normally(monkeypatch):
+    """Control: a vector-store match must still cause the fact to be skipped.
+
+    Guards against the try/except added around ``find_similar`` swallowing a
+    legitimate dedup hit.
+    """
+
+    class _DedupVectorStore:
+        healthy = True
+
+        def find_similar(self, text, threshold=0.72):
+            # Claim that every candidate already exists.
+            return "existing-id"
+
+        def add(self, memory_id, text):  # pragma: no cover - should not run
+            raise AssertionError("add should not be called for a deduped fact")
+
+    async def _stub_llm(url, model, messages, **kwargs):
+        return '[{"text": "Alice lives in Lisbon", "category": "fact"}]'
+
+    monkeypatch.setattr(src.llm_core, "llm_call_async", _stub_llm)
+    monkeypatch.setattr(src.event_bus, "fire_event", lambda *a, **k: None)
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        manager = MemoryManager(tmp_dir)
+        extraction = extract_and_store(
+            _FakeSession(),
+            manager,
+            _DedupVectorStore(),
+            endpoint_url="http://x",
+            model="m",
+            headers=None,
+        )
+        _run(extraction)
+
+        # Nothing may be stored for alice once the only fact was deduped.
+        remaining = manager.load(owner="alice")
+        assert remaining == []