mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 17:55:26 -04:00
fix: data integrity — deep-research result parsing + memory-extraction durability (#808)
Two independent data-integrity bugs:
- services/research/service.py: ResearchService.research() (the public deep-research
API, re-exported from services/__init__) treated the handler return value as a
dict (result.get("sources"/"summary"/...)), but call_research_service() returns a
formatted markdown STRING -> AttributeError ('str' object has no attribute 'get') on EVERY
successful call, making the API unusable for any non-error result. Now uses the
string report as the summary and parses sources from the "### Sources" markdown
section (section-bounded, URL-deduped), with a defensive dict branch for back-compat.
- services/memory/memory_extractor.py: extract_and_store guarded the vector-store
find_similar/add calls only with the .healthy flag set ONCE at init. If the
embedding/ChromaDB backend degraded LATER (OOM, evicted model, remote endpoint
down), those calls raised, the exception escaped the dedup loop, skipped
memory_manager.save(), and was swallowed by the outer try/except -> EVERY
validated fact from the session was silently lost (the function docstring
promises "never raised"). Now falls back to the existing text/fuzzy dedup so
facts are still saved when the vector index is unavailable at runtime.
Tests: test_research_service.py, test_memory_extractor_vector_degraded.py.
This commit is contained in:
@@ -0,0 +1,113 @@
|
||||
"""Regression: auto memory extraction must survive a runtime vector-store
|
||||
failure.
|
||||
|
||||
The vector index reports `.healthy` only at init time. If the embedding
|
||||
backend dies later (OOM, model evicted, remote endpoint down), the per-fact
|
||||
`find_similar` / `add` calls raise. Before the fix these exceptions escaped the
|
||||
dedup loop, jumped past `memory_manager.save(...)`, and were swallowed by the
|
||||
function's outer try/except — so EVERY validated fact from the session was
|
||||
silently lost (the feature promises "Errors are logged, never raised", but it
|
||||
also quietly dropped all the data).
|
||||
|
||||
After the fix a degraded vector store falls through to the text/fuzzy dedup
|
||||
path (which the code already maintains "when vector index is unavailable") and
|
||||
the facts still land in the JSON store.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import tempfile
|
||||
|
||||
import src.llm_core
|
||||
import src.event_bus
|
||||
from src.memory import MemoryManager
|
||||
from services.memory.memory_extractor import extract_and_store
|
||||
|
||||
|
||||
class _FakeSession:
    """Stub session carrying a two-message history so extraction proceeds.

    Exposes only the attributes and the single method defined here; the
    values are fixed so assertions in the tests can rely on them.
    """

    owner = "alice"
    session_id = "sess-1"

    def get_context_messages(self):
        """Return a fresh list holding one user turn and one assistant turn."""
        user_turn = {"role": "user", "content": "Hi, a few things about me."}
        assistant_turn = {"role": "assistant", "content": "Noted."}
        return [user_turn, assistant_turn]
|
||||
|
||||
|
||||
class _BrokenVectorStore:
    """Vector-store double that reports healthy but fails on every operation.

    ``healthy`` stays ``True`` while both ``find_similar`` and ``add`` raise,
    which models a backend that passed its init-time check and died later.
    """

    healthy = True

    @staticmethod
    def _unavailable():
        """Raise the error shared by every embedding-backed operation."""
        raise RuntimeError("embedding backend unavailable")

    def find_similar(self, text, threshold=0.72):
        """Always raise ``RuntimeError``; the arguments are ignored."""
        self._unavailable()

    def add(self, memory_id, text):
        """Always raise ``RuntimeError``; the arguments are ignored."""
        self._unavailable()
|
||||
|
||||
|
||||
def _run(coro):
    """Run *coro* to completion on a private event loop and return its result.

    A brand-new loop is created for each call and is always closed afterwards,
    even when the coroutine raises. Leaving it open would leak the loop's
    selector and self-pipe file descriptors and emit ``ResourceWarning`` for
    every test that uses this helper.

    The loop is deliberately not installed as the current event loop, so the
    process-wide loop state is left untouched.
    """
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(coro)
    finally:
        loop.close()
|
||||
|
||||
|
||||
def test_extraction_persists_facts_when_vector_store_fails_at_runtime(monkeypatch):
    """Facts returned by the LLM must be stored even if the vector store raises.

    The LLM call is replaced with a stub returning two facts, and the vector
    store passed in raises on both ``find_similar`` and ``add``. Both facts
    are then expected in what ``MemoryManager.load`` returns for the owner.
    """
    expected_facts = (
        "Alice lives in Lisbon",
        "Alice prefers tea over coffee",
    )
    facts_json = (
        '[{"text": "Alice lives in Lisbon", "category": "fact"}, '
        '{"text": "Alice prefers tea over coffee", "category": "preference"}]'
    )

    async def _canned_llm(url, model, messages, **kwargs):
        return facts_json

    def _ignore_event(*args, **kwargs):
        return None

    monkeypatch.setattr(src.llm_core, "llm_call_async", _canned_llm)
    # fire_event touches an async event loop / disk — neutralize it.
    monkeypatch.setattr(src.event_bus, "fire_event", _ignore_event)

    with tempfile.TemporaryDirectory() as tmp_dir:
        manager = MemoryManager(tmp_dir)
        extraction = extract_and_store(
            _FakeSession(),
            manager,
            _BrokenVectorStore(),
            endpoint_url="http://x",
            model="m",
            headers=None,
        )
        _run(extraction)

        stored_texts = {entry["text"] for entry in manager.load(owner="alice")}

        # The bug lost ALL of them (save() was never reached); both must survive.
        for fact in expected_facts:
            assert fact in stored_texts
|
||||
|
||||
|
||||
def test_healthy_vector_store_still_dedups_normally(monkeypatch):
    """Control: when find_similar reports a match, that fact is skipped — the
    try/except added around it must not swallow a legitimate dedup hit.

    The vector-store double claims every fact already exists, so nothing may
    be persisted and ``add`` must never be invoked.
    """

    async def _fake_llm(url, model, messages, **kwargs):
        return '[{"text": "Alice lives in Lisbon", "category": "fact"}]'

    monkeypatch.setattr(src.llm_core, "llm_call_async", _fake_llm)
    monkeypatch.setattr(src.event_bus, "fire_event", lambda *a, **k: None)

    # extract_and_store is described in this file as logging errors rather
    # than raising them, so an AssertionError raised inside add() may never
    # reach pytest. Record the calls and assert on them from the test body,
    # where a failure cannot be swallowed.
    add_calls = []

    class _DedupVectorStore:
        healthy = True

        def find_similar(self, text, threshold=0.72):
            return "existing-id"  # claim it already exists

        def add(self, memory_id, text):  # pragma: no cover - should not run
            add_calls.append((memory_id, text))
            raise AssertionError("add should not be called for a deduped fact")

    with tempfile.TemporaryDirectory() as data_dir:
        mgr = MemoryManager(data_dir)
        _run(extract_and_store(
            _FakeSession(), mgr, _DedupVectorStore(),
            endpoint_url="http://x", model="m", headers=None,
        ))
        # Checked while the temporary directory still exists.
        assert mgr.load(owner="alice") == []

    assert add_calls == [], "add should not be called for a deduped fact"
|
||||
Reference in New Issue
Block a user