mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 09:45:24 -04:00
fix: data integrity — deep-research result parsing + memory-extraction durability (#808)
Two independent data-integrity bugs:
- services/research/service.py: ResearchService.research() (the public deep-research
API, re-exported from services/__init__) treated the handler return value as a
dict (result.get("sources"/"summary"/...)), but call_research_service() returns a
formatted markdown STRING -> AttributeError ('str' object has no attribute 'get') on EVERY
successful call, making the API unusable for any non-error result. Now uses the
string report as the summary and parses sources from the "### Sources" markdown
section (section-bounded, URL-deduped), with a defensive dict branch for back-compat.
- services/memory/memory_extractor.py: extract_and_store guarded the vector-store
find_similar/add calls only with the .healthy flag set ONCE at init. If the
embedding/ChromaDB backend degraded LATER (OOM, evicted model, remote endpoint
down), those calls raised, the exception escaped the dedup loop, skipped
memory_manager.save(), and was swallowed by the outer try/except -> EVERY
validated fact from the session was silently lost (the function docstring
promises "never raised"). Now falls back to the existing text/fuzzy dedup so
facts are still saved when the vector index is unavailable at runtime.
Tests: test_research_service.py, test_memory_extractor_vector_degraded.py.
This commit is contained in:
@@ -303,9 +303,18 @@ async def extract_and_store(
             if not fact_text or len(fact_text) < 5:
                 continue
 
-            # Dedup: check vector similarity first (fast), then exact text match
+            # Dedup: check vector similarity first (fast), then exact text match.
+            # A runtime embedding/ChromaDB failure (backend OOM, model evicted,
+            # remote endpoint down) must not abort the whole batch — fall through
+            # to the text/fuzzy dedup below instead of losing every validated
+            # fact extracted this session. (`.healthy` is only set at init, so
+            # it does not catch failures that develop later.)
             if memory_vector and memory_vector.healthy:
-                existing_id = memory_vector.find_similar(fact_text, threshold=0.72)
+                try:
+                    existing_id = memory_vector.find_similar(fact_text, threshold=0.72)
+                except Exception as e:
+                    logger.warning(f"Memory dedup (vector) unavailable, using text fallback: {e}")
+                    existing_id = None
                 if existing_id:
                     logger.debug(f"Memory dedup (vector): '{fact_text[:50]}' matches {existing_id}")
                     continue
@@ -330,9 +339,14 @@ async def extract_and_store(
 
             existing.append(entry)
 
-            # Add to vector index
+            # Add to vector index. The JSON store (saved below) is the source of
+            # truth and the keyword path can still retrieve this entry, so a vector
+            # write failure must not drop the fact or abort the remaining batch.
             if memory_vector and memory_vector.healthy:
-                memory_vector.add(entry["id"], fact_text)
+                try:
+                    memory_vector.add(entry["id"], fact_text)
+                except Exception as e:
+                    logger.warning(f"Memory vector add failed for {entry['id']}: {e}")
 
             added += 1
 
Reference in New Issue
Block a user