mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 02:05:22 -04:00
fix: data integrity — deep-research result parsing + memory-extraction durability (#808)
Two independent data-integrity bugs:
- services/research/service.py: ResearchService.research() (the public deep-research
API, re-exported from services/__init__) treated the handler return value as a
dict (result.get("sources"/"summary"/...)), but call_research_service() returns a
formatted markdown STRING -> AttributeError: 'str' object has no attribute 'get' on EVERY
successful call, making the API unusable for any non-error result. Now uses the
string report as the summary and parses sources from the "### Sources" markdown
section (section-bounded, URL-deduped), with a defensive dict branch for back-compat.
- services/memory/memory_extractor.py: extract_and_store guarded the vector-store
find_similar/add calls only with the .healthy flag set ONCE at init. If the
embedding/ChromaDB backend degraded LATER (OOM, evicted model, remote endpoint
down), those calls raised, the exception escaped the dedup loop, skipped
memory_manager.save(), and was swallowed by the outer try/except -> EVERY
validated fact from the session was silently lost (the function docstring
promises "never raised"). Now falls back to the existing text/fuzzy dedup so
facts are still saved when the vector index is unavailable at runtime.
Tests: test_research_service.py, test_memory_extractor_vector_degraded.py.
This commit is contained in:
@@ -1,11 +1,16 @@
|
||||
# services/research/service.py
|
||||
"""Research service — deep research with LLM-in-the-loop."""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Callable
|
||||
|
||||
from .research_handler import ResearchHandler
|
||||
|
||||
# Markdown source links emitted by ResearchHandler._format_research_report,
|
||||
# e.g. "- [Some Title](https://example.com/page)".
|
||||
# Matches one whole line consisting of a markdown bullet link, with optional
# surrounding whitespace: "- [title](url)". Named groups: "title" and "url".
# Limitations visible in the pattern: the title may not contain "]" and the URL
# may not contain ")", so a line whose URL includes parentheses does not match.
_SOURCE_LINK_RE = re.compile(r"^\s*-\s*\[(?P<title>[^\]]*)\]\((?P<url>[^)]+)\)\s*$")
|
||||
|
||||
|
||||
@dataclass
|
||||
class ResearchSource:
|
||||
@@ -75,26 +80,70 @@ class ResearchService:
|
||||
|
||||
duration = time.time() - start
|
||||
|
||||
# Parse result into structured format
|
||||
sources = [
|
||||
ResearchSource(
|
||||
url=s.get("url", ""),
|
||||
title=s.get("title", ""),
|
||||
snippet=s.get("snippet", ""),
|
||||
relevance=s.get("relevance", 0.0),
|
||||
# call_research_service returns a formatted markdown report string
|
||||
# (see ResearchHandler.call_research_service -> _format_research_report),
|
||||
# not a dict. Treat it as such; tolerate an unexpected dict/None defensively.
|
||||
if isinstance(result, dict):
|
||||
sources = [
|
||||
ResearchSource(
|
||||
url=s.get("url", ""),
|
||||
title=s.get("title", ""),
|
||||
snippet=s.get("snippet", ""),
|
||||
relevance=s.get("relevance", 0.0),
|
||||
)
|
||||
for s in result.get("sources", [])
|
||||
]
|
||||
return ResearchResult(
|
||||
query=topic,
|
||||
summary=result.get("summary", result.get("answer", "")),
|
||||
sources=sources,
|
||||
sections=result.get("sections", []),
|
||||
tokens_used=result.get("tokens_used", 0),
|
||||
duration_seconds=duration,
|
||||
)
|
||||
for s in result.get("sources", [])
|
||||
]
|
||||
|
||||
report = result if isinstance(result, str) else ""
|
||||
return ResearchResult(
|
||||
query=topic,
|
||||
summary=result.get("summary", result.get("answer", "")),
|
||||
sources=sources,
|
||||
sections=result.get("sections", []),
|
||||
tokens_used=result.get("tokens_used", 0),
|
||||
summary=report,
|
||||
sources=self._parse_sources(report),
|
||||
duration_seconds=duration,
|
||||
)
|
||||
|
||||
@staticmethod
def _parse_sources(report: str) -> List[ResearchSource]:
    """Extract sources from the markdown "Sources" section of a report.

    Only bullet links of the form ``- [title](url)`` (as matched by the
    module-level ``_SOURCE_LINK_RE``) that appear under a heading whose
    text is "sources" (case-insensitive, any heading level) are collected.
    Any other heading closes the section, so inline links elsewhere in the
    body are not mistaken for sources.

    Args:
        report: The markdown report text. Falsy input (``""`` or ``None``)
            yields an empty list.

    Returns:
        One ``ResearchSource`` per distinct URL, in order of first
        appearance, each with an empty snippet.
    """
    if not report:
        return []

    sources: List[ResearchSource] = []
    seen_urls = set()
    in_sources = False

    for line in report.splitlines():
        stripped = line.strip()

        # Any markdown heading (level 1 and deeper) opens or closes the
        # section. Checking a single "#" prefix also covers "##"/"###" and
        # ensures a following top-level heading ends the Sources section.
        if stripped.startswith("#"):
            in_sources = stripped.lstrip("#").strip().lower() == "sources"
            continue

        if not in_sources:
            continue

        match = _SOURCE_LINK_RE.match(line)
        if match is None:
            continue

        url = match.group("url").strip()
        # Skip blank URLs and keep only the first occurrence of each URL.
        if not url or url in seen_urls:
            continue
        seen_urls.add(url)

        # snippet is required on ResearchSource; markdown source links
        # carry no snippet, so default to empty (matches the dict path).
        sources.append(
            ResearchSource(url=url, title=match.group("title").strip(), snippet="")
        )

    return sources
|
||||
|
||||
def start_background(
|
||||
self,
|
||||
session_id: str,
|
||||
|
||||
Reference in New Issue
Block a user