mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 02:05:22 -04:00
Fix auto-memory vector dedup across tenants
Ensure vector dedup only suppresses a memory when the matched JSON memory belongs to the same owner or is legacy unowned. Cross-owner vector hits now fall through to the existing owner-scoped text/fuzzy dedup path, preventing one user's memory from blocking another user's similar fact. Fixes #2114.
This commit is contained in:
@@ -345,8 +345,17 @@ async def extract_and_store(
|
|||||||
logger.warning(f"Memory dedup (vector) unavailable, using text fallback: {e}")
|
logger.warning(f"Memory dedup (vector) unavailable, using text fallback: {e}")
|
||||||
existing_id = None
|
existing_id = None
|
||||||
if existing_id:
|
if existing_id:
|
||||||
logger.debug(f"Memory dedup (vector): '{fact_text[:50]}' matches {existing_id}")
|
# The vector store is a single shared collection with no
|
||||||
continue
|
# owner metadata, so find_similar can return ANOTHER
|
||||||
|
# tenant's memory. Only treat it as a duplicate when the
|
||||||
|
# match is this user's own (or a legacy unowned) memory —
|
||||||
|
# otherwise the user's freshly-extracted fact would be
|
||||||
|
# silently dropped. Mirror the owner predicate used by the
|
||||||
|
# text dedup below; cross-tenant/stale matches fall through.
|
||||||
|
_match = next((e for e in existing if e.get("id") == existing_id), None)
|
||||||
|
if _match is not None and (_match.get("owner") == _owner or _match.get("owner") is None):
|
||||||
|
logger.debug(f"Memory dedup (vector): '{fact_text[:50]}' matches {existing_id}")
|
||||||
|
continue
|
||||||
|
|
||||||
# Text dedup fallback: exact match + fuzzy similarity
|
# Text dedup fallback: exact match + fuzzy similarity
|
||||||
user_existing = [e for e in existing if e.get("owner") == _owner or e.get("owner") is None] if _owner else existing
|
user_existing = [e for e in existing if e.get("owner") == _owner or e.get("owner") is None] if _owner else existing
|
||||||
|
|||||||
@@ -0,0 +1,115 @@
|
|||||||
|
"""Regression: auto-memory vector dedup must not drop a user's fact because it
|
||||||
|
matches ANOTHER tenant's memory.
|
||||||
|
|
||||||
|
`extract_and_store` dedups each extracted fact against the vector store first.
|
||||||
|
The vector store (`memory_vector`) is a single shared ChromaDB collection with
|
||||||
|
no owner in its metadata, so `find_similar` can return a memory_id belonging to
|
||||||
|
a different user. The old code `continue`d (skipped storing) on any vector hit
|
||||||
|
without checking ownership, so user B's freshly-extracted fact was silently
|
||||||
|
dropped when it was merely semantically similar to user A's memory. The text
|
||||||
|
dedup fallback right below is already owner-scoped; the vector path must be too.
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import importlib.util
|
||||||
|
import sys
|
||||||
|
import types
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
ROOT = Path(__file__).resolve().parents[1]
|
||||||
|
|
||||||
|
|
||||||
|
def _load_extractor():
|
||||||
|
# Load services/memory/memory_extractor.py directly by path so we don't
|
||||||
|
# trigger services/__init__ (which imports the search stack and its heavy
|
||||||
|
# optional deps). The module's only module-level imports are stdlib; its
|
||||||
|
# src.llm_core / src.event_bus imports are lazy and stubbed/guarded.
|
||||||
|
path = ROOT / "services" / "memory" / "memory_extractor.py"
|
||||||
|
spec = importlib.util.spec_from_file_location("memory_extractor_under_test", path)
|
||||||
|
mod = importlib.util.module_from_spec(spec)
|
||||||
|
spec.loader.exec_module(mod)
|
||||||
|
return mod
|
||||||
|
|
||||||
|
|
||||||
|
def _install_llm_stub(monkeypatch, facts_json):
|
||||||
|
mod = types.ModuleType("src.llm_core")
|
||||||
|
|
||||||
|
async def llm_call_async(*a, **k):
|
||||||
|
return facts_json
|
||||||
|
|
||||||
|
mod.llm_call_async = llm_call_async
|
||||||
|
# Use monkeypatch.setitem so sys.modules is restored at teardown. A raw
|
||||||
|
# assignment here permanently replaced the real src.llm_core with this
|
||||||
|
# stripped stub, leaking "My home is in Lisbon" (and hiding _detect_provider)
|
||||||
|
# into every later-collected test that imports the real module.
|
||||||
|
src_pkg = sys.modules.get("src") or types.ModuleType("src")
|
||||||
|
monkeypatch.setitem(sys.modules, "src", src_pkg)
|
||||||
|
monkeypatch.setitem(sys.modules, "src.llm_core", mod)
|
||||||
|
|
||||||
|
|
||||||
|
class FakeSession:
|
||||||
|
def __init__(self, owner):
|
||||||
|
self.owner = owner
|
||||||
|
|
||||||
|
def get_context_messages(self):
|
||||||
|
return [
|
||||||
|
{"role": "user", "content": "Tell me where I live."},
|
||||||
|
{"role": "assistant", "content": "Noted."},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class FakeMemoryManager:
|
||||||
|
def __init__(self, rows):
|
||||||
|
self.rows = list(rows)
|
||||||
|
self._n = 0
|
||||||
|
|
||||||
|
def load_all(self):
|
||||||
|
return list(self.rows)
|
||||||
|
|
||||||
|
def load(self, owner=None):
|
||||||
|
return [r for r in self.rows if r.get("owner") == owner]
|
||||||
|
|
||||||
|
def find_duplicates(self, text, subset):
|
||||||
|
t = text.strip().lower()
|
||||||
|
return [r for r in subset if r.get("text", "").strip().lower() == t]
|
||||||
|
|
||||||
|
def add_entry(self, text, source="auto", category="fact", owner=None):
|
||||||
|
self._n += 1
|
||||||
|
entry = {"id": f"new-{self._n}", "text": text, "owner": owner,
|
||||||
|
"source": source, "category": category}
|
||||||
|
self.rows.append(entry)
|
||||||
|
return entry
|
||||||
|
|
||||||
|
|
||||||
|
class FakeVector:
|
||||||
|
"""Healthy vector store whose find_similar always matches user A's memory."""
|
||||||
|
def __init__(self, match_id):
|
||||||
|
self.healthy = True
|
||||||
|
self._match_id = match_id
|
||||||
|
|
||||||
|
def find_similar(self, text, threshold=0.92):
|
||||||
|
return self._match_id
|
||||||
|
|
||||||
|
|
||||||
|
def test_vector_match_from_other_tenant_does_not_drop_users_fact(monkeypatch):
|
||||||
|
# User A already owns a semantically-similar memory.
|
||||||
|
mm = FakeMemoryManager([
|
||||||
|
{"id": "a1", "text": "I live in Lisbon", "owner": "userA"},
|
||||||
|
])
|
||||||
|
# The vector store reports user B's new fact as a near-duplicate of a1.
|
||||||
|
vec = FakeVector(match_id="a1")
|
||||||
|
_install_llm_stub(monkeypatch, '["My home is in Lisbon"]')
|
||||||
|
|
||||||
|
memory_extractor = _load_extractor()
|
||||||
|
|
||||||
|
asyncio.run(memory_extractor.extract_and_store(
|
||||||
|
FakeSession(owner="userB"), mm, vec,
|
||||||
|
endpoint_url="http://x", model="m",
|
||||||
|
))
|
||||||
|
|
||||||
|
b_texts = {r["text"] for r in mm.load(owner="userB")}
|
||||||
|
assert "My home is in Lisbon" in b_texts, (
|
||||||
|
"User B's own extracted fact was dropped because the shared vector "
|
||||||
|
"store matched user A's memory (cross-tenant dedup)."
|
||||||
|
)
|
||||||
@@ -86,8 +86,12 @@ def test_extraction_persists_facts_when_vector_store_fails_at_runtime(monkeypatc
|
|||||||
|
|
||||||
|
|
||||||
def test_healthy_vector_store_still_dedups_normally(monkeypatch):
|
def test_healthy_vector_store_still_dedups_normally(monkeypatch):
|
||||||
"""Control: when find_similar reports a match, that fact is skipped — the
|
"""Control: a vector hit on the user's OWN memory is honored (deduped) and
|
||||||
try/except added around it must not swallow a legitimate dedup hit."""
|
add is not called. The vector store is a shared collection with no owner
|
||||||
|
metadata, so a hit is only treated as a duplicate when the matched id
|
||||||
|
resolves to this user's own (or legacy unowned) memory — otherwise the
|
||||||
|
fact would be a cross-tenant false drop. Here the match is alice's own
|
||||||
|
memory, so the dedup must still fire."""
|
||||||
|
|
||||||
async def _fake_llm(url, model, messages, **kwargs):
|
async def _fake_llm(url, model, messages, **kwargs):
|
||||||
return '[{"text": "Alice lives in Lisbon", "category": "fact"}]'
|
return '[{"text": "Alice lives in Lisbon", "category": "fact"}]'
|
||||||
@@ -95,19 +99,27 @@ def test_healthy_vector_store_still_dedups_normally(monkeypatch):
|
|||||||
monkeypatch.setattr(src.llm_core, "llm_call_async", _fake_llm)
|
monkeypatch.setattr(src.llm_core, "llm_call_async", _fake_llm)
|
||||||
monkeypatch.setattr(src.event_bus, "fire_event", lambda *a, **k: None)
|
monkeypatch.setattr(src.event_bus, "fire_event", lambda *a, **k: None)
|
||||||
|
|
||||||
class _DedupVectorStore:
|
|
||||||
healthy = True
|
|
||||||
|
|
||||||
def find_similar(self, text, threshold=0.72):
|
|
||||||
return "existing-id" # claim it already exists
|
|
||||||
|
|
||||||
def add(self, memory_id, text): # pragma: no cover - should not run
|
|
||||||
raise AssertionError("add should not be called for a deduped fact")
|
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as data_dir:
|
with tempfile.TemporaryDirectory() as data_dir:
|
||||||
mgr = MemoryManager(data_dir)
|
mgr = MemoryManager(data_dir)
|
||||||
|
# Seed alice's own memory (persisted so load_all sees it) and point
|
||||||
|
# find_similar at its real id.
|
||||||
|
seeded = mgr.add_entry("Alice's home city is Lisbon", source="auto",
|
||||||
|
category="fact", owner="alice")
|
||||||
|
mgr.save([seeded])
|
||||||
|
|
||||||
|
class _DedupVectorStore:
|
||||||
|
healthy = True
|
||||||
|
|
||||||
|
def find_similar(self, text, threshold=0.72):
|
||||||
|
return seeded["id"] # matches alice's own seeded memory
|
||||||
|
|
||||||
|
def add(self, memory_id, text): # pragma: no cover - should not run
|
||||||
|
raise AssertionError("add should not be called for a deduped fact")
|
||||||
|
|
||||||
_run(extract_and_store(
|
_run(extract_and_store(
|
||||||
_FakeSession(), mgr, _DedupVectorStore(),
|
_FakeSession(), mgr, _DedupVectorStore(),
|
||||||
endpoint_url="http://x", model="m", headers=None,
|
endpoint_url="http://x", model="m", headers=None,
|
||||||
))
|
))
|
||||||
assert mgr.load(owner="alice") == []
|
# The new fact was deduped against alice's own memory, so only the
|
||||||
|
# seeded entry remains (no duplicate added).
|
||||||
|
assert [e["text"] for e in mgr.load(owner="alice")] == ["Alice's home city is Lisbon"]
|
||||||
|
|||||||
Reference in New Issue
Block a user