mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 18:25:26 -04:00
fix(embeddings): survive numpy embeddings when restoring a reset lane (#3410)
When a lane reset fails to rewrite the recreated collection, the recovery path
re-adds the preserved rows. It read the embeddings with
`preserved.get("embeddings") or []` and gated the loop with
`if ids and docs and old_embeddings:`. chromadb returns embeddings as a numpy
ndarray, whose truth value is ambiguous, so both expressions raise ValueError
inside the except block — the restore is abandoned and every preserved row is
lost (the collection was already deleted), exactly when the code is trying to
avoid data loss.
Use an explicit `is None` check and `len(...)`, and convert ndarray batches to
lists before re-adding.
Adds tests/test_embedding_lane_ndarray_restore.py (preserved embeddings come
back as np.ndarray); existing test_embedding_lanes.py still passes.
This commit is contained in:
committed by
GitHub
parent
2fdb4813db
commit
3c4ec8828b
+11
-2
@@ -196,13 +196,22 @@ def _get_or_reset_collection(chroma_client, name: str, metadata: Dict[str, Any],
|
|||||||
try:
|
try:
|
||||||
chroma_client.delete_collection(name)
|
chroma_client.delete_collection(name)
|
||||||
restored = chroma_client.get_or_create_collection(name=name, metadata=current)
|
restored = chroma_client.get_or_create_collection(name=name, metadata=current)
|
||||||
old_embeddings = preserved.get("embeddings") or []
|
# chromadb returns embeddings as a numpy ndarray, whose truth value
|
||||||
if ids and docs and old_embeddings:
|
# is ambiguous — `preserved.get("embeddings") or []` and a bare
|
||||||
|
# `if ... and old_embeddings:` both raise ValueError, which aborts
|
||||||
|
# the restore and loses the rows the reset was supposed to keep.
|
||||||
|
# Use explicit None/len checks instead.
|
||||||
|
old_embeddings = preserved.get("embeddings")
|
||||||
|
if old_embeddings is None:
|
||||||
|
old_embeddings = []
|
||||||
|
if ids and docs and len(old_embeddings):
|
||||||
for start in range(0, len(ids), 100):
|
for start in range(0, len(ids), 100):
|
||||||
batch_ids = ids[start:start + 100]
|
batch_ids = ids[start:start + 100]
|
||||||
batch_docs = docs[start:start + 100]
|
batch_docs = docs[start:start + 100]
|
||||||
batch_metas = metas[start:start + 100]
|
batch_metas = metas[start:start + 100]
|
||||||
batch_embeddings = old_embeddings[start:start + 100]
|
batch_embeddings = old_embeddings[start:start + 100]
|
||||||
|
if hasattr(batch_embeddings, "tolist"):
|
||||||
|
batch_embeddings = batch_embeddings.tolist()
|
||||||
if len(batch_metas) < len(batch_ids):
|
if len(batch_metas) < len(batch_ids):
|
||||||
batch_metas += [{}] * (len(batch_ids) - len(batch_metas))
|
batch_metas += [{}] * (len(batch_ids) - len(batch_metas))
|
||||||
restored.add(
|
restored.add(
|
||||||
|
|||||||
@@ -0,0 +1,68 @@
|
|||||||
|
"""Embedding-lane reset must restore rows even when chromadb returns the
|
||||||
|
preserved embeddings as a numpy ndarray.
|
||||||
|
|
||||||
|
Real chromadb returns collection.get(include=["embeddings"]) as a numpy
|
||||||
|
ndarray. The restore-after-failed-rewrite path used `embeddings or []` and a
|
||||||
|
bare `if ... and embeddings:`, both of which raise
|
||||||
|
"truth value of an array ... is ambiguous" on an ndarray — aborting the
|
||||||
|
restore and wiping the collection the reset was meant to preserve.
|
||||||
|
|
||||||
|
This mirrors test_lane_reset_restores_existing_collection_when_rewrite_fails
|
||||||
|
in test_embedding_lanes.py, but the preserved embeddings come back as ndarray.
|
||||||
|
"""
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from src.embedding_lanes import build_embedding_lanes
|
||||||
|
from tests.test_embedding_lanes import FakeChroma, FakeEmbedder, _patch_chroma
|
||||||
|
|
||||||
|
|
||||||
|
def test_lane_reset_restores_when_chroma_returns_numpy_embeddings(monkeypatch):
|
||||||
|
fake = FakeChroma()
|
||||||
|
old_custom = fake.get_or_create_collection(
|
||||||
|
"odysseus_memories_custom",
|
||||||
|
metadata={
|
||||||
|
"embedding_lane": "custom",
|
||||||
|
"embedding_dimension": 384,
|
||||||
|
"embedding_fingerprint": "old",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
old_custom.add(
|
||||||
|
ids=["existing-memory"],
|
||||||
|
embeddings=[[0.0] * 384],
|
||||||
|
documents=["existing custom memory"],
|
||||||
|
metadatas=[{"source": "memory"}],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Make the preserved embeddings come back as a numpy ndarray, like real
|
||||||
|
# chromadb does.
|
||||||
|
real_get = old_custom.get
|
||||||
|
|
||||||
|
def ndarray_get(*args, **kwargs):
|
||||||
|
result = real_get(*args, **kwargs)
|
||||||
|
result["embeddings"] = np.array(result["embeddings"])
|
||||||
|
return result
|
||||||
|
|
||||||
|
old_custom.get = ndarray_get
|
||||||
|
|
||||||
|
# Force the post-reset rewrite to fail so the restore branch runs.
|
||||||
|
fake.fail_next_add_for["odysseus_memories_custom"] = 1
|
||||||
|
_patch_chroma(monkeypatch, fake)
|
||||||
|
|
||||||
|
import src.embedding_lanes as lanes
|
||||||
|
|
||||||
|
monkeypatch.setattr(lanes, "_build_custom_client", lambda: FakeEmbedder(768, "nomic", "http://embeddings/v1"))
|
||||||
|
|
||||||
|
def fail_fastembed():
|
||||||
|
raise RuntimeError("fastembed missing")
|
||||||
|
|
||||||
|
monkeypatch.setattr(lanes, "_build_fastembed_client", fail_fastembed)
|
||||||
|
|
||||||
|
built = build_embedding_lanes("odysseus_memories")
|
||||||
|
|
||||||
|
# Both lanes are unavailable, but the existing row must survive — not be
|
||||||
|
# wiped by an ndarray-truthiness crash in the restore path.
|
||||||
|
assert built == []
|
||||||
|
restored = fake.collections["odysseus_memories_custom"]
|
||||||
|
assert restored.count() == 1
|
||||||
|
assert restored.get()["ids"] == ["existing-memory"]
|
||||||
|
assert len(restored.rows["existing-memory"]["embedding"]) == 384
|
||||||
Reference in New Issue
Block a user