diff --git a/src/embedding_lanes.py b/src/embedding_lanes.py index bca4eaef2..f23be32b8 100644 --- a/src/embedding_lanes.py +++ b/src/embedding_lanes.py @@ -196,13 +196,22 @@ def _get_or_reset_collection(chroma_client, name: str, metadata: Dict[str, Any], try: chroma_client.delete_collection(name) restored = chroma_client.get_or_create_collection(name=name, metadata=current) - old_embeddings = preserved.get("embeddings") or [] - if ids and docs and old_embeddings: + # chromadb returns embeddings as a numpy ndarray, whose truth value + # is ambiguous — `preserved.get("embeddings") or []` and a bare + # `if ... and old_embeddings:` both raise ValueError, which aborts + # the restore and loses the rows the reset was supposed to keep. + # Use explicit None/len checks instead. + old_embeddings = preserved.get("embeddings") + if old_embeddings is None: + old_embeddings = [] + if ids and docs and len(old_embeddings): for start in range(0, len(ids), 100): batch_ids = ids[start:start + 100] batch_docs = docs[start:start + 100] batch_metas = metas[start:start + 100] batch_embeddings = old_embeddings[start:start + 100] + if hasattr(batch_embeddings, "tolist"): + batch_embeddings = batch_embeddings.tolist() if len(batch_metas) < len(batch_ids): batch_metas += [{}] * (len(batch_ids) - len(batch_metas)) restored.add( diff --git a/tests/test_embedding_lane_ndarray_restore.py b/tests/test_embedding_lane_ndarray_restore.py new file mode 100644 index 000000000..710a4c92b --- /dev/null +++ b/tests/test_embedding_lane_ndarray_restore.py @@ -0,0 +1,68 @@ +"""Embedding-lane reset must restore rows even when chromadb returns the +preserved embeddings as a numpy ndarray. + +Real chromadb's collection.get(include=["embeddings"]) returns them as a numpy +ndarray. The restore-after-failed-rewrite path used `embeddings or []` and a +bare `if ... and embeddings:`, both of which raise +"truth value of an array ... 
is ambiguous" on an ndarray — aborting the +restore and wiping the collection the reset was meant to preserve. + +This mirrors test_lane_reset_restores_existing_collection_when_rewrite_fails +in test_embedding_lanes.py, but the preserved embeddings come back as ndarray. +""" +import numpy as np + +from src.embedding_lanes import build_embedding_lanes +from tests.test_embedding_lanes import FakeChroma, FakeEmbedder, _patch_chroma + + +def test_lane_reset_restores_when_chroma_returns_numpy_embeddings(monkeypatch): + fake = FakeChroma() + old_custom = fake.get_or_create_collection( + "odysseus_memories_custom", + metadata={ + "embedding_lane": "custom", + "embedding_dimension": 384, + "embedding_fingerprint": "old", + }, + ) + old_custom.add( + ids=["existing-memory"], + embeddings=[[0.0] * 384], + documents=["existing custom memory"], + metadatas=[{"source": "memory"}], + ) + + # Make the preserved embeddings come back as a numpy ndarray, like real + # chromadb does. + real_get = old_custom.get + + def ndarray_get(*args, **kwargs): + result = real_get(*args, **kwargs) + result["embeddings"] = np.array(result["embeddings"]) + return result + + old_custom.get = ndarray_get + + # Force the post-reset rewrite to fail so the restore branch runs. + fake.fail_next_add_for["odysseus_memories_custom"] = 1 + _patch_chroma(monkeypatch, fake) + + import src.embedding_lanes as lanes + + monkeypatch.setattr(lanes, "_build_custom_client", lambda: FakeEmbedder(768, "nomic", "http://embeddings/v1")) + + def fail_fastembed(): + raise RuntimeError("fastembed missing") + + monkeypatch.setattr(lanes, "_build_fastembed_client", fail_fastembed) + + built = build_embedding_lanes("odysseus_memories") + + # Both lanes are unavailable, but the existing row must survive — not be + # wiped by an ndarray-truthiness crash in the restore path. 
+ assert built == [] + restored = fake.collections["odysseus_memories_custom"] + assert restored.count() == 1 + assert restored.get()["ids"] == ["existing-memory"] + assert len(restored.rows["existing-memory"]["embedding"]) == 384