mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-29 16:12:06 -04:00
fix(embeddings): survive numpy embeddings when restoring a reset lane (#3410)
When a lane reset fails to rewrite the recreated collection, the recovery path
re-adds the preserved rows. It read the embeddings with
`preserved.get("embeddings") or []` and gated the loop with
`if ids and docs and old_embeddings:`. chromadb returns embeddings as a numpy
ndarray, whose truth value is ambiguous whenever it holds more than one
element, so both expressions raise ValueError inside the except block — the
restore is abandoned and every preserved row is lost (the collection was
already deleted), exactly when the code is trying to avoid data loss.
Use an explicit `is None` check and `len(...)`, and convert ndarray batches to
lists before re-adding.
Adds tests/test_embedding_lane_ndarray_restore.py (preserved embeddings come
back as np.ndarray); existing test_embedding_lanes.py still passes.
This commit is contained in:
committed by
GitHub
parent
2fdb4813db
commit
3c4ec8828b
+11
-2
@@ -196,13 +196,22 @@ def _get_or_reset_collection(chroma_client, name: str, metadata: Dict[str, Any],
|
||||
try:
|
||||
chroma_client.delete_collection(name)
|
||||
restored = chroma_client.get_or_create_collection(name=name, metadata=current)
|
||||
old_embeddings = preserved.get("embeddings") or []
|
||||
if ids and docs and old_embeddings:
|
||||
# chromadb returns embeddings as a numpy ndarray, whose truth value
|
||||
# is ambiguous — `preserved.get("embeddings") or []` and a bare
|
||||
# `if ... and old_embeddings:` both raise ValueError, which aborts
|
||||
# the restore and loses the rows the reset was supposed to keep.
|
||||
# Use explicit None/len checks instead.
|
||||
old_embeddings = preserved.get("embeddings")
|
||||
if old_embeddings is None:
|
||||
old_embeddings = []
|
||||
if ids and docs and len(old_embeddings):
|
||||
for start in range(0, len(ids), 100):
|
||||
batch_ids = ids[start:start + 100]
|
||||
batch_docs = docs[start:start + 100]
|
||||
batch_metas = metas[start:start + 100]
|
||||
batch_embeddings = old_embeddings[start:start + 100]
|
||||
if hasattr(batch_embeddings, "tolist"):
|
||||
batch_embeddings = batch_embeddings.tolist()
|
||||
if len(batch_metas) < len(batch_ids):
|
||||
batch_metas += [{}] * (len(batch_ids) - len(batch_metas))
|
||||
restored.add(
|
||||
|
||||
Reference in New Issue
Block a user