mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-22 04:35:29 -04:00
fix: split Chroma embedding lanes (#3046)
This commit is contained in:
+232
-153
@@ -14,6 +14,17 @@ import numpy as np
|
||||
from typing import List, Dict, Any, Optional, Set
|
||||
from pathlib import Path
|
||||
|
||||
from src.embedding_lanes import (
|
||||
LANE_CUSTOM,
|
||||
LANE_FASTEMBED,
|
||||
build_embedding_lanes,
|
||||
collection_name,
|
||||
dedupe_results,
|
||||
lane_count,
|
||||
migrate_legacy_collection,
|
||||
query_lanes,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_FILE_EXTENSIONS: Set[str] = {
|
||||
@@ -44,6 +55,7 @@ class VectorRAG:
|
||||
self.persist_directory = persist_directory
|
||||
self._collection = None
|
||||
self._model = None
|
||||
self._lanes = []
|
||||
self._healthy = False
|
||||
|
||||
Path(self.persist_directory).mkdir(parents=True, exist_ok=True)
|
||||
@@ -55,22 +67,20 @@ class VectorRAG:
|
||||
|
||||
def _initialize_system(self) -> bool:
|
||||
try:
|
||||
from src.chroma_client import get_chroma_client
|
||||
from src.embeddings import get_embedding_client
|
||||
|
||||
self._model = get_embedding_client()
|
||||
if self._model is None:
|
||||
raise RuntimeError("No embedding backend available")
|
||||
logger.info(f"Embedding: {self._model.url} model={self._model.model}")
|
||||
|
||||
client = get_chroma_client()
|
||||
self._collection = client.get_or_create_collection(
|
||||
name=COLLECTION_NAME,
|
||||
metadata={"hnsw:space": "cosine"},
|
||||
self._lanes = build_embedding_lanes(COLLECTION_NAME)
|
||||
if not self._lanes:
|
||||
raise RuntimeError("No embedding lanes available")
|
||||
self._collection = next(
|
||||
(lane.collection for lane in self._lanes if lane.name == LANE_FASTEMBED),
|
||||
self._lanes[0].collection,
|
||||
)
|
||||
self._model = self._lanes[0].client
|
||||
migrate_legacy_collection(COLLECTION_NAME, self._lanes)
|
||||
logger.info(
|
||||
"VectorRAG ready (lanes=%s docs=%s)",
|
||||
[lane.name for lane in self._lanes],
|
||||
lane_count(self._lanes),
|
||||
)
|
||||
|
||||
count = self._collection.count()
|
||||
logger.info(f"VectorRAG ready ({count} docs)")
|
||||
self._healthy = True
|
||||
return True
|
||||
|
||||
@@ -80,8 +90,9 @@ class VectorRAG:
|
||||
return False
|
||||
|
||||
def _embed(self, texts: List[str]) -> List[List[float]]:
|
||||
vecs = self._model.encode(texts, normalize_embeddings=True)
|
||||
return np.array(vecs, dtype=np.float32).tolist()
|
||||
if not self._lanes:
|
||||
return []
|
||||
return np.array(self._lanes[0].encode(texts), dtype=np.float32).tolist()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Properties
|
||||
@@ -89,13 +100,57 @@ class VectorRAG:
|
||||
|
||||
@property
|
||||
def healthy(self) -> bool:
|
||||
return self._healthy and self._collection is not None
|
||||
if getattr(self, "_lanes", None):
|
||||
return self._healthy and bool(self._lanes)
|
||||
return self._healthy and getattr(self, "_collection", None) is not None
|
||||
|
||||
@property
|
||||
def collection(self):
|
||||
"""Expose the ChromaDB collection for direct access by personal_routes etc."""
|
||||
return self._collection
|
||||
|
||||
def _active_collections(self):
|
||||
lanes = getattr(self, "_lanes", None)
|
||||
if lanes:
|
||||
return [(lane.name, lane.collection) for lane in lanes]
|
||||
collection = getattr(self, "_collection", None)
|
||||
return [("legacy", collection)] if collection is not None else []
|
||||
|
||||
def _collections_for_delete(self):
|
||||
collections = []
|
||||
seen = set()
|
||||
|
||||
def add(lane_name: str, collection) -> None:
|
||||
if collection is None:
|
||||
return
|
||||
key = getattr(collection, "name", None) or id(collection)
|
||||
if key in seen:
|
||||
return
|
||||
seen.add(key)
|
||||
collections.append((lane_name, collection))
|
||||
|
||||
for lane_name, collection in self._active_collections():
|
||||
add(lane_name, collection)
|
||||
|
||||
if getattr(self, "_lanes", None):
|
||||
try:
|
||||
from src.chroma_client import get_chroma_client
|
||||
|
||||
client = get_chroma_client()
|
||||
try:
|
||||
add("legacy", client.get_collection(COLLECTION_NAME))
|
||||
except Exception:
|
||||
pass
|
||||
for lane_name in (LANE_CUSTOM, LANE_FASTEMBED):
|
||||
try:
|
||||
add(lane_name, client.get_collection(collection_name(COLLECTION_NAME, lane_name)))
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return collections
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Document operations
|
||||
# ------------------------------------------------------------------
|
||||
@@ -109,23 +164,24 @@ class VectorRAG:
|
||||
if not metadata or not isinstance(metadata, dict):
|
||||
return False
|
||||
|
||||
try:
|
||||
doc_id = _generate_doc_id(text, metadata.get("owner") or "")
|
||||
# Check if already exists
|
||||
existing = self._collection.get(ids=[doc_id])
|
||||
if existing["ids"]:
|
||||
return True # already exists
|
||||
embeddings = self._embed([text])
|
||||
self._collection.add(
|
||||
ids=[doc_id],
|
||||
embeddings=embeddings,
|
||||
documents=[text],
|
||||
metadatas=[metadata],
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"add_document failed: {e}")
|
||||
return False
|
||||
doc_id = _generate_doc_id(text, metadata.get("owner") or "")
|
||||
wrote = False
|
||||
for lane in self._lanes:
|
||||
try:
|
||||
existing = lane.collection.get(ids=[doc_id])
|
||||
if existing["ids"]:
|
||||
wrote = True
|
||||
continue
|
||||
lane.collection.add(
|
||||
ids=[doc_id],
|
||||
embeddings=lane.encode([text]),
|
||||
documents=[text],
|
||||
metadatas=[metadata],
|
||||
)
|
||||
wrote = True
|
||||
except Exception as e:
|
||||
logger.warning("add_document failed in %s lane: %s", lane.name, e)
|
||||
return wrote
|
||||
|
||||
def add_documents_batch(self, docs: List[tuple]) -> Dict[str, Any]:
|
||||
if not self.healthy:
|
||||
@@ -140,42 +196,57 @@ class VectorRAG:
|
||||
if not valid:
|
||||
return {"success": False, "message": "No valid documents"}
|
||||
|
||||
try:
|
||||
# Get existing IDs to avoid duplicates
|
||||
added_ids = set()
|
||||
attempted_new = False
|
||||
write_failed = False
|
||||
for lane in self._lanes:
|
||||
all_ids = [_generate_doc_id(t, m.get("owner") or "") for t, m in valid]
|
||||
try:
|
||||
existing = lane.collection.get(ids=all_ids)
|
||||
existing_ids = set(existing.get("ids") or [])
|
||||
except Exception:
|
||||
existing_ids = set()
|
||||
|
||||
new_texts = []
|
||||
new_metas = []
|
||||
new_ids = []
|
||||
for t, m in valid:
|
||||
doc_id = _generate_doc_id(t, m.get("owner") or "")
|
||||
existing = self._collection.get(ids=[doc_id])
|
||||
if not existing["ids"]:
|
||||
new_texts.append(t)
|
||||
new_metas.append(m)
|
||||
for (text, meta), doc_id in zip(valid, all_ids):
|
||||
if doc_id not in existing_ids:
|
||||
new_texts.append(text)
|
||||
new_metas.append(meta)
|
||||
new_ids.append(doc_id)
|
||||
|
||||
if new_texts:
|
||||
# Batch in chunks of 100
|
||||
attempted_new = True
|
||||
lane_failed = False
|
||||
for i in range(0, len(new_texts), 100):
|
||||
batch_texts = new_texts[i:i + 100]
|
||||
batch_ids = new_ids[i:i + 100]
|
||||
batch_metas = new_metas[i:i + 100]
|
||||
embeddings = self._embed(batch_texts)
|
||||
self._collection.add(
|
||||
ids=batch_ids,
|
||||
embeddings=embeddings,
|
||||
documents=batch_texts,
|
||||
metadatas=batch_metas,
|
||||
)
|
||||
try:
|
||||
lane.collection.add(
|
||||
ids=batch_ids,
|
||||
embeddings=lane.encode(batch_texts),
|
||||
documents=batch_texts,
|
||||
metadatas=batch_metas,
|
||||
)
|
||||
except Exception as e:
|
||||
lane_failed = True
|
||||
write_failed = True
|
||||
logger.warning("add_documents_batch failed in %s lane: %s", lane.name, e)
|
||||
break
|
||||
if not lane_failed:
|
||||
added_ids.update(new_ids)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"added_count": len(new_texts),
|
||||
"total_count": len(docs),
|
||||
"failed_count": len(docs) - len(valid),
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"add_documents_batch failed: {e}")
|
||||
return {"success": False, "message": str(e)}
|
||||
if attempted_new and write_failed and not added_ids:
|
||||
return {"success": False, "message": "No embedding lane accepted the batch"}
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"added_count": len(added_ids),
|
||||
"total_count": len(docs),
|
||||
"failed_count": len(docs) - len(valid),
|
||||
}
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Search — hybrid: vector similarity + keyword overlap
|
||||
@@ -186,58 +257,51 @@ class VectorRAG:
|
||||
return []
|
||||
if not query or not isinstance(query, str):
|
||||
return []
|
||||
if self._collection.count() == 0:
|
||||
if lane_count(self._lanes) == 0:
|
||||
return []
|
||||
|
||||
try:
|
||||
# Fetch extra candidates when owner-filtering
|
||||
fetch_k = min(k * 3, max(k, 20), self._collection.count())
|
||||
if owner:
|
||||
fetch_k = min(fetch_k * 2, self._collection.count())
|
||||
|
||||
query_embeddings = self._embed([query])
|
||||
|
||||
# Use ChromaDB where filter for owner if specified
|
||||
where_filter = {"owner": owner} if owner else None
|
||||
|
||||
results = self._collection.query(
|
||||
query_embeddings=query_embeddings,
|
||||
n_results=fetch_k,
|
||||
where=where_filter,
|
||||
include=["documents", "metadatas", "distances"],
|
||||
)
|
||||
|
||||
query_words = set(query.lower().split())
|
||||
candidates = []
|
||||
|
||||
for idx in range(len(results["ids"][0])):
|
||||
doc_id = results["ids"][0][idx]
|
||||
distance = results["distances"][0][idx]
|
||||
doc_text = results["documents"][0][idx]
|
||||
meta = results["metadatas"][0][idx]
|
||||
for lane, results in query_lanes(
|
||||
self._lanes,
|
||||
query,
|
||||
n_results=lambda lane: min(
|
||||
(k * 6 if owner else k * 3),
|
||||
max(k, 20),
|
||||
lane.count(),
|
||||
),
|
||||
where=where_filter,
|
||||
include=["documents", "metadatas", "distances"],
|
||||
raise_if_all_failed=True,
|
||||
):
|
||||
for idx in range(len(results["ids"][0])):
|
||||
doc_id = results["ids"][0][idx]
|
||||
distance = results["distances"][0][idx]
|
||||
doc_text = results["documents"][0][idx]
|
||||
meta = results["metadatas"][0][idx]
|
||||
|
||||
# ChromaDB cosine distance = 1 - cosine_similarity
|
||||
vector_sim = 1.0 - distance
|
||||
vector_sim = 1.0 - distance
|
||||
doc_words = set(doc_text.lower().split())
|
||||
overlap = len(query_words & doc_words)
|
||||
keyword_score = overlap / len(query_words) if query_words else 0.0
|
||||
hybrid_score = (VECTOR_WEIGHT * vector_sim) + (KEYWORD_WEIGHT * keyword_score)
|
||||
|
||||
# Keyword overlap score
|
||||
doc_words = set(doc_text.lower().split())
|
||||
overlap = len(query_words & doc_words)
|
||||
keyword_score = overlap / len(query_words) if query_words else 0.0
|
||||
|
||||
hybrid_score = (VECTOR_WEIGHT * vector_sim) + (KEYWORD_WEIGHT * keyword_score)
|
||||
|
||||
candidates.append({
|
||||
"id": doc_id,
|
||||
"document": doc_text,
|
||||
"metadata": meta,
|
||||
"distance": round(distance, 4),
|
||||
"similarity": round(hybrid_score, 4),
|
||||
"vector_similarity": round(vector_sim, 4),
|
||||
"keyword_score": round(keyword_score, 4),
|
||||
})
|
||||
candidates.append({
|
||||
"id": doc_id,
|
||||
"document": doc_text,
|
||||
"metadata": meta,
|
||||
"distance": round(distance, 4),
|
||||
"similarity": round(hybrid_score, 4),
|
||||
"vector_similarity": round(vector_sim, 4),
|
||||
"keyword_score": round(keyword_score, 4),
|
||||
"embedding_lane": lane.name,
|
||||
})
|
||||
|
||||
candidates.sort(key=lambda c: c["similarity"], reverse=True)
|
||||
top = candidates[:k]
|
||||
top = dedupe_results(candidates, limit=k)
|
||||
logger.info(f"Hybrid search for '{query[:60]}': {len(top)} results")
|
||||
return top
|
||||
|
||||
@@ -247,39 +311,36 @@ class VectorRAG:
|
||||
|
||||
def _keyword_search_fallback(self, query: str, k: int = 5, owner: Optional[str] = None) -> List[Dict[str, Any]]:
|
||||
try:
|
||||
if self._collection.count() == 0:
|
||||
return []
|
||||
|
||||
# Fetch all documents for keyword search fallback
|
||||
all_docs = self._collection.get(include=["documents", "metadatas"])
|
||||
if not all_docs["ids"]:
|
||||
if not self._active_collections():
|
||||
return []
|
||||
|
||||
query_words = query.lower().split()
|
||||
scored = []
|
||||
for i, doc in enumerate(all_docs["documents"]):
|
||||
meta = all_docs["metadatas"][i]
|
||||
if owner:
|
||||
# Match the primary path's strict where={"owner": owner}
|
||||
# filter. The old `if doc_owner and doc_owner != owner`
|
||||
# let docs with a missing/empty owner fall through, leaking
|
||||
# owner-less documents into another user's results.
|
||||
if meta.get("owner") != owner:
|
||||
for lane_name, collection in self._active_collections():
|
||||
if collection.count() == 0:
|
||||
continue
|
||||
all_docs = collection.get(include=["documents", "metadatas"])
|
||||
if not all_docs["ids"]:
|
||||
continue
|
||||
for i, doc in enumerate(all_docs["documents"]):
|
||||
meta = all_docs["metadatas"][i]
|
||||
if owner and meta.get("owner") != owner:
|
||||
continue
|
||||
doc_lower = doc.lower()
|
||||
score = sum(1 for w in query_words if w in doc_lower)
|
||||
if score > 0:
|
||||
scored.append({
|
||||
"id": all_docs["ids"][i],
|
||||
"document": doc,
|
||||
"metadata": meta,
|
||||
"distance": 0,
|
||||
"similarity": score,
|
||||
"search_type": "keyword_fallback",
|
||||
})
|
||||
doc_lower = doc.lower()
|
||||
score = sum(1 for w in query_words if w in doc_lower)
|
||||
if score > 0:
|
||||
scored.append({
|
||||
"id": all_docs["ids"][i],
|
||||
"document": doc,
|
||||
"metadata": meta,
|
||||
"distance": 0,
|
||||
"similarity": score,
|
||||
"search_type": "keyword_fallback",
|
||||
"embedding_lane": lane_name,
|
||||
})
|
||||
|
||||
scored.sort(key=lambda x: x["similarity"], reverse=True)
|
||||
return scored[:k]
|
||||
return dedupe_results(scored, limit=k)
|
||||
except Exception as e:
|
||||
logger.error(f"keyword fallback failed: {e}")
|
||||
return []
|
||||
@@ -296,9 +357,20 @@ class VectorRAG:
|
||||
client.delete_collection(COLLECTION_NAME)
|
||||
except Exception:
|
||||
pass
|
||||
self._collection = client.get_or_create_collection(
|
||||
name=COLLECTION_NAME,
|
||||
metadata={"hnsw:space": "cosine"},
|
||||
for name in (
|
||||
collection_name(COLLECTION_NAME, LANE_CUSTOM),
|
||||
collection_name(COLLECTION_NAME, LANE_FASTEMBED),
|
||||
):
|
||||
try:
|
||||
client.delete_collection(name)
|
||||
except Exception:
|
||||
pass
|
||||
# Rebuild means empty current lanes. Clear the legacy unsuffixed
|
||||
# collection too so startup migration cannot resurrect stale docs.
|
||||
self._lanes = build_embedding_lanes(COLLECTION_NAME)
|
||||
self._collection = next(
|
||||
(lane.collection for lane in self._lanes if lane.name == LANE_FASTEMBED),
|
||||
self._lanes[0].collection if self._lanes else None,
|
||||
)
|
||||
self._healthy = True
|
||||
return True
|
||||
@@ -312,10 +384,11 @@ class VectorRAG:
|
||||
return {"error": "Collection not initialized"}
|
||||
try:
|
||||
return {
|
||||
"document_count": self._collection.count(),
|
||||
"embedding_model": f"{self._model.model} @ {self._model.url}" if self._model else "N/A",
|
||||
"document_count": lane_count(self._lanes),
|
||||
"embedding_model": f"{self._lanes[0].model} @ {self._lanes[0].url}" if self._lanes else "N/A",
|
||||
"persist_directory": self.persist_directory,
|
||||
"collection_name": COLLECTION_NAME,
|
||||
"embedding_lanes": [lane.stats() for lane in self._lanes],
|
||||
"healthy": True,
|
||||
}
|
||||
except Exception as e:
|
||||
@@ -400,19 +473,23 @@ class VectorRAG:
|
||||
return {"success": False, "message": "Collection not initialized"}
|
||||
directory = os.path.abspath(directory)
|
||||
try:
|
||||
results = self._collection.get(include=["metadatas"])
|
||||
ids = [
|
||||
results["ids"][i]
|
||||
for i, m in enumerate(results["metadatas"])
|
||||
if isinstance(m, dict)
|
||||
and isinstance(m.get("source"), str)
|
||||
and (m["source"] == directory or m["source"].startswith(directory + os.sep))
|
||||
]
|
||||
if not ids:
|
||||
removed_ids = set()
|
||||
for _lane_name, collection in self._collections_for_delete():
|
||||
results = collection.get(include=["metadatas"])
|
||||
ids = [
|
||||
results["ids"][i]
|
||||
for i, m in enumerate(results["metadatas"])
|
||||
if isinstance(m, dict)
|
||||
and isinstance(m.get("source"), str)
|
||||
and (m["source"] == directory or m["source"].startswith(directory + os.sep))
|
||||
]
|
||||
if ids:
|
||||
collection.delete(ids=ids)
|
||||
removed_ids.update(ids)
|
||||
if not removed_ids:
|
||||
return {"success": True, "removed_count": 0, "message": "No docs found"}
|
||||
|
||||
self._collection.delete(ids=ids)
|
||||
n = len(ids)
|
||||
n = len(removed_ids)
|
||||
logger.info(f"Removed {n} chunks from {directory}")
|
||||
return {"success": True, "removed_count": n, "message": f"Removed {n} chunks"}
|
||||
except Exception as e:
|
||||
@@ -504,16 +581,18 @@ class VectorRAG:
|
||||
if not self.healthy:
|
||||
return 0
|
||||
try:
|
||||
results = self._collection.get(
|
||||
where={"source": source},
|
||||
include=[],
|
||||
)
|
||||
ids = results.get("ids", [])
|
||||
if not ids:
|
||||
return 0
|
||||
self._collection.delete(ids=ids)
|
||||
logger.info(f"Deleted {len(ids)} chunks for source={source}")
|
||||
return len(ids)
|
||||
removed_ids = set()
|
||||
for _lane_name, collection in self._collections_for_delete():
|
||||
results = collection.get(
|
||||
where={"source": source},
|
||||
include=[],
|
||||
)
|
||||
ids = results.get("ids", [])
|
||||
if ids:
|
||||
collection.delete(ids=ids)
|
||||
removed_ids.update(ids)
|
||||
logger.info(f"Deleted {len(removed_ids)} chunks for source={source}")
|
||||
return len(removed_ids)
|
||||
except Exception as e:
|
||||
logger.error(f"delete_by_source failed: {e}")
|
||||
return 0
|
||||
|
||||
Reference in New Issue
Block a user