mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-07-02 01:22:07 -04:00
Ignore non-string personal doc text (#1832)
This commit is contained in:
@@ -68,6 +68,8 @@ def read_text_file(path: str) -> str:
|
|||||||
|
|
||||||
def split_chunks(text: str, size: int = config.CHUNK_SIZE, overlap: int = config.CHUNK_OVERLAP) -> List[str]:
|
def split_chunks(text: str, size: int = config.CHUNK_SIZE, overlap: int = config.CHUNK_OVERLAP) -> List[str]:
|
||||||
"""Split text into overlapping chunks."""
|
"""Split text into overlapping chunks."""
|
||||||
|
if not isinstance(text, str):
|
||||||
|
return []
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
if not text:
|
if not text:
|
||||||
return []
|
return []
|
||||||
@@ -87,7 +89,8 @@ def split_chunks(text: str, size: int = config.CHUNK_SIZE, overlap: int = config
|
|||||||
|
|
||||||
def tokenize(s: str) -> Set[str]:
|
def tokenize(s: str) -> Set[str]:
|
||||||
"""Tokenize string into words, excluding stop words."""
|
"""Tokenize string into words, excluding stop words."""
|
||||||
tokens = re.findall(r"[A-Za-z0-9_\-]+", (s or "").lower())
|
text = s if isinstance(s, str) else ""
|
||||||
|
tokens = re.findall(r"[A-Za-z0-9_\-]+", text.lower())
|
||||||
return set(t for t in tokens if t not in config.STOP_WORDS and len(t) > 1)
|
return set(t for t in tokens if t not in config.STOP_WORDS and len(t) > 1)
|
||||||
|
|
||||||
def load_personal_index(
|
def load_personal_index(
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from src.personal_docs import retrieve_personal_keyword
|
from src.personal_docs import retrieve_personal_keyword, split_chunks
|
||||||
|
|
||||||
|
|
||||||
def test_retrieve_personal_keyword_skips_non_dict_rows():
|
def test_retrieve_personal_keyword_skips_non_dict_rows():
|
||||||
@@ -19,3 +19,17 @@ def test_retrieve_personal_keyword_tolerates_missing_chunks_key():
|
|||||||
index = [{"name": "empty.txt"}, {"name": "doc.txt", "chunks": ["alpha beta gamma"]}]
|
index = [{"name": "empty.txt"}, {"name": "doc.txt", "chunks": ["alpha beta gamma"]}]
|
||||||
out = retrieve_personal_keyword(index, "beta", k=5)
|
out = retrieve_personal_keyword(index, "beta", k=5)
|
||||||
assert out == ["[doc.txt :: chunk 1]\nalpha beta gamma"]
|
assert out == ["[doc.txt :: chunk 1]\nalpha beta gamma"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_retrieve_personal_keyword_ignores_non_string_text():
|
||||||
|
index = [{"name": "doc.txt", "chunks": [None, ["beta"], "alpha beta gamma"]}]
|
||||||
|
|
||||||
|
assert retrieve_personal_keyword(index, ["beta"], k=5) == []
|
||||||
|
assert retrieve_personal_keyword(index, "beta", k=5) == [
|
||||||
|
"[doc.txt :: chunk 3]\nalpha beta gamma"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_chunks_ignores_non_string_text():
|
||||||
|
assert split_chunks(None, size=1000, overlap=200) == []
|
||||||
|
assert split_chunks(["hello"], size=1000, overlap=200) == []
|
||||||
|
|||||||
Reference in New Issue
Block a user