From bbbe145247c25a352c01a26714d6f1ba88bcac68 Mon Sep 17 00:00:00 2001 From: red person Date: Mon, 29 Jun 2026 11:24:29 -0700 Subject: [PATCH] Ignore non-string personal doc text (#1832) --- src/personal_docs.py | 5 ++++- tests/test_personal_docs_keyword_nondict.py | 16 +++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/personal_docs.py b/src/personal_docs.py index 7ffb5cfb9..0e8335357 100644 --- a/src/personal_docs.py +++ b/src/personal_docs.py @@ -68,6 +68,8 @@ def read_text_file(path: str) -> str: def split_chunks(text: str, size: int = config.CHUNK_SIZE, overlap: int = config.CHUNK_OVERLAP) -> List[str]: """Split text into overlapping chunks.""" + if not isinstance(text, str): + return [] text = text.strip() if not text: return [] @@ -87,7 +89,8 @@ def split_chunks(text: str, size: int = config.CHUNK_SIZE, overlap: int = config def tokenize(s: str) -> Set[str]: """Tokenize string into words, excluding stop words.""" - tokens = re.findall(r"[A-Za-z0-9_\-]+", (s or "").lower()) + text = s if isinstance(s, str) else "" + tokens = re.findall(r"[A-Za-z0-9_\-]+", text.lower()) return set(t for t in tokens if t not in config.STOP_WORDS and len(t) > 1) def load_personal_index( diff --git a/tests/test_personal_docs_keyword_nondict.py b/tests/test_personal_docs_keyword_nondict.py index f46c9f46c..29dfe6f97 100644 --- a/tests/test_personal_docs_keyword_nondict.py +++ b/tests/test_personal_docs_keyword_nondict.py @@ -1,4 +1,4 @@ -from src.personal_docs import retrieve_personal_keyword +from src.personal_docs import retrieve_personal_keyword, split_chunks def test_retrieve_personal_keyword_skips_non_dict_rows(): @@ -19,3 +19,17 @@ def test_retrieve_personal_keyword_tolerates_missing_chunks_key(): index = [{"name": "empty.txt"}, {"name": "doc.txt", "chunks": ["alpha beta gamma"]}] out = retrieve_personal_keyword(index, "beta", k=5) assert out == ["[doc.txt :: chunk 1]\nalpha beta gamma"] + + +def test_retrieve_personal_keyword_ignores_non_string_text(): + index = [{"name": "doc.txt", "chunks": [None, ["beta"], "alpha beta gamma"]}] + + assert retrieve_personal_keyword(index, ["beta"], k=5) == [] + assert retrieve_personal_keyword(index, "beta", k=5) == [ + "[doc.txt :: chunk 3]\nalpha beta gamma" + ] + + +def test_split_chunks_ignores_non_string_text(): + assert split_chunks(None, size=1000, overlap=200) == [] + assert split_chunks(["hello"], size=1000, overlap=200) == []