mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
fix: split_chunks emits a duplicate trailing chunk for text over size-overlap (#1573)
This commit is contained in:
@@ -77,6 +77,11 @@ def split_chunks(text: str, size: int = config.CHUNK_SIZE, overlap: int = config
|
|||||||
while i < n:
|
while i < n:
|
||||||
j = min(i + size, n)
|
j = min(i + size, n)
|
||||||
chunks.append(text[i:j])
|
chunks.append(text[i:j])
|
||||||
|
if j >= n:
|
||||||
|
# Reached the end. Without this, the next start (j - overlap) is
|
||||||
|
# still > i, so the loop appended one extra chunk duplicating the
|
||||||
|
# last `overlap` chars of the text.
|
||||||
|
break
|
||||||
i = j - overlap if j - overlap > i else j
|
i = j - overlap if j - overlap > i else j
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,33 @@
|
|||||||
|
"""Regression: split_chunks must not emit a duplicate trailing chunk.
|
||||||
|
|
||||||
|
The loop advanced `i = j - overlap` even after `j` reached the end of the text,
|
||||||
|
so any text whose final chunk was longer than `overlap` got an extra chunk duplicating
|
||||||
|
the last `overlap` characters. That duplicate is indexed and keyword-scored
|
||||||
|
twice, so retrieve_personal_keyword returns the same tail content twice.
|
||||||
|
"""
|
||||||
|
from src.personal_docs import split_chunks
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_duplicate_tail_chunk():
    """1100 chars at size=1000/overlap=200 must split into exactly two chunks.

    The expected lengths are 1000 for the first chunk and 300 for the second;
    a third, trailing chunk would be the duplicate this regression guards
    against.
    """
    sizes = list(map(len, split_chunks("x" * 1100, size=1000, overlap=200)))
    assert sizes == [1000, 300]
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_chunk_is_contained_in_another():
    """No chunk returned by split_chunks may be a substring of another chunk.

    The buggy version produced a final 200-char chunk fully inside the prior
    one; this test fails if any such redundant chunk reappears.
    """
    # Every character in the fixture is distinct, so one chunk can only be a
    # substring of another when its source span really lies inside the other
    # chunk's span. A short repeating pattern is NOT safe here: with a
    # 90-character period the legitimate 400-char tail text[1600:2000]
    # (1600 == 70 mod 90) is identical to text[70:470], which sits inside the
    # first chunk, so the assertion would fail even for a correct splitter.
    text = "".join(chr(0x100 + k) for k in range(2000))
    chunks = split_chunks(text, size=1000, overlap=200)
    for a, inner in enumerate(chunks):
        for b, outer in enumerate(chunks):
            if a != b:
                assert inner not in outer, (
                    f"chunk {a} is fully contained in chunk {b}"
                )
|
||||||
|
|
||||||
|
|
||||||
|
def test_overlap_is_preserved_between_chunks():
    """The second chunk must start `overlap` chars before the first one ends.

    For 1100 chars with size=1000 and overlap=200 the first chunk ends at
    offset 1000, so the second chunk is expected to cover offsets 800-1100.
    """
    source = "x" * 1100
    pieces = split_chunks(source, size=1000, overlap=200)
    assert len(pieces) == 2
    # Second chunk starts 200 chars before the first one ended (offset 800).
    assert pieces[1] == source[800:1100]
|
||||||
|
|
||||||
|
|
||||||
|
def test_short_text_single_chunk():
    """Text shorter than the chunk size comes back as one unmodified chunk."""
    sample = "hello world"
    result = split_chunks(sample, size=1000, overlap=200)
    assert result == [sample]
|
||||||
Reference in New Issue
Block a user