fix: split_chunks emits a duplicate trailing chunk for text over size-overlap (#1573)

This commit is contained in:
Afonso Coutinho
2026-06-03 00:57:54 +01:00
committed by GitHub
parent c3bf32d1b1
commit 82c09dd768
2 changed files with 38 additions and 0 deletions
+5
View File
@@ -77,6 +77,11 @@ def split_chunks(text: str, size: int = config.CHUNK_SIZE, overlap: int = config
while i < n:
j = min(i + size, n)
chunks.append(text[i:j])
if j >= n:
# Reached the end of the text. Without this break, the next start
# (j - overlap) can still be > i, so the loop would append one extra
# chunk that merely duplicates the last `overlap` chars of the text.
break
i = j - overlap if j - overlap > i else j
return chunks