mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 09:45:24 -04:00
fix: split_chunks emits a duplicate trailing chunk for text over size-overlap (#1573)
This commit is contained in:
@@ -77,6 +77,11 @@ def split_chunks(text: str, size: int = config.CHUNK_SIZE, overlap: int = config
|
||||
while i < n:
|
||||
j = min(i + size, n)
|
||||
chunks.append(text[i:j])
|
||||
if j >= n:
|
||||
# Reached the end. Without this, the next start (j - overlap) is
|
||||
# still > i, so the loop would append one extra chunk duplicating the
|
||||
# last `overlap` chars of the text.
|
||||
break
|
||||
i = j - overlap if j - overlap > i else j
|
||||
return chunks
|
||||
|
||||
|
||||
Reference in New Issue
Block a user