mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-15 17:25:26 -04:00
Documents: strip PDF marker without corrupting text
_process_pdf prepends "\n\n[PDF content]:" to extracted text, and two
call sites in document_routes.py stripped it with .lstrip("\n[PDF content]:").
str.lstrip(chars) treats its argument as a *set of characters*, so it keeps
eating into the page text that follows the marker — e.g. a body starting
with "to the board" loses its leading "to t" because 't', 'o' and ' ' are in the
marker's character set. Replace both sites with a shared
strip_pdf_content_marker() helper that uses str.removeprefix.
This commit is contained in:
@@ -0,0 +1,30 @@
|
||||
"""Regression test: the '[PDF content]:' wrapper must be removed without eating
|
||||
into the page text that follows it.
|
||||
|
||||
The old call sites used ``str.lstrip("\\n[PDF content]:")``, which treats the
|
||||
argument as a *set of characters* and keeps stripping leading characters that
|
||||
happen to be in that set — corrupting the start of the extracted document.
|
||||
"""
|
||||
from src.document_processor import strip_pdf_content_marker, _PDF_CONTENT_MARKER
|
||||
|
||||
|
||||
def test_marker_removed_without_eating_following_text():
    """The PDF marker is dropped and the page text after it survives intact."""
    # Input shaped like the extracted text described for _process_pdf: the
    # "[PDF content]:" marker, a blank line, then the page header and body.
    extracted = "\n\n[PDF content]:\n\n[Page 1 text]:\nto the board, content begins"
    expected = "[Page 1 text]:\nto the board, content begins"

    cleaned = strip_pdf_content_marker(extracted)

    assert cleaned == expected
    # str.lstrip("\n[PDF content]:") treats its argument as a character set,
    # so on this input it would also consume the "[P" of the page header and
    # leave "age 1 text]:...". Guard against that regression explicitly.
    assert not cleaned.startswith("age 1 text")
|
||||
|
||||
|
||||
def test_marker_constant_matches_processor_output():
    """Pin the shared marker constant to the literal prefix consumers expect."""
    # A change to the prefix must be made deliberately: this fails if the
    # constant drifts from the literal the stripping helper is tested against.
    expected_marker = "\n\n[PDF content]:"
    assert _PDF_CONTENT_MARKER == expected_marker
|
||||
|
||||
|
||||
def test_text_without_marker_is_only_stripped():
    """Text lacking the marker only has its surrounding whitespace trimmed."""
    cleaned = strip_pdf_content_marker(" plain text ")
    assert cleaned == "plain text"
|
||||
|
||||
|
||||
def test_handles_none():
    """A None input is accepted and yields an empty string."""
    result = strip_pdf_content_marker(None)
    assert result == ""
|
||||
Reference in New Issue
Block a user