Files
odysseus/tests/test_document_library_pdf_metadata.py
T
nubs 1a0e1c5d69 fix(documents): restore PDF library metadata and preview (#2483)
PDF uploads are stored as markdown wrappers with pdf_source or pdf_form_source markers so the editor can preserve extracted text, form fields, and annotations. The library exposed that internal wrapper: auto-created PDF documents used the hashed storage filename as the title, and row/facet language reported markdown instead of pdf.

Derive chat-upload PDF titles from the original upload name, derive document-library display language from the PDF source marker for rows, filters, and facets, and keep markdown wrappers excluded from the markdown facet when they represent PDFs.

The expanded library card already renders PDF-backed documents through /api/document/{id}/render-pdf. Allow only that inline PDF preview endpoint to be framed by same-origin app pages while leaving normal routes on X-Frame-Options: DENY and frame-ancestors none.

Also tighten the existing PDF marker regression assertion so it matches the actual historical corruption signature instead of contradicting the preserved [Page 1 text]: marker.

Fixes #2468
2026-06-07 23:23:27 +02:00

44 lines
1.3 KiB
Python

from types import SimpleNamespace
from routes.document_routes import _aggregate_language_facets, _library_language_for_document
def test_pdf_backed_plain_document_displays_as_pdf_in_library():
    """A markdown-language doc whose content opens with a ``pdf_source`` marker shows as ``pdf``."""
    # The stored language stays "markdown"; only the content marker
    # identifies the document as PDF-backed.
    wrapper_text = (
        '<!-- pdf_source upload_id="0123456789abcdef0123456789abcdef.pdf" -->'
        "\n\n# Packet\n"
    )
    pdf_doc = SimpleNamespace(language="markdown", current_content=wrapper_text)

    shown_language = _library_language_for_document(pdf_doc)

    assert shown_language == "pdf"
def test_pdf_backed_form_document_displays_as_pdf_in_library():
    """A markdown-language doc whose content opens with a ``pdf_form_source`` marker shows as ``pdf``."""
    # Form wrappers carry an extra ``fields`` attribute in the marker; the
    # expected display language is still "pdf".
    form_marker = (
        '<!-- pdf_form_source upload_id="0123456789abcdef0123456789abcdef.pdf" fields="3" -->'
    )
    form_doc = SimpleNamespace(
        language="markdown",
        current_content=form_marker + "\n\n# Intake Form\n",
    )

    shown_language = _library_language_for_document(form_doc)

    assert shown_language == "pdf"
def test_non_pdf_library_language_is_unchanged():
    """Documents without a PDF marker keep their language; a missing language shows as ``text``."""
    # Each case pairs a marker-free document with the language the library
    # is expected to display for it.
    cases = (
        (SimpleNamespace(language="python", current_content="print('ok')\n"), "python"),
        (SimpleNamespace(language=None, current_content="plain text"), "text"),
    )

    for plain_doc, expected_language in cases:
        assert _library_language_for_document(plain_doc) == expected_language
def test_pdf_language_facet_counts_are_summed():
    """Repeated language rows are merged by summing counts; a ``None`` language is counted as ``text``."""
    # "pdf" appears twice on purpose so the aggregation has to add the
    # counts together rather than keep only one of the rows.
    facet_rows = [("pdf", 1), ("markdown", 2), ("pdf", 1), (None, 1)]
    expected_facets = {
        "pdf": 2,
        "markdown": 2,
        "text": 1,
    }

    merged_facets = _aggregate_language_facets(facet_rows)

    assert merged_facets == expected_facets