mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 09:45:24 -04:00
fix(documents): restore PDF library metadata and preview (#2483)
PDF uploads are stored as markdown wrappers with pdf_source or pdf_form_source markers so the editor can preserve extracted text, form fields, and annotations. The library exposed that internal wrapper: auto-created PDF documents used the hashed storage filename as the title, and row/facet language reported markdown instead of pdf.
Derive chat-upload PDF titles from the original upload name, derive document-library display language from the PDF source marker for rows, filters, and facets, and keep markdown wrappers excluded from the markdown facet when they represent PDFs.
The expanded library card already renders PDF-backed documents through /api/document/{id}/render-pdf. Allow only that inline PDF preview endpoint to be framed by same-origin app pages while leaving normal routes on X-Frame-Options: DENY and frame-ancestors none.
Also tighten the existing PDF marker regression assertion so it matches the actual historical corruption signature instead of contradicting the preserved [Page 1 text]: marker.
Fixes #2468
This commit is contained in:
@@ -56,3 +56,39 @@ def test_pdf_body_marker_stripped_without_eating_text(monkeypatch, tmp_path):
|
||||
assert "to the board, the agenda is set" in body_lines
|
||||
# The old lstrip(chars) corruption produced a line like "age 1 text]:" (missing "[P").
|
||||
assert "age 1 text]:" not in body_lines
|
||||
|
||||
|
||||
def test_pdf_auto_document_uses_original_upload_name(monkeypatch, tmp_path):
    """Check the kwargs handed to ``create_plain_pdf_document`` for a PDF upload.

    The file on disk has a hash-like name, while the resolved upload entry
    carries the name "Quarterly Board Packet.pdf". The captured ``title`` must
    be that name without its extension, and ``upload_id`` must be the on-disk
    file name.
    """
    stored_pdf = tmp_path / "0123456789abcdef0123456789abcdef.pdf"
    stored_pdf.write_bytes(b"%PDF-1.4 fake")

    creation_kwargs = {}

    def _fake_process_pdf(path):
        # Canned extraction result; the real PDF parser is never invoked.
        return "\n\n[PDF content]:\nbody"

    def _no_form_fields(path):
        # Report the upload as a PDF without form fields.
        return False

    def _capture_plain_pdf_document(**kwargs):
        # Record whatever the code under test passes to the document factory.
        creation_kwargs.update(kwargs)
        return "doc-123"

    # Patches are applied in the same order as the original test.
    patches = (
        (dp, "_process_pdf", _fake_process_pdf),
        (pdf_forms, "has_form_fields", _no_form_fields),
        (pdf_form_doc, "create_plain_pdf_document", _capture_plain_pdf_document),
    )
    for target, attribute, stub in patches:
        monkeypatch.setattr(target, attribute, stub)

    upload_entry = {
        "path": str(stored_pdf),
        "mime": "application/pdf",
        "name": "Quarterly Board Packet.pdf",
    }
    resolved = {"fid1": upload_entry}

    dp.build_user_content(
        text="here is a pdf",
        attachment_ids=["fid1"],
        upload_dir=str(tmp_path),
        upload_handler=_FakeUploadHandler(),
        session_id="s1",
        resolved_uploads=resolved,
    )

    expected_title = "Quarterly Board Packet"
    assert creation_kwargs["title"] == expected_title
    assert creation_kwargs["upload_id"] == stored_pdf.name
|
||||
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
from types import SimpleNamespace
|
||||
|
||||
from routes.document_routes import _aggregate_language_facets, _library_language_for_document
|
||||
|
||||
|
||||
def test_pdf_backed_plain_document_displays_as_pdf_in_library():
    """A markdown document whose content starts with a ``pdf_source`` marker reports ``pdf``."""
    wrapper_content = '<!-- pdf_source upload_id="0123456789abcdef0123456789abcdef.pdf" -->\n\n# Packet\n'
    pdf_wrapper = SimpleNamespace(language="markdown", current_content=wrapper_content)

    shown_language = _library_language_for_document(pdf_wrapper)

    assert shown_language == "pdf"
|
||||
|
||||
|
||||
def test_pdf_backed_form_document_displays_as_pdf_in_library():
    """A markdown document whose content starts with a ``pdf_form_source`` marker reports ``pdf``."""
    form_marker = '<!-- pdf_form_source upload_id="0123456789abcdef0123456789abcdef.pdf" fields="3" -->'
    form_wrapper = SimpleNamespace(
        language="markdown",
        current_content=form_marker + "\n\n# Intake Form\n",
    )

    shown_language = _library_language_for_document(form_wrapper)

    assert shown_language == "pdf"
|
||||
|
||||
|
||||
def test_non_pdf_library_language_is_unchanged():
    """Documents without a PDF marker keep their language; ``None`` is shown as ``text``."""
    cases = (
        (SimpleNamespace(language="python", current_content="print('ok')\n"), "python"),
        (SimpleNamespace(language=None, current_content="plain text"), "text"),
    )

    for document, expected_language in cases:
        assert _library_language_for_document(document) == expected_language
|
||||
|
||||
|
||||
def test_pdf_language_facet_counts_are_summed():
    """Repeated ``pdf`` rows are added together and a ``None`` language is counted as ``text``."""
    facet_rows = [("pdf", 1), ("markdown", 2), ("pdf", 1), (None, 1)]
    expected_counts = {
        "pdf": 2,
        "markdown": 2,
        "text": 1,
    }

    aggregated = _aggregate_language_facets(facet_rows)

    assert aggregated == expected_counts
|
||||
@@ -0,0 +1,36 @@
|
||||
from fastapi import FastAPI
|
||||
from fastapi.responses import Response
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from core.middleware import SecurityHeadersMiddleware
|
||||
|
||||
|
||||
def _client():
    """Return a ``TestClient`` for a minimal app wrapped in ``SecurityHeadersMiddleware``.

    The app exposes two routes: ``/plain`` (JSON body) and
    ``/api/document/{doc_id}/render-pdf`` (a tiny ``application/pdf`` response).
    """
    api = FastAPI()
    api.add_middleware(SecurityHeadersMiddleware)

    # Ordinary JSON route used to observe the default headers.
    @api.get("/plain")
    async def plain():
        payload = {"ok": True}
        return payload

    # Stand-in for the inline PDF preview endpoint.
    @api.get("/api/document/{doc_id}/render-pdf")
    async def render_pdf(doc_id: str):
        pdf_bytes = b"%PDF-1.4\n"
        return Response(pdf_bytes, media_type="application/pdf")

    test_client = TestClient(api)
    return test_client
|
||||
|
||||
|
||||
def test_default_routes_remain_unframeable():
    """``/plain`` is served with ``X-Frame-Options: DENY`` and a CSP containing ``frame-ancestors 'none'``."""
    headers = _client().get("/plain").headers

    frame_options = headers["X-Frame-Options"]
    assert frame_options == "DENY"

    csp = headers["Content-Security-Policy"]
    assert "frame-ancestors 'none'" in csp
|
||||
|
||||
|
||||
def test_document_pdf_preview_can_be_framed_by_same_origin():
    """The render-pdf route is served with ``SAMEORIGIN`` and a CSP of exactly ``default-src 'none'; frame-ancestors 'self'``."""
    headers = _client().get("/api/document/doc-123/render-pdf").headers

    frame_options = headers["X-Frame-Options"]
    assert frame_options == "SAMEORIGIN"

    expected_csp = "default-src 'none'; frame-ancestors 'self'"
    assert headers["Content-Security-Policy"] == expected_csp
|
||||
Reference in New Issue
Block a user