mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-15 17:25:26 -04:00
fix(documents): restore PDF library metadata and preview (#2483)
PDF uploads are stored as markdown wrappers with pdf_source or pdf_form_source markers so the editor can preserve extracted text, form fields, and annotations. The library exposed that internal wrapper: auto-created PDF documents used the hashed storage filename as the title, and row/facet language reported markdown instead of pdf.
Derive chat-upload PDF titles from the original upload name, derive document-library display language from the PDF source marker for rows, filters, and facets, and keep markdown wrappers excluded from the markdown facet when they represent PDFs.
The expanded library card already renders PDF-backed documents through /api/document/{id}/render-pdf. Allow only that inline PDF preview endpoint to be framed by same-origin app pages while leaving normal routes on X-Frame-Options: DENY and frame-ancestors none.
Also tighten the existing PDF marker regression assertion so it matches the actual historical corruption signature instead of contradicting the preserved [Page 1 text]: marker.
Fixes #2468
This commit is contained in:
@@ -67,6 +67,9 @@ class SecurityHeadersMiddleware(BaseHTTPMiddleware):
|
||||
|
||||
# Tool render endpoints are served inside iframes — allow framing by self
|
||||
is_tool_render = path.startswith("/api/tools/") and path.endswith("/render")
|
||||
# PDF previews are embedded by the in-app document library. Keep the
|
||||
# exception route-scoped so normal app pages remain unframeable.
|
||||
is_document_pdf_preview = path.startswith("/api/document/") and path.endswith("/render-pdf")
|
||||
# Visual report pages are self-contained HTML — need inline scripts + external images
|
||||
is_report = path.startswith("/api/research/report/")
|
||||
|
||||
@@ -96,6 +99,12 @@ class SecurityHeadersMiddleware(BaseHTTPMiddleware):
|
||||
# sandbox="allow-scripts" attribute provides isolation.
|
||||
# Don't overwrite the route's own restrictive CSP either.
|
||||
pass
|
||||
elif is_document_pdf_preview:
|
||||
response.headers["X-Frame-Options"] = "SAMEORIGIN"
|
||||
response.headers["Content-Security-Policy"] = (
|
||||
"default-src 'none'; "
|
||||
"frame-ancestors 'self'"
|
||||
)
|
||||
else:
|
||||
response.headers["X-Frame-Options"] = "DENY"
|
||||
# NOTE: `style-src 'unsafe-inline'` is intentionally retained.
|
||||
|
||||
@@ -7,7 +7,7 @@ from typing import Dict, Any, List, Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Query, Request, UploadFile, File, Form
|
||||
|
||||
from sqlalchemy import func, or_
|
||||
from sqlalchemy import case, func, or_
|
||||
from core.database import SessionLocal, Document, DocumentVersion
|
||||
from core.database import Session as DbSession
|
||||
from src.auth_helpers import get_current_user
|
||||
@@ -39,6 +39,19 @@ def _aggregate_language_facets(lang_rows):
|
||||
return out
|
||||
|
||||
|
||||
def _library_language_for_document(doc: Document) -> str:
|
||||
"""Return the display language used by the document library.
|
||||
|
||||
PDF documents are stored as markdown wrappers so the editor can preserve
|
||||
extracted text, form fields, and annotations. The library should still
|
||||
identify them as PDFs instead of exposing that internal wrapper format.
|
||||
"""
|
||||
from src.pdf_form_doc import find_source_upload_id
|
||||
|
||||
if find_source_upload_id(doc.current_content or ""):
|
||||
return "pdf"
|
||||
return doc.language or "text"
|
||||
|
||||
|
||||
from routes.document_helpers import (
|
||||
DocumentCreate, DocumentUpdate, DocumentPatch,
|
||||
@@ -260,18 +273,29 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
||||
db = SessionLocal()
|
||||
try:
|
||||
from sqlalchemy import or_
|
||||
pdf_marker_cond = or_(
|
||||
Document.current_content.like('%<!-- pdf_source upload_id="%'),
|
||||
Document.current_content.like('%<!-- pdf_form_source upload_id="%'),
|
||||
)
|
||||
library_language_expr = case(
|
||||
(pdf_marker_cond, "pdf"),
|
||||
(Document.language.is_(None), "text"),
|
||||
else_=Document.language,
|
||||
)
|
||||
# Archived view shows ONLY archived docs; the default view excludes
|
||||
# them (NULL = legacy rows that predate the column = not archived).
|
||||
_arch_cond = (Document.archived == True) if archived else or_(
|
||||
Document.archived == False, Document.archived.is_(None))
|
||||
# Language facet counts (owner-filtered)
|
||||
# Language facet counts (owner-filtered). PDF documents are stored
|
||||
# as markdown wrappers, so group by the library display language
|
||||
# instead of the raw stored language.
|
||||
lang_q = (
|
||||
db.query(Document.language, func.count(Document.id))
|
||||
db.query(library_language_expr, func.count(Document.id))
|
||||
.outerjoin(DbSession, Document.session_id == DbSession.id)
|
||||
.filter(Document.is_active == True).filter(_arch_cond)
|
||||
)
|
||||
lang_q = _owner_session_filter(lang_q, user)
|
||||
lang_rows = lang_q.group_by(Document.language).all()
|
||||
lang_rows = lang_q.group_by(library_language_expr).all()
|
||||
languages = _aggregate_language_facets(lang_rows)
|
||||
|
||||
# Session count (owner-filtered)
|
||||
@@ -303,12 +327,17 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
||||
Document.title.ilike(term) | Document.current_content.ilike(term)
|
||||
)
|
||||
|
||||
# Language filter
|
||||
# Language filter. "pdf" is a display language derived from the
|
||||
# source marker; "markdown" excludes those wrappers.
|
||||
if language:
|
||||
if language == "text":
|
||||
q = q.filter((Document.language == None) | (Document.language == "text"))
|
||||
elif language == "pdf":
|
||||
q = q.filter(pdf_marker_cond)
|
||||
else:
|
||||
q = q.filter(Document.language == language)
|
||||
if language == "markdown":
|
||||
q = q.filter(~pdf_marker_cond)
|
||||
|
||||
# Total before pagination
|
||||
total = q.count()
|
||||
@@ -332,7 +361,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
||||
"session_id": doc.session_id,
|
||||
"session_name": session_name,
|
||||
"title": doc.title,
|
||||
"language": doc.language or "text",
|
||||
"language": _library_language_for_document(doc),
|
||||
"preview": (doc.current_content or "")[:500],
|
||||
"version_count": doc.version_count,
|
||||
"created_at": (doc.created_at.isoformat() + "Z") if doc.created_at else None,
|
||||
|
||||
@@ -430,7 +430,7 @@ def build_user_content(
|
||||
create_form_markdown_document,
|
||||
create_plain_pdf_document,
|
||||
)
|
||||
title = os.path.splitext(os.path.basename(path))[0]
|
||||
title = os.path.splitext(os.path.basename(display_name))[0]
|
||||
# Pull the PDF prose once — used as either intro_text
|
||||
# (form path) or the doc body (plain path).
|
||||
try:
|
||||
|
||||
@@ -56,3 +56,39 @@ def test_pdf_body_marker_stripped_without_eating_text(monkeypatch, tmp_path):
|
||||
assert "to the board, the agenda is set" in body_lines
|
||||
# The old lstrip(chars) corruption produced a line like "age 1 text]:" (missing "[P").
|
||||
assert "age 1 text]:" not in body_lines
|
||||
|
||||
|
||||
def test_pdf_auto_document_uses_original_upload_name(monkeypatch, tmp_path):
|
||||
pdf_path = tmp_path / "0123456789abcdef0123456789abcdef.pdf"
|
||||
pdf_path.write_bytes(b"%PDF-1.4 fake")
|
||||
|
||||
captured = {}
|
||||
monkeypatch.setattr(dp, "_process_pdf", lambda path: "\n\n[PDF content]:\nbody")
|
||||
monkeypatch.setattr(pdf_forms, "has_form_fields", lambda path: False)
|
||||
|
||||
def _capture_plain_pdf_document(**kw):
|
||||
captured.update(kw)
|
||||
return "doc-123"
|
||||
|
||||
monkeypatch.setattr(pdf_form_doc, "create_plain_pdf_document", _capture_plain_pdf_document)
|
||||
|
||||
resolved = {
|
||||
"fid1": {
|
||||
"path": str(pdf_path),
|
||||
"mime": "application/pdf",
|
||||
"name": "Quarterly Board Packet.pdf",
|
||||
}
|
||||
}
|
||||
|
||||
dp.build_user_content(
|
||||
text="here is a pdf",
|
||||
attachment_ids=["fid1"],
|
||||
upload_dir=str(tmp_path),
|
||||
upload_handler=_FakeUploadHandler(),
|
||||
session_id="s1",
|
||||
resolved_uploads=resolved,
|
||||
)
|
||||
|
||||
assert captured["title"] == "Quarterly Board Packet"
|
||||
assert captured["upload_id"] == pdf_path.name
|
||||
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
from types import SimpleNamespace
|
||||
|
||||
from routes.document_routes import _aggregate_language_facets, _library_language_for_document
|
||||
|
||||
|
||||
def test_pdf_backed_plain_document_displays_as_pdf_in_library():
|
||||
doc = SimpleNamespace(
|
||||
language="markdown",
|
||||
current_content='<!-- pdf_source upload_id="0123456789abcdef0123456789abcdef.pdf" -->\n\n# Packet\n',
|
||||
)
|
||||
|
||||
assert _library_language_for_document(doc) == "pdf"
|
||||
|
||||
|
||||
def test_pdf_backed_form_document_displays_as_pdf_in_library():
|
||||
doc = SimpleNamespace(
|
||||
language="markdown",
|
||||
current_content=(
|
||||
'<!-- pdf_form_source upload_id="0123456789abcdef0123456789abcdef.pdf" fields="3" -->'
|
||||
"\n\n# Intake Form\n"
|
||||
),
|
||||
)
|
||||
|
||||
assert _library_language_for_document(doc) == "pdf"
|
||||
|
||||
|
||||
def test_non_pdf_library_language_is_unchanged():
|
||||
assert _library_language_for_document(
|
||||
SimpleNamespace(language="python", current_content="print('ok')\n")
|
||||
) == "python"
|
||||
assert _library_language_for_document(
|
||||
SimpleNamespace(language=None, current_content="plain text")
|
||||
) == "text"
|
||||
|
||||
|
||||
def test_pdf_language_facet_counts_are_summed():
|
||||
rows = [("pdf", 1), ("markdown", 2), ("pdf", 1), (None, 1)]
|
||||
|
||||
assert _aggregate_language_facets(rows) == {
|
||||
"pdf": 2,
|
||||
"markdown": 2,
|
||||
"text": 1,
|
||||
}
|
||||
@@ -0,0 +1,36 @@
|
||||
from fastapi import FastAPI
|
||||
from fastapi.responses import Response
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from core.middleware import SecurityHeadersMiddleware
|
||||
|
||||
|
||||
def _client():
|
||||
app = FastAPI()
|
||||
app.add_middleware(SecurityHeadersMiddleware)
|
||||
|
||||
@app.get("/plain")
|
||||
async def plain():
|
||||
return {"ok": True}
|
||||
|
||||
@app.get("/api/document/{doc_id}/render-pdf")
|
||||
async def render_pdf(doc_id: str):
|
||||
return Response(b"%PDF-1.4\n", media_type="application/pdf")
|
||||
|
||||
return TestClient(app)
|
||||
|
||||
|
||||
def test_default_routes_remain_unframeable():
|
||||
response = _client().get("/plain")
|
||||
|
||||
assert response.headers["X-Frame-Options"] == "DENY"
|
||||
assert "frame-ancestors 'none'" in response.headers["Content-Security-Policy"]
|
||||
|
||||
|
||||
def test_document_pdf_preview_can_be_framed_by_same_origin():
|
||||
response = _client().get("/api/document/doc-123/render-pdf")
|
||||
|
||||
assert response.headers["X-Frame-Options"] == "SAMEORIGIN"
|
||||
assert response.headers["Content-Security-Policy"] == (
|
||||
"default-src 'none'; frame-ancestors 'self'"
|
||||
)
|
||||
Reference in New Issue
Block a user