mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 02:05:22 -04:00
Harden PDF document markers against cross-owner upload access (#445)
Route PDF lookups through UploadHandler.resolve_upload, reject poisoned pdf_source markers on document create/update, and add regression tests.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
+31
-28
@@ -20,28 +20,28 @@ from routes.document_helpers import (
|
||||
DocumentCreate, DocumentUpdate, DocumentPatch,
|
||||
_doc_to_dict, _version_to_dict,
|
||||
_verify_doc_owner, _owner_session_filter,
|
||||
_slug, _locate_upload, _derive_title,
|
||||
_slug, _resolve_user_upload_path, _assert_pdf_marker_upload_owned, _derive_title,
|
||||
_PDF_RENDER_SCALE,
|
||||
)
|
||||
|
||||
|
||||
def _locate_current_user_upload(request: Request, upload_dir: str, upload_id: str, user: Optional[str]):
|
||||
auth_manager = getattr(getattr(request.app, "state", None), "auth_manager", None)
|
||||
return _locate_upload(upload_dir, upload_id, owner=user, auth_manager=auth_manager)
|
||||
|
||||
|
||||
def _load_pdf_viewer_fitz():
|
||||
from src.pdf_runtime import load_pymupdf_for_pdf_viewer
|
||||
|
||||
try:
|
||||
return load_pymupdf_for_pdf_viewer()
|
||||
except RuntimeError as exc:
|
||||
raise HTTPException(503, str(exc)) from exc
|
||||
|
||||
|
||||
def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
||||
router = APIRouter(tags=["documents"])
|
||||
|
||||
def _locate_current_user_upload(request: Request, upload_id: str, user: Optional[str]):
|
||||
if upload_handler is None:
|
||||
return None
|
||||
auth_manager = getattr(getattr(request.app, "state", None), "auth_manager", None)
|
||||
return _resolve_user_upload_path(upload_handler, upload_id, user, auth_manager)
|
||||
|
||||
def _load_pdf_viewer_fitz():
|
||||
from src.pdf_runtime import load_pymupdf_for_pdf_viewer
|
||||
|
||||
try:
|
||||
return load_pymupdf_for_pdf_viewer()
|
||||
except RuntimeError as exc:
|
||||
raise HTTPException(503, str(exc)) from exc
|
||||
|
||||
# ---- POST /api/document ----
|
||||
@router.post("/api/document")
|
||||
async def create_document(request: Request, req: DocumentCreate) -> Dict[str, Any]:
|
||||
@@ -82,6 +82,8 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
||||
if _looks_like_email_document(req.content, req.title):
|
||||
language = "email"
|
||||
|
||||
_assert_pdf_marker_upload_owned(request, req.content, user, upload_handler)
|
||||
|
||||
doc = Document(
|
||||
id=doc_id,
|
||||
session_id=req.session_id,
|
||||
@@ -176,7 +178,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
||||
raise HTTPException(500, f"Upload failed: {e}")
|
||||
|
||||
upload_id = meta["id"]
|
||||
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
|
||||
pdf_path = _locate_current_user_upload(request, upload_id, user)
|
||||
if not pdf_path:
|
||||
raise HTTPException(500, "Saved PDF could not be located")
|
||||
|
||||
@@ -400,8 +402,8 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
||||
text extraction was wired, plus for scanned/image-only PDFs where the
|
||||
VL model picks up text the basic pypdf path missed."""
|
||||
import re
|
||||
from src.constants import UPLOAD_DIR
|
||||
from src.document_processor import _process_pdf
|
||||
from src.pdf_form_doc import find_source_upload_id
|
||||
|
||||
user = get_current_user(request)
|
||||
db = SessionLocal()
|
||||
@@ -412,12 +414,11 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
||||
_verify_doc_owner(db, doc, user)
|
||||
|
||||
content = doc.current_content or ""
|
||||
m = re.search(r'<!--\s*(?:pdf_source|pdf_form_source)\s+upload_id="([^"]+)"', content)
|
||||
if not m:
|
||||
upload_id = find_source_upload_id(content)
|
||||
if not upload_id:
|
||||
raise HTTPException(400, "Document is not a PDF — no pdf_source marker found")
|
||||
upload_id = m.group(1)
|
||||
|
||||
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
|
||||
pdf_path = _locate_current_user_upload(request, upload_id, user)
|
||||
if not pdf_path:
|
||||
raise HTTPException(404, "Source PDF could not be located")
|
||||
|
||||
@@ -528,6 +529,8 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
||||
if doc.current_content == req.content:
|
||||
return _doc_to_dict(doc)
|
||||
|
||||
_assert_pdf_marker_upload_owned(request, req.content, user, upload_handler)
|
||||
|
||||
# Check if we can coalesce with the latest version
|
||||
latest_ver = db.query(DocumentVersion).filter(
|
||||
DocumentVersion.document_id == doc_id,
|
||||
@@ -930,7 +933,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
||||
if not upload_id:
|
||||
raise HTTPException(400, "Document is not linked to a source PDF")
|
||||
|
||||
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
|
||||
pdf_path = _locate_current_user_upload(request, upload_id, user)
|
||||
if not pdf_path:
|
||||
raise HTTPException(404, f"Source PDF {upload_id} not found in uploads")
|
||||
|
||||
@@ -993,7 +996,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
||||
upload_id = find_source_upload_id(doc.current_content or "")
|
||||
if not upload_id:
|
||||
raise HTTPException(400, "Document is not linked to a source PDF")
|
||||
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
|
||||
pdf_path = _locate_current_user_upload(request, upload_id, user)
|
||||
if not pdf_path:
|
||||
raise HTTPException(404, f"Source PDF {upload_id} not found")
|
||||
|
||||
@@ -1061,7 +1064,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
||||
upload_id = find_source_upload_id(doc.current_content or "")
|
||||
if not upload_id:
|
||||
raise HTTPException(400, "Document is not linked to a source PDF")
|
||||
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
|
||||
pdf_path = _locate_current_user_upload(request, upload_id, user)
|
||||
if not pdf_path:
|
||||
raise HTTPException(404, "Source PDF not found")
|
||||
finally:
|
||||
@@ -1117,7 +1120,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
||||
upload_id = find_source_upload_id(doc.current_content or "")
|
||||
if not upload_id:
|
||||
raise HTTPException(400, "Document is not linked to a source PDF")
|
||||
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
|
||||
pdf_path = _locate_current_user_upload(request, upload_id, user)
|
||||
if not pdf_path:
|
||||
raise HTTPException(404, "Source PDF not found")
|
||||
finally:
|
||||
@@ -1266,7 +1269,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
||||
upload_id = find_source_upload_id(doc.current_content or "")
|
||||
if not upload_id:
|
||||
raise HTTPException(400, "Document is not linked to a source PDF")
|
||||
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
|
||||
pdf_path = _locate_current_user_upload(request, upload_id, user)
|
||||
if not pdf_path:
|
||||
raise HTTPException(404, f"Source PDF {upload_id} not found")
|
||||
|
||||
@@ -1361,7 +1364,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
||||
if not upload_id:
|
||||
raise HTTPException(400, "Document is not linked to a source PDF")
|
||||
|
||||
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
|
||||
pdf_path = _locate_current_user_upload(request, upload_id, user)
|
||||
if not pdf_path:
|
||||
raise HTTPException(404, f"Source PDF {upload_id} not found in uploads")
|
||||
|
||||
@@ -1505,7 +1508,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
|
||||
upload_id = find_source_upload_id(doc.current_content or "")
|
||||
if not upload_id:
|
||||
raise HTTPException(400, "Document is not linked to a source PDF")
|
||||
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user)
|
||||
pdf_path = _locate_current_user_upload(request, upload_id, user)
|
||||
if not pdf_path:
|
||||
raise HTTPException(404, f"Source PDF {upload_id} not found")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user