Harden PDF document markers against cross-owner upload access (#445)

Route PDF lookups through UploadHandler.resolve_upload, reject poisoned pdf_source markers on document create/update, and add regression tests.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Duarte Antunes
2026-06-01 14:38:14 +01:00
committed by GitHub
parent b2e8d692a4
commit 448401a0fc
5 changed files with 183 additions and 106 deletions
+56 -68
View File
@@ -5,16 +5,16 @@
import logging import logging
import os import os
import re import re
from typing import Dict, Any, Optional from typing import Any, Dict, Optional
from fastapi import HTTPException from fastapi import HTTPException, Request
from pydantic import BaseModel from pydantic import BaseModel
from core.database import Document, DocumentVersion from core.database import Document, DocumentVersion
from core.database import Session as DbSession from core.database import Session as DbSession
from src.upload_handler import UploadHandler
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_UPLOAD_ID_RE = re.compile(r"^[0-9a-fA-F]{32}\.[A-Za-z0-9]+$")
# ---- Request schemas ---- # ---- Request schemas ----
@@ -138,78 +138,66 @@ def _upload_path_inside(upload_dir: str, path: str) -> bool:
return False return False
def _upload_owner_allowed( def _resolve_user_upload_path(
meta: Optional[dict], upload_handler: Any,
user: Optional[str], upload_id: str,
owner: Optional[str],
auth_manager=None, auth_manager=None,
allow_admin: bool = True, ) -> Optional[str]:
) -> bool: """Resolve an upload id to a filesystem path the caller may read."""
if not user: if upload_handler is None:
return ( return None
not bool(auth_manager and getattr(auth_manager, "is_configured", False)) resolved = upload_handler.resolve_upload(
and not (meta and meta.get("owner") is not None) upload_id,
owner=owner,
auth_manager=auth_manager,
) )
if allow_admin and auth_manager and hasattr(auth_manager, "is_admin"): if not resolved:
try:
if auth_manager.is_admin(user):
return True
except Exception:
pass
return bool(meta and meta.get("owner") == user)
def _locate_upload(upload_dir: str, file_id: str, owner: Optional[str] = None, auth_manager=None):
"""Find an upload by its filename ID.
Lookup order:
1. The `uploads.json` index that `UploadHandler.save_upload` maintains,
so owner can be verified before a document reads the source file.
2. Direct hit at `upload_dir/file_id` (very small deployments).
3. Fallback: `os.walk` the date-bucketed tree. Slow on large stores;
only allowed after the index owner check passes, or in single-user /
admin-style contexts where no owner is enforced.
`followlinks=False` keeps a stray symlink loop in `data/uploads/` from
spinning the walker into infinite recursion.
"""
import json as _json
if not _UPLOAD_ID_RE.fullmatch(file_id or ""):
logger.warning("Rejected invalid upload id in document lookup: %r", file_id)
return None return None
path = resolved.get("path")
meta = None upload_dir = getattr(upload_handler, "upload_dir", None)
try: if path and upload_dir and not _upload_path_inside(upload_dir, path):
idx_path = os.path.join(upload_dir, "uploads.json") logger.warning("Upload path outside upload directory: %s", path)
if os.path.exists(idx_path):
with open(idx_path, "r", encoding="utf-8") as f:
idx = _json.load(f)
for item in (idx.values() if isinstance(idx, dict) else []):
if isinstance(item, dict) and item.get("id") == file_id:
meta = item
break
except Exception:
meta = None
if not _upload_owner_allowed(meta, owner, auth_manager):
logger.warning("Upload %s denied for document owner %s", file_id, owner)
return None return None
return path
if meta:
p = meta.get("path")
if p and os.path.exists(p) and _upload_path_inside(upload_dir, p):
return p
direct = os.path.join(upload_dir, file_id) def _locate_upload(
if os.path.exists(direct) and _upload_path_inside(upload_dir, direct): upload_dir: str,
return direct file_id: str,
owner: Optional[str] = None,
auth_manager=None,
upload_handler: Any = None,
):
"""Find an upload by its filename ID via UploadHandler.resolve_upload."""
if upload_handler is None:
from src.upload_handler import UploadHandler
for root, _dirs, files in os.walk(upload_dir, followlinks=False): base_dir = os.path.dirname(os.path.abspath(upload_dir))
if file_id in files: upload_handler = UploadHandler(base_dir, upload_dir)
p = os.path.join(root, file_id) return _resolve_user_upload_path(upload_handler, file_id, owner, auth_manager)
if _upload_path_inside(upload_dir, p):
return p
return None def _assert_pdf_marker_upload_owned(
request: Request,
content: str,
user: Optional[str],
upload_handler: Any,
) -> None:
"""Reject document content whose pdf_source marker points at another user's upload."""
if upload_handler is None:
return
from src.pdf_form_doc import find_source_upload_id
upload_id = find_source_upload_id(content or "")
if not upload_id:
return
auth_manager = getattr(getattr(request.app, "state", None), "auth_manager", None)
if not _resolve_user_upload_path(upload_handler, upload_id, user, auth_manager):
raise HTTPException(
400,
"Document PDF marker references an upload you do not own",
)
def _derive_title(content: str) -> str: def _derive_title(content: str) -> str:
+24 -21
View File
@@ -20,15 +20,19 @@ from routes.document_helpers import (
DocumentCreate, DocumentUpdate, DocumentPatch, DocumentCreate, DocumentUpdate, DocumentPatch,
_doc_to_dict, _version_to_dict, _doc_to_dict, _version_to_dict,
_verify_doc_owner, _owner_session_filter, _verify_doc_owner, _owner_session_filter,
_slug, _locate_upload, _derive_title, _slug, _resolve_user_upload_path, _assert_pdf_marker_upload_owned, _derive_title,
_PDF_RENDER_SCALE, _PDF_RENDER_SCALE,
) )
def _locate_current_user_upload(request: Request, upload_dir: str, upload_id: str, user: Optional[str]): def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
auth_manager = getattr(getattr(request.app, "state", None), "auth_manager", None) router = APIRouter(tags=["documents"])
return _locate_upload(upload_dir, upload_id, owner=user, auth_manager=auth_manager)
def _locate_current_user_upload(request: Request, upload_id: str, user: Optional[str]):
if upload_handler is None:
return None
auth_manager = getattr(getattr(request.app, "state", None), "auth_manager", None)
return _resolve_user_upload_path(upload_handler, upload_id, user, auth_manager)
def _load_pdf_viewer_fitz(): def _load_pdf_viewer_fitz():
from src.pdf_runtime import load_pymupdf_for_pdf_viewer from src.pdf_runtime import load_pymupdf_for_pdf_viewer
@@ -38,10 +42,6 @@ def _load_pdf_viewer_fitz():
except RuntimeError as exc: except RuntimeError as exc:
raise HTTPException(503, str(exc)) from exc raise HTTPException(503, str(exc)) from exc
def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
router = APIRouter(tags=["documents"])
# ---- POST /api/document ---- # ---- POST /api/document ----
@router.post("/api/document") @router.post("/api/document")
async def create_document(request: Request, req: DocumentCreate) -> Dict[str, Any]: async def create_document(request: Request, req: DocumentCreate) -> Dict[str, Any]:
@@ -82,6 +82,8 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
if _looks_like_email_document(req.content, req.title): if _looks_like_email_document(req.content, req.title):
language = "email" language = "email"
_assert_pdf_marker_upload_owned(request, req.content, user, upload_handler)
doc = Document( doc = Document(
id=doc_id, id=doc_id,
session_id=req.session_id, session_id=req.session_id,
@@ -176,7 +178,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
raise HTTPException(500, f"Upload failed: {e}") raise HTTPException(500, f"Upload failed: {e}")
upload_id = meta["id"] upload_id = meta["id"]
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user) pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path: if not pdf_path:
raise HTTPException(500, "Saved PDF could not be located") raise HTTPException(500, "Saved PDF could not be located")
@@ -400,8 +402,8 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
text extraction was wired, plus for scanned/image-only PDFs where the text extraction was wired, plus for scanned/image-only PDFs where the
VL model picks up text the basic pypdf path missed.""" VL model picks up text the basic pypdf path missed."""
import re import re
from src.constants import UPLOAD_DIR
from src.document_processor import _process_pdf from src.document_processor import _process_pdf
from src.pdf_form_doc import find_source_upload_id
user = get_current_user(request) user = get_current_user(request)
db = SessionLocal() db = SessionLocal()
@@ -412,12 +414,11 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
_verify_doc_owner(db, doc, user) _verify_doc_owner(db, doc, user)
content = doc.current_content or "" content = doc.current_content or ""
m = re.search(r'<!--\s*(?:pdf_source|pdf_form_source)\s+upload_id="([^"]+)"', content) upload_id = find_source_upload_id(content)
if not m: if not upload_id:
raise HTTPException(400, "Document is not a PDF — no pdf_source marker found") raise HTTPException(400, "Document is not a PDF — no pdf_source marker found")
upload_id = m.group(1)
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user) pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path: if not pdf_path:
raise HTTPException(404, "Source PDF could not be located") raise HTTPException(404, "Source PDF could not be located")
@@ -528,6 +529,8 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
if doc.current_content == req.content: if doc.current_content == req.content:
return _doc_to_dict(doc) return _doc_to_dict(doc)
_assert_pdf_marker_upload_owned(request, req.content, user, upload_handler)
# Check if we can coalesce with the latest version # Check if we can coalesce with the latest version
latest_ver = db.query(DocumentVersion).filter( latest_ver = db.query(DocumentVersion).filter(
DocumentVersion.document_id == doc_id, DocumentVersion.document_id == doc_id,
@@ -930,7 +933,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
if not upload_id: if not upload_id:
raise HTTPException(400, "Document is not linked to a source PDF") raise HTTPException(400, "Document is not linked to a source PDF")
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user) pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path: if not pdf_path:
raise HTTPException(404, f"Source PDF {upload_id} not found in uploads") raise HTTPException(404, f"Source PDF {upload_id} not found in uploads")
@@ -993,7 +996,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
upload_id = find_source_upload_id(doc.current_content or "") upload_id = find_source_upload_id(doc.current_content or "")
if not upload_id: if not upload_id:
raise HTTPException(400, "Document is not linked to a source PDF") raise HTTPException(400, "Document is not linked to a source PDF")
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user) pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path: if not pdf_path:
raise HTTPException(404, f"Source PDF {upload_id} not found") raise HTTPException(404, f"Source PDF {upload_id} not found")
@@ -1061,7 +1064,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
upload_id = find_source_upload_id(doc.current_content or "") upload_id = find_source_upload_id(doc.current_content or "")
if not upload_id: if not upload_id:
raise HTTPException(400, "Document is not linked to a source PDF") raise HTTPException(400, "Document is not linked to a source PDF")
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user) pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path: if not pdf_path:
raise HTTPException(404, "Source PDF not found") raise HTTPException(404, "Source PDF not found")
finally: finally:
@@ -1117,7 +1120,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
upload_id = find_source_upload_id(doc.current_content or "") upload_id = find_source_upload_id(doc.current_content or "")
if not upload_id: if not upload_id:
raise HTTPException(400, "Document is not linked to a source PDF") raise HTTPException(400, "Document is not linked to a source PDF")
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user) pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path: if not pdf_path:
raise HTTPException(404, "Source PDF not found") raise HTTPException(404, "Source PDF not found")
finally: finally:
@@ -1266,7 +1269,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
upload_id = find_source_upload_id(doc.current_content or "") upload_id = find_source_upload_id(doc.current_content or "")
if not upload_id: if not upload_id:
raise HTTPException(400, "Document is not linked to a source PDF") raise HTTPException(400, "Document is not linked to a source PDF")
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user) pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path: if not pdf_path:
raise HTTPException(404, f"Source PDF {upload_id} not found") raise HTTPException(404, f"Source PDF {upload_id} not found")
@@ -1361,7 +1364,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
if not upload_id: if not upload_id:
raise HTTPException(400, "Document is not linked to a source PDF") raise HTTPException(400, "Document is not linked to a source PDF")
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user) pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path: if not pdf_path:
raise HTTPException(404, f"Source PDF {upload_id} not found in uploads") raise HTTPException(404, f"Source PDF {upload_id} not found in uploads")
@@ -1505,7 +1508,7 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
upload_id = find_source_upload_id(doc.current_content or "") upload_id = find_source_upload_id(doc.current_content or "")
if not upload_id: if not upload_id:
raise HTTPException(400, "Document is not linked to a source PDF") raise HTTPException(400, "Document is not linked to a source PDF")
pdf_path = _locate_current_user_upload(request, UPLOAD_DIR, upload_id, user) pdf_path = _locate_current_user_upload(request, upload_id, user)
if not pdf_path: if not pdf_path:
raise HTTPException(404, f"Source PDF {upload_id} not found") raise HTTPException(404, f"Source PDF {upload_id} not found")
+10 -1
View File
@@ -167,9 +167,18 @@ def find_source_upload_id(content: str) -> Optional[str]:
Matches both the form-source marker (`pdf_form_source`) used for fillable Matches both the form-source marker (`pdf_form_source`) used for fillable
PDFs and the plain marker (`pdf_source`) used for any imported PDF. PDFs and the plain marker (`pdf_source`) used for any imported PDF.
Rejects malformed ids (path traversal, wrong shape) before any lookup.
""" """
from src.upload_handler import is_valid_upload_id
m = _FRONT_MATTER_RE.search(content or "") or _PLAIN_FRONT_MATTER_RE.search(content or "") m = _FRONT_MATTER_RE.search(content or "") or _PLAIN_FRONT_MATTER_RE.search(content or "")
return m.group("upload_id") if m else None if not m:
return None
upload_id = m.group("upload_id")
if not is_valid_upload_id(upload_id):
logger.warning("Ignoring invalid pdf_source upload_id in document content: %r", upload_id)
return None
return upload_id
def render_plain_pdf_markdown(upload_id: str, title: str, body_text: Optional[str] = None) -> str: def render_plain_pdf_markdown(upload_id: str, title: str, body_text: Optional[str] = None) -> str:
+9 -2
View File
@@ -29,6 +29,14 @@ import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
UPLOAD_ID_RE = re.compile(r"^[0-9a-fA-F]{32}\.[A-Za-z0-9]+$")
def is_valid_upload_id(upload_id: str) -> bool:
"""Return True when *upload_id* matches the canonical uploads.json id format."""
return UPLOAD_ID_RE.fullmatch(upload_id or "") is not None
class UploadHandler: class UploadHandler:
def __init__(self, base_dir: str, upload_dir: str): def __init__(self, base_dir: str, upload_dir: str):
self.base_dir = base_dir self.base_dir = base_dir
@@ -223,8 +231,7 @@ class UploadHandler:
def validate_upload_id(self, upload_id: str) -> bool: def validate_upload_id(self, upload_id: str) -> bool:
"""Validate that the upload ID matches the expected pattern.""" """Validate that the upload ID matches the expected pattern."""
pattern = r'^[0-9a-fA-F]{32}\.[A-Za-z0-9]+$' return is_valid_upload_id(upload_id)
return re.fullmatch(pattern, upload_id) is not None
def _inside_upload_dir(self, path: str) -> bool: def _inside_upload_dir(self, path: str) -> bool:
"""Check if path is inside the upload directory.""" """Check if path is inside the upload directory."""
+72 -2
View File
@@ -367,17 +367,87 @@ def test_chat_preprocess_does_not_surface_cross_owner_attachment(tmp_path, monke
def test_document_upload_lookup_rejects_cross_owner_marker(tmp_path, monkeypatch): def test_document_upload_lookup_rejects_cross_owner_marker(tmp_path, monkeypatch):
from src.upload_handler import UploadHandler
sys.modules.pop("routes.document_helpers", None) sys.modules.pop("routes.document_helpers", None)
_stub_core_database_for_route_imports(monkeypatch) _stub_core_database_for_route_imports(monkeypatch)
from routes.document_helpers import _locate_upload from routes.document_helpers import _locate_upload
upload_dir, _alice_id, bob_id = _make_upload_store(tmp_path) upload_dir, _alice_id, bob_id = _make_upload_store(tmp_path)
handler = UploadHandler(str(tmp_path), str(upload_dir))
assert _locate_upload(str(upload_dir), bob_id, owner="alice") is None assert _locate_upload(str(upload_dir), bob_id, owner="alice", upload_handler=handler) is None
assert _locate_upload(str(upload_dir), bob_id, owner="bob").endswith(bob_id) assert _locate_upload(str(upload_dir), bob_id, owner="bob", upload_handler=handler).endswith(bob_id)
sys.modules.pop("routes.document_helpers", None) sys.modules.pop("routes.document_helpers", None)
def test_find_source_upload_id_rejects_path_traversal_marker():
from src.pdf_form_doc import find_source_upload_id
content = '<!-- pdf_source upload_id="../../etc/passwd" -->\n\n# x\n'
assert find_source_upload_id(content) is None
def test_pdf_marker_write_rejects_cross_owner_upload(tmp_path, monkeypatch):
"""Saving a doc whose front-matter points at another user's upload must 400."""
from src.upload_handler import UploadHandler
sys.modules.pop("routes.document_helpers", None)
_stub_core_database_for_route_imports(monkeypatch)
from fastapi import HTTPException
from routes.document_helpers import _assert_pdf_marker_upload_owned
upload_dir, _alice_id, bob_id = _make_upload_store(tmp_path)
handler = UploadHandler(str(tmp_path), str(upload_dir))
class _AuthMgr:
is_configured = True
@staticmethod
def is_admin(_user):
return False
class _AppState:
auth_manager = _AuthMgr()
class _App:
state = _AppState()
class _Req:
app = _App()
marker = f'<!-- pdf_source upload_id="{bob_id}" -->\n\n# Notes\n'
with pytest.raises(HTTPException) as exc:
_assert_pdf_marker_upload_owned(_Req(), marker, "alice", handler)
assert exc.value.status_code == 400
# Own upload is allowed
own_marker = f'<!-- pdf_source upload_id="{_alice_id}" -->\n\n# Notes\n'
_assert_pdf_marker_upload_owned(_Req(), own_marker, "alice", handler)
sys.modules.pop("routes.document_helpers", None)
def test_pdf_marker_render_lookup_denies_cross_owner_without_doc_leak(tmp_path):
"""Read path: cross-owner marker resolves to None (404 at route layer)."""
from src.upload_handler import UploadHandler
upload_dir, alice_id, bob_id = _make_upload_store(tmp_path)
handler = UploadHandler(str(tmp_path), str(upload_dir))
class _AuthMgr:
is_configured = True
@staticmethod
def is_admin(_user):
return False
assert handler.resolve_upload(bob_id, owner="alice", auth_manager=_AuthMgr()) is None
resolved = handler.resolve_upload(alice_id, owner="alice", auth_manager=_AuthMgr())
assert resolved is not None
assert resolved["path"].endswith(alice_id)
# ── require_user dependency rejects anon callers ──────────────── # ── require_user dependency rejects anon callers ────────────────
def test_require_user_rejects_unauthenticated(monkeypatch): def test_require_user_rejects_unauthenticated(monkeypatch):