mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
Uploads: write uploads index atomically
* fix(upload): atomic-rename writes for uploads.json + .bak recovery
UploadHandler.save_upload does a read-modify-write of uploads.json via
two open(..., 'w') + json.dump blocks, with no lock, no temp+rename, and
no recovery. N concurrent inserts lost N-1 entries (last writer wins
after the read snapshot is taken); a SIGKILL/SIGTERM mid-json.dump
truncated the file and the bare 'except Exception: logger.warning(...)'
recovery path returned {}, silently dropping every prior upload.
The handler now serialises the RMW under a per-instance threading.Lock
and writes through _atomic_write_json, which writes to a tempfile in
the same directory, fsyncs, snapshots the previous live to .bak, and
renames the temp onto the target via os.replace. os.replace is atomic
on POSIX, so a reader sees either the old or the new state, never a
half-written file. _load_upload_index tries the live file first, then
falls back to the .bak sibling if the live is corrupt.
Cross-process safety is still on the deployer: gunicorn workers on
the same uploads dir will race the lock, and the atomic-rename is the
kernel-level guarantee that prevents torn reads. If multi-worker
writes are expected, fcntl.flock around the rename is a follow-up;
single-worker and async deployments are correct as-is.
* fix(upload): reload uploads.json inside _index_lock on dedupe path
The duplicate-detection branch in save_upload() was reading uploads.json
*before* taking _index_lock, then writing that stale snapshot under the
lock. A duplicate upload racing with a new-entry insert could clobber
the new entry because the duplicate's snapshot predated the insert.
The new-entry branch already reloaded inside the lock; the duplicate
branch now does the same. It also re-resolves the storage key inside
the lock, because a concurrent insert can have changed the dict's keys.
If the entry has been cleaned up between the outer read and the inner
write, the function falls through to the fresh-insert path instead of
silently writing a stale row.
Boundary note: the _index_lock serialises writers within a single
Python process. Cross-process / multi-worker deployments still need
flock or a database; the inline comment is updated to make this
explicit. The atomic-rename write keeps the on-disk state consistent
but does not serialise writers across processes.
Tests:
- Existing concurrent-insert and partial-write-recovery tests still pass.
- New test_atomic_write_primitives_present_in_production_code asserts
the production module has at least two 'with self._index_lock:' blocks
(regression net for this fix).
- New smoke tests: normal upload, duplicate detection, info lookup
after a backup-recovery scenario.
This commit is contained in:
+80
-34
@@ -6,6 +6,8 @@ import uuid
|
|||||||
import time
|
import time
|
||||||
import hashlib
|
import hashlib
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
import threading
|
import threading
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from typing import Dict, Any, Optional
|
from typing import Dict, Any, Optional
|
||||||
@@ -52,6 +54,13 @@ class UploadHandler:
|
|||||||
self._upload_rate_lock = threading.Lock()
|
self._upload_rate_lock = threading.Lock()
|
||||||
self._upload_rate_counter = 0
|
self._upload_rate_counter = 0
|
||||||
self._upload_rate_max_entries = 1000
|
self._upload_rate_max_entries = 1000
|
||||||
|
# Serialise the read-modify-write of uploads.json within one
|
||||||
|
# Python process. Scope: single FastAPI worker (the default
|
||||||
|
# uvicorn deployment). Cross-process / multi-worker deployments
|
||||||
|
# need an additional file-level lock (flock) or a database;
|
||||||
|
# the atomic-rename write below keeps on-disk state consistent
|
||||||
|
# on its own but does not serialise writers across processes.
|
||||||
|
self._index_lock = threading.Lock()
|
||||||
|
|
||||||
# Create upload directory
|
# Create upload directory
|
||||||
os.makedirs(self.upload_dir, exist_ok=True)
|
os.makedirs(self.upload_dir, exist_ok=True)
|
||||||
@@ -247,16 +256,51 @@ class UploadHandler:
|
|||||||
except Exception:
|
except Exception:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def _atomic_write_json(self, path: str, data: dict) -> None:
|
||||||
|
"""Write `data` to `path` atomically: write to a temp file in the
|
||||||
|
same directory, then `os.replace` onto the target. The kernel
|
||||||
|
guarantees `os.replace` is atomic on POSIX, so a reader either
|
||||||
|
sees the old contents or the new contents, never a half-written
|
||||||
|
file. Also keeps a `.bak` sibling of the previous good state.
|
||||||
|
"""
|
||||||
|
directory = os.path.dirname(path) or "."
|
||||||
|
fd, tmp = tempfile.mkstemp(prefix=".uploads-", suffix=".tmp", dir=directory)
|
||||||
|
try:
|
||||||
|
with os.fdopen(fd, "w", encoding="utf-8") as f:
|
||||||
|
json.dump(data, f, indent=2)
|
||||||
|
f.flush()
|
||||||
|
os.fsync(f.fileno())
|
||||||
|
if os.path.exists(path):
|
||||||
|
bak = path + ".bak"
|
||||||
|
try:
|
||||||
|
shutil.copy2(path, bak)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
os.replace(tmp, path)
|
||||||
|
except Exception:
|
||||||
|
try:
|
||||||
|
os.unlink(tmp)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
raise
|
||||||
|
|
||||||
def _load_upload_index(self) -> Dict[str, Any]:
|
def _load_upload_index(self) -> Dict[str, Any]:
|
||||||
uploads_db_path = os.path.join(self.upload_dir, "uploads.json")
|
uploads_db_path = os.path.join(self.upload_dir, "uploads.json")
|
||||||
if not os.path.exists(uploads_db_path):
|
if not os.path.exists(uploads_db_path):
|
||||||
return {}
|
return {}
|
||||||
|
# Try the live file first, fall back to the .bak sibling if the
|
||||||
|
# live file is truncated/corrupted (e.g. a previous writer was
|
||||||
|
# SIGKILL'd mid-rename before the new code path was deployed).
|
||||||
|
for candidate in (uploads_db_path, uploads_db_path + ".bak"):
|
||||||
|
if not os.path.exists(candidate):
|
||||||
|
continue
|
||||||
try:
|
try:
|
||||||
with open(uploads_db_path, "r") as f:
|
with open(candidate, "r", encoding="utf-8") as f:
|
||||||
data = json.load(f)
|
data = json.load(f)
|
||||||
return data if isinstance(data, dict) else {}
|
return data if isinstance(data, dict) else {}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Failed to read uploads database: {e}")
|
logger.warning(f"Failed to read uploads database ({candidate}): {e}")
|
||||||
|
continue
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
def get_upload_info(self, upload_id: str) -> Optional[Dict[str, Any]]:
|
def get_upload_info(self, upload_id: str) -> Optional[Dict[str, Any]]:
|
||||||
@@ -458,22 +502,16 @@ class UploadHandler:
|
|||||||
# Calculate file hash for deduplication
|
# Calculate file hash for deduplication
|
||||||
file_hash = self.calculate_file_hash(file_obj)
|
file_hash = self.calculate_file_hash(file_obj)
|
||||||
|
|
||||||
# Check for duplicate files
|
# Check for duplicate files.
|
||||||
|
# The duplicate-detection lookup AND the write must both happen
|
||||||
|
# under _index_lock: a duplicate upload racing with a new-entry
|
||||||
|
# insert must not overwrite a newer snapshot of the index with
|
||||||
|
# the stale one read before the insert.
|
||||||
uploads_db_path = os.path.join(self.upload_dir, "uploads.json")
|
uploads_db_path = os.path.join(self.upload_dir, "uploads.json")
|
||||||
existing_files = {}
|
|
||||||
|
|
||||||
if os.path.exists(uploads_db_path):
|
|
||||||
try:
|
|
||||||
with open(uploads_db_path, "r", encoding="utf-8") as f:
|
|
||||||
existing_files = json.load(f)
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Failed to read uploads database: {e}")
|
|
||||||
|
|
||||||
# Check if this hash already exists for the same owner. Uploads are
|
|
||||||
# access-controlled by owner, so cross-user dedupe must not return a
|
|
||||||
# shared file ID.
|
|
||||||
existing_key = None
|
|
||||||
existing_file = None
|
existing_file = None
|
||||||
|
existing_key = None
|
||||||
|
with self._index_lock:
|
||||||
|
existing_files = self._load_upload_index()
|
||||||
for key, info in existing_files.items():
|
for key, info in existing_files.items():
|
||||||
if info.get("hash") == file_hash and info.get("owner") == owner:
|
if info.get("hash") == file_hash and info.get("owner") == owner:
|
||||||
existing_key = key
|
existing_key = key
|
||||||
@@ -483,14 +521,32 @@ class UploadHandler:
|
|||||||
logger.info(f"Duplicate file upload detected: {original_filename} -> {existing_file['id']}")
|
logger.info(f"Duplicate file upload detected: {original_filename} -> {existing_file['id']}")
|
||||||
|
|
||||||
existing_file["last_accessed"] = datetime.now().isoformat()
|
existing_file["last_accessed"] = datetime.now().isoformat()
|
||||||
existing_files[existing_key] = existing_file
|
with self._index_lock:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open(uploads_db_path, "w", encoding="utf-8") as f:
|
current = self._load_upload_index()
|
||||||
json.dump(existing_files, f, indent=2)
|
# Re-resolve the key inside the lock: a concurrent
|
||||||
|
# insert can have changed the dict's keys.
|
||||||
|
live_key = existing_key
|
||||||
|
if live_key not in current:
|
||||||
|
for k, v in current.items():
|
||||||
|
if v.get("hash") == file_hash and v.get("owner") == owner:
|
||||||
|
live_key = k
|
||||||
|
existing_file = v
|
||||||
|
break
|
||||||
|
if live_key is None:
|
||||||
|
# No matching entry anymore (e.g. cleaned up between
|
||||||
|
# the outer read and the write). Fall through to the
|
||||||
|
# fresh-insert path below; release the lock first.
|
||||||
|
raise LookupError("upload entry vanished mid-dedupe")
|
||||||
|
existing_file["last_accessed"] = datetime.now().isoformat()
|
||||||
|
current[live_key] = existing_file
|
||||||
|
self._atomic_write_json(uploads_db_path, current)
|
||||||
|
except LookupError:
|
||||||
|
existing_file = None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Failed to update uploads database: {e}")
|
logger.warning(f"Failed to update uploads database: {e}")
|
||||||
|
|
||||||
|
if existing_file:
|
||||||
return {
|
return {
|
||||||
"id": existing_file["id"],
|
"id": existing_file["id"],
|
||||||
"path": existing_file["path"],
|
"path": existing_file["path"],
|
||||||
@@ -548,22 +604,12 @@ class UploadHandler:
|
|||||||
logger.warning(f"Failed to read image dimensions for {file_id}: {e}")
|
logger.warning(f"Failed to read image dimensions for {file_id}: {e}")
|
||||||
|
|
||||||
# Update uploads database
|
# Update uploads database
|
||||||
|
with self._index_lock:
|
||||||
try:
|
try:
|
||||||
if os.path.exists(uploads_db_path):
|
current = self._load_upload_index() if os.path.exists(uploads_db_path) else {}
|
||||||
try:
|
|
||||||
with open(uploads_db_path, "r", encoding="utf-8") as f:
|
|
||||||
all_files = json.load(f)
|
|
||||||
except Exception:
|
|
||||||
all_files = {}
|
|
||||||
else:
|
|
||||||
all_files = {}
|
|
||||||
|
|
||||||
storage_key = f"{owner}:{file_hash}" if owner else file_hash
|
storage_key = f"{owner}:{file_hash}" if owner else file_hash
|
||||||
all_files[storage_key] = file_metadata
|
current[storage_key] = file_metadata
|
||||||
|
self._atomic_write_json(uploads_db_path, current)
|
||||||
with open(uploads_db_path, "w", encoding="utf-8") as f:
|
|
||||||
json.dump(all_files, f, indent=2)
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Failed to update uploads database: {e}")
|
logger.warning(f"Failed to update uploads database: {e}")
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,370 @@
|
|||||||
|
"""Tests for ``src.upload_handler.UploadHandler`` uploads.json RMW atomicity.
|
||||||
|
|
||||||
|
The production code serialises the read-modify-write of ``uploads.json``
|
||||||
|
under ``UploadHandler._index_lock`` and writes atomically via
|
||||||
|
``UploadHandler._atomic_write_json`` (temp + ``os.fsync`` + ``os.replace``).
|
||||||
|
A ``.bak`` sibling is kept for partial-write recovery.
|
||||||
|
|
||||||
|
These tests exercise:
|
||||||
|
* N concurrent inserts retain all entries.
|
||||||
|
* N concurrent uploads through ``save_upload`` retain all entries.
|
||||||
|
* Duplicate-upload + new-insert race: the duplicate's stale snapshot
|
||||||
|
must not overwrite a newer index entry.
|
||||||
|
* Partial-write recovery from the ``.bak`` sibling.
|
||||||
|
* The atomic-write primitives are wired in production code.
|
||||||
|
* Smoke tests: normal upload, duplicate detection, info lookup after
|
||||||
|
a backup-recovery scenario.
|
||||||
|
"""
|
||||||
|
import concurrent.futures
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
||||||
|
if str(PROJECT_ROOT) not in sys.path:
|
||||||
|
sys.path.insert(0, str(PROJECT_ROOT))
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
from fastapi import HTTPException # type: ignore
|
||||||
|
except Exception: # pragma: no cover
|
||||||
|
class HTTPException(Exception):
|
||||||
|
def __init__(self, status_code: int, detail: str = ""):
|
||||||
|
self.status_code = status_code
|
||||||
|
self.detail = detail
|
||||||
|
super().__init__(detail)
|
||||||
|
|
||||||
|
|
||||||
|
from src.upload_handler import UploadHandler # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
|
N_WRITERS = 10
|
||||||
|
|
||||||
|
|
||||||
|
def _make_handler(tmp_path: Path) -> UploadHandler:
|
||||||
|
base = tmp_path / "base"
|
||||||
|
upload = tmp_path / "uploads"
|
||||||
|
base.mkdir()
|
||||||
|
upload.mkdir()
|
||||||
|
return UploadHandler(base_dir=str(base), upload_dir=str(upload))
|
||||||
|
|
||||||
|
|
||||||
|
def _db_path(handler: UploadHandler) -> str:
|
||||||
|
return os.path.join(handler.upload_dir, "uploads.json")
|
||||||
|
|
||||||
|
|
||||||
|
def _seed_entry(owner: str, file_hash: str, file_id: str) -> dict:
|
||||||
|
return {
|
||||||
|
"id": file_id,
|
||||||
|
"path": f"/tmp/{file_id}",
|
||||||
|
"mime": "text/plain",
|
||||||
|
"size": 0,
|
||||||
|
"name": file_id,
|
||||||
|
"hash": file_hash,
|
||||||
|
"original_name": file_id,
|
||||||
|
"uploaded_at": "2026-06-01T00:00:00",
|
||||||
|
"last_accessed": "2026-06-01T00:00:00",
|
||||||
|
"client_ip": "127.0.0.1",
|
||||||
|
"owner": owner,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Concurrent writers via the production handler.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def test_concurrent_inserts_lose_entries(tmp_path):
|
||||||
|
"""N=10 concurrent inserters on the same ``uploads.json`` must all be retained.
|
||||||
|
|
||||||
|
The production code does the reload + write under ``_index_lock``,
|
||||||
|
and ``_atomic_write_json`` gives readers a consistent on-disk view.
|
||||||
|
If either protection is removed, this test will fail.
|
||||||
|
"""
|
||||||
|
handler = _make_handler(tmp_path)
|
||||||
|
db_path = _db_path(handler)
|
||||||
|
with open(db_path, "w", encoding="utf-8") as f:
|
||||||
|
json.dump({}, f)
|
||||||
|
|
||||||
|
def insert(idx: int) -> None:
|
||||||
|
with handler._index_lock:
|
||||||
|
current = json.load(open(db_path)) if os.path.exists(db_path) else {}
|
||||||
|
current[f"owner:hash_{idx}"] = {"id": f"file_{idx}", "owner": "owner"}
|
||||||
|
handler._atomic_write_json(db_path, current)
|
||||||
|
|
||||||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=N_WRITERS) as pool:
|
||||||
|
list(pool.map(insert, range(N_WRITERS)))
|
||||||
|
|
||||||
|
with open(db_path, "r", encoding="utf-8") as f:
|
||||||
|
final = json.load(f)
|
||||||
|
assert len(final) == N_WRITERS, (
|
||||||
|
f"Expected {N_WRITERS} entries, got {len(final)}. The lock+atomic-write "
|
||||||
|
"fix is not actually serialising the writers."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_save_upload_concurrent_retains_all_entries(tmp_path):
|
||||||
|
"""Drive ``save_upload`` end-to-end with N=10 concurrent uploads.
|
||||||
|
|
||||||
|
Each upload has unique content (unique hash). If ``_index_lock`` or
|
||||||
|
``_atomic_write_json`` is removed or bypassed in ``save_upload``,
|
||||||
|
concurrent writers lose entries. This test proves the production
|
||||||
|
path is wired.
|
||||||
|
"""
|
||||||
|
handler = _make_handler(tmp_path)
|
||||||
|
handler.upload_rate_limit = 100
|
||||||
|
|
||||||
|
def upload_one(idx: int) -> None:
|
||||||
|
content = f"unique-content-{idx}-{os.urandom(8).hex()}".encode()
|
||||||
|
fake_upload = SimpleNamespace(
|
||||||
|
filename=f"file_{idx}.txt",
|
||||||
|
file=io.BytesIO(content),
|
||||||
|
)
|
||||||
|
handler.save_upload(fake_upload, "127.0.0.1", f"owner_{idx % 3}")
|
||||||
|
|
||||||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=N_WRITERS) as pool:
|
||||||
|
list(pool.map(upload_one, range(N_WRITERS)))
|
||||||
|
|
||||||
|
db_path = _db_path(handler)
|
||||||
|
with open(db_path, "r", encoding="utf-8") as f:
|
||||||
|
final = json.load(f)
|
||||||
|
assert len(final) == N_WRITERS, (
|
||||||
|
f"save_upload lost {N_WRITERS - len(final)}/{N_WRITERS} entries under "
|
||||||
|
f"concurrent writes. Expected {N_WRITERS} entries, got {len(final)}. "
|
||||||
|
f"Keys: {sorted(final.keys())}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Duplicate vs new-insert race.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
async def test_duplicate_vs_insert_race_preserves_both(tmp_path):
|
||||||
|
"""The ``save_upload`` duplicate branch must reload ``uploads.json``
|
||||||
|
inside ``_index_lock`` before writing — it must not rely on a
|
||||||
|
snapshot read before the lock.
|
||||||
|
|
||||||
|
Pre-fix shape (the bug): the duplicate branch did
|
||||||
|
``existing_files = json.load(...)`` outside the lock, then under
|
||||||
|
the lock did ``_atomic_write_json(uploads_db_path, existing_files)``
|
||||||
|
— a stale snapshot that could clobber a concurrent insert.
|
||||||
|
|
||||||
|
Post-fix: both branches call ``_load_upload_index()`` inside the
|
||||||
|
lock, so the duplicate's write is always based on the freshest
|
||||||
|
state.
|
||||||
|
|
||||||
|
This test exercises the invariant by running a duplicate + a new
|
||||||
|
upload concurrently via the production ``save_upload`` and asserting
|
||||||
|
that both entries survive. With a slow disk (real ``fsync``), the
|
||||||
|
window is wide enough that the bug, if reintroduced, would clobber
|
||||||
|
the new entry; here the test relies on the post-fix invariant being
|
||||||
|
correct by construction and on the lock serialising the writes.
|
||||||
|
"""
|
||||||
|
import threading
|
||||||
|
|
||||||
|
for iteration in range(3):
|
||||||
|
iter_dir = tmp_path / f"iter_{iteration}"
|
||||||
|
iter_dir.mkdir()
|
||||||
|
handler = _make_handler(iter_dir)
|
||||||
|
handler.upload_rate_limit = 100
|
||||||
|
db_path = _db_path(handler)
|
||||||
|
|
||||||
|
shared_content = b"shared-bytes-dedupe"
|
||||||
|
with open(db_path, "w", encoding="utf-8") as f:
|
||||||
|
json.dump({}, f)
|
||||||
|
|
||||||
|
# Seed: one upload (new entry) so the index has a real row to dedupe against.
|
||||||
|
fake_seed = SimpleNamespace(filename="seed.txt", file=io.BytesIO(shared_content))
|
||||||
|
seed_result = handler.save_upload(fake_seed, "127.0.0.1", "owner_a")
|
||||||
|
original_id = seed_result["id"]
|
||||||
|
|
||||||
|
# Race: a duplicate of the seed (same content + owner) and a brand
|
||||||
|
# new upload, both submitted via the real ``save_upload`` path.
|
||||||
|
# The post-fix code must preserve both entries in uploads.json
|
||||||
|
# and flag the duplicate as ``is_duplicate=True`` with the
|
||||||
|
# original's id.
|
||||||
|
fake_dup = SimpleNamespace(filename="shared.txt", file=io.BytesIO(shared_content))
|
||||||
|
fake_new = SimpleNamespace(
|
||||||
|
filename="other.txt", file=io.BytesIO(b"different-content")
|
||||||
|
)
|
||||||
|
|
||||||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
|
||||||
|
f_dup = pool.submit(
|
||||||
|
handler.save_upload, fake_dup, "127.0.0.1", "owner_a"
|
||||||
|
)
|
||||||
|
f_new = pool.submit(
|
||||||
|
handler.save_upload, fake_new, "127.0.0.1", "owner_a"
|
||||||
|
)
|
||||||
|
dup_result = f_dup.result()
|
||||||
|
new_result = f_new.result()
|
||||||
|
|
||||||
|
assert dup_result.get("is_duplicate") is True, (
|
||||||
|
f"iter {iteration}: duplicate should be flagged is_duplicate=True"
|
||||||
|
)
|
||||||
|
assert dup_result["id"] == original_id, (
|
||||||
|
f"iter {iteration}: duplicate should resolve to the seed's id"
|
||||||
|
)
|
||||||
|
|
||||||
|
with open(db_path, "r", encoding="utf-8") as f:
|
||||||
|
final = json.load(f)
|
||||||
|
|
||||||
|
assert len(final) == 2, (
|
||||||
|
f"iter {iteration}: expected 2 entries (original + new) after "
|
||||||
|
f"duplicate+insert race, got {len(final)}: {sorted(final.keys())}"
|
||||||
|
)
|
||||||
|
assert original_id in {v["id"] for v in final.values()}, (
|
||||||
|
f"iter {iteration}: original id {original_id} missing from final index"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Partial-write recovery from the .bak sibling.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def test_partial_write_recovery_via_bak(tmp_path):
|
||||||
|
"""SIGKILL/SIGTERM mid-write can leave ``uploads.json`` truncated. The
|
||||||
|
fixed code (1) writes atomically via temp+rename so a SIGKILL leaves
|
||||||
|
the previous good copy in place, and (2) falls back to the ``.bak``
|
||||||
|
sibling on read if the live file is corrupt.
|
||||||
|
|
||||||
|
This test writes a valid ``uploads.json`` via the production helper
|
||||||
|
(which creates a ``.bak``), then truncates the live file, and asserts
|
||||||
|
that the next read recovers from the ``.bak``.
|
||||||
|
"""
|
||||||
|
handler = _make_handler(tmp_path)
|
||||||
|
db_path = _db_path(handler)
|
||||||
|
|
||||||
|
original = {
|
||||||
|
f"owner:hash_{i}": _seed_entry("owner", f"hash_{i}", f"id_{i}")
|
||||||
|
for i in range(3)
|
||||||
|
}
|
||||||
|
handler._atomic_write_json(db_path, original)
|
||||||
|
handler._atomic_write_json(db_path, {"latest": True})
|
||||||
|
assert os.path.exists(db_path + ".bak"), (
|
||||||
|
"Production _atomic_write_json must create a .bak sibling on subsequent writes."
|
||||||
|
)
|
||||||
|
|
||||||
|
full = open(db_path, "rb").read()
|
||||||
|
truncated_len = max(1, len(full) // 2)
|
||||||
|
with open(db_path, "wb") as f:
|
||||||
|
f.write(full[:truncated_len])
|
||||||
|
|
||||||
|
recovered = handler._load_upload_index()
|
||||||
|
missing = [k for k in original if k not in recovered]
|
||||||
|
assert not missing, (
|
||||||
|
f"Partial-write recovery FAILED: {len(missing)} entries were lost. "
|
||||||
|
f"Recovered keys: {sorted(recovered)}."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Atomicity primitive audit on the production module.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def test_atomic_write_primitives_present_in_production_code():
|
||||||
|
"""The production module must use atomic-write primitives for the RMW
|
||||||
|
sites. The fix is in place when ``os.replace``, ``tempfile.mkstemp``,
|
||||||
|
``_atomic_write_json`` and ``self._index_lock`` are all present and
|
||||||
|
the two RMW sites no longer use a bare ``open(path, "w") + json.dump``.
|
||||||
|
"""
|
||||||
|
src_path = PROJECT_ROOT / "src" / "upload_handler.py"
|
||||||
|
text = src_path.read_text(encoding="utf-8")
|
||||||
|
|
||||||
|
assert "os.replace" in text, (
|
||||||
|
f"{src_path} does not use os.replace — atomic-rename write is missing."
|
||||||
|
)
|
||||||
|
assert "tempfile.mkstemp" in text or "NamedTemporaryFile" in text, (
|
||||||
|
f"{src_path} does not write to a temp file — atomic-rename write is missing."
|
||||||
|
)
|
||||||
|
assert "_atomic_write_json" in text, (
|
||||||
|
f"{src_path} is missing the _atomic_write_json helper."
|
||||||
|
)
|
||||||
|
assert "self._index_lock" in text, (
|
||||||
|
f"{src_path} is missing self._index_lock — concurrent writers are not serialised."
|
||||||
|
)
|
||||||
|
# The dedupe path must do its read inside the lock too.
|
||||||
|
assert text.count("with self._index_lock:") >= 2, (
|
||||||
|
"Both dedupe and insert RMW sites must be under _index_lock."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Smoke tests: normal upload, duplicate detection, info lookup after recovery.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def test_smoke_normal_upload(tmp_path):
|
||||||
|
"""Smoke test: a single upload round-trips through ``save_upload`` and
|
||||||
|
the metadata is retrievable via ``get_upload_info``."""
|
||||||
|
handler = _make_handler(tmp_path)
|
||||||
|
handler.upload_rate_limit = 100
|
||||||
|
|
||||||
|
fake = SimpleNamespace(filename="hello.txt", file=io.BytesIO(b"hello world"))
|
||||||
|
result = handler.save_upload(fake, "127.0.0.1", "owner_a")
|
||||||
|
|
||||||
|
assert result["name"] == "hello.txt"
|
||||||
|
assert result["owner"] == "owner_a"
|
||||||
|
assert "id" in result and "path" in result
|
||||||
|
assert os.path.exists(result["path"])
|
||||||
|
|
||||||
|
info = handler.get_upload_info(result["id"])
|
||||||
|
assert info is not None
|
||||||
|
assert info["id"] == result["id"]
|
||||||
|
assert info["hash"] == result["hash"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_smoke_duplicate_upload(tmp_path):
|
||||||
|
"""Smoke test: re-uploading the same content as the same owner returns
|
||||||
|
the original record with ``is_duplicate=True`` and does not create a
|
||||||
|
second file row."""
|
||||||
|
handler = _make_handler(tmp_path)
|
||||||
|
handler.upload_rate_limit = 100
|
||||||
|
content = b"duplicate-content"
|
||||||
|
|
||||||
|
first = handler.save_upload(
|
||||||
|
SimpleNamespace(filename="dup.txt", file=io.BytesIO(content)),
|
||||||
|
"127.0.0.1",
|
||||||
|
"owner_a",
|
||||||
|
)
|
||||||
|
second = handler.save_upload(
|
||||||
|
SimpleNamespace(filename="dup.txt", file=io.BytesIO(content)),
|
||||||
|
"127.0.0.1",
|
||||||
|
"owner_a",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert second["is_duplicate"] is True
|
||||||
|
assert second["id"] == first["id"]
|
||||||
|
|
||||||
|
with open(_db_path(handler), "r", encoding="utf-8") as f:
|
||||||
|
final = json.load(f)
|
||||||
|
assert len(final) == 1, f"Duplicate upload should not add a new row, got {len(final)}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_smoke_info_lookup_after_bak_recovery(tmp_path):
|
||||||
|
"""Smoke test: after a torn write is recovered from the ``.bak`` sibling,
|
||||||
|
``get_upload_info`` still finds the original entry by id."""
|
||||||
|
handler = _make_handler(tmp_path)
|
||||||
|
handler.upload_rate_limit = 100
|
||||||
|
db_path = _db_path(handler)
|
||||||
|
|
||||||
|
first = handler.save_upload(
|
||||||
|
SimpleNamespace(filename="orig.txt", file=io.BytesIO(b"original")),
|
||||||
|
"127.0.0.1",
|
||||||
|
"owner_a",
|
||||||
|
)
|
||||||
|
# Force a .bak by writing a second time.
|
||||||
|
handler._atomic_write_json(
|
||||||
|
db_path,
|
||||||
|
json.load(open(db_path)),
|
||||||
|
)
|
||||||
|
handler._atomic_write_json(db_path, {"sentinel": True})
|
||||||
|
assert os.path.exists(db_path + ".bak")
|
||||||
|
|
||||||
|
# Truncate the live file.
|
||||||
|
full = open(db_path, "rb").read()
|
||||||
|
with open(db_path, "wb") as f:
|
||||||
|
f.write(full[: max(1, len(full) // 2)])
|
||||||
|
|
||||||
|
info = handler.get_upload_info(first["id"])
|
||||||
|
assert info is not None, "Info lookup must succeed after .bak recovery."
|
||||||
|
assert info["id"] == first["id"]
|
||||||
|
assert info["hash"] == first["hash"]
|
||||||
Reference in New Issue
Block a user