fix: improve uploaded document retrieval and deep research reuse (#4784)

* fix: improve uploaded document retrieval and deep research reuse

* test: add coverage for upload manifest and document pagination

* chore: rerun CI

* fix: restore _insert_before_latest_user helper

* fix(agent_loop): restore missing upload context helper
This commit is contained in:
muhamed hamed
2026-06-27 21:24:17 +03:00
committed by GitHub
parent 7e9bfb1700
commit 3e7af8634f
8 changed files with 411 additions and 5 deletions
+31
View File
@@ -39,6 +39,7 @@ try:
_classify_agent_request,
_compute_final_metrics,
_append_tool_results,
_insert_before_latest_user,
_MCP_KEYWORDS,
)
_IMPORTED_AGENT_LOOP = sys.modules.get("src.agent_loop")
@@ -73,6 +74,36 @@ def test_polish_internet_search_request_classifies_as_web():
assert "web" in intent["domains"]
def test_insert_before_latest_user_places_context_before_last_user_turn():
    """The context message lands directly before the final user turn.

    Also checks that the input list still equals its original contents after
    the call, i.e. the helper does not modify the caller's list.
    """

    def turn(role, content):
        # Fresh dict on every call so expected values never alias the input.
        return {"role": role, "content": content}

    history = [
        turn("user", "first"),
        turn("assistant", "reply"),
        turn("user", "latest"),
    ]
    injected = turn("system", "context")

    result = _insert_before_latest_user(history, injected)

    expected = [
        turn("user", "first"),
        turn("assistant", "reply"),
        injected,
        turn("user", "latest"),
    ]
    assert result == expected

    untouched = [
        turn("user", "first"),
        turn("assistant", "reply"),
        turn("user", "latest"),
    ]
    assert history == untouched
def test_insert_before_latest_user_appends_when_no_user_message_exists():
    """With no user turn in the history, the context is placed at the end."""
    history = [{"role": "assistant", "content": "reply"}]
    injected = {"role": "system", "content": "context"}

    result = _insert_before_latest_user(history, injected)

    assert result == [history[0], injected]
# ---------------------------------------------------------------------------
# _detect_admin_intent
# ---------------------------------------------------------------------------
+125
View File
@@ -1,4 +1,8 @@
import asyncio
import os
import shutil
import uuid
from pathlib import Path
from types import SimpleNamespace
import pytest
@@ -10,6 +14,7 @@ from routes.chat_helpers import (
_session_is_research_spinoff,
auto_name_session,
build_chat_context,
build_uploaded_file_manifest,
clean_thinking_for_save,
needs_auto_name,
PreprocessedMessage,
@@ -145,6 +150,126 @@ class _FakeSession:
self.history.append(message)
class _ManifestUploadHandler:
    """Minimal upload-handler stand-in backed by an in-memory mapping.

    ``rows`` maps upload ids to whatever ``resolve_upload`` should hand back;
    every lookup is recorded in ``calls`` as ``(upload_id, owner)``.
    """

    def __init__(self, upload_dir, rows):
        self.upload_dir = str(upload_dir)
        self.rows = rows
        self.calls = []

    def _inside_upload_dir(self, path):
        """Return True when ``path`` resolves to ``upload_dir`` or below it."""
        root = os.path.realpath(self.upload_dir)
        target = os.path.realpath(path)
        try:
            shared = os.path.commonpath([root, target])
        except ValueError:
            # commonpath raised for this pair of paths; treat as "not inside".
            return False
        return shared == root

    def resolve_upload(self, upload_id, owner=None):
        """Record the lookup and return the stored row, honouring ownership.

        A dict row with a truthy ``"owner"`` that differs from ``owner``
        yields ``None``; anything else (including non-dict rows and unknown
        ids) is returned exactly as stored.
        """
        self.calls.append((upload_id, owner))
        record = self.rows.get(upload_id)
        if not isinstance(record, dict):
            return record
        record_owner = record.get("owner")
        if record_owner and record_owner != owner:
            return None
        return record
def _manifest_test_dir(name):
    """Create and return a fresh, uniquely named scratch directory.

    The directory is ``<two levels above this file>/tmp_pytest_probe/<name>-<hex>``
    where ``<hex>`` is a random UUID; creation fails if it already exists.
    """
    probe_base = Path(__file__).resolve().parents[1] / "tmp_pytest_probe"
    scratch = probe_base / f"{name}-{uuid.uuid4().hex}"
    scratch.mkdir(parents=True, exist_ok=False)
    return scratch
def test_build_uploaded_file_manifest_filters_and_nulls_unreadable_paths(monkeypatch):
    """Manifest keeps only the owner's dict rows and nulls unusable paths.

    Expectations pinned here: the row owned by another user and the non-dict
    row are dropped; the file outside the upload directory and the file that
    does not exist on disk keep their entry but get ``path`` set to ``None``;
    every requested id is still resolved once with the requesting owner.
    """
    scratch = _manifest_test_dir("manifest")
    try:
        uploads = scratch / "uploads"
        uploads.mkdir()

        readable = uploads / "good.txt"
        readable.write_text("hello", encoding="utf-8")
        stray = scratch / "outside.txt"
        stray.write_text("nope", encoding="utf-8")
        absent = uploads / "missing.txt"  # deliberately never created

        import src.settings as settings

        def fake_get_setting(key):
            # Only the extra-roots setting is answered; everything else is None.
            if key == "tool_path_extra_roots":
                return [str(uploads)]
            return None

        monkeypatch.setattr(settings, "get_setting", fake_get_setting)

        rows = {
            "good": {
                "id": "good",
                "name": "good.txt",
                "mime": "text/plain",
                "size": 5,
                "path": str(readable),
                "owner": "alice",
            },
            "bob": {
                "id": "bob",
                "name": "bob.txt",
                "path": str(readable),
                "owner": "bob",
            },
            "outside": {
                "id": "outside",
                "name": "outside.txt",
                "path": str(stray),
                "owner": "alice",
            },
            "missing": {
                "id": "missing",
                "name": "missing.txt",
                "path": str(absent),
                "owner": "alice",
            },
            "bad": ["not", "a", "dict"],
        }
        handler = _ManifestUploadHandler(uploads, rows)
        requested_ids = ["good", "bob", "outside", "missing", "bad"]

        manifest = build_uploaded_file_manifest(
            requested_ids,
            handler,
            owner="alice",
        )

        returned_ids = [entry["id"] for entry in manifest]
        assert returned_ids == ["good", "outside", "missing"]

        good_entry, outside_entry, missing_entry = manifest[0], manifest[1], manifest[2]
        assert os.path.realpath(good_entry["path"]) == os.path.realpath(readable)
        assert outside_entry["path"] is None
        assert missing_entry["path"] is None

        assert handler.calls == [(upload_id, "alice") for upload_id in requested_ids]
    finally:
        shutil.rmtree(scratch, ignore_errors=True)
def test_build_uploaded_file_manifest_hides_paths_read_file_cannot_open(monkeypatch):
    """A path rejected by ``_resolve_tool_path`` is reported as ``None``.

    ``src.tool_execution._resolve_tool_path`` is patched to raise
    ``ValueError`` for every path, and the single manifest entry is expected
    to come back with ``path`` set to ``None``.
    """
    scratch = _manifest_test_dir("manifest-unreadable")
    try:
        uploads = scratch / "uploads"
        uploads.mkdir()
        stored = uploads / "upload.txt"
        stored.write_text("hello", encoding="utf-8")

        rows = {
            "upload": {"id": "upload", "name": "upload.txt", "path": str(stored), "owner": "alice"},
        }
        handler = _ManifestUploadHandler(uploads, rows)

        def always_reject(_path):
            raise ValueError("outside the allowed roots")

        monkeypatch.setattr("src.tool_execution._resolve_tool_path", always_reject)

        manifest = build_uploaded_file_manifest(["upload"], handler, owner="alice")

        only_entry = manifest[0]
        assert only_entry["path"] is None
    finally:
        shutil.rmtree(scratch, ignore_errors=True)
@pytest.mark.parametrize("name,expected", [
# 24h format (the bug this PR fixes)
("deepseek-v4-flash 14:05:33", True),
+99
View File
@@ -100,6 +100,105 @@ def test_default_ssh_port_omits_flag():
assert port_flag == ""
def _documents_endpoint(total: int):
    """Build the codex documents endpoint on top of a fake library route.

    The fake ``/api/documents/library`` handler records the arguments of each
    call and returns a page of synthetic ``doc-<i>`` entries drawn from a
    collection of ``total`` documents.

    Returns:
        A ``(endpoint, calls)`` pair: the ``GET /api/codex/documents``
        endpoint looked up via ``_route_endpoint`` and the list that
        accumulates one dict per fake-library call.
    """
    calls = []
    document_router = APIRouter()

    @document_router.get("/api/documents/library")
    async def documents_library(
        request: Request,
        search=None,
        language=None,
        sort="recent",
        offset=0,
        limit=20,
        archived=False,
    ):
        recorded = {
            "owner": request.state.current_user,
            "search": search,
            "language": language,
            "sort": sort,
            "offset": offset,
            "limit": limit,
            "archived": archived,
        }
        calls.append(recorded)

        # Page never runs past the end of the synthetic collection.
        stop = min(offset + limit, total)
        page = [{"id": f"doc-{index}"} for index in range(offset, stop)]
        return {"documents": page, "total": total}

    codex_router = codex_routes.setup_codex_routes(document_router=document_router)
    endpoint = _route_endpoint("/api/codex/documents", "GET", router=codex_router)
    return endpoint, calls
@pytest.mark.asyncio
async def test_documents_pagination_clamps_offset_and_limit():
    """Negative offset is forwarded as 0 and an oversized limit as 50."""
    endpoint, calls = _documents_endpoint(total=99)
    request = _codex_request(["documents:read"])

    page = await endpoint(request, offset=-10, limit=500)

    forwarded = calls[-1]
    assert forwarded["owner"] == "alice"
    assert forwarded["offset"] == 0
    assert forwarded["limit"] == 50
    assert len(page["documents"]) == 50
    assert page["next_offset"] == 50
@pytest.mark.asyncio
async def test_documents_pagination_clamps_zero_limit_to_one():
    """A limit of 0 is forwarded as 1, yielding a single-document page."""
    endpoint, calls = _documents_endpoint(total=3)
    request = _codex_request(["documents:read"])

    page = await endpoint(request, offset=0, limit=0)

    forwarded = calls[-1]
    assert forwarded["limit"] == 1
    assert len(page["documents"]) == 1
    assert page["next_offset"] == 1
@pytest.mark.asyncio
async def test_documents_pagination_returns_next_offset_when_truncated():
    """A partial page reports the offset at which the next page starts."""
    endpoint, _calls = _documents_endpoint(total=7)
    request = _codex_request(["documents:read"])

    page = await endpoint(request, offset=2, limit=3)

    returned_ids = [doc["id"] for doc in page["documents"]]
    assert returned_ids == ["doc-2", "doc-3", "doc-4"]
    assert page["next_offset"] == 5
@pytest.mark.asyncio
async def test_documents_pagination_rejects_invalid_offset():
    """A non-numeric offset raises HTTP 400 with detail "Invalid offset"."""
    endpoint, _calls = _documents_endpoint(total=7)
    request = _codex_request(["documents:read"])

    with pytest.raises(HTTPException) as raised:
        await endpoint(request, offset="soon", limit=3)

    error = raised.value
    assert error.status_code == 400
    assert error.detail == "Invalid offset"
@pytest.mark.asyncio
async def test_documents_pagination_rejects_invalid_limit():
    """A non-numeric limit raises HTTP 400 with detail "Invalid limit"."""
    endpoint, _calls = _documents_endpoint(total=7)
    request = _codex_request(["documents:read"])

    with pytest.raises(HTTPException) as raised:
        await endpoint(request, offset=0, limit="many")

    error = raised.value
    assert error.status_code == 400
    assert error.detail == "Invalid limit"
@pytest.mark.asyncio
async def test_documents_pagination_out_of_range_offset_returns_empty_page():
    """An offset past the end is forwarded unchanged and yields an empty page."""
    endpoint, calls = _documents_endpoint(total=3)
    request = _codex_request(["documents:read"])

    page = await endpoint(request, offset=10, limit=2)

    forwarded = calls[-1]
    assert forwarded["offset"] == 10
    assert forwarded["limit"] == 2
    assert page["documents"] == []
    assert page["next_offset"] is None
def test_adopt_rejects_ssh_option_host_before_shell(monkeypatch):
calls = []