fix: improve uploaded document retrieval and deep research reuse (#4784)

* fix: improve uploaded document retrieval and deep research reuse

* test: add coverage for upload manifest and document pagination

* chore: rerun CI

* fix: restore _insert_before_latest_user helper

* fix(agent_loop): restore missing upload context helper
This commit is contained in:
muhamed hamed
2026-06-27 21:24:17 +03:00
committed by GitHub
parent 7e9bfb1700
commit 3e7af8634f
8 changed files with 411 additions and 5 deletions
+62
View File
@@ -104,6 +104,9 @@ class ChatContext:
# The chat route emits a doc_update SSE event for each before streaming
# begins, so the editor pane switches to the new doc immediately.
auto_opened_docs: list = field(default_factory=list)
# Uploads attached to this user turn, resolved and owner-checked for the
# agent's private context. This is not emitted to the browser.
uploaded_files: list = field(default_factory=list)
# ── Helpers ────────────────────────────────────────────────────────────── #
@@ -366,6 +369,59 @@ async def preprocess(
)
def build_uploaded_file_manifest(att_ids: list, upload_handler, owner: Optional[str]) -> list[dict]:
"""Resolve current-turn upload IDs into a small tool-facing manifest.
The chat UI already sends attachment ids, and preprocessing inlines as much
text as fits. Agent mode still needs a discoverable bridge for files whose
content was truncated/omitted or when the model chooses file tools. Only
owner-authorized uploads are included, and paths must remain inside the
configured upload directory.
"""
if not att_ids or not upload_handler or not hasattr(upload_handler, "resolve_upload"):
return []
def _read_file_can_open(path: str) -> bool:
try:
from src.tool_execution import _resolve_tool_path
return _resolve_tool_path(path) == os.path.realpath(path)
except Exception:
return False
manifest: list[dict] = []
for att_id in att_ids:
try:
info = upload_handler.resolve_upload(str(att_id), owner=owner)
except Exception:
logger.debug("Failed to resolve upload %r for agent manifest", att_id, exc_info=True)
continue
if not isinstance(info, dict):
continue
path = info.get("path")
if path:
try:
inside = True
if hasattr(upload_handler, "_inside_upload_dir"):
inside = bool(upload_handler._inside_upload_dir(path))
elif hasattr(upload_handler, "inside_base_dir"):
inside = bool(upload_handler.inside_base_dir(path))
if not inside or not os.path.exists(path) or not _read_file_can_open(path):
path = None
except Exception:
path = None
manifest.append({
"id": info.get("id") or str(att_id),
"name": info.get("name") or info.get("original_name") or str(att_id),
"mime": info.get("mime", ""),
"size": info.get("size", 0),
"path": path,
})
return manifest
def add_user_message(sess, chat_handler, preprocessed: PreprocessedMessage, incognito: bool = False):
"""Add user message to session history and update session name.
In incognito mode, still add to in-memory history (for conversation context)
@@ -613,6 +669,11 @@ async def build_chat_context(
# bearer-token chat requests use the token owner instead of the "api" sentinel.
user = effective_user(request)
uprefs = load_prefs_for_user(user)
uploaded_files = build_uploaded_file_manifest(
att_ids or [],
getattr(chat_handler, "upload_handler", None),
getattr(sess, "owner", None),
)
casual_low_signal = _is_casual_low_signal(message)
# Memory enabled?
@@ -731,6 +792,7 @@ async def build_chat_context(
preset=preset,
preprocessed=preprocessed,
auto_opened_docs=auto_opened_docs,
uploaded_files=uploaded_files,
)
+1
View File
@@ -1297,6 +1297,7 @@ def setup_chat_routes(
approved_plan=approved_plan or None,
workspace=workspace or None,
forced_tools=_forced_tools,
uploaded_files=ctx.uploaded_files,
):
if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
try:
+21 -1
View File
@@ -133,6 +133,18 @@ def _find_endpoint(router: APIRouter | None, method: str, path: str):
return None
def _clamp_pagination(offset: Any, limit: Any, *, default_limit: int = 50, max_limit: int = 50) -> tuple[int, int]:
try:
parsed_offset = int(0 if offset in (None, "") else offset)
except (TypeError, ValueError):
raise HTTPException(400, "Invalid offset")
try:
parsed_limit = int(default_limit if limit in (None, "") else limit)
except (TypeError, ValueError):
raise HTTPException(400, "Invalid limit")
return max(0, parsed_offset), max(1, min(parsed_limit, max_limit))
def setup_codex_routes(
email_router: APIRouter | None = None,
memory_router: APIRouter | None = None,
@@ -440,10 +452,18 @@ def setup_codex_routes(
owner = _scope_owner(request, DOCS_READ_SCOPES)
if documents_library_endpoint is None:
raise HTTPException(503, "Documents integration is not available")
return await _as_owner(
offset, limit = _clamp_pagination(offset, limit)
result = await _as_owner(
request, owner, documents_library_endpoint,
request, search, language, sort, offset, limit, archived,
)
if isinstance(result, dict):
docs = result.get("documents")
total = result.get("total")
if isinstance(docs, list) and isinstance(total, int):
next_offset = offset + len(docs)
result["next_offset"] = next_offset if next_offset < total else None
return result
@router.get("/documents/{doc_id}")
async def codex_documents_get(request: Request, doc_id: str):
+55
View File
@@ -755,6 +755,46 @@ def _extract_last_user_message(messages: List[Dict]) -> str:
return ""
def _insert_before_latest_user(messages: List[Dict], context_msg: Dict) -> List[Dict]:
"""Insert a context message immediately before the latest user turn."""
out = list(messages or [])
for idx in range(len(out) - 1, -1, -1):
if out[idx].get("role") == "user":
out.insert(idx, context_msg)
return out
out.append(context_msg)
return out
def _uploaded_files_context_message(uploaded_files: Optional[List[Dict]]) -> Optional[Dict]:
if not uploaded_files:
return None
lines = [
"Uploaded files attached to the latest user turn:",
]
for item in uploaded_files[:20]:
name = str(item.get("name") or item.get("id") or "upload")
bits = [
f"id={item.get('id', '')}",
f"name={name}",
]
if item.get("mime"):
bits.append(f"mime={item.get('mime')}")
if item.get("size") is not None:
bits.append(f"size={item.get('size')} bytes")
if item.get("path"):
bits.append(f"path={item.get('path')}")
lines.append("- " + "; ".join(bits))
if len(uploaded_files) > 20:
lines.append(f"- ... {len(uploaded_files) - 20} more upload(s) omitted from this manifest")
lines.extend([
"",
"The attachment contents may already be in the latest user message. If an attachment is marked truncated or omitted, read its listed path with `read_file` when that tool is available. Do not say uploaded files are undiscoverable when they are listed here.",
])
return untrusted_context_message("current chat uploaded files", "\n".join(lines))
def _strip_think_blocks(text: str) -> str:
"""Linear-time equivalent of
``re.sub(r'<think>.*?</think>', '', text, flags=DOTALL|IGNORECASE)``.
@@ -1986,6 +2026,7 @@ async def stream_agent_loop(
tool_policy: Optional[ToolPolicy] = None,
workspace: Optional[str] = None,
forced_tools: Optional[Set[str]] = None,
uploaded_files: Optional[List[Dict]] = None,
_is_teacher_run: bool = False,
) -> AsyncGenerator[str, None]:
"""Streaming agent loop generator.
@@ -2021,6 +2062,11 @@ async def stream_agent_loop(
# filtered to read-only tools below (after the disabled map is loaded).
disabled_tools.update(plan_mode_disabled_tools())
uploaded_files = uploaded_files or []
_upload_msg = _uploaded_files_context_message(uploaded_files)
if _upload_msg:
messages = _insert_before_latest_user(messages, _upload_msg)
_t0 = time.time()
_needs_admin = _detect_admin_intent(messages)
_last_user = _extract_last_user_message(messages)
@@ -2232,6 +2278,15 @@ async def stream_agent_loop(
if _relevant_tools is not None and active_document is not None:
_relevant_tools.update({"edit_document", "update_document", "suggest_document"})
# Current-turn chat uploads are real files under the upload/data root. Make
# the read-side file/document tools visible immediately so the agent can
# inspect files whose inline text was truncated or omitted.
if not guide_only and uploaded_files:
if _relevant_tools is None:
from src.tool_index import ALWAYS_AVAILABLE
_relevant_tools = set(ALWAYS_AVAILABLE)
_relevant_tools.update({"read_file", "grep", "ls", "manage_documents"})
# Per-request UI toggles are stronger than retrieval. If the user turns on
# Search, the model must see the search tools even when the latest text is a
# typo or otherwise low-signal for tool RAG.
+17 -4
View File
@@ -564,9 +564,20 @@ class ManageDocumentTool:
if not doc:
return {"error": f"Document '{doc_id}' not found", "exit_code": 1}
body = doc.current_content or ""
preview_limit = int(args.get("limit", MAX_READ_CHARS))
truncated = len(body) > preview_limit
preview = body[:preview_limit] + (f"\n... (truncated, {len(body)} chars total)" if truncated else "")
try:
preview_limit = max(1, min(int(args.get("limit", MAX_READ_CHARS)), MAX_READ_CHARS))
except (TypeError, ValueError):
preview_limit = MAX_READ_CHARS
try:
offset = max(0, int(args.get("offset", 0) or 0))
except (TypeError, ValueError):
offset = 0
offset = min(offset, len(body))
end = min(offset + preview_limit, len(body))
truncated = end < len(body)
preview = body[offset:end]
if truncated:
preview += f"\n... (truncated, {len(body)} chars total; next_offset={end})"
anchor = f"[{doc.title}](#document-{doc.id})"
return {
"response": f"{anchor} — click to open in editor.\n\n```{doc.language or ''}\n{preview}\n```",
@@ -577,6 +588,8 @@ class ManageDocumentTool:
"size": len(body),
"content": preview,
"truncated": truncated,
"offset": offset,
"next_offset": end if truncated else None,
},
"exit_code": 0,
}
@@ -609,4 +622,4 @@ class ManageDocumentTool:
logger.error(f"manage_documents error: {e}")
return {"error": str(e), "exit_code": 1}
finally:
db.close()
db.close()
+31
View File
@@ -39,6 +39,7 @@ try:
_classify_agent_request,
_compute_final_metrics,
_append_tool_results,
_insert_before_latest_user,
_MCP_KEYWORDS,
)
_IMPORTED_AGENT_LOOP = sys.modules.get("src.agent_loop")
@@ -73,6 +74,36 @@ def test_polish_internet_search_request_classifies_as_web():
assert "web" in intent["domains"]
def test_insert_before_latest_user_places_context_before_last_user_turn():
messages = [
{"role": "user", "content": "first"},
{"role": "assistant", "content": "reply"},
{"role": "user", "content": "latest"},
]
context = {"role": "system", "content": "context"}
out = _insert_before_latest_user(messages, context)
assert out == [
{"role": "user", "content": "first"},
{"role": "assistant", "content": "reply"},
context,
{"role": "user", "content": "latest"},
]
assert messages == [
{"role": "user", "content": "first"},
{"role": "assistant", "content": "reply"},
{"role": "user", "content": "latest"},
]
def test_insert_before_latest_user_appends_when_no_user_message_exists():
messages = [{"role": "assistant", "content": "reply"}]
context = {"role": "system", "content": "context"}
assert _insert_before_latest_user(messages, context) == [messages[0], context]
# ---------------------------------------------------------------------------
# _detect_admin_intent
# ---------------------------------------------------------------------------
+125
View File
@@ -1,4 +1,8 @@
import asyncio
import os
import shutil
import uuid
from pathlib import Path
from types import SimpleNamespace
import pytest
@@ -10,6 +14,7 @@ from routes.chat_helpers import (
_session_is_research_spinoff,
auto_name_session,
build_chat_context,
build_uploaded_file_manifest,
clean_thinking_for_save,
needs_auto_name,
PreprocessedMessage,
@@ -145,6 +150,126 @@ class _FakeSession:
self.history.append(message)
class _ManifestUploadHandler:
def __init__(self, upload_dir, rows):
self.upload_dir = str(upload_dir)
self.rows = rows
self.calls = []
def _inside_upload_dir(self, path):
base = os.path.realpath(self.upload_dir)
candidate = os.path.realpath(path)
try:
return os.path.commonpath([base, candidate]) == base
except ValueError:
return False
def resolve_upload(self, upload_id, owner=None):
self.calls.append((upload_id, owner))
row = self.rows.get(upload_id)
if isinstance(row, dict) and row.get("owner") and row.get("owner") != owner:
return None
return row
def _manifest_test_dir(name):
root = Path(__file__).resolve().parents[1] / "tmp_pytest_probe" / f"{name}-{uuid.uuid4().hex}"
root.mkdir(parents=True, exist_ok=False)
return root
def test_build_uploaded_file_manifest_filters_and_nulls_unreadable_paths(monkeypatch):
root = _manifest_test_dir("manifest")
try:
upload_dir = root / "uploads"
upload_dir.mkdir()
good = upload_dir / "good.txt"
good.write_text("hello", encoding="utf-8")
outside = root / "outside.txt"
outside.write_text("nope", encoding="utf-8")
missing = upload_dir / "missing.txt"
import src.settings as settings
monkeypatch.setattr(
settings,
"get_setting",
lambda key: [str(upload_dir)] if key == "tool_path_extra_roots" else None,
)
handler = _ManifestUploadHandler(upload_dir, {
"good": {
"id": "good",
"name": "good.txt",
"mime": "text/plain",
"size": 5,
"path": str(good),
"owner": "alice",
},
"bob": {
"id": "bob",
"name": "bob.txt",
"path": str(good),
"owner": "bob",
},
"outside": {
"id": "outside",
"name": "outside.txt",
"path": str(outside),
"owner": "alice",
},
"missing": {
"id": "missing",
"name": "missing.txt",
"path": str(missing),
"owner": "alice",
},
"bad": ["not", "a", "dict"],
})
manifest = build_uploaded_file_manifest(
["good", "bob", "outside", "missing", "bad"],
handler,
owner="alice",
)
assert [item["id"] for item in manifest] == ["good", "outside", "missing"]
assert os.path.realpath(manifest[0]["path"]) == os.path.realpath(good)
assert manifest[1]["path"] is None
assert manifest[2]["path"] is None
assert handler.calls == [
("good", "alice"),
("bob", "alice"),
("outside", "alice"),
("missing", "alice"),
("bad", "alice"),
]
finally:
shutil.rmtree(root, ignore_errors=True)
def test_build_uploaded_file_manifest_hides_paths_read_file_cannot_open(monkeypatch):
root = _manifest_test_dir("manifest-unreadable")
try:
upload_dir = root / "uploads"
upload_dir.mkdir()
upload = upload_dir / "upload.txt"
upload.write_text("hello", encoding="utf-8")
handler = _ManifestUploadHandler(upload_dir, {
"upload": {"id": "upload", "name": "upload.txt", "path": str(upload), "owner": "alice"},
})
def reject_path(_path):
raise ValueError("outside the allowed roots")
monkeypatch.setattr("src.tool_execution._resolve_tool_path", reject_path)
manifest = build_uploaded_file_manifest(["upload"], handler, owner="alice")
assert manifest[0]["path"] is None
finally:
shutil.rmtree(root, ignore_errors=True)
@pytest.mark.parametrize("name,expected", [
# 24h format (the bug this PR fixes)
("deepseek-v4-flash 14:05:33", True),
+99
View File
@@ -100,6 +100,105 @@ def test_default_ssh_port_omits_flag():
assert port_flag == ""
def _documents_endpoint(total: int):
calls = []
document_router = APIRouter()
@document_router.get("/api/documents/library")
async def documents_library(
request: Request,
search=None,
language=None,
sort="recent",
offset=0,
limit=20,
archived=False,
):
calls.append({
"owner": request.state.current_user,
"search": search,
"language": language,
"sort": sort,
"offset": offset,
"limit": limit,
"archived": archived,
})
end = min(offset + limit, total)
docs = [{"id": f"doc-{i}"} for i in range(offset, end)]
return {"documents": docs, "total": total}
router = codex_routes.setup_codex_routes(document_router=document_router)
return _route_endpoint("/api/codex/documents", "GET", router=router), calls
@pytest.mark.asyncio
async def test_documents_pagination_clamps_offset_and_limit():
endpoint, calls = _documents_endpoint(total=99)
result = await endpoint(_codex_request(["documents:read"]), offset=-10, limit=500)
assert calls[-1]["owner"] == "alice"
assert calls[-1]["offset"] == 0
assert calls[-1]["limit"] == 50
assert len(result["documents"]) == 50
assert result["next_offset"] == 50
@pytest.mark.asyncio
async def test_documents_pagination_clamps_zero_limit_to_one():
endpoint, calls = _documents_endpoint(total=3)
result = await endpoint(_codex_request(["documents:read"]), offset=0, limit=0)
assert calls[-1]["limit"] == 1
assert len(result["documents"]) == 1
assert result["next_offset"] == 1
@pytest.mark.asyncio
async def test_documents_pagination_returns_next_offset_when_truncated():
endpoint, _calls = _documents_endpoint(total=7)
result = await endpoint(_codex_request(["documents:read"]), offset=2, limit=3)
assert [doc["id"] for doc in result["documents"]] == ["doc-2", "doc-3", "doc-4"]
assert result["next_offset"] == 5
@pytest.mark.asyncio
async def test_documents_pagination_rejects_invalid_offset():
endpoint, _calls = _documents_endpoint(total=7)
with pytest.raises(HTTPException) as exc:
await endpoint(_codex_request(["documents:read"]), offset="soon", limit=3)
assert exc.value.status_code == 400
assert exc.value.detail == "Invalid offset"
@pytest.mark.asyncio
async def test_documents_pagination_rejects_invalid_limit():
endpoint, _calls = _documents_endpoint(total=7)
with pytest.raises(HTTPException) as exc:
await endpoint(_codex_request(["documents:read"]), offset=0, limit="many")
assert exc.value.status_code == 400
assert exc.value.detail == "Invalid limit"
@pytest.mark.asyncio
async def test_documents_pagination_out_of_range_offset_returns_empty_page():
endpoint, calls = _documents_endpoint(total=3)
result = await endpoint(_codex_request(["documents:read"]), offset=10, limit=2)
assert calls[-1]["offset"] == 10
assert calls[-1]["limit"] == 2
assert result["documents"] == []
assert result["next_offset"] is None
def test_adopt_rejects_ssh_option_host_before_shell(monkeypatch):
calls = []