fix: improve uploaded document retrieval and deep research reuse (#4784)

* fix: improve uploaded document retrieval and deep research reuse * test: add coverage for upload manifest and document pagination * chore: rerun CI * fix: restore _insert_before_latest_user helper * fix(agent_loop): restore missing upload context helper
2026-06-28 23:52:09 -04:00 · 2026-06-27 21:24:17 +03:00
parent 7e9bfb1700
commit 3e7af8634f
8 changed files with 411 additions and 5 deletions
@@ -104,6 +104,9 @@ class ChatContext:
    # The chat route emits a doc_update SSE event for each before streaming
    # begins, so the editor pane switches to the new doc immediately.
    auto_opened_docs: list = field(default_factory=list)
+    # Uploads attached to this user turn, resolved and owner-checked for the
+    # agent's private context. This is not emitted to the browser.
+    uploaded_files: list = field(default_factory=list)


 # ── Helpers ────────────────────────────────────────────────────────────── #
@@ -366,6 +369,59 @@ async def preprocess(
    )


+def build_uploaded_file_manifest(att_ids: list, upload_handler, owner: Optional[str]) -> list[dict]:
+    """Resolve current-turn upload IDs into a small tool-facing manifest.
+
+    The chat UI already sends attachment ids, and preprocessing inlines as much
+    text as fits. Agent mode still needs a discoverable bridge for files whose
+    content was truncated/omitted or when the model chooses file tools. Only
+    owner-authorized uploads are included, and paths must remain inside the
+    configured upload directory.
+    """
+    if not att_ids or not upload_handler or not hasattr(upload_handler, "resolve_upload"):
+        return []
+
+    def _read_file_can_open(path: str) -> bool:
+        try:
+            from src.tool_execution import _resolve_tool_path
+
+            return _resolve_tool_path(path) == os.path.realpath(path)
+        except Exception:
+            return False
+
+    manifest: list[dict] = []
+    for att_id in att_ids:
+        try:
+            info = upload_handler.resolve_upload(str(att_id), owner=owner)
+        except Exception:
+            logger.debug("Failed to resolve upload %r for agent manifest", att_id, exc_info=True)
+            continue
+        if not isinstance(info, dict):
+            continue
+
+        path = info.get("path")
+        if path:
+            try:
+                inside = True
+                if hasattr(upload_handler, "_inside_upload_dir"):
+                    inside = bool(upload_handler._inside_upload_dir(path))
+                elif hasattr(upload_handler, "inside_base_dir"):
+                    inside = bool(upload_handler.inside_base_dir(path))
+                if not inside or not os.path.exists(path) or not _read_file_can_open(path):
+                    path = None
+            except Exception:
+                path = None
+
+        manifest.append({
+            "id": info.get("id") or str(att_id),
+            "name": info.get("name") or info.get("original_name") or str(att_id),
+            "mime": info.get("mime", ""),
+            "size": info.get("size", 0),
+            "path": path,
+        })
+    return manifest
+
+
 def add_user_message(sess, chat_handler, preprocessed: PreprocessedMessage, incognito: bool = False):
    """Add user message to session history and update session name.
    In incognito mode, still add to in-memory history (for conversation context)
@@ -613,6 +669,11 @@ async def build_chat_context(
    # bearer-token chat requests use the token owner instead of the "api" sentinel.
    user = effective_user(request)
    uprefs = load_prefs_for_user(user)
+    uploaded_files = build_uploaded_file_manifest(
+        att_ids or [],
+        getattr(chat_handler, "upload_handler", None),
+        getattr(sess, "owner", None),
+    )
    casual_low_signal = _is_casual_low_signal(message)

    # Memory enabled?
@@ -731,6 +792,7 @@ async def build_chat_context(
        preset=preset,
        preprocessed=preprocessed,
        auto_opened_docs=auto_opened_docs,
+        uploaded_files=uploaded_files,
    )


@@ -1297,6 +1297,7 @@ def setup_chat_routes(
                        approved_plan=approved_plan or None,
                        workspace=workspace or None,
                        forced_tools=_forced_tools,
+                        uploaded_files=ctx.uploaded_files,
                    ):
                        if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
                            try:
@@ -133,6 +133,18 @@ def _find_endpoint(router: APIRouter | None, method: str, path: str):
    return None


+def _clamp_pagination(offset: Any, limit: Any, *, default_limit: int = 50, max_limit: int = 50) -> tuple[int, int]:
+    try:
+        parsed_offset = int(0 if offset in (None, "") else offset)
+    except (TypeError, ValueError):
+        raise HTTPException(400, "Invalid offset")
+    try:
+        parsed_limit = int(default_limit if limit in (None, "") else limit)
+    except (TypeError, ValueError):
+        raise HTTPException(400, "Invalid limit")
+    return max(0, parsed_offset), max(1, min(parsed_limit, max_limit))
+
+
 def setup_codex_routes(
    email_router: APIRouter | None = None,
    memory_router: APIRouter | None = None,
@@ -440,10 +452,18 @@ def setup_codex_routes(
        owner = _scope_owner(request, DOCS_READ_SCOPES)
        if documents_library_endpoint is None:
            raise HTTPException(503, "Documents integration is not available")
-        return await _as_owner(
+        offset, limit = _clamp_pagination(offset, limit)
+        result = await _as_owner(
            request, owner, documents_library_endpoint,
            request, search, language, sort, offset, limit, archived,
        )
+        if isinstance(result, dict):
+            docs = result.get("documents")
+            total = result.get("total")
+            if isinstance(docs, list) and isinstance(total, int):
+                next_offset = offset + len(docs)
+                result["next_offset"] = next_offset if next_offset < total else None
+        return result

    @router.get("/documents/{doc_id}")
    async def codex_documents_get(request: Request, doc_id: str):
@@ -755,6 +755,46 @@ def _extract_last_user_message(messages: List[Dict]) -> str:
    return ""


+def _insert_before_latest_user(messages: List[Dict], context_msg: Dict) -> List[Dict]:
+    """Insert a context message immediately before the latest user turn."""
+    out = list(messages or [])
+    for idx in range(len(out) - 1, -1, -1):
+        if out[idx].get("role") == "user":
+            out.insert(idx, context_msg)
+            return out
+    out.append(context_msg)
+    return out
+
+
+def _uploaded_files_context_message(uploaded_files: Optional[List[Dict]]) -> Optional[Dict]:
+    if not uploaded_files:
+        return None
+
+    lines = [
+        "Uploaded files attached to the latest user turn:",
+    ]
+    for item in uploaded_files[:20]:
+        name = str(item.get("name") or item.get("id") or "upload")
+        bits = [
+            f"id={item.get('id', '')}",
+            f"name={name}",
+        ]
+        if item.get("mime"):
+            bits.append(f"mime={item.get('mime')}")
+        if item.get("size") is not None:
+            bits.append(f"size={item.get('size')} bytes")
+        if item.get("path"):
+            bits.append(f"path={item.get('path')}")
+        lines.append("- " + "; ".join(bits))
+    if len(uploaded_files) > 20:
+        lines.append(f"- ... {len(uploaded_files) - 20} more upload(s) omitted from this manifest")
+    lines.extend([
+        "",
+        "The attachment contents may already be in the latest user message. If an attachment is marked truncated or omitted, read its listed path with `read_file` when that tool is available. Do not say uploaded files are undiscoverable when they are listed here.",
+    ])
+    return untrusted_context_message("current chat uploaded files", "\n".join(lines))
+
+
 def _strip_think_blocks(text: str) -> str:
    """Linear-time equivalent of
    ``re.sub(r'<think>.*?</think>', '', text, flags=DOTALL|IGNORECASE)``.
@@ -1986,6 +2026,7 @@ async def stream_agent_loop(
    tool_policy: Optional[ToolPolicy] = None,
    workspace: Optional[str] = None,
    forced_tools: Optional[Set[str]] = None,
+    uploaded_files: Optional[List[Dict]] = None,
    _is_teacher_run: bool = False,
 ) -> AsyncGenerator[str, None]:
    """Streaming agent loop generator.
@@ -2021,6 +2062,11 @@ async def stream_agent_loop(
        # filtered to read-only tools below (after the disabled map is loaded).
        disabled_tools.update(plan_mode_disabled_tools())

+    uploaded_files = uploaded_files or []
+    _upload_msg = _uploaded_files_context_message(uploaded_files)
+    if _upload_msg:
+        messages = _insert_before_latest_user(messages, _upload_msg)
+
    _t0 = time.time()
    _needs_admin = _detect_admin_intent(messages)
    _last_user = _extract_last_user_message(messages)
@@ -2232,6 +2278,15 @@ async def stream_agent_loop(
    if _relevant_tools is not None and active_document is not None:
        _relevant_tools.update({"edit_document", "update_document", "suggest_document"})

+    # Current-turn chat uploads are real files under the upload/data root. Make
+    # the read-side file/document tools visible immediately so the agent can
+    # inspect files whose inline text was truncated or omitted.
+    if not guide_only and uploaded_files:
+        if _relevant_tools is None:
+            from src.tool_index import ALWAYS_AVAILABLE
+            _relevant_tools = set(ALWAYS_AVAILABLE)
+        _relevant_tools.update({"read_file", "grep", "ls", "manage_documents"})
+
    # Per-request UI toggles are stronger than retrieval. If the user turns on
    # Search, the model must see the search tools even when the latest text is a
    # typo or otherwise low-signal for tool RAG.
@@ -564,9 +564,20 @@ class ManageDocumentTool:
                if not doc:
                    return {"error": f"Document '{doc_id}' not found", "exit_code": 1}
                body = doc.current_content or ""
-                preview_limit = int(args.get("limit", MAX_READ_CHARS))
-                truncated = len(body) > preview_limit
-                preview = body[:preview_limit] + (f"\n... (truncated, {len(body)} chars total)" if truncated else "")
+                try:
+                    preview_limit = max(1, min(int(args.get("limit", MAX_READ_CHARS)), MAX_READ_CHARS))
+                except (TypeError, ValueError):
+                    preview_limit = MAX_READ_CHARS
+                try:
+                    offset = max(0, int(args.get("offset", 0) or 0))
+                except (TypeError, ValueError):
+                    offset = 0
+                offset = min(offset, len(body))
+                end = min(offset + preview_limit, len(body))
+                truncated = end < len(body)
+                preview = body[offset:end]
+                if truncated:
+                    preview += f"\n... (truncated, {len(body)} chars total; next_offset={end})"
                anchor = f"[{doc.title}](#document-{doc.id})"
                return {
                    "response": f"{anchor} — click to open in editor.\n\n```{doc.language or ''}\n{preview}\n```",
@@ -577,6 +588,8 @@ class ManageDocumentTool:
                        "size": len(body),
                        "content": preview,
                        "truncated": truncated,
+                        "offset": offset,
+                        "next_offset": end if truncated else None,
                    },
                    "exit_code": 0,
                }
@@ -609,4 +622,4 @@ class ManageDocumentTool:
            logger.error(f"manage_documents error: {e}")
            return {"error": str(e), "exit_code": 1}
        finally:
-            db.close()
+            db.close()
@@ -39,6 +39,7 @@ try:
        _classify_agent_request,
        _compute_final_metrics,
        _append_tool_results,
+        _insert_before_latest_user,
        _MCP_KEYWORDS,
    )
    _IMPORTED_AGENT_LOOP = sys.modules.get("src.agent_loop")
@@ -73,6 +74,36 @@ def test_polish_internet_search_request_classifies_as_web():
    assert "web" in intent["domains"]


+def test_insert_before_latest_user_places_context_before_last_user_turn():
+    messages = [
+        {"role": "user", "content": "first"},
+        {"role": "assistant", "content": "reply"},
+        {"role": "user", "content": "latest"},
+    ]
+    context = {"role": "system", "content": "context"}
+
+    out = _insert_before_latest_user(messages, context)
+
+    assert out == [
+        {"role": "user", "content": "first"},
+        {"role": "assistant", "content": "reply"},
+        context,
+        {"role": "user", "content": "latest"},
+    ]
+    assert messages == [
+        {"role": "user", "content": "first"},
+        {"role": "assistant", "content": "reply"},
+        {"role": "user", "content": "latest"},
+    ]
+
+
+def test_insert_before_latest_user_appends_when_no_user_message_exists():
+    messages = [{"role": "assistant", "content": "reply"}]
+    context = {"role": "system", "content": "context"}
+
+    assert _insert_before_latest_user(messages, context) == [messages[0], context]
+
+
 # ---------------------------------------------------------------------------
 # _detect_admin_intent
 # ---------------------------------------------------------------------------
@@ -1,4 +1,8 @@
 import asyncio
+import os
+import shutil
+import uuid
+from pathlib import Path
 from types import SimpleNamespace

 import pytest
@@ -10,6 +14,7 @@ from routes.chat_helpers import (
    _session_is_research_spinoff,
    auto_name_session,
    build_chat_context,
+    build_uploaded_file_manifest,
    clean_thinking_for_save,
    needs_auto_name,
    PreprocessedMessage,
@@ -145,6 +150,126 @@ class _FakeSession:
        self.history.append(message)


+class _ManifestUploadHandler:
+    def __init__(self, upload_dir, rows):
+        self.upload_dir = str(upload_dir)
+        self.rows = rows
+        self.calls = []
+
+    def _inside_upload_dir(self, path):
+        base = os.path.realpath(self.upload_dir)
+        candidate = os.path.realpath(path)
+        try:
+            return os.path.commonpath([base, candidate]) == base
+        except ValueError:
+            return False
+
+    def resolve_upload(self, upload_id, owner=None):
+        self.calls.append((upload_id, owner))
+        row = self.rows.get(upload_id)
+        if isinstance(row, dict) and row.get("owner") and row.get("owner") != owner:
+            return None
+        return row
+
+
+def _manifest_test_dir(name):
+    root = Path(__file__).resolve().parents[1] / "tmp_pytest_probe" / f"{name}-{uuid.uuid4().hex}"
+    root.mkdir(parents=True, exist_ok=False)
+    return root
+
+
+def test_build_uploaded_file_manifest_filters_and_nulls_unreadable_paths(monkeypatch):
+    root = _manifest_test_dir("manifest")
+    try:
+        upload_dir = root / "uploads"
+        upload_dir.mkdir()
+        good = upload_dir / "good.txt"
+        good.write_text("hello", encoding="utf-8")
+        outside = root / "outside.txt"
+        outside.write_text("nope", encoding="utf-8")
+        missing = upload_dir / "missing.txt"
+
+        import src.settings as settings
+
+        monkeypatch.setattr(
+            settings,
+            "get_setting",
+            lambda key: [str(upload_dir)] if key == "tool_path_extra_roots" else None,
+        )
+        handler = _ManifestUploadHandler(upload_dir, {
+            "good": {
+                "id": "good",
+                "name": "good.txt",
+                "mime": "text/plain",
+                "size": 5,
+                "path": str(good),
+                "owner": "alice",
+            },
+            "bob": {
+                "id": "bob",
+                "name": "bob.txt",
+                "path": str(good),
+                "owner": "bob",
+            },
+            "outside": {
+                "id": "outside",
+                "name": "outside.txt",
+                "path": str(outside),
+                "owner": "alice",
+            },
+            "missing": {
+                "id": "missing",
+                "name": "missing.txt",
+                "path": str(missing),
+                "owner": "alice",
+            },
+            "bad": ["not", "a", "dict"],
+        })
+
+        manifest = build_uploaded_file_manifest(
+            ["good", "bob", "outside", "missing", "bad"],
+            handler,
+            owner="alice",
+        )
+
+        assert [item["id"] for item in manifest] == ["good", "outside", "missing"]
+        assert os.path.realpath(manifest[0]["path"]) == os.path.realpath(good)
+        assert manifest[1]["path"] is None
+        assert manifest[2]["path"] is None
+        assert handler.calls == [
+            ("good", "alice"),
+            ("bob", "alice"),
+            ("outside", "alice"),
+            ("missing", "alice"),
+            ("bad", "alice"),
+        ]
+    finally:
+        shutil.rmtree(root, ignore_errors=True)
+
+
+def test_build_uploaded_file_manifest_hides_paths_read_file_cannot_open(monkeypatch):
+    root = _manifest_test_dir("manifest-unreadable")
+    try:
+        upload_dir = root / "uploads"
+        upload_dir.mkdir()
+        upload = upload_dir / "upload.txt"
+        upload.write_text("hello", encoding="utf-8")
+        handler = _ManifestUploadHandler(upload_dir, {
+            "upload": {"id": "upload", "name": "upload.txt", "path": str(upload), "owner": "alice"},
+        })
+
+        def reject_path(_path):
+            raise ValueError("outside the allowed roots")
+
+        monkeypatch.setattr("src.tool_execution._resolve_tool_path", reject_path)
+
+        manifest = build_uploaded_file_manifest(["upload"], handler, owner="alice")
+
+        assert manifest[0]["path"] is None
+    finally:
+        shutil.rmtree(root, ignore_errors=True)
+
+
@pytest.mark.parametrize("name,expected", [
    # 24h format (the bug this PR fixes)
    ("deepseek-v4-flash 14:05:33", True),
@@ -100,6 +100,105 @@ def test_default_ssh_port_omits_flag():
    assert port_flag == ""


+def _documents_endpoint(total: int):
+    calls = []
+    document_router = APIRouter()
+
+    @document_router.get("/api/documents/library")
+    async def documents_library(
+        request: Request,
+        search=None,
+        language=None,
+        sort="recent",
+        offset=0,
+        limit=20,
+        archived=False,
+    ):
+        calls.append({
+            "owner": request.state.current_user,
+            "search": search,
+            "language": language,
+            "sort": sort,
+            "offset": offset,
+            "limit": limit,
+            "archived": archived,
+        })
+        end = min(offset + limit, total)
+        docs = [{"id": f"doc-{i}"} for i in range(offset, end)]
+        return {"documents": docs, "total": total}
+
+    router = codex_routes.setup_codex_routes(document_router=document_router)
+    return _route_endpoint("/api/codex/documents", "GET", router=router), calls
+
+
+@pytest.mark.asyncio
+async def test_documents_pagination_clamps_offset_and_limit():
+    endpoint, calls = _documents_endpoint(total=99)
+
+    result = await endpoint(_codex_request(["documents:read"]), offset=-10, limit=500)
+
+    assert calls[-1]["owner"] == "alice"
+    assert calls[-1]["offset"] == 0
+    assert calls[-1]["limit"] == 50
+    assert len(result["documents"]) == 50
+    assert result["next_offset"] == 50
+
+
+@pytest.mark.asyncio
+async def test_documents_pagination_clamps_zero_limit_to_one():
+    endpoint, calls = _documents_endpoint(total=3)
+
+    result = await endpoint(_codex_request(["documents:read"]), offset=0, limit=0)
+
+    assert calls[-1]["limit"] == 1
+    assert len(result["documents"]) == 1
+    assert result["next_offset"] == 1
+
+
+@pytest.mark.asyncio
+async def test_documents_pagination_returns_next_offset_when_truncated():
+    endpoint, _calls = _documents_endpoint(total=7)
+
+    result = await endpoint(_codex_request(["documents:read"]), offset=2, limit=3)
+
+    assert [doc["id"] for doc in result["documents"]] == ["doc-2", "doc-3", "doc-4"]
+    assert result["next_offset"] == 5
+
+
+@pytest.mark.asyncio
+async def test_documents_pagination_rejects_invalid_offset():
+    endpoint, _calls = _documents_endpoint(total=7)
+
+    with pytest.raises(HTTPException) as exc:
+        await endpoint(_codex_request(["documents:read"]), offset="soon", limit=3)
+
+    assert exc.value.status_code == 400
+    assert exc.value.detail == "Invalid offset"
+
+
+@pytest.mark.asyncio
+async def test_documents_pagination_rejects_invalid_limit():
+    endpoint, _calls = _documents_endpoint(total=7)
+
+    with pytest.raises(HTTPException) as exc:
+        await endpoint(_codex_request(["documents:read"]), offset=0, limit="many")
+
+    assert exc.value.status_code == 400
+    assert exc.value.detail == "Invalid limit"
+
+
+@pytest.mark.asyncio
+async def test_documents_pagination_out_of_range_offset_returns_empty_page():
+    endpoint, calls = _documents_endpoint(total=3)
+
+    result = await endpoint(_codex_request(["documents:read"]), offset=10, limit=2)
+
+    assert calls[-1]["offset"] == 10
+    assert calls[-1]["limit"] == 2
+    assert result["documents"] == []
+    assert result["next_offset"] is None
+
+
 def test_adopt_rejects_ssh_option_host_before_shell(monkeypatch):
    calls = []