diff --git a/routes/chat_helpers.py b/routes/chat_helpers.py index 06c92ac6b..576429005 100644 --- a/routes/chat_helpers.py +++ b/routes/chat_helpers.py @@ -104,6 +104,9 @@ class ChatContext: # The chat route emits a doc_update SSE event for each before streaming # begins, so the editor pane switches to the new doc immediately. auto_opened_docs: list = field(default_factory=list) + # Uploads attached to this user turn, resolved and owner-checked for the + # agent's private context. This is not emitted to the browser. + uploaded_files: list = field(default_factory=list) # ── Helpers ────────────────────────────────────────────────────────────── # @@ -366,6 +369,59 @@ async def preprocess( ) +def build_uploaded_file_manifest(att_ids: list, upload_handler, owner: Optional[str]) -> list[dict]: + """Resolve current-turn upload IDs into a small tool-facing manifest. + + The chat UI already sends attachment ids, and preprocessing inlines as much + text as fits. Agent mode still needs a discoverable bridge for files whose + content was truncated/omitted or when the model chooses file tools. Only + owner-authorized uploads are included, and paths must remain inside the + configured upload directory. + """ + if not att_ids or not upload_handler or not hasattr(upload_handler, "resolve_upload"): + return [] + + def _read_file_can_open(path: str) -> bool: + try: + from src.tool_execution import _resolve_tool_path + + return _resolve_tool_path(path) == os.path.realpath(path) + except Exception: + return False + + manifest: list[dict] = [] + for att_id in att_ids: + try: + info = upload_handler.resolve_upload(str(att_id), owner=owner) + except Exception: + logger.debug("Failed to resolve upload %r for agent manifest", att_id, exc_info=True) + continue + if not isinstance(info, dict): + continue + + path = info.get("path") + if path: + try: + inside = True + if hasattr(upload_handler, "_inside_upload_dir"): + inside = bool(upload_handler._inside_upload_dir(path)) + elif hasattr(upload_handler, "inside_base_dir"): + inside = bool(upload_handler.inside_base_dir(path)) + if not inside or not os.path.exists(path) or not _read_file_can_open(path): + path = None + except Exception: + path = None + + manifest.append({ + "id": info.get("id") or str(att_id), + "name": info.get("name") or info.get("original_name") or str(att_id), + "mime": info.get("mime", ""), + "size": info.get("size", 0), + "path": path, + }) + return manifest + + def add_user_message(sess, chat_handler, preprocessed: PreprocessedMessage, incognito: bool = False): """Add user message to session history and update session name. In incognito mode, still add to in-memory history (for conversation context) @@ -613,6 +669,11 @@ async def build_chat_context( # bearer-token chat requests use the token owner instead of the "api" sentinel. user = effective_user(request) uprefs = load_prefs_for_user(user) + uploaded_files = build_uploaded_file_manifest( + att_ids or [], + getattr(chat_handler, "upload_handler", None), + getattr(sess, "owner", None), + ) casual_low_signal = _is_casual_low_signal(message) # Memory enabled? @@ -731,6 +792,7 @@ async def build_chat_context( preset=preset, preprocessed=preprocessed, auto_opened_docs=auto_opened_docs, + uploaded_files=uploaded_files, ) diff --git a/routes/chat_routes.py b/routes/chat_routes.py index 4c395c06e..94f33e32c 100644 --- a/routes/chat_routes.py +++ b/routes/chat_routes.py @@ -1297,6 +1297,7 @@ def setup_chat_routes( approved_plan=approved_plan or None, workspace=workspace or None, forced_tools=_forced_tools, + uploaded_files=ctx.uploaded_files, ): if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"): try: diff --git a/routes/codex_routes.py b/routes/codex_routes.py index e3ddaba46..9fe36a822 100644 --- a/routes/codex_routes.py +++ b/routes/codex_routes.py @@ -133,6 +133,18 @@ def _find_endpoint(router: APIRouter | None, method: str, path: str): return None +def _clamp_pagination(offset: Any, limit: Any, *, default_limit: int = 50, max_limit: int = 50) -> tuple[int, int]: + try: + parsed_offset = int(0 if offset in (None, "") else offset) + except (TypeError, ValueError): + raise HTTPException(400, "Invalid offset") + try: + parsed_limit = int(default_limit if limit in (None, "") else limit) + except (TypeError, ValueError): + raise HTTPException(400, "Invalid limit") + return max(0, parsed_offset), max(1, min(parsed_limit, max_limit)) + + def setup_codex_routes( email_router: APIRouter | None = None, memory_router: APIRouter | None = None, @@ -440,10 +452,18 @@ def setup_codex_routes( owner = _scope_owner(request, DOCS_READ_SCOPES) if documents_library_endpoint is None: raise HTTPException(503, "Documents integration is not available") - return await _as_owner( + offset, limit = _clamp_pagination(offset, limit) + result = await _as_owner( request, owner, documents_library_endpoint, request, search, language, sort, offset, limit, archived, ) + if isinstance(result, dict): + docs = result.get("documents") + total = result.get("total") + if isinstance(docs, list) and isinstance(total, int): + next_offset = offset + len(docs) + result["next_offset"] = next_offset if next_offset < total else None + return result @router.get("/documents/{doc_id}") async def codex_documents_get(request: Request, doc_id: str): diff --git a/src/agent_loop.py b/src/agent_loop.py index 0a6d54695..24d27ef6d 100644 --- a/src/agent_loop.py +++ b/src/agent_loop.py @@ -755,6 +755,46 @@ def _extract_last_user_message(messages: List[Dict]) -> str: return "" +def _insert_before_latest_user(messages: List[Dict], context_msg: Dict) -> List[Dict]: + """Insert a context message immediately before the latest user turn.""" + out = list(messages or []) + for idx in range(len(out) - 1, -1, -1): + if out[idx].get("role") == "user": + out.insert(idx, context_msg) + return out + out.append(context_msg) + return out + + +def _uploaded_files_context_message(uploaded_files: Optional[List[Dict]]) -> Optional[Dict]: + if not uploaded_files: + return None + + lines = [ + "Uploaded files attached to the latest user turn:", + ] + for item in uploaded_files[:20]: + name = str(item.get("name") or item.get("id") or "upload") + bits = [ + f"id={item.get('id', '')}", + f"name={name}", + ] + if item.get("mime"): + bits.append(f"mime={item.get('mime')}") + if item.get("size") is not None: + bits.append(f"size={item.get('size')} bytes") + if item.get("path"): + bits.append(f"path={item.get('path')}") + lines.append("- " + "; ".join(bits)) + if len(uploaded_files) > 20: + lines.append(f"- ... {len(uploaded_files) - 20} more upload(s) omitted from this manifest") + lines.extend([ + "", + "The attachment contents may already be in the latest user message. If an attachment is marked truncated or omitted, read its listed path with `read_file` when that tool is available. Do not say uploaded files are undiscoverable when they are listed here.", + ]) + return untrusted_context_message("current chat uploaded files", "\n".join(lines)) + + def _strip_think_blocks(text: str) -> str: """Linear-time equivalent of ``re.sub(r'.*?', '', text, flags=DOTALL|IGNORECASE)``. @@ -1986,6 +2026,7 @@ async def stream_agent_loop( tool_policy: Optional[ToolPolicy] = None, workspace: Optional[str] = None, forced_tools: Optional[Set[str]] = None, + uploaded_files: Optional[List[Dict]] = None, _is_teacher_run: bool = False, ) -> AsyncGenerator[str, None]: """Streaming agent loop generator. @@ -2021,6 +2062,11 @@ async def stream_agent_loop( # filtered to read-only tools below (after the disabled map is loaded). disabled_tools.update(plan_mode_disabled_tools()) + uploaded_files = uploaded_files or [] + _upload_msg = _uploaded_files_context_message(uploaded_files) + if _upload_msg: + messages = _insert_before_latest_user(messages, _upload_msg) + _t0 = time.time() _needs_admin = _detect_admin_intent(messages) _last_user = _extract_last_user_message(messages) @@ -2232,6 +2278,15 @@ async def stream_agent_loop( if _relevant_tools is not None and active_document is not None: _relevant_tools.update({"edit_document", "update_document", "suggest_document"}) + # Current-turn chat uploads are real files under the upload/data root. Make + # the read-side file/document tools visible immediately so the agent can + # inspect files whose inline text was truncated or omitted. + if not guide_only and uploaded_files: + if _relevant_tools is None: + from src.tool_index import ALWAYS_AVAILABLE + _relevant_tools = set(ALWAYS_AVAILABLE) + _relevant_tools.update({"read_file", "grep", "ls", "manage_documents"}) + # Per-request UI toggles are stronger than retrieval. If the user turns on # Search, the model must see the search tools even when the latest text is a # typo or otherwise low-signal for tool RAG. diff --git a/src/agent_tools/document_tools.py b/src/agent_tools/document_tools.py index dcbea8b99..a86968e51 100644 --- a/src/agent_tools/document_tools.py +++ b/src/agent_tools/document_tools.py @@ -564,9 +564,20 @@ class ManageDocumentTool: if not doc: return {"error": f"Document '{doc_id}' not found", "exit_code": 1} body = doc.current_content or "" - preview_limit = int(args.get("limit", MAX_READ_CHARS)) - truncated = len(body) > preview_limit - preview = body[:preview_limit] + (f"\n... (truncated, {len(body)} chars total)" if truncated else "") + try: + preview_limit = max(1, min(int(args.get("limit", MAX_READ_CHARS)), MAX_READ_CHARS)) + except (TypeError, ValueError): + preview_limit = MAX_READ_CHARS + try: + offset = max(0, int(args.get("offset", 0) or 0)) + except (TypeError, ValueError): + offset = 0 + offset = min(offset, len(body)) + end = min(offset + preview_limit, len(body)) + truncated = end < len(body) + preview = body[offset:end] + if truncated: + preview += f"\n... (truncated, {len(body)} chars total; next_offset={end})" anchor = f"[{doc.title}](#document-{doc.id})" return { "response": f"{anchor} — click to open in editor.\n\n```{doc.language or ''}\n{preview}\n```", @@ -577,6 +588,8 @@ class ManageDocumentTool: "size": len(body), "content": preview, "truncated": truncated, + "offset": offset, + "next_offset": end if truncated else None, }, "exit_code": 0, } @@ -609,4 +622,4 @@ class ManageDocumentTool: logger.error(f"manage_documents error: {e}") return {"error": str(e), "exit_code": 1} finally: - db.close() \ No newline at end of file + db.close() diff --git a/tests/test_agent_loop.py b/tests/test_agent_loop.py index 0f1912361..107636b4a 100644 --- a/tests/test_agent_loop.py +++ b/tests/test_agent_loop.py @@ -39,6 +39,7 @@ try: _classify_agent_request, _compute_final_metrics, _append_tool_results, + _insert_before_latest_user, _MCP_KEYWORDS, ) _IMPORTED_AGENT_LOOP = sys.modules.get("src.agent_loop") @@ -73,6 +74,36 @@ def test_polish_internet_search_request_classifies_as_web(): assert "web" in intent["domains"] +def test_insert_before_latest_user_places_context_before_last_user_turn(): + messages = [ + {"role": "user", "content": "first"}, + {"role": "assistant", "content": "reply"}, + {"role": "user", "content": "latest"}, + ] + context = {"role": "system", "content": "context"} + + out = _insert_before_latest_user(messages, context) + + assert out == [ + {"role": "user", "content": "first"}, + {"role": "assistant", "content": "reply"}, + context, + {"role": "user", "content": "latest"}, + ] + assert messages == [ + {"role": "user", "content": "first"}, + {"role": "assistant", "content": "reply"}, + {"role": "user", "content": "latest"}, + ] + + +def test_insert_before_latest_user_appends_when_no_user_message_exists(): + messages = [{"role": "assistant", "content": "reply"}] + context = {"role": "system", "content": "context"} + + assert _insert_before_latest_user(messages, context) == [messages[0], context] + + # --------------------------------------------------------------------------- # _detect_admin_intent # --------------------------------------------------------------------------- diff --git a/tests/test_chat_helpers.py b/tests/test_chat_helpers.py index 92bd51561..0e2cce1f7 100644 --- a/tests/test_chat_helpers.py +++ b/tests/test_chat_helpers.py @@ -1,4 +1,8 @@ import asyncio +import os +import shutil +import uuid +from pathlib import Path from types import SimpleNamespace import pytest @@ -10,6 +14,7 @@ from routes.chat_helpers import ( _session_is_research_spinoff, auto_name_session, build_chat_context, + build_uploaded_file_manifest, clean_thinking_for_save, needs_auto_name, PreprocessedMessage, @@ -145,6 +150,126 @@ class _FakeSession: self.history.append(message) +class _ManifestUploadHandler: + def __init__(self, upload_dir, rows): + self.upload_dir = str(upload_dir) + self.rows = rows + self.calls = [] + + def _inside_upload_dir(self, path): + base = os.path.realpath(self.upload_dir) + candidate = os.path.realpath(path) + try: + return os.path.commonpath([base, candidate]) == base + except ValueError: + return False + + def resolve_upload(self, upload_id, owner=None): + self.calls.append((upload_id, owner)) + row = self.rows.get(upload_id) + if isinstance(row, dict) and row.get("owner") and row.get("owner") != owner: + return None + return row + + +def _manifest_test_dir(name): + root = Path(__file__).resolve().parents[1] / "tmp_pytest_probe" / f"{name}-{uuid.uuid4().hex}" + root.mkdir(parents=True, exist_ok=False) + return root + + +def test_build_uploaded_file_manifest_filters_and_nulls_unreadable_paths(monkeypatch): + root = _manifest_test_dir("manifest") + try: + upload_dir = root / "uploads" + upload_dir.mkdir() + good = upload_dir / "good.txt" + good.write_text("hello", encoding="utf-8") + outside = root / "outside.txt" + outside.write_text("nope", encoding="utf-8") + missing = upload_dir / "missing.txt" + + import src.settings as settings + + monkeypatch.setattr( + settings, + "get_setting", + lambda key: [str(upload_dir)] if key == "tool_path_extra_roots" else None, + ) + handler = _ManifestUploadHandler(upload_dir, { + "good": { + "id": "good", + "name": "good.txt", + "mime": "text/plain", + "size": 5, + "path": str(good), + "owner": "alice", + }, + "bob": { + "id": "bob", + "name": "bob.txt", + "path": str(good), + "owner": "bob", + }, + "outside": { + "id": "outside", + "name": "outside.txt", + "path": str(outside), + "owner": "alice", + }, + "missing": { + "id": "missing", + "name": "missing.txt", + "path": str(missing), + "owner": "alice", + }, + "bad": ["not", "a", "dict"], + }) + + manifest = build_uploaded_file_manifest( + ["good", "bob", "outside", "missing", "bad"], + handler, + owner="alice", + ) + + assert [item["id"] for item in manifest] == ["good", "outside", "missing"] + assert os.path.realpath(manifest[0]["path"]) == os.path.realpath(good) + assert manifest[1]["path"] is None + assert manifest[2]["path"] is None + assert handler.calls == [ + ("good", "alice"), + ("bob", "alice"), + ("outside", "alice"), + ("missing", "alice"), + ("bad", "alice"), + ] + finally: + shutil.rmtree(root, ignore_errors=True) + + +def test_build_uploaded_file_manifest_hides_paths_read_file_cannot_open(monkeypatch): + root = _manifest_test_dir("manifest-unreadable") + try: + upload_dir = root / "uploads" + upload_dir.mkdir() + upload = upload_dir / "upload.txt" + upload.write_text("hello", encoding="utf-8") + handler = _ManifestUploadHandler(upload_dir, { + "upload": {"id": "upload", "name": "upload.txt", "path": str(upload), "owner": "alice"}, + }) + + def reject_path(_path): + raise ValueError("outside the allowed roots") + + monkeypatch.setattr("src.tool_execution._resolve_tool_path", reject_path) + + manifest = build_uploaded_file_manifest(["upload"], handler, owner="alice") + + assert manifest[0]["path"] is None + finally: + shutil.rmtree(root, ignore_errors=True) + + @pytest.mark.parametrize("name,expected", [ # 24h format (the bug this PR fixes) ("deepseek-v4-flash 14:05:33", True), diff --git a/tests/test_codex_ssh_host_validation.py b/tests/test_codex_ssh_host_validation.py index 26a4dad4a..b4c07f1b7 100644 --- a/tests/test_codex_ssh_host_validation.py +++ b/tests/test_codex_ssh_host_validation.py @@ -100,6 +100,105 @@ def test_default_ssh_port_omits_flag(): assert port_flag == "" +def _documents_endpoint(total: int): + calls = [] + document_router = APIRouter() + + @document_router.get("/api/documents/library") + async def documents_library( + request: Request, + search=None, + language=None, + sort="recent", + offset=0, + limit=20, + archived=False, + ): + calls.append({ + "owner": request.state.current_user, + "search": search, + "language": language, + "sort": sort, + "offset": offset, + "limit": limit, + "archived": archived, + }) + end = min(offset + limit, total) + docs = [{"id": f"doc-{i}"} for i in range(offset, end)] + return {"documents": docs, "total": total} + + router = codex_routes.setup_codex_routes(document_router=document_router) + return _route_endpoint("/api/codex/documents", "GET", router=router), calls + + +@pytest.mark.asyncio +async def test_documents_pagination_clamps_offset_and_limit(): + endpoint, calls = _documents_endpoint(total=99) + + result = await endpoint(_codex_request(["documents:read"]), offset=-10, limit=500) + + assert calls[-1]["owner"] == "alice" + assert calls[-1]["offset"] == 0 + assert calls[-1]["limit"] == 50 + assert len(result["documents"]) == 50 + assert result["next_offset"] == 50 + + +@pytest.mark.asyncio +async def test_documents_pagination_clamps_zero_limit_to_one(): + endpoint, calls = _documents_endpoint(total=3) + + result = await endpoint(_codex_request(["documents:read"]), offset=0, limit=0) + + assert calls[-1]["limit"] == 1 + assert len(result["documents"]) == 1 + assert result["next_offset"] == 1 + + +@pytest.mark.asyncio +async def test_documents_pagination_returns_next_offset_when_truncated(): + endpoint, _calls = _documents_endpoint(total=7) + + result = await endpoint(_codex_request(["documents:read"]), offset=2, limit=3) + + assert [doc["id"] for doc in result["documents"]] == ["doc-2", "doc-3", "doc-4"] + assert result["next_offset"] == 5 + + +@pytest.mark.asyncio +async def test_documents_pagination_rejects_invalid_offset(): + endpoint, _calls = _documents_endpoint(total=7) + + with pytest.raises(HTTPException) as exc: + await endpoint(_codex_request(["documents:read"]), offset="soon", limit=3) + + assert exc.value.status_code == 400 + assert exc.value.detail == "Invalid offset" + + +@pytest.mark.asyncio +async def test_documents_pagination_rejects_invalid_limit(): + endpoint, _calls = _documents_endpoint(total=7) + + with pytest.raises(HTTPException) as exc: + await endpoint(_codex_request(["documents:read"]), offset=0, limit="many") + + assert exc.value.status_code == 400 + assert exc.value.detail == "Invalid limit" + + +@pytest.mark.asyncio +async def test_documents_pagination_out_of_range_offset_returns_empty_page(): + endpoint, calls = _documents_endpoint(total=3) + + result = await endpoint(_codex_request(["documents:read"]), offset=10, limit=2) + + assert calls[-1]["offset"] == 10 + assert calls[-1]["limit"] == 2 + assert result["documents"] == [] + assert result["next_offset"] is None + + def test_adopt_rejects_ssh_option_host_before_shell(monkeypatch): calls = []