Merge origin/dev into main

2026-06-30 16:42:15 -04:00 · 2026-06-21 11:08:50 +00:00
parent c504214925 160267417e
commit 75f04bc088
203 changed files with 11283 additions and 1649 deletions
@@ -79,13 +79,23 @@ class WebSearchTool:
 class WebFetchTool:
    async def execute(self, content: str, ctx: dict) -> dict:
        from src.search.content import fetch_webpage_content
+        from src.constants import WEB_FETCH_HARD_MAX_BYTES
        raw = content.strip()
        url = ""
+        max_bytes = None
        if raw.startswith("{"):
            try:
                parsed = json.loads(raw)
                if isinstance(parsed, dict):
                    url = str(parsed.get("url") or "").strip()
+                    # Download-budget override (#3812): "full": true raises the budget to
+                    # the hard cap; an explicit positive max_bytes wins over "full" and is
+                    # clamped to the hard cap downstream. Default stays the soft cap.
+                    if parsed.get("full") is True:
+                        max_bytes = WEB_FETCH_HARD_MAX_BYTES
+                    mb = parsed.get("max_bytes")
+                    if isinstance(mb, int) and mb > 0:
+                        max_bytes = mb
            except json.JSONDecodeError:
                url = ""
        if not url:
@@ -100,7 +110,7 @@ class WebFetchTool:
        loop = asyncio.get_running_loop()
        try:
            result = await asyncio.wait_for(
-                loop.run_in_executor(None, lambda: fetch_webpage_content(url, timeout=10)),
+                loop.run_in_executor(None, lambda: fetch_webpage_content(url, timeout=10, max_bytes=max_bytes)),
                timeout=30,
            )
        except asyncio.TimeoutError:
@@ -116,8 +126,28 @@ class WebFetchTool:
                return {"error": f"web_fetch: {url}: {err}", "exit_code": 1}
            return {"error": f"web_fetch: {url}: no readable text content (not HTML, or the page needs JS/login)", "exit_code": 1}

+        # Tell the model when the download budget cut the body short and how
+        # to get the rest, instead of silently presenting a partial page as
+        # the whole thing.
+        size_note = ""
+        if result.get("truncated"):
+            fetched = result.get("fetched_bytes") or 0
+            total = result.get("total_bytes")
+            total_txt = f" of {total:,} bytes" if total else ""
+            size_note = (
+                f"[partial content: download stopped at {fetched:,} bytes{total_txt}. "
+                f'Re-call with {{"url": "{url}", "full": true}} to fetch up to '
+                f"{WEB_FETCH_HARD_MAX_BYTES:,} bytes.]\n\n"
+            )
+
+        # The notice must lead the output so the MAX_OUTPUT_CHARS trim below can
+        # never drop it. The title is untrusted, uncapped page content, so a
+        # giant title ahead of the notice could push it out of range; keep the
+        # notice first and cap the title as a second guard.
+        if len(title) > 300:
+            title = title[:300] + "..."
        header = (f"# {title}\n" if title else "") + f"Source: {url}\n\n"
-        output = header + text
+        output = size_note + header + text
        if len(output) > MAX_OUTPUT_CHARS:
            output = output[:MAX_OUTPUT_CHARS] + "\n\n[...truncated]"
        return {"output": output, "exit_code": 0}