From 2fbfd2294684d90d1c37d8b66d1a2ff71ae5dc0c Mon Sep 17 00:00:00 2001
From: pewdiepie-archdaemon <pewdiepie-archdaemon@users.noreply.github.com>
Date: Fri, 19 Jun 2026 00:34:24 +0000
Subject: [PATCH] Agent loop: compact one-line tool-usage hints for local/small
 models so the system prompt doesnt eat the context budget

---
 src/agent_loop.py | 127 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 116 insertions(+), 11 deletions(-)

diff --git a/src/agent_loop.py b/src/agent_loop.py
index f600ac598..e574a42b6 100644
--- a/src/agent_loop.py
+++ b/src/agent_loop.py
@@ -536,17 +536,44 @@ def _section_text(name: str, default: str) -> str:
     return val if isinstance(val, str) and val.strip() else default
 
 
+def _compact_tool_line(name: str, section: str) -> str:
+    """One-line fenced-tool usage hint for compact/local prompts."""
+    text = (section or "").strip()
+    if not text:
+        return f"- `{name}`"
+    if text.startswith("- "):
+        return text
+    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
+    usage = []
+    in_fence = False
+    for ln in lines:
+        if ln.startswith("```"):
+            usage.append(ln)
+            in_fence = not in_fence
+            if len(usage) >= 3:
+                break
+            continue
+        if in_fence and len(usage) < 3:
+            usage.append(ln)
+    if usage:
+        return f"- `{name}` — " + " ".join(usage)
+    return f"- `{name}` — " + lines[0][:160]
+
+
 def _assemble_prompt(tool_names: set, disabled_tools: set = None, compact: bool = False) -> str:
     """Build the system prompt with only the specified tools included."""
     disabled = disabled_tools or set()
     included = tool_names - disabled
 
     if compact:
-        tool_list = ", ".join(sorted(included)) if included else "none"
+        tool_lines = []
+        for name, _default_section in TOOL_SECTIONS.items():
+            if name in included:
+                tool_lines.append(_compact_tool_line(name, _section_text(name, _default_section)))
         parts = [
-            "You are an AI assistant with tool access.",
-            f"Available tools: {tool_list}.",
-            _API_AGENT_RULES,
+            _AGENT_PREAMBLE,
+            "## Available tools\n" + ("\n".join(tool_lines) if tool_lines else "none"),
+            _AGENT_RULES,
         ]
         parts.extend(_domain_rules_for_tools(included))
         return "\n\n".join(parts)
@@ -612,11 +639,6 @@ _API_HOSTS = frozenset([
     "api.perplexity.ai", "api.x.ai",
     "ollama.com", "api.venice.ai", "api.kimi.com",
     "api.githubcopilot.com",
-    # Local OpenAI-compatible endpoints (llama.cpp, vLLM, LM Studio, etc.).
-    # Without these, `_is_api_model` falls back to keyword sniffing on the
-    # model name, so well-behaved local servers don't get native tool
-    # schemas and the agent silently degrades to fenced-block parsing.
-    "localhost", "127.0.0.1", "host.docker.internal",
 ])
 _MCP_KEYWORDS = frozenset(["mcp", "browse", "browser", "website", "calendar", "event", "email",
                            "gmail", "screenshot", "navigate", "click", "miniflux", "rss", "feed"])
@@ -644,6 +666,28 @@ def _is_ollama_openai_compat_url(endpoint_url: str) -> bool:
     return parsed.port == 11434 and (path == "/v1" or path.startswith("/v1/"))
 
 
+def _is_local_openai_compat_url(endpoint_url: str) -> bool:
+    try:
+        parsed = urlparse(endpoint_url or "")
+    except Exception:
+        return False
+    host = (parsed.hostname or "").lower()
+    path = (parsed.path or "").rstrip("/")
+    if not (path == "/v1" or path.startswith("/v1/")):
+        return False
+    if host in {"localhost", "127.0.0.1", "0.0.0.0", "host.docker.internal"}:
+        return True
+    if host.startswith("192.168.") or host.startswith("10."):
+        return True
+    if host.startswith("172."):
+        try:
+            second = int(host.split(".")[1])
+            return 16 <= second <= 31
+        except Exception:
+            return False
+    return False
+
+
 def _endpoint_lookup_keys(endpoint_url: str) -> List[str]:
     """Candidate ModelEndpoint.base_url keys for a runtime chat URL."""
     raw = (endpoint_url or "").strip()
@@ -2082,6 +2126,7 @@ async def stream_agent_loop(
     # the fenced-block path is used instead of native function calling.
     _is_ollama_native = _is_ollama_native_url(endpoint_url or "")
     _ollama_openai_compat = _is_ollama_openai_compat_url(endpoint_url or "")
+    _local_openai_compat = _is_local_openai_compat_url(endpoint_url or "")
     if _endpoint_supports is True:
         _is_api_model = True
     elif (
@@ -2089,15 +2134,17 @@ async def stream_agent_loop(
         or _model_no_tools
         or _is_ollama_native
         or _ollama_openai_compat
+        or _local_openai_compat
     ):
         _is_api_model = False
     else:
         _is_api_model = any(h in endpoint_url for h in _API_HOSTS) or _model_supports_tools
+    _compact_agent_prompt = _is_api_model or _is_ollama_native or _ollama_openai_compat or _local_openai_compat
     messages, mcp_schemas = _build_system_prompt(
         messages, model, active_document, mcp_mgr, disabled_tools,
         needs_admin=_needs_admin, relevant_tools=_relevant_tools,
         mcp_disabled_map=_mcp_disabled_map,
-        compact=_is_api_model,
+        compact=_compact_agent_prompt,
         owner=owner,
         suppress_local_context=guide_only,
         active_email=active_email,
@@ -2185,6 +2232,14 @@ async def stream_agent_loop(
     # Strip internal metadata keys before sending to the LLM API
     messages = [{k: v for k, v in msg.items() if k != "_protected"} for msg in messages]
 
+    agent_prompt_tokens = estimate_tokens(messages)
+    logger.info(
+        "[agent-timing] prep_done model=%s prompt_tokens=%s context_length=%s prep=%s",
+        model,
+        agent_prompt_tokens,
+        context_length,
+        {k: round(v, 3) for k, v in prep_timings.items()},
+    )
     yield f"data: {json.dumps({'type': 'agent_prep', 'data': {k: round(v, 3) for k, v in prep_timings.items()}})}\n\n"
 
     full_response = ""
@@ -2329,6 +2384,19 @@ async def stream_agent_loop(
         # complementary cap for the rare stream that trickles bytes forever and
         # so never trips the inactivity timeout. Generous — only catches runaway.
         _round_deadline = time.time() + max(agent_stream_timeout * 4, 1200)
+        _round_start = time.time()
+        _round_first_event_logged = False
+        _round_first_token_logged = False
+        logger.info(
+            "[agent-timing] round_start round=%s model=%s endpoint=%s prompt_tokens=%s tools=%s native_tools=%s timeout=%s",
+            round_num,
+            model,
+            endpoint_url,
+            estimate_tokens(messages),
+            len(_tool_names_sent),
+            bool(all_tool_schemas),
+            agent_stream_timeout,
+        )
         async for chunk in stream_llm_with_fallback(
             _candidates,
             messages,
@@ -2339,11 +2407,30 @@ async def stream_agent_loop(
             timeout=agent_stream_timeout,
             session_id=session_id,
         ):
+            if not _round_first_event_logged:
+                _round_first_event_logged = True
+                logger.info(
+                    "[agent-timing] first_event round=%s elapsed=%.3fs kind=%s",
+                    round_num,
+                    time.time() - _round_start,
+                    "error" if chunk.startswith("event: error") else "data",
+                )
             if time.time() > _round_deadline:
-                logger.warning(f"[agent] round {round_num} stream exceeded wall-clock deadline; cutting off")
+                logger.warning(
+                    "[agent-timing] round_deadline round=%s elapsed=%.3fs deadline_s=%s",
+                    round_num,
+                    time.time() - _round_start,
+                    max(agent_stream_timeout * 4, 1200),
+                )
                 break
             # Forward error events from stream_llm to the frontend
             if chunk.startswith("event: error"):
+                logger.warning(
+                    "[agent-timing] stream_error round=%s elapsed=%.3fs chunk=%r",
+                    round_num,
+                    time.time() - _round_start,
+                    chunk[:500],
+                )
                 yield chunk
                 continue
             if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
@@ -2423,6 +2510,15 @@ async def stream_agent_loop(
                         if not first_token_received:
                             time_to_first_token = time.time() - total_start
                             first_token_received = True
+                        if not _round_first_token_logged:
+                            _round_first_token_logged = True
+                            logger.info(
+                                "[agent-timing] first_visible_token round=%s elapsed=%.3fs total_elapsed=%.3fs thinking=%s",
+                                round_num,
+                                time.time() - _round_start,
+                                time.time() - total_start,
+                                bool(data.get("thinking")),
+                            )
                         # Keep reasoning deltas in a separate accumulator so
                         # we can echo them back via `reasoning_content` on the
                         # next request (DeepSeek requires this; harmless for
@@ -2492,6 +2588,15 @@ async def stream_agent_loop(
                 yield chunk
             # Intercept [DONE] — don't forward until all rounds finish
 
+        logger.info(
+            "[agent-timing] round_stream_done round=%s elapsed=%.3fs text_chars=%s tool_calls=%s first_event=%s first_token=%s",
+            round_num,
+            time.time() - _round_start,
+            len(round_response),
+            len(native_tool_calls),
+            _round_first_event_logged,
+            _round_first_token_logged,
+        )
         tool_blocks, used_native = _resolve_tool_blocks(round_response, native_tool_calls, round_num, is_api_model=_is_api_model)
 
         # Force-answer round: we told the model to STOP calling tools and