From 2fbfd2294684d90d1c37d8b66d1a2ff71ae5dc0c Mon Sep 17 00:00:00 2001 From: pewdiepie-archdaemon Date: Fri, 19 Jun 2026 00:34:24 +0000 Subject: [PATCH] Agent loop: compact one-line tool-usage hints for local/small models so the system prompt doesnt eat the context budget --- src/agent_loop.py | 127 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 116 insertions(+), 11 deletions(-) diff --git a/src/agent_loop.py b/src/agent_loop.py index f600ac598..e574a42b6 100644 --- a/src/agent_loop.py +++ b/src/agent_loop.py @@ -536,17 +536,44 @@ def _section_text(name: str, default: str) -> str: return val if isinstance(val, str) and val.strip() else default +def _compact_tool_line(name: str, section: str) -> str: + """One-line fenced-tool usage hint for compact/local prompts.""" + text = (section or "").strip() + if not text: + return f"- `{name}`" + if text.startswith("- "): + return text + lines = [ln.strip() for ln in text.splitlines() if ln.strip()] + usage = [] + in_fence = False + for ln in lines: + if ln.startswith("```"): + usage.append(ln) + in_fence = not in_fence + if len(usage) >= 3: + break + continue + if in_fence and len(usage) < 3: + usage.append(ln) + if usage: + return f"- `{name}` — " + " ".join(usage) + return f"- `{name}` — " + lines[0][:160] + + def _assemble_prompt(tool_names: set, disabled_tools: set = None, compact: bool = False) -> str: """Build the system prompt with only the specified tools included.""" disabled = disabled_tools or set() included = tool_names - disabled if compact: - tool_list = ", ".join(sorted(included)) if included else "none" + tool_lines = [] + for name, _default_section in TOOL_SECTIONS.items(): + if name in included: + tool_lines.append(_compact_tool_line(name, _section_text(name, _default_section))) parts = [ - "You are an AI assistant with tool access.", - f"Available tools: {tool_list}.", - _API_AGENT_RULES, + _AGENT_PREAMBLE, + "## Available tools\n" + ("\n".join(tool_lines) if tool_lines else "none"), + _AGENT_RULES, ] parts.extend(_domain_rules_for_tools(included)) return "\n\n".join(parts) @@ -612,11 +639,6 @@ _API_HOSTS = frozenset([ "api.perplexity.ai", "api.x.ai", "ollama.com", "api.venice.ai", "api.kimi.com", "api.githubcopilot.com", - # Local OpenAI-compatible endpoints (llama.cpp, vLLM, LM Studio, etc.). - # Without these, `_is_api_model` falls back to keyword sniffing on the - # model name, so well-behaved local servers don't get native tool - # schemas and the agent silently degrades to fenced-block parsing. - "localhost", "127.0.0.1", "host.docker.internal", ]) _MCP_KEYWORDS = frozenset(["mcp", "browse", "browser", "website", "calendar", "event", "email", "gmail", "screenshot", "navigate", "click", "miniflux", "rss", "feed"]) @@ -644,6 +666,28 @@ def _is_ollama_openai_compat_url(endpoint_url: str) -> bool: return parsed.port == 11434 and (path == "/v1" or path.startswith("/v1/")) +def _is_local_openai_compat_url(endpoint_url: str) -> bool: + try: + parsed = urlparse(endpoint_url or "") + except Exception: + return False + host = (parsed.hostname or "").lower() + path = (parsed.path or "").rstrip("/") + if not (path == "/v1" or path.startswith("/v1/")): + return False + if host in {"localhost", "127.0.0.1", "0.0.0.0", "host.docker.internal"}: + return True + if host.startswith("192.168.") or host.startswith("10."): + return True + if host.startswith("172."): + try: + second = int(host.split(".")[1]) + return 16 <= second <= 31 + except Exception: + return False + return False + + def _endpoint_lookup_keys(endpoint_url: str) -> List[str]: """Candidate ModelEndpoint.base_url keys for a runtime chat URL.""" raw = (endpoint_url or "").strip() @@ -2082,6 +2126,7 @@ async def stream_agent_loop( # the fenced-block path is used instead of native function calling. _is_ollama_native = _is_ollama_native_url(endpoint_url or "") _ollama_openai_compat = _is_ollama_openai_compat_url(endpoint_url or "") + _local_openai_compat = _is_local_openai_compat_url(endpoint_url or "") if _endpoint_supports is True: _is_api_model = True elif ( @@ -2089,15 +2134,17 @@ async def stream_agent_loop( or _model_no_tools or _is_ollama_native or _ollama_openai_compat + or _local_openai_compat ): _is_api_model = False else: _is_api_model = any(h in endpoint_url for h in _API_HOSTS) or _model_supports_tools + _compact_agent_prompt = _is_api_model or _is_ollama_native or _ollama_openai_compat or _local_openai_compat messages, mcp_schemas = _build_system_prompt( messages, model, active_document, mcp_mgr, disabled_tools, needs_admin=_needs_admin, relevant_tools=_relevant_tools, mcp_disabled_map=_mcp_disabled_map, - compact=_is_api_model, + compact=_compact_agent_prompt, owner=owner, suppress_local_context=guide_only, active_email=active_email, @@ -2185,6 +2232,14 @@ async def stream_agent_loop( # Strip internal metadata keys before sending to the LLM API messages = [{k: v for k, v in msg.items() if k != "_protected"} for msg in messages] + agent_prompt_tokens = estimate_tokens(messages) + logger.info( + "[agent-timing] prep_done model=%s prompt_tokens=%s context_length=%s prep=%s", + model, + agent_prompt_tokens, + context_length, + {k: round(v, 3) for k, v in prep_timings.items()}, + ) yield f"data: {json.dumps({'type': 'agent_prep', 'data': {k: round(v, 3) for k, v in prep_timings.items()}})}\n\n" full_response = "" @@ -2329,6 +2384,19 @@ async def stream_agent_loop( # complementary cap for the rare stream that trickles bytes forever and # so never trips the inactivity timeout. Generous — only catches runaway. _round_deadline = time.time() + max(agent_stream_timeout * 4, 1200) + _round_start = time.time() + _round_first_event_logged = False + _round_first_token_logged = False + logger.info( + "[agent-timing] round_start round=%s model=%s endpoint=%s prompt_tokens=%s tools=%s native_tools=%s timeout=%s", + round_num, + model, + endpoint_url, + estimate_tokens(messages), + len(_tool_names_sent), + bool(all_tool_schemas), + agent_stream_timeout, + ) async for chunk in stream_llm_with_fallback( _candidates, messages, @@ -2339,11 +2407,30 @@ async def stream_agent_loop( timeout=agent_stream_timeout, session_id=session_id, ): + if not _round_first_event_logged: + _round_first_event_logged = True + logger.info( + "[agent-timing] first_event round=%s elapsed=%.3fs kind=%s", + round_num, + time.time() - _round_start, + "error" if chunk.startswith("event: error") else "data", + ) if time.time() > _round_deadline: - logger.warning(f"[agent] round {round_num} stream exceeded wall-clock deadline; cutting off") + logger.warning( + "[agent-timing] round_deadline round=%s elapsed=%.3fs deadline_s=%s", + round_num, + time.time() - _round_start, + max(agent_stream_timeout * 4, 1200), + ) break # Forward error events from stream_llm to the frontend if chunk.startswith("event: error"): + logger.warning( + "[agent-timing] stream_error round=%s elapsed=%.3fs chunk=%r", + round_num, + time.time() - _round_start, + chunk[:500], + ) yield chunk continue if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"): @@ -2423,6 +2510,15 @@ async def stream_agent_loop( if not first_token_received: time_to_first_token = time.time() - total_start first_token_received = True + if not _round_first_token_logged: + _round_first_token_logged = True + logger.info( + "[agent-timing] first_visible_token round=%s elapsed=%.3fs total_elapsed=%.3fs thinking=%s", + round_num, + time.time() - _round_start, + time.time() - total_start, + bool(data.get("thinking")), + ) # Keep reasoning deltas in a separate accumulator so # we can echo them back via `reasoning_content` on the # next request (DeepSeek requires this; harmless for @@ -2492,6 +2588,15 @@ async def stream_agent_loop( yield chunk # Intercept [DONE] — don't forward until all rounds finish + logger.info( + "[agent-timing] round_stream_done round=%s elapsed=%.3fs text_chars=%s tool_calls=%s first_event=%s first_token=%s", + round_num, + time.time() - _round_start, + len(round_response), + len(native_tool_calls), + _round_first_event_logged, + _round_first_token_logged, + ) tool_blocks, used_native = _resolve_tool_blocks(round_response, native_tool_calls, round_num, is_api_model=_is_api_model) # Force-answer round: we told the model to STOP calling tools and