diff --git a/routes/chat_routes.py b/routes/chat_routes.py index cd5e4e672..a4de4536b 100644 --- a/routes/chat_routes.py +++ b/routes/chat_routes.py @@ -394,6 +394,7 @@ def setup_chat_routes( search_context = form_data.get("search_context") # pre-fetched web search results (compare mode) compare_mode = str(form_data.get("compare_mode", "")).lower() == "true" incognito = str(form_data.get("incognito", "")).lower() == "true" + plan_mode = str(form_data.get("plan_mode", "")).lower() == "true" chat_mode = str(form_data.get("mode", "")).lower() # 'chat' or 'agent' # Workspace: confine the agent's file/shell tools to this folder. Validate # it's a real directory; ignore (no confinement) otherwise. @@ -401,6 +402,17 @@ def setup_chat_routes( if workspace: _ws_real = os.path.realpath(os.path.expanduser(workspace)) workspace = _ws_real if os.path.isdir(_ws_real) else "" + # Plan mode is a modifier on agent mode — it only makes sense with tools. + if plan_mode: + chat_mode = "agent" + # An approved plan being EXECUTED: the frontend sends the checklist back + # on each turn so we can pin it in context. This way a long plan on a + # weak model survives history truncation — the agent can always re-read + # the plan. Ignored while still proposing (plan_mode on). Capped so a + # huge plan can't blow the prompt. + approved_plan = "" + if not plan_mode: + approved_plan = (form_data.get("approved_plan") or "").strip()[:8192] # Did the USER explicitly pick agent mode? (vs. us auto-escalating # below). Skill extraction should only learn from real agent sessions, # not chats we quietly promoted for a notes/calendar intent. @@ -659,6 +671,13 @@ def setup_chat_routes( if chat_mode == 'chat': disabled_tools.update({"bash", "python", "read_file", "write_file", "web_search", "web_fetch", "search_chats", "manage_tasks"}) + # Plan mode: investigate read-only, propose a plan, don't mutate. Block + # every tool not on the read-only allowlist. 
(stream_agent_loop enforces + # this again + filters MCP to read-only, so this is belt-and-suspenders.) + if plan_mode: + from src.tool_security import plan_mode_disabled_tools + disabled_tools.update(plan_mode_disabled_tools()) + async def stream_with_save() -> AsyncGenerator[str, None]: # _effective_mode is read-only here; closure captures it from # the outer scope. (Was `nonlocal` but never reassigned.) @@ -1015,6 +1034,8 @@ def setup_chat_routes( owner=_user, fallbacks=_fallback_candidates, workspace=workspace or None, + plan_mode=plan_mode, + approved_plan=approved_plan or None, ): if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"): try: @@ -1036,6 +1057,7 @@ def setup_chat_routes( "doc_update", "doc_suggestions", "ui_control", "rounds_exhausted", "ask_user", + "plan_update", ): if data.get("type") == "agent_step": _agent_rounds = max(_agent_rounds, data.get("round", 1)) diff --git a/src/agent_loop.py b/src/agent_loop.py index a74c95e9c..84870db31 100644 --- a/src/agent_loop.py +++ b/src/agent_loop.py @@ -19,7 +19,7 @@ from src.llm_core import stream_llm, stream_llm_with_fallback, _is_ollama_native from src.model_context import estimate_tokens from src.settings import get_setting from src.prompt_security import untrusted_context_message -from src.tool_security import blocked_tools_for_owner +from src.tool_security import blocked_tools_for_owner, plan_mode_disabled_tools from src.agent_tools import ( parse_tool_blocks, strip_tool_blocks, @@ -336,6 +336,7 @@ If the user asks for a reminder/alarm before the event, pass `reminder_minutes` "pipeline": "- ```pipeline``` — Run a multi-step AI pipeline. Args (JSON) with ordered steps, each specifying a model and prompt. Use for complex workflows.", "ui_control": "- ```ui_control``` — Control the UI: toggle tools on/off, OPEN PANELS, open email reply drafts, switch models, change themes. 
Commands: `toggle on/off` (names: bash/shell, web/search, research, incognito, document_editor/documents), `open_panel ` (panels: documents, gallery, email, sessions, notes, memories/brain, skills, settings, cookbook), `open_email_reply ` (opens an email compose document, does NOT send), `set_mode agent/chat`, `switch_model `, `set_theme `, `create_theme ` (optional key=val for advanced colors AND background effects: bgPattern=, bgEffectColor=#RRGGBB, bgEffectIntensity=, bgEffectSize=, frosted=true|false). \"open documents\" / \"open library\" / \"show gallery\" / \"open inbox\" / \"open notes\" / \"open cookbook\" all map to `open_panel `. Theme presets: dark, light, midnight, paper, cyberpunk, retrowave, forest, ocean, ume, copper, terminal, organs, lavender, gpt, claude, cute.", "ask_user": "- ```ask_user``` — Ask the user a multiple-choice question when the task is genuinely ambiguous and the answer changes what you do next (pick an approach, confirm an assumption, choose a target). Args (JSON): {\"question\": \"...\", \"options\": [{\"label\": \"...\", \"description\": \"...\"?}, ...], \"multi\": false?}. 2-6 options. The user gets clickable buttons; calling this ENDS your turn and their choice comes back as your next message. Prefer sensible defaults — only ask when you truly can't proceed well without their input.", + "update_plan": "- ```update_plan``` — While executing an approved plan, write the plan back: tick steps done or revise them. Args (JSON): {\"plan\": \"- [x] done step\\n- [ ] next step\"}. Always pass the COMPLETE checklist, not a diff. Call it after finishing each step (mark it `- [x]`) and whenever the user asks to change the plan. The user's docked plan window updates live. Does nothing if there's no active plan.", "list_served_models": "- ```list_served_models``` — Show what the Cookbook (LLM-serving subsystem) is currently running. NO args. Use this for ANY 'what's running' / 'what's serving' / 'show my cookbook' / 'is anything up' query. 
DO NOT shell out (`ps aux`, `docker ps`, etc.) — this tool is the source of truth. Failed serve tasks include recent logs plus diagnosis/retry suggestions; use those suggestions to call `serve_model` again with an adjusted command when appropriate.", "stop_served_model": "- ```stop_served_model``` — Stop a running model server. Args (JSON): {\"session_id\": \"\"}. Use for 'kill my cookbook' / 'stop the model' / 'shut down vLLM'.", "tail_serve_output": "- ```tail_serve_output``` — Read the actual tmux stderr/traceback of a CURRENTLY failing cookbook task. Args (JSON): {\"session_id\": \"\", \"tail\": 150?}. **Use ONLY after** you just launched something via `serve_model` AND `list_served_models` reports YOUR new task as `crashed`/`error`. DO NOT use it on old stopped/completed download tasks (they're historical noise — won't predict whether a new launch succeeds). DO NOT call it before launching a fresh attempt. When you do call it, bump `tail` to 400+ only if the visible error references 'see root cause above'.", @@ -1372,6 +1373,53 @@ def _empty_response_fallback( return _error_msg, f'data: {json.dumps({"delta": _error_msg})}\n\n' +PLAN_MODE_DIRECTIVE = ( + "## PLAN MODE — OVERRIDES EVERYTHING ELSE BELOW\n" + "You are in PLAN MODE. Your ONLY job this turn is to PROPOSE a plan. You have " + "NOT done anything yet. Do NOT claim you created, wrote, ran, sent, or changed " + "anything — that would be a lie.\n" + "\n" + "ABSOLUTE RULE — DO NOT MUTATE ANYTHING. Every write/state-changing tool, " + "including the shell (`bash`/`python`), is disabled this turn and will be " + "rejected — only read-only tools remain available. Use the read-only tools " + "listed below (read files, search code, browse the project, web lookups) to " + "ground the plan. 
If the task is 'write a file', your plan is to DESCRIBE " + "writing it — you do NOT write it now.\n" + "\n" + "OUTPUT: present the plan as a GitHub-style checklist, one concrete step per line:\n" + "- [ ] first action you will take once approved\n" + "- [ ] next action\n" + "Each item = one concrete action (file to create/edit, command to run, side " + "effect). Do not execute. Do not end with 'Done' or anything implying the work " + "is finished. End your turn with the checklist." +) + + +def build_active_plan_note(approved_plan: str) -> str: + """System note that pins an approved plan during execution. + + Sent back by the frontend each turn so a long plan on a weak model survives + history truncation — the agent can always re-read it. Returns "" for empty + input. + """ + if not approved_plan or not approved_plan.strip(): + return "" + return ( + "## ACTIVE PLAN (approved — execute this)\n" + "You are executing a plan the user already approved. THE FULL PLAN IS " + "BELOW — it is always provided here every turn. Do NOT say you lost it, " + "and do NOT look for it in tasks, notes, memory, files, or the API; just " + "read it below. Work through it IN ORDER. After finishing each step, call " + "the `update_plan` tool with the full checklist and that step marked " + "`- [x]` so progress stays visible in the user's plan window. If the user " + "asks to change the plan, call `update_plan` with the revised checklist. " + "Do the next unchecked item until all are done. 
Do not skip, reorder, or " + "invent steps; if a step is genuinely impossible, say so and stop.\n\n" + "Current plan:\n" + + approved_plan.strip() + ) + + async def stream_agent_loop( endpoint_url: str, model: str, @@ -1390,6 +1438,8 @@ async def stream_agent_loop( relevant_tools: Optional[Set[str]] = None, fallbacks: Optional[List[tuple]] = None, workspace: Optional[str] = None, + plan_mode: bool = False, + approved_plan: Optional[str] = None, _is_teacher_run: bool = False, ) -> AsyncGenerator[str, None]: """Streaming agent loop generator. @@ -1413,6 +1463,13 @@ async def stream_agent_loop( # public/non-admin users rather than trying to enumerate every tool. mcp_mgr = None + if plan_mode: + # Plan mode: investigate read-only, propose a plan, don't execute. The + # route also unions the read-only-disabled set, but enforce here too so + # the loop is safe regardless of caller. MCP stays available but is + # filtered to read-only tools below (after the disabled map is loaded). + disabled_tools.update(plan_mode_disabled_tools()) + _t0 = time.time() _needs_admin = _detect_admin_intent(messages) _last_user = _extract_last_user_message(messages) @@ -1420,6 +1477,13 @@ async def stream_agent_loop( # not just the latest message, so short follow-ups don't drop just-used tools. _retrieval_query = _recent_context_for_retrieval(messages) or _last_user _mcp_disabled_map = _load_mcp_disabled_map() if mcp_mgr else {} + if plan_mode and mcp_mgr: + # Allow read-only MCP tools to investigate, block write/unknown ones: + # hide them from the schemas AND reject them at runtime by qualified name. + _mcp_block_map, _mcp_block_q = mcp_mgr.plan_mode_blocked_mcp() + for _sid, _names in _mcp_block_map.items(): + _mcp_disabled_map.setdefault(_sid, set()).update(_names) + disabled_tools.update(_mcp_block_q) prep_timings["request_setup"] = time.time() - _t0 # RAG-based tool selection: retrieve relevant tools for this query. 
@@ -1577,6 +1641,27 @@ async def stream_agent_loop( else: messages.insert(0, {"role": "system", "content": _ws_note}) logger.info("[workspace] active for this turn: %s", workspace) + if plan_mode: + # Steer the model to investigate-then-propose. Hard tool gating already + # blocks every mutating tool (bash/python included); this directive keeps + # the model from CLAIMING work it has not done, so it must DOMINATE. Put + # it at the very TOP of the system prompt (the base prompt is large and + # action-oriented — appending buried it, and small models ignored it). + if messages and messages[0].get("role") == "system": + messages[0]["content"] = PLAN_MODE_DIRECTIVE + "\n\n" + (messages[0].get("content") or "") + else: + messages.insert(0, {"role": "system", "content": PLAN_MODE_DIRECTIVE}) + elif approved_plan and approved_plan.strip(): + # EXECUTING an approved plan. Pin the checklist as a top-of-context + # system note so a long plan on a weak model survives history + # truncation — the agent can always re-read the plan instead of losing + # the thread. (The first system message is kept by the context trimmer.) + _plan_note = build_active_plan_note(approved_plan) + if messages and messages[0].get("role") == "system": + messages[0]["content"] = _plan_note + "\n\n" + (messages[0].get("content") or "") + else: + messages.insert(0, {"role": "system", "content": _plan_note}) + logger.info("[plan] pinned approved plan (%d chars) for execution turn", len(approved_plan)) prep_timings["prompt_build"] = time.time() - _t2 _t3 = time.time() @@ -2287,6 +2372,14 @@ async def stream_agent_loop( ) _awaiting_user = True + # update_plan: agent wrote back to the plan (ticked a step / revised). + # Push it to the frontend so the stored plan + docked window update + # live. Does NOT end the turn — the agent keeps working. + if "plan_update" in result: + yield ( + f'data: {json.dumps({"type": "plan_update", "data": result["plan_update"]})}\n\n' + ) + # Build output for frontend tool bubble. 
# Document tools get a short summary — content goes to the editor panel. output_text = "" diff --git a/src/agent_tools.py b/src/agent_tools.py index c7c2a3636..41f0411ce 100644 --- a/src/agent_tools.py +++ b/src/agent_tools.py @@ -34,7 +34,7 @@ TOOL_TAGS = {"bash", "python", "web_search", "web_fetch", "read_file", "write_fi "send_to_session", "pipeline", "manage_session", "manage_memory", "list_models", - "ui_control", "generate_image", "ask_user", + "ui_control", "generate_image", "ask_user", "update_plan", "manage_tasks", "api_call", "ask_teacher", "manage_skills", "suggest_document", "manage_endpoints", "manage_mcp", "manage_webhooks", diff --git a/src/mcp_manager.py b/src/mcp_manager.py index 03bcf1839..29fdedebf 100644 --- a/src/mcp_manager.py +++ b/src/mcp_manager.py @@ -9,7 +9,7 @@ import json import logging import os import re -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Set, Tuple logger = logging.getLogger(__name__) @@ -90,6 +90,44 @@ def _format_mcp_params(input_schema: Any) -> str: return hint +# Tool-name prefixes that denote a read-only/inspection operation. Used to +# classify MCP tools for plan mode when the server provides no readOnlyHint. +# These are PREFIXES, not whole words (matched via str.startswith below), so a +# stem like "summar" intentionally covers "summarise"/"summarize"/"summary". +_MCP_READONLY_VERBS = ( + "list", "get", "read", "search", "fetch", "query", "find", "describe", + "show", "view", "lookup", "count", "status", "info", "inspect", "summar", +) + + +def mcp_tool_is_readonly(tool: Dict) -> bool: + """Classify an MCP tool as safe (non-mutating) for plan mode. + + Prefer the server's own annotations (readOnlyHint / destructiveHint). When + absent, fall back to a tool-name verb heuristic, and FAIL CLOSED (treat as + write) for anything that doesn't clearly read — plan mode must not run a + write tool just because its intent is ambiguous. 
+ """ + ann = tool.get("annotations") + # annotations may be a dict or a pydantic model + read_hint = None + destructive = None + if ann is not None: + if isinstance(ann, dict): + read_hint = ann.get("readOnlyHint") + destructive = ann.get("destructiveHint") + else: + read_hint = getattr(ann, "readOnlyHint", None) + destructive = getattr(ann, "destructiveHint", None) + if read_hint is True: + return True + if read_hint is False or destructive is True: + return False + # No usable hint — heuristic on the tool name's leading verb. + name = (tool.get("name") or "").lower() + return name.startswith(_MCP_READONLY_VERBS) + + class McpManager: """Manages MCP server connections and tool routing.""" @@ -170,6 +208,10 @@ class McpManager: "name": tool.name, "description": tool.description or "", "input_schema": tool.inputSchema if hasattr(tool, 'inputSchema') else {}, + # MCP tool annotations (readOnlyHint / destructiveHint) drive + # plan-mode read-only gating. Absent on many servers, so we + # fall back to a name heuristic in mcp_tool_is_readonly(). + "annotations": getattr(tool, 'annotations', None), }) self._sessions[server_id] = session @@ -227,6 +269,10 @@ class McpManager: "name": tool.name, "description": tool.description or "", "input_schema": tool.inputSchema if hasattr(tool, 'inputSchema') else {}, + # MCP tool annotations (readOnlyHint / destructiveHint) drive + # plan-mode read-only gating. Absent on many servers, so we + # fall back to a name heuristic in mcp_tool_is_readonly(). + "annotations": getattr(tool, 'annotations', None), }) self._sessions[server_id] = session @@ -537,6 +583,24 @@ class McpManager: }) return result + def plan_mode_blocked_mcp(self) -> Tuple[Dict[str, Set[str]], Set[str]]: + """Plan mode: block every MCP tool that isn't clearly read-only. + + Returns (disabled_map, qualified_names): + - disabled_map: {server_id: {tool_name, ...}} to hide write tools from + the prompt/schemas (merged into the existing mcp_disabled_map). 
+ - qualified_names: {"mcp__<server_id>__<tool_name>", ...} for runtime rejection + in execute_tool_block (which matches the qualified name). + """ + disabled_map: Dict[str, Set[str]] = {} + qualified: Set[str] = set() + for server_id, tools in self._tools.items(): + for tool in tools: + if not mcp_tool_is_readonly(tool): + disabled_map.setdefault(server_id, set()).add(tool["name"]) + qualified.add(f"mcp__{server_id}__{tool['name']}") + return disabled_map, qualified + def is_builtin(self, server_id: str) -> bool: """Check if a server is a built-in (auto-registered) server.""" return server_id.startswith("builtin_") or server_id in { diff --git a/src/tool_execution.py b/src/tool_execution.py index 9af6cce79..40bca4231 100644 --- a/src/tool_execution.py +++ b/src/tool_execution.py @@ -1263,6 +1263,41 @@ async def execute_tool_block( logger.info("Tool executed: %s (%d options, multi=%s)", desc, len(options), multi) return desc, result + # update_plan: the agent writes back to the active plan — tick an item done + # or revise steps (e.g. when the user asks to change something). Pure UI + # marker: returns a `plan_update` payload the agent loop turns into a + # `plan_update` SSE event; the frontend replaces the stored plan and refreshes + # the docked plan window. Does NOT end the turn. + if tool == "update_plan": + import json as _json + raw = (content or "").strip() + plan = "" + try: + parsed = _json.loads(raw) if raw else {} + except (ValueError, TypeError): + parsed = {} + if isinstance(parsed, dict) and parsed.get("plan"): + plan = str(parsed.get("plan", "")).strip() + else: + # Plain-string call (raw checklist) or JSON without a usable `plan`. 
+ plan = raw + if not plan: + return "update_plan: invalid", { + "error": "update_plan needs a non-empty `plan` (the full updated checklist as markdown).", + "exit_code": 1, + } + plan = plan[:8192] + done = plan.count("- [x]") + plan.count("- [X]") + total = done + plan.count("- [ ]") + desc = f"update_plan: {done}/{total} done" if total else "update_plan" + result = { + "plan_update": {"plan": plan}, + "output": f"Plan updated ({done}/{total} steps complete)." if total else "Plan updated.", + "exit_code": 0, + } + logger.info("Tool executed: %s", desc) + return desc, result + # Background execution: a `bash` block whose first line is the `#!bg` # marker runs DETACHED — returns a job id immediately so the chat stream # isn't held open for a multi-minute install/ffmpeg/download. The always-on diff --git a/src/tool_index.py b/src/tool_index.py index c6eea86c7..a56fa0574 100644 --- a/src/tool_index.py +++ b/src/tool_index.py @@ -55,6 +55,8 @@ ALWAYS_AVAILABLE = frozenset({ # Ask the user a multiple-choice question for a decision/clarification. # Always reachable so the agent can pause and ask at any point. "ask_user", + # Write back to the active plan (tick steps done / revise) during execution. + "update_plan", }) # Tools that the Personal Assistant always has access to during scheduled @@ -115,6 +117,7 @@ BUILTIN_TOOL_DESCRIPTIONS: Dict[str, str] = { "send_to_session": "Send a message to another chat. Cross-chat communication.", "search_chats": "Search through chat history across all sessions.", "ask_user": "Ask the user a multiple-choice question to get a decision or clarification. Use this when the task is genuinely ambiguous and the answer changes what you do next — pick between approaches, confirm an assumption, choose among options — instead of guessing. Provide a clear `question` and 2-6 `options` (each with a short `label`, optional `description`). Calling this ENDS your turn: the user sees clickable buttons and their choice arrives as your next message. 
Don't use it for things you can decide from context or sensible defaults, or for irreversible-action confirmation if a dedicated flow exists.", + "update_plan": "Write back to the ACTIVE PLAN while executing an approved plan: mark steps done or revise them. After finishing a step call this with the full checklist and that step marked done; when the user asks to change the plan call it with the revised checklist. Always pass the COMPLETE markdown checklist (`- [ ]` / `- [x]`), not a diff. The user's docked plan window updates live. No effect when there is no active plan.", "ui_control": "Control the UI and toggle tools on/off. Use this to turn off / turn on / disable / enable individual tools and features: shell (bash), search (web), research, browser, documents, incognito. Open panels (documents library, gallery, email inbox, sessions, notes, memories/brain, skills, settings, cookbook) via `open_panel `. Use `open_email_reply reply` to open an email reply draft document without sending. Also switches between chat/agent modes, changes the current model, and applies/creates themes.", "list_email_accounts": "List configured email accounts and default status. Use before reading or sending mail when the user mentions Gmail, work mail, custom domain mail, another mailbox, or asks to compare/check multiple inboxes.", "list_emails": "List emails for a folder/account, newest first, including read messages by default. Shows subject, sender, date, UID, account, and AI summary. Check inbox, find emails needing replies. Supports account from list_email_accounts for Gmail/work/custom mailboxes. 
For last/latest/newest email, use max_results=1 and unread_only=false.", diff --git a/src/tool_schemas.py b/src/tool_schemas.py index 7c6a63953..3138d606c 100644 --- a/src/tool_schemas.py +++ b/src/tool_schemas.py @@ -474,6 +474,20 @@ FUNCTION_TOOL_SCHEMAS = [ } } }, + { + "type": "function", + "function": { + "name": "update_plan", + "description": "Write back to the ACTIVE PLAN: mark steps done or revise them. Use this while executing an approved plan — after you finish a step, call update_plan with the full checklist and that step marked `- [x]`; when the user asks to change the plan, call it with the revised checklist. The user's docked plan window updates live. Pass the COMPLETE checklist every time (not a diff). No effect if there is no active plan.", + "parameters": { + "type": "object", + "properties": { + "plan": {"type": "string", "description": "The full updated plan as a GitHub-style markdown checklist — one step per line, `- [ ]` for pending and `- [x]` for done. Always send the whole list."} + }, + "required": ["plan"] + } + } + }, { "type": "function", "function": { diff --git a/src/tool_security.py b/src/tool_security.py index 8ffa50f9b..82d2c3d67 100644 --- a/src/tool_security.py +++ b/src/tool_security.py @@ -51,6 +51,101 @@ NON_ADMIN_BLOCKED_TOOLS = { } +# Plan mode: the agent may investigate but must not mutate anything. Only these +# read-only/inspection tools stay enabled; everything else (writes, sends, +# manage_*, model serving, MCP, etc.) is blocked. Allowlist rather than blocklist +# so any newly added tool defaults to BLOCKED in plan mode — fail safe. +# +# bash/python are deliberately NOT here: the shell can mutate (write files, hit +# the network) and can't be constrained to read-only at the tool layer, so plan +# mode blocks it outright rather than relying on a prompt to keep it well-behaved. +# Code/file discovery is covered by the dedicated read-only tools below +# (read_file, grep, glob, ls) instead of freestyle shell. 
+PLAN_MODE_READONLY_TOOLS = { + "read_file", + "grep", + "glob", + "ls", + "web_search", + "web_fetch", + "search_chats", + "list_models", + "list_sessions", + "list_emails", + "read_email", + "list_served_models", + "list_downloads", + "list_cached_models", + "search_hf_models", + "list_serve_presets", + "list_cookbook_servers", + "resolve_contact", + "chat_with_model", + "ask_teacher", +} + + +# The agent's tool gate is a DENYLIST: execute_tool_block blocks any tool whose +# name is in `disabled_tools`. Plan mode's policy is the opposite — an allowlist +# (PLAN_MODE_READONLY_TOOLS). To apply an allowlist through a denylist, plan mode +# returns the inverse: every known tool name minus the allowlist. +# +# Known tool names come from FUNCTION_TOOL_SCHEMAS, but that source is imperfect: +# some tools are only XML-invocable (e.g. manage_notes, generate_image) and never +# appear there, and the import can fail outright. Either gap would drop a mutating +# tool from the subtraction and silently leave it enabled. This set is the static +# backstop for both: union it in so known mutators are always subtracted, and so a +# failed import still blocks them (fail closed, never open). Only mutators belong +# here — read-only tools are covered by the allowlist. Keep in sync when adding +# new mutating tools. 
+_PLAN_MODE_KNOWN_MUTATORS = { + "write_file", "create_document", "edit_document", "update_document", + "suggest_document", "manage_documents", "create_session", "manage_session", + "send_to_session", "pipeline", "manage_memory", "manage_skills", + "manage_tasks", "manage_notes", "manage_endpoints", "manage_mcp", + "manage_webhooks", "manage_tokens", "manage_settings", "manage_contact", + "manage_calendar", "api_call", "app_api", "ui_control", + "send_email", "reply_to_email", "bulk_email", "delete_email", + "archive_email", "mark_email_read", "download_model", "serve_model", + "stop_served_model", "cancel_download", "adopt_served_model", "serve_preset", + "generate_image", "edit_image", "trigger_research", "manage_research", + # Shell is never read-only-safe; block it explicitly so it stays out of plan + # mode even if the schema list fails to load. + "bash", "python", +} + + +def plan_mode_disabled_tools() -> Set[str]: + """Tool names to add to the denylist in plan mode. + + Plan mode allows only PLAN_MODE_READONLY_TOOLS. The gate is a denylist, so + return the inverse: every known tool name minus the allowlist. Known names + come from the function-tool schemas, backstopped by _PLAN_MODE_KNOWN_MUTATORS + (see above) so XML-only tools and a failed schema import can't leave a mutator + enabled. MCP tools are handled separately — the agent loop blocks every + non-read-only MCP tool via McpManager.plan_mode_blocked_mcp().""" + try: + # agent_tools / tool_parsing / tool_schemas form a mutually-circular + # cluster that only resolves cleanly when entered via agent_tools. + # Import it first so the lazy schema import works even from a cold + # import (e.g. tests) — not just after the app has wired everything up. 
+ import src.agent_tools # noqa: F401 + from src.tool_schemas import FUNCTION_TOOL_SCHEMAS + + all_names = { + (t.get("function") or {}).get("name") + for t in FUNCTION_TOOL_SCHEMAS + } + all_names.discard(None) + except Exception as exc: + logger.warning("Unable to load tool schemas for plan-mode gating: %s", exc) + all_names = set() + # Subtract the allowlist from all known tool names (schema-derived plus the + # static mutator backstop). Fail closed: if the schema import failed above, + # the backstop alone still blocks known mutators. + return (all_names | _PLAN_MODE_KNOWN_MUTATORS) - PLAN_MODE_READONLY_TOOLS + + def is_public_blocked_tool(tool_name: Optional[str]) -> bool: """Return True when a non-admin/public user must not execute this tool. diff --git a/static/app.js b/static/app.js index 08ab12161..5621ef7dd 100644 --- a/static/app.js +++ b/static/app.js @@ -1555,6 +1555,7 @@ function initializeEventListeners() { const MODE_TOOLS = [ { btnId: 'web-toggle-btn', checkboxId: 'web-toggle', stateKey: 'web' }, { btnId: 'bash-toggle-btn', checkboxId: 'bash-toggle', stateKey: 'bash' }, + { btnId: 'plan-toggle-btn', checkboxId: 'plan-toggle', stateKey: 'plan' }, ]; function _modeKey(stateKey, mode) { return `${stateKey}_${mode}`; } @@ -1563,6 +1564,9 @@ function initializeEventListeners() { const state = loadToggleState(); const key = _modeKey(stateKey, mode); if (Object.prototype.hasOwnProperty.call(state, key)) return !!state[key]; + // Plan mode is opt-in: never default it on, otherwise every agent turn + // would be forced into planning. 
+ if (stateKey === 'plan') return false; return mode === 'agent'; // default: ON in agent, OFF in chat } @@ -1575,6 +1579,7 @@ function initializeEventListeners() { const TOOL_TOGGLE_TOAST_LABELS = { web: 'Web search', bash: 'Shell', + plan: 'Plan mode', }; function showToolToggleToast(stateKey, active) { @@ -1688,6 +1693,81 @@ function initializeEventListeners() { } setupToggle('web-toggle-btn', 'web-toggle', 'web'); setupToggle('bash-toggle-btn', 'bash-toggle', 'bash'); + try { workspaceModule.initWorkspace(); } catch (_) {} + setupToggle('plan-toggle-btn', 'plan-toggle', 'plan'); + + // Set plan mode on/off directly (checkbox + button state + saved pref) WITHOUT + // going through the button's click handler — used by the plan menu and by the + // "Approve & Run" flow. Going through .click() would hit the plan-menu + // intercept below (a stored plan re-opens the menu instead of toggling), which + // is exactly the bug that left approved plans stuck in plan mode. + function _setPlanMode(on) { + const btn = el('plan-toggle-btn'); + const chk = el('plan-toggle'); + const mode = (loadToggleState().mode) || 'chat'; + if (chk) chk.checked = !!on; + if (btn) { btn.classList.toggle('active', !!on); btn.setAttribute('aria-pressed', String(!!on)); } + saveToolPref('plan', mode, !!on); + } + window._setPlanMode = _setPlanMode; + + // ── Plan-button menu ── + // When a plan exists for this chat, clicking the plan button opens a small + // menu (Show plan / Plan mode on-off) instead of plain-toggling — so the plan + // window can be re-opened and docked at any time while the agent works. With + // no plan, the button behaves as before (one-click toggle). 
+ (function initPlanMenu() { + const planBtn = el('plan-toggle-btn'); + if (!planBtn) return; + const _hasPlan = () => { try { return !!(window._getStoredPlan && window._getStoredPlan()); } catch (_) { return false; } }; + const _close = () => { const m = document.getElementById('plan-menu'); if (m) m.remove(); }; + function _open() { + _close(); + const planChk = el('plan-toggle'); + const on = !!(planChk && planChk.checked); + const menu = document.createElement('div'); + menu.id = 'plan-menu'; + menu.className = 'overflow-menu plan-menu'; + menu.innerHTML = + '' + + ''; + document.body.appendChild(menu); + const r = planBtn.getBoundingClientRect(); + menu.style.position = 'fixed'; + menu.style.left = Math.round(r.left) + 'px'; + menu.style.top = Math.round(r.top - menu.offsetHeight - 6) + 'px'; + menu.querySelector('[data-act="show"]').addEventListener('click', () => { + _close(); + const txt = window._getStoredPlan ? window._getStoredPlan() : ''; + if (txt && window.planWindowModule) window.planWindowModule.openPlanWindow(txt, null); + }); + menu.querySelector('[data-act="toggle"]').addEventListener('click', () => { + _close(); + _setPlanMode(!on); // flip state directly (no click → no menu re-open) + }); + // Dismiss on any outside click (capture so it beats other handlers) / Escape. + setTimeout(() => { + const off = (e) => { + if (!menu.contains(e.target) && e.target !== planBtn) { + _close(); document.removeEventListener('click', off, true); document.removeEventListener('keydown', esc, true); + } + }; + const esc = (e) => { if (e.key === 'Escape') { _close(); document.removeEventListener('click', off, true); document.removeEventListener('keydown', esc, true); } }; + document.addEventListener('click', off, true); + document.addEventListener('keydown', esc, true); + }, 0); + } + planBtn.addEventListener('click', (e) => { + // With a stored plan, the button opens the menu (Show plan / toggle). + // Without one, it falls through to the normal one-click toggle. 
+ if (_hasPlan()) { e.preventDefault(); e.stopImmediatePropagation(); _open(); } + }, true); // capture phase: intercept before setupToggle's bubble handler + })(); + try { workspaceModule.initWorkspace(); } catch (_) {} // Document editor toggle (special: uses module panel, not a checkbox) diff --git a/static/index.html b/static/index.html index 3916cca53..22cdfdaae 100644 --- a/static/index.html +++ b/static/index.html @@ -1076,6 +1076,12 @@ + + + + + + `; + document.body.appendChild(_modal); + _modal.querySelector('#plan-window-close').addEventListener('click', closePlanWindow); + _modal.querySelector('#plan-window-approve').addEventListener('click', () => { + const cb = _onApprove; + closePlanWindow(); + if (typeof cb === 'function') cb(); + }); + // Draggable + side-dockable, same one-call helper as the other windows. + const content = _modal.querySelector('.modal-content'); + const header = _modal.querySelector('.modal-header'); + if (content && header) makeWindowDraggable(_modal, { content, header }); + return _modal; +} + +/** + * Open the plan window with rendered markdown and an approve callback. + * @param {string} planMarkdown - the agent's proposed plan (raw markdown) + * @param {Function} onApprove - called when the user clicks Approve & Run + */ +export function openPlanWindow(planMarkdown, onApprove) { + const modal = _getModal(); + _onApprove = onApprove || null; + const body = modal.querySelector('#plan-window-body'); + if (body) { + body.innerHTML = markdownModule.processWithThinking( + markdownModule.squashOutsideCode(planMarkdown || '') + ); + if (window.hljs) body.querySelectorAll('pre code').forEach((b) => window.hljs.highlightElement(b)); + } + const approveBtn = modal.querySelector('#plan-window-approve'); + if (approveBtn) approveBtn.style.display = onApprove ? '' : 'none'; + // Title reflects state: still awaiting approval (approve callback present) vs + // already approved and being executed. 
+ const title = modal.querySelector('#plan-window-title'); + if (title) title.textContent = onApprove ? 'Proposed plan' : 'Approved plan'; + modal.style.display = 'flex'; + if (uiModule && uiModule.scrollHistory) { try { uiModule.scrollHistory(); } catch (_) {} } +} + +export function closePlanWindow() { + if (_modal) _modal.style.display = 'none'; +} + +/** True when the plan window is currently visible (for live-refresh on progress). */ +export function isPlanWindowOpen() { + return !!(_modal && _modal.style.display !== 'none'); +} + +export default { openPlanWindow, closePlanWindow, isPlanWindowOpen }; diff --git a/static/js/slashCommands.js b/static/js/slashCommands.js index 0f3a72052..1a11454bf 100644 --- a/static/js/slashCommands.js +++ b/static/js/slashCommands.js @@ -1170,6 +1170,22 @@ async function _cmdWorkspace(args, ctx) { slashReply('Usage: /workspace · set /path · clear · pick'); return true; } +// Plan mode: drive the real toggle pill (#plan-toggle-btn) so its per-mode +// persistence/UI logic runs. Only meaningful in agent mode. +async function _cmdTogglePlan(args, ctx) { + const btn = document.getElementById('plan-toggle-btn'); + const chk = document.getElementById('plan-toggle'); + if (!btn || btn.style.display === 'none' || btn.offsetParent === null) { + slashReply('Plan mode is only available in agent mode — switch to Agent first.'); + return true; + } + const cur = !!(chk && chk.checked); + const v = (args[0] || '').toLowerCase(); + const target = v === 'on' ? true : v === 'off' ? false : !cur; + if (target !== cur) btn.click(); + slashReply(`Plan mode: ${target ? 
'on' : 'off'}`); + return true; +} async function _cmdToggleShow(args, ctx) { const name = (args[0] || '').toLowerCase(); @@ -5489,6 +5505,7 @@ const COMMANDS = { 'bash': { handler: _cmdToggleBash, alias: ['b','shell'], help: 'Toggle bash/shell', usage: '/toggle bash' }, 'research': { handler: _cmdToggleResearch, alias: ['r'], help: 'Toggle deep research', usage: '/toggle research' }, 'doc': { handler: _cmdToggleDoc, alias: [], help: 'Toggle document editor', usage: '/toggle doc' }, + 'plan': { handler: _cmdTogglePlan, alias: ['p'], help: 'Toggle plan mode (agent)', usage: '/toggle plan' }, 'sidebar': { handler: _cmdToggleSidebar, alias: ['sb'], help: 'Cycle sidebar (full/mini/off)', usage: '/toggle sidebar [1|2|3]' }, '_show': { handler: _cmdToggleShow, alias: [], help: 'Show all toggle states', usage: '/toggle' } } @@ -5501,6 +5518,13 @@ const COMMANDS = { noUserBubble: true, usage: '/workspace [set | clear | pick]', }, + plan: { + alias: [], + category: 'Quick toggles', + help: 'Toggle plan mode (agent)', + handler: _cmdTogglePlan, + usage: '/plan [on|off]', + }, memory: { alias: ['m'], category: 'Memory', diff --git a/static/js/storage.js b/static/js/storage.js index 7ff9c6bd5..06b4d5430 100644 --- a/static/js/storage.js +++ b/static/js/storage.js @@ -24,7 +24,8 @@ export const KEYS = { SECTION_ORDER: 'sidebar-section-order', ADMIN_LAST_TAB: 'admin-last-tab', DENSITY: 'odysseus-density', - WORKSPACE: 'odysseus-workspace' + WORKSPACE: 'odysseus-workspace', + PLAN: 'odysseus-plan' }; /** diff --git a/static/style.css b/static/style.css index 8243a0b14..2c79b51df 100644 --- a/static/style.css +++ b/static/style.css @@ -2305,6 +2305,104 @@ body.bg-pattern-sparkles { color: var(--fg); background: color-mix(in srgb, var(--fg) 9%, transparent); } + /* Plan mode: "Approve & Run" affordance under a proposed plan */ + .plan-approve-bar { + margin: 8px 0 2px; + } + .plan-approve-btn { + font: inherit; + font-size: 13px; + font-weight: 600; + padding: 6px 14px; + 
border-radius: 8px; + cursor: pointer; + color: var(--accent); + background: color-mix(in srgb, var(--accent) 12%, transparent); + border: 1px solid var(--accent); + transition: background 0.15s, transform 0.1s; + } + .plan-approve-btn:hover { + background: color-mix(in srgb, var(--accent) 22%, transparent); + } + .plan-approve-btn:active { + transform: scale(0.97); + } + .plan-approve-bar { + display: flex; + gap: 8px; + align-items: center; + } + .plan-open-btn { + font: inherit; + font-size: 13px; + padding: 6px 12px; + border-radius: 8px; + cursor: pointer; + color: var(--fg); + background: color-mix(in srgb, var(--fg) 8%, transparent); + border: 1px solid color-mix(in srgb, var(--fg) 22%, transparent); + transition: background 0.15s; + } + .plan-open-btn:hover { + background: color-mix(in srgb, var(--fg) 15%, transparent); + } + /* GitHub-style task lists (- [ ] / - [x]) — used by plan-mode checklists */ + li.task-item { + list-style: none; + margin-left: -1.2em; + display: flex; + align-items: flex-start; + gap: 8px; + } + li.task-item .task-check { + flex: 0 0 auto; + width: 15px; + height: 15px; + margin-top: 3px; + border-radius: 4px; + border: 1.5px solid color-mix(in srgb, var(--fg) 45%, transparent); + box-sizing: border-box; + position: relative; + } + li.task-item.task-done .task-check { + background: var(--accent); + border-color: var(--accent); + } + li.task-item.task-done .task-check::after { + content: ''; + position: absolute; + left: 4px; + top: 1px; + width: 4px; + height: 8px; + border: solid var(--bg); + border-width: 0 2px 2px 0; + transform: rotate(45deg); + } + li.task-item.task-done .task-text { + opacity: 0.6; + text-decoration: line-through; + } + /* Plan window: a draggable/dockable modal (shares .modal framework) */ + .plan-window-content { + width: 520px; + max-width: 92vw; + max-height: 80vh; + display: flex; + flex-direction: column; + } + .plan-window-body { + overflow-y: auto; + padding: 14px 18px; + flex: 1 1 auto; + 
line-height: 1.55; + } + .plan-window-footer { + padding: 10px 18px; + border-top: 1px solid color-mix(in srgb, var(--fg) 12%, transparent); + display: flex; + justify-content: flex-end; + } /* While the menu is open the chevron stays in its highlighted state — don't run the opacity fade transition so we never flash from 0.5 → hover-1.0 → drop-back. The state holds steady. */ diff --git a/tests/test_plan_mode.py b/tests/test_plan_mode.py new file mode 100644 index 000000000..cfca83146 --- /dev/null +++ b/tests/test_plan_mode.py @@ -0,0 +1,104 @@ +"""Plan mode gating regression tests. + +Plan mode restricts the agent to read-only/inspection tools so it can investigate +and propose a plan without mutating anything. These pin the security-relevant +contract: + +- The read-only allowlist contains only inspection tools (no writes/sends/manage_*). +- `plan_mode_disabled_tools()` blocks every mutating tool and never blocks an + allowlisted one. +- It fails CLOSED: if the tool-schema list can't be loaded, it still blocks a + known-mutating set rather than returning nothing (which would allow mutations). + +Pure-function tests — no FastAPI app boot, no DB. +""" + +from src.tool_security import ( + PLAN_MODE_READONLY_TOOLS, + _PLAN_MODE_KNOWN_MUTATORS, + plan_mode_disabled_tools, +) + + +def test_allowlist_has_no_obvious_mutating_tools(): + # Sanity: the read-only allowlist must not contain mutating/external tools. + mutating_markers = ("write_", "send_", "manage_", "create_", "edit_", "delete_") + for name in PLAN_MODE_READONLY_TOOLS: + assert not name.startswith(mutating_markers), f"{name} should not be read-only" + + +def test_plan_mode_blocks_mutating_tools(): + disabled = plan_mode_disabled_tools() + # A representative spread of mutating/external tools must be blocked. 
+ for name in ( + "write_file", "send_email", "reply_to_email", "manage_memory", + "manage_settings", "create_document", "edit_document", "download_model", + "generate_image", "trigger_research", + ): + assert name in disabled, f"{name} must be blocked in plan mode" + + +def test_plan_mode_allows_readonly_tools(): + disabled = plan_mode_disabled_tools() + # Read-only investigation tools stay enabled, including the discovery tools + # (grep/glob/ls) that replace freestyle shell. + for name in ("read_file", "grep", "glob", "ls", "web_search", "web_fetch", "search_chats"): + assert name not in disabled, f"{name} should be usable in plan mode" + + +def test_plan_mode_blocks_shell(): + # bash/python can mutate and can't be constrained read-only, so plan mode + # must block them (the whole point of dropping shell from plan mode). + disabled = plan_mode_disabled_tools() + for name in ("bash", "python"): + assert name in disabled, f"{name} must be blocked in plan mode" + + +def test_disabled_never_intersects_allowlist(): + assert plan_mode_disabled_tools() & PLAN_MODE_READONLY_TOOLS == set() + + +def test_mcp_readonly_classification(): + from src.mcp_manager import mcp_tool_is_readonly as ro + # Server-provided hints win over the name heuristic. + assert ro({"name": "zap", "annotations": {"readOnlyHint": True}}) is True + assert ro({"name": "list_things", "annotations": {"readOnlyHint": False}}) is False + assert ro({"name": "get_x", "annotations": {"destructiveHint": True}}) is False + # No hint → leading-verb heuristic, fail closed for ambiguous names. + assert ro({"name": "list_files"}) is True + assert ro({"name": "search_docs"}) is True + assert ro({"name": "send_message"}) is False + assert ro({"name": "frobnicate"}) is False + + +def test_fail_closed_fallback_blocks_mutations(monkeypatch): + # If the schema list can't load, we must still block (fail closed), not + # return an empty set that would silently allow every mutating tool. 
+ import src.tool_security as ts + + def _boom(): + raise ImportError("simulated circular import failure") + + # Force the dynamic path to fail by making the lazy import explode. + monkeypatch.setitem( + __import__("sys").modules, "src.agent_tools", None + ) + disabled = ts.plan_mode_disabled_tools() + assert disabled, "plan mode must never fail open (empty disabled set)" + assert "write_file" in disabled + assert "send_email" in disabled + assert disabled == set(_PLAN_MODE_KNOWN_MUTATORS) + + +def test_active_plan_note_pins_checklist(): + """The approved-plan note re-grounds execution so a long plan survives + history truncation (the agent can always re-read it).""" + from src.agent_loop import build_active_plan_note + plan = "- [ ] step one\n- [ ] step two" + note = build_active_plan_note(plan) + assert "ACTIVE PLAN" in note + assert plan in note # the actual checklist is embedded + assert "IN ORDER" in note # execution guidance present + # Empty input → no note (so we never inject a blank pin). + assert build_active_plan_note("") == "" + assert build_active_plan_note(" ") == "" diff --git a/tests/test_update_plan_tool.py b/tests/test_update_plan_tool.py new file mode 100644 index 000000000..cac58b21e --- /dev/null +++ b/tests/test_update_plan_tool.py @@ -0,0 +1,46 @@ +"""`update_plan` — the agent writes back to the active plan (tick done / revise). + +Pure UI-control marker: `execute_tool_block` returns a `plan_update` payload the +agent loop turns into a `plan_update` SSE event; the frontend replaces the stored +plan and refreshes the docked plan window. No I/O, does not end the turn. 
+""" +import asyncio +import json + +from src.agent_tools import ToolBlock, TOOL_TAGS # import first to avoid circular +from src.tool_execution import execute_tool_block +from src.tool_index import ALWAYS_AVAILABLE, BUILTIN_TOOL_DESCRIPTIONS +from src.tool_security import is_public_blocked_tool + + +def _run(content): + return asyncio.run(execute_tool_block(ToolBlock("update_plan", content))) + + +def test_valid_plan_returns_marker_and_counts(): + plan = "- [x] step one\n- [ ] step two\n- [ ] step three" + desc, result = _run(json.dumps({"plan": plan})) + assert result.get("exit_code") == 0 + assert result["plan_update"]["plan"] == plan + assert "1/3" in result["output"] # 1 done of 3 + + +def test_plain_string_accepted(): + plan = "- [ ] a\n- [x] b" + _, result = _run(plan) + assert result["plan_update"]["plan"] == plan + + +def test_empty_rejected(): + _, result = _run(json.dumps({"plan": " "})) + assert "error" in result and result.get("exit_code") == 1 + + +def test_registered_everywhere(): + assert "update_plan" in TOOL_TAGS + assert "update_plan" in ALWAYS_AVAILABLE + assert "update_plan" in BUILTIN_TOOL_DESCRIPTIONS + from src.tool_schemas import FUNCTION_TOOL_SCHEMAS + assert "update_plan" in {s["function"]["name"] for s in FUNCTION_TOOL_SCHEMAS} + # Not admin/public-gated — any user can drive their own plan. + assert is_public_blocked_tool("update_plan") is False