Cookbook model workflow fixes

2026-06-24 05:35:31 -04:00 · 2026-06-21 11:02:35 +00:00
parent 8c46172e87
commit c504214925
38 changed files with 3042 additions and 459 deletions
@@ -751,6 +751,17 @@ def _extract_last_user_message(messages: List[Dict]) -> str:


 _LOW_SIGNAL_RE = re.compile(r"^[\W_]*$", re.UNICODE)
+_CASUAL_OPENING_RE = re.compile(
+    r"^\s*(?:h+i+|hey+|hello+|yo+|sup+|what'?s up|wass?up|hiya|howdy|"
+    r"lol|lmao|haha+|hehe+|thanks?|thank you|ty|idk|dunno|meh|bruh|bro)\b(?P<tail>.*)$",
+    re.IGNORECASE,
+)
+_CASUAL_BLOCKLIST_RE = re.compile(
+    r"\b(?:cookbook|serve|serving|launch|start|vllm|sglang|llama\.?cpp|ollama|"
+    r"download|model|email|document|doc|note|calendar|task|search|web|research|"
+    r"file|folder|repo|git|settings?|endpoint|api|token|mcp)\b",
+    re.IGNORECASE,
+)
 _EXPLICIT_CONTINUATION_RE = re.compile(
    r"^\s*(?:"
    r"yes|y|yeah|yep|ok|okay|sure|do it|go ahead|continue|carry on|"
@@ -760,6 +771,17 @@ _EXPLICIT_CONTINUATION_RE = re.compile(
    r")\s*[.!?]*\s*$",
    re.IGNORECASE,
 )
+_RETRY_CONTINUATION_RE = re.compile(
+    r"\b(?:try again|retry|again|rerun|re-run|run it again|launch it again|"
+    r"start it again|failed|fails?|died|crashed|broke|insta|instantly)\b",
+    re.IGNORECASE,
+)
+_COOKBOOK_CONTEXT_RE = re.compile(
+    r"\b(?:cookbook|serve|serving|served|launch|start|preset|vllm|sglang|"
+    r"llama\.?cpp|ollama|download|cached models?|model servers?|running models?|"
+    r"gpu box|ajax|qwen|gemma|llama|mistral|minimax)\b",
+    re.IGNORECASE,
+)


 def _is_explicit_continuation(text: str) -> bool:
@@ -767,6 +789,37 @@ def _is_explicit_continuation(text: str) -> bool:
    return bool(_EXPLICIT_CONTINUATION_RE.match(str(text or "").strip()))


+def _is_casual_low_signal(text: str) -> bool:
+    """True for short greetings/slang that should not inherit stale context."""
+    s = str(text or "").strip()
+    m = _CASUAL_OPENING_RE.match(s)
+    if not m:
+        return False
+    tail = m.group("tail") or ""
+    if _CASUAL_BLOCKLIST_RE.search(tail):
+        return False
+    # Allow a short vocative/address after the opener without hardcoding the
+    # address term itself: "hey man", "yo dude", "sup <name>". Longer tails are
+    # more likely to be an actual request and should get normal context/tooling.
+    tail_words = re.findall(r"[A-Za-z0-9_'-]+", tail)
+    return len(tail_words) <= 2
+
+
+def _is_contextual_retry_continuation(messages: List[Dict], text: str) -> bool:
+    """Treat "try again / it failed" as a continuation only for active tool work.
+
+    These follow-ups are common after Cookbook launches: the latest user turn
+    says only "try again it failed", while the actionable model/host/command
+    details live one or two turns back. Keep this intentionally narrow so
+    ordinary chat does not inherit stale Cookbook context.
+    """
+    latest = str(text or "").strip()
+    if not latest or not _RETRY_CONTINUATION_RE.search(latest):
+        return False
+    recent = _recent_context_for_retrieval(messages, max_user=5, max_chars=1200)
+    return bool(_COOKBOOK_CONTEXT_RE.search(recent))
+
+
 def _assistant_requested_followup(messages: List[Dict]) -> bool:
    """True when the previous assistant turn asked for missing task details.

@@ -808,11 +861,12 @@ def _classify_agent_request(messages: List[Dict], last_user: str) -> Dict[str, o
    which domain rule packs get appended to the system prompt.
    """
    text = str(last_user or "").strip()
-    continuation = _is_explicit_continuation(text) or _assistant_requested_followup(messages)
+    retry_continuation = _is_contextual_retry_continuation(messages, text)
+    continuation = _is_explicit_continuation(text) or _assistant_requested_followup(messages) or retry_continuation
    retrieval_query = _recent_context_for_retrieval(messages) if continuation else text
    q = retrieval_query.lower()

-    if not text or bool(_LOW_SIGNAL_RE.match(text)):
+    if not text or bool(_LOW_SIGNAL_RE.match(text)) or _is_casual_low_signal(text):
        return {
            "low_signal": True,
            "continuation": False,
@@ -907,6 +961,7 @@ def _build_system_prompt(
    compact: bool = False,
    owner: Optional[str] = None,
    suppress_local_context: bool = False,
+    suppress_skills: bool = False,
    active_email: Optional[Dict[str, str]] = None,
 ) -> List[Dict]:
    """Build agent system prompt, inject MCP/document context, merge consecutive system msgs."""
@@ -924,7 +979,7 @@ def _build_system_prompt(
        _ov_sig = _hl.sha256(_json.dumps(get_builtin_overrides() or {}, sort_keys=True).encode()).hexdigest()
    except Exception:
        _ov_sig = ""
-    cache_key = (frozenset(disabled_tools or []), bool(mcp_mgr), needs_admin, _rt_key, compact, _ov_sig, owner, suppress_local_context)
+    cache_key = (frozenset(disabled_tools or []), bool(mcp_mgr), needs_admin, _rt_key, compact, _ov_sig, owner, suppress_local_context, suppress_skills)
    if _cached_base_prompt and _cached_base_prompt_key == cache_key and not active_document:
        agent_prompt = _cached_base_prompt
        # Skill index is user-editable (name + description), so it must never
@@ -934,6 +989,7 @@ def _build_system_prompt(
            disabled_tools, mcp_mgr, needs_admin, relevant_tools,
            mcp_disabled_map=mcp_disabled_map, compact=compact, owner=owner,
            suppress_local_context=suppress_local_context,
+            suppress_skills=suppress_skills,
        )
    else:
        agent_prompt, _skill_index_block = _build_base_prompt(
@@ -945,6 +1001,7 @@ def _build_system_prompt(
            compact=compact,
            owner=owner,
            suppress_local_context=suppress_local_context,
+            suppress_skills=suppress_skills,
        )
        if not active_document:
            _cached_base_prompt = agent_prompt
@@ -1228,7 +1285,7 @@ def _build_system_prompt(
    # few. If the teacher wrote a procedure for "open my X chat" last
    # time the student failed, this is where the student finds it
    # before deciding which tool to call.
-    if not suppress_local_context:
+    if not suppress_local_context and not suppress_skills:
        try:
            last_user = _extract_last_user_message(messages)
            # Respect the user's skills-enabled toggle (mirrors memory_enabled).
@@ -1395,6 +1452,7 @@ def _build_base_prompt(
    compact: bool = False,
    owner: Optional[str] = None,
    suppress_local_context: bool = False,
+    suppress_skills: bool = False,
 ):
    """Build the agent prompt with only relevant tools included.

@@ -1447,7 +1505,7 @@ def _build_base_prompt(
    # The caller wraps it in untrusted_context_message and ships it as a
    # user-role message — same treatment as the matched-skills block.
    skill_index_block = ""
-    if not suppress_local_context:
+    if not suppress_local_context and not suppress_skills:
        try:
            from services.memory.skills import SkillsManager
            from src.constants import DATA_DIR
@@ -1866,6 +1924,7 @@ async def stream_agent_loop(
    approved_plan: Optional[str] = None,
    tool_policy: Optional[ToolPolicy] = None,
    workspace: Optional[str] = None,
+    forced_tools: Optional[Set[str]] = None,
    _is_teacher_run: bool = False,
 ) -> AsyncGenerator[str, None]:
    """Streaming agent loop generator.
@@ -1905,6 +1964,18 @@ async def stream_agent_loop(
    _needs_admin = _detect_admin_intent(messages)
    _last_user = _extract_last_user_message(messages)
    _intent = _classify_agent_request(messages, _last_user)
+    _low_signal_turn = bool(_intent.get("low_signal"))
+    _casual_low_signal_turn = _is_casual_low_signal(_last_user)
+    _direct_low_signal = (
+        _low_signal_turn
+        and not bool(_intent.get("continuation"))
+        and not plan_mode
+        and not approved_plan
+        and (_casual_low_signal_turn or active_document is None)
+        and (_casual_low_signal_turn or not active_email)
+        and (_casual_low_signal_turn or not workspace)
+        and not forced_tools
+    )
    # Tool retrieval uses the latest message by default. It may inherit recent
    # user turns only for explicit continuations ("yes", "do it", "1").
    _retrieval_query = str(_intent.get("retrieval_query") or _last_user)
@@ -1912,11 +1983,86 @@ async def stream_agent_loop(
        "[agent-intent] latest=%r continuation=%s low_signal=%s domains=%s retrieval_query=%r",
        _last_user[:120],
        bool(_intent.get("continuation")),
-        bool(_intent.get("low_signal")),
+        _low_signal_turn,
        sorted(_intent.get("domains") or []),
        _retrieval_query[:200],
    )
    _mcp_disabled_map = _load_mcp_disabled_map() if mcp_mgr else {}
+    if _direct_low_signal:
+        logger.info("[agent] direct low-signal reply path for latest=%r", _last_user[:80])
+        direct_messages = [{"role": "user", "content": _last_user}]
+        direct_response = ""
+        direct_start = time.time()
+        direct_actual_model = model
+        real_input_tokens = 0
+        real_output_tokens = 0
+        try:
+            async for chunk in stream_llm_with_fallback(
+                [(endpoint_url, model, headers)] + list(fallbacks or []),
+                direct_messages,
+                temperature=temperature,
+                max_tokens=min(max_tokens or 128, 128),
+                prompt_type=None,
+                tools=None,
+                timeout=int(get_setting("agent_stream_timeout_seconds", 300) or 300),
+                session_id=session_id,
+            ):
+                if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
+                    try:
+                        data = json.loads(chunk[6:])
+                    except json.JSONDecodeError:
+                        yield chunk
+                        continue
+                    if data.get("type") == "usage":
+                        usage = data.get("data", {}) or {}
+                        direct_actual_model = usage.get("model") or direct_actual_model
+                        real_input_tokens += usage.get("input_tokens", 0) or 0
+                        real_output_tokens += usage.get("output_tokens", 0) or 0
+                        continue
+                    if data.get("type") == "model_actual":
+                        direct_actual_model = data.get("model") or direct_actual_model
+                        data["requested_model"] = model
+                        yield f"data: {json.dumps(data)}\n\n"
+                        continue
+                    if data.get("type") == "fallback":
+                        direct_actual_model = data.get("answered_by") or direct_actual_model
+                        yield chunk
+                        continue
+                    if "delta" in data:
+                        if not data.get("thinking"):
+                            direct_response += data.get("delta", "")
+                        yield chunk
+                        continue
+                    yield chunk
+                elif chunk.startswith("event: "):
+                    yield chunk
+        except Exception as _direct_err:
+            logger.warning("[agent] direct low-signal path failed: %s", _direct_err)
+            fallback = "Hey."
+            direct_response += fallback
+            yield f"data: {json.dumps({'delta': fallback})}\n\n"
+
+        if not direct_response.strip():
+            fallback = "Hey."
+            direct_response = fallback
+            yield f"data: {json.dumps({'delta': fallback})}\n\n"
+
+        duration = time.time() - direct_start
+        metrics = {
+            "model": direct_actual_model,
+            "requested_model": model,
+            "input_tokens": real_input_tokens or estimate_tokens(direct_messages),
+            "output_tokens": real_output_tokens or max(len(direct_response) // 4, 1),
+            "total_time": round(duration, 2),
+            "response_time": round(duration, 2),
+            "agent_rounds": 0,
+            "tool_calls": 0,
+            "direct_low_signal": True,
+        }
+        yield f"data: {json.dumps({'type': 'metrics', 'data': metrics})}\n\n"
+        yield "data: [DONE]\n\n"
+        return
+
    if plan_mode and mcp_mgr:
        # Allow read-only MCP tools to investigate, block write/unknown ones:
        # hide them from the schemas AND reject them at runtime by qualified name.
@@ -1932,7 +2078,7 @@ async def stream_agent_loop(
    _t1 = time.time()
    if _relevant_tools:
        logger.info(f"[tool-rag] Using caller-provided relevant_tools ({len(_relevant_tools)} tools)")
-    if not guide_only and not _relevant_tools and bool(_intent.get("low_signal")):
+    if not guide_only and not _relevant_tools and _low_signal_turn:
        from src.tool_index import ALWAYS_AVAILABLE
        if workspace:
            # An active workspace IS the file-work signal: a vague "look at the
@@ -2023,6 +2169,15 @@ async def stream_agent_loop(
    if _relevant_tools is not None and active_document is not None:
        _relevant_tools.update({"edit_document", "update_document", "suggest_document"})

+    # Per-request UI toggles are stronger than retrieval. If the user turns on
+    # Search, the model must see the search tools even when the latest text is a
+    # typo or otherwise low-signal for tool RAG.
+    if not guide_only and forced_tools:
+        if _relevant_tools is None:
+            from src.tool_index import ALWAYS_AVAILABLE
+            _relevant_tools = set(ALWAYS_AVAILABLE)
+        _relevant_tools.update(t for t in forced_tools if t not in disabled_tools)
+
    # The skill index injected by _build_system_prompt tells the model to
    # call `manage_skills action=view`, and Jaccard-matched skills are pasted
    # into the prompt as procedures to follow — but neither path goes through
@@ -2030,7 +2185,7 @@ async def stream_agent_loop(
    # (grep, read_file, ...) that aren't in its schema list. Keep the schemas
    # in lockstep: manage_skills is callable whenever any skill is indexed,
    # and a matched skill's declared requires_toolsets ride along with it.
-    if not guide_only and _relevant_tools is not None:
+    if not guide_only and _relevant_tools is not None and not _low_signal_turn:
        try:
            from services.memory.skills import SkillsManager
            from src.constants import DATA_DIR
@@ -2147,6 +2302,7 @@ async def stream_agent_loop(
        compact=_compact_agent_prompt,
        owner=owner,
        suppress_local_context=guide_only,
+        suppress_skills=_low_signal_turn,
        active_email=active_email,
    )
    if plan_mode and not guide_only:
@@ -2753,6 +2909,15 @@ async def stream_agent_loop(
                _intent_nudge_count += 1
                _matched_phrase = _intent_match.group(0).strip()
                logger.info(f"[agent] intent-without-action nudge #{_intent_nudge_count} on round {round_num}: {_matched_phrase!r}")
+                _lower_phrase = _matched_phrase.lower()
+                _cookbook_log_hint = ""
+                if any(_word in _lower_phrase for _word in ("log", "logs", "output", "tail", "status")):
+                    _cookbook_log_hint = (
+                        " If this is about a Cookbook/model serve, the concrete calls are: "
+                        "`list_served_models` first, then `tail_serve_output` with the "
+                        "session_id from the serve/list result. Never answer with "
+                        "\"check logs\" when those tools are available."
+                    )
                messages.append({
                    "role": "system",
                    "content": (
@@ -2761,6 +2926,7 @@ async def stream_agent_loop(
                        "see you announced the action but didn't run it, which "
                        "is the most frustrating thing you can do. "
                        "DO IT NOW: emit the actual function call this turn. "
+                        f"{_cookbook_log_hint}"
                        "If you decided not to do it after all, say so plainly in "
                        "one sentence instead of restating the plan."
                    ),
@@ -7,6 +7,7 @@ from src.constants import MAX_OUTPUT_CHARS
 class WebSearchTool:
    async def execute(self, content: str, ctx: dict) -> dict:
        from src.search import comprehensive_web_search
+        progress_cb = ctx.get("progress_cb") if isinstance(ctx, dict) else None
        raw = content.strip()
        query = raw
        time_filter = None
@@ -37,18 +38,39 @@ class WebSearchTool:
            elif " news" in q_lc or q_lc.startswith("news ") or q_lc.endswith(" news"):
                time_filter = "week"
        loop = asyncio.get_running_loop()
-        text, sources = await asyncio.wait_for(
-            loop.run_in_executor(
-                None,
-                lambda: comprehensive_web_search(
-                    query,
-                    max_pages=max_pages,
-                    time_filter=time_filter,
-                    return_sources=True,
+        if progress_cb:
+            await progress_cb({
+                "elapsed_s": 0,
+                "tail": f"Searching web for: {query[:160]}",
+            })
+        try:
+            text, sources = await asyncio.wait_for(
+                loop.run_in_executor(
+                    None,
+                    lambda: comprehensive_web_search(
+                        query,
+                        max_pages=max_pages,
+                        time_filter=time_filter,
+                        return_sources=True,
+                    ),
                ),
-            ),
-            timeout=30,
-        )
+                timeout=30,
+            )
+        except asyncio.TimeoutError:
+            return {
+                "error": f"web_search timed out after 30s: {query[:200]}",
+                "exit_code": 1,
+            }
+        except Exception as e:
+            return {
+                "error": f"web_search failed: {type(e).__name__}: {str(e) or 'no details'}",
+                "exit_code": 1,
+            }
+        if progress_cb:
+            await progress_cb({
+                "elapsed_s": 30,
+                "tail": "Search completed; preparing sources.",
+            })
        output = text[:MAX_OUTPUT_CHARS] if len(text) > MAX_OUTPUT_CHARS else text
        if sources:
            output += "\n\n<!-- SOURCES:" + json.dumps(sources) + " -->"
@@ -76,8 +76,7 @@ async def action_consolidate_memory(owner: str, **kwargs) -> Tuple[str, bool]:
        import json
        import re
        from src.constants import DATA_DIR
-        from src.endpoint_resolver import resolve_endpoint
-        from src.llm_core import llm_call_async
+        from src.llm_core import llm_call_async_with_fallback
        from src.memory import MemoryManager

        manager = MemoryManager(DATA_DIR)
@@ -116,10 +115,9 @@ async def action_consolidate_memory(owner: str, **kwargs) -> Tuple[str, bool]:
            if len(group_memories) < 2:
                return False

-            url, model, headers = resolve_endpoint("utility", owner=group_owner or None)
-            if not url or not model:
-                url, model, headers = resolve_endpoint("default", owner=group_owner or None)
-            if not url or not model:
+            from src.task_endpoint import resolve_task_candidates
+            candidates = resolve_task_candidates(owner=group_owner or None)
+            if not candidates:
                return False

            try:
@@ -147,13 +145,11 @@ async def action_consolidate_memory(owner: str, **kwargs) -> Tuple[str, bool]:
                    "\"drop\":[{\"id\":\"existing id\",\"reason\":\"short reason\"}]}\n\n"
                    f"MEMORIES:\n{json.dumps(items, ensure_ascii=False)}"
                )
-                raw = await llm_call_async(
-                    url=url,
-                    model=model,
+                raw = await llm_call_async_with_fallback(
+                    candidates,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0,
                    max_tokens=4096,
-                    headers=headers,
                    timeout=120,
                )
                from src.text_helpers import strip_think
@@ -604,8 +600,7 @@ async def action_classify_events(owner: str, **kwargs) -> Tuple[str, bool]:
    try:
        from datetime import timedelta
        from core.database import SessionLocal, CalendarEvent
-        from src.endpoint_resolver import resolve_endpoint
-        from src.llm_core import llm_call_async
+        from src.llm_core import llm_call_async_with_fallback
        import re as _re, json as _json

        db = SessionLocal()
@@ -620,10 +615,9 @@ async def action_classify_events(owner: str, **kwargs) -> Tuple[str, bool]:
            if not events:
                return "No upcoming events to classify", True

-            llm_url, llm_model, llm_headers = resolve_endpoint("utility", owner=owner)
-            if not llm_url:
-                llm_url, llm_model, llm_headers = resolve_endpoint("default", owner=owner)
-            llm_available = bool(llm_url and llm_model)
+            from src.task_endpoint import resolve_task_candidates
+            llm_candidates = resolve_task_candidates(owner=owner)
+            llm_available = bool(llm_candidates)

            # Pull user memories so the LLM has personal context (relationships,
            # job, hobbies). Helps it know e.g. "<name> is your spouse" so their
@@ -699,11 +693,11 @@ async def action_classify_events(owner: str, **kwargs) -> Tuple[str, bool]:
                    f"EVENTS: {_json.dumps(items)}"
                )
                try:
-                    raw = await llm_call_async(
-                        url=llm_url, model=llm_model,
+                    raw = await llm_call_async_with_fallback(
+                        llm_candidates,
                        messages=[{"role": "user", "content": prompt}],
                        temperature=0.1, max_tokens=16384,
-                        headers=llm_headers, timeout=180,
+                        timeout=180,
                    )
                    from src.text_helpers import strip_think as _st
                    raw = _st(raw or "", prose=False, prompt_echo=False)
@@ -810,8 +804,7 @@ async def action_learn_sender_signatures(owner: str, **kwargs) -> Tuple[str, boo
        import asyncio as _aio
        from datetime import datetime as _dt, timedelta as _td
        from routes.email_helpers import _email_cache_owner_clause, _imap_connect, SCHEDULED_DB
-        from src.endpoint_resolver import resolve_endpoint
-        from src.llm_core import llm_call_async
+        from src.llm_core import llm_call_async_with_fallback

        # 1. Pull recent UIDs + From headers cheaply (header-only fetch).
        def _pull_headers():
@@ -891,11 +884,11 @@ async def action_learn_sender_signatures(owner: str, **kwargs) -> Tuple[str, boo
        if not eligible:
            return "All sender sigs already cached (or no eligible senders)", True

-        url, model, headers = resolve_endpoint("utility", owner=owner)
-        if not url or not model:
-            url, model, headers = resolve_endpoint("default", owner=owner)
-        if not url or not model:
+        from src.task_endpoint import resolve_task_candidates
+        candidates = resolve_task_candidates(owner=owner)
+        if not candidates:
            return "No LLM endpoint available", False
+        model = candidates[0][1]

        analyzed = 0
        no_sig = 0
@@ -949,11 +942,11 @@ async def action_learn_sender_signatures(owner: str, **kwargs) -> Tuple[str, boo
            )

            try:
-                raw = await llm_call_async(
-                    url=url, model=model,
+                raw = await llm_call_async_with_fallback(
+                    candidates,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0, max_tokens=600,
-                    headers=headers, timeout=60,
+                    timeout=60,
                )
                from src.text_helpers import strip_think as _st
                sig = _st(raw or "", prose=False, prompt_echo=False).strip()
@@ -1137,7 +1130,6 @@ async def action_test_skills(owner: str, **kwargs) -> Tuple[str, bool]:
        from services.memory.skills import SkillsManager
        from src.constants import DATA_DIR
        from routes.skills_routes import _run_skill_test_once, _skill_test_task
-        from src.endpoint_resolver import resolve_endpoint

        # #3 SCOPE GUARD: refuse to run on a None/empty owner — otherwise
        # `sm.load(owner=None)` returns every user's skills and we'd cross-
@@ -1152,27 +1144,40 @@ async def action_test_skills(owner: str, **kwargs) -> Tuple[str, bool]:
        if not names:
            raise TaskNoop("no skills to test")

-        url, model, headers = resolve_endpoint("default", owner=owner)
-        if not url or not model:
+        from src.task_endpoint import resolve_task_candidates
+        candidates = resolve_task_candidates(owner=owner)
+        if not candidates:
            return "No Default/Utility model configured — set one in Settings.", False

        # #2 NO SILENT MODEL SWAP: if the configured model isn't served by the
        # endpoint, try a basename match — but fail loudly instead of grabbing
        # `avail[0]` which could be an embedding-only model and produce 36
        # garbage transcripts → 36 'unknown' verdicts with no hint why.
+        url, model, headers = candidates[0]
        try:
            from src.llm_core import list_model_ids
-            avail = list_model_ids(url, headers=headers)
-            if avail and model not in avail:
-                import os as _os
-                base = _os.path.basename((model or "").rstrip("/"))
-                m = next((a for a in avail if _os.path.basename(a.rstrip("/")) == base), None)
-                if m:
-                    model = m
-                else:
-                    return (f"Default model '{model}' not served by endpoint {url}. "
-                            f"Available: {', '.join(avail[:8])}{'…' if len(avail) > 8 else ''}. "
-                            "Set a valid Default model in Settings."), False
+            import os as _os
+
+            selected = None
+            mismatch_notes = []
+            for cand_url, cand_model, cand_headers in candidates:
+                avail = list_model_ids(cand_url, headers=cand_headers)
+                if not avail or cand_model in avail:
+                    selected = (cand_url, cand_model, cand_headers)
+                    break
+                base = _os.path.basename((cand_model or "").rstrip("/"))
+                matched = next((a for a in avail if _os.path.basename(a.rstrip("/")) == base), None)
+                if matched:
+                    selected = (cand_url, matched, cand_headers)
+                    break
+                mismatch_notes.append(
+                    f"{cand_model} not served by {cand_url}; available: "
+                    f"{', '.join(avail[:8])}{'...' if len(avail) > 8 else ''}"
+                )
+            if selected:
+                url, model, headers = selected
+            elif mismatch_notes:
+                return "No configured task fallback model is served. " + " | ".join(mismatch_notes[:3]), False
        except Exception as _e:
            logger.warning(f"test_skills model resolve check failed (continuing): {_e}")

@@ -1483,7 +1488,6 @@ async def action_check_email_urgency(owner: str, **kwargs) -> Tuple[str, bool]:
        from pathlib import Path as _P
        from core.database import SessionLocal as _SL, EmailAccount as _EA
        from routes.email_helpers import _imap_connect, _decode_header
-        from src.endpoint_resolver import resolve_endpoint, resolve_utility_fallback_candidates
        from src.llm_core import llm_call_async_with_fallback

        # Per-owner state file so multi-user runs don't clobber each other's
@@ -1505,12 +1509,10 @@ async def action_check_email_urgency(owner: str, **kwargs) -> Tuple[str, bool]:

        # ── 1. Resolve LLM candidates (utility primary + utility fallbacks; fall
        # through to default chat as a last resort).
-        url, model, headers = resolve_endpoint("utility", owner=owner)
-        if not url or not model:
-            url, model, headers = resolve_endpoint("default", owner=owner)
-        if not url or not model:
+        from src.task_endpoint import resolve_task_candidates
+        candidates = resolve_task_candidates(owner=owner)
+        if not candidates:
            return "No LLM endpoint available", False
-        candidates = [(url, model, headers)] + resolve_utility_fallback_candidates(owner=owner)

        # ── 2. Enumerate enabled accounts. Match this task's owner AND fall
        # back to the legacy "unowned account whose imap_user / from_address
@@ -396,6 +396,9 @@ def resolve_utility_fallback_candidates(owner: Optional[str] = None) -> list:
        settings = load_settings()
        utility_ep = (get_user_setting("utility_endpoint_id", owner or "", settings.get("utility_endpoint_id", "")) or "").strip()
        if not utility_ep:
+            utility_chain = get_user_setting("utility_model_fallbacks", owner or "", settings.get("utility_model_fallbacks") or []) or []
+            if utility_chain:
+                return _resolve_fallback_candidates("utility_model_fallbacks", owner=owner)
            return _resolve_fallback_candidates("default_model_fallbacks", owner=owner)
    except Exception:
        pass
@@ -2130,6 +2130,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
                                            yield _stream_delta_event(reasoning, thinking=True)
                                        content = delta.get("content") or ""
                                        if content:
+                                            content = re.sub(r"<mm:think(\s+[^>]*)?>", r"<think\1>", content, flags=re.IGNORECASE)
+                                            content = re.sub(r"</mm:think>", "</think>", content, flags=re.IGNORECASE)
                                            stripped = content.lstrip()
                                            # gpt-oss harmony format (<|channel|>analysis/final): route via the harmony
                                            # stream router. Sticky once the first marker appears — distinct from the
@@ -1,6 +1,11 @@
-"""Shared resolver for background-task AI endpoint (auto-naming, memory, sorting)."""
+"""Shared resolver for background-task AI endpoints."""

-from src.endpoint_resolver import resolve_endpoint
+from src.endpoint_resolver import (
+    resolve_chat_fallback_candidates,
+    resolve_endpoint,
+    resolve_utility_fallback_candidates,
+)
+from src.llm_core import llm_call_async_with_fallback


 def resolve_task_endpoint(fallback_url=None, fallback_model=None, fallback_headers=None, owner=None):
@@ -11,3 +16,60 @@ def resolve_task_endpoint(fallback_url=None, fallback_model=None, fallback_heade
    endpoint cannot be resolved.
    """
    return resolve_endpoint("task", fallback_url, fallback_model, fallback_headers, owner=owner)
+
+
+def resolve_task_candidates(
+    fallback_url=None,
+    fallback_model=None,
+    fallback_headers=None,
+    owner=None,
+):
+    """Return ordered background-task LLM candidates.
+
+    Order:
+    1. configured Background Tasks endpoint/model, or caller fallback
+    2. Utility endpoint/model
+    3. Default endpoint/model
+    4. Utility fallback chain
+    5. Default fallback chain
+    """
+    candidates = []
+
+    def _append(url, model, headers):
+        if not url or not model:
+            return
+        key = (url, model)
+        if any((u, m) == key for u, m, _ in candidates):
+            return
+        candidates.append((url, model, headers or {}))
+
+    _append(*resolve_task_endpoint(fallback_url, fallback_model, fallback_headers, owner=owner))
+    _append(*resolve_endpoint("utility", owner=owner))
+    _append(*resolve_endpoint("default", owner=owner))
+    for url, model, headers in resolve_utility_fallback_candidates(owner=owner):
+        _append(url, model, headers)
+    for url, model, headers in resolve_chat_fallback_candidates(owner=owner):
+        _append(url, model, headers)
+
+    return candidates
+
+
+async def task_llm_call_async(
+    messages,
+    *,
+    fallback_url=None,
+    fallback_model=None,
+    fallback_headers=None,
+    owner=None,
+    **kwargs,
+):
+    """Call the shared background-task LLM candidate chain."""
+    candidates = resolve_task_candidates(
+        fallback_url=fallback_url,
+        fallback_model=fallback_model,
+        fallback_headers=fallback_headers,
+        owner=owner,
+    )
+    if not candidates:
+        raise RuntimeError("No LLM endpoint available for background task")
+    return await llm_call_async_with_fallback(candidates, messages=messages, **kwargs)
@@ -833,6 +833,14 @@ class TaskScheduler:
                    owner=task.owner,
                    body=run.result if output == "notification" else None,
                )
+            elif run.status == "error":
+                self.add_notification(
+                    task.name,
+                    "error",
+                    task_id,
+                    owner=task.owner,
+                    body=run.error or run.result,
+                )

            # Log result to the assistant chat so all task activity is visible.
            # Skip skipped/error rows — user shouldn't see "skipped: …" noise
@@ -1406,12 +1414,18 @@ class TaskScheduler:
            )
        except Exception as e:
            logger.warning(f"Agent loop failed for task '{task.name}', falling back to simple call: {e}")
-            from src.llm_core import llm_call_async
+            from src.task_endpoint import task_llm_call_async
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": task.prompt},
            ]
-            result = await llm_call_async(url=endpoint_url, model=model, messages=messages, timeout=120)
+            result = await task_llm_call_async(
+                messages,
+                fallback_url=endpoint_url,
+                fallback_model=model,
+                owner=task.owner,
+                timeout=120,
+            )

        # Strip the model's chain-of-thought before saving/delivering. Task
        # output is LLM-only, so prose=True (which also removes untagged
@@ -1636,13 +1650,17 @@ class TaskScheduler:
        # Honor per-task max_steps (defense against runaway agent loops).
        # Falls back to 20 if not set — the historical default.
        _task_max_rounds = task.max_steps if task.max_steps and task.max_steps > 0 else 20
-        # Tasks are background workloads — they share the Utility model's
-        # fallback chain (Settings → Utility Model → Fallbacks). A downed
-        # primary endpoint won't silently yield `(no output)` — same recipe
-        # chat uses but with the utility list (`utility_model_fallbacks`).
+        # Tasks are background workloads: use the shared task fallback chain
+        # behind the primary endpoint so a downed primary won't silently yield
+        # `(no output)`.
        try:
-            from src.endpoint_resolver import resolve_utility_fallback_candidates
-            _task_fallbacks = resolve_utility_fallback_candidates(owner=task.owner or None)
+            from src.task_endpoint import resolve_task_candidates
+            _task_fallbacks = resolve_task_candidates(
+                fallback_url=endpoint_url,
+                fallback_model=model,
+                fallback_headers=headers,
+                owner=task.owner or None,
+            )[1:]
        except Exception:
            _task_fallbacks = []
        async for event_str in stream_agent_loop(
@@ -1679,21 +1697,22 @@ class TaskScheduler:
        # asking it to summarize what it did. Guarantees output.
        if not full_text.strip():
            try:
-                from src.llm_core import llm_call_async_with_fallback
-                from src.endpoint_resolver import resolve_utility_fallback_candidates
+                from src.task_endpoint import task_llm_call_async
                grace_context = "You ran out of steps. "
                if tool_results:
                    grace_context += "Here's what your tools returned:\n" + "\n".join(tool_results[-5:])
                else:
                    grace_context += "No tool results were captured."
                grace_context += "\n\nSummarize what you accomplished and what's still pending. Be concise."
-                _grace_candidates = [(endpoint_url, model, headers)] + resolve_utility_fallback_candidates(owner=task.owner or None)
-                full_text = await llm_call_async_with_fallback(
-                    _grace_candidates,
+                full_text = await task_llm_call_async(
                    messages=[
                        {"role": "system", "content": system_content},
                        {"role": "user", "content": grace_context},
                    ],
+                    fallback_url=endpoint_url,
+                    fallback_model=model,
+                    fallback_headers=headers,
+                    owner=task.owner or None,
                    timeout=30,
                )
                full_text = (full_text or "").strip()
@@ -1119,8 +1119,8 @@ async def do_manage_settings(content: str, owner: Optional[str] = None) -> Dict:
            _ALIASES = {
                "shell": ["bash"],
                "terminal": ["bash"],
-                "search": ["web_search"],
-                "web": ["web_search"],
+                "search": ["web_search", "web_fetch"],
+                "web": ["web_search", "web_fetch"],
                "browser": ["builtin_browser"],
                "documents": ["create_document", "edit_document", "update_document", "suggest_document"],
                "doc": ["create_document", "edit_document", "update_document", "suggest_document"],
@@ -1132,7 +1132,7 @@ async def do_manage_settings(content: str, owner: Optional[str] = None) -> Dict:
                "notes": ["manage_notes"],
                "calendar": ["manage_calendar"],
                "email": ["mcp__email__list_emails", "mcp__email__read_email", "mcp__email__send_email"],
-                "research": ["web_search"],  # research is a per-request flag, not a tool — closest analog
+                "research": ["web_search", "web_fetch"],  # research is a per-request flag, not a tool — closest analog
            }

            if action == "list_tools":
@@ -2714,13 +2714,25 @@ async def do_serve_model(content: str, owner: Optional[str] = None) -> Dict:
                endpoint_added=endpoint_added, endpoint_id=endpoint_id or "",
            )
            note = "" if registered else " (state-write failed — task may not show in UI)"
+            where = host or "local"
+            log_path = f"/tmp/odysseus-tmux/{sid}.log"
            return {
-                "output": f"Serving {repo_id} (session: {sid}){note}",
+                "output": (
+                    f"Serving {repo_id} on {where} (session: {sid}){note}\n"
+                    f"Next required check: call list_served_models. If this task is not ready, "
+                    f"call tail_serve_output with session_id={sid} and tail=400 before answering. "
+                    f"Do not tell the user to check logs; you have the log tool."
+                ),
                "session_id": sid,
                "task_type": "serve",
                "phase": "running",
                "host": host,
                "endpoint_id": endpoint_id,
+                "log_path": log_path,
+                "next_tools": [
+                    {"name": "list_served_models", "arguments": {}},
+                    {"name": "tail_serve_output", "arguments": {"session_id": sid, "tail": 400}},
+                ],
                "exit_code": 0,
            }
        # FastAPI HTTPException puts the message under `detail`, not `error`.
@@ -3057,8 +3069,17 @@ async def do_tail_serve_output(content: str, owner: Optional[str] = None) -> Dic
        MAX_CHARS = 8000
        if len(output_text) > MAX_CHARS:
            output_text = "…(earlier output truncated)…\n" + output_text[-MAX_CHARS:]
+        if not output_text:
+            output_text = (
+                f"No log output captured yet for {session_id} on {host_label}. "
+                "This usually means the tmux wrapper has started but the model process "
+                "has not printed anything yet. Do not stop here: call list_served_models "
+                "again to check whether it is still loading, ready, or crashed; if it is "
+                "still not ready, call tail_serve_output again with a larger tail after "
+                "the next status check."
+            )
        return {
-            "output": output_text or "(empty pane)",
+            "output": output_text,
            "session_id": session_id,
            "host": host_label,
            "tail_lines": tail,