Merge dev into fix/native-agent-loop-guard-signals

2026-06-29 16:12:06 -04:00 · 2026-06-26 13:00:59 +01:00
parent bd0c67b6d3 de12d4734a
commit 537f7180e6
285 changed files with 20014 additions and 3616 deletions
@@ -160,6 +160,8 @@ def setup_api_token_routes() -> APIRouter:
            payload = await request.json()
        except Exception:
            payload = {}
+        if not isinstance(payload, dict):
+            payload = {}
        with get_db_session() as db:
            token = db.query(ApiToken).filter(ApiToken.id == token_id).first()
            if not token:
@@ -16,6 +16,7 @@ from pydantic import BaseModel

 from core.database import SessionLocal, CrewMember, ScheduledTask
 from src.auth_helpers import get_current_user
+from core.auth import RESERVED_USERNAMES
 from src.task_scheduler import compute_next_run


@@ -89,11 +90,11 @@ def setup_assistant_routes(task_scheduler) -> APIRouter:
    # check-in tasks seeded. Hitting any /assistant route under one of these
    # used to seed a full CrewMember + Morning/Midday/Evening tasks under that
    # owner, which then double-fired alongside the real user's check-ins.
-    _SYNTHETIC_OWNERS = frozenset({"internal-tool", "api", "demo", "system", ""})
+    # RESERVED_USERNAMES covers the same set; the `not owner` guard handles "".

    async def _get_or_create(owner: str) -> CrewMember:
        """Return the per-owner assistant CrewMember, creating it on demand."""
-        if not owner or owner in _SYNTHETIC_OWNERS:
+        if not owner or owner in RESERVED_USERNAMES:
            raise HTTPException(status_code=400, detail=f"Cannot seed assistant for {owner!r}")
        db = SessionLocal()
        try:
@@ -12,8 +12,8 @@ import re
 from pathlib import Path

 from core.atomic_io import atomic_write_json, atomic_write_text
-from core.auth import AuthManager, SetAdminResult
-from src.constants import DEEP_RESEARCH_DIR, MEMORY_FILE, SKILLS_DIR
+from core.auth import AuthManager, RESERVED_USERNAMES, SetAdminResult, TOKEN_TTL
+from src.constants import DEEP_RESEARCH_DIR, MEMORY_FILE, PASSWORD_MIN_LENGTH, SKILLS_DIR
 from src.rate_limiter import RateLimiter
 from src.settings_scrub import scrub_settings
 from src.settings import (
@@ -102,8 +102,12 @@ def setup_auth_routes(auth_manager: AuthManager) -> APIRouter:
            raise HTTPException(429, "Too many requests — try again later")
        if auth_manager.is_configured:
            raise HTTPException(400, "Already configured")
-        if len(body.password) < 8:
-            raise HTTPException(400, "Password must be at least 8 characters")
+        if len(body.password) < PASSWORD_MIN_LENGTH:
+            raise HTTPException(400, f"Password must be at least {PASSWORD_MIN_LENGTH} characters")
+        if len(body.username.strip()) < 1:
+            raise HTTPException(400, "Username is required")
+        if body.username.lower() in RESERVED_USERNAMES:
+            raise HTTPException(403, "Username is reserved")
        ok = await asyncio.to_thread(auth_manager.setup, body.username, body.password)
        if not ok:
            raise HTTPException(500, "Setup failed")
@@ -118,10 +122,12 @@ def setup_auth_routes(auth_manager: AuthManager) -> APIRouter:
            raise HTTPException(400, "Run setup first")
        if not auth_manager.signup_enabled:
            raise HTTPException(403, "Registration is disabled. Ask an admin for an account.")
-        if len(body.password) < 8:
-            raise HTTPException(400, "Password must be at least 8 characters")
+        if len(body.password) < PASSWORD_MIN_LENGTH:
+            raise HTTPException(400, f"Password must be at least {PASSWORD_MIN_LENGTH} characters")
        if len(body.username.strip()) < 1:
            raise HTTPException(400, "Username is required")
+        if body.username.lower() in RESERVED_USERNAMES:
+            raise HTTPException(403, "Username is reserved")
        ok = await asyncio.to_thread(auth_manager.create_user, body.username, body.password, is_admin=False)
        if not ok:
            raise HTTPException(409, "Username already taken")
@@ -144,6 +150,8 @@ def setup_auth_routes(auth_manager: AuthManager) -> APIRouter:
                raise HTTPException(401, "Invalid 2FA code")
        # All checks passed — create session (password already verified above)
        token = await asyncio.to_thread(auth_manager.create_session_trusted, username)
+        if not token:
+            raise HTTPException(401, "Invalid credentials")
        cookie_kwargs = dict(
            key=SESSION_COOKIE,
            value=token,
@@ -153,7 +161,7 @@ def setup_auth_routes(auth_manager: AuthManager) -> APIRouter:
            path="/",
        )
        if body.remember:
-            cookie_kwargs["max_age"] = 60 * 60 * 24 * 7  # 7 days
+            cookie_kwargs["max_age"] = TOKEN_TTL
        response.set_cookie(**cookie_kwargs)
        return {"ok": True, "username": username}

@@ -182,13 +190,18 @@ def setup_auth_routes(auth_manager: AuthManager) -> APIRouter:
            pass
        return result

+    @router.get("/policy")
+    async def auth_policy():
+        """Return public auth policy constants for the frontend.
+
+        The payload is whatever ``auth_manager.policy()`` returns. This
+        handler takes no request object and does no session check of its own.
+        NOTE(review): confirm ``policy()`` exposes only non-sensitive values.
+        """
+        return auth_manager.policy()
+
    @router.post("/change-password")
    async def change_password(body: ChangePasswordRequest, request: Request):
        user = _get_current_user(request)
        if not user:
            raise HTTPException(401, "Not authenticated")
-        if len(body.new_password) < 8:
-            raise HTTPException(400, "Password must be at least 8 characters")
+        if len(body.new_password) < PASSWORD_MIN_LENGTH:
+            raise HTTPException(400, f"Password must be at least {PASSWORD_MIN_LENGTH} characters")
        current_token = request.cookies.get(SESSION_COOKIE)
        ok = await asyncio.to_thread(auth_manager.change_password, user, body.current_password, body.new_password)
        if not ok:
@@ -268,8 +281,12 @@ def setup_auth_routes(auth_manager: AuthManager) -> APIRouter:
        user = _get_current_user(request)
        if not user or not auth_manager.is_admin(user):
            raise HTTPException(403, "Admin only")
-        if len(body.password) < 8:
-            raise HTTPException(400, "Password must be at least 8 characters")
+        if len(body.password) < PASSWORD_MIN_LENGTH:
+            raise HTTPException(400, f"Password must be at least {PASSWORD_MIN_LENGTH} characters")
+        if len(body.username.strip()) < 1:
+            raise HTTPException(400, "Username is required")
+        if body.username.lower() in RESERVED_USERNAMES:
+            raise HTTPException(403, "Username is reserved")
        ok = auth_manager.create_user(body.username, body.password, body.is_admin)
        if not ok:
            raise HTTPException(409, "Username already taken")
@@ -432,6 +449,23 @@ def setup_auth_routes(auth_manager: AuthManager) -> APIRouter:
        except Exception as e:
            logger.warning("Failed to rename upload owner references %s -> %s: %s", old_username, new_username, e)

+        # direct personal RAG uploads live in per-owner directories and the
+        # vector metadata also carries the username used for owner-filtered
+        # search. Keep both in sync with the auth rename.
+        try:
+            from routes.personal_routes import rename_personal_upload_owner
+            personal_docs_manager = getattr(request.app.state, "personal_docs_manager", None)
+            if personal_docs_manager is not None:
+                rag_manager = getattr(personal_docs_manager, "rag_manager", None)
+                rename_personal_upload_owner(
+                    old_username,
+                    new_username,
+                    personal_docs_manager=personal_docs_manager,
+                    rag_manager=rag_manager,
+                )
+        except Exception as e:
+            logger.warning("Failed to rename personal RAG upload owner references %s -> %s: %s", old_username, new_username, e)
+
        # skills: SKILL.md frontmatter carries owner: <username>; the usage
        # sidecar (_usage.json) keys entries as owner::skill-name. Both must
        # be updated or the renamed user's Skills panel goes empty.
@@ -14,7 +14,7 @@ from core.database import Session as DBSession, ModelEndpoint
 from src.llm_core import normalize_model_id
 from src.endpoint_resolver import normalize_base
 from src.context_compactor import maybe_compact, trim_for_context
-from src.auth_helpers import get_current_user
+from src.auth_helpers import effective_user
 from src.prompt_security import untrusted_context_message
 from routes.prefs_routes import _load_for_user as load_prefs_for_user

@@ -22,6 +22,47 @@ from fastapi import HTTPException

 logger = logging.getLogger(__name__)

+# Greeting / slang openers. The named group ``tail`` captures the rest of the
+# message after the opener. ``.`` does not match newlines here, so a message
+# with an interior line break never matches at all.
+_CASUAL_OPENING_RE = re.compile(
+    r"^\s*(?:h+i+|hey+|hello+|yo+|sup+|what'?s up|wass?up|hiya|howdy|"
+    r"lol|lmao|haha+|hehe+|thanks?|thank you|ty|idk|dunno|meh|bruh|bro)\b(?P<tail>.*)$",
+    re.IGNORECASE,
+)
+# Task-ish keywords: if any appears in the tail, the message is a real request
+# even though it opens with a greeting.
+_CASUAL_BLOCKLIST_RE = re.compile(
+    r"\b(?:cookbook|serve|serving|launch|start|vllm|sglang|llama\.?cpp|ollama|"
+    r"download|model|email|document|doc|note|calendar|task|search|web|research|"
+    r"file|folder|repo|git|settings?|endpoint|api|token|mcp)\b",
+    re.IGNORECASE,
+)
+
+
+def _is_casual_low_signal(text: str) -> bool:
+    """Return True when *text* is a short greeting/slang message.
+
+    Such messages should not pull memory, skills, RAG, or docs. A message
+    qualifies when it starts with a casual opener, the remainder contains no
+    blocklisted task keyword, and the remainder is at most two words long.
+    Falsy input (``None``, ``""``) is treated as empty text and returns False.
+    """
+    s = str(text or "").strip()
+    m = _CASUAL_OPENING_RE.match(s)
+    if not m:
+        return False
+    tail = m.group("tail") or ""
+    if _CASUAL_BLOCKLIST_RE.search(tail):
+        return False
+    # Count words with the Unicode-aware \w class. An ASCII-only class sees
+    # zero words in e.g. Cyrillic or Greek text, so a long non-English request
+    # after "hey" would be misread as a bare greeting.
+    tail_words = re.findall(r"[\w'-]+", tail)
+    return len(tail_words) <= 2
+
+
+# Strong references to in-flight fire-and-forget tasks scheduled from this
+# module. asyncio only keeps weak references to tasks created via
+# create_task, so without this the GC can collect a task mid-execution and
+# the background work (extraction, auto-naming) silently never runs.
+# Mirrors WebhookManager._spawn_tracked from src/webhook_manager.py.
+_BG_TASKS: set[asyncio.Task] = set()
+
+
+def _spawn_bg(coro) -> asyncio.Task:
+    """Schedule a background task and hold a strong reference until it finishes.
+
+    Args:
+        coro: The coroutine to run; it is passed straight to
+            ``asyncio.create_task``.
+
+    Returns:
+        The created task. Callers may ignore it; the module-level set keeps
+        it referenced while it runs.
+    """
+    task = asyncio.create_task(coro)
+    _BG_TASKS.add(task)
+    # Drop the reference once the task is done so the set does not grow
+    # without bound.
+    task.add_done_callback(_BG_TASKS.discard)
+    return task
+

 # ── Data containers ────────────────────────────────────────────────────── #

@@ -78,7 +119,7 @@ def _enforce_chat_privileges(request, sess) -> None:
    which means unrestricted allowed_models / zero cap -> no-op for them.
    """
    try:
-        user = get_current_user(request)
+        user = effective_user(request)
    except Exception:
        user = None
    if not user:
@@ -159,17 +200,9 @@ async def auto_name_session(session_manager, sess):
            return

        owner = getattr(sess, "owner", None)
-        t_url, t_model, t_headers = resolve_task_endpoint(owner=owner)
-        if not t_model:
-            # If no task/utility model is configured at all, fall back to
-            # the session's own model so auto-naming still works even on
-            # minimal setups.
-            from src.endpoint_resolver import resolve_endpoint
-            _fallback = resolve_endpoint("default", owner=owner)
-            if _fallback and _fallback[1]:
-                t_url, t_model, t_headers = _fallback
-            else:
-                t_url, t_model, t_headers = sess.endpoint_url, sess.model, sess.headers
+        t_url, t_model, t_headers = resolve_task_endpoint(
+            sess.endpoint_url, sess.model, sess.headers, owner=owner
+        )
        if not t_model:
            logger.debug("[auto-name] No model provided, skipping")
            return
@@ -346,11 +379,11 @@ def add_user_message(sess, chat_handler, preprocessed: PreprocessedMessage, inco
 def fire_message_event(request, webhook_manager, session_id: str, sess, message: str, compare_mode: bool = False):
    """Fire webhook and event_bus events for a new user message."""
    if webhook_manager and not compare_mode:
-        asyncio.create_task(webhook_manager.fire("chat.message", {
+        webhook_manager.fire_and_forget("chat.message", {
            "session_id": session_id, "model": sess.model, "message": message[:2000],
-        }))
+        })
    from src.event_bus import fire_event
-    user = get_current_user(request)
+    user = effective_user(request)
    fire_event("message_sent", user)


@@ -576,9 +609,11 @@ async def build_chat_context(
    if not incognito:
        fire_message_event(request, webhook_manager, session_id, sess, message, compare_mode)

-    # Resolve user prefs
-    user = get_current_user(request)
+    # Resolve owner-scoped prefs/context. Browser requests keep the cookie user;
+    # bearer-token chat requests use the token owner instead of the "api" sentinel.
+    user = effective_user(request)
    uprefs = load_prefs_for_user(user)
+    casual_low_signal = _is_casual_low_signal(message)

    # Memory enabled?
    mem_enabled = not incognito and not no_memory and uprefs.get("memory_enabled", True)
@@ -588,6 +623,9 @@ async def build_chat_context(
    if not allow_tool_preprocessing:
        mem_enabled = False
        skills_enabled = False
+    if casual_low_signal:
+        mem_enabled = False
+        skills_enabled = False
    logger.debug(
        "Memory enabled=%s for user=%s (incognito=%s, no_memory=%s, pref=%s)",
        mem_enabled, user, incognito, no_memory, uprefs.get("memory_enabled", "NOT_SET"),
@@ -603,11 +641,11 @@ async def build_chat_context(

    # Use RAG?
    use_rag_val = (str(use_rag).lower() != "false") if use_rag is not None else True
-    if incognito or not allow_tool_preprocessing or is_research_spinoff:
+    if incognito or not allow_tool_preprocessing or is_research_spinoff or casual_low_signal:
        use_rag_val = False

    # If pre-fetched search context was provided (compare mode), skip live web search
-    skip_web = bool(search_context) or not allow_tool_preprocessing
+    skip_web = bool(search_context) or not allow_tool_preprocessing or casual_low_signal

    # Build context preface
    # The stream path uses enhanced_message (with CoT/preprocessing applied),
@@ -626,7 +664,7 @@ async def build_chat_context(
        incognito=incognito,
        use_skills=skills_enabled,
    )
-    if use_rag is not None or is_research_spinoff:
+    if use_rag is not None or is_research_spinoff or casual_low_signal:
        _preface_kwargs["use_rag"] = use_rag_val
    preface, rag_sources, web_sources = chat_processor.build_context_preface(**_preface_kwargs)

@@ -634,7 +672,7 @@ async def build_chat_context(
    used_memories = getattr(chat_processor, '_last_used_memories', [])

    # Inject pre-fetched search context (compare mode)
-    if search_context and allow_tool_preprocessing:
+    if search_context and allow_tool_preprocessing and not casual_low_signal:
        preface.append(untrusted_context_message("prefetched search context", search_context))

    # YouTube transcripts
@@ -1112,7 +1150,7 @@ def run_post_response_tasks(
            )))

    if _extraction_jobs:
-        asyncio.create_task(_run_extraction_jobs_sequentially(session_id, _extraction_jobs))
+        _spawn_bg(_run_extraction_jobs_sequentially(session_id, _extraction_jobs))

    # Token accumulation
    if last_metrics:
@@ -1120,11 +1158,11 @@ def run_post_response_tasks(

    # Webhook
    if webhook_manager and not compare_mode:
-        asyncio.create_task(webhook_manager.fire("chat.completed", {
+        webhook_manager.fire_and_forget("chat.completed", {
            "session_id": session_id, "model": sess.model,
            "user_message": message, "response": full_response[:2000],
-        }))
+        })

    # Auto-name
    if needs_auto_name(sess.name):
-        asyncio.create_task(auto_name_session(session_manager, sess))
+        _spawn_bg(auto_name_session(session_manager, sess))
@@ -23,12 +23,13 @@ from src.endpoint_resolver import normalize_base as _normalize_base, build_chat_
 from src.session_search import search_session_messages
 from src.prompt_security import untrusted_context_message
 from core.exceptions import SessionNotFoundError
-from src.auth_helpers import get_current_user
+from src.auth_helpers import effective_user, get_current_user
 from routes.session_routes import _verify_session_owner
 from routes.document_helpers import _owner_session_filter
 from core.database import SessionLocal, get_session_mode, set_session_mode
 from core.database import Session as DBSession, ChatMessage as DBChatMessage
 from core.database import Document as DBDocument, ModelEndpoint
+from core.log_safety import redact_url
 from routes.research_routes import _resolve_research_endpoint
 from routes.model_routes import _visible_models
 from routes.chat_helpers import (
@@ -126,7 +127,8 @@ def _clear_orphaned_session_endpoint(sess, owner: str | None = None) -> bool:
        sess.model = ""
        sess.headers = {}
        return True
-    except Exception:
+    except Exception as e:
+        logger.warning("Failed to clear orphaned session endpoint", exc_info=e)
        db.rollback()
        return False
    finally:
@@ -144,7 +146,8 @@ def _endpoint_cache_contains_model(endpoint, model: str) -> bool:
        return True
    try:
        models = json.loads(raw) if isinstance(raw, str) else raw
-    except Exception:
+    except Exception as e:
+        logger.warning("Failed to parse cached models list, treating as containing model", exc_info=e)
        return True
    if not isinstance(models, list) or not models:
        return True
@@ -236,7 +239,8 @@ def _recover_empty_session_model(sess, session_id: str, owner: str | None = None
                is_chatgpt_subscription = False
        try:
            cached = json.loads(ep.cached_models) if isinstance(ep.cached_models, str) else (ep.cached_models or [])
-        except Exception:
+        except Exception as e:
+            logger.warning("Failed to parse cached_models for endpoint %r", getattr(ep, "id", "?"), exc_info=e)
            cached = []
        if not cached:
            visible = []
@@ -360,7 +364,7 @@ def setup_chat_routes(
            sess = session_manager.get_session(session)
        except KeyError:
            raise HTTPException(404, f"Session '{session}' not found")
-        owner = get_current_user(request)
+        owner = effective_user(request)
        if _clear_orphaned_session_endpoint(sess, owner=owner):
            raise HTTPException(400, "Selected model endpoint was removed. Pick another model in Settings.")

@@ -600,7 +604,7 @@ def setup_chat_routes(
            # but BEFORE loading. Prevents cross-user session hijack.
            _verify_session_owner(request, session)
            sess = session_manager.get_session(session)
-            owner = get_current_user(request)
+            owner = effective_user(request)
            if _clear_orphaned_session_endpoint(sess, owner=owner):
                raise HTTPException(400, "Selected model endpoint was removed. Pick another model in Settings.")
            # Issue #587: picker shows a model from the endpoint cache but
@@ -631,7 +635,7 @@ def setup_chat_routes(
        _enforce_chat_privileges(request, sess)

        # Ensure session has auth headers
-        resolve_session_auth(sess, session, owner=get_current_user(request))
+        resolve_session_auth(sess, session, owner=effective_user(request))

        # Check for research_pending BEFORE mode persist overwrites it
        do_research = str(use_research).lower() == "true"
@@ -646,8 +650,8 @@ def setup_chat_routes(
        elif attachments:
            try:
                att_ids = [str(x) for x in json.loads(attachments)]
-            except Exception:
-                pass
+            except Exception as e:
+                logger.warning("Failed to parse attachments JSON, ignoring attachments", exc_info=e)

        no_memory = str(form_data.get("no_memory", "")).lower() == "true"
        pre_context_tool_policy = build_effective_tool_policy(
@@ -826,7 +830,11 @@ def setup_chat_routes(
        from src.settings import get_setting
        _global_disabled = get_setting("disabled_tools", [])
        if _global_disabled and isinstance(_global_disabled, list):
-            disabled_tools.update(_global_disabled)
+            explicit_web_allowed = allow_web_search is not None and str(allow_web_search).lower() == "true"
+            if explicit_web_allowed:
+                disabled_tools.update(t for t in _global_disabled if t not in {"web_search", "web_fetch"})
+            else:
+                disabled_tools.update(_global_disabled)

        # Light auto-escalation: the user is in chat mode and just expressed a
        # notes/calendar/email intent. Grant the relevant managers but withhold
@@ -923,7 +931,7 @@ def setup_chat_routes(
            if effective_do_research:
                _r_ep, _r_model, _r_headers = _resolve_research_endpoint(sess)
                _auth_keys = list(_r_headers.keys()) if _r_headers else []
-                logger.info(f"Research endpoint resolved: model={_r_model}, endpoint={_r_ep}, auth_keys={_auth_keys}, sess_headers_keys={list(sess.headers.keys()) if isinstance(sess.headers, dict) else type(sess.headers)}")
+                logger.info(f"Research endpoint resolved: model={_r_model}, endpoint={redact_url(_r_ep)}, auth_keys={_auth_keys}, sess_headers_keys={list(sess.headers.keys()) if isinstance(sess.headers, dict) else type(sess.headers)}")

                # Clarification round: only for very short/vague queries on first research message.
                # Skip in compare mode — each pane is a fresh session, so every one would
@@ -1256,6 +1264,10 @@ def setup_chat_routes(
                        _max_rounds = _DEFAULT_ROUNDS
                    _max_rounds = max(1, min(_max_rounds, 200))

+                    _forced_tools = None
+                    if allow_web_search is not None and str(allow_web_search).lower() == "true":
+                        _forced_tools = {"web_search", "web_fetch"}
+
                    async for chunk in stream_agent_loop(
                        sess.endpoint_url,
                        sess.model,
@@ -1277,6 +1289,7 @@ def setup_chat_routes(
                        plan_mode=plan_mode,
                        approved_plan=approved_plan or None,
                        workspace=workspace or None,
+                        forced_tools=_forced_tools,
                    ):
                        if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
                            try:
@@ -1484,7 +1497,7 @@ def setup_chat_routes(
        if not q or not q.strip():
            return []

-        _user = get_current_user(request)
+        _user = effective_user(request)
        return [
            result.to_dict()
            for result in search_session_messages(
@@ -46,8 +46,12 @@ def _ssh_prefix_for_task(task: dict) -> tuple[str, str]:
    shell metacharacters in ``remoteHost`` is rejected with 400 rather than
    injected.
    """
-    host = validate_remote_host((task.get("remoteHost") or "").strip() or None) or ""
-    ssh_port = validate_ssh_port((task.get("sshPort") or "").strip() or None) or ""
+    raw_host = task.get("remoteHost")
+    raw_port = task.get("sshPort")
+    host_value = str(raw_host).strip() if raw_host is not None else None
+    port_value = str(raw_port).strip() if raw_port is not None else None
+    host = validate_remote_host(host_value or None) or ""
+    ssh_port = validate_ssh_port(port_value or None) or ""
    port_flag = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else ""
    return host, port_flag

@@ -306,7 +310,10 @@ def setup_codex_routes(

    @router.post("/emails/draft-document")
    async def codex_email_draft_document(request: Request, body: dict[str, Any] = Body(default_factory=dict)):
-        owner = _scope_owner_all(request, {"email:draft", "documents:write"})
+        owner = _scope_owner(request, EMAIL_DRAFT_SCOPES)
+        docs_owner = _scope_owner_all(request, DOCS_WRITE_SCOPES)
+        if docs_owner != owner:
+            raise HTTPException(403, "API token owner mismatch")
        if documents_create_endpoint is None:
            raise HTTPException(503, "Documents integration is not available")
        from routes.document_routes import DocumentCreate
@@ -18,6 +18,7 @@ from pathlib import Path
 from datetime import datetime
 from urllib.parse import urljoin, urlparse, urlunparse

+from core.log_safety import redact_url
 from fastapi import APIRouter, Query, Depends, Response, HTTPException
 from typing import List, Dict, Optional

@@ -689,15 +690,24 @@ def _delete_contact(uid: str) -> bool:
        url = _resolve_resource_url(uid)
        auth = (cfg["username"], cfg["password"]) if cfg["username"] else None
        r = httpx.delete(url, auth=auth, timeout=10)
-        if r.status_code in (200, 204):
-            _contact_cache["fetched_at"] = None
-            return True
-        if r.status_code == 404:
-            # Resource not found at the resolved URL. With href resolution
-            # this should be rare (genuinely already deleted). Invalidate
-            # the cache and report success so the UI doesn't keep a ghost.
-            logger.info(f"CardDAV DELETE 404 for {uid} — treating as already gone")
+        if r.status_code in (200, 204, 404):
+            # Invalidate cache so the next fetch sees the server truth.
            _contact_cache["fetched_at"] = None
+            # Verify: force a fresh fetch and check the UID is actually gone.
+            # A 404 on the guessed URL ({uid}.vcf) can mean the contact
+            # lives at a different resource URL — the DELETE missed it but
+            # we'd silently report success. This check catches that.
+            fresh = _fetch_contacts(force=True)
+            still_there = any(c.get("uid") == uid for c in fresh)
+            if still_there:
+                logger.warning(
+                    f"CardDAV DELETE reported success for {uid} "
+                    f"but UID still present after re-fetch — "
+                    f"resource URL may differ from {redact_url(url)}"
+                )
+                return False
+            if r.status_code == 404:
+                logger.info(f"CardDAV DELETE 404 for {uid} — already gone")
            return True
        logger.warning(f"CardDAV DELETE returned {r.status_code}: {r.text[:200]}")
        return False
@@ -505,6 +505,8 @@ def _cached_model_scan_script(model_dirs: list[str] | None = None, add_hf_cache:
        "    if u.startswith('KB'): return int(n * 1024)",
        "    return int(n)",
        "def scan_ollama():",
+        "    if any(m.get('is_ollama') for m in models): return",
+        "    if os.name == 'nt' and not os.environ.get('ODYSSEUS_ALLOW_OLLAMA_CLI_SCAN'): return",
        "    if not shutil.which('ollama'): return",
        "    try:",
        "        p = subprocess.run(['ollama', 'list'], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, timeout=6)",
@@ -535,8 +537,8 @@ def _cached_model_scan_script(model_dirs: list[str] | None = None, add_hf_cache:
        "            models.append({'repo_id':name,'size_bytes':size_bytes,'nb_files':1,'has_incomplete':False,'path':'ollama','backend':'ollama','is_ollama':True})",
        "        return",
        "for _hf_cache in hf_cache_paths(): scan_hf(_hf_cache)",
-        "scan_ollama()",
        "scan_ollama_api()",
+        "scan_ollama()",
    ]
    for model_dir in model_dirs or []:
        lines.append(f"scan_dir(os.path.expanduser({model_dir!r}))")
@@ -784,25 +786,149 @@ def _append_llama_cpp_linux_accel_build_lines(runner_lines: list[str]) -> None:
    to hard-wire CUDA on Linux. That made ROCm hosts attempt a CUDA configure and
    fail with "CUDA Toolkit not found" instead of building with HIP.
    """
+    # Try a prebuilt binary from llama.cpp's GitHub releases FIRST — no
+    # cmake/build-essential/git/CUDA-headers needed at all. The from-source
+    # build below stays as a fallback (custom flags, esoteric arch, no
+    # internet, etc). 30 seconds vs 5+ minutes of compile, and removes
+    # every OS-package dep from the launch path. Sets _odysseus_have_prebuilt=1
+    # on success; the existing build-tier if/elif chain below is gated on
+    # that variable so we never compile twice or shadow the prebuilt symlink.
+    runner_lines.append('    _odysseus_have_prebuilt=""')
+    runner_lines.append('    _odysseus_arch="$(uname -m)"')
+    runner_lines.append('    _odysseus_prebuilt_url=""')
+    runner_lines.append('    if command -v curl >/dev/null 2>&1 && [ "$_odysseus_arch" = "x86_64" ]; then')
+    runner_lines.append('      _odysseus_pat=""')
+    runner_lines.append('      _odysseus_has_nv_inline() { command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L 2>/dev/null | grep -q "GPU "; }')
+    runner_lines.append('      _odysseus_has_vk_inline() { ldconfig -p 2>/dev/null | grep -q "libvulkan\\.so" || command -v vulkaninfo >/dev/null 2>&1 || [ -e /usr/lib/x86_64-linux-gnu/libvulkan.so.1 ]; }')
+    runner_lines.append('      _odysseus_has_vkdev_inline() { ls /dev/dri/renderD* >/dev/null 2>&1 || (lspci 2>/dev/null | grep -Ei \'VGA|3D|Display\' | grep -Eiq \'AMD|ATI|Radeon\'); }')
+    runner_lines.append('      if _odysseus_has_nv_inline; then')
+    runner_lines.append('        _odysseus_pat="ubuntu.*cuda"')
+    runner_lines.append('      elif _odysseus_has_vkdev_inline && _odysseus_has_vk_inline; then')
+    runner_lines.append('        _odysseus_pat="ubuntu.*vulkan"')
+    runner_lines.append('      else')
+    runner_lines.append('        _odysseus_pat="ubuntu-x64\\\\.zip"')
+    runner_lines.append('      fi')
+    runner_lines.append('      _odysseus_prebuilt_url="$(curl -fsSL --max-time 15 https://api.github.com/repos/ggml-org/llama.cpp/releases/latest 2>/dev/null | grep \'"browser_download_url"\' | cut -d\'"\' -f4 | grep -iE "$_odysseus_pat" | grep -iv "arm\\|aarch64" | head -1)"')
+    runner_lines.append('    fi')
+    # Accept any of unzip / bsdtar / python3 -m zipfile as the extractor.
+    # python3 is essentially always present on modern Linux, so this lets
+    # the prebuilt path work on minimal Ubuntu installs that lack `unzip`.
+    runner_lines.append('    if [ -n "$_odysseus_prebuilt_url" ] && (command -v unzip >/dev/null 2>&1 || command -v bsdtar >/dev/null 2>&1 || command -v python3 >/dev/null 2>&1); then')
+    runner_lines.append('      echo "[odysseus] Found prebuilt llama-server: $_odysseus_prebuilt_url"')
+    runner_lines.append('      mkdir -p ~/bin "$HOME/.cache/odysseus/llama-cpp-prebuilt" && cd "$HOME/.cache/odysseus/llama-cpp-prebuilt"')
+    runner_lines.append('      rm -f llama-cpp.zip')
+    runner_lines.append('      if curl -fsSL --max-time 120 "$_odysseus_prebuilt_url" -o llama-cpp.zip && [ -s llama-cpp.zip ]; then')
+    runner_lines.append('        rm -rf build && mkdir -p build')
+    runner_lines.append('        if command -v unzip >/dev/null 2>&1; then unzip -qq -o llama-cpp.zip -d build; elif command -v bsdtar >/dev/null 2>&1; then bsdtar -xf llama-cpp.zip -C build; else python3 -c "import zipfile; zipfile.ZipFile(\\"llama-cpp.zip\\").extractall(\\"build\\")"; fi')
+    runner_lines.append('        _odysseus_extracted="$(find build -type f -name llama-server 2>/dev/null | head -1)"')
+    runner_lines.append('        if [ -n "$_odysseus_extracted" ]; then')
+    runner_lines.append('          chmod +x "$_odysseus_extracted"')
+    runner_lines.append('          ln -sf "$_odysseus_extracted" ~/bin/llama-server')
+    runner_lines.append('          _odysseus_libdir="$(dirname "$_odysseus_extracted")"')
+    runner_lines.append('          mkdir -p ~/.config && echo "export LD_LIBRARY_PATH=\\"$_odysseus_libdir:\\${LD_LIBRARY_PATH:-}\\"" > ~/.config/odysseus-llama-cpp-env')
+    runner_lines.append('          _odysseus_have_prebuilt=1')
+    runner_lines.append('          echo "[odysseus] Prebuilt llama-server installed at $_odysseus_extracted"')
+    runner_lines.append('        fi')
+    runner_lines.append('      fi')
+    runner_lines.append('      [ -z "$_odysseus_have_prebuilt" ] && echo "[odysseus] Prebuilt download/extract failed — falling back to from-source build."')
+    runner_lines.append('    elif [ -z "$_odysseus_prebuilt_url" ]; then')
+    runner_lines.append('      echo "[odysseus] No matching prebuilt llama-server for this host (arch=$_odysseus_arch) — will build from source."')
+    runner_lines.append('    fi')
+    runner_lines.append('  if [ -z "$_odysseus_have_prebuilt" ]; then')
    # Detect pip-installed nvcc (from vLLM/nvidia CUDA wheels) and put it on PATH
-    # so cmake's CUDA configure can find it. We keep this after the ROCm/HIP
-    # check — a machine with both stacks should honor the native HIP toolchain on
-    # AMD hosts instead of accidentally preferring a stray nvcc wheel.
-    runner_lines.append('    for _cudir in ~/.local/lib/python*/site-packages/nvidia/cu13 ~/.local/lib/python*/site-packages/nvidia/cu12 ~/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do')
-    runner_lines.append('      [ -x "$_cudir/bin/nvcc" ] && export CUDA_HOME="$_cudir" && export PATH="$_cudir/bin:$PATH" && break')
-    runner_lines.append('    done')
+    # so cmake's CUDA configure can find it — BUT only when actual NVIDIA
+    # hardware is present. On AMD/Intel hosts the pip nvcc is a misleading
+    # leftover (no libcudart, no GPU it could target) and would otherwise
+    # send the build down the CUDA branch and fail with "CUDA Toolkit not
+    # found" instead of trying Vulkan.
+    runner_lines.append('    _odysseus_has_nvidia_hw() {')
+    runner_lines.append('      command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L 2>/dev/null | grep -q "GPU " && return 0')
+    runner_lines.append('      ls /dev/nvidia* >/dev/null 2>&1 && return 0')
+    runner_lines.append('      lspci 2>/dev/null | grep -iE \'VGA|3D|Display\' | grep -iq nvidia && return 0')
+    runner_lines.append('      return 1')
+    runner_lines.append('    }')
+    runner_lines.append('    if _odysseus_has_nvidia_hw; then')
+    runner_lines.append('      for _cudir in ~/.local/lib/python*/site-packages/nvidia/cu13 ~/.local/lib/python*/site-packages/nvidia/cu12 ~/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do')
+    runner_lines.append('        [ -x "$_cudir/bin/nvcc" ] && export CUDA_HOME="$_cudir" && export PATH="$_cudir/bin:$PATH" && break')
+    runner_lines.append('      done')
+    runner_lines.append('    fi')
    # rm -rf build so a prior poisoned CMakeCache.txt (e.g. from a failed CUDA
    # or HIP attempt) doesn't cause the next configure to reuse stale settings.
    runner_lines.append('    mkdir -p ~/bin')
-    runner_lines.append('    cd ~/llama.cpp && rm -rf build')
+    # Try to install cmake / build-essential / git automatically before the
+    # build, but ONLY via passwordless sudo (`sudo -n`) — interactive sudo
+    # would hang a tmux-backgrounded serve task waiting for a password. If
+    # sudo asks for a password the install is skipped silently and the
+    # diagnosis pattern (cookbook_routes.py / cookbook_helpers.py) surfaces
+    # an explicit "install cmake" suggestion in the Cookbook diagnosis
+    # toolbar after the inevitable build failure.
+    runner_lines.append('    _odysseus_apt_bootstrap() {')
+    runner_lines.append('      local _missing=""')
+    runner_lines.append('      command -v cmake >/dev/null 2>&1 || _missing="$_missing cmake"')
+    runner_lines.append('      command -v g++ >/dev/null 2>&1 || command -v gcc >/dev/null 2>&1 || _missing="$_missing build-essential"')
+    runner_lines.append('      command -v git >/dev/null 2>&1 || _missing="$_missing git"')
+    runner_lines.append('      [ -z "$_missing" ] && return 0')
+    runner_lines.append('      if command -v apt-get >/dev/null 2>&1 && sudo -n true 2>/dev/null; then')
+    runner_lines.append('        echo "[odysseus] Auto-installing missing build deps via apt:$_missing"')
+    runner_lines.append('        sudo -n env DEBIAN_FRONTEND=noninteractive apt-get update -qq 2>&1 | tail -3')
+    runner_lines.append('        sudo -n env DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends $_missing 2>&1 | tail -5 || true')
+    runner_lines.append('      elif command -v pacman >/dev/null 2>&1 && sudo -n true 2>/dev/null; then')
+    runner_lines.append('        echo "[odysseus] Auto-installing missing build deps via pacman:$_missing"')
+    runner_lines.append('        local _pacpkgs="$(echo "$_missing" | sed -e \'s/build-essential/base-devel/g\')"')
+    runner_lines.append('        sudo -n pacman -Sy --needed --noconfirm $_pacpkgs 2>&1 | tail -5 || true')
+    runner_lines.append('      elif command -v dnf >/dev/null 2>&1 && sudo -n true 2>/dev/null; then')
+    runner_lines.append('        echo "[odysseus] Auto-installing missing build deps via dnf:$_missing"')
+    runner_lines.append('        local _dnfpkgs="$(echo "$_missing" | sed -e \'s/build-essential/gcc gcc-c++ make/g\')"')
+    runner_lines.append('        sudo -n dnf install -y $_dnfpkgs 2>&1 | tail -5 || true')
+    runner_lines.append('      else')
+    runner_lines.append('        echo "[odysseus] WARNING: missing build deps ($_missing) — passwordless sudo is unavailable, cannot auto-install. Cookbook Diagnosis will explain the fix after the build fails."')
+    runner_lines.append('      fi')
+    runner_lines.append('    }')
+    runner_lines.append('    _odysseus_apt_bootstrap')
+    runner_lines.append('    _odysseus_missing_build_deps=""')
+    runner_lines.append('    command -v cmake >/dev/null 2>&1 || _odysseus_missing_build_deps="$_odysseus_missing_build_deps cmake"')
+    runner_lines.append('    command -v git >/dev/null 2>&1 || _odysseus_missing_build_deps="$_odysseus_missing_build_deps git"')
+    runner_lines.append('    command -v g++ >/dev/null 2>&1 || command -v gcc >/dev/null 2>&1 || _odysseus_missing_build_deps="$_odysseus_missing_build_deps build-essential"')
+    runner_lines.append('    if [ -n "$_odysseus_missing_build_deps" ]; then')
+    runner_lines.append('      echo "ERROR: llama.cpp source build needs missing packages:$_odysseus_missing_build_deps"')
+    runner_lines.append('      if command -v apt-get >/dev/null 2>&1; then')
+    runner_lines.append('        echo "Install on this host: sudo apt-get update && sudo apt-get install -y cmake build-essential git"')
+    runner_lines.append('      elif command -v pacman >/dev/null 2>&1; then')
+    runner_lines.append('        echo "Install on this host: sudo pacman -Sy --needed cmake base-devel git"')
+    runner_lines.append('      elif command -v dnf >/dev/null 2>&1; then')
+    runner_lines.append('        echo "Install on this host: sudo dnf install -y cmake gcc gcc-c++ make git"')
+    runner_lines.append('      fi')
+    runner_lines.append('      echo "Alternative: install a native llama-server on PATH, then relaunch."')
+    runner_lines.append('      ODYSSEUS_PREFLIGHT_EXIT=127')
+    runner_lines.append('    fi')
+    runner_lines.append('    cd ~/llama.cpp')
+    runner_lines.append('    _odysseus_has_vulkan() {')
+    runner_lines.append('      ldconfig -p 2>/dev/null | grep -q \'libvulkan\\.so\' && return 0')
+    runner_lines.append('      [ -e /usr/lib/libvulkan.so.1 ] && return 0')
+    runner_lines.append('      [ -e /usr/lib/x86_64-linux-gnu/libvulkan.so.1 ] && return 0')
+    runner_lines.append('      command -v vulkaninfo >/dev/null 2>&1 && return 0')
+    runner_lines.append('      return 1')
+    runner_lines.append('    }')
+    runner_lines.append('    _odysseus_has_vulkan_device() {')
+    runner_lines.append('      ls /dev/dri/renderD* >/dev/null 2>&1 && return 0')
+    runner_lines.append('      lspci 2>/dev/null | grep -Ei \'VGA|3D|Display\' | grep -Eiq \'AMD|ATI|Radeon\' && return 0')
+    runner_lines.append('      return 1')
+    runner_lines.append('    }')
+    # Backend preference: native ROCm/HIP > native CUDA > Vulkan > CPU.
+    # Vulkan is a portable fallback that works on AMD when ROCm isn't
+    # installed (e.g. Strix Halo) and on any vendor's discrete GPU, but
+    # it's ~30-40% slower than native HIP/CUDA for LLM inference — only
+    # pick it when no native toolchain is present.
    runner_lines.append('    if command -v hipconfig &>/dev/null || [ -d /opt/rocm ] || [ -n "$ROCM_PATH" ] || [ -n "$HIP_PATH" ]; then')
+    runner_lines.append('      rm -rf build')
    runner_lines.append('      if command -v hipconfig &>/dev/null; then')
    runner_lines.append('        export HIPCXX="${HIPCXX:-$(hipconfig -l)/clang}"')
    runner_lines.append('        export HIP_PATH="${HIP_PATH:-$(hipconfig -R)}"')
    runner_lines.append('      fi')
    runner_lines.append('      echo "[odysseus] ROCm/HIP detected — building llama-server with HIP support..."')
    runner_lines.append('      cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_HIP=ON && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
-    runner_lines.append('    elif command -v nvcc &>/dev/null; then')
+    runner_lines.append('    elif command -v nvcc &>/dev/null && _odysseus_has_nvidia_hw; then')
+    runner_lines.append('      rm -rf build')
    # nvcc alone is not sufficient — pip-installed CUDA wheels or incomplete
    # tooling can expose nvcc without shipping libcudart, causing cmake to fail
    # mid-build with "CUDA runtime library not found". Check cudart explicitly
@@ -826,31 +952,50 @@ def _append_llama_cpp_linux_accel_build_lines(runner_lines: list[str]) -> None:
    runner_lines.append('        echo "[odysseus]   Ensure libcudart is installed (e.g. cuda-runtime package) and visible via ldconfig or CUDA_HOME."')
    runner_lines.append('        cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
    runner_lines.append('      fi')
+    runner_lines.append('    elif _odysseus_has_vulkan_device && _odysseus_has_vulkan; then')
+    runner_lines.append('      echo "[odysseus] Vulkan-capable GPU detected (no ROCm/CUDA toolchain installed) — building llama-server with Vulkan support..."')
+    runner_lines.append('      rm -rf build-vulkan')
+    runner_lines.append('      cmake -B build-vulkan -DCMAKE_BUILD_TYPE=Release -DGGML_VULKAN=ON && cmake --build build-vulkan -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build-vulkan/bin/llama-server ~/bin/llama-server')
    runner_lines.append('    else')
-    runner_lines.append('      echo "[odysseus] WARNING: no HIP/CUDA toolchain found — building llama-server for CPU only."')
+    runner_lines.append('      echo "[odysseus] WARNING: no HIP/CUDA/Vulkan toolchain found — building llama-server for CPU only."')
    runner_lines.append('      echo "[odysseus]   GPU inference will not be available for this llama.cpp build."')
-    runner_lines.append('      echo "[odysseus]   Install ROCm for AMD GPUs or vLLM/CUDA tooling for NVIDIA, then re-launch this serve task."')
+    runner_lines.append('      echo "[odysseus]   Install Vulkan (libvulkan-dev) / ROCm for AMD GPUs or CUDA tooling for NVIDIA, then re-launch this serve task."')
+    runner_lines.append('      rm -rf build')
    runner_lines.append('      cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
    runner_lines.append('    fi')
+    runner_lines.append('  fi  # end _odysseus_have_prebuilt guard')


-def _llama_cpp_rebuild_cmd() -> str:
+def _llama_cpp_rebuild_cmd(update_source: bool = False) -> str:
    """Shell command that clears the Cookbook-managed llama.cpp build.

-    Removes the cached ``llama-server`` symlink and the ``~/llama.cpp/build``
+    Removes the cached ``llama-server`` symlink and the ``~/llama.cpp/build*``
    directory so the next llama.cpp serve recompiles from source, picking up a
    CUDA or HIP toolchain if one is now available. The serve bootstrap only
    builds when ``llama-server`` is missing from PATH, so without this an
-    existing CPU-only build is reused forever. It deliberately installs and
-    downloads nothing; the rebuild itself happens on the next serve.
+    existing CPU-only build is reused forever. When ``update_source`` is true,
+    the command also fast-forwards the Cookbook-managed ``~/llama.cpp`` checkout
+    if it exists, or shallow-clones it when there is no checkout and ``git`` is
+    on PATH. The rebuild itself happens on the next serve.
    """
+    # Optional shell prefix that refreshes the source tree before the build
+    # directories are removed. Every git failure is downgraded to a warning
+    # (`|| echo ...`), so the `if ... fi` exits 0 and the trailing `&&` still
+    # lets the cleanup commands below run.
+    update_cmd = ''
+    if update_source:
+        update_cmd = (
+            'if [ -d "$HOME/llama.cpp/.git" ]; then '
+            'git -C "$HOME/llama.cpp" pull --ff-only --depth 1 || '
+            'echo "[odysseus] WARNING: llama.cpp source update failed; clearing cached build anyway."; '
+            'elif command -v git >/dev/null 2>&1; then '
+            'git clone --depth 1 https://github.com/ggml-org/llama.cpp "$HOME/llama.cpp" || '
+            'echo "[odysseus] WARNING: llama.cpp clone failed; clearing cached build anyway."; '
+            'fi && '
+        )
+    # NOTE(review): only the symlink and the two source build directories are
+    # removed. The prebuilt-download path caches its archive under
+    # ~/.cache/odysseus/llama-cpp-prebuilt and writes
+    # ~/.config/odysseus-llama-cpp-env; neither is cleared here, so the next
+    # serve may reinstall the prebuilt binary rather than compile from source,
+    # and the env file keeps being sourced — confirm this is intended.
    return (
        'mkdir -p "$HOME/bin" && '
+        f'{update_cmd}'
        'rm -f "$HOME/bin/llama-server" && '
-        'rm -rf "$HOME/llama.cpp/build" && '
+        'rm -rf "$HOME/llama.cpp/build" "$HOME/llama.cpp/build-vulkan" && '
        'echo "[odysseus] Cleared the cached llama.cpp build. '
        'Re-launch the serve task to rebuild llama-server from source '
-        '(CUDA or HIP will be used if a toolchain is now available)."'
+        '(Vulkan, HIP, or CUDA will be used if a matching toolchain is now available)."'
    )


@@ -1113,8 +1258,27 @@ def _diagnose_serve_output(text: str) -> dict | None:
            "SGLang is not installed or not in PATH on this server.",
            [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}],
        ),
+        # System build deps come BEFORE the generic llama.cpp catch-all so
+        # cmake / build-essential / git missing → a specific OS-package
+        # remediation instead of "install llama-cpp-python[server]" (which
+        # itself fails to compile when cmake is absent).
        (
-            r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found",
+            r"cmake: command not found|cmake.*not found.*[Cc]ould not",
+            "cmake is required to build llama.cpp from source but isn't installed on this server.",
+            [{"label": "install build deps for llama.cpp (apt: cmake build-essential git / pacman: cmake base-devel git / dnf: cmake gcc-c++ make git / brew: cmake git)", "op": "dependency", "package": "llama-cpp-python[server]"}],
+        ),
+        (
+            r"^(make|g\+\+|gcc): command not found|Could not find C\+\+ compiler",
+            "A C/C++ compiler (build-essential) is required to build llama.cpp from source.",
+            [{"label": "install build deps for llama.cpp on this server", "op": "dependency", "package": "llama-cpp-python[server]"}],
+        ),
+        (
+            r"^git: command not found",
+            "git is required to clone the llama.cpp source tree.",
+            [{"label": "install build deps for llama.cpp on this server", "op": "dependency", "package": "llama-cpp-python[server]"}],
+        ),
+        (
+            r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'",
            "llama.cpp / llama-cpp-python dependencies are missing.",
            [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}],
        ),
@@ -189,8 +189,27 @@ def setup_cookbook_routes() -> APIRouter:
                "SGLang is not installed or not in PATH on this server.",
                [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}],
            ),
+            # System build deps come BEFORE the generic llama.cpp catch-all
+            # so cmake / build-essential / git missing → a specific OS-package
+            # remediation instead of "install llama-cpp-python[server]" (which
+            # itself fails to compile when cmake is absent).
            (
-                r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found",
+                r"cmake: command not found|cmake.*not found.*[Cc]ould not",
+                "cmake is required to build llama.cpp from source but isn't installed on this server.",
+                [{"label": "install build deps for llama.cpp (apt: cmake build-essential git / pacman: cmake base-devel git / dnf: cmake gcc-c++ make git / brew: cmake git)", "op": "dependency", "package": "llama-cpp-python[server]"}],
+            ),
+            (
+                r"^(make|g\+\+|gcc): command not found|Could not find C\+\+ compiler",
+                "A C/C++ compiler (build-essential) is required to build llama.cpp from source.",
+                [{"label": "install build deps for llama.cpp on this server", "op": "dependency", "package": "llama-cpp-python[server]"}],
+            ),
+            (
+                r"^git: command not found",
+                "git is required to clone the llama.cpp source tree.",
+                [{"label": "install build deps for llama.cpp on this server", "op": "dependency", "package": "llama-cpp-python[server]"}],
+            ),
+            (
+                r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'",
                "llama.cpp / llama-cpp-python dependencies are missing.",
                [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}],
            ),
@@ -254,6 +273,79 @@ def setup_cookbook_routes() -> APIRouter:
    def _load_stored_hf_token() -> str:
+        """Return the result of ``load_stored_hf_token`` called with ``_cookbook_state_path`` as ``state_path``."""
        return load_stored_hf_token(state_path=_cookbook_state_path)

+    def _normalize_minimax_m3_vllm_cmd(cmd: str) -> str:
+        """Patch MiniMax M3 vLLM launches into the known-good local form.
+
+        The browser form can be stale or omit advanced-only fields. MiniMax M3
+        is sensitive to several flags: using the HF repo id with block-size 128
+        fails KV-cache setup, and FlashInfer sampler JIT fails on this host's
+        system nvcc. Normalize server-side before writing the tmux runner.
+
+        Args:
+            cmd: The serve command line as submitted; may be empty.
+
+        Returns:
+            ``cmd`` unchanged when it is empty, does not mention ``vllm serve``,
+            ``minimax`` and ``m3`` (case-insensitive), cannot be tokenized by
+            ``shlex``, or has no model argument after ``serve``. Otherwise the
+            command re-joined with ``shlex.join`` after the env defaults and
+            flag overrides below have been applied.
+        """
+        cmd_lower = (cmd or "").lower()
+        if not cmd or "vllm serve" not in cmd_lower or "minimax" not in cmd_lower or "m3" not in cmd_lower:
+            return cmd
+        try:
+            parts = shlex.split(cmd)
+        except ValueError:
+            return cmd
+
+        # Only the LEADING run of KEY=value tokens is an environment prefix.
+        # A KEY=value token later in the command is an argument value (e.g.
+        # `--limit-mm-per-prompt image=2`) and must stay next to its flag;
+        # hoisting it to the front would both drop the flag's value and export
+        # a bogus environment variable.
+        env_re = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*=")
+        env_end = 0
+        while env_end < len(parts) and env_re.match(parts[env_end]):
+            env_end += 1
+        env_parts = parts[:env_end]
+        body = parts[env_end:]
+        try:
+            serve_i = body.index("serve")
+        except ValueError:
+            return cmd
+        if serve_i + 1 >= len(body):
+            return cmd
+
+        repo_id = "cyankiwi/MiniMax-M3-AWQ-INT4"
+        # NOTE(review): this snapshot path is specific to one user's home
+        # directory and one snapshot revision. On any other host (including
+        # remote serve targets) it will presumably not exist — consider
+        # resolving it from the HF cache instead of hard-coding it.
+        snapshot = (
+            "/home/pewds/.cache/huggingface/hub/"
+            "models--cyankiwi--MiniMax-M3-AWQ-INT4/"
+            "snapshots/4082acbbec1236d21828d55b6bb0fe02ade4ab5b"
+        )
+        if body[serve_i + 1] == repo_id:
+            body[serve_i + 1] = snapshot
+
+        def add_env(key: str, value: str) -> None:
+            """Add ``key=value`` to the env prefix unless ``key`` is already set there."""
+            if not any(p.startswith(f"{key}=") for p in env_parts):
+                env_parts.append(f"{key}={value}")
+
+        def has_flag(flag: str) -> bool:
+            """Return True when ``flag`` appears bare or as ``flag=value``."""
+            return any(p == flag or p.startswith(flag + "=") for p in body)
+
+        def set_flag(flag: str, value: str) -> None:
+            """Force ``flag`` to ``value``, appending the pair when it is absent."""
+            for i, part in enumerate(body):
+                if part == flag:
+                    # Overwrite the existing value, but never clobber a
+                    # following option when the flag was given without one.
+                    if i + 1 < len(body) and not body[i + 1].startswith("-"):
+                        body[i + 1] = value
+                    else:
+                        body.insert(i + 1, value)
+                    return
+                if part.startswith(flag + "="):
+                    body[i] = f"{flag}={value}"
+                    return
+            body.extend([flag, value])
+
+        def add_bool(flag: str) -> None:
+            """Append the valueless switch ``flag`` unless it is already present."""
+            if not has_flag(flag):
+                body.append(flag)
+
+        # Env vars are defaults (an explicit leading assignment wins); the
+        # value-taking flags are always overridden.
+        add_env("VLLM_TARGET_DEVICE", "cuda")
+        add_env("VLLM_USE_FLASHINFER_SAMPLER", "0")
+        set_flag("--served-model-name", repo_id)
+        set_flag("--tool-call-parser", "minimax_m3")
+        set_flag("--reasoning-parser", "minimax_m3")
+        set_flag("--attention-backend", "TRITON_ATTN")
+        set_flag("--block-size", "128")
+        add_bool("--language-model-only")
+        add_bool("--disable-custom-all-reduce")
+        add_bool("--enable-expert-parallel")
+        return shlex.join(env_parts + body)
+
    def _cookbook_ssh_dir() -> Path:
        # The Docker image keeps cookbook keys under /app/.ssh; that path only
        # exists inside the container. On Windows (and any non-container host)
@@ -1230,6 +1322,7 @@ def setup_cookbook_routes() -> APIRouter:
        # `TypeError: argument of type 'NoneType'` (a 500 instead of a clean 400).
        req.cmd = _validate_serve_cmd(req.cmd) or ""
        req.cmd = _normalize_llama_cpp_python_cache_types(req.cmd) or ""
+        req.cmd = _normalize_minimax_m3_vllm_cmd(req.cmd)
        req.cmd = _venv_safe_local_pip_install_cmd(
            req.cmd,
            local=not bool(req.remote_host),
@@ -1243,8 +1336,16 @@ def setup_cookbook_routes() -> APIRouter:
            req.cmd = _pip_install_no_cache(req.cmd)
            # Accept common aliases and enforce server extras for llama-cpp so
            # `python -m llama_cpp.server` has all runtime dependencies.
-            req.cmd = re.sub(r"(?<![A-Za-z0-9_.-])llama_cpp(?![A-Za-z0-9_.-])", "llama-cpp-python[server]", req.cmd)
-            req.cmd = re.sub(r"(?<![A-Za-z0-9_.-])llama-cpp-python(?!\[)", "llama-cpp-python[server]", req.cmd)
+            # CRITICAL: the lookbehind / lookahead must also exclude `/` so
+            # the regex DOESN'T mangle a URL path like
+            #   https://abetlen.github.io/llama-cpp-python/whl/cu124
+            # The previous regex turned that URL into
+            #   https://abetlen.github.io/llama-cpp-python[server]/whl/cu124
+            # which pip then couldn't resolve → silent fallback to source
+            # build of the .tar.gz → CPU-only binary (because CMAKE_ARGS
+            # isn't set), defeating the entire purpose of the CUDA index.
+            req.cmd = re.sub(r"(?<![A-Za-z0-9_.\-/])llama_cpp(?![A-Za-z0-9_.\-/])", "llama-cpp-python[server]", req.cmd)
+            req.cmd = re.sub(r"(?<![A-Za-z0-9_.\-/])llama-cpp-python(?![\[/])", "llama-cpp-python[server]", req.cmd)
            if "llama-cpp-python" in req.cmd and "--extra-index-url" not in req.cmd:
                req.cmd += " --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu"
            # PEP-508-style package spec — letters, digits, `.-_` for the
@@ -1284,6 +1385,11 @@ def setup_cookbook_routes() -> APIRouter:
        # LOCAL execution on a native-Windows host never uses tmux (detached
        # process path below), regardless of the UI-supplied platform.
        local_windows = IS_WINDOWS and not remote
+        if is_windows and remote and "diffusion_server.py" in req.cmd:
+            raise HTTPException(
+                400,
+                "Remote Windows Diffusers serving is not supported yet; use local Windows or a Linux remote server.",
+            )

        if not is_windows and not local_windows and not await _binary_available("tmux", remote, req.ssh_port):
            return {
@@ -1426,6 +1532,69 @@ def setup_cookbook_routes() -> APIRouter:
                runner_lines.append('  else')
                _append_llama_cpp_linux_accel_build_lines(runner_lines)
                runner_lines.append('  fi')
+                # Source the env file the prebuilt-download path writes so
+                # LD_LIBRARY_PATH includes the directory holding libllama.so
+                # and friends. No-op when prebuilt wasn't used.
+                runner_lines.append('  [ -r ~/.config/odysseus-llama-cpp-env ] && . ~/.config/odysseus-llama-cpp-env')
+                # Auto-upgrade pip llama-cpp-python to the CUDA-enabled
+                # wheel when (a) NVIDIA hardware is present and (b) the
+                # currently-installed wheel is CPU-only. Without this the
+                # user gets the Python server happily running at 3 tok/s
+                # because pip's default index ships CPU-only wheels.
+                # Forward-compat: cu124 wheels work on driver/runtime
+                # 12.4+ including the cu13.x line.
+                runner_lines.append('  if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L 2>/dev/null | grep -q "GPU " && python3 -c "import llama_cpp" 2>/dev/null; then')
+                runner_lines.append('    if ! python3 -c "import llama_cpp; import sys; sys.exit(0 if llama_cpp.llama_supports_gpu_offload() else 1)" 2>/dev/null; then')
+                runner_lines.append('      echo "[odysseus] NVIDIA detected but installed llama-cpp-python is CPU-only — reinstalling with CUDA wheel index for GPU offload..."')
+                runner_lines.append('      python3 -m pip install --user --break-system-packages --force-reinstall --no-cache-dir "llama-cpp-python[server]" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124 2>&1 | tail -8 || echo "[odysseus] WARNING: CUDA wheel reinstall failed — Python server will stay CPU-only (slow). Manual fix: pip install --user --force-reinstall \'llama-cpp-python[server]\' --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124"')
+                runner_lines.append('      if python3 -c "import llama_cpp; import sys; sys.exit(0 if llama_cpp.llama_supports_gpu_offload() else 1)" 2>/dev/null; then')
+                runner_lines.append('        echo "[odysseus] llama-cpp-python now supports GPU offload."')
+                runner_lines.append('      fi')
+                runner_lines.append('    fi')
+                runner_lines.append('  fi')
+                # SHORT-CIRCUIT before the build/pip fallback: if the
+                # native binary is missing but llama_cpp Python is already
+                # installed, drop a wrapper at ~/bin/llama-server that
+                # translates llama-server CLI args to llama_cpp.server's
+                # underscore-style flags. The user's serve command stays
+                # `llama-server ...` and "just works" — no build, no cmake,
+                # no second install. This is the path that unblocks every
+                # remote where pip-installed llama-cpp-python is already
+                # working but Cookbook used to insist on a native binary.
+                runner_lines.append('  if ! command -v llama-server >/dev/null 2>&1 && python3 -c "import llama_cpp" 2>/dev/null; then')
+                runner_lines.append('    mkdir -p ~/bin')
+                runner_lines.append('    cat > ~/bin/llama-server <<\'_ODY_LLAMA_SHIM_EOF\'')
+                runner_lines.append('#!/usr/bin/env bash')
+                runner_lines.append('# Auto-generated by Odysseus Cookbook: a `llama-server` lookalike')
+                runner_lines.append('# that translates the native CLI to `python -m llama_cpp.server`.')
+                runner_lines.append('# Lets cookbook-generated launch commands run unchanged on hosts')
+                runner_lines.append('# where only the pip llama-cpp-python package is installed.')
+                runner_lines.append('ARGS=()')
+                runner_lines.append('while [ $# -gt 0 ]; do')
+                runner_lines.append('  case "$1" in')
+                runner_lines.append('    -ngl|--gpu-layers|--n-gpu-layers) ARGS+=(--n_gpu_layers "$2"); shift 2 ;;')
+                runner_lines.append('    -c|--ctx-size) ARGS+=(--n_ctx "$2"); shift 2 ;;')
+                runner_lines.append('    -b|--batch-size) ARGS+=(--n_batch "$2"); shift 2 ;;')
+                runner_lines.append('    -ub|--ubatch-size) shift 2 ;;  # llama-cpp-python has no separate ubatch')
+                runner_lines.append('    --flash-attn) ARGS+=(--flash_attn true); shift 2 ;;')
+                runner_lines.append('    --cache-type-k) ARGS+=(--type_k "$2"); shift 2 ;;')
+                runner_lines.append('    --cache-type-v) ARGS+=(--type_v "$2"); shift 2 ;;')
+                runner_lines.append('    --n-cpu-moe) ARGS+=(--n_cpu_moe "$2"); shift 2 ;;')
+                runner_lines.append('    --mmproj) ARGS+=(--clip_model_path "$2"); shift 2 ;;')
+                runner_lines.append('    --image-max-tokens) shift 2 ;;  # native-only')
+                runner_lines.append('    --no-mmap) ARGS+=(--no_mmap true); shift ;;')
+                runner_lines.append('    --no-warmup) shift ;;  # native-only')
+                runner_lines.append('    --chat-template) ARGS+=(--chat_format "$2"); shift 2 ;;')
+                runner_lines.append('    --fit|--split-mode|--tensor-split|--main-gpu|--parallel) shift 2 ;;  # native-only')
+                runner_lines.append('    --mlock) ARGS+=(--use_mlock true); shift ;;')
+                runner_lines.append('    *) ARGS+=("$1"); shift ;;')
+                runner_lines.append('  esac')
+                runner_lines.append('done')
+                runner_lines.append('exec python3 -m llama_cpp.server "${ARGS[@]}"')
+                runner_lines.append('_ODY_LLAMA_SHIM_EOF')
+                runner_lines.append('    chmod +x ~/bin/llama-server')
+                runner_lines.append('    echo "[odysseus] Created llama-server shim → python -m llama_cpp.server (no native binary needed)"')
+                runner_lines.append('  fi')
                runner_lines.append('  # If the native build failed, fall back to the Python bindings.')
                runner_lines.append('  if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
                runner_lines.append('    echo "llama-server build failed — installing Python bindings as fallback..."')
@@ -1489,6 +1658,96 @@ def setup_cookbook_routes() -> APIRouter:
                runner_lines.append('  echo "ERROR: vLLM is not installed."')
                runner_lines.append('  ODYSSEUS_PREFLIGHT_EXIT=127')
                runner_lines.append('fi')
+                runner_lines.append(f"ODYSSEUS_SERVE_CMD='{_bash_squote(req.cmd)}'")
+                runner_lines.append('if [ -z "$ODYSSEUS_PREFLIGHT_EXIT" ]; then')
+                runner_lines.append('  ODYSSEUS_VLLM_HELP_CMD="$(python3 - "$ODYSSEUS_SERVE_CMD" <<\'PY\'')
+                runner_lines.append('import shlex, sys')
+                runner_lines.append('parts = shlex.split(sys.argv[1])')
+                runner_lines.append('try:')
+                runner_lines.append('    serve_i = parts.index("serve")')
+                runner_lines.append('except ValueError:')
+                runner_lines.append('    print("vllm serve --help")')
+                runner_lines.append('else:')
+                runner_lines.append('    print(shlex.join(parts[:serve_i + 1] + ["--help"]))')
+                runner_lines.append('PY')
+                runner_lines.append(')"')
+                runner_lines.append('  ODYSSEUS_VLLM_SUPPORTS_SWAP=0')
+                runner_lines.append('  if eval "$ODYSSEUS_VLLM_HELP_CMD" 2>&1 | grep -q -- "--swap-space"; then ODYSSEUS_VLLM_SUPPORTS_SWAP=1; fi')
+                runner_lines.append('fi')
+                runner_lines.append('if [ -z "$ODYSSEUS_PREFLIGHT_EXIT" ] && [ "${ODYSSEUS_VLLM_SUPPORTS_SWAP:-0}" = "1" ] && ! printf "%s" "$ODYSSEUS_SERVE_CMD" | grep -q -- "--swap-space"; then')
+                runner_lines.append('  echo "[odysseus] Setting vLLM --swap-space 0 so the runtime does not reserve CPU swap per GPU."')
+                runner_lines.append('  ODYSSEUS_SERVE_CMD="${ODYSSEUS_SERVE_CMD} --swap-space 0"')
+                runner_lines.append('fi')
+                runner_lines.append('if [ -z "$ODYSSEUS_PREFLIGHT_EXIT" ] && [ "${ODYSSEUS_VLLM_SUPPORTS_SWAP:-0}" != "1" ]; then')
+                runner_lines.append('  if printf "%s" "$ODYSSEUS_SERVE_CMD" | grep -q -- "--swap-space"; then')
+                runner_lines.append('    echo "[odysseus] vLLM serve does not expose --swap-space; removing the flag and patching the runtime default to 0."')
+                runner_lines.append('    ODYSSEUS_SERVE_CMD="$(python3 - "$ODYSSEUS_SERVE_CMD" <<\'PY\'')
+                runner_lines.append('import shlex, sys')
+                runner_lines.append('parts = shlex.split(sys.argv[1])')
+                runner_lines.append('out = []')
+                runner_lines.append('skip = False')
+                runner_lines.append('for part in parts:')
+                runner_lines.append('    if skip:')
+                runner_lines.append('        skip = False')
+                runner_lines.append('        continue')
+                runner_lines.append('    if part == "--swap-space":')
+                runner_lines.append('        skip = True')
+                runner_lines.append('        continue')
+                runner_lines.append('    if part.startswith("--swap-space="):')
+                runner_lines.append('        continue')
+                runner_lines.append('    out.append(part)')
+                runner_lines.append('print(shlex.join(out))')
+                runner_lines.append('PY')
+                runner_lines.append(')"')
+                runner_lines.append('  fi')
+                runner_lines.append('  ODYSSEUS_SERVE_CMD="$(python3 - "$ODYSSEUS_SERVE_CMD" <<\'PY\'')
+                runner_lines.append('import shlex, sys')
+                runner_lines.append('parts = shlex.split(sys.argv[1])')
+                runner_lines.append('patch = r"""import inspect, sys')
+                runner_lines.append('from vllm.engine.arg_utils import EngineArgs, AsyncEngineArgs')
+                runner_lines.append('def _odysseus_swap0(cls):')
+                runner_lines.append('    params = list(inspect.signature(cls).parameters)')
+                runner_lines.append('    if "swap_space" not in params:')
+                runner_lines.append('        return')
+                runner_lines.append('    idx = params.index("swap_space")')
+                runner_lines.append('    defaults = list(cls.__init__.__defaults__ or ())')
+                runner_lines.append('    if idx < len(defaults):')
+                runner_lines.append('        defaults[idx] = 0')
+                runner_lines.append('        cls.__init__.__defaults__ = tuple(defaults)')
+                runner_lines.append('    fields = getattr(cls, "__dataclass_fields__", {})')
+                runner_lines.append('    if "swap_space" in fields:')
+                runner_lines.append('        fields["swap_space"].default = 0')
+                runner_lines.append('_odysseus_swap0(EngineArgs)')
+                runner_lines.append('_odysseus_swap0(AsyncEngineArgs)')
+                runner_lines.append('try:')
+                runner_lines.append('    from vllm.config import CacheConfig')
+                runner_lines.append('    CacheConfig.swap_space = 0')
+                runner_lines.append('except Exception:')
+                runner_lines.append('    pass')
+                runner_lines.append('_orig_create_engine_config = EngineArgs.create_engine_config')
+                runner_lines.append('def _odysseus_create_engine_config(self, *args, **kwargs):')
+                runner_lines.append('    self.swap_space = 0')
+                runner_lines.append('    return _orig_create_engine_config(self, *args, **kwargs)')
+                runner_lines.append('EngineArgs.create_engine_config = _odysseus_create_engine_config')
+                runner_lines.append('AsyncEngineArgs.create_engine_config = _odysseus_create_engine_config')
+                runner_lines.append('from vllm.entrypoints.cli.main import main')
+                runner_lines.append('sys.exit(main())"""')
+                runner_lines.append('try:')
+                runner_lines.append('    serve_i = parts.index("serve")')
+                runner_lines.append('except ValueError:')
+                runner_lines.append('    print(shlex.join(parts))')
+                runner_lines.append('else:')
+                runner_lines.append('    exe_i = serve_i - 1')
+                runner_lines.append('    exe = parts[exe_i] if exe_i >= 0 else "vllm"')
+                runner_lines.append('    py = "python3"')
+                runner_lines.append('    if exe.endswith("/bin/vllm"):')
+                runner_lines.append('        py = exe[:-len("/bin/vllm")] + "/bin/python"')
+                runner_lines.append('    parts[exe_i:serve_i] = [py, "-c", patch]')
+                runner_lines.append('    print(shlex.join(parts))')
+                runner_lines.append('PY')
+                runner_lines.append(')"')
+                runner_lines.append('  echo "[odysseus] Patched vLLM internal swap_space default to 0 for this runtime."')
+                runner_lines.append('fi')
            elif "sglang.launch_server" in req.cmd:
                runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
                runner_lines.append('if ! command -v sglang &>/dev/null; then')
@@ -1530,7 +1789,10 @@ def setup_cookbook_routes() -> APIRouter:
                    runner_lines,
                    keep_shell_open=not local_windows,
                )
-                runner_lines.append(req.cmd)
+                if "vllm serve" in req.cmd:
+                    runner_lines.append('eval "$ODYSSEUS_SERVE_CMD"')
+                else:
+                    runner_lines.append(req.cmd)
                if local_windows:
                    # Detached background process — no interactive shell to keep open.
                    # Print the exit marker the status poller looks for, then stop.
@@ -1834,6 +2096,25 @@ def setup_cookbook_routes() -> APIRouter:
        out, err = await _run_gpu_shell("ls -1 /sys/class/drm 2>/dev/null", host, ssh_port, timeout=4)
        if err is not None or not out:
            return []
+        # Pick the runtime label up-front so each GPU dict gets the
+        # right `backend`. AMD silicon can be driven by ROCm/HIP (native)
+        # OR Vulkan (mesa RADV). Reporting "rocm" on a host where no
+        # ROCm toolchain is installed misleads the frontend env-var
+        # prefix logic — it would emit `HIP_VISIBLE_DEVICES=` for a
+        # Vulkan-only stack, which is a silent no-op at best.
+        rt_out, _ = await _run_gpu_shell(
+            'command -v rocminfo >/dev/null 2>&1 && echo rocm '
+            '|| (command -v hipconfig >/dev/null 2>&1 && echo rocm) '
+            '|| (command -v vulkaninfo >/dev/null 2>&1 && echo vulkan) '
+            '|| echo unknown',
+            host, ssh_port, timeout=4,
+        )
+        _amd_runtime = (rt_out or "").strip().splitlines()[-1:][0].strip() if rt_out else "rocm"
+        if _amd_runtime not in ("rocm", "vulkan"):
+            # Default to rocm so existing ROCm-installed hosts keep
+            # working; "unknown" only happens when neither toolchain is
+            # detected (e.g. minimal sysfs read on a fresh box).
+            _amd_runtime = "rocm"
        gpus = []
        for entry in out.split():
            if not entry.startswith("card") or "-" in entry:
@@ -1877,7 +2158,7 @@ def setup_cookbook_routes() -> APIRouter:
                "free_mb": free_mb, "total_mb": total_mb, "used_mb": used_mb,
                "gtt_used_mb": gtt_used_mb,
                "util_pct": 0, "busy": bool(total_mb and (free_mb / total_mb) < 0.85),
-                "processes": [], "backend": "rocm", "source": "amd-sysfs",
+                "processes": [], "backend": _amd_runtime, "source": "amd-sysfs",
                "unified_memory": unified,
            })
        if gpus:
@@ -2018,10 +2299,15 @@ def setup_cookbook_routes() -> APIRouter:

        amd_gpus = await _probe_amd_sysfs(host, ssh_port)
        if amd_gpus:
+            # The per-GPU dict already carries the runtime label picked by
+            # _probe_amd_sysfs (rocm vs vulkan); mirror that into the
+            # wrapper so the frontend can read `data.backend` directly
+            # without scanning the list.
+            _amd_wrap_backend = str(amd_gpus[0].get("backend") or "rocm")
            return {
                "ok": True,
                "gpus": amd_gpus,
-                "backend": "rocm",
+                "backend": _amd_wrap_backend,
                "source": "amd-sysfs",
                "fallback_from": "nvidia-smi",
                "nvidia_error": nvidia_error,
@@ -2161,6 +2447,17 @@ def setup_cookbook_routes() -> APIRouter:

            disk_tasks = on_disk.get("tasks") or [] if isinstance(on_disk, dict) else []
            incoming_tasks = data.get("tasks") if isinstance(data.get("tasks"), list) else []
+            incoming_removed = data.get("removedTasks") if isinstance(data.get("removedTasks"), dict) else {}
+            disk_removed = on_disk.get("removedTasks") if isinstance(on_disk, dict) and isinstance(on_disk.get("removedTasks"), dict) else {}
+            removed_tasks = {**disk_removed, **incoming_removed}
+            data["removedTasks"] = removed_tasks
+            removed_ids = set(removed_tasks.keys())
+            if removed_ids:
+                incoming_tasks = [
+                    t for t in incoming_tasks
+                    if not (isinstance(t, dict) and t.get("sessionId") in removed_ids)
+                ]
+                data["tasks"] = incoming_tasks
            # Anti-poisoning guard: a stale browser tab can keep POSTing a
            # download task as status='done' from before the strict-finish
            # fix landed, undoing any server-side correction. For each
@@ -2198,6 +2495,8 @@ def setup_cookbook_routes() -> APIRouter:
                sid = t.get("sessionId")
                if not sid or sid in incoming_ids:
                    continue  # client's version wins
+                if sid in removed_ids:
+                    continue  # intentional cross-device clear/remove
                ts = t.get("ts") or 0
                if isinstance(ts, (int, float)) and (now_ms - ts) <= RACE_WINDOW_MS:
                    preserved.append(t)
@@ -2304,16 +2603,14 @@ def setup_cookbook_routes() -> APIRouter:
            # Add 30% headroom for KV cache, activations, etc.
            needed_vram = (est_vram * 1.3) if est_vram else None

-            if vram_gb > 0 and needed_vram is not None and needed_vram > vram_gb:
-                continue
-            # Unknown-size models (e.g. MiniMax-M2.7, DeepSeek-V4-Flash) have no
-            # "NB" in the repo id, so the regex above can't extract their
-            # param count. Previously we dropped them entirely, which made
-            # brand-new flagship releases silently vanish from this list even
-            # on rigs with hundreds of GB of VRAM. Adapters/LoRAs are already
-            # filtered by _is_excluded(), so what falls through here is
-            # overwhelmingly full models — keep them, just without a size
-            # badge (the frontend handles needed_vram_gb=null gracefully).
+            if vram_gb > 0:
+                if needed_vram is None:
+                    # The "trending models that fit" list must be conservative:
+                    # if we cannot estimate size from the repo id/tags, do not
+                    # present it as runnable on this hardware.
+                    continue
+                if needed_vram > vram_gb:
+                    continue

            out.append({
                "repo_id": repo_id,
@@ -2510,6 +2807,33 @@ def setup_cookbook_routes() -> APIRouter:
            except Exception as e:
                logger.warning(f"orphan sweep: state write failed: {e}")

+    @router.get("/api/cookbook/hf-gguf-files")
+    async def hf_gguf_files(repo_id: str, owner: str = Depends(require_user)):
+        """List the `.gguf` filenames in a HuggingFace repo for the direct-download picker; returns {"ok", "files", ...}."""
+        import httpx  # imported at call time rather than at module import
+
+        repo_id = _validate_repo_id(repo_id)
+        url = f"https://huggingface.co/api/models/{repo_id}"
+        try:
+            headers = {}
+            token = _load_stored_hf_token()
+            if token:  # send the stored token as a Bearer credential when one exists
+                headers["Authorization"] = f"Bearer {token}"
+            async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
+                resp = await client.get(url, headers=headers)
+                if resp.status_code != 200:
+                    return {"ok": False, "files": [], "error": f"HF API HTTP {resp.status_code}"}
+                data = resp.json()
+        except Exception:  # request/JSON failures are logged and reported as a generic error
+            logger.exception("HF GGUF file scan failed for %s", repo_id)
+            return {"ok": False, "files": [], "error": "HF API request failed"}
+        files = [
+            str(s.get("rfilename") or "")
+            for s in data.get("siblings", [])
+            if str(s.get("rfilename") or "").lower().endswith(".gguf")  # case-insensitive suffix match
+        ]
+        return {"ok": True, "repo_id": repo_id, "files": files}
+
    # In-memory cache for the Ollama library scrape. ollama.com is a public
    # site, but it doesn't expose a stable JSON listing — we fetch the HTML
    # search page and regex out the model cards. Cached for 1 h so a busy
@@ -12,6 +12,7 @@ from pydantic import BaseModel

 from core.database import Document, DocumentVersion
 from core.database import Session as DbSession
+from src.auth_helpers import _auth_disabled
 from src.upload_handler import UploadHandler

 logger = logging.getLogger(__name__)
@@ -78,6 +79,8 @@ def _verify_doc_owner(db, doc: Document, user: str):
    the session join for any not-yet-backfilled legacy row.
    """
    if user is None:
+        if _auth_disabled():
+            return  # Single-user / no-auth mode: allow access
        raise HTTPException(403, "Authentication required")
    if doc.owner is not None:
        if doc.owner != user:
@@ -102,8 +105,10 @@ def _owner_session_filter(q, user):

    The owner backfill runs in init_db before the app serves requests, so
    by the time this filter is live there are no NULL-owner rows to leak;
-    we therefore match the owner strictly."""
-    if user is None:
+    we therefore match the owner strictly for authenticated callers."""
+    if not user:
+        if user == "" or _auth_disabled():
+            return q
        return q.filter(False)
    return q.filter(Document.owner == user)

@@ -10,7 +10,7 @@ from fastapi import APIRouter, HTTPException, Query, Request, UploadFile, File,
 from sqlalchemy import case, func, or_
 from core.database import SessionLocal, Document, DocumentVersion
 from core.database import Session as DbSession
-from src.auth_helpers import get_current_user
+from src.auth_helpers import get_current_user, _auth_disabled
 from src.constants import MAIL_ATTACHMENTS_DIR

 logger = logging.getLogger(__name__)
@@ -388,7 +388,8 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
        db = SessionLocal()
        try:
            if not user:
-                raise HTTPException(403, "Authentication required")
+                if not _auth_disabled():
+                    raise HTTPException(403, "Authentication required")
            # v2 review HIGH-9: raise 403 explicitly when the caller
            # can't see this session, instead of returning [] which the
            # UI treats identically to "no docs" and silently masks
@@ -503,7 +504,8 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
        user = get_current_user(request)
        try:
            data = await request.json()
-        except Exception:
+        except Exception as e:
+            logger.warning("Failed to parse export request body, defaulting to empty", exc_info=e)
            data = {}
        ids = data.get("ids") or []
        if not ids:
@@ -645,8 +647,8 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
                    try:
                        from src.agent_tools.document_tools import clear_active_document
                        clear_active_document(doc_id)
-                    except Exception:
-                        pass
+                    except Exception as e:
+                        logger.warning("Failed to clear active document %r on detach", doc_id, exc_info=e)
            db.commit()
            db.refresh(doc)
            return _doc_to_dict(doc)
@@ -1331,6 +1333,12 @@ def setup_document_routes(session_manager, upload_handler=None) -> APIRouter:
            if not pdf_path:
                raise HTTPException(404, f"Source PDF {upload_id} not found")

+            # Fail fast with a clear 503 if the optional PyMuPDF dependency
+            # is missing — fill_fields/stamp_annotations will otherwise
+            # raise RuntimeError deep inside and bubble out as a 500.
+            # Mirrors the convention in _load_pdf_viewer_fitz above.
+            _load_pdf_viewer_fitz()
+
            values = parse_markdown_to_values(doc.current_content or "")
            out_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
            _to_unlink.append(out_path)
@@ -1233,22 +1233,30 @@ def _list_attachments_from_msg(msg):
        return attachments
    idx = 0
    for part in msg.walk():
-        if part.is_multipart():
-            continue
        cd = str(part.get("Content-Disposition", ""))
        ct = part.get_content_type()
+        is_attached_email = ct == "message/rfc822" and ("attachment" in cd.lower() or part.get_filename())
+        if part.is_multipart() and not is_attached_email:
+            continue
        # Skip text/html body parts (only consider real attachments)
        if ct in ("text/plain", "text/html") and "attachment" not in cd:
            continue
        filename = part.get_filename()
        if filename:
            filename = _decode_header(filename)
+            if ct == "message/rfc822" and not re.search(r"\.[A-Za-z0-9]{1,8}$", filename):
+                filename = f"{filename}.eml"
        else:
            # Inline images, etc. - generate a name
-            ext = ct.split("/")[-1] if "/" in ct else "bin"
+            ext = "eml" if ct == "message/rfc822" else (ct.split("/")[-1] if "/" in ct else "bin")
            filename = f"attachment_{idx}.{ext}"
        payload = part.get_payload(decode=True)
-        size = len(payload) if payload else 0
+        if payload is None and ct == "message/rfc822":
+            try:
+                payload = part.as_bytes()
+            except Exception:
+                payload = b""
+        size = len(payload) if payload is not None else 0
        attachments.append({
            "index": idx,
            "filename": filename,
@@ -1260,29 +1268,58 @@ def _list_attachments_from_msg(msg):
    return attachments


+def _is_likely_signature_image_attachment(att: dict) -> bool:
+    """Heuristic: True when an attachment dict looks like an inline signature/logo image (by filename or small size)."""
+    info = att or {}
+    name = str(info.get("filename") or "").lower()
+    if re.search(r"\.(png|jpe?g|gif|bmp|svg|webp)$", name) is None:
+        return False  # not an image extension at all
+    byte_count = int(info.get("size") or 0)  # converted before the name checks so a bad size still raises
+    auto_named = re.search(r"^image\d{3,}\.(png|jpe?g|gif)$", name)
+    branded = re.search(r"^(signature|logo|sig|footer|banner)[-_\d]*\.(png|jpe?g|gif|svg)$", name)
+    # Otherwise fall back to size: non-empty images under 30 KiB count as signatures.
+    return bool(auto_named or branded) or 0 < byte_count < 30 * 1024
+
+
+def _has_visible_attachments(msg) -> bool:
+    """Report whether `msg` lists at least one attachment that is not a likely signature/logo image."""
+    for candidate in _list_attachments_from_msg(msg):
+        if not _is_likely_signature_image_attachment(candidate):
+            return True
+    return False
+
+
 def _extract_attachment_to_disk(msg, index, target_dir):
    """Extract a specific attachment to disk and return the file path."""
    if not msg.is_multipart():
        return None
    idx = 0
    for part in msg.walk():
-        if part.is_multipart():
-            continue
        cd = str(part.get("Content-Disposition", ""))
        ct = part.get_content_type()
+        is_attached_email = ct == "message/rfc822" and ("attachment" in cd.lower() or part.get_filename())
+        if part.is_multipart() and not is_attached_email:
+            continue
        if ct in ("text/plain", "text/html") and "attachment" not in cd:
            continue
        if idx == index:
            filename = part.get_filename()
            if filename:
                filename = _decode_header(filename)
+                if ct == "message/rfc822" and not re.search(r"\.[A-Za-z0-9]{1,8}$", filename):
+                    filename = f"{filename}.eml"
            else:
-                ext = ct.split("/")[-1] if "/" in ct else "bin"
+                ext = "eml" if ct == "message/rfc822" else (ct.split("/")[-1] if "/" in ct else "bin")
                filename = f"attachment_{idx}.{ext}"
            # Sanitize
            safe_name = re.sub(r"[^\w\s\-.]", "_", filename).strip()
            payload = part.get_payload(decode=True)
-            if not payload:
+            if payload is None and ct == "message/rfc822":
+                try:
+                    payload = part.as_bytes()
+                except Exception:
+                    payload = b""
+            if payload is None:
                return None
            target_dir.mkdir(parents=True, exist_ok=True)
            filepath = target_dir / safe_name
@@ -44,6 +44,17 @@ from routes.email_helpers import (

 logger = logging.getLogger(__name__)

+# Recovers a `[{"action": ...}, ...]` JSON array from raw LLM output when the
+# fenced-block strip leaves nothing usable. Runs on model output influenced by
+# untrusted email bodies, so it must not backtrack catastrophically: the object
+# content class is `[^{}]` (brace-delimited, greedy) rather than the old
+# `[^[\]]*?` lazy runs, which exploded exponentially on inputs like
+# `[{"action"},{` + `}},{{` * N (CodeQL py/redos #198).
+_CAL_ACTION_ARRAY_RE = re.compile(
+    r'\[\s*\{[^{}]*"action"[^{}]*\}\s*(?:,\s*\{[^{}]*\}\s*)*\]',
+    re.DOTALL,
+)
+

 def _owner_for_email_account(account_id: str | None) -> str:
    if not account_id:
@@ -558,7 +569,7 @@ async def _auto_summarize_pass_single(days_back: int = 1, account_id: str | None
                        cal_extract = _strip_think(_raw_original)
                        cal_extract = re.sub(r"^```(?:json)?\s*|\s*```$", "", cal_extract, flags=re.MULTILINE).strip()
                        if not cal_extract and _raw_original:
-                            matches = list(re.finditer(r'\[\s*\{[^[\]]*?"action"[^[\]]*?\}\s*(?:,\s*\{[^[\]]*?\}\s*)*\]', _raw_original, re.DOTALL))
+                            matches = list(_CAL_ACTION_ARRAY_RE.finditer(_raw_original))
                            if matches:
                                cal_extract = matches[-1].group()
                        logger.info(f"[cal-extract] uid={uid.decode() if isinstance(uid, bytes) else uid} folder={_folder} subj={subject[:50]!r} raw_len={len(cal_extract)} orig_len={len(_raw_original)} raw={cal_extract[:800]!r}")
@@ -683,20 +694,23 @@ async def _auto_summarize_pass_single(days_back: int = 1, account_id: str | None
                                logger.warning(f"[cal-extract] JSON parse failed: {je} on raw={cal_extract[:200]!r}")
                    except Exception as e:
                        logger.warning(f"[cal-extract] Meeting extraction LLM call failed for uid={uid}: {e}")
-                    # Record we processed this email so we don't re-LLM next run
-                    try:
-                        _cc = _sql3.connect(SCHEDULED_DB)
-                        _cc.execute(
-                            "INSERT OR REPLACE INTO email_calendar_extractions "
-                            "(message_id, owner, uid, events_created, created_at) VALUES (?, ?, ?, ?, ?)",
-                            (message_id, account_owner or "", uid.decode() if isinstance(uid, bytes) else str(uid),
-                             _cal_run_count, datetime.utcnow().isoformat())
-                        )
-                        _cc.commit()
-                        _cc.close()
-                        _cal_existing.add(message_id)
-                    except Exception as ce:
-                        logger.debug(f"Could not cache calendar extraction: {ce}")
+                    else:
+                        # Record we processed this email so we don't re-LLM next run.
+                        # Only mark as processed on success — transient LLM failures
+                        # are retried on the next poll run (matches summary/reply pattern).
+                        try:
+                            _cc = _sql3.connect(SCHEDULED_DB)
+                            _cc.execute(
+                                "INSERT OR REPLACE INTO email_calendar_extractions "
+                                "(message_id, owner, uid, events_created, created_at) VALUES (?, ?, ?, ?, ?)",
+                                (message_id, account_owner or "", uid.decode() if isinstance(uid, bytes) else str(uid),
+                                 _cal_run_count, datetime.utcnow().isoformat())
+                            )
+                            _cc.commit()
+                            _cc.close()
+                            _cal_existing.add(message_id)
+                        except Exception as ce:
+                            logger.debug(f"Could not cache calendar extraction: {ce}")

                if need_urgent:
                    try:
@@ -47,7 +47,7 @@ from routes.email_helpers import (
    _IMAP_TIMEOUT_SECONDS, _open_imap_connection,
    make_oauth_state, verify_oauth_state,
    _imap_connect, _imap, _decode_header, _detect_sent_folder, _detect_drafts_folder,
-    _extract_attachment_text, _list_attachments_from_msg,
+    _extract_attachment_text, _list_attachments_from_msg, _has_visible_attachments, _is_likely_signature_image_attachment,
    _extract_attachment_to_disk, _extract_html, _extract_text,
    _fetch_sender_thread_context, _pre_retrieve_context,
    _EMAIL_REPLY_SYS_PROMPT_BASE, _POOL_HOOKS,
@@ -61,6 +61,7 @@ from routes.email_pollers import _start_poller
 logger = logging.getLogger(__name__)

 ODYSSEUS_MAIL_ORIGIN = "odysseus-ui"
+EMAIL_READ_ATTACHMENT_VERSION = 2


 def _email_tag_owner_aliases(account_id: str | None, owner: str = "") -> list[str]:
@@ -79,15 +80,16 @@ def _email_tag_owner_aliases(account_id: str | None, owner: str = "") -> list[st
                        cfg.get("smtp_user") or "",
                        cfg.get("from_address") or "",
                    ])
-                except Exception:
+                except Exception as _e:
+                    logger.warning("Failed to resolve email account alias", exc_info=_e)
                    resolved_account_id = None
            row = db.get(_EA, resolved_account_id) if resolved_account_id else None
            if row:
                aliases.extend([row.owner or "", row.imap_user or "", row.from_address or ""])
        finally:
            db.close()
-    except Exception:
-        pass
+    except Exception as _e:
+        logger.warning("Failed to load email aliases", exc_info=_e)
    out = []
    for a in aliases:
        a = (a or "").strip()
@@ -247,6 +249,21 @@ def _imap_uid_fetch(conn, uid_set: str | bytes, query: str):
    return conn.uid("FETCH", _uid_bytes(uid_set), query)


+def _imap_search_quote(value: str) -> str:  # IMAP quoted string: CR/LF dropped (illegal inside quotes, would end the command), then \ and " escaped
+    return '"' + str(value or "").replace("\r", "").replace("\n", "").replace("\\", "\\\\").replace('"', '\\"') + '"'
+
+
+def _message_id_chain(*values: str) -> list[str]:
+    """Collect `<...>` message-id tokens from the given header strings, de-duplicated in first-seen order."""
+    ordered: dict[str, None] = {}
+    for header_value in values:
+        # Falsy inputs (None, "") contribute no ids.
+        if not header_value:
+            continue
+        ordered.update(dict.fromkeys(re.findall(r"<[^>]+>", header_value)))
+    return list(ordered)
+
+
 def _uid_from_fetch_meta(meta_b: bytes) -> str:
    m = re.search(rb"\bUID\s+(\d+)\b", meta_b)
    return m.group(1).decode() if m else ""
@@ -365,6 +382,21 @@ def _apply_odysseus_headers(msg, kind: str | None = None, ref_id: str | None = N
        msg["X-Odysseus-Ref"] = re.sub(r"[^A-Za-z0-9_.:-]", "-", ref_id)[:128]


+def _normalize_addr_field(field: str) -> str:
+    """Tidy a To/Cc/Bcc string before it reaches the MIME header or the
+    SMTP envelope: trim whitespace around each comma-separated token and
+    drop empty tokens, so stray leading/trailing/doubled commas vanish.
+
+    Example: `felix@pewdiepie.com,` -> `felix@pewdiepie.com`. A falsy
+    input (None or "") is returned unchanged.
+    """
+    if not field:
+        return field
+    # filter(None, ...) discards tokens that are empty after stripping.
+    tokens = filter(None, map(str.strip, field.split(",")))
+    return ", ".join(tokens)
+
+
 def _envelope_recipients(*fields: str) -> list:
    """Extract bare SMTP envelope addresses from one or more To/Cc/Bcc header
    strings. A naive `field.split(",")` corrupts display names that contain a
@@ -993,6 +1025,65 @@ def setup_email_routes():
                except Exception:
                    pass

+    def _related_thread_attachments_sync(
+        folder: str,
+        account_id: str | None,
+        owner: str,
+        current_uid: str,
+        current_message_id: str,
+        in_reply_to: str,
+        references: str,
+        limit: int = 12,
+    ) -> list[dict]:
+        """Return visible attachments from referenced messages in this folder.
+
+        The message-ids in ``references`` and ``in_reply_to`` (minus the
+        current message's own id) are looked up with a UID SEARCH on the
+        Message-ID header in ``folder``, which is selected read-only. Matching
+        messages other than ``current_uid`` are fetched with BODY.PEEK[] and
+        their attachments, except likely signature images, are copied and
+        tagged with ``source_*`` keys describing the message they came from.
+
+        At most ``limit`` entries are returned. Any exception is logged at
+        debug level and whatever was collected so far is returned.
+        """
+        wanted_ids = _message_id_chain(references, in_reply_to)
+        current_mid = (current_message_id or "").strip()
+        wanted_ids = [mid for mid in wanted_ids if mid and mid != current_mid]
+        if not wanted_ids:
+            return []
+
+        related: list[dict] = []
+        try:
+            with _imap(account_id, owner=owner) as conn:
+                conn.select(_q(folder), readonly=True)
+                # Search newest referenced messages first; cap work so opening
+                # a long thread stays bounded.
+                for mid in reversed(wanted_ids[-10:]):
+                    if len(related) >= limit:
+                        break
+                    status, data = _imap_uid_search(conn, f'(HEADER Message-ID {_imap_search_quote(mid)})')
+                    if status != "OK" or not data or not data[0]:
+                        continue
+                    # Only the last three UIDs of the search result are considered.
+                    for uid_b in reversed(data[0].split()[-3:]):
+                        # Stop before fetching another full body once the cap
+                        # is reached; without this the result could overshoot
+                        # ``limit`` and download messages for nothing.
+                        if len(related) >= limit:
+                            break
+                        source_uid = uid_b.decode(errors="ignore")
+                        if not source_uid or source_uid == str(current_uid):
+                            continue
+                        st2, msg_data = _imap_uid_fetch(conn, source_uid, "(BODY.PEEK[])")
+                        if st2 != "OK" or not msg_data or not isinstance(msg_data[0], tuple):
+                            continue
+                        msg = email_mod.message_from_bytes(msg_data[0][1])
+                        source_from = _decode_header(msg.get("From", ""))
+                        source_subject = _decode_header(msg.get("Subject", ""))
+                        source_date = msg.get("Date", "")
+                        for att in _list_attachments_from_msg(msg):
+                            if _is_likely_signature_image_attachment(att):
+                                continue
+                            # Copy so the helper's dict is not mutated.
+                            enriched = dict(att)
+                            enriched.update({
+                                "source_uid": source_uid,
+                                "source_folder": folder,
+                                "source_message_id": (msg.get("Message-ID") or "").strip(),
+                                "source_from": source_from,
+                                "source_subject": source_subject,
+                                "source_date": source_date,
+                            })
+                            related.append(enriched)
+                            if len(related) >= limit:
+                                break
+        except Exception as e:
+            logger.debug(f"related thread attachment lookup failed uid={current_uid}: {e}")
+        return related
+
    @router.get("/list")
    async def list_emails(
        folder: str = Query("INBOX"),
@@ -1263,6 +1354,17 @@ def setup_email_routes():
            sender_name, sender_addr = email.utils.parseaddr(sender)
            parsed_date = email.utils.parsedate_to_datetime(date_str) if date_str else None
            attachments = _list_attachments_from_msg(msg)
+            related_attachments = []
+            if not _has_visible_attachments(msg):
+                related_attachments = _related_thread_attachments_sync(
+                    folder,
+                    account_id,
+                    owner,
+                    uid,
+                    message_id,
+                    in_reply_to,
+                    references,
+                )

            if mark_seen:
                # Set \Seen in a separate readwrite session so concurrent reads
@@ -1371,6 +1473,8 @@ def setup_email_routes():
                "body": body,
                "body_html": body_html,
                "attachments": attachments,
+                "related_attachments": related_attachments,
+                "attachment_version": EMAIL_READ_ATTACHMENT_VERSION,
                "cached_summary": cached_summary,
                "cached_ai_reply": cached_ai_reply,
                "boundaries": cached_boundaries,
@@ -1401,6 +1505,12 @@ def setup_email_routes():
        """Read email body. Cached for 30m, sync IMAP work runs in a thread."""
        ck = _read_cache_key(account_id, folder, uid, owner=owner)
        cached = _read_cache_get(ck)
+        if cached is not None:
+            # Older cached read responses lack the thread-attachment fallback.
+            # Fetch once so replies that reference prior attachments can show
+            # those files without waiting for cache expiry.
+            if cached.get("attachment_version") != EMAIL_READ_ATTACHMENT_VERSION:
+                cached = None
        if cached is not None:
            if mark_seen:
                try:
@@ -1535,6 +1645,12 @@ def setup_email_routes():
                return {"error": f"Attachment index {index} not found"}

            from pathlib import Path as _Path
+            target_root = os.path.abspath(str(target_dir))
+            filepath_str = os.path.abspath(str(filepath))
+            if os.path.commonpath([target_root, filepath_str]) != target_root:
+                logger.warning("Rejected attachment path outside extraction dir: %s", filepath)
+                return {"error": "Invalid attachment path"}
+            filepath = _Path(filepath_str)
            base = _Path(filepath).name
            if base.startswith("."):
                return {"error": "Invalid filename", "filename": base}
@@ -1589,6 +1705,65 @@ def setup_email_routes():
                    return None
            doc_session_id = _resolve_doc_session()

+            def _create_markdown_doc(content: str, summary: str):
+                """Store *content* as a new active markdown document; return its id.
+
+                Creates the Document row (titled with the enclosing ``title``,
+                attached to ``doc_session_id``) and its version-1
+                DocumentVersion carrying *summary*, commits both, then tags
+                the document with its email source.
+                """
+                from src.database import SessionLocal as _SL, Document as _Doc, DocumentVersion as _DV
+                doc_id = str(uuid.uuid4())
+                ver_id = str(uuid.uuid4())
+                session = _SL()
+                try:
+                    # Clears is_active on every Document that currently has it
+                    # set; the filter is not scoped to a session or owner.
+                    active_docs = session.query(_Doc).filter(_Doc.is_active == True)
+                    active_docs.update({"is_active": False})
+                    new_doc = _Doc(
+                        id=doc_id,
+                        session_id=doc_session_id,
+                        title=title,
+                        language="markdown",
+                        current_content=content,
+                        version_count=1,
+                        is_active=True,
+                    )
+                    session.add(new_doc)
+                    first_version = _DV(
+                        id=ver_id,
+                        document_id=doc_id,
+                        version_number=1,
+                        content=content,
+                        summary=summary,
+                        source="upload",
+                    )
+                    session.add(first_version)
+                    session.commit()
+                finally:
+                    session.close()
+                _tag_doc_with_source(doc_id)
+                return doc_id
+
+            def _attached_email_markdown(raw_bytes: bytes):
+                """Render an attached email's raw bytes as a markdown summary.
+
+                The result has a title line, any of From/To/Cc/Date that are
+                present, the extracted body text, and a bullet list of the
+                attached email's own attachments. Empty input or a parse
+                failure yields a short placeholder document instead.
+                """
+                if not raw_bytes:
+                    return f"# Attached email: {base}\n\n_(empty email attachment)_"
+                try:
+                    parsed = email_mod.message_from_bytes(raw_bytes)
+                except Exception:
+                    logger.exception("Failed to parse attached email %s", base)
+                    return f"# Attached email: {base}\n\nCould not parse this email attachment."
+
+                # Fall back to the attachment's file name when there is no subject.
+                subject = _decode_header(parsed.get("Subject", "")) or base
+                sender = _decode_header(parsed.get("From", ""))
+                recipients = _decode_header(parsed.get("To", ""))
+                copied = _decode_header(parsed.get("Cc", ""))
+                sent_on = parsed.get("Date", "")
+                body_text = _extract_text(parsed).strip()
+                nested = _list_attachments_from_msg(parsed)
+
+                header_rows = (
+                    (sender, f"**From:** {sender}"),
+                    (recipients, f"**To:** {recipients}"),
+                    (copied, f"**Cc:** {copied}"),
+                    (sent_on, f"**Date:** {sent_on}"),
+                )
+                out = [f"# Attached email: {subject}", ""]
+                # Only headers that actually have a value are shown.
+                out.extend(text for value, text in header_rows if value)
+                out += ["", "## Body", "", body_text or "_(no readable body)_"]
+                if nested:
+                    out += ["", "## Attachments", ""]
+                    for att in nested:
+                        size = int(att.get("size") or 0)
+                        if size < 1024:
+                            size_label = f"{size} B"
+                        else:
+                            size_label = f"{round(size / 1024)} KB"
+                        name = att.get("filename") or f"attachment_{att.get('index', '')}"
+                        ctype = att.get("content_type") or "application/octet-stream"
+                        out.append(f"- {name} ({ctype}, {size_label})")
+                return "\n".join(out).strip()
+
            # ── PDF path (existing) ────────────────────────────────────
            if ext == ".pdf":
                import shutil as _shutil
@@ -1635,6 +1810,39 @@ def setup_email_routes():
                _tag_doc_with_source(doc_id)
                return {"doc_id": doc_id, "filename": filepath.name}

+            # ── Attached email (.eml / message/rfc822) ────────────────
+            if ext == ".eml":
+                def _attachment_bytes_from_msg():
+                    """Return the raw bytes of attachment number ``index`` in ``msg``.
+
+                    Walks ``msg`` counting parts that are neither multipart
+                    containers nor inline text/plain or text/html bodies, and
+                    returns the payload of the part whose count equals
+                    ``index``. Returns ``b""`` when ``msg`` is not multipart,
+                    the index is not reached, or the payload is unavailable.
+                    NOTE(review): the counting presumably has to mirror
+                    ``_list_attachments_from_msg`` — confirm before changing it.
+                    """
+                    if not msg.is_multipart():
+                        return b""
+                    idx = 0
+                    for part in msg.walk():
+                        cd = str(part.get("Content-Disposition", ""))
+                        ct = part.get_content_type()
+                        is_attached_email = ct == "message/rfc822" and ("attachment" in cd.lower() or part.get_filename())
+                        if part.is_multipart() and not is_attached_email:
+                            continue
+                        if ct in ("text/plain", "text/html") and "attachment" not in cd:
+                            continue
+                        if idx == index:
+                            payload = part.get_payload(decode=True)
+                            if payload is None and ct == "message/rfc822":
+                                # A parsed message/rfc822 part holds the embedded
+                                # email as a one-item list. Serialise that inner
+                                # message: part.as_bytes() would emit the wrapper
+                                # part's MIME headers first, so re-parsing it
+                                # would lose the email's Subject/From/To.
+                                try:
+                                    inner = part.get_payload()
+                                    if isinstance(inner, list) and inner:
+                                        payload = inner[0].as_bytes()
+                                    else:
+                                        payload = part.as_bytes()
+                                except Exception:
+                                    payload = b""
+                            return payload or b""
+                        idx += 1
+                    return b""
+
+                try:
+                    content = _attached_email_markdown(_attachment_bytes_from_msg())
+                except Exception:
+                    logger.exception("Failed to read email attachment %s", base)
+                    return {"error": "Failed to read email attachment", "filename": base}
+                doc_id = _create_markdown_doc(content, "Imported attached email")
+                return {"doc_id": doc_id, "filename": filepath.name}
+
            # ── DOCX path: extract text → markdown document ───────────
            if ext == ".docx":
                try:
@@ -1672,25 +1880,7 @@ def setup_email_routes():
                    lines.append("")
                content = "\n".join(lines).strip() or f"_(empty {base})_"

-                from src.database import SessionLocal as _SL, Document as _Doc, DocumentVersion as _DV
-                doc_id = str(uuid.uuid4())
-                ver_id = str(uuid.uuid4())
-                _db = _SL()
-                try:
-                    _db.query(_Doc).filter(_Doc.is_active == True).update({"is_active": False})
-                    _db.add(_Doc(
-                        id=doc_id, session_id=doc_session_id, title=title,
-                        language="markdown", current_content=content,
-                        version_count=1, is_active=True,
-                    ))
-                    _db.add(_DV(
-                        id=ver_id, document_id=doc_id, version_number=1,
-                        content=content, summary="Imported from DOCX", source="upload",
-                    ))
-                    _db.commit()
-                finally:
-                    _db.close()
-                _tag_doc_with_source(doc_id)
+                doc_id = _create_markdown_doc(content, "Imported from DOCX")
                return {"doc_id": doc_id, "filename": filepath.name}

            # ── Plain text / markdown ────────────────────────────────
@@ -1699,25 +1889,7 @@ def setup_email_routes():
                    content = filepath.read_text(encoding="utf-8", errors="replace")
                except Exception as e:
                    return {"error": f"Failed to read text file: {e}", "filename": base}
-                from src.database import SessionLocal as _SL, Document as _Doc, DocumentVersion as _DV
-                doc_id = str(uuid.uuid4())
-                ver_id = str(uuid.uuid4())
-                _db = _SL()
-                try:
-                    _db.query(_Doc).filter(_Doc.is_active == True).update({"is_active": False})
-                    _db.add(_Doc(
-                        id=doc_id, session_id=doc_session_id, title=title,
-                        language="markdown", current_content=content,
-                        version_count=1, is_active=True,
-                    ))
-                    _db.add(_DV(
-                        id=ver_id, document_id=doc_id, version_number=1,
-                        content=content, summary="Imported from email attachment", source="upload",
-                    ))
-                    _db.commit()
-                finally:
-                    _db.close()
-                _tag_doc_with_source(doc_id)
+                doc_id = _create_markdown_doc(content, "Imported from email attachment")
                return {"doc_id": doc_id, "filename": filepath.name}

            return {"error": f"Unsupported attachment type: {ext}", "filename": base}
@@ -2026,6 +2198,9 @@ def setup_email_routes():
            outer = MIMEMultipart("alternative")
            body_container = outer

+        to = _normalize_addr_field(to or "")
+        cc = _normalize_addr_field(cc or "")
+        bcc = _normalize_addr_field(bcc or "")
        outer["From"] = email.utils.formataddr((cfg.get("display_name") or "", cfg["from_address"]))
        outer["To"] = to
        if cc:
@@ -2170,12 +2345,10 @@ def setup_email_routes():
        try:
            conn = sqlite3.connect(SCHEDULED_DB)
            conn.row_factory = sqlite3.Row
-            # The MCP server can't easily set owner, so it stores '' — fall
-            # back to those rows in addition to the caller's owner.
            rows = conn.execute(
                """SELECT id, to_addr, subject, body, created_at, account_id
                   FROM scheduled_emails
-                   WHERE status = 'agent_draft' AND (owner = ? OR owner = '')
+                   WHERE status = 'agent_draft' AND owner = ?
                   ORDER BY created_at DESC""",
                (owner or "",),
            ).fetchall()
@@ -2196,7 +2369,7 @@ def setup_email_routes():
            cur = conn.execute(
                """UPDATE scheduled_emails
                   SET status = 'pending', send_at = ?
-                   WHERE id = ? AND status = 'agent_draft' AND (owner = ? OR owner = '')""",
+                   WHERE id = ? AND status = 'agent_draft' AND owner = ?""",
                (datetime.utcnow().isoformat(), sid, owner or ""),
            )
            conn.commit()
@@ -2217,7 +2390,7 @@ def setup_email_routes():
            conn = sqlite3.connect(SCHEDULED_DB)
            cur = conn.execute(
                """UPDATE scheduled_emails SET status = 'cancelled'
-                   WHERE id = ? AND status = 'agent_draft' AND (owner = ? OR owner = '')""",
+                   WHERE id = ? AND status = 'agent_draft' AND owner = ?""",
                (sid, owner or ""),
            )
            conn.commit()
@@ -2303,6 +2476,9 @@ def setup_email_routes():
            outer = MIMEMultipart("alternative")
            body_container = outer

+        req.to = _normalize_addr_field(req.to or "")
+        req.cc = _normalize_addr_field(req.cc or "")
+        req.bcc = _normalize_addr_field(req.bcc or "")
        outer["From"] = email.utils.formataddr((cfg.get("display_name") or "", cfg["from_address"]))
        outer["To"] = req.to
        if req.cc:
@@ -9,6 +9,7 @@ from pathlib import Path
 from fastapi import APIRouter, HTTPException, Form, Depends
 from core.constants import EMBEDDING_ENDPOINT_FILE, FASTEMBED_CACHE_DIR
 from core.middleware import require_admin
+from src.runtime_paths import get_app_root

 logger = logging.getLogger(__name__)

@@ -67,14 +67,6 @@ def _gallery_image_path(filename: str) -> Path:
        raise HTTPException(400, "Unsafe gallery filename")
    if safe_name != original:
        raise HTTPException(400, "Unsafe gallery filename")
-    if not path.exists():
-        cwd_root = (Path.cwd() / "data" / "generated_images").resolve()
-        cwd_path = (cwd_root / safe_name).resolve()
-        try:
-            if os.path.commonpath([str(cwd_root), str(cwd_path)]) == str(cwd_root) and cwd_path.exists():
-                return cwd_path
-        except Exception:
-            pass
    return path


@@ -1,8 +1,13 @@
+import json
+import os
 import re
+import shlex
+import subprocess
 from copy import deepcopy

 from fastapi import APIRouter, HTTPException

+from core.platform_compat import run_ssh_command
 from routes._validators import validate_remote_host, validate_ssh_port


@@ -107,6 +112,73 @@ def _apply_manual_hardware(system, manual_mode="", manual_gpu_count="", manual_v
    return system


+def _run_model_probe(host: str, ssh_port: str, cmd: str) -> str:
+    """Run shell command *cmd* and return its stripped stdout.
+
+    With a non-empty *host* the command goes through ``run_ssh_command``
+    (using *ssh_port* when given); otherwise it runs locally under
+    ``bash -lc``. Both paths use a 15 second timeout. A non-zero exit status
+    or any exception yields an empty string.
+    """
+    output = ""
+    try:
+        if not host:
+            proc = subprocess.run(["bash", "-lc", cmd], capture_output=True, text=True, timeout=15)
+        else:
+            proc = run_ssh_command(
+                host,
+                ssh_port or None,
+                cmd,
+                timeout=15,
+                connect_timeout=5,
+                strict_host_key_checking=False,
+                text=True,
+            )
+        if proc.returncode == 0:
+            output = (proc.stdout or "").strip()
+    except Exception:
+        # Best-effort probe: callers treat "" as "no answer".
+        output = ""
+    return output
+
+
+def _shell_quote_model_path(path: str) -> str:
+    """Quote *path* for the probe shell while keeping a leading ``~`` usable.
+
+    ``shlex.quote`` single-quotes a tilde, which stops the shell from
+    expanding it, so ``~`` and ``~/...`` are rewritten to use ``"$HOME"``.
+    The variable is expanded by whichever shell runs the probe. Other forms
+    such as ``~user/...`` are quoted literally and will not expand.
+    """
+    if path == "~":
+        return '"$HOME"'
+    if path.startswith("~/"):
+        return '"$HOME"' + shlex.quote(path[1:])
+    return shlex.quote(path)
+
+
+def _inspect_model_path(model_path: str, host: str = "", ssh_port: str = "") -> dict:
+    """Read lightweight metadata from a local or SSH-visible HF model folder.
+
+    Only absolute (``/...``) or home-relative (``~...``) paths are probed;
+    empty values, URLs and anything else return ``{}`` without running a
+    command.
+
+    The returned dict may contain:
+      - ``model_ctx_max``: first positive context-length value in config.json
+      - ``model_weights_gb``: total size of *.safetensors / *.bin / *.gguf
+        files under the folder, in GiB rounded to three decimals
+      - ``model_probe_error``: message for the first problem found (folder
+        not visible, config.json missing or unreadable, or no weight files)
+    """
+    path = (model_path or "").strip()
+    if not path or path.startswith(("http://", "https://")):
+        return {}
+    if not (path.startswith("/") or path.startswith("~")):
+        return {}
+
+    qpath = _shell_quote_model_path(path)
+    qconfig = _shell_quote_model_path(os.path.join(path, "config.json"))
+    out = {}
+    exists = _run_model_probe(host, ssh_port, f"test -d {qpath} && printf found || printf missing")
+    if exists != "found":
+        target = host or "local container"
+        out["model_probe_error"] = f"Model path is not visible on {target}: {path}"
+        return out
+    # Only the first 240 lines of config.json are read.
+    raw_config = _run_model_probe(host, ssh_port, f"test -f {qconfig} && sed -n '1,240p' {qconfig}")
+    if raw_config:
+        try:
+            cfg = json.loads(raw_config)
+        except Exception:
+            cfg = {}
+        if not isinstance(cfg, dict):
+            # Valid JSON that is not an object (list, string, ...) has no keys.
+            cfg = {}
+        for key in ("context_length", "max_position_embeddings", "n_ctx_train", "model_max_length", "max_seq_len"):
+            value = cfg.get(key)
+            if isinstance(value, (int, float)) and value > 0:
+                out["model_ctx_max"] = int(value)
+                break
+    else:
+        out["model_probe_error"] = f"config.json not found in model path: {path}"
+
+    # Sum weight-file sizes in bytes and convert to GiB (2**30 bytes).
+    size_cmd = (
+        f"find {qpath} -type f \\( -name '*.safetensors' -o -name '*.bin' -o -name '*.gguf' \\) "
+        "-printf '%s\\n' 2>/dev/null | awk '{s+=$1} END {if (s>0) printf \"%.6f\", s/1073741824}'"
+    )
+    weights = _run_model_probe(host, ssh_port, size_cmd)
+    try:
+        weights_gb = float(weights)
+    except Exception:
+        weights_gb = 0.0
+    if weights_gb > 0:
+        out["model_weights_gb"] = round(weights_gb, 3)
+    elif "model_probe_error" not in out:
+        # Keep an earlier error (e.g. missing config.json) rather than overwrite it.
+        out["model_probe_error"] = f"No model weight files found in: {path}"
+    return out
+
+
 def setup_hwfit_routes():
    router = APIRouter(prefix="/api/hwfit", tags=["hwfit"])

@@ -235,7 +307,7 @@ def setup_hwfit_routes():
        return {"system": system, "models": results}

    @router.get("/profiles")
-    def get_serve_profiles(model: str = "", host: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False, serve_weights_gb: float = 0.0, serve_quant: str = ""):
+    def get_serve_profiles(model: str = "", model_path: str = "", host: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False, serve_weights_gb: float = 0.0, serve_quant: str = ""):
        """Compute llama.cpp serve profiles (Quality/Balanced/Speed) for `model`
        against the detected hardware on `host` (or local). Returns concrete
        flags (n_gpu_layers, n_cpu_moe, cache_type, ctx) the serve UI can apply.
@@ -260,8 +332,23 @@ def setup_hwfit_routes():
            # "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct".
            s = (s or "").lower().strip()
            s = s.split("/")[-1]                     # drop org prefix
-            s = re.sub(r"[-_.]?gguf$", "", s)        # drop trailing gguf marker
-            s = re.sub(r"[-_.](q\d[^/]*|iq\d[^/]*|fp8|bf16|f16|awq[^/]*|gptq[^/]*)$", "", s)
+            for suffix in ("-gguf", "_gguf", ".gguf", "gguf"):
+                if s.endswith(suffix):
+                    s = s[: -len(suffix)]
+                    break
+            cut_at = None
+            for idx, ch in enumerate(s):
+                if ch not in "-_." or idx + 1 >= len(s):
+                    continue
+                suffix = s[idx + 1:]
+                if (
+                    suffix in {"fp8", "bf16", "f16"}
+                    or suffix.startswith(("awq", "gptq", "iq"))
+                    or (suffix.startswith("q") and len(suffix) > 1 and suffix[1].isdigit())
+                ):
+                    cut_at = idx
+            if cut_at is not None:
+                s = s[:cut_at]
            return s

        m = catalog.get(model)
@@ -272,8 +359,16 @@ def setup_hwfit_routes():
                if nn and (nn == want or want.endswith(nn) or nn.endswith(want)):
                    m = entry
                    break
+        path_meta = _inspect_model_path(model_path or model, host=host, ssh_port=ssh_port)
        if m is None:
-            return {"system": system, "profiles": [], "error": "model not in catalog"}
+            return {
+                "system": system,
+                "profiles": [],
+                "error": "model not in catalog",
+                "model_ctx_max": int(path_meta.get("model_ctx_max") or 0),
+                "model_weights_gb": float(path_meta.get("model_weights_gb") or 0),
+                "model_probe_error": path_meta.get("model_probe_error") or "",
+            }
        # Surface the model's trained context limit so the serve UI can clamp a
        # user-typed context down to it (asking for ctx > n_ctx_train overflows
        # and, with a quantized KV cache, can crash the GPU).
@@ -283,6 +378,16 @@ def setup_hwfit_routes():
            if isinstance(v, (int, float)) and v > 0:
                model_ctx_max = int(v)
                break
+        path_ctx_max = int(path_meta.get("model_ctx_max") or 0)
+        if path_ctx_max > 0:
+            model_ctx_max = max(model_ctx_max, path_ctx_max)
+        model_weights_gb = float(path_meta.get("model_weights_gb") or 0)
+        if model_weights_gb <= 0:
+            for k in ("min_vram_gb", "required_gb", "size_gb", "recommended_ram_gb", "min_ram_gb"):
+                v = m.get(k)
+                if isinstance(v, (int, float)) and v > 0:
+                    model_weights_gb = float(v)
+                    break
        return {
            "system": system,
            "profiles": compute_serve_profiles(
@@ -291,6 +396,8 @@ def setup_hwfit_routes():
                serve_quant=(serve_quant or None),
            ),
            "model_ctx_max": model_ctx_max,
+            "model_weights_gb": model_weights_gb,
+            "model_probe_error": path_meta.get("model_probe_error") or "",
        }

    @router.get("/image-models")
@@ -273,65 +273,30 @@ def setup_memory_routes(memory_manager: MemoryManager, session_manager: SessionM
    async def api_audit_memories(request: Request, session: str = Form(None)):
        """Deduplicate and consolidate memories via LLM.

-        Uses the default model from settings, or falls back to a session's model.
+        Uses task/utility/default settings through the shared resolver, with
+        the active session as fallback when no task or utility model is set.
        Returns before and after memory counts.
        """
-        from routes.model_routes import _load_settings, _normalize_base, build_chat_url
-        from core.database import ModelEndpoint
-        import json as _json
-
-        endpoint_url = model = None
-        headers = {}
-
-        # Try utility model from settings first — memory audit is a background
-        # task and should prefer the lighter utility model over the main chat model.
-        from src.task_endpoint import resolve_task_endpoint
        user = _owner(request)
-        t_url, t_model, t_headers = resolve_task_endpoint(owner=user)
-        if t_url and t_model:
-            endpoint_url, model, headers = t_url, t_model, t_headers
-        else:
-            # Fall back to default model if no task/utility model configured
-            settings = _load_settings()
-            ep_id = settings.get("default_endpoint_id", "")
-            default_model = settings.get("default_model", "")
-            if ep_id:
-                db = SessionLocal()
-                try:
-                    ep = db.query(ModelEndpoint).filter(
-                        ModelEndpoint.id == ep_id, ModelEndpoint.is_enabled == True
-                    ).first()
-                    if ep:
-                        base = _normalize_base(ep.base_url)
-                        endpoint_url = build_chat_url(base)
-                        model = default_model
-                        if not model and ep.models:
-                            try:
-                                models = _json.loads(ep.models) if isinstance(ep.models, str) else ep.models
-                                if models:
-                                    model = models[0]
-                            except Exception:
-                                pass
-                        if ep.api_key:
-                            headers = {"Authorization": f"Bearer {ep.api_key}"}
-                finally:
-                    db.close()
+        fallback_url = fallback_model = None
+        fallback_headers = None
+        if session:
+            try:
+                sess = session_manager.get_session(session)
+                _assert_session_owner(sess, user)
+                fallback_url = sess.endpoint_url
+                fallback_model = sess.model
+                fallback_headers = sess.headers
+            except KeyError:
+                pass

-            # Fall back to session model if no default configured
-            if not endpoint_url and session:
-                try:
-                    sess = session_manager.get_session(session)
-                    _assert_session_owner(sess, _owner(request))
-                    endpoint_url = sess.endpoint_url
-                    model = sess.model
-                    headers = sess.headers
-                except KeyError:
-                    pass
+        endpoint_url, model, headers = resolve_task_endpoint(
+            fallback_url, fallback_model, fallback_headers, owner=user
+        )

        if not endpoint_url or not model:
            raise HTTPException(400, "No default model configured — set one in Settings")

-        user = _owner(request)
        result = await audit_memories(
            memory_manager,
            memory_vector,
@@ -369,18 +334,28 @@ def setup_memory_routes(memory_manager: MemoryManager, session_manager: SessionM
        model = None
        headers = {}

+        user = _owner(request)
+
        if session:
            try:
                sess = session_manager.get_session(session)
-                _assert_session_owner(sess, _owner(request))
-                endpoint_url, model, headers = resolve_task_endpoint(
-                    sess.endpoint_url, sess.model, sess.headers, owner=_owner(request)
-                )
+                _assert_session_owner(sess, user)
            except KeyError:
-                logger.warning("Session %s not found, falling back to utility endpoint", session)
-                endpoint_url, model, headers = resolve_endpoint("utility", owner=_owner(request))
+                sess = None
+            except HTTPException as exc:
+                if exc.status_code != 404:
+                    raise
+                sess = None
+
+            if sess is None:
+                logger.warning("Session %s not found or inaccessible, falling back to utility endpoint", session)
+                endpoint_url, model, headers = resolve_endpoint("utility", owner=user)
+            else:
+                endpoint_url, model, headers = resolve_task_endpoint(
+                    sess.endpoint_url, sess.model, sess.headers, owner=user
+                )
        else:
-            endpoint_url, model, headers = resolve_task_endpoint(owner=_owner(request))
+            endpoint_url, model, headers = resolve_task_endpoint(owner=user)
    
        if not endpoint_url or not model:
            raise HTTPException(400, "No LLM model configured. Set a default model in Settings.")
@@ -5,6 +5,7 @@ import re
 import uuid
 import json
 import hashlib
+import ipaddress
 import socket
 import time as _time
 import logging
@@ -16,6 +17,7 @@ from fastapi import APIRouter, HTTPException, Form, Query, Body, Request, Respon
 from pydantic import BaseModel
 from fastapi.responses import StreamingResponse
 from core.database import SessionLocal, ModelEndpoint, Session as DbSession
+from core.log_safety import redact_url as _redact_url_for_log
 from core.middleware import require_admin
 from src.llm_core import _detect_provider, _host_match, ANTHROPIC_MODELS
 from src.tls_overrides import llm_verify
@@ -405,8 +407,11 @@ def _endpoint_refresh_timeout(ep: Any, category: str) -> float:
    except Exception:
        val = 0
    if val > 0:
-        return float(max(1, min(30, val)))
-    return 2.5 if category == "local" else 2.0
+        return float(max(1, min(60, val)))
+    # llama.cpp and other local OpenAI-compatible servers can block briefly
+    # while warming/loading. A 2s local timeout makes working endpoints flicker
+    # offline before /v1/models is ready.
+    return 10.0 if category == "local" else 2.0


 def _manual_refresh_timeout(ep: Any, category: str, requested: Any = None) -> float:
@@ -473,7 +478,7 @@ def _explicit_model_list_timeout(base_url: str, endpoint_kind: str = "auto", req
    category = _classify_endpoint(base_url, kind)
    if kind in ("api", "proxy") or category == "api":
        return 30.0
-    return 3.0 if _is_ollama_base(base_url) else 2.0
+    return 15.0 if category == "local" else (3.0 if _is_ollama_base(base_url) else 2.0)


 def _cached_model_ids(ep: Any) -> List[str]:
@@ -518,6 +523,10 @@ _NON_CHAT_EXACT_PREFIXES = (

 def _is_chat_model(model_id: str) -> bool:
    """Return True if the model ID looks like a chat/completions-capable model."""
+    if not isinstance(model_id, str):
+        # Non-compliant upstreams can return non-string IDs (e.g. int/None);
+        # treat them as chat-capable rather than crashing on .lower().
+        return True
    mid = model_id.lower()
    for prefix in _NON_CHAT_PREFIXES:
        if mid.startswith(prefix):
@@ -562,6 +571,8 @@ def _safe_build_models_url(base_url: str) -> str:
    """Build a /models URL without letting optional provider imports break probes."""
    try:
        return build_models_url(base_url)
+    except ValueError:
+        raise
    except Exception as exc:
        logger.debug("Model URL detection failed for %s: %s", base_url, exc)
        return f"{(base_url or '').rstrip('/')}/models"
@@ -633,7 +644,7 @@ def _probe_single_model(base: str, api_key: str, model_id: str, timeout: int = 1

    try:
        t0 = _time.time()
-        r = httpx.post(target_url, headers=h, json=payload, timeout=timeout)
+        r = httpx.post(target_url, headers=h, json=payload, timeout=timeout, verify=llm_verify())
        latency = round((_time.time() - t0) * 1000)
        if r.is_success:
            return {"status": "ok", "latency_ms": latency}
@@ -659,13 +670,20 @@ def _probe_single_model(base: str, api_key: str, model_id: str, timeout: int = 1

 # Hostnames / IP prefixes that indicate a local endpoint
 _LOCAL_HOSTS = {"localhost", "127.0.0.1", "0.0.0.0", "::1"}
-_PRIVATE_PREFIXES = ("10.", "172.16.", "172.17.", "172.18.", "172.19.",
-                     "172.20.", "172.21.", "172.22.", "172.23.", "172.24.",
-                     "172.25.", "172.26.", "172.27.", "172.28.", "172.29.",
-                     "172.30.", "172.31.", "192.168.")
+_PRIVATE_NETWORKS = (
+    ipaddress.ip_network("10.0.0.0/8"),
+    ipaddress.ip_network("172.16.0.0/12"),
+    ipaddress.ip_network("192.168.0.0/16"),
+)
+_TAILSCALE_CGNAT = ipaddress.ip_network("100.64.0.0/10")


-_TAILSCALE_RE = re.compile(r"^100\.(6[4-9]|[7-9]\d|1[01]\d|12[0-7])\.")
+def _local_ip_literal(host: str) -> bool:
+    """Return True if ``host`` is an IP literal in a private or Tailscale CGNAT range.
+
+    Anything that does not parse as an IP address (for example a DNS name or
+    an empty string) yields False.
+    """
+    try:
+        addr = ipaddress.ip_address(host)
+    except ValueError:
+        return False
+    for net in (*_PRIVATE_NETWORKS, _TAILSCALE_CGNAT):
+        if addr in net:
+            return True
+    return False


 def _classify_endpoint(base_url: str, endpoint_kind: str = "auto") -> str:
@@ -679,9 +697,7 @@ def _classify_endpoint(base_url: str, endpoint_kind: str = "auto") -> str:
        return "api"
    try:
        host = urlparse(base_url).hostname or ""
-        if host in _LOCAL_HOSTS or host.startswith(_PRIVATE_PREFIXES):
-            return "local"
-        if _TAILSCALE_RE.match(host):
+        if host in _LOCAL_HOSTS or _local_ip_literal(host):
            return "local"
    except Exception:
        pass
@@ -703,6 +719,44 @@ def _effective_endpoint_kind(ep: Any, base_url: str) -> str:
    return "auto"


+def _is_loading_model_response(resp: Any) -> bool:
+    """Return True when ``resp`` is an HTTP 503 whose body mentions "loading model".
+
+    The body match is case-insensitive. A response without a ``status_code``
+    attribute is never a match, and a body that cannot be read is treated as
+    empty.
+    """
+    if getattr(resp, "status_code", None) == 503:
+        try:
+            text = resp.text or ""
+        except Exception:
+            text = ""
+        return "loading model" in text.lower()
+    return False
+
+
+
+def _openai_model_ids(data: Any) -> List[str]:
+    """Extract OpenAI-style model IDs (``{"data": [{"id": ...}]}``).
+
+    Tolerates non-compliant upstreams: a non-dict body, a ``data`` value that
+    is not a list, non-dict entries, and non-string or empty IDs are all
+    skipped instead of raising.
+
+    Returns:
+        The non-empty string IDs in upstream order (possibly empty).
+    """
+    items = data.get("data") if isinstance(data, dict) else None
+    if not isinstance(items, (list, tuple)):
+        # A truthy scalar such as {"data": 5} would otherwise raise TypeError
+        # when iterated.
+        return []
+    ids: List[str] = []
+    for entry in items:
+        if not isinstance(entry, dict):
+            continue
+        model_id = entry.get("id")
+        if isinstance(model_id, str) and model_id:
+            ids.append(model_id)
+    return ids
+
+
+def _ollama_model_names(data: Any) -> List[str]:
+    """Extract native-Ollama model names (``{"models": [{"name"|"model": ...}]}``).
+
+    Tolerates non-compliant upstreams: a non-dict body, a ``models`` value
+    that is not a list, non-dict entries, and non-string or empty values are
+    skipped instead of raising. ``name`` takes precedence over ``model``; the
+    ``model`` key is consulted only when ``name`` is missing or falsy.
+
+    Returns:
+        The non-empty string names in upstream order (possibly empty).
+    """
+    items = data.get("models") if isinstance(data, dict) else None
+    if not isinstance(items, (list, tuple)):
+        # A truthy scalar such as {"models": 5} would otherwise raise
+        # TypeError when iterated.
+        return []
+    names: List[str] = []
+    for entry in items:
+        if not isinstance(entry, dict):
+            continue
+        value = entry.get("name") or entry.get("model")
+        if isinstance(value, str) and value:
+            names.append(value)
+    return names
+

 def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> List[str]:
    """Probe a base URL's /models endpoint and return list of model IDs.
@@ -726,7 +780,7 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
            r = httpx.get(url, headers=headers, timeout=timeout, verify=llm_verify())
            r.raise_for_status()
            data = r.json()
-            models = [m.get("id") for m in (data.get("data") or []) if m.get("id")]
+            models = _openai_model_ids(data)
            if models:
                return models
        except httpx.HTTPStatusError as e:
@@ -748,10 +802,10 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
        r.raise_for_status()
        data = r.json()
        # OpenAI format: {"data": [{"id": "model-name"}]}
-        models = [m.get("id") for m in (data.get("data") or []) if m.get("id")]
+        models = _openai_model_ids(data)
        # Ollama format: {"models": [{"name": "model-name"}]}
        if not models:
-            models = [m.get("name") or m.get("model") for m in (data.get("models") or []) if m.get("name") or m.get("model")]
+            models = _ollama_model_names(data)
        if models:
            # Z.AI coding plan omits some working models from /models;
            # append curated-only entries for that endpoint only.
@@ -767,16 +821,19 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
                        models.append(_e)
            return [m for m in models if _is_chat_model(m)]
    except httpx.HTTPStatusError as e:
+        if e.response is not None and _is_loading_model_response(e.response):
+            logger.info("Endpoint still loading model at %s", _redact_url_for_log(url))
+            return []
        if api_key:
            status = e.response.status_code if e.response is not None else "unknown"
-            logger.warning(f"Failed to probe {url} with API key: HTTP {status}")
+            logger.warning("Failed to probe %s with API key: HTTP %s", _redact_url_for_log(url), status)
            return []
-        logger.warning(f"Failed to probe {url}: {e}")
+        logger.warning("Failed to probe %s: %s", _redact_url_for_log(url), e)
    except Exception as e:
        if api_key:
-            logger.warning(f"Failed to probe {url} with API key: {e}")
+            logger.warning("Failed to probe %s with API key: %s", _redact_url_for_log(url), e)
            return []
-        logger.warning(f"Failed to probe {url}: {e}")
+        logger.warning("Failed to probe %s: %s", _redact_url_for_log(url), e)

    # Older Ollama builds and some proxies expose native /api/tags even when
    # the OpenAI-compatible /v1/models path is unavailable.
@@ -787,7 +844,7 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
            r = httpx.get(root + "/api/tags", timeout=timeout, verify=llm_verify())
            r.raise_for_status()
            data = r.json()
-            models = [m.get("name") or m.get("model") for m in (data.get("models") or []) if m.get("name") or m.get("model")]
+            models = _ollama_model_names(data)
            if models:
                return [m for m in models if _is_chat_model(m)]
    except Exception as e:
@@ -816,6 +873,15 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) ->
        or "ollama" in (parsed_base.hostname or "").lower()
    )

+    # Loading-model detection uses the module-level
+    # _is_loading_model_response() helper; a nested copy would only shadow it.
+
    def _result_from_response(r) -> Dict[str, Any]:
        if 300 <= r.status_code < 400:
            loc = r.headers.get("location", "")
@@ -832,6 +898,13 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) ->
                "status_code": r.status_code,
                "error": None,
            }
+        if _is_loading_model_response(r):
+            return {
+                "reachable": True,
+                "loading": True,
+                "status_code": r.status_code,
+                "error": "Loading model",
+            }
        return {"reachable": False, "status_code": r.status_code, "error": f"HTTP {r.status_code}"}

    last_error: Optional[str] = None
@@ -864,7 +937,7 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) ->
        if 400 <= sc < 500 and sc not in (401, 403):
            models_url = _safe_build_models_url(base)
            try:
-                r2 = httpx.get(models_url, headers=headers, timeout=timeout, verify=llm_verify())
+                r2 = httpx.get(models_url, headers=headers, timeout=timeout, verify=llm_verify())
                result2 = _result_from_response(r2)
                if result2["reachable"]:
                    return result2
@@ -1048,9 +1121,11 @@ def setup_model_routes(model_discovery):
        except Exception:
            return 0.0

-    def _failure_delay(fails: int) -> float:
+    def _failure_delay(fails: int, *, empty_local: bool = False) -> float:
+        """Return the backoff delay to wait after ``fails`` refresh failures.
+
+        The delay doubles with each additional failure. With ``empty_local``
+        set it starts at 5.0 and is capped at 30.0; otherwise it starts at
+        ``_REFRESH_FAILURE_BASE`` and is capped at ``_REFRESH_FAILURE_MAX``.
+        Returns 0.0 when ``fails`` is not positive.
+        """
        if fails <= 0:
            return 0.0
+        if empty_local:
+            return min(5.0 * (2 ** max(0, fails - 1)), 30.0)
        return min(_REFRESH_FAILURE_BASE * (2 ** max(0, fails - 1)), _REFRESH_FAILURE_MAX)

    def _should_refresh_endpoint(ep: Any, now: float, force: bool = False) -> tuple[bool, Dict[str, Any]]:
@@ -1081,7 +1156,12 @@ def setup_model_routes(model_discovery):
        fails = int(state.get("fail_count") or 0)
        if fails and not force:
            last_failure = float(state.get("last_failure") or 0.0)
-            if now - last_failure < _failure_delay(fails):
+            empty_local = (
+                not cached
+                and category == "local"
+                and str(getattr(ep, "id", "") or "").startswith("local-")
+            )
+            if now - last_failure < _failure_delay(fails, empty_local=empty_local):
                return False, info
        if cached and not force:
            interval = _endpoint_refresh_interval(ep, category)
@@ -1396,7 +1476,7 @@ def setup_model_routes(model_discovery):
                t0 = _time.time()
                ping = _ping_endpoint(base, ep.api_key, timeout=1.5)
                entry["latency_ms"] = round((_time.time() - t0) * 1000)
-                entry["status"] = "online" if ping.get("reachable") or cached_count else "offline"
+                entry["status"] = "loading" if ping.get("loading") else ("online" if ping.get("reachable") or cached_count else "offline")
                entry["error"] = ping.get("error")
                entry["model_count"] = cached_count or (len(ANTHROPIC_MODELS) if provider == "anthropic" else 0)
            except Exception as e:
@@ -1570,9 +1650,37 @@ def setup_model_routes(model_discovery):
                # "everything's already cached" path because this branch only
                # runs for endpoints with an empty cached_models.
                if not all_models and not pinned and r.is_enabled:
-                    ping = _ping_endpoint(r.base_url, r.api_key, timeout=3.5)
+                    base_for_ping = _normalize_base(r.base_url)
+                    kind_for_ping = _effective_endpoint_kind(r, base_for_ping)
+                    ping_timeout = 10.0 if _classify_endpoint(base_for_ping, kind_for_ping) == "local" else 3.5
+                    ping = _ping_endpoint(r.base_url, r.api_key, timeout=ping_timeout)
                    if ping.get("reachable"):
-                        status = "empty"
+                        status = "loading" if ping.get("loading") else "empty"
+                        if ping.get("loading"):
+                            base = _normalize_base(r.base_url)
+                            kind = _effective_endpoint_kind(r, base)
+                            results.append({
+                                "id": r.id,
+                                "name": r.name,
+                                "base_url": r.base_url,
+                                "has_key": bool(r.api_key),
+                                "api_key_fingerprint": _api_key_fingerprint(r.api_key),
+                                "is_enabled": r.is_enabled,
+                                "models": visible,
+                                "pinned_models": pinned,
+                                "hidden_count": len(hidden),
+                                "online": True,
+                                "status": status,
+                                "ping_error": (ping or {}).get("error") if ping else None,
+                                "model_type": getattr(r, "model_type", None) or "llm",
+                                "supports_tools": getattr(r, "supports_tools", None),
+                                "endpoint_kind": kind,
+                                "category": _classify_endpoint(base, kind),
+                                "model_refresh_mode": _endpoint_refresh_mode(r, kind),
+                                "model_refresh_interval": getattr(r, "model_refresh_interval", None),
+                                "model_refresh_timeout": getattr(r, "model_refresh_timeout", None),
+                            })
+                            continue
                        # Best-effort: if the probe came back reachable, try
                        # to populate cached_models in the background so the
                        # NEXT picker load shows "online" instead of "empty".
@@ -1580,7 +1688,7 @@ def setup_model_routes(model_discovery):
                        # "empty" status, and the existing background refresh
                        # path will eventually fill it in too.
                        try:
-                            probed = _probe_endpoint(r.base_url, r.api_key, timeout=5)
+                            probed = _probe_endpoint(r.base_url, r.api_key, timeout=max(5, int(ping_timeout)))
                            if probed:
                                r.cached_models = json.dumps(probed)
                                db.commit()
@@ -1758,7 +1866,7 @@ def setup_model_routes(model_discovery):
        model_ids = _probe_endpoint(base_url, api_key.strip() or None, timeout=explicit_timeout) if should_probe else []
        ping = {"reachable": False, "error": None}
        if (should_probe or requested_kind in ("api", "proxy")) and not model_ids:
-            ping = _ping_endpoint(base_url, api_key.strip() or None, timeout=min(explicit_timeout, 2.0))
+            ping = _ping_endpoint(base_url, api_key.strip() or None, timeout=min(explicit_timeout, 10.0))
        if require_model_list and not model_ids:
            raise HTTPException(400, _model_endpoint_error_message(base_url, ping))

@@ -1825,7 +1933,7 @@ def setup_model_routes(model_discovery):
            "models": _merge_model_ids(model_ids, _pinned),
            "pinned_models": _pinned,
            "online": bool(model_ids) or bool(_pinned) or bool(ping.get("reachable")),
-            "status": "online" if (model_ids or _pinned) else ("empty" if ping.get("reachable") else "offline"),
+            "status": "online" if (model_ids or _pinned) else ("loading" if ping.get("loading") else ("empty" if ping.get("reachable") else "offline")),
            "ping_error": ping.get("error") if ping else None,
            "endpoint_kind": requested_kind,
            "category": _classify_endpoint(base_url, requested_kind),
@@ -1850,11 +1958,11 @@ def setup_model_routes(model_discovery):
        configured_timeout = _parse_positive_int(model_refresh_timeout, minimum=1, maximum=60)
        probe_timeout = _explicit_model_list_timeout(base_url, requested_kind, configured_timeout)
        models = _probe_endpoint(base_url, api_key.strip() or None, timeout=probe_timeout)
-        ping = {"reachable": True, "error": None} if models else _ping_endpoint(base_url, api_key.strip() or None, timeout=min(probe_timeout, 2.0))
+        ping = {"reachable": True, "error": None} if models else _ping_endpoint(base_url, api_key.strip() or None, timeout=min(probe_timeout, 10.0))
        return {
            "base_url": base_url,
            "online": bool(models) or bool(ping.get("reachable")),
-            "status": "online" if models else ("empty" if ping.get("reachable") else "offline"),
+            "status": "online" if models else ("loading" if ping.get("loading") else ("empty" if ping.get("reachable") else "offline")),
            "ping_error": ping.get("error") if ping else None,
            "models": models,
            "count": len(models),
@@ -2032,6 +2140,16 @@ def setup_model_routes(model_discovery):
            ep_id = (_user_prefs.get("default_endpoint_id") or "").strip()
            model = (_user_prefs.get("default_model") or "").strip()
            _fallbacks = _user_prefs.get("default_model_fallbacks") or []
+            # Fill any missing personal defaults (endpoint, model, fallbacks)
+            # from the global settings, but only when the
+            # "share_defaults_with_users" setting is enabled.
+            if settings.get("share_defaults_with_users", False):
+                if not ep_id:
+                    ep_id = settings.get("default_endpoint_id", "")
+                if not model:
+                    model = settings.get("default_model", "")
+                if not _fallbacks:
+                    _fallbacks = settings.get("default_model_fallbacks") or []
        else:
            ep_id = settings.get("default_endpoint_id", "")
            model = settings.get("default_model", "")
@@ -10,6 +10,7 @@ from fastapi import APIRouter, HTTPException, Request
 from pydantic import BaseModel

 from core.database import SessionLocal, Note
+from core.middleware import INTERNAL_TOOL_USER
 from src.auth_helpers import require_user
 from src.constants import DATA_DIR
 from sqlalchemy.orm.attributes import flag_modified
@@ -334,10 +335,11 @@ async def dispatch_reminder(
            # Loud diagnostic so we can see WHY a reminder didn't send (the
            # previous "silently no-op when cfg has no smtp_host" was invisible).
            logger.info(
-                f"dispatch_reminder[email] note_id={note_id} owner={owner!r} "
-                f"smtp_host={cfg.get('smtp_host')!r} smtp_user={cfg.get('smtp_user')!r} "
-                f"from={from_addr!r} recipient={recipient!r} "
-                f"account_name={cfg.get('account_name')!r}"
+                "dispatch_reminder[email] note_id=%s owner=%r "
+                "has_smtp_host=%s has_smtp_user=%s has_from=%s has_recipient=%s",
+                note_id, owner,
+                bool(cfg.get("smtp_host")), bool(cfg.get("smtp_user")),
+                bool(from_addr), bool(recipient),
            )
            missing = []
            if not cfg.get("smtp_host"):
@@ -582,7 +584,7 @@ def setup_note_routes(task_scheduler=None):
        return require_user(request) or None

    def _is_admin_or_single_user(request: Request, user: str | None) -> bool:
-        if user == "internal-tool":
+        if user == INTERNAL_TOOL_USER:
            return True
        if not user:
            # require_user() already admitted this request, which only happens
@@ -2,8 +2,9 @@
 """Routes for personal documents management."""
 import os
 import logging
+import shutil
 import uuid
-from typing import List, Tuple
+from typing import Any, Dict, List, Tuple
 from fastapi import APIRouter, HTTPException, Query, Request, UploadFile, File, Depends
 from src.request_models import DirectoryRequest
 from core.constants import BASE_DIR, PERSONAL_DIR, PERSONAL_UPLOADS_DIR
@@ -18,14 +19,15 @@ UPLOADS_DIR = PERSONAL_UPLOADS_DIR
 logger = logging.getLogger(__name__)


-def _personal_upload_dir_for_owner(owner: str | None) -> str:
+def _personal_upload_dir_for_owner(owner: str | None, *, create: bool = True) -> str:
    """Return the per-owner upload directory used for direct RAG uploads."""
+    # Reduce the owner to one sanitized path segment (at most 80 characters);
+    # a missing or empty owner falls back to "local".
    owner_segment = secure_filename((owner or "local").strip())[:80] or "local"
    upload_dir = os.path.abspath(os.path.join(UPLOADS_DIR, owner_segment))
    base_abs = os.path.abspath(UPLOADS_DIR)
+    # Refuse any segment that would resolve outside UPLOADS_DIR.
    if os.path.commonpath([upload_dir, base_abs]) != base_abs:
        raise ValueError("Unsafe upload owner path")
-    os.makedirs(upload_dir, exist_ok=True)
+    # create=False only computes the path and never creates the directory.
+    if create:
+        os.makedirs(upload_dir, exist_ok=True)
    return upload_dir


@@ -44,6 +46,87 @@ def _unique_personal_upload_path(upload_dir: str, original_name: str | None) ->
        raise ValueError("Unsafe upload filename")
    return file_path, filename, safe_name

+
+def _unique_existing_target(path: str) -> str:
+    """Return ``path`` if nothing exists there, else a fresh sibling path.
+
+    On a collision, a ``-<10 hex chars>`` suffix is inserted before the file
+    extension and regenerated until the candidate does not exist.
+    """
+    root, suffix = os.path.splitext(path)
+    candidate = path
+    while os.path.exists(candidate):
+        candidate = f"{root}-{uuid.uuid4().hex[:10]}{suffix}"
+    return candidate
+
+
+def _remove_empty_tree(path: str) -> None:
+    """Best-effort removal of empty directories under ``path``, then ``path`` itself.
+
+    Directories are visited bottom-up so children are tried before their
+    parents. Any directory that cannot be removed (for example because it
+    still holds files) is left in place. Does nothing if ``path`` is not a
+    directory.
+    """
+
+    def _rmdir_quietly(target: str) -> None:
+        try:
+            os.rmdir(target)
+        except OSError:
+            pass
+
+    if not os.path.isdir(path):
+        return
+    for parent, subdirs, _names in os.walk(path, topdown=False):
+        for name in subdirs:
+            _rmdir_quietly(os.path.join(parent, name))
+    _rmdir_quietly(path)
+
+
+def rename_personal_upload_owner(
+    old_owner: str,
+    new_owner: str,
+    *,
+    personal_docs_manager: Any = None,
+    rag_manager: Any = None,
+) -> Dict[str, Any]:
+    """Relocate a renamed user's direct uploads and notify the document managers.
+
+    Every file under the old owner's upload directory is moved to the same
+    relative location under the new owner's directory. A colliding target
+    gets a unique suffixed name, and the emptied old tree is then removed on
+    a best-effort basis. Afterwards ``personal_docs_manager.rename_directory``
+    and ``rag_manager.rename_owner`` are called if the manager exposes a
+    callable of that name.
+
+    Returns:
+        A dict with ``old_dir``, ``new_dir``, ``moved_files`` (count),
+        ``path_map`` (absolute source path -> absolute target path) and
+        ``rag_result`` (the return value of ``rename_owner``, or None).
+    """
+    source_dir = _personal_upload_dir_for_owner(old_owner, create=False)
+    dest_dir = _personal_upload_dir_for_owner(new_owner, create=False)
+    relocated: Dict[str, str] = {}
+    moved = 0
+
+    if source_dir != dest_dir and os.path.isdir(source_dir):
+        os.makedirs(dest_dir, exist_ok=True)
+        for current, _subdirs, names in os.walk(source_dir):
+            relative = os.path.relpath(current, source_dir)
+            if relative == ".":
+                dest_root = dest_dir
+            else:
+                dest_root = os.path.join(dest_dir, relative)
+            os.makedirs(dest_root, exist_ok=True)
+            for name in names:
+                src = os.path.abspath(os.path.join(current, name))
+                wanted = os.path.abspath(os.path.join(dest_root, name))
+                dst = _unique_existing_target(wanted)
+                shutil.move(src, dst)
+                relocated[src] = dst
+                moved += 1
+        _remove_empty_tree(source_dir)
+
+    # getattr on a None manager simply yields None, so a missing manager and
+    # a manager without the hook are handled the same way.
+    rename_directory = getattr(personal_docs_manager, "rename_directory", None)
+    if callable(rename_directory):
+        rename_directory(source_dir, dest_dir, path_map=relocated)
+
+    rename_owner = getattr(rag_manager, "rename_owner", None)
+    if callable(rename_owner):
+        rag_result = rename_owner(
+            old_owner,
+            new_owner,
+            path_map=relocated,
+            path_prefixes=[(source_dir, dest_dir)],
+        )
+    else:
+        rag_result = None
+
+    return {
+        "old_dir": source_dir,
+        "new_dir": dest_dir,
+        "moved_files": moved,
+        "path_map": relocated,
+        "rag_result": rag_result,
+    }
+
+
 def setup_personal_routes(personal_docs_manager, rag_manager, rag_available):
    """
    Setup personal documents related routes.
@@ -275,11 +358,13 @@ def setup_personal_routes(personal_docs_manager, rag_manager, rag_available):
                except Exception as e:
                    logger.warning(f"RAG removal failed for {filepath}: {e}")

-            # Delete file from disk if it's in uploads dir
+            # Delete file from disk if it's in the caller's own uploads dir.
+            # Scope to the per-owner subdir, not the shared uploads root, so one
+            # admin can't delete another user's personal files by path.
            deleted_from_disk = False
            try:
                abs_target = os.path.realpath(filepath)
-                base_abs = os.path.realpath(UPLOADS_DIR)
+                base_abs = os.path.realpath(_personal_upload_dir_for_owner(owner, create=False))
                in_uploads = (
                    abs_target == base_abs
                    or os.path.commonpath([abs_target, base_abs]) == base_abs
@@ -12,8 +12,10 @@ from typing import Optional
 from fastapi import APIRouter, HTTPException, Query, Request
 from fastapi.responses import HTMLResponse, StreamingResponse
 from pydantic import BaseModel, Field
+from core.middleware import INTERNAL_TOOL_USER
 from src.endpoint_resolver import resolve_endpoint
 from src.auth_helpers import _auth_disabled, get_current_user
+from core.auth import RESERVED_USERNAMES
 from src.constants import DEEP_RESEARCH_DIR

 _SESSION_ID_RE = re.compile(r"^[a-zA-Z0-9-]{1,128}$")
@@ -385,9 +387,9 @@ def setup_research_routes(research_handler, session_manager=None) -> APIRouter:
        """Launch a research job from the dedicated panel."""
        from src.auth_helpers import require_privilege
        user = require_privilege(request, "can_use_research")
-        if user == "internal-tool":
+        if user == INTERNAL_TOOL_USER:
            tool_owner = (request.headers.get("X-Odysseus-Owner") or "").strip()
-            if tool_owner and tool_owner not in {"internal-tool", "api", "demo", "system"}:
+            if tool_owner and tool_owner not in RESERVED_USERNAMES:
                auth_mgr = getattr(request.app.state, "auth_manager", None)
                if auth_mgr is not None and getattr(auth_mgr, "is_configured", False):
                    try:
@@ -11,7 +11,7 @@ from core.session_manager import SessionManager
 from core.models import ChatMessage
 from src.request_models import SessionResponse
 from core.database import Session as DbSession, SessionLocal, Document, GalleryImage, utcnow_naive
-from src.auth_helpers import get_current_user, effective_user, _auth_disabled, owner_filter
+from src.auth_helpers import effective_user, _auth_disabled, owner_filter
 from src.session_actions import is_session_recently_active


@@ -328,7 +328,7 @@ def setup_session_routes(session_manager: SessionManager, config: dict, webhook_
        endpoint_id: str = Form(""),
    ):
        skip_val = str(skip_validation).lower() == "true"
-        user = get_current_user(request)
+        user = effective_user(request)
        endpoint_api_key = ""
        endpoint_base_url = ""
        _reject_raw_endpoint_url_for_non_admin(request, user, endpoint_id, endpoint_url)
@@ -477,7 +477,7 @@ def setup_session_routes(session_manager: SessionManager, config: dict, webhook_
                db.close()
        # Switch model/endpoint mid-session
        if model is not None and endpoint_url is not None:
-            user = get_current_user(request)
+            user = effective_user(request)
            _reject_raw_endpoint_url_for_non_admin(request, user, endpoint_id, endpoint_url)
            endpoint_api_key = ""
            endpoint_base_url = ""
@@ -1004,6 +1004,7 @@ def setup_session_routes(session_manager: SessionManager, config: dict, webhook_
        """
        from src.llm_core import llm_call
        user = effective_user(request)
+        single_user_mode = not user and _auth_disabled()
        user_sessions = session_manager.get_sessions_for_user(user)

        # Delete empty and throwaway sessions before sorting
@@ -1022,7 +1023,12 @@ def setup_session_routes(session_manager: SessionManager, config: dict, webhook_
        }
        _THROWAWAY_MAX_MESSAGES = 4  # only delete if <= this many messages
        try:
-            rows = db.query(DbSession).filter(DbSession.archived == False, DbSession.owner == user).limit(2000).all()
+            rows_q = db.query(DbSession).filter(DbSession.archived == False)
+            if user:
+                rows_q = rows_q.filter(DbSession.owner == user)
+            elif not single_user_mode:
+                rows_q = rows_q.filter(DbSession.owner == user)
+            rows = rows_q.limit(2000).all()
            folder_map = {r.id: r.folder for r in rows}
            # Precompute per-session message counts in TWO aggregate queries
            # instead of 1–3 queries PER session — with many chats the per-row
@@ -1242,7 +1248,12 @@ def setup_session_routes(session_manager: SessionManager, config: dict, webhook_
        db = SessionLocal()
        try:
            for sid, folder_name in assignments.items():
-                db_session = db.query(DbSession).filter(DbSession.id == sid, DbSession.owner == user).first()
+                db_session_q = db.query(DbSession).filter(DbSession.id == sid)
+                if user:
+                    db_session_q = db_session_q.filter(DbSession.owner == user)
+                elif not single_user_mode:
+                    db_session_q = db_session_q.filter(DbSession.owner == user)
+                db_session = db_session_q.first()
                if db_session:
                    db_session.folder = folder_name
                    db_session.updated_at = datetime.utcnow()
@@ -15,6 +15,7 @@ from collections import namedtuple
 from pathlib import Path
 from typing import Dict, Any
 from core.platform_compat import IS_APPLE_SILICON, which_tool
+from core.middleware import INTERNAL_TOOL_USER
 from src.optional_deps import prepare_optional_dependency_import

 # POSIX-only: `pty`/`fcntl` transitively import `termios`, which does NOT exist
@@ -55,7 +56,7 @@ def _require_admin(request: Request):
    # In-process tool loopback. The AuthMiddleware already validated the
    # internal token + loopback client before setting this marker, so
    # honour it here as admin-equivalent.
-    if user == "internal-tool":
+    if user == INTERNAL_TOOL_USER:
        return
    if not user or user == "api":
        raise HTTPException(403, "Admin only")
@@ -330,6 +331,9 @@ def add_user_install_bins_to_path():
        candidates.append(os.path.join(site.USER_BASE, 'bin'))
    except Exception:
        pass
+    candidates.append(os.path.expanduser('~/bin'))
+    candidates.append(os.path.expanduser('~/llama.cpp/build/bin'))
+    candidates.append(os.path.expanduser('~/llama.cpp/build-vulkan/bin'))
    candidates.append(os.path.expanduser('~/.local/bin'))
    parts = os.environ.get('PATH', '').split(os.pathsep) if os.environ.get('PATH') else []
    changed = False
@@ -961,12 +965,84 @@ def setup_shell_routes() -> APIRouter:

        return StreamingResponse(generate(), media_type="text/event-stream")

+    def _os_id_from_release(text: str) -> str:
+        """Map /etc/os-release contents to a canonical family for our matrix.
+
+        Returns "debian", "arch", "fedora", "alpine" or "suse", or "" when the
+        text is empty or no ID / ID_LIKE token is recognised.
+        """
+        ids: set[str] = set()
+        for line in (text or "").splitlines():
+            key, sep, value = line.strip().partition("=")
+            if sep and key in ("ID", "ID_LIKE"):
+                # os-release(5) allows bare, "double"- or 'single'-quoted values;
+                # ID_LIKE is a space-separated list of parent distros.
+                ids.update(value.strip().strip("\"'").lower().split())
+        families = (
+            ("debian", ("debian", "ubuntu", "linuxmint", "pop", "elementary")),
+            ("arch", ("arch", "manjaro", "endeavouros", "cachyos", "garuda")),
+            ("fedora", ("fedora", "rhel", "centos", "rocky", "almalinux", "ol")),
+            ("alpine", ("alpine",)),
+            ("suse", ("suse", "opensuse", "opensuse-leap", "opensuse-tumbleweed", "sles")),
+        )
+        return next((fam for fam, names in families if ids.intersection(names)), "")
+
+    # Lookup tables behind `_install_cmd_for_target`, all keyed by OS family:
+    #   _PKG_NAMES[canonical_name][os_id] -> that OS's package names for a prereq
+    #   _BACKEND_EXTRAS[backend][os_id]   -> extras for CUDA / ROCm / Vulkan builds
+    #   _PKG_MGR[os_id]                   -> install command with a {pkgs} slot
+    # An empty list means nothing is installed for that name on that OS.
+    _PKG_NAMES = {
+        # canonical-name → {os_id: [actual_pkg_names_on_this_os]}
+        "cmake":           {"debian": ["cmake"], "arch": ["cmake"], "fedora": ["cmake"], "alpine": ["cmake"], "suse": ["cmake"], "macos": ["cmake"]},
+        "build-essential": {"debian": ["build-essential"], "arch": ["base-devel"], "fedora": ["gcc", "gcc-c++", "make"], "alpine": ["build-base"], "suse": ["gcc-c++", "make"], "macos": []},
+        "g++":             {"debian": ["g++"], "arch": ["gcc"], "fedora": ["gcc-c++"], "alpine": ["g++"], "suse": ["gcc-c++"], "macos": []},
+        "gcc":             {"debian": ["gcc"], "arch": ["gcc"], "fedora": ["gcc"], "alpine": ["gcc"], "suse": ["gcc"], "macos": []},
+        "make":            {"debian": ["make"], "arch": ["make"], "fedora": ["make"], "alpine": ["make"], "suse": ["make"], "macos": []},
+        "git":             {"debian": ["git"], "arch": ["git"], "fedora": ["git"], "alpine": ["git"], "suse": ["git"], "macos": ["git"]},
+        "tmux":            {"debian": ["tmux"], "arch": ["tmux"], "fedora": ["tmux"], "alpine": ["tmux"], "suse": ["tmux"], "macos": ["tmux"]},
+    }
+    _BACKEND_EXTRAS = {
+        "cuda":   {"debian": ["nvidia-cuda-toolkit"], "arch": ["cuda"], "fedora": ["cuda-toolkit"], "alpine": [], "suse": ["cuda"], "macos": []},
+        "rocm":   {"debian": ["rocm-dev"], "arch": ["rocm-hip-sdk"], "fedora": ["rocm-devel"], "alpine": [], "suse": ["rocm-dev"], "macos": []},
+        "vulkan": {"debian": ["libvulkan-dev", "vulkan-tools"], "arch": ["vulkan-headers", "vulkan-tools"], "fedora": ["vulkan-headers", "vulkan-tools"], "alpine": ["vulkan-loader-dev", "vulkan-tools"], "suse": ["vulkan-devel", "vulkan-tools"], "macos": []},
+    }
+    _PKG_MGR = {
+        "debian": "sudo apt install -y {pkgs}",
+        "arch":   "sudo pacman -S --needed {pkgs}",
+        "fedora": "sudo dnf install -y {pkgs}",
+        "alpine": "sudo apk add {pkgs}",
+        "suse":   "sudo zypper install -n {pkgs}",
+        "macos":  "brew install {pkgs}",
+    }
+
+    def _install_cmd_for_target(os_id: str, backend: str, missing: list[str]) -> str:
+        """Build one install command for the missing prereqs on `os_id`.
+
+        Prereq names are translated through `_PKG_NAMES`; the backend's
+        `_BACKEND_EXTRAS` packages follow. Duplicates are dropped, order is kept.
+        Returns "" when `os_id` is unknown or nothing resolves to a package.
+        """
+        template = _PKG_MGR.get(os_id) if os_id else None
+        if template is None:
+            return ""
+        tables = [_PKG_NAMES.get(name, {}) for name in missing]
+        # Extras come last and only for the requested backend (case-insensitive),
+        # so e.g. a Vulkan target is never told to install the CUDA toolkit.
+        tables.append(_BACKEND_EXTRAS.get((backend or "").lower(), {}))
+        # dict.fromkeys() de-duplicates while preserving first-seen order.
+        ordered = dict.fromkeys(pkg for table in tables for pkg in table.get(os_id, []))
+        if not ordered:
+            return ""
+        return template.format(pkgs=" ".join(ordered))
+
    @router.get("/api/cookbook/packages")
    async def list_packages(
        request: Request,
        host: str | None = None,
        ssh_port: str | None = None,
        venv: str | None = None,
+        backend: str | None = None,
    ):
        """Check which optional packages are installed.

@@ -1015,6 +1091,12 @@ def setup_shell_routes() -> APIRouter:
                "kind": "system",
                "install_hint": "Install Docker on the selected server and allow this user to run docker.",
            },
+            # Note: cmake / gcc / git are not separate dependency rows —
+            # they're declared as `system_prereqs` on llama_cpp (and any
+            # other engine that compiles from source) so they appear as
+            # an inline status note on that engine's row instead of
+            # cluttering the panel with raw OS package names that aren't
+            # meaningful product-level dependencies on their own.
            # ── LLM ── installs on GPU servers for model serving/downloading
            {
                "name": "hf_transfer",
@@ -1026,9 +1108,16 @@ def setup_shell_routes() -> APIRouter:
            {
                "name": "llama_cpp",
                "pip": "llama-cpp-python[server]",
-                "desc": "Serve GGUF models via llama.cpp",
+                "desc": "Great for single-GPU or CPU inference with GGUF models",
                "category": "LLM",
                "target": "remote",
+                # Build-toolchain prereqs. Cookbook's launch bootstrap
+                # compiles llama-server from source when no prebuilt
+                # binary is present; without these the build aborts
+                # with `cmake: command not found`. Surfaced inline on
+                # this row so the user doesn't have to chase three
+                # separate OS-package rows.
+                "system_prereqs": ["cmake", "g++", "git"],
            },
            {
                "name": "sglang",
@@ -1040,7 +1129,7 @@ def setup_shell_routes() -> APIRouter:
            {
                "name": "vllm",
                "pip": "vllm",
-                "desc": "High-throughput LLM serving engine",
+                "desc": "Great for high-throughput multi-GPU inference",
                "category": "LLM",
                "target": "remote",
            },
@@ -1103,6 +1192,7 @@ def setup_shell_routes() -> APIRouter:
        # venv over SSH so a remote `pip install` actually reflects here.
        remote_status: dict = {}
        remote_details: dict = {}
+        remote_probe_error = ""
        remote_names = [
            p["name"]
            for p in packages
@@ -1141,16 +1231,56 @@ def setup_shell_routes() -> APIRouter:
                        break
            except ValueError as e:
                raise HTTPException(400, str(e))
-            except Exception:
+            except Exception as e:
                remote_status = {}
-        if host and remote_system_names:
+                remote_probe_error = f"SSH package probe failed: {str(e)[:160]}"
+            if "llama_cpp" in remote_names:
+                try:
+                    inner = (
+                        'export PATH="$HOME/.local/bin:$HOME/bin:'
+                        '$HOME/llama.cpp/build/bin:$HOME/llama.cpp/build-vulkan/bin:$PATH"; '
+                        "command -v llama-server 2>/dev/null || true"
+                    )
+                    argv = _ssh_base_argv(host, ssh_port) + [inner]
+                    proc = await asyncio.create_subprocess_exec(
+                        *argv,
+                        stdout=asyncio.subprocess.PIPE,
+                        stderr=asyncio.subprocess.PIPE,
+                    )
+                    out, _err = await asyncio.wait_for(proc.communicate(), timeout=8)
+                    llama_server_path = out.decode("utf-8", errors="replace").strip().splitlines()
+                    llama_server_path = llama_server_path[-1].strip() if llama_server_path else ""
+                    if llama_server_path:
+                        remote_status["llama_cpp"] = True
+                        probe = remote_details.setdefault("llama_cpp", {})
+                        if isinstance(probe, dict):
+                            probe.setdefault("binaries", {})["llama-server"] = llama_server_path
+                except Exception as e:
+                    if not remote_probe_error:
+                        remote_probe_error = f"SSH llama-server probe failed: {str(e)[:160]}"
+                    pass
+        # Union of system_names + every package's system_prereqs. Probing
+        # the prereqs alongside the main system deps in a single SSH call
+        # avoids a second round-trip per Cookbook → Dependencies refresh.
+        prereq_names: set[str] = set()
+        for p in packages:
+            for pr in p.get("system_prereqs") or []:
+                prereq_names.add(str(pr))
+        all_system_names = list(set(remote_system_names) | prereq_names)
+        # Detect the target's OS family + read /etc/os-release in the same
+        # SSH round-trip as the prereq probe — used downstream to render a
+        # single OS-specific install command per row instead of dumping
+        # every distro's syntax onto the user.
+        target_os_id: str = ""
+        if host and all_system_names:
            try:
                checks = []
-                for name in remote_system_names:
+                for name in all_system_names:
                    qn = shlex.quote(name)
                    checks.append(
                        f"if command -v {qn} >/dev/null 2>&1; then echo {qn}=1; else echo {qn}=0; fi"
                    )
+                checks.append("echo '---OSREL---'; cat /etc/os-release 2>/dev/null || true")
                inner = " ; ".join(checks)
                argv = _ssh_base_argv(host, ssh_port) + [inner]
                proc = await asyncio.create_subprocess_exec(
@@ -1160,20 +1290,45 @@ def setup_shell_routes() -> APIRouter:
                )
                out, _err = await asyncio.wait_for(proc.communicate(), timeout=12)
                txt = out.decode("utf-8", errors="replace").strip()
+                _section, _osrel_lines = "probe", []
                for line in txt.splitlines():
+                    if line.strip() == "---OSREL---":
+                        _section = "osrel"; continue
+                    if _section == "osrel":
+                        _osrel_lines.append(line)
+                        continue
                    name, sep, value = line.strip().partition("=")
-                    if sep and name in remote_system_names:
+                    if sep and name in all_system_names:
                        remote_status[name] = value == "1"
+                target_os_id = _os_id_from_release("\n".join(_osrel_lines))
            except ValueError as e:
                raise HTTPException(400, str(e))
-            except Exception:
+            except Exception as e:
+                if not remote_probe_error:
+                    remote_probe_error = f"SSH system probe failed: {str(e)[:160]}"
                pass
+        elif not host:
+            # Local target — probe in-process so the inline install command
+            # still appears in the dep panel when the cookbook container
+            # itself is the selected server.
+            try:
+                with open("/etc/os-release", encoding="utf-8") as f:
+                    target_os_id = _os_id_from_release(f.read())
+            except Exception:
+                target_os_id = ""
+            if sys.platform == "darwin":
+                target_os_id = "macos"

        for pkg in packages:
            on_remote = bool(host and pkg.get("target") == "remote")
            probe = None
            if on_remote:
-                pkg["installed"] = bool(remote_status.get(pkg["name"], False))
+                if remote_probe_error and pkg["name"] not in remote_status:
+                    pkg["installed"] = None
+                    pkg["probe_error"] = remote_probe_error
+                    pkg["status_note"] = remote_probe_error
+                else:
+                    pkg["installed"] = bool(remote_status.get(pkg["name"], False))
                probe = remote_details.get(pkg["name"])
                if isinstance(probe, dict):
                    pkg["details"] = probe
@@ -1222,13 +1377,116 @@ def setup_shell_routes() -> APIRouter:
                    pkg["installed"] = False
                except importlib_metadata.PackageNotFoundError:
                    pkg["installed"] = False
-                except Exception:
+                except (Exception, SystemExit):
                    # Installed but crashes on import — e.g. a CUDA build of
                    # llama-cpp-python raising FileNotFoundError when the CUDA
-                    # toolkit dir is absent. One broken optional package must not
-                    # 500 the entire packages panel; report it as not usable.
+                    # toolkit dir is absent, or rembg calling sys.exit(1) when no
+                    # onnxruntime backend can be loaded. SystemExit is a
+                    # BaseException, not Exception, so without catching it here a
+                    # single sys.exit-on-import package escapes and takes down the
+                    # whole packages panel / worker (the panel hangs forever). One
+                    # broken optional package must not 500 — or hang — the entire
+                    # panel; report it as not usable.
                    pkg["installed"] = False

+            # llama_cpp partial-state probe: when the package is installed
+            # but the wheel was built CPU-only AND the target has NVIDIA
+            # hardware, mark the row as partial (yellow/orange) with a
+            # one-click upgrade to the CUDA wheel. Without this the row
+            # reads "ready" green while inference runs at 3 tok/s on GPU
+            # silicon — actively misleading.
+            if pkg["name"] == "llama_cpp" and pkg.get("installed"):
+                _native_llama_server = bool(
+                    isinstance(probe, dict)
+                    and isinstance(probe.get("binaries"), dict)
+                    and probe["binaries"].get("llama-server")
+                )
+                _gpu_capable = False
+                _has_nvidia_target = False
+                if _native_llama_server:
+                    # Native llama-server is the launcher path Cookbook now
+                    # prefers. Do not mark this as a CPU-only Python wheel just
+                    # because llama-cpp-python is absent from the selected venv.
+                    _gpu_capable = True
+                elif on_remote and host:
+                    try:
+                        # Activate the configured venv FIRST so the probe
+                        # runs against the same python the launch script
+                        # would activate. Without this prefix, bare
+                        # `python3` was checked — which can disagree with
+                        # the venv's wheel (e.g. user-site has CUDA wheel
+                        # but venv has CPU-only), and the dep panel then
+                        # showed "ready" green while every launch fell to
+                        # CPU.
+                        _vp = _venv_activate_prefix(venv)
+                        probe = (
+                            f'{_vp}python3 -c "import llama_cpp; import sys; '
+                            'sys.exit(0 if llama_cpp.llama_supports_gpu_offload() else 1)" '
+                            '&& echo llama_cpp_gpu=1 || echo llama_cpp_gpu=0; '
+                            'command -v nvidia-smi >/dev/null 2>&1 '
+                            '&& nvidia-smi -L 2>/dev/null | grep -q "GPU " '
+                            '&& echo nvidia=1 || echo nvidia=0'
+                        )
+                        argv = _ssh_base_argv(host, ssh_port) + [probe]
+                        proc = await asyncio.create_subprocess_exec(
+                            *argv, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
+                        )
+                        out, _ = await asyncio.wait_for(proc.communicate(), timeout=8)
+                        txt = out.decode("utf-8", errors="replace")
+                        if "llama_cpp_gpu=1" in txt:
+                            _gpu_capable = True
+                        if "nvidia=1" in txt:
+                            _has_nvidia_target = True
+                    except Exception:
+                        pass
+                else:
+                    try:
+                        import llama_cpp as _lcp  # type: ignore
+                        _gpu_capable = bool(_lcp.llama_supports_gpu_offload())
+                    except Exception:
+                        _gpu_capable = False
+                    _has_nvidia_target = shutil.which("nvidia-smi") is not None
+                if (not _gpu_capable) and _has_nvidia_target:
+                    pkg["partial"] = True
+                    pkg["partial_reason"] = "Installed but CPU-only wheel — GPU detected on this target. Upgrade to a CUDA wheel for ~10× faster inference."
+                    pkg["partial_action"] = "reinstall_llama_cpp_cuda"
+            # Attach per-package system_prereqs status. We probed each
+            # prereq name above; surface "Missing build deps: …" ONLY
+            # when the package itself is not installed — if the package
+            # works (e.g. llama-cpp-python already imports cleanly), the
+            # build toolchain is irrelevant and surfacing it as a red
+            # flag confuses users ("ready" + "missing" on the same row).
+            _prereqs = list(pkg.get("system_prereqs") or [])
+            if _prereqs:
+                if on_remote:
+                    _pr_present = {n: bool(remote_status.get(n)) for n in _prereqs}
+                else:
+                    _pr_present = {n: shutil.which(n) is not None for n in _prereqs}
+                pkg["system_prereqs_status"] = _pr_present
+                _missing = [n for n, ok in _pr_present.items() if not ok]
+                # Suppress the "missing build deps" hint when the package
+                # itself is installed — build deps are only relevant if
+                # the user would need to recompile from source.
+                if pkg.get("installed"):
+                    _missing = []
+                if _missing:
+                    # Build a target-specific install command from the
+                    # (os_family, backend) matrix when we know both. Fall
+                    # back to the multi-distro hint only when the target's
+                    # OS can't be classified (e.g. ssh probe failed).
+                    _resolved_os = target_os_id or "debian"  # safest default
+                    _cmd = _install_cmd_for_target(_resolved_os, backend or "", _missing)
+                    if _cmd and target_os_id:
+                        _hint = "Missing build deps for this target: " + ", ".join(_missing)
+                        pkg["install_cmd_for_target"] = _cmd
+                        pkg["install_cmd_os"] = target_os_id
+                        pkg["install_cmd_backend"] = (backend or "").lower()
+                    else:
+                        _hint = "Missing build deps: " + ", ".join(_missing) + ". Install via apt: cmake build-essential git / pacman: cmake base-devel git / dnf: cmake gcc-c++ make git / brew: cmake git."
+                    _existing_note = pkg.get("status_note") or ""
+                    pkg["status_note"] = (_existing_note + " — " + _hint) if _existing_note else _hint
+                    pkg["build_deps_missing"] = _missing
+
            if pkg.get("installed"):
                update_status = _package_pip_update_status(pkg, probe)
                pkg["pip_update_available"] = update_status.available
@@ -1288,6 +1546,102 @@ def setup_shell_routes() -> APIRouter:
            return {"ok": True, "output": stdout.decode()[-200:]}
        return {"ok": False, "error": stderr.decode()[-300:]}

+    @router.post("/api/cookbook/install-system-deps")
+    async def install_system_deps(request: Request):
+        """Install OS-level system packages (cmake/build-essential/git/tmux)
+        on a remote target or in the local container. Admin only.
+
+        Request JSON: `packages` (list of canonical names) plus optional
+        `remote_host` / `ssh_port`. Names outside the allowlist are dropped so
+        the route can't be coerced into installing arbitrary OS packages.
+        Uses `sudo -n` (passwordless) so the call returns a clear "needs sudo
+        password" error instead of hanging when interactive sudo is required.
+
+        Returns a dict with `ok` and, once the installer ran, `exit_code`,
+        `output` and `error`. A ValueError from `_ssh_base_argv` becomes a 400.
+        """
+        _require_admin(request)
+        # Malformed or non-object JSON is handled like an empty request, not a 500.
+        try:
+            body = await request.json()
+        except Exception:
+            body = {}
+        if not isinstance(body, dict):
+            body = {}
+        raw = body.get("packages")
+        if not isinstance(raw, (list, tuple)):
+            raw = []
+        host = str(body.get("remote_host") or "").strip()
+        ssh_port = body.get("ssh_port")
+        # Names users can request — must match canonical names used in the
+        # deps catalog's `system_prereqs` field and on the System rows.
+        ALLOWED = {"cmake", "build-essential", "g++", "gcc", "git", "tmux", "make"}
+        # dict.fromkeys() drops repeated names while keeping request order.
+        pkgs = list(dict.fromkeys(s for s in (str(p).strip() for p in raw) if s in ALLOWED))
+        if not pkgs:
+            return {"ok": False, "error": "no installable packages requested (allowlist: " + ", ".join(sorted(ALLOWED)) + ")"}
+        # Canonical name -> real package names per manager. apt takes them
+        # as-is; Arch ships g++ inside `gcc` (there is no `g++` package);
+        # Homebrew is not used for the compiler toolchain, so those map to [].
+        renames = {
+            "apt": {},
+            "pacman": {"build-essential": ["base-devel"], "g++": ["gcc"]},
+            "dnf": {"build-essential": ["gcc", "gcc-c++", "make"], "g++": ["gcc-c++"]},
+            "brew": {"build-essential": [], "g++": [], "gcc": [], "make": []},
+        }
+        def _args(mgr):
+            """Return the shell-quoted, de-duplicated package list for `mgr`."""
+            names = dict.fromkeys(p for n in pkgs for p in renames[mgr].get(n, [n]))
+            return " ".join(shlex.quote(p) for p in names)
+        apt_pkgs, pac_pkgs, dnf_pkgs, brew_pkgs = map(_args, ("apt", "pacman", "dnf", "brew"))
+        # A bare `brew install` is a usage error, so report "nothing to do" instead.
+        brew_cmd = f"brew install {brew_pkgs}" if brew_pkgs else 'echo "No Homebrew packages needed for this request."'
+        # Errors go to stderr (>&2) so the route's error field gets populated.
+        # Only the sudo-based branches call need_sudo; brew runs unprivileged.
+        script = (
+            'set -e; '
+            'need_sudo() { if ! sudo -n true 2>/dev/null; then '
+            '  echo "ERROR: passwordless sudo unavailable on this target. Run once: sudo apt install -y ' + " ".join(pkgs) + ' (or your distro equivalent: pacman -S, dnf install, brew install). After that, Cookbook can install the rest." >&2; exit 2; fi; }; '
+            'if command -v apt-get >/dev/null 2>&1; then need_sudo; '
+            f'  sudo -n env DEBIAN_FRONTEND=noninteractive apt-get update -qq && sudo -n env DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends {apt_pkgs}; '
+            'elif command -v pacman >/dev/null 2>&1; then need_sudo; '
+            f'  sudo -n pacman -Sy --needed --noconfirm {pac_pkgs}; '
+            'elif command -v dnf >/dev/null 2>&1; then need_sudo; '
+            f'  sudo -n dnf install -y {dnf_pkgs}; '
+            'elif command -v brew >/dev/null 2>&1; then '
+            f'  {brew_cmd}; '
+            'else '
+            '  echo "ERROR: no supported package manager (apt/pacman/dnf/brew) on this target." >&2; exit 3; fi'
+        )
+        try:
+            argv = (_ssh_base_argv(host, ssh_port) + [script]) if host else ["bash", "-lc", script]
+        except ValueError as e:
+            raise HTTPException(400, str(e))
+        proc = await asyncio.create_subprocess_exec(
+            *argv, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+        )
+        try:
+            out, err = await asyncio.wait_for(proc.communicate(), timeout=180)
+        except asyncio.TimeoutError:
+            # Don't leave the installer (or the ssh client) running unattended.
+            try:
+                proc.kill()
+            except ProcessLookupError:
+                pass
+            await proc.wait()
+            return {"ok": False, "error": "Install timed out after 180s"}
+        ok = proc.returncode == 0
+        err_txt = err.decode("utf-8", errors="replace").strip()
+        out_txt = out.decode("utf-8", errors="replace").strip()
+        # Prefer stderr, then the stdout tail, then the exit code — never an empty error.
+        combined = None if ok else (err_txt or out_txt[-500:] or f"exit code {proc.returncode}")
+        return {
+            "ok": ok,
+            "exit_code": proc.returncode,
+            "output": out_txt[-1000:],
+            "error": combined,
+        }
+
    @router.post("/api/cookbook/rebuild-engine")
    async def rebuild_engine(request: Request):
        """Clear the cached llama.cpp build so the next serve recompiles.
@@ -1308,7 +1662,8 @@ def setup_shell_routes() -> APIRouter:
            return {"ok": False, "error": f"Unsupported engine: {engine}"}
        host = str(body.get("remote_host") or "").strip()
        ssh_port = body.get("ssh_port")
-        cmd = _llama_cpp_rebuild_cmd()
+        update_source = bool(body.get("update_source"))
+        cmd = _llama_cpp_rebuild_cmd(update_source=update_source)
        try:
            argv = (
                (_ssh_base_argv(host, ssh_port) + [cmd])
@@ -11,6 +11,7 @@ from fastapi import APIRouter, HTTPException, Request
 from pydantic import BaseModel

 from core.database import SessionLocal, ScheduledTask, TaskRun
+from core.middleware import INTERNAL_TOOL_USER
 from core.constants import internal_api_base
 from src.auth_helpers import get_current_user
 from src.constants import DATA_DIR, EMAIL_URGENCY_CACHE_DIR
@@ -427,7 +428,7 @@ def setup_task_routes(task_scheduler) -> APIRouter:
        # In-process tool-loopback marker — AuthMiddleware validated
        # the internal token + loopback client before stamping this,
        # so treat as admin-equivalent.
-        if user == "internal-tool":
+        if user == INTERNAL_TOOL_USER:
            return True
        try:
            from core.auth import AuthManager
@@ -3,11 +3,16 @@ import os
 import time
 import json
 import asyncio
+import shutil
+import uuid
+from pathlib import Path
 from fastapi import APIRouter, Request, File, UploadFile, HTTPException
 from typing import List
 import logging
 from core.middleware import require_admin
-from src.auth_helpers import get_current_user
+from core.database import SessionLocal, GalleryImage
+from src.auth_helpers import effective_user
+from src.constants import GENERATED_IMAGES_DIR
 from src.upload_handler import count_recent_uploads

 logger = logging.getLogger(__name__)
@@ -50,6 +55,69 @@ def setup_upload_routes(upload_handler):
            raise HTTPException(404, "File not found")

        raise HTTPException(404, "File not found")
+
+    def _promote_chat_image_to_gallery(meta: dict, owner: str | None) -> str | None:
+        """Make chat-uploaded images visible in Gallery without changing chat storage.
+
+        Copies the upload into GENERATED_IMAGES_DIR and records a GalleryImage row.
+        Returns the gallery id (the existing one for a same-hash active image), or
+        None when the upload is not an image, its file is missing, or a step fails.
+        """
+        is_image_file = getattr(upload_handler, "is_image_file", None)
+        if not callable(is_image_file):
+            return None
+        if not is_image_file(meta.get("name", ""), meta.get("mime", "")):
+            return None
+        source_path = meta.get("path")
+        if not source_path or not os.path.isfile(source_path):
+            return None
+
+        dest_path = None  # set once the copy target is chosen; removed on failure
+        db = SessionLocal()
+        try:
+            file_hash = meta.get("hash")
+            if file_hash:
+                q = db.query(GalleryImage).filter(
+                    GalleryImage.file_hash == file_hash,
+                    GalleryImage.is_active == True,  # noqa: E712
+                )
+                if owner:
+                    q = q.filter(GalleryImage.owner == owner)
+                existing = q.first()
+                if existing:
+                    return existing.id
+
+            image_dir = Path(GENERATED_IMAGES_DIR)
+            image_dir.mkdir(parents=True, exist_ok=True)
+            # Trust a known image suffix; otherwise derive one from the MIME type.
+            ext = Path(meta.get("name") or source_path).suffix.lower()
+            if ext not in {".png", ".jpg", ".jpeg", ".webp", ".gif"}:
+                by_mime = {"image/png": ".png", "image/jpeg": ".jpg", "image/jpg": ".jpg",
+                           "image/webp": ".webp", "image/gif": ".gif"}
+                ext = by_mime.get(meta.get("mime", "")) or ".png"
+            filename = f"{uuid.uuid4().hex[:12]}{ext}"
+            dest_path = image_dir / filename
+            shutil.copy2(source_path, dest_path)
+            image_id = str(uuid.uuid4())
+            db.add(GalleryImage(
+                id=image_id, filename=filename, prompt=meta.get("name") or "Chat upload",
+                model="chat-upload", owner=owner, file_hash=file_hash,
+                width=meta.get("width"), height=meta.get("height"), file_size=meta.get("size"),
+            ))
+            db.commit()
+            return image_id
+        except Exception as e:
+            db.rollback()
+            logger.warning("Failed to add chat image upload to gallery: %s", e)
+            # Without a committed row nothing references the copy — don't orphan it.
+            if dest_path is not None:
+                try:
+                    dest_path.unlink(missing_ok=True)
+                except OSError:
+                    pass
+            return None
+        finally:
+            db.close()
    
    @router.post("")
    async def api_upload(request: Request, files: List[UploadFile] = File(...)):
@@ -78,8 +146,10 @@ def setup_upload_routes(upload_handler):
        
        for u in files:
            try:
-                meta = upload_handler.save_upload(u, client_ip, owner=get_current_user(request))
-                out.append({
+                owner = effective_user(request)
+                meta = upload_handler.save_upload(u, client_ip, owner=owner)
+                gallery_id = _promote_chat_image_to_gallery(meta, owner)
+                item = {
                    "id": meta["id"],
                    "name": meta["name"],
                    "mime": meta["mime"],
@@ -89,7 +159,10 @@ def setup_upload_routes(upload_handler):
                    "width": meta.get("width"),
                    "height": meta.get("height"),
                    "is_duplicate": meta.get("is_duplicate", False)
-                })
+                }
+                if gallery_id:
+                    item["gallery_id"] = gallery_id
+                out.append(item)
            except HTTPException:
                raise
            except Exception as e:
@@ -138,7 +211,7 @@ def setup_upload_routes(upload_handler):
                original_name = info.get("name", file_id)
        auth_mgr = getattr(request.app.state, "auth_manager", None)
        auth_configured = bool(auth_mgr and auth_mgr.is_configured)
-        current_user = get_current_user(request)
+        current_user = effective_user(request)
        file_owner = info.get("owner") if info else None
        if auth_configured:
            if not current_user:
@@ -204,7 +277,7 @@ def setup_upload_routes(upload_handler):
        info = _load_upload_info(file_id)
        auth_mgr = getattr(request.app.state, "auth_manager", None)
        auth_configured = bool(auth_mgr and auth_mgr.is_configured)
-        current_user = get_current_user(request)
+        current_user = effective_user(request)
        file_owner = info.get("owner") if info else None
        if auth_configured:
            if not current_user:
@@ -247,7 +320,7 @@ def setup_upload_routes(upload_handler):
            raise HTTPException(404, "File not found")
        auth_mgr = getattr(request.app.state, "auth_manager", None)
        auth_configured = bool(auth_mgr and auth_mgr.is_configured)
-        current_user = get_current_user(request)
+        current_user = effective_user(request)
        file_owner = info.get("owner")
        if auth_configured:
            if not current_user:
@@ -1,6 +1,5 @@
 """Webhook, API Token, and sync chat routes."""

-import asyncio
 import uuid
 import logging
 from typing import Optional
@@ -385,10 +384,10 @@ def setup_webhook_routes(
        sess.add_message(ChatMessage("assistant", reply))
        session_manager.save_sessions()

-        asyncio.create_task(webhook_manager.fire("chat.completed", {
+        webhook_manager.fire_and_forget("chat.completed", {
            "session_id": session_id, "model": sess.model,
            "user_message": message[:2000], "response": reply[:2000],
-        }))
+        })

        return {"response": reply, "session_id": session_id, "model": sess.model}