Merge pull request #4701 from pewdiepie-archdaemon/sync-dev-from-main-20260622

chore(dev): sync main cookbook and model workflow fixes
2026-06-28 07:35:27 -04:00 · 2026-06-22 11:52:26 +09:00
parent 160267417e 993d504de3
commit 5f63a3d3bd
65 changed files with 6072 additions and 846 deletions
@@ -20,6 +20,23 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    gosu \
    && rm -rf /var/lib/apt/lists/*
 # Docker CLI (client only — daemon stays on the host via the
 # /var/run/docker.sock mount). The Debian `docker.io` package ships
 # dockerd but not the client binary on slim, so grab the static client
 # tarball from download.docker.com instead.
 ARG DOCKER_CLI_VERSION=27.5.1
 RUN ARCH="$(dpkg --print-architecture)" \
    && case "$ARCH" in \
         amd64) DARCH=x86_64 ;; \
         arm64) DARCH=aarch64 ;; \
         *) echo "unsupported arch $ARCH"; exit 1 ;; \
       esac \
    && curl -fsSL "https://download.docker.com/linux/static/stable/${DARCH}/docker-${DOCKER_CLI_VERSION}.tgz" \
       -o /tmp/docker.tgz \
    && tar -xzf /tmp/docker.tgz -C /tmp \
    && install -m 0755 /tmp/docker/docker /usr/local/bin/docker \
    && rm -rf /tmp/docker /tmp/docker.tgz
 WORKDIR /app
 # Install Python deps first (layer cache). Optional extras (PyMuPDF AGPL, etc.)
@@ -28,6 +28,14 @@ services:
      # land under /app/.local for the odysseus user. Persist them so a
      # container recreate does not silently remove installed serve engines.
      - ${APP_DATA_DIR:-./data}/local:/app/.local:z
      # Docker socket — lets Cookbook launch commands like
      # `docker exec ollama-rocm ollama show <tag>` reach the host's
      # Docker daemon (and sibling containers like ollama-rocm /
      # ollama-test). The in-container user must be in the socket's
      # owning group — see `group_add` below. The GID there must match
      # the host's `docker` group. The default used here is 963
      # (Debian); Ubuntu typically uses 999 — set the DOCKER_GID
      # environment variable if yours differs.
      - /var/run/docker.sock:/var/run/docker.sock
    extra_hosts:
      # Lets the container reach local services on the Docker host, including
      # Ollama at http://host.docker.internal:11434.
@@ -93,6 +101,7 @@ services:
      - /dev/kfd
      - /dev/dri
    group_add:
      - "${DOCKER_GID:-963}"
      - video
      - ${RENDER_GID:-render}
@@ -27,6 +27,16 @@ services:
      # land under /app/.local for the odysseus user. Persist them so a
      # container recreate does not silently remove installed serve engines.
      - ${APP_DATA_DIR:-./data}/local:/app/.local:z
      # Docker socket — lets Cookbook launch commands like
      # `docker exec ollama-rocm ollama show <tag>` reach the host's
      # Docker daemon (and sibling containers like ollama-rocm /
      # ollama-test). The in-container user must be in the socket's
      # owning group — see `group_add` below. The GID there must match
      # the host's `docker` group. The default used here is 963
      # (Debian); Ubuntu typically uses 999 — set the DOCKER_GID
      # environment variable if yours differs.
      - /var/run/docker.sock:/var/run/docker.sock
    group_add:
      - "${DOCKER_GID:-963}"
    extra_hosts:
      # Lets the container reach local services on the Docker host, including
      # Ollama at http://host.docker.internal:11434.
@@ -16,6 +16,16 @@ services:
      # land under /app/.local for the odysseus user. Persist them so a
      # container recreate does not silently remove installed serve engines.
      - ${APP_DATA_DIR:-./data}/local:/app/.local:z
      # Docker socket — lets Cookbook launch commands like
      # `docker exec ollama-rocm ollama show <tag>` reach the host's
      # Docker daemon (and sibling containers like ollama-rocm /
      # ollama-test). The in-container user must be in the socket's
      # owning group — see `group_add` below. The GID there must match
      # the host's `docker` group. The default used here is 963
      # (Debian); Ubuntu typically uses 999 — set the DOCKER_GID
      # environment variable if yours differs.
      - /var/run/docker.sock:/var/run/docker.sock
    group_add:
      - "${DOCKER_GID:-963}"
    extra_hosts:
      # Lets the container reach local services on the Docker host, including
      # Ollama at http://host.docker.internal:11434.
@@ -26,6 +26,27 @@ if ! getent passwd "$PUID" >/dev/null 2>&1; then
    useradd -u "$PUID" -g "$PGID" -M -s /bin/sh -d /app odysseus
 fi
 ODY_USER="$(getent passwd "$PUID" | cut -d: -f1)"
 [ -z "$ODY_USER" ] && ODY_USER=odysseus
 # Docker-socket group plumbing. When /var/run/docker.sock is bind-mounted
 # (Cookbook uses docker exec to reach sibling containers), the socket is
 # owned by root:<host docker gid>. Add the app user to that group and later
 # call gosu by username so supplementary groups are retained.
 DOCKER_SOCK="${DOCKER_SOCK:-/var/run/docker.sock}"
 if [ -S "$DOCKER_SOCK" ]; then
    SOCK_GID="$(stat -c '%g' "$DOCKER_SOCK" 2>/dev/null || echo '')"
    if [ -n "$SOCK_GID" ] && [ "$SOCK_GID" != "0" ]; then
        if ! getent group "$SOCK_GID" >/dev/null 2>&1; then
            groupadd -g "$SOCK_GID" docker_host || true
        fi
        SOCK_GROUP="$(getent group "$SOCK_GID" | cut -d: -f1)"
        if [ -n "$SOCK_GROUP" ]; then
            usermod -aG "$SOCK_GROUP" "$ODY_USER" 2>/dev/null || true
        fi
    fi
 fi
 mount_root_for() {
    awk -v target="$1" '$5 == target { print $4; exit }' /proc/self/mountinfo 2>/dev/null || true
 }
@@ -103,6 +124,7 @@ for cu in \
        break
    fi
 done
 # Disable the FlashInfer JIT sampler unconditionally — it is sampler-only
 # and has no impact on the attention path, but requires nvcc + matching
 # CUDA headers at startup. Without this, vLLM crashes with "Could not find
@@ -116,9 +138,9 @@ export PATH="/app/.local/bin:$PATH"
 # Run first-time setup as the app user so data/ files get the right ownership.
 # setup.py is idempotent — skips auth.json / .env if they already exist.
 # || true so a setup failure never prevents the container from starting.
-"$GOSU_BIN" "$PUID:$PGID" "$PYTHON_BIN" /app/setup.py || true
+"$GOSU_BIN" "$ODY_USER" "$PYTHON_BIN" /app/setup.py || true
 # Drop root and run the actual app. `gosu` is preferred over `su` /
 # `sudo` because it cleans up the process tree (no extra shell layer)
 # so signals (SIGTERM from `docker stop`) reach uvicorn directly.
-exec "$GOSU_BIN" "$PUID:$PGID" "$@"
+exec "$GOSU_BIN" "$ODY_USER" "$@"
@@ -22,6 +22,31 @@ from fastapi import HTTPException
 logger = logging.getLogger(__name__)
 _CASUAL_OPENING_RE = re.compile(
    r"^\s*(?:h+i+|hey+|hello+|yo+|sup+|what'?s up|wass?up|hiya|howdy|"
    r"lol|lmao|haha+|hehe+|thanks?|thank you|ty|idk|dunno|meh|bruh|bro)\b(?P<tail>.*)$",
    re.IGNORECASE,
 )
 _CASUAL_BLOCKLIST_RE = re.compile(
    r"\b(?:cookbook|serve|serving|launch|start|vllm|sglang|llama\.?cpp|ollama|"
    r"download|model|email|document|doc|note|calendar|task|search|web|research|"
    r"file|folder|repo|git|settings?|endpoint|api|token|mcp)\b",
    re.IGNORECASE,
 )
 def _is_casual_low_signal(text: str) -> bool:
    """Short greetings/slang should not pull memory, skills, RAG, or docs."""
    s = str(text or "").strip()
    m = _CASUAL_OPENING_RE.match(s)
    if not m:
        return False
    tail = m.group("tail") or ""
    if _CASUAL_BLOCKLIST_RE.search(tail):
        return False
    tail_words = re.findall(r"[A-Za-z0-9_'-]+", tail)
    return len(tail_words) <= 2
 # Strong references to in-flight fire-and-forget tasks scheduled from this
 # module. asyncio only keeps weak references to tasks created via
@@ -588,6 +613,7 @@ async def build_chat_context(
    # bearer-token chat requests use the token owner instead of the "api" sentinel.
    user = effective_user(request)
    uprefs = load_prefs_for_user(user)
    casual_low_signal = _is_casual_low_signal(message)
    # Memory enabled?
    mem_enabled = not incognito and not no_memory and uprefs.get("memory_enabled", True)
@@ -597,6 +623,9 @@ async def build_chat_context(
    if not allow_tool_preprocessing:
        mem_enabled = False
        skills_enabled = False
    if casual_low_signal:
        mem_enabled = False
        skills_enabled = False
    logger.debug(
        "Memory enabled=%s for user=%s (incognito=%s, no_memory=%s, pref=%s)",
        mem_enabled, user, incognito, no_memory, uprefs.get("memory_enabled", "NOT_SET"),
@@ -612,11 +641,11 @@ async def build_chat_context(
    # Use RAG?
    use_rag_val = (str(use_rag).lower() != "false") if use_rag is not None else True
-    if incognito or not allow_tool_preprocessing or is_research_spinoff:
+    if incognito or not allow_tool_preprocessing or is_research_spinoff or casual_low_signal:
        use_rag_val = False
    # If pre-fetched search context was provided (compare mode), skip live web search
-    skip_web = bool(search_context) or not allow_tool_preprocessing
+    skip_web = bool(search_context) or not allow_tool_preprocessing or casual_low_signal
    # Build context preface
    # The stream path uses enhanced_message (with CoT/preprocessing applied),
@@ -635,7 +664,7 @@ async def build_chat_context(
        incognito=incognito,
        use_skills=skills_enabled,
    )
-    if use_rag is not None or is_research_spinoff:
+    if use_rag is not None or is_research_spinoff or casual_low_signal:
        _preface_kwargs["use_rag"] = use_rag_val
    preface, rag_sources, web_sources = chat_processor.build_context_preface(**_preface_kwargs)
@@ -643,7 +672,7 @@ async def build_chat_context(
    used_memories = getattr(chat_processor, '_last_used_memories', [])
    # Inject pre-fetched search context (compare mode)
-    if search_context and allow_tool_preprocessing:
+    if search_context and allow_tool_preprocessing and not casual_low_signal:
        preface.append(untrusted_context_message("prefetched search context", search_context))
    # YouTube transcripts
@@ -829,7 +829,11 @@ def setup_chat_routes(
        from src.settings import get_setting
        _global_disabled = get_setting("disabled_tools", [])
        if _global_disabled and isinstance(_global_disabled, list):
-            disabled_tools.update(_global_disabled)
+            explicit_web_allowed = allow_web_search is not None and str(allow_web_search).lower() == "true"
            if explicit_web_allowed:
                disabled_tools.update(t for t in _global_disabled if t not in {"web_search", "web_fetch"})
            else:
                disabled_tools.update(_global_disabled)
        # Light auto-escalation: the user is in chat mode and just expressed a
        # notes/calendar/email intent. Grant the relevant managers but withhold
@@ -1259,6 +1263,10 @@ def setup_chat_routes(
                        _max_rounds = _DEFAULT_ROUNDS
                    _max_rounds = max(1, min(_max_rounds, 200))
                    _forced_tools = None
                    if allow_web_search is not None and str(allow_web_search).lower() == "true":
                        _forced_tools = {"web_search", "web_fetch"}
                    async for chunk in stream_agent_loop(
                        sess.endpoint_url,
                        sess.model,
@@ -1280,6 +1288,7 @@ def setup_chat_routes(
                        plan_mode=plan_mode,
                        approved_plan=approved_plan or None,
                        workspace=workspace or None,
                        forced_tools=_forced_tools,
                    ):
                        if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
                            try:
@@ -786,25 +786,149 @@ def _append_llama_cpp_linux_accel_build_lines(runner_lines: list[str]) -> None:
    to hard-wire CUDA on Linux. That made ROCm hosts attempt a CUDA configure and
    fail with "CUDA Toolkit not found" instead of building with HIP.
    """
    # Try a prebuilt binary from llama.cpp's GitHub releases FIRST — no
    # cmake/build-essential/git/CUDA-headers needed at all. The from-source
    # build below stays as a fallback (custom flags, esoteric arch, no
    # internet, etc). 30 seconds vs 5+ minutes of compile, and removes
    # every OS-package dep from the launch path. Sets _odysseus_have_prebuilt=1
    # on success; the existing build-tier if/elif chain below is gated on
    # that variable so we never compile twice or shadow the prebuilt symlink.
    runner_lines.append('    _odysseus_have_prebuilt=""')
    runner_lines.append('    _odysseus_arch="$(uname -m)"')
    runner_lines.append('    _odysseus_prebuilt_url=""')
    runner_lines.append('    if command -v curl >/dev/null 2>&1 && [ "$_odysseus_arch" = "x86_64" ]; then')
    runner_lines.append('      _odysseus_pat=""')
    runner_lines.append('      _odysseus_has_nv_inline() { command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L 2>/dev/null | grep -q "GPU "; }')
    runner_lines.append('      _odysseus_has_vk_inline() { ldconfig -p 2>/dev/null | grep -q "libvulkan\\.so" || command -v vulkaninfo >/dev/null 2>&1 || [ -e /usr/lib/x86_64-linux-gnu/libvulkan.so.1 ]; }')
    runner_lines.append('      _odysseus_has_vkdev_inline() { ls /dev/dri/renderD* >/dev/null 2>&1 || (lspci 2>/dev/null | grep -Ei \'VGA|3D|Display\' | grep -Eiq \'AMD|ATI|Radeon\'); }')
    runner_lines.append('      if _odysseus_has_nv_inline; then')
    runner_lines.append('        _odysseus_pat="ubuntu.*cuda"')
    runner_lines.append('      elif _odysseus_has_vkdev_inline && _odysseus_has_vk_inline; then')
    runner_lines.append('        _odysseus_pat="ubuntu.*vulkan"')
    runner_lines.append('      else')
    runner_lines.append('        _odysseus_pat="ubuntu-x64\\\\.zip"')
    runner_lines.append('      fi')
    runner_lines.append('      _odysseus_prebuilt_url="$(curl -fsSL --max-time 15 https://api.github.com/repos/ggml-org/llama.cpp/releases/latest 2>/dev/null | grep \'"browser_download_url"\' | cut -d\'"\' -f4 | grep -iE "$_odysseus_pat" | grep -iv "arm\\|aarch64" | head -1)"')
    runner_lines.append('    fi')
    # Accept any of unzip / bsdtar / python3 -m zipfile as the extractor.
    # python3 is essentially always present on modern Linux, so this lets
    # the prebuilt path work on minimal Ubuntu installs that lack `unzip`.
    runner_lines.append('    if [ -n "$_odysseus_prebuilt_url" ] && (command -v unzip >/dev/null 2>&1 || command -v bsdtar >/dev/null 2>&1 || command -v python3 >/dev/null 2>&1); then')
    runner_lines.append('      echo "[odysseus] Found prebuilt llama-server: $_odysseus_prebuilt_url"')
    runner_lines.append('      mkdir -p ~/bin "$HOME/.cache/odysseus/llama-cpp-prebuilt" && cd "$HOME/.cache/odysseus/llama-cpp-prebuilt"')
    runner_lines.append('      rm -f llama-cpp.zip')
    runner_lines.append('      if curl -fsSL --max-time 120 "$_odysseus_prebuilt_url" -o llama-cpp.zip && [ -s llama-cpp.zip ]; then')
    runner_lines.append('        rm -rf build && mkdir -p build')
    runner_lines.append('        if command -v unzip >/dev/null 2>&1; then unzip -qq -o llama-cpp.zip -d build; elif command -v bsdtar >/dev/null 2>&1; then bsdtar -xf llama-cpp.zip -C build; else python3 -c "import zipfile; zipfile.ZipFile(\\"llama-cpp.zip\\").extractall(\\"build\\")"; fi')
    runner_lines.append('        _odysseus_extracted="$(find build -type f -name llama-server 2>/dev/null | head -1)"')
    runner_lines.append('        if [ -n "$_odysseus_extracted" ]; then')
    runner_lines.append('          chmod +x "$_odysseus_extracted"')
    runner_lines.append('          ln -sf "$_odysseus_extracted" ~/bin/llama-server')
    runner_lines.append('          _odysseus_libdir="$(dirname "$_odysseus_extracted")"')
    runner_lines.append('          mkdir -p ~/.config && echo "export LD_LIBRARY_PATH=\\"$_odysseus_libdir:\\${LD_LIBRARY_PATH:-}\\"" > ~/.config/odysseus-llama-cpp-env')
    runner_lines.append('          _odysseus_have_prebuilt=1')
    runner_lines.append('          echo "[odysseus] Prebuilt llama-server installed at $_odysseus_extracted"')
    runner_lines.append('        fi')
    runner_lines.append('      fi')
    runner_lines.append('      [ -z "$_odysseus_have_prebuilt" ] && echo "[odysseus] Prebuilt download/extract failed — falling back to from-source build."')
    runner_lines.append('    elif [ -z "$_odysseus_prebuilt_url" ]; then')
    runner_lines.append('      echo "[odysseus] No matching prebuilt llama-server for this host (arch=$_odysseus_arch) — will build from source."')
    runner_lines.append('    fi')
    runner_lines.append('  if [ -z "$_odysseus_have_prebuilt" ]; then')
    # Detect pip-installed nvcc (from vLLM/nvidia CUDA wheels) and put it on PATH
-    # so cmake's CUDA configure can find it. We keep this after the ROCm/HIP
+    # so cmake's CUDA configure can find it — BUT only when actual NVIDIA
-    # check — a machine with both stacks should honor the native HIP toolchain on
+    # hardware is present. On AMD/Intel hosts the pip nvcc is a misleading
-    # AMD hosts instead of accidentally preferring a stray nvcc wheel.
+    # leftover (no libcudart, no GPU it could target) and would otherwise
-    runner_lines.append('    for _cudir in ~/.local/lib/python*/site-packages/nvidia/cu13 ~/.local/lib/python*/site-packages/nvidia/cu12 ~/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do')
+    # send the build down the CUDA branch and fail with "CUDA Toolkit not
-    runner_lines.append('      [ -x "$_cudir/bin/nvcc" ] && export CUDA_HOME="$_cudir" && export PATH="$_cudir/bin:$PATH" && break')
+    # found" instead of trying Vulkan.
-    runner_lines.append('    done')
+    runner_lines.append('    _odysseus_has_nvidia_hw() {')
    runner_lines.append('      command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L 2>/dev/null | grep -q "GPU " && return 0')
    runner_lines.append('      ls /dev/nvidia* >/dev/null 2>&1 && return 0')
    runner_lines.append('      lspci 2>/dev/null | grep -iE \'VGA|3D|Display\' | grep -iq nvidia && return 0')
    runner_lines.append('      return 1')
    runner_lines.append('    }')
    runner_lines.append('    if _odysseus_has_nvidia_hw; then')
    runner_lines.append('      for _cudir in ~/.local/lib/python*/site-packages/nvidia/cu13 ~/.local/lib/python*/site-packages/nvidia/cu12 ~/.local/lib/python*/site-packages/nvidia/cuda_nvcc; do')
    runner_lines.append('        [ -x "$_cudir/bin/nvcc" ] && export CUDA_HOME="$_cudir" && export PATH="$_cudir/bin:$PATH" && break')
    runner_lines.append('      done')
    runner_lines.append('    fi')
    # rm -rf build so a prior poisoned CMakeCache.txt (e.g. from a failed CUDA
    # or HIP attempt) doesn't cause the next configure to reuse stale settings.
    runner_lines.append('    mkdir -p ~/bin')
-    runner_lines.append('    cd ~/llama.cpp && rm -rf build')
+    # Try to install cmake / build-essential / git automatically before the
    # build, but ONLY via passwordless sudo (`sudo -n`) — interactive sudo
    # would hang a tmux-backgrounded serve task waiting for a password. If
    # sudo asks for a password the install is skipped silently and the
    # diagnosis pattern (cookbook_routes.py / cookbook_helpers.py) surfaces
    # an explicit "install cmake" suggestion in the Cookbook diagnosis
    # toolbar after the inevitable build failure.
    runner_lines.append('    _odysseus_apt_bootstrap() {')
    runner_lines.append('      local _missing=""')
    runner_lines.append('      command -v cmake >/dev/null 2>&1 || _missing="$_missing cmake"')
    runner_lines.append('      command -v g++ >/dev/null 2>&1 || command -v gcc >/dev/null 2>&1 || _missing="$_missing build-essential"')
    runner_lines.append('      command -v git >/dev/null 2>&1 || _missing="$_missing git"')
    runner_lines.append('      [ -z "$_missing" ] && return 0')
    runner_lines.append('      if command -v apt-get >/dev/null 2>&1 && sudo -n true 2>/dev/null; then')
    runner_lines.append('        echo "[odysseus] Auto-installing missing build deps via apt:$_missing"')
    runner_lines.append('        sudo -n env DEBIAN_FRONTEND=noninteractive apt-get update -qq 2>&1 | tail -3')
    runner_lines.append('        sudo -n env DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends $_missing 2>&1 | tail -5 || true')
    runner_lines.append('      elif command -v pacman >/dev/null 2>&1 && sudo -n true 2>/dev/null; then')
    runner_lines.append('        echo "[odysseus] Auto-installing missing build deps via pacman:$_missing"')
    runner_lines.append('        local _pacpkgs="$(echo "$_missing" | sed -e \'s/build-essential/base-devel/g\')"')
    runner_lines.append('        sudo -n pacman -Sy --needed --noconfirm $_pacpkgs 2>&1 | tail -5 || true')
    runner_lines.append('      elif command -v dnf >/dev/null 2>&1 && sudo -n true 2>/dev/null; then')
    runner_lines.append('        echo "[odysseus] Auto-installing missing build deps via dnf:$_missing"')
    runner_lines.append('        local _dnfpkgs="$(echo "$_missing" | sed -e \'s/build-essential/gcc gcc-c++ make/g\')"')
    runner_lines.append('        sudo -n dnf install -y $_dnfpkgs 2>&1 | tail -5 || true')
    runner_lines.append('      else')
    runner_lines.append('        echo "[odysseus] WARNING: missing build deps ($_missing) — passwordless sudo is unavailable, cannot auto-install. Cookbook Diagnosis will explain the fix after the build fails."')
    runner_lines.append('      fi')
    runner_lines.append('    }')
    runner_lines.append('    _odysseus_apt_bootstrap')
    runner_lines.append('    _odysseus_missing_build_deps=""')
    runner_lines.append('    command -v cmake >/dev/null 2>&1 || _odysseus_missing_build_deps="$_odysseus_missing_build_deps cmake"')
    runner_lines.append('    command -v git >/dev/null 2>&1 || _odysseus_missing_build_deps="$_odysseus_missing_build_deps git"')
    runner_lines.append('    command -v g++ >/dev/null 2>&1 || command -v gcc >/dev/null 2>&1 || _odysseus_missing_build_deps="$_odysseus_missing_build_deps build-essential"')
    runner_lines.append('    if [ -n "$_odysseus_missing_build_deps" ]; then')
    runner_lines.append('      echo "ERROR: llama.cpp source build needs missing packages:$_odysseus_missing_build_deps"')
    runner_lines.append('      if command -v apt-get >/dev/null 2>&1; then')
    runner_lines.append('        echo "Install on this host: sudo apt-get update && sudo apt-get install -y cmake build-essential git"')
    runner_lines.append('      elif command -v pacman >/dev/null 2>&1; then')
    runner_lines.append('        echo "Install on this host: sudo pacman -Sy --needed cmake base-devel git"')
    runner_lines.append('      elif command -v dnf >/dev/null 2>&1; then')
    runner_lines.append('        echo "Install on this host: sudo dnf install -y cmake gcc gcc-c++ make git"')
    runner_lines.append('      fi')
    runner_lines.append('      echo "Alternative: install a native llama-server on PATH, then relaunch."')
    runner_lines.append('      ODYSSEUS_PREFLIGHT_EXIT=127')
    runner_lines.append('    fi')
    runner_lines.append('    cd ~/llama.cpp')
    runner_lines.append('    _odysseus_has_vulkan() {')
    runner_lines.append('      ldconfig -p 2>/dev/null | grep -q \'libvulkan\\.so\' && return 0')
    runner_lines.append('      [ -e /usr/lib/libvulkan.so.1 ] && return 0')
    runner_lines.append('      [ -e /usr/lib/x86_64-linux-gnu/libvulkan.so.1 ] && return 0')
    runner_lines.append('      command -v vulkaninfo >/dev/null 2>&1 && return 0')
    runner_lines.append('      return 1')
    runner_lines.append('    }')
    runner_lines.append('    _odysseus_has_vulkan_device() {')
    runner_lines.append('      ls /dev/dri/renderD* >/dev/null 2>&1 && return 0')
    runner_lines.append('      lspci 2>/dev/null | grep -Ei \'VGA|3D|Display\' | grep -Eiq \'AMD|ATI|Radeon\' && return 0')
    runner_lines.append('      return 1')
    runner_lines.append('    }')
    # Backend preference: native ROCm/HIP > native CUDA > Vulkan > CPU.
    # Vulkan is a portable fallback that works on AMD when ROCm isn't
    # installed (e.g. Strix Halo) and on any vendor's discrete GPU, but
    # it's ~30-40% slower than native HIP/CUDA for LLM inference — only
    # pick it when no native toolchain is present.
    runner_lines.append('    if command -v hipconfig &>/dev/null || [ -d /opt/rocm ] || [ -n "$ROCM_PATH" ] || [ -n "$HIP_PATH" ]; then')
    runner_lines.append('      rm -rf build')
    runner_lines.append('      if command -v hipconfig &>/dev/null; then')
    runner_lines.append('        export HIPCXX="${HIPCXX:-$(hipconfig -l)/clang}"')
    runner_lines.append('        export HIP_PATH="${HIP_PATH:-$(hipconfig -R)}"')
    runner_lines.append('      fi')
    runner_lines.append('      echo "[odysseus] ROCm/HIP detected — building llama-server with HIP support..."')
    runner_lines.append('      cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_HIP=ON && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
-    runner_lines.append('    elif command -v nvcc &>/dev/null; then')
+    runner_lines.append('    elif command -v nvcc &>/dev/null && _odysseus_has_nvidia_hw; then')
    runner_lines.append('      rm -rf build')
    # nvcc alone is not sufficient — pip-installed CUDA wheels or incomplete
    # tooling can expose nvcc without shipping libcudart, causing cmake to fail
    # mid-build with "CUDA runtime library not found". Check cudart explicitly
@@ -828,31 +952,50 @@ def _append_llama_cpp_linux_accel_build_lines(runner_lines: list[str]) -> None:
    runner_lines.append('        echo "[odysseus]   Ensure libcudart is installed (e.g. cuda-runtime package) and visible via ldconfig or CUDA_HOME."')
    runner_lines.append('        cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
    runner_lines.append('      fi')
    runner_lines.append('    elif _odysseus_has_vulkan_device && _odysseus_has_vulkan; then')
    runner_lines.append('      echo "[odysseus] Vulkan-capable GPU detected (no ROCm/CUDA toolchain installed) — building llama-server with Vulkan support..."')
    runner_lines.append('      rm -rf build-vulkan')
    runner_lines.append('      cmake -B build-vulkan -DCMAKE_BUILD_TYPE=Release -DGGML_VULKAN=ON && cmake --build build-vulkan -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build-vulkan/bin/llama-server ~/bin/llama-server')
    runner_lines.append('    else')
-    runner_lines.append('      echo "[odysseus] WARNING: no HIP/CUDA toolchain found — building llama-server for CPU only."')
+    runner_lines.append('      echo "[odysseus] WARNING: no HIP/CUDA/Vulkan toolchain found — building llama-server for CPU only."')
    runner_lines.append('      echo "[odysseus]   GPU inference will not be available for this llama.cpp build."')
-    runner_lines.append('      echo "[odysseus]   Install ROCm for AMD GPUs or vLLM/CUDA tooling for NVIDIA, then re-launch this serve task."')
+    runner_lines.append('      echo "[odysseus]   Install Vulkan (libvulkan-dev) / ROCm for AMD GPUs or CUDA tooling for NVIDIA, then re-launch this serve task."')
    runner_lines.append('      rm -rf build')
    runner_lines.append('      cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j"$NPROC" --target llama-server && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
    runner_lines.append('    fi')
    runner_lines.append('  fi  # end _odysseus_have_prebuilt guard')
-def _llama_cpp_rebuild_cmd() -> str:
+def _llama_cpp_rebuild_cmd(update_source: bool = False) -> str:
    """Shell command that clears the Cookbook-managed llama.cpp build.
-    Removes the cached ``llama-server`` symlink and the ``~/llama.cpp/build``
+    Removes the cached ``llama-server`` symlink and the ``~/llama.cpp/build*``
    directory so the next llama.cpp serve recompiles from source, picking up a
    CUDA or HIP toolchain if one is now available. The serve bootstrap only
    builds when ``llama-server`` is missing from PATH, so without this an
-    existing CPU-only build is reused forever. It deliberately installs and
+    existing CPU-only build is reused forever. When ``update_source`` is true,
-    downloads nothing; the rebuild itself happens on the next serve.
+    the command also fast-forwards the Cookbook-managed ``~/llama.cpp`` checkout
    if it exists. The rebuild itself happens on the next serve.
    """
    update_cmd = ''
    if update_source:
        update_cmd = (
            'if [ -d "$HOME/llama.cpp/.git" ]; then '
            'git -C "$HOME/llama.cpp" pull --ff-only --depth 1 || '
            'echo "[odysseus] WARNING: llama.cpp source update failed; clearing cached build anyway."; '
            'elif command -v git >/dev/null 2>&1; then '
            'git clone --depth 1 https://github.com/ggml-org/llama.cpp "$HOME/llama.cpp" || '
            'echo "[odysseus] WARNING: llama.cpp clone failed; clearing cached build anyway."; '
            'fi && '
        )
    return (
        'mkdir -p "$HOME/bin" && '
        f'{update_cmd}'
        'rm -f "$HOME/bin/llama-server" && '
-        'rm -rf "$HOME/llama.cpp/build" && '
+        'rm -rf "$HOME/llama.cpp/build" "$HOME/llama.cpp/build-vulkan" && '
        'echo "[odysseus] Cleared the cached llama.cpp build. '
        'Re-launch the serve task to rebuild llama-server from source '
-        '(CUDA or HIP will be used if a toolchain is now available)."'
+        '(Vulkan, HIP, or CUDA will be used if a matching toolchain is now available)."'
    )
@@ -1115,8 +1258,27 @@ def _diagnose_serve_output(text: str) -> dict | None:
            "SGLang is not installed or not in PATH on this server.",
            [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}],
        ),
        # System build deps come BEFORE the generic llama.cpp catch-all so
        # cmake / build-essential / git missing → a specific OS-package
        # remediation instead of "install llama-cpp-python[server]" (which
        # itself fails to compile when cmake is absent).
        (
-            r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found",
+            r"cmake: command not found|cmake.*not found.*[Cc]ould not",
            "cmake is required to build llama.cpp from source but isn't installed on this server.",
            [{"label": "install build deps for llama.cpp (apt: cmake build-essential git / pacman: cmake base-devel git / dnf: cmake gcc-c++ make git / brew: cmake git)", "op": "dependency", "package": "llama-cpp-python[server]"}],
        ),
        (
            r"^(make|g\+\+|gcc): command not found|Could not find C\+\+ compiler",
            "A C/C++ compiler (build-essential) is required to build llama.cpp from source.",
            [{"label": "install build deps for llama.cpp on this server", "op": "dependency", "package": "llama-cpp-python[server]"}],
        ),
        (
            r"^git: command not found",
            "git is required to clone the llama.cpp source tree.",
            [{"label": "install build deps for llama.cpp on this server", "op": "dependency", "package": "llama-cpp-python[server]"}],
        ),
        (
            r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'",
            "llama.cpp / llama-cpp-python dependencies are missing.",
            [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}],
        ),
@@ -189,8 +189,27 @@ def setup_cookbook_routes() -> APIRouter:
                "SGLang is not installed or not in PATH on this server.",
                [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}],
            ),
            # System build deps come BEFORE the generic llama.cpp catch-all
            # so cmake / build-essential / git missing → a specific OS-package
            # remediation instead of "install llama-cpp-python[server]" (which
            # itself fails to compile when cmake is absent).
            (
-                r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found",
+                r"cmake: command not found|cmake.*not found.*[Cc]ould not",
                "cmake is required to build llama.cpp from source but isn't installed on this server.",
                [{"label": "install build deps for llama.cpp (apt: cmake build-essential git / pacman: cmake base-devel git / dnf: cmake gcc-c++ make git / brew: cmake git)", "op": "dependency", "package": "llama-cpp-python[server]"}],
            ),
            (
                r"^(make|g\+\+|gcc): command not found|Could not find C\+\+ compiler",
                "A C/C++ compiler (build-essential) is required to build llama.cpp from source.",
                [{"label": "install build deps for llama.cpp on this server", "op": "dependency", "package": "llama-cpp-python[server]"}],
            ),
            (
                r"^git: command not found",
                "git is required to clone the llama.cpp source tree.",
                [{"label": "install build deps for llama.cpp on this server", "op": "dependency", "package": "llama-cpp-python[server]"}],
            ),
            (
                r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'",
                "llama.cpp / llama-cpp-python dependencies are missing.",
                [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}],
            ),
@@ -254,6 +273,79 @@ def setup_cookbook_routes() -> APIRouter:
    def _load_stored_hf_token() -> str:
        """Return whatever ``load_stored_hf_token`` reads for the cookbook state path."""
        stored_token = load_stored_hf_token(state_path=_cookbook_state_path)
        return stored_token
    def _normalize_minimax_m3_vllm_cmd(cmd: str) -> str:
        """Patch MiniMax M3 vLLM launches into the known-good local form.
        The browser form can be stale or omit advanced-only fields. MiniMax M3
        is sensitive to several flags: using the HF repo id with block-size 128
        fails KV-cache setup, and FlashInfer sampler JIT fails on this host's
        system nvcc. Normalize server-side before writing the tmux runner.
        """
        cmd_lower = (cmd or "").lower()
        if not cmd or "vllm serve" not in cmd_lower or "minimax" not in cmd_lower or "m3" not in cmd_lower:
            return cmd
        try:
            parts = shlex.split(cmd)
        except ValueError:
            return cmd
        if "serve" not in parts:
            return cmd
        env_re = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*=")
        env_parts = [p for p in parts if env_re.match(p)]
        body = [p for p in parts if not env_re.match(p)]
        try:
            serve_i = body.index("serve")
        except ValueError:
            return cmd
        if serve_i + 1 >= len(body):
            return cmd
        repo_id = "cyankiwi/MiniMax-M3-AWQ-INT4"
        snapshot = (
            "/home/pewds/.cache/huggingface/hub/"
            "models--cyankiwi--MiniMax-M3-AWQ-INT4/"
            "snapshots/4082acbbec1236d21828d55b6bb0fe02ade4ab5b"
        )
        if body[serve_i + 1] == repo_id:
            body[serve_i + 1] = snapshot
        def add_env(key: str, value: str) -> None:
            if not any(p.startswith(f"{key}=") for p in env_parts):
                env_parts.append(f"{key}={value}")
        def has_flag(flag: str) -> bool:
            return any(p == flag or p.startswith(flag + "=") for p in body)
        def set_flag(flag: str, value: str) -> None:
            for i, part in enumerate(body):
                if part == flag:
                    if i + 1 < len(body):
                        body[i + 1] = value
                    else:
                        body.append(value)
                    return
                if part.startswith(flag + "="):
                    body[i] = f"{flag}={value}"
                    return
            body.extend([flag, value])
        def add_bool(flag: str) -> None:
            if not has_flag(flag):
                body.append(flag)
        add_env("VLLM_TARGET_DEVICE", "cuda")
        add_env("VLLM_USE_FLASHINFER_SAMPLER", "0")
        set_flag("--served-model-name", repo_id)
        set_flag("--tool-call-parser", "minimax_m3")
        set_flag("--reasoning-parser", "minimax_m3")
        set_flag("--attention-backend", "TRITON_ATTN")
        set_flag("--block-size", "128")
        add_bool("--language-model-only")
        add_bool("--disable-custom-all-reduce")
        add_bool("--enable-expert-parallel")
        return shlex.join(env_parts + body)
    def _cookbook_ssh_dir() -> Path:
        # The Docker image keeps cookbook keys under /app/.ssh; that path only
        # exists inside the container. On Windows (and any non-container host)
@@ -1230,6 +1322,7 @@ def setup_cookbook_routes() -> APIRouter:
        # `TypeError: argument of type 'NoneType'` (a 500 instead of a clean 400).
        req.cmd = _validate_serve_cmd(req.cmd) or ""
        req.cmd = _normalize_llama_cpp_python_cache_types(req.cmd) or ""
        req.cmd = _normalize_minimax_m3_vllm_cmd(req.cmd)
        req.cmd = _venv_safe_local_pip_install_cmd(
            req.cmd,
            local=not bool(req.remote_host),
@@ -1243,8 +1336,16 @@ def setup_cookbook_routes() -> APIRouter:
            req.cmd = _pip_install_no_cache(req.cmd)
            # Accept common aliases and enforce server extras for llama-cpp so
            # `python -m llama_cpp.server` has all runtime dependencies.
-            req.cmd = re.sub(r"(?<![A-Za-z0-9_.-])llama_cpp(?![A-Za-z0-9_.-])", "llama-cpp-python[server]", req.cmd)
+            # CRITICAL: the lookbehind / lookahead must also exclude `/` so
-            req.cmd = re.sub(r"(?<![A-Za-z0-9_.-])llama-cpp-python(?!\[)", "llama-cpp-python[server]", req.cmd)
+            # the regex DOESN'T mangle a URL path like
            #   https://abetlen.github.io/llama-cpp-python/whl/cu124
            # The previous regex turned that URL into
            #   https://abetlen.github.io/llama-cpp-python[server]/whl/cu124
            # which pip then couldn't resolve → silent fallback to source
            # build of the .tar.gz → CPU-only binary (because CMAKE_ARGS
            # isn't set), defeating the entire purpose of the CUDA index.
            req.cmd = re.sub(r"(?<![A-Za-z0-9_.\-/])llama_cpp(?![A-Za-z0-9_.\-/])", "llama-cpp-python[server]", req.cmd)
            req.cmd = re.sub(r"(?<![A-Za-z0-9_.\-/])llama-cpp-python(?![\[/])", "llama-cpp-python[server]", req.cmd)
            if "llama-cpp-python" in req.cmd and "--extra-index-url" not in req.cmd:
                req.cmd += " --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu"
            # PEP-508-style package spec — letters, digits, `.-_` for the
@@ -1431,6 +1532,69 @@ def setup_cookbook_routes() -> APIRouter:
                runner_lines.append('  else')
                _append_llama_cpp_linux_accel_build_lines(runner_lines)
                runner_lines.append('  fi')
                # Source the env file the prebuilt-download path writes so
                # LD_LIBRARY_PATH includes the directory holding libllama.so
                # and friends. No-op when prebuilt wasn't used.
                runner_lines.append('  [ -r ~/.config/odysseus-llama-cpp-env ] && . ~/.config/odysseus-llama-cpp-env')
                # Auto-upgrade pip llama-cpp-python to the CUDA-enabled
                # wheel when (a) NVIDIA hardware is present and (b) the
                # currently-installed wheel is CPU-only. Without this the
                # user gets the Python server happily running at 3 tok/s
                # because pip's default index ships CPU-only wheels.
                # Forward-compat: cu124 wheels work on driver/runtime
                # 12.4+ including the cu13.x line.
                runner_lines.append('  if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L 2>/dev/null | grep -q "GPU " && python3 -c "import llama_cpp" 2>/dev/null; then')
                runner_lines.append('    if ! python3 -c "import llama_cpp; import sys; sys.exit(0 if llama_cpp.llama_supports_gpu_offload() else 1)" 2>/dev/null; then')
                runner_lines.append('      echo "[odysseus] NVIDIA detected but installed llama-cpp-python is CPU-only — reinstalling with CUDA wheel index for GPU offload..."')
                runner_lines.append('      python3 -m pip install --user --break-system-packages --force-reinstall --no-cache-dir "llama-cpp-python[server]" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124 2>&1 | tail -8 || echo "[odysseus] WARNING: CUDA wheel reinstall failed — Python server will stay CPU-only (slow). Manual fix: pip install --user --force-reinstall \'llama-cpp-python[server]\' --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124"')
                runner_lines.append('      if python3 -c "import llama_cpp; import sys; sys.exit(0 if llama_cpp.llama_supports_gpu_offload() else 1)" 2>/dev/null; then')
                runner_lines.append('        echo "[odysseus] llama-cpp-python now supports GPU offload."')
                runner_lines.append('      fi')
                runner_lines.append('    fi')
                runner_lines.append('  fi')
                # SHORT-CIRCUIT before the build/pip fallback: if the
                # native binary is missing but llama_cpp Python is already
                # installed, drop a wrapper at ~/bin/llama-server that
                # translates llama-server CLI args to llama_cpp.server's
                # underscore-style flags. The user's serve command stays
                # `llama-server ...` and "just works" — no build, no cmake,
                # no second install. This is the path that unblocks every
                # remote where pip-installed llama-cpp-python is already
                # working but Cookbook used to insist on a native binary.
                runner_lines.append('  if ! command -v llama-server >/dev/null 2>&1 && python3 -c "import llama_cpp" 2>/dev/null; then')
                runner_lines.append('    mkdir -p ~/bin')
                runner_lines.append('    cat > ~/bin/llama-server <<\'_ODY_LLAMA_SHIM_EOF\'')
                runner_lines.append('#!/usr/bin/env bash')
                runner_lines.append('# Auto-generated by Odysseus Cookbook: a `llama-server` lookalike')
                runner_lines.append('# that translates the native CLI to `python -m llama_cpp.server`.')
                runner_lines.append('# Lets cookbook-generated launch commands run unchanged on hosts')
                runner_lines.append('# where only the pip llama-cpp-python package is installed.')
                runner_lines.append('ARGS=()')
                runner_lines.append('while [ $# -gt 0 ]; do')
                runner_lines.append('  case "$1" in')
                runner_lines.append('    -ngl|--gpu-layers|--n-gpu-layers) ARGS+=(--n_gpu_layers "$2"); shift 2 ;;')
                runner_lines.append('    -c|--ctx-size) ARGS+=(--n_ctx "$2"); shift 2 ;;')
                runner_lines.append('    -b|--batch-size) ARGS+=(--n_batch "$2"); shift 2 ;;')
                runner_lines.append('    -ub|--ubatch-size) shift 2 ;;  # llama-cpp-python has no separate ubatch')
                runner_lines.append('    --flash-attn) ARGS+=(--flash_attn true); shift 2 ;;')
                runner_lines.append('    --cache-type-k) ARGS+=(--type_k "$2"); shift 2 ;;')
                runner_lines.append('    --cache-type-v) ARGS+=(--type_v "$2"); shift 2 ;;')
                runner_lines.append('    --n-cpu-moe) ARGS+=(--n_cpu_moe "$2"); shift 2 ;;')
                runner_lines.append('    --mmproj) ARGS+=(--clip_model_path "$2"); shift 2 ;;')
                runner_lines.append('    --image-max-tokens) shift 2 ;;  # native-only')
                runner_lines.append('    --no-mmap) ARGS+=(--no_mmap true); shift ;;')
                runner_lines.append('    --no-warmup) shift ;;  # native-only')
                runner_lines.append('    --chat-template) ARGS+=(--chat_format "$2"); shift 2 ;;')
                runner_lines.append('    --fit|--split-mode|--tensor-split|--main-gpu|--parallel) shift 2 ;;  # native-only')
                runner_lines.append('    --mlock) ARGS+=(--use_mlock true); shift ;;')
                runner_lines.append('    *) ARGS+=("$1"); shift ;;')
                runner_lines.append('  esac')
                runner_lines.append('done')
                runner_lines.append('exec python3 -m llama_cpp.server "${ARGS[@]}"')
                runner_lines.append('_ODY_LLAMA_SHIM_EOF')
                runner_lines.append('    chmod +x ~/bin/llama-server')
                runner_lines.append('    echo "[odysseus] Created llama-server shim → python -m llama_cpp.server (no native binary needed)"')
                runner_lines.append('  fi')
                runner_lines.append('  # If the native build failed, fall back to the Python bindings.')
                runner_lines.append('  if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
                runner_lines.append('    echo "llama-server build failed — installing Python bindings as fallback..."')
@@ -1494,6 +1658,96 @@ def setup_cookbook_routes() -> APIRouter:
                runner_lines.append('  echo "ERROR: vLLM is not installed."')
                runner_lines.append('  ODYSSEUS_PREFLIGHT_EXIT=127')
                runner_lines.append('fi')
                runner_lines.append(f"ODYSSEUS_SERVE_CMD='{_bash_squote(req.cmd)}'")
                runner_lines.append('if [ -z "$ODYSSEUS_PREFLIGHT_EXIT" ]; then')
                runner_lines.append('  ODYSSEUS_VLLM_HELP_CMD="$(python3 - "$ODYSSEUS_SERVE_CMD" <<\'PY\'')
                runner_lines.append('import shlex, sys')
                runner_lines.append('parts = shlex.split(sys.argv[1])')
                runner_lines.append('try:')
                runner_lines.append('    serve_i = parts.index("serve")')
                runner_lines.append('except ValueError:')
                runner_lines.append('    print("vllm serve --help")')
                runner_lines.append('else:')
                runner_lines.append('    print(shlex.join(parts[:serve_i + 1] + ["--help"]))')
                runner_lines.append('PY')
                runner_lines.append(')"')
                runner_lines.append('  ODYSSEUS_VLLM_SUPPORTS_SWAP=0')
                runner_lines.append('  if eval "$ODYSSEUS_VLLM_HELP_CMD" 2>&1 | grep -q -- "--swap-space"; then ODYSSEUS_VLLM_SUPPORTS_SWAP=1; fi')
                runner_lines.append('fi')
                runner_lines.append('if [ -z "$ODYSSEUS_PREFLIGHT_EXIT" ] && [ "${ODYSSEUS_VLLM_SUPPORTS_SWAP:-0}" = "1" ] && ! printf "%s" "$ODYSSEUS_SERVE_CMD" | grep -q -- "--swap-space"; then')
                runner_lines.append('  echo "[odysseus] Setting vLLM --swap-space 0 so the runtime does not reserve CPU swap per GPU."')
                runner_lines.append('  ODYSSEUS_SERVE_CMD="${ODYSSEUS_SERVE_CMD} --swap-space 0"')
                runner_lines.append('fi')
                runner_lines.append('if [ -z "$ODYSSEUS_PREFLIGHT_EXIT" ] && [ "${ODYSSEUS_VLLM_SUPPORTS_SWAP:-0}" != "1" ]; then')
                runner_lines.append('  if printf "%s" "$ODYSSEUS_SERVE_CMD" | grep -q -- "--swap-space"; then')
                runner_lines.append('    echo "[odysseus] vLLM serve does not expose --swap-space; removing the flag and patching the runtime default to 0."')
                runner_lines.append('    ODYSSEUS_SERVE_CMD="$(python3 - "$ODYSSEUS_SERVE_CMD" <<\'PY\'')
                runner_lines.append('import shlex, sys')
                runner_lines.append('parts = shlex.split(sys.argv[1])')
                runner_lines.append('out = []')
                runner_lines.append('skip = False')
                runner_lines.append('for part in parts:')
                runner_lines.append('    if skip:')
                runner_lines.append('        skip = False')
                runner_lines.append('        continue')
                runner_lines.append('    if part == "--swap-space":')
                runner_lines.append('        skip = True')
                runner_lines.append('        continue')
                runner_lines.append('    if part.startswith("--swap-space="):')
                runner_lines.append('        continue')
                runner_lines.append('    out.append(part)')
                runner_lines.append('print(shlex.join(out))')
                runner_lines.append('PY')
                runner_lines.append(')"')
                runner_lines.append('  fi')
                runner_lines.append('  ODYSSEUS_SERVE_CMD="$(python3 - "$ODYSSEUS_SERVE_CMD" <<\'PY\'')
                runner_lines.append('import shlex, sys')
                runner_lines.append('parts = shlex.split(sys.argv[1])')
                runner_lines.append('patch = r"""import inspect, sys')
                runner_lines.append('from vllm.engine.arg_utils import EngineArgs, AsyncEngineArgs')
                runner_lines.append('def _odysseus_swap0(cls):')
                runner_lines.append('    params = list(inspect.signature(cls).parameters)')
                runner_lines.append('    if "swap_space" not in params:')
                runner_lines.append('        return')
                runner_lines.append('    idx = params.index("swap_space")')
                runner_lines.append('    defaults = list(cls.__init__.__defaults__ or ())')
                runner_lines.append('    if idx < len(defaults):')
                runner_lines.append('        defaults[idx] = 0')
                runner_lines.append('        cls.__init__.__defaults__ = tuple(defaults)')
                runner_lines.append('    fields = getattr(cls, "__dataclass_fields__", {})')
                runner_lines.append('    if "swap_space" in fields:')
                runner_lines.append('        fields["swap_space"].default = 0')
                runner_lines.append('_odysseus_swap0(EngineArgs)')
                runner_lines.append('_odysseus_swap0(AsyncEngineArgs)')
                runner_lines.append('try:')
                runner_lines.append('    from vllm.config import CacheConfig')
                runner_lines.append('    CacheConfig.swap_space = 0')
                runner_lines.append('except Exception:')
                runner_lines.append('    pass')
                runner_lines.append('_orig_create_engine_config = EngineArgs.create_engine_config')
                runner_lines.append('def _odysseus_create_engine_config(self, *args, **kwargs):')
                runner_lines.append('    self.swap_space = 0')
                runner_lines.append('    return _orig_create_engine_config(self, *args, **kwargs)')
                runner_lines.append('EngineArgs.create_engine_config = _odysseus_create_engine_config')
                runner_lines.append('AsyncEngineArgs.create_engine_config = _odysseus_create_engine_config')
                runner_lines.append('from vllm.entrypoints.cli.main import main')
                runner_lines.append('sys.exit(main())"""')
                runner_lines.append('try:')
                runner_lines.append('    serve_i = parts.index("serve")')
                runner_lines.append('except ValueError:')
                runner_lines.append('    print(shlex.join(parts))')
                runner_lines.append('else:')
                runner_lines.append('    exe_i = serve_i - 1')
                runner_lines.append('    exe = parts[exe_i] if exe_i >= 0 else "vllm"')
                runner_lines.append('    py = "python3"')
                runner_lines.append('    if exe.endswith("/bin/vllm"):')
                runner_lines.append('        py = exe[:-len("/bin/vllm")] + "/bin/python"')
                runner_lines.append('    parts[exe_i:serve_i] = [py, "-c", patch]')
                runner_lines.append('    print(shlex.join(parts))')
                runner_lines.append('PY')
                runner_lines.append(')"')
                runner_lines.append('  echo "[odysseus] Patched vLLM internal swap_space default to 0 for this runtime."')
                runner_lines.append('fi')
            elif "sglang.launch_server" in req.cmd:
                runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
                runner_lines.append('if ! command -v sglang &>/dev/null; then')
@@ -1535,7 +1789,10 @@ def setup_cookbook_routes() -> APIRouter:
                    runner_lines,
                    keep_shell_open=not local_windows,
                )
-                runner_lines.append(req.cmd)
+                if "vllm serve" in req.cmd:
                    runner_lines.append('eval "$ODYSSEUS_SERVE_CMD"')
                else:
                    runner_lines.append(req.cmd)
                if local_windows:
                    # Detached background process — no interactive shell to keep open.
                    # Print the exit marker the status poller looks for, then stop.
@@ -1839,6 +2096,25 @@ def setup_cookbook_routes() -> APIRouter:
        out, err = await _run_gpu_shell("ls -1 /sys/class/drm 2>/dev/null", host, ssh_port, timeout=4)
        if err is not None or not out:
            return []
        # Pick the runtime label up-front so each GPU dict gets the
        # right `backend`. AMD silicon can be driven by ROCm/HIP (native)
        # OR Vulkan (mesa RADV). Reporting "rocm" on a host where no
        # ROCm toolchain is installed misleads the frontend env-var
        # prefix logic — it would emit `HIP_VISIBLE_DEVICES=` for a
        # Vulkan-only stack, which is a silent no-op at best.
        rt_out, _ = await _run_gpu_shell(
            'command -v rocminfo >/dev/null 2>&1 && echo rocm '
            '|| (command -v hipconfig >/dev/null 2>&1 && echo rocm) '
            '|| (command -v vulkaninfo >/dev/null 2>&1 && echo vulkan) '
            '|| echo unknown',
            host, ssh_port, timeout=4,
        )
        _amd_runtime = (rt_out or "").strip().splitlines()[-1:][0].strip() if rt_out else "rocm"
        if _amd_runtime not in ("rocm", "vulkan"):
            # Default to rocm so existing ROCm-installed hosts keep
            # working; "unknown" only happens when neither toolchain is
            # detected (e.g. minimal sysfs read on a fresh box).
            _amd_runtime = "rocm"
        gpus = []
        for entry in out.split():
            if not entry.startswith("card") or "-" in entry:
@@ -1882,7 +2158,7 @@ def setup_cookbook_routes() -> APIRouter:
                "free_mb": free_mb, "total_mb": total_mb, "used_mb": used_mb,
                "gtt_used_mb": gtt_used_mb,
                "util_pct": 0, "busy": bool(total_mb and (free_mb / total_mb) < 0.85),
-                "processes": [], "backend": "rocm", "source": "amd-sysfs",
+                "processes": [], "backend": _amd_runtime, "source": "amd-sysfs",
                "unified_memory": unified,
            })
        if gpus:
@@ -2023,10 +2299,15 @@ def setup_cookbook_routes() -> APIRouter:
        amd_gpus = await _probe_amd_sysfs(host, ssh_port)
        if amd_gpus:
            # The per-GPU dict already carries the runtime label picked by
            # _probe_amd_sysfs (rocm vs vulkan); mirror that into the
            # wrapper so the frontend can read `data.backend` directly
            # without scanning the list.
            _amd_wrap_backend = str(amd_gpus[0].get("backend") or "rocm")
            return {
                "ok": True,
                "gpus": amd_gpus,
-                "backend": "rocm",
+                "backend": _amd_wrap_backend,
                "source": "amd-sysfs",
                "fallback_from": "nvidia-smi",
                "nvidia_error": nvidia_error,
@@ -2166,6 +2447,17 @@ def setup_cookbook_routes() -> APIRouter:
            disk_tasks = on_disk.get("tasks") or [] if isinstance(on_disk, dict) else []
            incoming_tasks = data.get("tasks") if isinstance(data.get("tasks"), list) else []
            incoming_removed = data.get("removedTasks") if isinstance(data.get("removedTasks"), dict) else {}
            disk_removed = on_disk.get("removedTasks") if isinstance(on_disk, dict) and isinstance(on_disk.get("removedTasks"), dict) else {}
            removed_tasks = {**disk_removed, **incoming_removed}
            data["removedTasks"] = removed_tasks
            removed_ids = set(removed_tasks.keys())
            if removed_ids:
                incoming_tasks = [
                    t for t in incoming_tasks
                    if not (isinstance(t, dict) and t.get("sessionId") in removed_ids)
                ]
                data["tasks"] = incoming_tasks
            # Anti-poisoning guard: a stale browser tab can keep POSTing a
            # download task as status='done' from before the strict-finish
            # fix landed, undoing any server-side correction. For each
@@ -2203,6 +2495,8 @@ def setup_cookbook_routes() -> APIRouter:
                sid = t.get("sessionId")
                if not sid or sid in incoming_ids:
                    continue  # client's version wins
                if sid in removed_ids:
                    continue  # intentional cross-device clear/remove
                ts = t.get("ts") or 0
                if isinstance(ts, (int, float)) and (now_ms - ts) <= RACE_WINDOW_MS:
                    preserved.append(t)
@@ -2309,16 +2603,14 @@ def setup_cookbook_routes() -> APIRouter:
            # Add 30% headroom for KV cache, activations, etc.
            needed_vram = (est_vram * 1.3) if est_vram else None
-            if vram_gb > 0 and needed_vram is not None and needed_vram > vram_gb:
+            if vram_gb > 0:
-                continue
+                if needed_vram is None:
-            # Unknown-size models (e.g. MiniMax-M2.7, DeepSeek-V4-Flash) have no
+                    # The "trending models that fit" list must be conservative:
-            # "NB" in the repo id, so the regex above can't extract their
+                    # if we cannot estimate size from the repo id/tags, do not
-            # param count. Previously we dropped them entirely, which made
+                    # present it as runnable on this hardware.
-            # brand-new flagship releases silently vanish from this list even
+                    continue
-            # on rigs with hundreds of GB of VRAM. Adapters/LoRAs are already
+                if needed_vram > vram_gb:
-            # filtered by _is_excluded(), so what falls through here is
+                    continue
            # overwhelmingly full models — keep them, just without a size
            # badge (the frontend handles needed_vram_gb=null gracefully).
            out.append({
                "repo_id": repo_id,
@@ -2515,6 +2807,33 @@ def setup_cookbook_routes() -> APIRouter:
            except Exception as e:
                logger.warning(f"orphan sweep: state write failed: {e}")
    @router.get("/api/cookbook/hf-gguf-files")
    async def hf_gguf_files(repo_id: str, owner: str = Depends(require_user)):
        """List GGUF files in a HuggingFace repo for the direct-download picker.

        Args:
            repo_id: HuggingFace repo id; passed through ``_validate_repo_id``
                before it is interpolated into the API URL.
            owner: Resolved by the ``require_user`` dependency; not otherwise
                used in this handler.

        Returns:
            ``{"ok": True, "repo_id": ..., "files": [...]}`` with every sibling
            filename ending in ``.gguf`` (case-insensitive), or
            ``{"ok": False, "files": [], "error": ...}`` when the HF API
            responds with a non-200 status or the request/parse fails.
        """
        import httpx

        repo_id = _validate_repo_id(repo_id)
        url = f"https://huggingface.co/api/models/{repo_id}"
        try:
            headers = {}
            token = _load_stored_hf_token()
            if token:
                headers["Authorization"] = f"Bearer {token}"
            async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
                resp = await client.get(url, headers=headers)
                if resp.status_code != 200:
                    return {"ok": False, "files": [], "error": f"HF API HTTP {resp.status_code}"}
                data = resp.json()
        except Exception:
            # Route boundary: log the traceback, hand the client a clean error.
            # (Previously this referenced an undefined name, so the handler
            # itself raised NameError and the request turned into a 500.)
            logger.exception("HF GGUF file scan failed for %s", repo_id)
            return {"ok": False, "files": [], "error": "HF API request failed"}

        # Tolerate `"siblings": null`, a non-dict JSON body, and non-dict
        # entries rather than raising after a successful fetch.
        siblings = data.get("siblings") if isinstance(data, dict) else None
        files = []
        for entry in siblings or []:
            if not isinstance(entry, dict):
                continue
            name = str(entry.get("rfilename") or "")
            if name.lower().endswith(".gguf"):
                files.append(name)
        return {"ok": True, "repo_id": repo_id, "files": files}
    # In-memory cache for the Ollama library scrape. ollama.com is a public
    # site, but it doesn't expose a stable JSON listing — we fetch the HTML
    # search page and regex out the model cards. Cached for 1 h so a busy
@@ -1233,22 +1233,30 @@ def _list_attachments_from_msg(msg):
        return attachments
    idx = 0
    for part in msg.walk():
        if part.is_multipart():
            continue
        cd = str(part.get("Content-Disposition", ""))
        ct = part.get_content_type()
        is_attached_email = ct == "message/rfc822" and ("attachment" in cd.lower() or part.get_filename())
        if part.is_multipart() and not is_attached_email:
            continue
        # Skip text/html body parts (only consider real attachments)
        if ct in ("text/plain", "text/html") and "attachment" not in cd:
            continue
        filename = part.get_filename()
        if filename:
            filename = _decode_header(filename)
            if ct == "message/rfc822" and not re.search(r"\.[A-Za-z0-9]{1,8}$", filename):
                filename = f"{filename}.eml"
        else:
            # Inline images, etc. - generate a name
-            ext = ct.split("/")[-1] if "/" in ct else "bin"
+            ext = "eml" if ct == "message/rfc822" else (ct.split("/")[-1] if "/" in ct else "bin")
            filename = f"attachment_{idx}.{ext}"
        payload = part.get_payload(decode=True)
-        size = len(payload) if payload else 0
+        if payload is None and ct == "message/rfc822":
            try:
                payload = part.as_bytes()
            except Exception:
                payload = b""
        size = len(payload) if payload is not None else 0
        attachments.append({
            "index": idx,
            "filename": filename,
@@ -1260,29 +1268,58 @@ def _list_attachments_from_msg(msg):
    return attachments
 def _is_likely_signature_image_attachment(att: dict) -> bool:
    """Match the reader's inline signature/logo image filter."""
    filename = str((att or {}).get("filename") or "").lower()
    if not re.search(r"\.(png|jpe?g|gif|bmp|svg|webp)$", filename):
        return False
    size = int((att or {}).get("size") or 0)
    if re.search(r"^image\d{3,}\.(png|jpe?g|gif)$", filename):
        return True
    if re.search(r"^(signature|logo|sig|footer|banner)[-_\d]*\.(png|jpe?g|gif|svg)$", filename):
        return True
    return 0 < size < 30 * 1024
 def _has_visible_attachments(msg) -> bool:
    """Return True only for attachments the reader will render as chips.

    Attachments classified as signature/logo images are ignored; a message
    with nothing else reports False.
    """
    for candidate in _list_attachments_from_msg(msg):
        if not _is_likely_signature_image_attachment(candidate):
            return True
    return False
 def _extract_attachment_to_disk(msg, index, target_dir):
    """Extract a specific attachment to disk and return the file path."""
    if not msg.is_multipart():
        return None
    idx = 0
    for part in msg.walk():
        if part.is_multipart():
            continue
        cd = str(part.get("Content-Disposition", ""))
        ct = part.get_content_type()
        is_attached_email = ct == "message/rfc822" and ("attachment" in cd.lower() or part.get_filename())
        if part.is_multipart() and not is_attached_email:
            continue
        if ct in ("text/plain", "text/html") and "attachment" not in cd:
            continue
        if idx == index:
            filename = part.get_filename()
            if filename:
                filename = _decode_header(filename)
                if ct == "message/rfc822" and not re.search(r"\.[A-Za-z0-9]{1,8}$", filename):
                    filename = f"{filename}.eml"
            else:
-                ext = ct.split("/")[-1] if "/" in ct else "bin"
+                ext = "eml" if ct == "message/rfc822" else (ct.split("/")[-1] if "/" in ct else "bin")
                filename = f"attachment_{idx}.{ext}"
            # Sanitize
            safe_name = re.sub(r"[^\w\s\-.]", "_", filename).strip()
            payload = part.get_payload(decode=True)
-            if not payload:
+            if payload is None and ct == "message/rfc822":
                try:
                    payload = part.as_bytes()
                except Exception:
                    payload = b""
            if payload is None:
                return None
            target_dir.mkdir(parents=True, exist_ok=True)
            filepath = target_dir / safe_name
@@ -47,7 +47,7 @@ from routes.email_helpers import (
    _IMAP_TIMEOUT_SECONDS, _open_imap_connection,
    make_oauth_state, verify_oauth_state,
    _imap_connect, _imap, _decode_header, _detect_sent_folder, _detect_drafts_folder,
-    _extract_attachment_text, _list_attachments_from_msg,
+    _extract_attachment_text, _list_attachments_from_msg, _has_visible_attachments, _is_likely_signature_image_attachment,
    _extract_attachment_to_disk, _extract_html, _extract_text,
    _fetch_sender_thread_context, _pre_retrieve_context,
    _EMAIL_REPLY_SYS_PROMPT_BASE, _POOL_HOOKS,
@@ -61,6 +61,7 @@ from routes.email_pollers import _start_poller
 logger = logging.getLogger(__name__)
 ODYSSEUS_MAIL_ORIGIN = "odysseus-ui"
 EMAIL_READ_ATTACHMENT_VERSION = 2
 def _email_tag_owner_aliases(account_id: str | None, owner: str = "") -> list[str]:
@@ -248,6 +249,21 @@ def _imap_uid_fetch(conn, uid_set: str | bytes, query: str):
    return conn.uid("FETCH", _uid_bytes(uid_set), query)
 def _imap_search_quote(value: str) -> str:
    return '"' + str(value or "").replace("\\", "\\\\").replace('"', '\\"') + '"'
 def _message_id_chain(*values: str) -> list[str]:
    seen = set()
    out = []
    for value in values:
        for mid in re.findall(r"<[^>]+>", value or ""):
            if mid not in seen:
                seen.add(mid)
                out.append(mid)
    return out
 def _uid_from_fetch_meta(meta_b: bytes) -> str:
    m = re.search(rb"\bUID\s+(\d+)\b", meta_b)
    return m.group(1).decode() if m else ""
@@ -366,6 +382,21 @@ def _apply_odysseus_headers(msg, kind: str | None = None, ref_id: str | None = N
        msg["X-Odysseus-Ref"] = re.sub(r"[^A-Za-z0-9_.:-]", "-", ref_id)[:128]
 def _normalize_addr_field(field: str) -> str:
    """Strip the malformed-but-common trailing/leading commas and stray
    whitespace from a To/Cc/Bcc string before it lands in the MIME header
    or the SMTP envelope. Users often paste a single address with a
    trailing comma (e.g. `felix@pewdiepie.com,`) and most MTAs reject the
    resulting `To: felix@pewdiepie.com,` line as a syntax error. Collapse
    any run of separator junk between addresses too."""
    if not field:
        return field
    # Split on commas, drop empty tokens, rejoin with a single ', '.
    parts = [p.strip() for p in field.split(",")]
    parts = [p for p in parts if p]
    return ", ".join(parts)
 def _envelope_recipients(*fields: str) -> list:
    """Extract bare SMTP envelope addresses from one or more To/Cc/Bcc header
    strings. A naive `field.split(",")` corrupts display names that contain a
@@ -994,6 +1025,65 @@ def setup_email_routes():
                except Exception:
                    pass
    def _related_thread_attachments_sync(
        folder: str,
        account_id: str | None,
        owner: str,
        current_uid: str,
        current_message_id: str,
        in_reply_to: str,
        references: str,
        limit: int = 12,
    ) -> list[dict]:
        """Return visible attachments from referenced messages in this folder.

        The Message-IDs found in ``references`` and ``in_reply_to`` (minus the
        current message's own id) are looked up in ``folder`` via a
        ``HEADER Message-ID`` search. Only the last 10 ids are considered,
        newest first, and at most the last 3 matching UIDs per id are fetched.
        Attachments flagged by ``_is_likely_signature_image_attachment`` are
        skipped; every returned entry is a copy of the attachment dict
        extended with ``source_*`` keys describing the message it came from.

        At most ``limit`` entries are returned, and no further message bodies
        are fetched once the limit is reached. Any exception is logged at
        debug level and whatever was collected so far is returned.
        """
        wanted_ids = _message_id_chain(references, in_reply_to)
        current_mid = (current_message_id or "").strip()
        wanted_ids = [mid for mid in wanted_ids if mid and mid != current_mid]
        if not wanted_ids:
            return []

        related: list[dict] = []
        try:
            with _imap(account_id, owner=owner) as conn:
                conn.select(_q(folder), readonly=True)
                # Search newest referenced messages first; cap work so opening
                # a long thread stays bounded.
                for mid in reversed(wanted_ids[-10:]):
                    if len(related) >= limit:
                        break
                    status, data = _imap_uid_search(conn, f'(HEADER Message-ID {_imap_search_quote(mid)})')
                    if status != "OK" or not data or not data[0]:
                        continue
                    for uid_b in reversed(data[0].split()[-3:]):
                        # Re-check here: the attachment loop's `break` only
                        # leaves that loop, so without this guard the next UID
                        # would still be fetched and push the result past
                        # `limit`.
                        if len(related) >= limit:
                            break
                        source_uid = uid_b.decode(errors="ignore")
                        if not source_uid or source_uid == str(current_uid):
                            continue
                        st2, msg_data = _imap_uid_fetch(conn, source_uid, "(BODY.PEEK[])")
                        if st2 != "OK" or not msg_data or not isinstance(msg_data[0], tuple):
                            continue
                        msg = email_mod.message_from_bytes(msg_data[0][1])
                        source_from = _decode_header(msg.get("From", ""))
                        source_subject = _decode_header(msg.get("Subject", ""))
                        source_date = msg.get("Date", "")
                        for att in _list_attachments_from_msg(msg):
                            if _is_likely_signature_image_attachment(att):
                                continue
                            enriched = dict(att)
                            enriched.update({
                                "source_uid": source_uid,
                                "source_folder": folder,
                                "source_message_id": (msg.get("Message-ID") or "").strip(),
                                "source_from": source_from,
                                "source_subject": source_subject,
                                "source_date": source_date,
                            })
                            related.append(enriched)
                            if len(related) >= limit:
                                break
        except Exception as e:
            # Best-effort lookup: the caller still gets the partial list.
            logger.debug("related thread attachment lookup failed uid=%s: %s", current_uid, e)
        return related
    @router.get("/list")
    async def list_emails(
        folder: str = Query("INBOX"),
@@ -1264,6 +1354,17 @@ def setup_email_routes():
            sender_name, sender_addr = email.utils.parseaddr(sender)
            parsed_date = email.utils.parsedate_to_datetime(date_str) if date_str else None
            attachments = _list_attachments_from_msg(msg)
            related_attachments = []
            if not _has_visible_attachments(msg):
                related_attachments = _related_thread_attachments_sync(
                    folder,
                    account_id,
                    owner,
                    uid,
                    message_id,
                    in_reply_to,
                    references,
                )
            if mark_seen:
                # Set \Seen in a separate readwrite session so concurrent reads
@@ -1372,6 +1473,8 @@ def setup_email_routes():
                "body": body,
                "body_html": body_html,
                "attachments": attachments,
                "related_attachments": related_attachments,
                "attachment_version": EMAIL_READ_ATTACHMENT_VERSION,
                "cached_summary": cached_summary,
                "cached_ai_reply": cached_ai_reply,
                "boundaries": cached_boundaries,
@@ -1402,6 +1505,12 @@ def setup_email_routes():
        """Read email body. Cached for 30m, sync IMAP work runs in a thread."""
        ck = _read_cache_key(account_id, folder, uid, owner=owner)
        cached = _read_cache_get(ck)
        if cached is not None:
            # Older cached read responses lack the thread-attachment fallback.
            # Fetch once so replies that reference prior attachments can show
            # those files without waiting for cache expiry.
            if cached.get("attachment_version") != EMAIL_READ_ATTACHMENT_VERSION:
                cached = None
        if cached is not None:
            if mark_seen:
                try:
@@ -1536,6 +1645,12 @@ def setup_email_routes():
                return {"error": f"Attachment index {index} not found"}
            from pathlib import Path as _Path
            target_root = os.path.abspath(str(target_dir))
            filepath_str = os.path.abspath(str(filepath))
            if os.path.commonpath([target_root, filepath_str]) != target_root:
                logger.warning("Rejected attachment path outside extraction dir: %s", filepath)
                return {"error": "Invalid attachment path"}
            filepath = _Path(filepath_str)
            base = _Path(filepath).name
            if base.startswith("."):
                return {"error": "Invalid filename", "filename": base}
@@ -1590,6 +1705,65 @@ def setup_email_routes():
                    return None
            doc_session_id = _resolve_doc_session()
            def _create_markdown_doc(content: str, summary: str):
                """Store ``content`` as a new active markdown document and
                return its id.

                Marks every currently active ``Document`` row inactive, then
                inserts a ``Document`` (title and session id come from the
                enclosing scope) plus its first ``DocumentVersion`` carrying
                ``summary``, commits, and finally tags the new document via
                ``_tag_doc_with_source``. The session is closed whether or not
                the commit succeeds.
                """
                from src.database import SessionLocal as _SL, Document as _Doc, DocumentVersion as _DV

                new_doc_id, new_version_id = str(uuid.uuid4()), str(uuid.uuid4())
                session = _SL()
                try:
                    # `== True` is a column comparison building the filter
                    # expression (presumably SQLAlchemy), not a Python identity
                    # test — keep it as written.
                    active_docs = session.query(_Doc).filter(_Doc.is_active == True)
                    active_docs.update({"is_active": False})
                    document = _Doc(
                        id=new_doc_id,
                        session_id=doc_session_id,
                        title=title,
                        language="markdown",
                        current_content=content,
                        version_count=1,
                        is_active=True,
                    )
                    session.add(document)
                    first_version = _DV(
                        id=new_version_id,
                        document_id=new_doc_id,
                        version_number=1,
                        content=content,
                        summary=summary,
                        source="upload",
                    )
                    session.add(first_version)
                    session.commit()
                finally:
                    session.close()
                _tag_doc_with_source(new_doc_id)
                return new_doc_id
            def _attached_email_markdown(raw_bytes: bytes):
                """Render an attached email (raw RFC 822 bytes) as a markdown
                summary.

                The output has a title line, optional From/To/Cc/Date lines
                (only for headers that are present), a ``## Body`` section
                with the text from ``_extract_text``, and an
                ``## Attachments`` list when the attached message itself
                carries attachments. Empty input and parse failures produce a
                short placeholder document titled after ``base`` instead of
                raising.
                """
                if not raw_bytes:
                    return f"# Attached email: {base}\n\n_(empty email attachment)_"
                try:
                    parsed = email_mod.message_from_bytes(raw_bytes)
                except Exception:
                    logger.exception("Failed to parse attached email %s", base)
                    return f"# Attached email: {base}\n\nCould not parse this email attachment."

                subject = _decode_header(parsed.get("Subject", "")) or base
                sender = _decode_header(parsed.get("From", ""))
                recipients = _decode_header(parsed.get("To", ""))
                copied = _decode_header(parsed.get("Cc", ""))
                sent_on = parsed.get("Date", "")
                body_text = _extract_text(parsed).strip()
                nested = _list_attachments_from_msg(parsed)

                # (value, rendered line) pairs; a line is emitted only when
                # its header value is non-empty.
                optional_rows = (
                    (sender, f"**From:** {sender}"),
                    (recipients, f"**To:** {recipients}"),
                    (copied, f"**Cc:** {copied}"),
                    (sent_on, f"**Date:** {sent_on}"),
                )
                out = [f"# Attached email: {subject}", ""]
                out.extend(rendered for value, rendered in optional_rows if value)
                out += ["", "## Body", "", body_text or "_(no readable body)_"]

                if nested:
                    out += ["", "## Attachments", ""]
                    for item in nested:
                        byte_count = int(item.get("size") or 0)
                        if byte_count < 1024:
                            size_label = f"{byte_count} B"
                        else:
                            size_label = f"{round(byte_count / 1024)} KB"
                        name = item.get("filename") or f"attachment_{item.get('index', '')}"
                        ctype = item.get("content_type") or "application/octet-stream"
                        out.append(f"- {name} ({ctype}, {size_label})")
                return "\n".join(out).strip()
            # ── PDF path (existing) ────────────────────────────────────
            if ext == ".pdf":
                import shutil as _shutil
@@ -1636,6 +1810,39 @@ def setup_email_routes():
                _tag_doc_with_source(doc_id)
                return {"doc_id": doc_id, "filename": filepath.name}
            # ── Attached email (.eml / message/rfc822) ────────────────
            if ext == ".eml":
                def _attachment_bytes_from_msg():
                    """Return the raw bytes of attachment number ``index`` in
                    ``msg`` (both taken from the enclosing scope), or ``b""``
                    when it cannot be found.

                    Parts are counted in ``msg.walk()`` order. Multipart
                    containers are skipped unless they are an attached
                    ``message/rfc822``; ``text/plain`` / ``text/html`` parts
                    are skipped unless their Content-Disposition contains
                    ``attachment``. For an attached email whose decoded
                    payload is ``None`` the serialized part is used instead.
                    """
                    if not msg.is_multipart():
                        return b""
                    position = 0
                    for node in msg.walk():
                        disposition = str(node.get("Content-Disposition", ""))
                        mime = node.get_content_type()
                        attached_email = mime == "message/rfc822" and (
                            "attachment" in disposition.lower() or node.get_filename()
                        )
                        if node.is_multipart() and not attached_email:
                            continue
                        inline_text = mime in ("text/plain", "text/html") and "attachment" not in disposition
                        if inline_text:
                            continue
                        if position != index:
                            # Not the requested attachment yet; keep counting.
                            position += 1
                            continue
                        data = node.get_payload(decode=True)
                        if data is None and mime == "message/rfc822":
                            try:
                                data = node.as_bytes()
                            except Exception:
                                data = b""
                        return data or b""
                    return b""
                try:
                    content = _attached_email_markdown(_attachment_bytes_from_msg())
                except Exception:
                    logger.exception("Failed to read email attachment %s", base)
                    return {"error": "Failed to read email attachment", "filename": base}
                doc_id = _create_markdown_doc(content, "Imported attached email")
                return {"doc_id": doc_id, "filename": filepath.name}
            # ── DOCX path: extract text → markdown document ───────────
            if ext == ".docx":
                try:
@@ -1673,25 +1880,7 @@ def setup_email_routes():
                    lines.append("")
                content = "\n".join(lines).strip() or f"_(empty {base})_"
-                from src.database import SessionLocal as _SL, Document as _Doc, DocumentVersion as _DV
+                doc_id = _create_markdown_doc(content, "Imported from DOCX")
                doc_id = str(uuid.uuid4())
                ver_id = str(uuid.uuid4())
                _db = _SL()
                try:
                    _db.query(_Doc).filter(_Doc.is_active == True).update({"is_active": False})
                    _db.add(_Doc(
                        id=doc_id, session_id=doc_session_id, title=title,
                        language="markdown", current_content=content,
                        version_count=1, is_active=True,
                    ))
                    _db.add(_DV(
                        id=ver_id, document_id=doc_id, version_number=1,
                        content=content, summary="Imported from DOCX", source="upload",
                    ))
                    _db.commit()
                finally:
                    _db.close()
                _tag_doc_with_source(doc_id)
                return {"doc_id": doc_id, "filename": filepath.name}
            # ── Plain text / markdown ────────────────────────────────
@@ -1700,25 +1889,7 @@ def setup_email_routes():
                    content = filepath.read_text(encoding="utf-8", errors="replace")
                except Exception as e:
                    return {"error": f"Failed to read text file: {e}", "filename": base}
-                from src.database import SessionLocal as _SL, Document as _Doc, DocumentVersion as _DV
+                doc_id = _create_markdown_doc(content, "Imported from email attachment")
                doc_id = str(uuid.uuid4())
                ver_id = str(uuid.uuid4())
                _db = _SL()
                try:
                    _db.query(_Doc).filter(_Doc.is_active == True).update({"is_active": False})
                    _db.add(_Doc(
                        id=doc_id, session_id=doc_session_id, title=title,
                        language="markdown", current_content=content,
                        version_count=1, is_active=True,
                    ))
                    _db.add(_DV(
                        id=ver_id, document_id=doc_id, version_number=1,
                        content=content, summary="Imported from email attachment", source="upload",
                    ))
                    _db.commit()
                finally:
                    _db.close()
                _tag_doc_with_source(doc_id)
                return {"doc_id": doc_id, "filename": filepath.name}
            return {"error": f"Unsupported attachment type: {ext}", "filename": base}
@@ -2027,6 +2198,9 @@ def setup_email_routes():
            outer = MIMEMultipart("alternative")
            body_container = outer
        to = _normalize_addr_field(to or "")
        cc = _normalize_addr_field(cc or "")
        bcc = _normalize_addr_field(bcc or "")
        outer["From"] = email.utils.formataddr((cfg.get("display_name") or "", cfg["from_address"]))
        outer["To"] = to
        if cc:
@@ -2302,6 +2476,9 @@ def setup_email_routes():
            outer = MIMEMultipart("alternative")
            body_container = outer
        req.to = _normalize_addr_field(req.to or "")
        req.cc = _normalize_addr_field(req.cc or "")
        req.bcc = _normalize_addr_field(req.bcc or "")
        outer["From"] = email.utils.formataddr((cfg.get("display_name") or "", cfg["from_address"]))
        outer["To"] = req.to
        if req.cc:
@@ -1,8 +1,13 @@
 import json
 import os
 import re
 import shlex
 import subprocess
 from copy import deepcopy
 from fastapi import APIRouter, HTTPException
 from core.platform_compat import run_ssh_command
 from routes._validators import validate_remote_host, validate_ssh_port
@@ -107,6 +112,73 @@ def _apply_manual_hardware(system, manual_mode="", manual_gpu_count="", manual_v
    return system
 def _run_model_probe(host: str, ssh_port: str, cmd: str) -> str:
    try:
        if host:
            r = run_ssh_command(
                host,
                ssh_port or None,
                cmd,
                timeout=15,
                connect_timeout=5,
                strict_host_key_checking=False,
                text=True,
            )
        else:
            r = subprocess.run(["bash", "-lc", cmd], capture_output=True, text=True, timeout=15)
        if r.returncode == 0:
            return (r.stdout or "").strip()
    except Exception:
        return ""
    return ""
 def _inspect_model_path(model_path: str, host: str = "", ssh_port: str = "") -> dict:
    """Read lightweight metadata from a local or SSH-visible HF model folder."""
    path = (model_path or "").strip()
    if not path or path.startswith(("http://", "https://")):
        return {}
    if not (path.startswith("/") or path.startswith("~")):
        return {}
    qpath = shlex.quote(path)
    qconfig = shlex.quote(os.path.join(path, "config.json"))
    out = {}
    exists = _run_model_probe(host, ssh_port, f"test -d {qpath} && printf found || printf missing")
    if exists != "found":
        target = host or "local container"
        out["model_probe_error"] = f"Model path is not visible on {target}: {path}"
        return out
    raw_config = _run_model_probe(host, ssh_port, f"test -f {qconfig} && sed -n '1,240p' {qconfig}")
    if raw_config:
        try:
            cfg = json.loads(raw_config)
        except Exception:
            cfg = {}
        for key in ("context_length", "max_position_embeddings", "n_ctx_train", "model_max_length", "max_seq_len"):
            value = cfg.get(key)
            if isinstance(value, (int, float)) and value > 0:
                out["model_ctx_max"] = int(value)
                break
    else:
        out["model_probe_error"] = f"config.json not found in model path: {path}"
    size_cmd = (
        f"find {qpath} -type f \\( -name '*.safetensors' -o -name '*.bin' -o -name '*.gguf' \\) "
        "-printf '%s\\n' 2>/dev/null | awk '{s+=$1} END {if (s>0) printf \"%.6f\", s/1073741824}'"
    )
    weights = _run_model_probe(host, ssh_port, size_cmd)
    try:
        weights_gb = float(weights)
    except Exception:
        weights_gb = 0.0
    if weights_gb > 0:
        out["model_weights_gb"] = round(weights_gb, 3)
    elif "model_probe_error" not in out:
        out["model_probe_error"] = f"No model weight files found in: {path}"
    return out
 def setup_hwfit_routes():
    router = APIRouter(prefix="/api/hwfit", tags=["hwfit"])
@@ -235,7 +307,7 @@ def setup_hwfit_routes():
        return {"system": system, "models": results}
    @router.get("/profiles")
-    def get_serve_profiles(model: str = "", host: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False, serve_weights_gb: float = 0.0, serve_quant: str = ""):
+    def get_serve_profiles(model: str = "", model_path: str = "", host: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False, serve_weights_gb: float = 0.0, serve_quant: str = ""):
        """Compute llama.cpp serve profiles (Quality/Balanced/Speed) for `model`
        against the detected hardware on `host` (or local). Returns concrete
        flags (n_gpu_layers, n_cpu_moe, cache_type, ctx) the serve UI can apply.
@@ -260,8 +332,23 @@ def setup_hwfit_routes():
            # "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct".
            s = (s or "").lower().strip()
            s = s.split("/")[-1]                     # drop org prefix
-            s = re.sub(r"[-_.]?gguf$", "", s)        # drop trailing gguf marker
+            for suffix in ("-gguf", "_gguf", ".gguf", "gguf"):
-            s = re.sub(r"[-_.](q\d[^/]*|iq\d[^/]*|fp8|bf16|f16|awq[^/]*|gptq[^/]*)$", "", s)
+                if s.endswith(suffix):
                    s = s[: -len(suffix)]
                    break
            cut_at = None
            for idx, ch in enumerate(s):
                if ch not in "-_." or idx + 1 >= len(s):
                    continue
                suffix = s[idx + 1:]
                if (
                    suffix in {"fp8", "bf16", "f16"}
                    or suffix.startswith(("awq", "gptq", "iq"))
                    or (suffix.startswith("q") and len(suffix) > 1 and suffix[1].isdigit())
                ):
                    cut_at = idx
            if cut_at is not None:
                s = s[:cut_at]
            return s
        m = catalog.get(model)
@@ -272,8 +359,16 @@ def setup_hwfit_routes():
                if nn and (nn == want or want.endswith(nn) or nn.endswith(want)):
                    m = entry
                    break
        path_meta = _inspect_model_path(model_path or model, host=host, ssh_port=ssh_port)
        if m is None:
-            return {"system": system, "profiles": [], "error": "model not in catalog"}
+            return {
                "system": system,
                "profiles": [],
                "error": "model not in catalog",
                "model_ctx_max": int(path_meta.get("model_ctx_max") or 0),
                "model_weights_gb": float(path_meta.get("model_weights_gb") or 0),
                "model_probe_error": path_meta.get("model_probe_error") or "",
            }
        # Surface the model's trained context limit so the serve UI can clamp a
        # user-typed context down to it (asking for ctx > n_ctx_train overflows
        # and, with a quantized KV cache, can crash the GPU).
@@ -283,6 +378,16 @@ def setup_hwfit_routes():
            if isinstance(v, (int, float)) and v > 0:
                model_ctx_max = int(v)
                break
        path_ctx_max = int(path_meta.get("model_ctx_max") or 0)
        if path_ctx_max > 0:
            model_ctx_max = max(model_ctx_max, path_ctx_max)
        model_weights_gb = float(path_meta.get("model_weights_gb") or 0)
        if model_weights_gb <= 0:
            for k in ("min_vram_gb", "required_gb", "size_gb", "recommended_ram_gb", "min_ram_gb"):
                v = m.get(k)
                if isinstance(v, (int, float)) and v > 0:
                    model_weights_gb = float(v)
                    break
        return {
            "system": system,
            "profiles": compute_serve_profiles(
@@ -291,6 +396,8 @@ def setup_hwfit_routes():
                serve_quant=(serve_quant or None),
            ),
            "model_ctx_max": model_ctx_max,
            "model_weights_gb": model_weights_gb,
            "model_probe_error": path_meta.get("model_probe_error") or "",
        }
    @router.get("/image-models")
@@ -406,8 +406,11 @@ def _endpoint_refresh_timeout(ep: Any, category: str) -> float:
    except Exception:
        val = 0
    if val > 0:
-        return float(max(1, min(30, val)))
+        return float(max(1, min(60, val)))
-    return 2.5 if category == "local" else 2.0
+    # llama.cpp and other local OpenAI-compatible servers can block briefly
    # while warming/loading. A 2s local timeout makes working endpoints flicker
    # offline before /v1/models is ready.
    return 10.0 if category == "local" else 2.0
 def _manual_refresh_timeout(ep: Any, category: str, requested: Any = None) -> float:
@@ -474,7 +477,7 @@ def _explicit_model_list_timeout(base_url: str, endpoint_kind: str = "auto", req
    category = _classify_endpoint(base_url, kind)
    if kind in ("api", "proxy") or category == "api":
        return 30.0
-    return 3.0 if _is_ollama_base(base_url) else 2.0
+    return 15.0 if category == "local" else (3.0 if _is_ollama_base(base_url) else 2.0)
 def _cached_model_ids(ep: Any) -> List[str]:
@@ -579,6 +582,18 @@ def _safe_build_headers(api_key: Optional[str], base_url: str) -> dict:
        return {"Authorization": f"Bearer {api_key}"} if api_key else {}
 def _redact_url_for_log(url: str) -> str:
    """Return a URL safe for logs by removing userinfo and query/fragment."""
    try:
        parsed = urlparse(url or "")
        host = parsed.hostname or ""
        if parsed.port:
            host = f"{host}:{parsed.port}"
        return urlunparse((parsed.scheme, host, parsed.path, "", "", ""))
    except Exception:
        return "<endpoint>"
 def _is_discovery_only_provider(provider: str) -> bool:
    return provider == "chatgpt-subscription"
@@ -711,6 +726,16 @@ def _effective_endpoint_kind(ep: Any, base_url: str) -> str:
    return "auto"
 def _is_loading_model_response(resp: Any) -> bool:
    if getattr(resp, "status_code", None) != 503:
        return False
    try:
        body = resp.text or ""
    except Exception:
        body = ""
    return "loading model" in body.lower()
 def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> List[str]:
    """Probe a base URL's /models endpoint and return list of model IDs.
@@ -775,11 +800,14 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
                        models.append(_e)
            return [m for m in models if _is_chat_model(m)]
    except httpx.HTTPStatusError as e:
        if e.response is not None and _is_loading_model_response(e.response):
            logger.info("Endpoint still loading model at %s", _redact_url_for_log(url))
            return []
        if api_key:
            status = e.response.status_code if e.response is not None else "unknown"
-            logger.warning(f"Failed to probe {url} with API key: HTTP {status}")
+            logger.warning("Failed to probe %s with API key: HTTP %s", _redact_url_for_log(url), status)
            return []
-        logger.warning(f"Failed to probe {url}: {e}")
+        logger.warning("Failed to probe %s: %s", _redact_url_for_log(url), e)
    except Exception as e:
        if api_key:
            logger.warning(f"Failed to probe {url} with API key: {e}")
@@ -824,6 +852,15 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) ->
        or "ollama" in (parsed_base.hostname or "").lower()
    )
    def _is_loading_model_response(r) -> bool:
        if getattr(r, "status_code", None) != 503:
            return False
        try:
            body = r.text or ""
        except Exception:
            body = ""
        return "loading model" in body.lower()
    def _result_from_response(r) -> Dict[str, Any]:
        if 300 <= r.status_code < 400:
            loc = r.headers.get("location", "")
@@ -840,6 +877,13 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) ->
                "status_code": r.status_code,
                "error": None,
            }
        if _is_loading_model_response(r):
            return {
                "reachable": True,
                "loading": True,
                "status_code": r.status_code,
                "error": "Loading model",
            }
        return {"reachable": False, "status_code": r.status_code, "error": f"HTTP {r.status_code}"}
    last_error: Optional[str] = None
@@ -872,7 +916,7 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) ->
        if 400 <= sc < 500 and sc not in (401, 403):
            models_url = _safe_build_models_url(base)
            try:
-                r2 = httpx.get(models_url, headers=headers, timeout=timeout, verify=llm_verify())
+                r2 = httpx.get(models_url, headers=headers,timeout=timeout, verify=llm_verify())
                result2 = _result_from_response(r2)
                if result2["reachable"]:
                    return result2
@@ -1056,9 +1100,11 @@ def setup_model_routes(model_discovery):
        except Exception:
            return 0.0
-    def _failure_delay(fails: int) -> float:
+    def _failure_delay(fails: int, *, empty_local: bool = False) -> float:
        if fails <= 0:
            return 0.0
        if empty_local:
            return min(5.0 * (2 ** max(0, fails - 1)), 30.0)
        return min(_REFRESH_FAILURE_BASE * (2 ** max(0, fails - 1)), _REFRESH_FAILURE_MAX)
    def _should_refresh_endpoint(ep: Any, now: float, force: bool = False) -> tuple[bool, Dict[str, Any]]:
@@ -1089,7 +1135,12 @@ def setup_model_routes(model_discovery):
        fails = int(state.get("fail_count") or 0)
        if fails and not force:
            last_failure = float(state.get("last_failure") or 0.0)
-            if now - last_failure < _failure_delay(fails):
+            empty_local = (
                not cached
                and category == "local"
                and str(getattr(ep, "id", "") or "").startswith("local-")
            )
            if now - last_failure < _failure_delay(fails, empty_local=empty_local):
                return False, info
        if cached and not force:
            interval = _endpoint_refresh_interval(ep, category)
@@ -1404,7 +1455,7 @@ def setup_model_routes(model_discovery):
                t0 = _time.time()
                ping = _ping_endpoint(base, ep.api_key, timeout=1.5)
                entry["latency_ms"] = round((_time.time() - t0) * 1000)
-                entry["status"] = "online" if ping.get("reachable") or cached_count else "offline"
+                entry["status"] = "loading" if ping.get("loading") else ("online" if ping.get("reachable") or cached_count else "offline")
                entry["error"] = ping.get("error")
                entry["model_count"] = cached_count or (len(ANTHROPIC_MODELS) if provider == "anthropic" else 0)
            except Exception as e:
@@ -1578,9 +1629,37 @@ def setup_model_routes(model_discovery):
                # "everything's already cached" path because this branch only
                # runs for endpoints with an empty cached_models.
                if not all_models and not pinned and r.is_enabled:
-                    ping = _ping_endpoint(r.base_url, r.api_key, timeout=3.5)
+                    base_for_ping = _normalize_base(r.base_url)
                    kind_for_ping = _effective_endpoint_kind(r, base_for_ping)
                    ping_timeout = 10.0 if _classify_endpoint(base_for_ping, kind_for_ping) == "local" else 3.5
                    ping = _ping_endpoint(r.base_url, r.api_key, timeout=ping_timeout)
                    if ping.get("reachable"):
-                        status = "empty"
+                        status = "loading" if ping.get("loading") else "empty"
                        if ping.get("loading"):
                            base = _normalize_base(r.base_url)
                            kind = _effective_endpoint_kind(r, base)
                            results.append({
                                "id": r.id,
                                "name": r.name,
                                "base_url": r.base_url,
                                "has_key": bool(r.api_key),
                                "api_key_fingerprint": _api_key_fingerprint(r.api_key),
                                "is_enabled": r.is_enabled,
                                "models": visible,
                                "pinned_models": pinned,
                                "hidden_count": len(hidden),
                                "online": True,
                                "status": status,
                                "ping_error": (ping or {}).get("error") if ping else None,
                                "model_type": getattr(r, "model_type", None) or "llm",
                                "supports_tools": getattr(r, "supports_tools", None),
                                "endpoint_kind": kind,
                                "category": _classify_endpoint(base, kind),
                                "model_refresh_mode": _endpoint_refresh_mode(r, kind),
                                "model_refresh_interval": getattr(r, "model_refresh_interval", None),
                                "model_refresh_timeout": getattr(r, "model_refresh_timeout", None),
                            })
                            continue
                        # Best-effort: if the probe came back reachable, try
                        # to populate cached_models in the background so the
                        # NEXT picker load shows "online" instead of "empty".
@@ -1588,7 +1667,7 @@ def setup_model_routes(model_discovery):
                        # "empty" status, and the existing background refresh
                        # path will eventually fill it in too.
                        try:
-                            probed = _probe_endpoint(r.base_url, r.api_key, timeout=5)
+                            probed = _probe_endpoint(r.base_url, r.api_key, timeout=max(5, int(ping_timeout)))
                            if probed:
                                r.cached_models = json.dumps(probed)
                                db.commit()
@@ -1766,7 +1845,7 @@ def setup_model_routes(model_discovery):
        model_ids = _probe_endpoint(base_url, api_key.strip() or None, timeout=explicit_timeout) if should_probe else []
        ping = {"reachable": False, "error": None}
        if (should_probe or requested_kind in ("api", "proxy")) and not model_ids:
-            ping = _ping_endpoint(base_url, api_key.strip() or None, timeout=min(explicit_timeout, 2.0))
+            ping = _ping_endpoint(base_url, api_key.strip() or None, timeout=min(explicit_timeout, 10.0))
        if require_model_list and not model_ids:
            raise HTTPException(400, _model_endpoint_error_message(base_url, ping))
@@ -1833,7 +1912,7 @@ def setup_model_routes(model_discovery):
            "models": _merge_model_ids(model_ids, _pinned),
            "pinned_models": _pinned,
            "online": bool(model_ids) or bool(_pinned) or bool(ping.get("reachable")),
-            "status": "online" if (model_ids or _pinned) else ("empty" if ping.get("reachable") else "offline"),
+            "status": "online" if (model_ids or _pinned) else ("loading" if ping.get("loading") else ("empty" if ping.get("reachable") else "offline")),
            "ping_error": ping.get("error") if ping else None,
            "endpoint_kind": requested_kind,
            "category": _classify_endpoint(base_url, requested_kind),
@@ -1858,11 +1937,11 @@ def setup_model_routes(model_discovery):
        configured_timeout = _parse_positive_int(model_refresh_timeout, minimum=1, maximum=60)
        probe_timeout = _explicit_model_list_timeout(base_url, requested_kind, configured_timeout)
        models = _probe_endpoint(base_url, api_key.strip() or None, timeout=probe_timeout)
-        ping = {"reachable": True, "error": None} if models else _ping_endpoint(base_url, api_key.strip() or None, timeout=min(probe_timeout, 2.0))
+        ping = {"reachable": True, "error": None} if models else _ping_endpoint(base_url, api_key.strip() or None, timeout=min(probe_timeout, 10.0))
        return {
            "base_url": base_url,
            "online": bool(models) or bool(ping.get("reachable")),
-            "status": "online" if models else ("empty" if ping.get("reachable") else "offline"),
+            "status": "online" if models else ("loading" if ping.get("loading") else ("empty" if ping.get("reachable") else "offline")),
            "ping_error": ping.get("error") if ping else None,
            "models": models,
            "count": len(models),
@@ -331,6 +331,9 @@ def add_user_install_bins_to_path():
        candidates.append(os.path.join(site.USER_BASE, 'bin'))
    except Exception:
        pass
    candidates.append(os.path.expanduser('~/bin'))
    candidates.append(os.path.expanduser('~/llama.cpp/build/bin'))
    candidates.append(os.path.expanduser('~/llama.cpp/build-vulkan/bin'))
    candidates.append(os.path.expanduser('~/.local/bin'))
    parts = os.environ.get('PATH', '').split(os.pathsep) if os.environ.get('PATH') else []
    changed = False
@@ -962,12 +965,84 @@ def setup_shell_routes() -> APIRouter:
        return StreamingResponse(generate(), media_type="text/event-stream")
    def _os_id_from_release(text: str) -> str:
        """Map /etc/os-release contents to a canonical family for our matrix."""
        if not text:
            return ""
        ids = []
        for line in text.splitlines():
            line = line.strip()
            if line.startswith("ID=") or line.startswith("ID_LIKE="):
                ids += line.split("=", 1)[1].strip().strip('"').split()
        ids = [i.lower() for i in ids]
        if any(x in ids for x in ("debian", "ubuntu", "linuxmint", "pop", "elementary")):
            return "debian"
        if any(x in ids for x in ("arch", "manjaro", "endeavouros", "cachyos", "garuda")):
            return "arch"
        if any(x in ids for x in ("fedora", "rhel", "centos", "rocky", "almalinux", "ol")):
            return "fedora"
        if "alpine" in ids:
            return "alpine"
        if any(x in ids for x in ("suse", "opensuse", "opensuse-leap", "opensuse-tumbleweed", "sles")):
            return "suse"
        return ""
    # Lookup tables used to turn a list of missing prerequisites into one
    # install command for a specific OS family:
    #   * _PKG_NAMES      — canonical prereq name → {os family: [package names]}
    #   * _BACKEND_EXTRAS — backend name → {os family: [extra package names]}
    #   * _PKG_MGR        — os family → install command template; the `{pkgs}`
    #                       placeholder receives the space-joined package list
    # Each `system_prereqs` name resolves to a list of OS-specific package
    # names that get joined into the final `sudo apt install -y …` etc.
    # command. Backend-specific extras (CUDA toolkit, ROCm, Vulkan headers)
    # are added only when the requested backend has an entry here.
    # An empty list means no package is mapped for that prereq/backend on that
    # OS family, so it contributes nothing to the command.
    _PKG_NAMES = {
        # canonical-name → {os_id: [actual_pkg_names_on_this_os]}
        "cmake":           {"debian": ["cmake"], "arch": ["cmake"], "fedora": ["cmake"], "alpine": ["cmake"], "suse": ["cmake"], "macos": ["cmake"]},
        "build-essential": {"debian": ["build-essential"], "arch": ["base-devel"], "fedora": ["gcc", "gcc-c++", "make"], "alpine": ["build-base"], "suse": ["gcc-c++", "make"], "macos": []},
        "g++":             {"debian": ["g++"], "arch": ["gcc"], "fedora": ["gcc-c++"], "alpine": ["g++"], "suse": ["gcc-c++"], "macos": []},
        "gcc":             {"debian": ["gcc"], "arch": ["gcc"], "fedora": ["gcc"], "alpine": ["gcc"], "suse": ["gcc"], "macos": []},
        "make":            {"debian": ["make"], "arch": ["make"], "fedora": ["make"], "alpine": ["make"], "suse": ["make"], "macos": []},
        "git":             {"debian": ["git"], "arch": ["git"], "fedora": ["git"], "alpine": ["git"], "suse": ["git"], "macos": ["git"]},
        "tmux":            {"debian": ["tmux"], "arch": ["tmux"], "fedora": ["tmux"], "alpine": ["tmux"], "suse": ["tmux"], "macos": ["tmux"]},
    }
    # backend → {os_id: [extra packages appended after the prereq packages]}
    _BACKEND_EXTRAS = {
        "cuda":   {"debian": ["nvidia-cuda-toolkit"], "arch": ["cuda"], "fedora": ["cuda-toolkit"], "alpine": [], "suse": ["cuda"], "macos": []},
        "rocm":   {"debian": ["rocm-dev"], "arch": ["rocm-hip-sdk"], "fedora": ["rocm-devel"], "alpine": [], "suse": ["rocm-dev"], "macos": []},
        "vulkan": {"debian": ["libvulkan-dev", "vulkan-tools"], "arch": ["vulkan-headers", "vulkan-tools"], "fedora": ["vulkan-headers", "vulkan-tools"], "alpine": ["vulkan-loader-dev", "vulkan-tools"], "suse": ["vulkan-devel", "vulkan-tools"], "macos": []},
    }
    # os_id → command template. Every family except macOS is prefixed with
    # `sudo`; Homebrew is invoked directly.
    _PKG_MGR = {
        "debian": "sudo apt install -y {pkgs}",
        "arch":   "sudo pacman -S --needed {pkgs}",
        "fedora": "sudo dnf install -y {pkgs}",
        "alpine": "sudo apk add {pkgs}",
        "suse":   "sudo zypper install -n {pkgs}",
        "macos":  "brew install {pkgs}",
    }
    def _install_cmd_for_target(os_id: str, backend: str, missing: list[str]) -> str:
        """Render one install command covering the missing prereqs on a target.

        Args:
            os_id: OS family key into ``_PKG_MGR`` (e.g. ``"debian"``).
            backend: Backend name; matched case-insensitively against
                ``_BACKEND_EXTRAS``. Empty/None adds no extras.
            missing: Canonical prereq names to resolve through ``_PKG_NAMES``.

        Returns:
            The formatted package-manager command, or ``""`` when the OS
            family is unknown or nothing resolves to an actual package.
        """
        if not os_id:
            return ""
        template = _PKG_MGR.get(os_id)
        if template is None:
            return ""
        # Prereq packages first, in the order the prereqs were listed.
        wanted = [
            package
            for dep in missing
            for package in _PKG_NAMES.get(dep, {}).get(os_id, [])
        ]
        # Backend extras go last and only for the backend actually requested
        # (a CUDA toolkit isn't useful on a Vulkan box).
        backend_key = (backend or "").lower()
        wanted.extend(_BACKEND_EXTRAS.get(backend_key, {}).get(os_id, []))
        # dict.fromkeys drops duplicates while keeping first-seen order.
        unique = list(dict.fromkeys(wanted))
        if not unique:
            return ""
        return template.format(pkgs=" ".join(unique))
    @router.get("/api/cookbook/packages")
    async def list_packages(
        request: Request,
        host: str | None = None,
        ssh_port: str | None = None,
        venv: str | None = None,
        backend: str | None = None,
    ):
        """Check which optional packages are installed.
@@ -1016,6 +1091,12 @@ def setup_shell_routes() -> APIRouter:
                "kind": "system",
                "install_hint": "Install Docker on the selected server and allow this user to run docker.",
            },
            # Note: cmake / gcc / git are not separate dependency rows —
            # they're declared as `system_prereqs` on llama_cpp (and any
            # other engine that compiles from source) so they appear as
            # an inline status note on that engine's row instead of
            # cluttering the panel with raw OS package names that aren't
            # meaningful product-level dependencies on their own.
            # ── LLM ── installs on GPU servers for model serving/downloading
            {
                "name": "hf_transfer",
@@ -1027,9 +1108,16 @@ def setup_shell_routes() -> APIRouter:
            {
                "name": "llama_cpp",
                "pip": "llama-cpp-python[server]",
-                "desc": "Serve GGUF models via llama.cpp",
+                "desc": "Great for single-GPU or CPU inference with GGUF models",
                "category": "LLM",
                "target": "remote",
                # Build-toolchain prereqs. Cookbook's launch bootstrap
                # compiles llama-server from source when no prebuilt
                # binary is present; without these the build aborts
                # with `cmake: command not found`. Surfaced inline on
                # this row so the user doesn't have to chase three
                # separate OS-package rows.
                "system_prereqs": ["cmake", "g++", "git"],
            },
            {
                "name": "sglang",
@@ -1041,7 +1129,7 @@ def setup_shell_routes() -> APIRouter:
            {
                "name": "vllm",
                "pip": "vllm",
-                "desc": "High-throughput LLM serving engine",
+                "desc": "Great for high-throughput multi-GPU inference",
                "category": "LLM",
                "target": "remote",
            },
@@ -1104,6 +1192,7 @@ def setup_shell_routes() -> APIRouter:
        # venv over SSH so a remote `pip install` actually reflects here.
        remote_status: dict = {}
        remote_details: dict = {}
        remote_probe_error = ""
        remote_names = [
            p["name"]
            for p in packages
@@ -1142,16 +1231,56 @@ def setup_shell_routes() -> APIRouter:
                        break
            except ValueError as e:
                raise HTTPException(400, str(e))
-            except Exception:
+            except Exception as e:
                remote_status = {}
-        if host and remote_system_names:
+                remote_probe_error = f"SSH package probe failed: {str(e)[:160]}"
            if "llama_cpp" in remote_names:
                try:
                    inner = (
                        'export PATH="$HOME/.local/bin:$HOME/bin:'
                        '$HOME/llama.cpp/build/bin:$HOME/llama.cpp/build-vulkan/bin:$PATH"; '
                        "command -v llama-server 2>/dev/null || true"
                    )
                    argv = _ssh_base_argv(host, ssh_port) + [inner]
                    proc = await asyncio.create_subprocess_exec(
                        *argv,
                        stdout=asyncio.subprocess.PIPE,
                        stderr=asyncio.subprocess.PIPE,
                    )
                    out, _err = await asyncio.wait_for(proc.communicate(), timeout=8)
                    llama_server_path = out.decode("utf-8", errors="replace").strip().splitlines()
                    llama_server_path = llama_server_path[-1].strip() if llama_server_path else ""
                    if llama_server_path:
                        remote_status["llama_cpp"] = True
                        probe = remote_details.setdefault("llama_cpp", {})
                        if isinstance(probe, dict):
                            probe.setdefault("binaries", {})["llama-server"] = llama_server_path
                except Exception as e:
                    if not remote_probe_error:
                        remote_probe_error = f"SSH llama-server probe failed: {str(e)[:160]}"
                    pass
        # Union of system_names + every package's system_prereqs. Probing
        # the prereqs alongside the main system deps in a single SSH call
        # avoids a second round-trip per Cookbook → Dependencies refresh.
        prereq_names: set[str] = set()
        for p in packages:
            for pr in p.get("system_prereqs") or []:
                prereq_names.add(str(pr))
        all_system_names = list(set(remote_system_names) | prereq_names)
        # Detect the target's OS family + read /etc/os-release in the same
        # SSH round-trip as the prereq probe — used downstream to render a
        # single OS-specific install command per row instead of dumping
        # every distro's syntax onto the user.
        target_os_id: str = ""
        if host and all_system_names:
            try:
                checks = []
-                for name in remote_system_names:
+                for name in all_system_names:
                    qn = shlex.quote(name)
                    checks.append(
                        f"if command -v {qn} >/dev/null 2>&1; then echo {qn}=1; else echo {qn}=0; fi"
                    )
                checks.append("echo '---OSREL---'; cat /etc/os-release 2>/dev/null || true")
                inner = " ; ".join(checks)
                argv = _ssh_base_argv(host, ssh_port) + [inner]
                proc = await asyncio.create_subprocess_exec(
@@ -1161,20 +1290,45 @@ def setup_shell_routes() -> APIRouter:
                )
                out, _err = await asyncio.wait_for(proc.communicate(), timeout=12)
                txt = out.decode("utf-8", errors="replace").strip()
                _section, _osrel_lines = "probe", []
                for line in txt.splitlines():
                    if line.strip() == "---OSREL---":
                        _section = "osrel"; continue
                    if _section == "osrel":
                        _osrel_lines.append(line)
                        continue
                    name, sep, value = line.strip().partition("=")
-                    if sep and name in remote_system_names:
+                    if sep and name in all_system_names:
                        remote_status[name] = value == "1"
                target_os_id = _os_id_from_release("\n".join(_osrel_lines))
            except ValueError as e:
                raise HTTPException(400, str(e))
-            except Exception:
+            except Exception as e:
                if not remote_probe_error:
                    remote_probe_error = f"SSH system probe failed: {str(e)[:160]}"
                pass
        elif not host:
            # Local target — probe in-process so the inline install command
            # still appears in the dep panel when the cookbook container
            # itself is the selected server.
            try:
                with open("/etc/os-release", encoding="utf-8") as f:
                    target_os_id = _os_id_from_release(f.read())
            except Exception:
                target_os_id = ""
            if sys.platform == "darwin":
                target_os_id = "macos"
        for pkg in packages:
            on_remote = bool(host and pkg.get("target") == "remote")
            probe = None
            if on_remote:
-                pkg["installed"] = bool(remote_status.get(pkg["name"], False))
+                if remote_probe_error and pkg["name"] not in remote_status:
                    pkg["installed"] = None
                    pkg["probe_error"] = remote_probe_error
                    pkg["status_note"] = remote_probe_error
                else:
                    pkg["installed"] = bool(remote_status.get(pkg["name"], False))
                probe = remote_details.get(pkg["name"])
                if isinstance(probe, dict):
                    pkg["details"] = probe
@@ -1230,6 +1384,104 @@ def setup_shell_routes() -> APIRouter:
                    # 500 the entire packages panel; report it as not usable.
                    pkg["installed"] = False
            # llama_cpp partial-state probe: when the package is installed
            # but the wheel was built CPU-only AND the target has NVIDIA
            # hardware, mark the row as partial (yellow/orange) with a
            # one-click upgrade to the CUDA wheel. Without this the row
            # reads "ready" green while inference runs at 3 tok/s on GPU
            # silicon — actively misleading.
            if pkg["name"] == "llama_cpp" and pkg.get("installed"):
                _native_llama_server = bool(
                    isinstance(probe, dict)
                    and isinstance(probe.get("binaries"), dict)
                    and probe["binaries"].get("llama-server")
                )
                _gpu_capable = False
                _has_nvidia_target = False
                if _native_llama_server:
                    # Native llama-server is the launcher path Cookbook now
                    # prefers. Do not mark this as a CPU-only Python wheel just
                    # because llama-cpp-python is absent from the selected venv.
                    _gpu_capable = True
                elif on_remote and host:
                    try:
                        # Activate the configured venv FIRST so the probe
                        # runs against the same python the launch script
                        # would activate. Without this prefix, bare
                        # `python3` was checked — which can disagree with
                        # the venv's wheel (e.g. user-site has CUDA wheel
                        # but venv has CPU-only), and the dep panel then
                        # showed "ready" green while every launch fell to
                        # CPU.
                        _vp = _venv_activate_prefix(venv)
                        probe = (
                            f'{_vp}python3 -c "import llama_cpp; import sys; '
                            'sys.exit(0 if llama_cpp.llama_supports_gpu_offload() else 1)" '
                            '&& echo llama_cpp_gpu=1 || echo llama_cpp_gpu=0; '
                            'command -v nvidia-smi >/dev/null 2>&1 '
                            '&& nvidia-smi -L 2>/dev/null | grep -q "GPU " '
                            '&& echo nvidia=1 || echo nvidia=0'
                        )
                        argv = _ssh_base_argv(host, ssh_port) + [probe]
                        proc = await asyncio.create_subprocess_exec(
                            *argv, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
                        )
                        out, _ = await asyncio.wait_for(proc.communicate(), timeout=8)
                        txt = out.decode("utf-8", errors="replace")
                        if "llama_cpp_gpu=1" in txt:
                            _gpu_capable = True
                        if "nvidia=1" in txt:
                            _has_nvidia_target = True
                    except Exception:
                        pass
                else:
                    try:
                        import llama_cpp as _lcp  # type: ignore
                        _gpu_capable = bool(_lcp.llama_supports_gpu_offload())
                    except Exception:
                        _gpu_capable = False
                    _has_nvidia_target = shutil.which("nvidia-smi") is not None
                if (not _gpu_capable) and _has_nvidia_target:
                    pkg["partial"] = True
                    pkg["partial_reason"] = "Installed but CPU-only wheel — GPU detected on this target. Upgrade to a CUDA wheel for ~10× faster inference."
                    pkg["partial_action"] = "reinstall_llama_cpp_cuda"
            # Attach per-package system_prereqs status. We probed each
            # prereq name above; surface "Missing build deps: …" ONLY
            # when the package itself is not installed — if the package
            # works (e.g. llama-cpp-python already imports cleanly), the
            # build toolchain is irrelevant and surfacing it as a red
            # flag confuses users ("ready" + "missing" on the same row).
            _prereqs = list(pkg.get("system_prereqs") or [])
            if _prereqs:
                if on_remote:
                    _pr_present = {n: bool(remote_status.get(n)) for n in _prereqs}
                else:
                    _pr_present = {n: shutil.which(n) is not None for n in _prereqs}
                pkg["system_prereqs_status"] = _pr_present
                _missing = [n for n, ok in _pr_present.items() if not ok]
                # Suppress the "missing build deps" hint when the package
                # itself is installed — build deps are only relevant if
                # the user would need to recompile from source.
                if pkg.get("installed"):
                    _missing = []
                if _missing:
                    # Build a target-specific install command from the
                    # (os_family, backend) matrix when we know both. Fall
                    # back to the multi-distro hint only when the target's
                    # OS can't be classified (e.g. ssh probe failed).
                    _resolved_os = target_os_id or "debian"  # safest default
                    _cmd = _install_cmd_for_target(_resolved_os, backend or "", _missing)
                    if _cmd and target_os_id:
                        _hint = "Missing build deps for this target: " + ", ".join(_missing)
                        pkg["install_cmd_for_target"] = _cmd
                        pkg["install_cmd_os"] = target_os_id
                        pkg["install_cmd_backend"] = (backend or "").lower()
                    else:
                        _hint = "Missing build deps: " + ", ".join(_missing) + ". Install via apt: cmake build-essential git / pacman: cmake base-devel git / dnf: cmake gcc-c++ make git / brew: cmake git."
                    _existing_note = pkg.get("status_note") or ""
                    pkg["status_note"] = (_existing_note + " — " + _hint) if _existing_note else _hint
                    pkg["build_deps_missing"] = _missing
            if pkg.get("installed"):
                update_status = _package_pip_update_status(pkg, probe)
                pkg["pip_update_available"] = update_status.available
@@ -1289,6 +1541,102 @@ def setup_shell_routes() -> APIRouter:
            return {"ok": True, "output": stdout.decode()[-200:]}
        return {"ok": False, "error": stderr.decode()[-300:]}
    @router.post("/api/cookbook/install-system-deps")
    async def install_system_deps(request: Request):
        """Install OS-level system packages (cmake/build-essential/git/tmux)
        on a remote target or in the local container. Admin only.

        Bounded by a per-package allowlist — anything outside the catalog
        is rejected so the route can't be coerced into installing arbitrary
        OS packages. Uses `sudo -n` (passwordless) so the call returns a
        clear "needs sudo password" error instead of hanging when interactive
        sudo is required.

        Request JSON fields read here:
            packages: list of canonical package names (filtered by allowlist).
            remote_host: optional SSH target; empty means run locally via bash.
            ssh_port: optional SSH port, passed through to `_ssh_base_argv`.

        Returns:
            ``{"ok", "exit_code", "output", "error"}`` once the installer ran,
            or ``{"ok": False, "error": ...}`` when nothing installable was
            requested, the installer could not be started, or it timed out.

        Raises:
            HTTPException(400): body is not a JSON object, or
                `_ssh_base_argv` rejects the host/port with ValueError.
        """
        _require_admin(request)
        body = await request.json()
        if not isinstance(body, dict):
            raise HTTPException(400, "Request body must be a JSON object")
        raw = body.get("packages") or []
        if not isinstance(raw, (list, tuple, set)):
            # A bare string/number is not a package list; treat it as empty so
            # the caller gets the allowlist error below instead of a 500.
            raw = []
        host = str(body.get("remote_host") or "").strip()
        ssh_port = body.get("ssh_port")
        # Names users can request — must match canonical names used in the
        # deps catalog's `system_prereqs` field and on the System rows.
        ALLOWED = {"cmake", "build-essential", "g++", "gcc", "git", "tmux", "make"}
        requested = (str(p).strip() for p in raw)
        # dict.fromkeys de-duplicates while keeping the requested order.
        pkgs = list(dict.fromkeys(name for name in requested if name in ALLOWED))
        if not pkgs:
            return {"ok": False, "error": "no installable packages requested (allowlist: " + ", ".join(sorted(ALLOWED)) + ")"}

        # Per-package-manager overrides for canonical names whose real package
        # name differs. Names without an override are used as-is. On Arch the
        # C++ compiler ships inside `gcc` (there is no `g++` package); on
        # Homebrew the compiler/make names have no formula mapped, so they
        # resolve to nothing.
        overrides = {
            "apt": {},
            "pacman": {"build-essential": ["base-devel"], "g++": ["gcc"]},
            "dnf": {"build-essential": ["gcc", "gcc-c++", "make"], "g++": ["gcc-c++"]},
            "brew": {"build-essential": [], "g++": [], "gcc": [], "make": []},
        }

        def _resolved(manager: str) -> str:
            """Return the shell-quoted, de-duplicated package list for *manager*."""
            table = overrides[manager]
            names = [real for name in pkgs for real in table.get(name, [name])]
            return " ".join(shlex.quote(n) for n in dict.fromkeys(names))

        apt_pkgs = _resolved("apt")
        pac_pkgs = _resolved("pacman")
        dnf_pkgs = _resolved("dnf")
        brew_pkgs = _resolved("brew")

        # Error messages go to stderr (>&2) so the route's error field
        # gets populated. Without the redirect, `echo "ERROR…"` on stdout
        # left stderr empty and the frontend toast fell through to a
        # bare "HTTP 200" instead of surfacing the real reason.
        # The sudo gate is applied only inside the branches that actually call
        # sudo, so a Homebrew target is not rejected for lacking it.
        sudo_guard = (
            'if ! sudo -n true 2>/dev/null; then '
            '  echo "ERROR: passwordless sudo unavailable on this target. Run once: sudo apt install -y ' + " ".join(pkgs) + ' (or your distro equivalent: pacman -S, dnf install, brew install). After that, Cookbook can install the rest." >&2; exit 2; fi; '
        )
        # `brew install` with no formulae is an error; report a no-op instead.
        if brew_pkgs:
            brew_step = f'brew install {brew_pkgs}'
        else:
            brew_step = 'echo "Nothing to install via Homebrew for the requested packages."'
        # Build a single shell snippet that detects the package manager and
        # runs the right install. Non-interactive sudo (-n) only — if sudo
        # asks for a password the script reports it instead of hanging.
        script = (
            'set -e; '
            'if command -v apt-get >/dev/null 2>&1; then '
            f'  {sudo_guard}sudo -n env DEBIAN_FRONTEND=noninteractive apt-get update -qq && sudo -n env DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends {apt_pkgs}; '
            'elif command -v pacman >/dev/null 2>&1; then '
            f'  {sudo_guard}sudo -n pacman -Sy --needed --noconfirm {pac_pkgs}; '
            'elif command -v dnf >/dev/null 2>&1; then '
            f'  {sudo_guard}sudo -n dnf install -y {dnf_pkgs}; '
            'elif command -v brew >/dev/null 2>&1; then '
            f'  {brew_step}; '
            'else '
            '  echo "ERROR: no supported package manager (apt/pacman/dnf/brew) on this target." >&2; exit 3; fi'
        )
        try:
            if host:
                argv = _ssh_base_argv(host, ssh_port) + [script]
            else:
                argv = ["bash", "-lc", script]
        except ValueError as e:
            raise HTTPException(400, str(e))
        try:
            proc = await asyncio.create_subprocess_exec(
                *argv, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
            )
        except OSError as e:
            # e.g. ssh/bash binary missing — report it instead of a bare 500.
            return {"ok": False, "error": f"Could not start installer: {e}"}
        try:
            out, err = await asyncio.wait_for(proc.communicate(), timeout=180)
        except asyncio.TimeoutError:
            # wait_for only cancels communicate(); the child keeps running
            # unless it is killed and reaped explicitly.
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # already exited between the timeout and the kill
            await proc.wait()
            return {"ok": False, "error": "Install timed out after 180s"}
        ok = (proc.returncode == 0)
        # Combine stderr + (last lines of stdout) into a single error
        # blob when ok=False — some package managers print useful failure
        # context to stdout, and a script that exits via `echo ...; exit N`
        # without `>&2` would otherwise hand back an empty error string
        # and force the frontend to show a bare "HTTP 200".
        err_txt = err.decode("utf-8", errors="replace").strip()
        out_txt = out.decode("utf-8", errors="replace").strip()
        if not ok:
            tail_out = out_txt[-500:] if out_txt else ""
            combined = err_txt or tail_out or f"exit code {proc.returncode}"
        else:
            combined = None
        return {
            "ok": ok,
            "exit_code": proc.returncode,
            "output": out_txt[-1000:],
            "error": combined,
        }
    @router.post("/api/cookbook/rebuild-engine")
    async def rebuild_engine(request: Request):
        """Clear the cached llama.cpp build so the next serve recompiles.
@@ -1309,7 +1657,8 @@ def setup_shell_routes() -> APIRouter:
            return {"ok": False, "error": f"Unsupported engine: {engine}"}
        host = str(body.get("remote_host") or "").strip()
        ssh_port = body.get("ssh_port")
-        cmd = _llama_cpp_rebuild_cmd()
+        update_source = bool(body.get("update_source"))
        cmd = _llama_cpp_rebuild_cmd(update_source=update_source)
        try:
            argv = (
                (_ssh_base_argv(host, ssh_port) + [cmd])
@@ -3,11 +3,16 @@ import os
 import time
 import json
 import asyncio
 import shutil
 import uuid
 from pathlib import Path
 from fastapi import APIRouter, Request, File, UploadFile, HTTPException
 from typing import List
 import logging
 from core.middleware import require_admin
 from core.database import SessionLocal, GalleryImage
 from src.auth_helpers import effective_user
 from src.constants import GENERATED_IMAGES_DIR
 from src.upload_handler import count_recent_uploads
 logger = logging.getLogger(__name__)
@@ -51,6 +56,69 @@ def setup_upload_routes(upload_handler):
        raise HTTPException(404, "File not found")
    def _promote_chat_image_to_gallery(meta: dict, owner: str | None) -> str | None:
        """Register a chat-uploaded image in the Gallery and return its gallery id.

        The upload stays where the upload handler stored it; a copy is written
        into ``GENERATED_IMAGES_DIR`` and a ``GalleryImage`` row is added for it.

        Args:
            meta: Upload metadata. Keys read here: ``name``, ``mime``, ``path``,
                ``hash``, ``width``, ``height`` and ``size``.
            owner: Owner stored on the new row. When truthy it also restricts
                the duplicate lookup to that owner's rows.

        Returns:
            The id of the matching existing row or of the newly created row.
            ``None`` when the handler has no ``is_image_file`` check, the upload
            is not an image, the source file is missing, or promotion failed.
        """
        is_image_file = getattr(upload_handler, "is_image_file", None)
        if not callable(is_image_file):
            return None
        if not is_image_file(meta.get("name", ""), meta.get("mime", "")):
            return None
        source_path = meta.get("path")
        if not source_path or not os.path.isfile(source_path):
            return None
        db = SessionLocal()
        # Tracked outside the try body so the failure path knows whether a
        # gallery copy may already exist on disk.
        dest_path = None
        try:
            file_hash = meta.get("hash")
            if file_hash:
                # Reuse an active gallery entry with the same content hash
                # instead of copying the file a second time.
                q = db.query(GalleryImage).filter(
                    GalleryImage.file_hash == file_hash,
                    GalleryImage.is_active == True,  # noqa: E712
                )
                if owner:
                    q = q.filter(GalleryImage.owner == owner)
                existing = q.first()
                if existing:
                    return existing.id
            image_dir = Path(GENERATED_IMAGES_DIR)
            image_dir.mkdir(parents=True, exist_ok=True)
            ext = Path(meta.get("name") or source_path).suffix.lower()
            if ext not in {".png", ".jpg", ".jpeg", ".webp", ".gif"}:
                # Unknown or missing suffix: derive one from the MIME type,
                # defaulting to PNG.
                mime_ext = {
                    "image/png": ".png",
                    "image/jpeg": ".jpg",
                    "image/jpg": ".jpg",
                    "image/webp": ".webp",
                    "image/gif": ".gif",
                }.get(meta.get("mime", ""))
                ext = mime_ext or ".png"
            filename = f"{uuid.uuid4().hex[:12]}{ext}"
            dest_path = image_dir / filename
            shutil.copy2(source_path, dest_path)
            image_id = str(uuid.uuid4())
            db.add(GalleryImage(
                id=image_id,
                filename=filename,
                prompt=meta.get("name") or "Chat upload",
                model="chat-upload",
                owner=owner,
                file_hash=file_hash,
                width=meta.get("width"),
                height=meta.get("height"),
                file_size=meta.get("size"),
            ))
            db.commit()
            return image_id
        except Exception as e:
            # No row was committed, so a (possibly partial) copy in the gallery
            # directory would be orphaned; remove it before reporting failure.
            if dest_path is not None:
                try:
                    dest_path.unlink(missing_ok=True)
                except OSError as cleanup_err:
                    logger.warning(
                        "Failed to remove orphaned gallery copy %s: %s",
                        dest_path,
                        cleanup_err,
                    )
            db.rollback()
            logger.warning("Failed to add chat image upload to gallery: %s", e)
            return None
        finally:
            db.close()
    @router.post("")
    async def api_upload(request: Request, files: List[UploadFile] = File(...)):
        """Upload files with enhanced security and organization."""
@@ -78,8 +146,10 @@ def setup_upload_routes(upload_handler):
        for u in files:
            try:
-                meta = upload_handler.save_upload(u, client_ip, owner=effective_user(request))
+                owner = effective_user(request)
-                out.append({
+                meta = upload_handler.save_upload(u, client_ip, owner=owner)
                gallery_id = _promote_chat_image_to_gallery(meta, owner)
                item = {
                    "id": meta["id"],
                    "name": meta["name"],
                    "mime": meta["mime"],
@@ -89,7 +159,10 @@ def setup_upload_routes(upload_handler):
                    "width": meta.get("width"),
                    "height": meta.get("height"),
                    "is_duplicate": meta.get("is_duplicate", False)
-                })
+                }
                if gallery_id:
                    item["gallery_id"] = gallery_id
                out.append(item)
            except HTTPException:
                raise
            except Exception as e:
@@ -282,7 +282,17 @@ def _detect_amd():
            "gpus": cards,
            "gpu_groups": groups,
            "homogeneous": len(groups) <= 1,
-            "backend": "rocm",
+            # Pick the actual runtime label: ROCm/HIP only when its
            # toolchain is installed, otherwise Vulkan if vulkaninfo is
            # present (mesa RADV works fine on RDNA/CDNA when ROCm
            # packages are absent — see Strix Halo where ROCm support
            # is still backporting). Reporting "rocm" on a Vulkan-only
            # host misleads downstream env-var pinning
            # (HIP_VISIBLE_DEVICES is a no-op there).
            "backend": (
                "rocm" if (_run(["which", "rocminfo"]) or _run(["which", "hipconfig"]))
                else ("vulkan" if _run(["which", "vulkaninfo"]) else "rocm")
            ),
            "unified_memory": is_apu,
            # AMD ISA/family so downstream can tell datacenter Instinct (CDNA,
            # where vLLM/SGLang run AWQ/GPTQ reliably) from consumer Radeon
@@ -541,17 +541,44 @@ def _section_text(name: str, default: str) -> str:
    return val if isinstance(val, str) and val.strip() else default
 def _compact_tool_line(name: str, section: str) -> str:
    """One-line fenced-tool usage hint for compact/local prompts."""
    text = (section or "").strip()
    if not text:
        return f"- `{name}`"
    if text.startswith("- "):
        return text
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    usage = []
    in_fence = False
    for ln in lines:
        if ln.startswith("```"):
            usage.append(ln)
            in_fence = not in_fence
            if len(usage) >= 3:
                break
            continue
        if in_fence and len(usage) < 3:
            usage.append(ln)
    if usage:
        return f"- `{name}` — " + " ".join(usage)
    return f"- `{name}` — " + lines[0][:160]
 def _assemble_prompt(tool_names: set, disabled_tools: set = None, compact: bool = False) -> str:
    """Build the system prompt with only the specified tools included."""
    disabled = disabled_tools or set()
    included = tool_names - disabled
    if compact:
-        tool_list = ", ".join(sorted(included)) if included else "none"
+        tool_lines = []
        for name, _default_section in TOOL_SECTIONS.items():
            if name in included:
                tool_lines.append(_compact_tool_line(name, _section_text(name, _default_section)))
        parts = [
-            "You are an AI assistant with tool access.",
+            _AGENT_PREAMBLE,
-            f"Available tools: {tool_list}.",
+            "## Available tools\n" + ("\n".join(tool_lines) if tool_lines else "none"),
-            _API_AGENT_RULES,
+            _AGENT_RULES,
        ]
        parts.extend(_domain_rules_for_tools(included))
        return "\n\n".join(parts)
@@ -617,11 +644,6 @@ _API_HOSTS = frozenset([
    "api.perplexity.ai", "api.x.ai",
    "ollama.com", "api.venice.ai", "api.kimi.com",
    "api.githubcopilot.com",
    # Local OpenAI-compatible endpoints (llama.cpp, vLLM, LM Studio, etc.).
    # Without these, `_is_api_model` falls back to keyword sniffing on the
    # model name, so well-behaved local servers don't get native tool
    # schemas and the agent silently degrades to fenced-block parsing.
    "localhost", "127.0.0.1", "host.docker.internal",
 ])
 _MCP_KEYWORDS = frozenset(["mcp", "browse", "browser", "website", "calendar", "event", "email",
                           "gmail", "screenshot", "navigate", "click", "miniflux", "rss", "feed"])
@@ -649,6 +671,28 @@ def _is_ollama_openai_compat_url(endpoint_url: str) -> bool:
    return parsed.port == 11434 and (path == "/v1" or path.startswith("/v1/"))
 def _is_local_openai_compat_url(endpoint_url: str) -> bool:
    try:
        parsed = urlparse(endpoint_url or "")
    except Exception:
        return False
    host = (parsed.hostname or "").lower()
    path = (parsed.path or "").rstrip("/")
    if not (path == "/v1" or path.startswith("/v1/")):
        return False
    if host in {"localhost", "127.0.0.1", "0.0.0.0", "host.docker.internal"}:
        return True
    if host.startswith("192.168.") or host.startswith("10."):
        return True
    if host.startswith("172."):
        try:
            second = int(host.split(".")[1])
            return 16 <= second <= 31
        except Exception:
            return False
    return False
 def _endpoint_lookup_keys(endpoint_url: str) -> List[str]:
    """Candidate ModelEndpoint.base_url keys for a runtime chat URL."""
    raw = (endpoint_url or "").strip()
@@ -712,6 +756,17 @@ def _extract_last_user_message(messages: List[Dict]) -> str:
 _LOW_SIGNAL_RE = re.compile(r"^[\W_]*$", re.UNICODE)
 # Greeting / slang opener anchored at the start of the text (leading
 # whitespace allowed). The opener must end on a word boundary; everything
 # after it is captured in the named group ``tail``. Case-insensitive.
 _CASUAL_OPENING_RE = re.compile(
    r"^\s*(?:h+i+|hey+|hello+|yo+|sup+|what'?s up|wass?up|hiya|howdy|"
    r"lol|lmao|haha+|hehe+|thanks?|thank you|ty|idk|dunno|meh|bruh|bro)\b(?P<tail>.*)$",
    re.IGNORECASE,
 )
 # Whole-word, case-insensitive task keywords (serving, documents, search,
 # files, settings, ...). _is_casual_low_signal searches the opener's tail
 # with this and refuses to treat the message as casual on a hit.
 _CASUAL_BLOCKLIST_RE = re.compile(
    r"\b(?:cookbook|serve|serving|launch|start|vllm|sglang|llama\.?cpp|ollama|"
    r"download|model|email|document|doc|note|calendar|task|search|web|research|"
    r"file|folder|repo|git|settings?|endpoint|api|token|mcp)\b",
    re.IGNORECASE,
 )
 _EXPLICIT_CONTINUATION_RE = re.compile(
    r"^\s*(?:"
    r"yes|y|yeah|yep|ok|okay|sure|do it|go ahead|continue|carry on|"
@@ -721,6 +776,17 @@ _EXPLICIT_CONTINUATION_RE = re.compile(
    r")\s*[.!?]*\s*$",
    re.IGNORECASE,
 )
 # Whole-word, case-insensitive retry / failure phrases ("try again",
 # "failed", "crashed", ...). _is_contextual_retry_continuation searches the
 # latest user text with this, so a hit anywhere in the message counts.
 _RETRY_CONTINUATION_RE = re.compile(
    r"\b(?:try again|retry|again|rerun|re-run|run it again|launch it again|"
    r"start it again|failed|fails?|died|crashed|broke|insta|instantly)\b",
    re.IGNORECASE,
 )
 # Whole-word, case-insensitive Cookbook / model-serving vocabulary (engine
 # names, model family names, serve/launch verbs). Searched against the recent
 # conversation context to decide whether a retry phrase refers to serve work.
 _COOKBOOK_CONTEXT_RE = re.compile(
    r"\b(?:cookbook|serve|serving|served|launch|start|preset|vllm|sglang|"
    r"llama\.?cpp|ollama|download|cached models?|model servers?|running models?|"
    r"gpu box|ajax|qwen|gemma|llama|mistral|minimax)\b",
    re.IGNORECASE,
 )
 def _is_explicit_continuation(text: str) -> bool:
@@ -728,6 +794,37 @@ def _is_explicit_continuation(text: str) -> bool:
    return bool(_EXPLICIT_CONTINUATION_RE.match(str(text or "").strip()))
 def _is_casual_low_signal(text: str) -> bool:
     """Report whether ``text`` is only a greeting or slang opener.

     The message must begin with a casual opener, the rest of it must not
     mention any blocklisted task keyword, and that rest may hold at most two
     word tokens.
     """
     candidate = str(text or "").strip()
     opener = _CASUAL_OPENING_RE.match(candidate)
     if opener is None:
         return False
     remainder = opener.group("tail") or ""
     if _CASUAL_BLOCKLIST_RE.search(remainder) is not None:
         return False
     # A couple of trailing words is treated as a form of address ("hey man",
     # "yo dude", "sup <name>"); anything longer is assumed to be a real
     # request that deserves normal context and tooling.
     return len(re.findall(r"[A-Za-z0-9_'-]+", remainder)) <= 2
 def _is_contextual_retry_continuation(messages: List[Dict], text: str) -> bool:
     """Decide whether a retry/failure follow-up continues recent Cookbook work.

     A message such as "try again it failed" carries no model, host or command
     details of its own; those sit in earlier turns. It counts as a
     continuation only when the latest text contains a retry phrase AND the
     recent conversation context mentions Cookbook / model-serving terms, so
     ordinary chat does not pick up stale Cookbook context.
     """
     latest_turn = str(text or "").strip()
     if latest_turn and _RETRY_CONTINUATION_RE.search(latest_turn):
         window = _recent_context_for_retrieval(messages, max_user=5, max_chars=1200)
         return _COOKBOOK_CONTEXT_RE.search(window) is not None
     return False
 def _assistant_requested_followup(messages: List[Dict]) -> bool:
    """True when the previous assistant turn asked for missing task details.
@@ -769,11 +866,12 @@ def _classify_agent_request(messages: List[Dict], last_user: str) -> Dict[str, o
    which domain rule packs get appended to the system prompt.
    """
    text = str(last_user or "").strip()
-    continuation = _is_explicit_continuation(text) or _assistant_requested_followup(messages)
+    retry_continuation = _is_contextual_retry_continuation(messages, text)
    continuation = _is_explicit_continuation(text) or _assistant_requested_followup(messages) or retry_continuation
    retrieval_query = _recent_context_for_retrieval(messages) if continuation else text
    q = retrieval_query.lower()
-    if not text or bool(_LOW_SIGNAL_RE.match(text)):
+    if not text or bool(_LOW_SIGNAL_RE.match(text)) or _is_casual_low_signal(text):
        return {
            "low_signal": True,
            "continuation": False,
@@ -886,6 +984,7 @@ def _build_system_prompt(
    compact: bool = False,
    owner: Optional[str] = None,
    suppress_local_context: bool = False,
    suppress_skills: bool = False,
    active_email: Optional[Dict[str, str]] = None,
 ) -> List[Dict]:
    """Build agent system prompt, inject MCP/document context, merge consecutive system msgs."""
@@ -903,7 +1002,7 @@ def _build_system_prompt(
        _ov_sig = _hl.sha256(_json.dumps(get_builtin_overrides() or {}, sort_keys=True).encode()).hexdigest()
    except Exception:
        _ov_sig = ""
-    cache_key = (frozenset(disabled_tools or []), bool(mcp_mgr), needs_admin, _rt_key, compact, _ov_sig, owner, suppress_local_context)
+    cache_key = (frozenset(disabled_tools or []), bool(mcp_mgr), needs_admin, _rt_key, compact, _ov_sig, owner, suppress_local_context, suppress_skills)
    if _cached_base_prompt and _cached_base_prompt_key == cache_key and not active_document:
        agent_prompt = _cached_base_prompt
        # Skill index is user-editable (name + description), so it must never
@@ -913,6 +1012,7 @@ def _build_system_prompt(
            disabled_tools, mcp_mgr, needs_admin, relevant_tools,
            mcp_disabled_map=mcp_disabled_map, compact=compact, owner=owner,
            suppress_local_context=suppress_local_context,
            suppress_skills=suppress_skills,
        )
    else:
        agent_prompt, _skill_index_block = _build_base_prompt(
@@ -924,6 +1024,7 @@ def _build_system_prompt(
            compact=compact,
            owner=owner,
            suppress_local_context=suppress_local_context,
            suppress_skills=suppress_skills,
        )
        if not active_document:
            _cached_base_prompt = agent_prompt
@@ -1207,7 +1308,7 @@ def _build_system_prompt(
    # few. If the teacher wrote a procedure for "open my X chat" last
    # time the student failed, this is where the student finds it
    # before deciding which tool to call.
-    if not suppress_local_context:
+    if not suppress_local_context and not suppress_skills:
        try:
            last_user = _extract_last_user_message(messages)
            # Respect the user's skills-enabled toggle (mirrors memory_enabled).
@@ -1374,6 +1475,7 @@ def _build_base_prompt(
    compact: bool = False,
    owner: Optional[str] = None,
    suppress_local_context: bool = False,
    suppress_skills: bool = False,
 ):
    """Build the agent prompt with only relevant tools included.
@@ -1426,7 +1528,7 @@ def _build_base_prompt(
    # The caller wraps it in untrusted_context_message and ships it as a
    # user-role message — same treatment as the matched-skills block.
    skill_index_block = ""
-    if not suppress_local_context:
+    if not suppress_local_context and not suppress_skills:
        try:
            from services.memory.skills import SkillsManager
            from src.constants import DATA_DIR
@@ -1851,6 +1953,7 @@ async def stream_agent_loop(
    approved_plan: Optional[str] = None,
    tool_policy: Optional[ToolPolicy] = None,
    workspace: Optional[str] = None,
    forced_tools: Optional[Set[str]] = None,
    _is_teacher_run: bool = False,
 ) -> AsyncGenerator[str, None]:
    """Streaming agent loop generator.
@@ -1890,6 +1993,20 @@ async def stream_agent_loop(
    _needs_admin = _detect_admin_intent(messages)
    _last_user = _extract_last_user_message(messages)
    _intent = _classify_agent_request(messages, _last_user)
    _low_signal_turn = bool(_intent.get("low_signal"))
    _casual_low_signal_turn = _is_casual_low_signal(_last_user)
    _direct_low_signal = (
        _low_signal_turn
        and not bool(_intent.get("continuation"))
        and not plan_mode
        and not approved_plan
        and not guide_only
        and (_casual_low_signal_turn or active_document is None)
        and (_casual_low_signal_turn or not active_email)
        and (_casual_low_signal_turn or not workspace)
        and not forced_tools
        and not relevant_tools
    )
    # Tool retrieval uses the latest message by default. It may inherit recent
    # user turns only for explicit continuations ("yes", "do it", "1").
    _retrieval_query = str(_intent.get("retrieval_query") or _last_user)
@@ -1897,11 +2014,86 @@ async def stream_agent_loop(
        "[agent-intent] latest=%r continuation=%s low_signal=%s domains=%s retrieval_query=%r",
        _last_user[:120],
        bool(_intent.get("continuation")),
-        bool(_intent.get("low_signal")),
+        _low_signal_turn,
        sorted(_intent.get("domains") or []),
        _retrieval_query[:200],
    )
    _mcp_disabled_map = _load_mcp_disabled_map() if mcp_mgr else {}
    if _direct_low_signal:
        logger.info("[agent] direct low-signal reply path for latest=%r", _last_user[:80])
        direct_messages = [{"role": "user", "content": _last_user}]
        direct_response = ""
        direct_start = time.time()
        direct_actual_model = model
        real_input_tokens = 0
        real_output_tokens = 0
        try:
            async for chunk in stream_llm_with_fallback(
                [(endpoint_url, model, headers)] + list(fallbacks or []),
                direct_messages,
                temperature=temperature,
                max_tokens=min(max_tokens or 128, 128),
                prompt_type=None,
                tools=None,
                timeout=int(get_setting("agent_stream_timeout_seconds", 300) or 300),
                session_id=session_id,
            ):
                if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
                    try:
                        data = json.loads(chunk[6:])
                    except json.JSONDecodeError:
                        yield chunk
                        continue
                    if data.get("type") == "usage":
                        usage = data.get("data", {}) or {}
                        direct_actual_model = usage.get("model") or direct_actual_model
                        real_input_tokens += usage.get("input_tokens", 0) or 0
                        real_output_tokens += usage.get("output_tokens", 0) or 0
                        continue
                    if data.get("type") == "model_actual":
                        direct_actual_model = data.get("model") or direct_actual_model
                        data["requested_model"] = model
                        yield f"data: {json.dumps(data)}\n\n"
                        continue
                    if data.get("type") == "fallback":
                        direct_actual_model = data.get("answered_by") or direct_actual_model
                        yield chunk
                        continue
                    if "delta" in data:
                        if not data.get("thinking"):
                            direct_response += data.get("delta", "")
                        yield chunk
                        continue
                    yield chunk
                elif chunk.startswith("event: "):
                    yield chunk
        except Exception as _direct_err:
            logger.warning("[agent] direct low-signal path failed: %s", _direct_err)
            fallback = "Hey."
            direct_response += fallback
            yield f"data: {json.dumps({'delta': fallback})}\n\n"
        if not direct_response.strip():
            fallback = "Hey."
            direct_response = fallback
            yield f"data: {json.dumps({'delta': fallback})}\n\n"
        duration = time.time() - direct_start
        metrics = {
            "model": direct_actual_model,
            "requested_model": model,
            "input_tokens": real_input_tokens or estimate_tokens(direct_messages),
            "output_tokens": real_output_tokens or max(len(direct_response) // 4, 1),
            "total_time": round(duration, 2),
            "response_time": round(duration, 2),
            "agent_rounds": 0,
            "tool_calls": 0,
            "direct_low_signal": True,
        }
        yield f"data: {json.dumps({'type': 'metrics', 'data': metrics})}\n\n"
        yield "data: [DONE]\n\n"
        return
    if plan_mode and mcp_mgr:
        # Allow read-only MCP tools to investigate, block write/unknown ones:
        # hide them from the schemas AND reject them at runtime by qualified name.
@@ -1913,11 +2105,11 @@ async def stream_agent_loop(
    # RAG-based tool selection: retrieve relevant tools for this query.
    # If caller provided a pre-computed set (e.g. task_scheduler), use that.
-    _relevant_tools = set() if guide_only else relevant_tools
+    _relevant_tools = relevant_tools
    _t1 = time.time()
    if _relevant_tools:
        logger.info(f"[tool-rag] Using caller-provided relevant_tools ({len(_relevant_tools)} tools)")
-    if not guide_only and not _relevant_tools and bool(_intent.get("low_signal")):
+    if not guide_only and not _relevant_tools and _low_signal_turn:
        from src.tool_index import ALWAYS_AVAILABLE
        if workspace:
            # An active workspace IS the file-work signal: a vague "look at the
@@ -2008,6 +2200,15 @@ async def stream_agent_loop(
    if _relevant_tools is not None and active_document is not None:
        _relevant_tools.update({"edit_document", "update_document", "suggest_document"})
    # Per-request UI toggles are stronger than retrieval. If the user turns on
    # Search, the model must see the search tools even when the latest text is a
    # typo or otherwise low-signal for tool RAG.
    if not guide_only and forced_tools:
        if _relevant_tools is None:
            from src.tool_index import ALWAYS_AVAILABLE
            _relevant_tools = set(ALWAYS_AVAILABLE)
        _relevant_tools.update(t for t in forced_tools if t not in disabled_tools)
    # The skill index injected by _build_system_prompt tells the model to
    # call `manage_skills action=view`, and Jaccard-matched skills are pasted
    # into the prompt as procedures to follow — but neither path goes through
@@ -2015,7 +2216,7 @@ async def stream_agent_loop(
    # (grep, read_file, ...) that aren't in its schema list. Keep the schemas
    # in lockstep: manage_skills is callable whenever any skill is indexed,
    # and a matched skill's declared requires_toolsets ride along with it.
-    if not guide_only and _relevant_tools is not None:
+    if not guide_only and _relevant_tools is not None and not _low_signal_turn:
        try:
            from services.memory.skills import SkillsManager
            from src.constants import DATA_DIR
@@ -2080,7 +2281,7 @@ async def stream_agent_loop(
    _model_supports_tools = any(kw in _model_lc for kw in (
        "gpt-4", "gpt-5", "gpt-o", "claude", "gemini", "gemma",
        "qwen3", "qwen2.5", "mixtral", "mistral", "llama-3.1", "llama-3.2",
-        "llama-3.3", "llama-4",
+        "llama-3.3", "llama-4", "llama3.1", "llama3.2", "llama3.3", "llama4",
        # Local-served models that follow OpenAI-style function calling
        # via vLLM's `--enable-auto-tool-choice`. Belt-and-suspenders
        # with the per-endpoint flag above.
@@ -2122,13 +2323,15 @@ async def stream_agent_loop(
        _is_api_model = False
    else:
        _is_api_model = any(h in endpoint_url for h in _API_HOSTS) or _model_supports_tools
    _compact_agent_prompt = _is_api_model or _is_ollama_native or _ollama_openai_compat
    messages, mcp_schemas = _build_system_prompt(
        messages, model, active_document, mcp_mgr, disabled_tools,
        needs_admin=_needs_admin, relevant_tools=_relevant_tools,
        mcp_disabled_map=_mcp_disabled_map,
-        compact=_is_api_model,
+        compact=_compact_agent_prompt,
        owner=owner,
        suppress_local_context=guide_only,
        suppress_skills=_low_signal_turn,
        active_email=active_email,
    )
    if plan_mode and not guide_only:
@@ -2214,6 +2417,14 @@ async def stream_agent_loop(
    # Strip internal metadata keys before sending to the LLM API
    messages = [{k: v for k, v in msg.items() if k != "_protected"} for msg in messages]
    agent_prompt_tokens = estimate_tokens(messages)
    logger.info(
        "[agent-timing] prep_done model=%s prompt_tokens=%s context_length=%s prep=%s",
        model,
        agent_prompt_tokens,
        context_length,
        {k: round(v, 3) for k, v in prep_timings.items()},
    )
    yield f"data: {json.dumps({'type': 'agent_prep', 'data': {k: round(v, 3) for k, v in prep_timings.items()}})}\n\n"
    full_response = ""
@@ -2358,6 +2569,19 @@ async def stream_agent_loop(
        # complementary cap for the rare stream that trickles bytes forever and
        # so never trips the inactivity timeout. Generous — only catches runaway.
        _round_deadline = time.time() + max(agent_stream_timeout * 4, 1200)
        _round_start = time.time()
        _round_first_event_logged = False
        _round_first_token_logged = False
        logger.info(
            "[agent-timing] round_start round=%s model=%s endpoint=%s prompt_tokens=%s tools=%s native_tools=%s timeout=%s",
            round_num,
            model,
            endpoint_url,
            estimate_tokens(messages),
            len(_tool_names_sent),
            bool(all_tool_schemas),
            agent_stream_timeout,
        )
        async for chunk in stream_llm_with_fallback(
            _candidates,
            messages,
@@ -2368,11 +2592,30 @@ async def stream_agent_loop(
            timeout=agent_stream_timeout,
            session_id=session_id,
        ):
            if not _round_first_event_logged:
                _round_first_event_logged = True
                logger.info(
                    "[agent-timing] first_event round=%s elapsed=%.3fs kind=%s",
                    round_num,
                    time.time() - _round_start,
                    "error" if chunk.startswith("event: error") else "data",
                )
            if time.time() > _round_deadline:
-                logger.warning(f"[agent] round {round_num} stream exceeded wall-clock deadline; cutting off")
+                logger.warning(
                    "[agent-timing] round_deadline round=%s elapsed=%.3fs deadline_s=%s",
                    round_num,
                    time.time() - _round_start,
                    max(agent_stream_timeout * 4, 1200),
                )
                break
            # Forward error events from stream_llm to the frontend
            if chunk.startswith("event: error"):
                logger.warning(
                    "[agent-timing] stream_error round=%s elapsed=%.3fs chunk=%r",
                    round_num,
                    time.time() - _round_start,
                    chunk[:500],
                )
                yield chunk
                continue
            if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
@@ -2452,6 +2695,15 @@ async def stream_agent_loop(
                        if not first_token_received:
                            time_to_first_token = time.time() - total_start
                            first_token_received = True
                        if not _round_first_token_logged:
                            _round_first_token_logged = True
                            logger.info(
                                "[agent-timing] first_visible_token round=%s elapsed=%.3fs total_elapsed=%.3fs thinking=%s",
                                round_num,
                                time.time() - _round_start,
                                time.time() - total_start,
                                bool(data.get("thinking")),
                            )
                        # Keep reasoning deltas in a separate accumulator so
                        # we can echo them back via `reasoning_content` on the
                        # next request (DeepSeek requires this; harmless for
@@ -2521,7 +2773,21 @@ async def stream_agent_loop(
                yield chunk
            # Intercept [DONE] — don't forward until all rounds finish
-        tool_blocks, used_native = _resolve_tool_blocks(round_response, native_tool_calls, round_num, is_api_model=_is_api_model)
+        logger.info(
            "[agent-timing] round_stream_done round=%s elapsed=%.3fs text_chars=%s tool_calls=%s first_event=%s first_token=%s",
            round_num,
            time.time() - _round_start,
            len(round_response),
            len(native_tool_calls),
            _round_first_event_logged,
            _round_first_token_logged,
        )
        tool_blocks, used_native = _resolve_tool_blocks(
            round_response,
            native_tool_calls,
            round_num,
            is_api_model=(_is_api_model and not guide_only),
        )
        # Force-answer round: we told the model to STOP calling tools and
        # answer. If it ignored that and emitted a (possibly DSML) tool
@@ -2605,7 +2871,7 @@ async def stream_agent_loop(
        # model with no real native_tool_calls) must not be stripped from the
        # persisted text either — otherwise it streams once and then disappears
        # on reload (#3222 follow-up).
-        cleaned_round = strip_tool_blocks(round_response, skip_fenced=(_is_api_model and not used_native)).strip()
+        cleaned_round = strip_tool_blocks(round_response, skip_fenced=(_is_api_model and not used_native and not guide_only)).strip()
        round_texts.append(cleaned_round)
        if not tool_blocks:
@@ -2677,6 +2943,15 @@ async def stream_agent_loop(
                _intent_nudge_count += 1
                _matched_phrase = _intent_match.group(0).strip()
                logger.info(f"[agent] intent-without-action nudge #{_intent_nudge_count} on round {round_num}: {_matched_phrase!r}")
                _lower_phrase = _matched_phrase.lower()
                _cookbook_log_hint = ""
                if any(_word in _lower_phrase for _word in ("log", "logs", "output", "tail", "status")):
                    _cookbook_log_hint = (
                        " If this is about a Cookbook/model serve, the concrete calls are: "
                        "`list_served_models` first, then `tail_serve_output` with the "
                        "session_id from the serve/list result. Never answer with "
                        "\"check logs\" when those tools are available."
                    )
                messages.append({
                    "role": "system",
                    "content": (
@@ -2685,6 +2960,7 @@ async def stream_agent_loop(
                        "see you announced the action but didn't run it, which "
                        "is the most frustrating thing you can do. "
                        "DO IT NOW: emit the actual function call this turn. "
                        f"{_cookbook_log_hint}"
                        "If you decided not to do it after all, say so plainly in "
                        "one sentence instead of restating the plan."
                    ),
@@ -174,8 +174,20 @@ async def subscribe(session_id: str) -> AsyncGenerator[str, None]:
            next_seq += 1
        if run.status != "running":
            return
        heartbeat_idx = 0
        while True:
-            seq, ev = await q.get()
+            try:
                seq, ev = await asyncio.wait_for(q.get(), timeout=10.0)
            except asyncio.TimeoutError:
                # Keep slow local models/proxies alive while they prefill before
                # the first token. SSE comments are ignored by the UI but reset
                # browser/proxy idle timers, which prevents "empty response"
                # disconnects on llama.cpp first-token latencies of 30s+.
                if run.status == "running":
                    heartbeat_idx += 1
                    yield f": heartbeat {heartbeat_idx}\n\n"
                    continue
                seq, ev = (None, None)
            if seq is None:            # end sentinel
                while next_seq < len(run.buffer):   # flush any tail the sentinel raced
                    yield run.buffer[next_seq]
@@ -7,6 +7,7 @@ from src.constants import MAX_OUTPUT_CHARS
 class WebSearchTool:
    async def execute(self, content: str, ctx: dict) -> dict:
        from src.search import comprehensive_web_search
        progress_cb = ctx.get("progress_cb") if isinstance(ctx, dict) else None
        raw = content.strip()
        query = raw
        time_filter = None
@@ -37,18 +38,39 @@ class WebSearchTool:
            elif " news" in q_lc or q_lc.startswith("news ") or q_lc.endswith(" news"):
                time_filter = "week"
        loop = asyncio.get_running_loop()
-        text, sources = await asyncio.wait_for(
+        if progress_cb:
-            loop.run_in_executor(
+            await progress_cb({
-                None,
+                "elapsed_s": 0,
-                lambda: comprehensive_web_search(
+                "tail": f"Searching web for: {query[:160]}",
-                    query,
+            })
-                    max_pages=max_pages,
+        try:
-                    time_filter=time_filter,
+            text, sources = await asyncio.wait_for(
-                    return_sources=True,
+                loop.run_in_executor(
                    None,
                    lambda: comprehensive_web_search(
                        query,
                        max_pages=max_pages,
                        time_filter=time_filter,
                        return_sources=True,
                    ),
                ),
-            ),
+                timeout=30,
-            timeout=30,
+            )
-        )
+        except asyncio.TimeoutError:
            return {
                "error": f"web_search timed out after 30s: {query[:200]}",
                "exit_code": 1,
            }
        except Exception as e:
            return {
                "error": f"web_search failed: {type(e).__name__}: {str(e) or 'no details'}",
                "exit_code": 1,
            }
        if progress_cb:
            await progress_cb({
                "elapsed_s": 30,
                "tail": "Search completed; preparing sources.",
            })
        output = text[:MAX_OUTPUT_CHARS] if len(text) > MAX_OUTPUT_CHARS else text
        if sources:
            output += "\n\n<!-- SOURCES:" + json.dumps(sources) + " -->"
@@ -76,8 +76,7 @@ async def action_consolidate_memory(owner: str, **kwargs) -> Tuple[str, bool]:
        import json
        import re
        from src.constants import DATA_DIR
-        from src.endpoint_resolver import resolve_endpoint
+        from src.llm_core import llm_call_async_with_fallback
        from src.llm_core import llm_call_async
        from src.memory import MemoryManager
        manager = MemoryManager(DATA_DIR)
@@ -116,10 +115,9 @@ async def action_consolidate_memory(owner: str, **kwargs) -> Tuple[str, bool]:
            if len(group_memories) < 2:
                return False
-            url, model, headers = resolve_endpoint("utility", owner=group_owner or None)
+            from src.task_endpoint import resolve_task_candidates
-            if not url or not model:
+            candidates = resolve_task_candidates(owner=group_owner or None)
-                url, model, headers = resolve_endpoint("default", owner=group_owner or None)
+            if not candidates:
            if not url or not model:
                return False
            try:
@@ -147,13 +145,11 @@ async def action_consolidate_memory(owner: str, **kwargs) -> Tuple[str, bool]:
                    "\"drop\":[{\"id\":\"existing id\",\"reason\":\"short reason\"}]}\n\n"
                    f"MEMORIES:\n{json.dumps(items, ensure_ascii=False)}"
                )
-                raw = await llm_call_async(
+                raw = await llm_call_async_with_fallback(
-                    url=url,
+                    candidates,
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0,
                    max_tokens=4096,
                    headers=headers,
                    timeout=120,
                )
                from src.text_helpers import strip_think
@@ -604,8 +600,7 @@ async def action_classify_events(owner: str, **kwargs) -> Tuple[str, bool]:
    try:
        from datetime import timedelta
        from core.database import SessionLocal, CalendarEvent
-        from src.endpoint_resolver import resolve_endpoint
+        from src.llm_core import llm_call_async_with_fallback
        from src.llm_core import llm_call_async
        import re as _re, json as _json
        db = SessionLocal()
@@ -620,10 +615,9 @@ async def action_classify_events(owner: str, **kwargs) -> Tuple[str, bool]:
            if not events:
                return "No upcoming events to classify", True
-            llm_url, llm_model, llm_headers = resolve_endpoint("utility", owner=owner)
+            from src.task_endpoint import resolve_task_candidates
-            if not llm_url:
+            llm_candidates = resolve_task_candidates(owner=owner)
-                llm_url, llm_model, llm_headers = resolve_endpoint("default", owner=owner)
+            llm_available = bool(llm_candidates)
            llm_available = bool(llm_url and llm_model)
            # Pull user memories so the LLM has personal context (relationships,
            # job, hobbies). Helps it know e.g. "<name> is your spouse" so their
@@ -699,11 +693,11 @@ async def action_classify_events(owner: str, **kwargs) -> Tuple[str, bool]:
                    f"EVENTS: {_json.dumps(items)}"
                )
                try:
-                    raw = await llm_call_async(
+                    raw = await llm_call_async_with_fallback(
-                        url=llm_url, model=llm_model,
+                        llm_candidates,
                        messages=[{"role": "user", "content": prompt}],
                        temperature=0.1, max_tokens=16384,
-                        headers=llm_headers, timeout=180,
+                        timeout=180,
                    )
                    from src.text_helpers import strip_think as _st
                    raw = _st(raw or "", prose=False, prompt_echo=False)
@@ -810,8 +804,7 @@ async def action_learn_sender_signatures(owner: str, **kwargs) -> Tuple[str, boo
        import asyncio as _aio
        from datetime import datetime as _dt, timedelta as _td
        from routes.email_helpers import _email_cache_owner_clause, _imap_connect, SCHEDULED_DB
-        from src.endpoint_resolver import resolve_endpoint
+        from src.llm_core import llm_call_async_with_fallback
        from src.llm_core import llm_call_async
        # 1. Pull recent UIDs + From headers cheaply (header-only fetch).
        def _pull_headers():
@@ -891,11 +884,11 @@ async def action_learn_sender_signatures(owner: str, **kwargs) -> Tuple[str, boo
        if not eligible:
            return "All sender sigs already cached (or no eligible senders)", True
-        url, model, headers = resolve_endpoint("utility", owner=owner)
+        from src.task_endpoint import resolve_task_candidates
-        if not url or not model:
+        candidates = resolve_task_candidates(owner=owner)
-            url, model, headers = resolve_endpoint("default", owner=owner)
+        if not candidates:
        if not url or not model:
            return "No LLM endpoint available", False
        model = candidates[0][1]
        analyzed = 0
        no_sig = 0
@@ -949,11 +942,11 @@ async def action_learn_sender_signatures(owner: str, **kwargs) -> Tuple[str, boo
            )
            try:
-                raw = await llm_call_async(
+                raw = await llm_call_async_with_fallback(
-                    url=url, model=model,
+                    candidates,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0, max_tokens=600,
-                    headers=headers, timeout=60,
+                    timeout=60,
                )
                from src.text_helpers import strip_think as _st
                sig = _st(raw or "", prose=False, prompt_echo=False).strip()
@@ -1137,7 +1130,6 @@ async def action_test_skills(owner: str, **kwargs) -> Tuple[str, bool]:
        from services.memory.skills import SkillsManager
        from src.constants import DATA_DIR
        from routes.skills_routes import _run_skill_test_once, _skill_test_task
        from src.endpoint_resolver import resolve_endpoint
        # #3 SCOPE GUARD: refuse to run on a None/empty owner — otherwise
        # `sm.load(owner=None)` returns every user's skills and we'd cross-
@@ -1152,27 +1144,40 @@ async def action_test_skills(owner: str, **kwargs) -> Tuple[str, bool]:
        if not names:
            raise TaskNoop("no skills to test")
-        url, model, headers = resolve_endpoint("default", owner=owner)
+        from src.task_endpoint import resolve_task_candidates
-        if not url or not model:
+        candidates = resolve_task_candidates(owner=owner)
        if not candidates:
            return "No Default/Utility model configured — set one in Settings.", False
        # #2 NO SILENT MODEL SWAP: if the configured model isn't served by the
        # endpoint, try a basename match — but fail loudly instead of grabbing
        # `avail[0]` which could be an embedding-only model and produce 36
        # garbage transcripts → 36 'unknown' verdicts with no hint why.
        url, model, headers = candidates[0]
        try:
            from src.llm_core import list_model_ids
-            avail = list_model_ids(url, headers=headers)
+            import os as _os
-            if avail and model not in avail:
+
-                import os as _os
+            selected = None
-                base = _os.path.basename((model or "").rstrip("/"))
+            mismatch_notes = []
-                m = next((a for a in avail if _os.path.basename(a.rstrip("/")) == base), None)
+            for cand_url, cand_model, cand_headers in candidates:
-                if m:
+                avail = list_model_ids(cand_url, headers=cand_headers)
-                    model = m
+                if not avail or cand_model in avail:
-                else:
+                    selected = (cand_url, cand_model, cand_headers)
-                    return (f"Default model '{model}' not served by endpoint {url}. "
+                    break
-                            f"Available: {', '.join(avail[:8])}{'…' if len(avail) > 8 else ''}. "
+                base = _os.path.basename((cand_model or "").rstrip("/"))
-                            "Set a valid Default model in Settings."), False
+                matched = next((a for a in avail if _os.path.basename(a.rstrip("/")) == base), None)
                if matched:
                    selected = (cand_url, matched, cand_headers)
                    break
                mismatch_notes.append(
                    f"{cand_model} not served by {cand_url}; available: "
                    f"{', '.join(avail[:8])}{'...' if len(avail) > 8 else ''}"
                )
            if selected:
                url, model, headers = selected
            elif mismatch_notes:
                return "No configured task fallback model is served. " + " | ".join(mismatch_notes[:3]), False
        except Exception as _e:
            logger.warning(f"test_skills model resolve check failed (continuing): {_e}")
@@ -1483,7 +1488,6 @@ async def action_check_email_urgency(owner: str, **kwargs) -> Tuple[str, bool]:
        from pathlib import Path as _P
        from core.database import SessionLocal as _SL, EmailAccount as _EA
        from routes.email_helpers import _imap_connect, _decode_header
        from src.endpoint_resolver import resolve_endpoint, resolve_utility_fallback_candidates
        from src.llm_core import llm_call_async_with_fallback
        # Per-owner state file so multi-user runs don't clobber each other's
@@ -1505,12 +1509,10 @@ async def action_check_email_urgency(owner: str, **kwargs) -> Tuple[str, bool]:
        # ── 1. Resolve LLM candidates (utility primary + utility fallbacks; fall
        # through to default chat as a last resort).
-        url, model, headers = resolve_endpoint("utility", owner=owner)
+        from src.task_endpoint import resolve_task_candidates
-        if not url or not model:
+        candidates = resolve_task_candidates(owner=owner)
-            url, model, headers = resolve_endpoint("default", owner=owner)
+        if not candidates:
        if not url or not model:
            return "No LLM endpoint available", False
        candidates = [(url, model, headers)] + resolve_utility_fallback_candidates(owner=owner)
        # ── 2. Enumerate enabled accounts. Match this task's owner AND fall
        # back to the legacy "unowned account whose imap_user / from_address
@@ -4,7 +4,7 @@ import os
 from src.runtime_paths import get_app_root, get_default_data_dir
-APP_VERSION = "1.0.0"
+APP_VERSION = "1.0.1"
 # Base paths
 BASE_DIR = os.path.join(get_app_root(), "")
@@ -424,6 +424,9 @@ def resolve_utility_fallback_candidates(owner: Optional[str] = None) -> list:
        settings = load_settings()
        utility_ep = (get_user_setting("utility_endpoint_id", owner or "", settings.get("utility_endpoint_id", "")) or "").strip()
        if not utility_ep:
            utility_chain = get_user_setting("utility_model_fallbacks", owner or "", settings.get("utility_model_fallbacks") or []) or []
            if utility_chain:
                return _resolve_fallback_candidates("utility_model_fallbacks", owner=owner)
            return _resolve_fallback_candidates("default_model_fallbacks", owner=owner)
    except Exception:
        pass
@@ -907,7 +907,10 @@ def _anthropic_rejects_temperature(model: str) -> bool:
    return (int(match.group(1)), int(match.group(2))) >= (4, 7)
 # Models that support structured thinking — may output </think> without opening tag
-_THINKING_MODEL_PATTERNS = ("qwen3", "qwq", "deepseek-r1", "deepseek-reasoner", "minimax", "m2-reap", "gemma")
+_THINKING_MODEL_PATTERNS = (
    "qwen3", "qwq", "deepseek-r1", "deepseek-reasoner", "minimax",
    "m2-reap", "gemma", "stepfun", "step-3", "step3",
 )
 def _supports_thinking(model: str) -> bool:
    """Check if model supports structured thinking output."""
@@ -2135,6 +2138,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
                                            yield _stream_delta_event(reasoning, thinking=True)
                                        content = delta.get("content") or ""
                                        if content:
                                            content = re.sub(r"<mm:think(\s+[^>]*)?>", r"<think\1>", content, flags=re.IGNORECASE)
                                            content = re.sub(r"</mm:think>", "</think>", content, flags=re.IGNORECASE)
                                            stripped = content.lstrip()
                                            # gpt-oss harmony format (<|channel|>analysis/final): route via the harmony
                                            # stream router. Sticky once the first marker appears — distinct from the
@@ -1,6 +1,11 @@
-"""Shared resolver for background-task AI endpoint (auto-naming, memory, sorting)."""
+"""Shared resolver for background-task AI endpoints."""
-from src.endpoint_resolver import resolve_endpoint
+from src.endpoint_resolver import (
    resolve_chat_fallback_candidates,
    resolve_endpoint,
    resolve_utility_fallback_candidates,
 )
 from src.llm_core import llm_call_async_with_fallback
 def resolve_task_endpoint(fallback_url=None, fallback_model=None, fallback_headers=None, owner=None):
@@ -11,3 +16,60 @@ def resolve_task_endpoint(fallback_url=None, fallback_model=None, fallback_heade
    endpoint cannot be resolved.
    """
    return resolve_endpoint("task", fallback_url, fallback_model, fallback_headers, owner=owner)
 def resolve_task_candidates(
    fallback_url=None,
    fallback_model=None,
    fallback_headers=None,
    owner=None,
 ):
    """Build the ordered, de-duplicated LLM candidate list for background tasks.

    Candidates are offered in this order; the first occurrence of a given
    (url, model) pair wins and later duplicates are ignored:
    1. the "task" endpoint from resolve_task_endpoint (which receives the
       caller-supplied fallback url/model/headers)
    2. the "utility" endpoint
    3. the "default" endpoint
    4. every entry from resolve_utility_fallback_candidates
    5. every entry from resolve_chat_fallback_candidates

    Entries missing a url or a model are skipped. Returns a list of
    (url, model, headers) tuples where headers is always a dict.
    """
    picked = []
    # Parallel list of (url, model) pairs already accepted, used for dedupe.
    taken = []

    def _offer(url, model, headers):
        if not (url and model):
            return
        pair = (url, model)
        if pair in taken:
            return
        taken.append(pair)
        picked.append((url, model, headers or {}))

    _offer(*resolve_task_endpoint(fallback_url, fallback_model, fallback_headers, owner=owner))
    for role in ("utility", "default"):
        _offer(*resolve_endpoint(role, owner=owner))
    for chain in (resolve_utility_fallback_candidates, resolve_chat_fallback_candidates):
        for url, model, headers in chain(owner=owner):
            _offer(url, model, headers)
    return picked
 async def task_llm_call_async(
    messages,
    *,
    fallback_url=None,
    fallback_model=None,
    fallback_headers=None,
    owner=None,
    **kwargs,
 ):
    """Send `messages` through the shared background-task candidate chain.

    The candidate list comes from resolve_task_candidates (seeded with the
    caller's fallback url/model/headers and owner). Any extra keyword
    arguments are forwarded unchanged to llm_call_async_with_fallback.

    Raises:
        RuntimeError: when no candidate endpoint could be resolved.
    """
    chain = resolve_task_candidates(
        fallback_url=fallback_url,
        fallback_model=fallback_model,
        fallback_headers=fallback_headers,
        owner=owner,
    )
    if chain:
        return await llm_call_async_with_fallback(chain, messages=messages, **kwargs)
    raise RuntimeError("No LLM endpoint available for background task")
@@ -886,6 +886,14 @@ class TaskScheduler:
                    owner=task.owner,
                    body=run.result if output == "notification" else None,
                )
            elif run.status == "error":
                self.add_notification(
                    task.name,
                    "error",
                    task_id,
                    owner=task.owner,
                    body=run.error or run.result,
                )
            # Log result to the assistant chat so all task activity is visible.
            # Skip skipped/error rows — user shouldn't see "skipped: …" noise
@@ -1468,12 +1476,18 @@ class TaskScheduler:
            )
        except Exception as e:
            logger.warning(f"Agent loop failed for task '{task.name}', falling back to simple call: {e}")
-            from src.llm_core import llm_call_async
+            from src.task_endpoint import task_llm_call_async
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": task.prompt},
            ]
-            result = await llm_call_async(url=endpoint_url, model=model, messages=messages, timeout=120)
+            result = await task_llm_call_async(
                messages,
                fallback_url=endpoint_url,
                fallback_model=model,
                owner=task.owner,
                timeout=120,
            )
        # Strip the model's chain-of-thought before saving/delivering. Task
        # output is LLM-only, so prose=True (which also removes untagged
@@ -1698,13 +1712,17 @@ class TaskScheduler:
        # Honor per-task max_steps (defense against runaway agent loops).
        # Falls back to 20 if not set — the historical default.
        _task_max_rounds = task.max_steps if task.max_steps and task.max_steps > 0 else 20
-        # Tasks are background workloads — they share the Utility model's
+        # Tasks are background workloads: use the shared task fallback chain
-        # fallback chain (Settings → Utility Model → Fallbacks). A downed
+        # behind the primary endpoint so a downed primary won't silently yield
-        # primary endpoint won't silently yield `(no output)` — same recipe
+        # `(no output)`.
        # chat uses but with the utility list (`utility_model_fallbacks`).
        try:
-            from src.endpoint_resolver import resolve_utility_fallback_candidates
+            from src.task_endpoint import resolve_task_candidates
-            _task_fallbacks = resolve_utility_fallback_candidates(owner=task.owner or None)
+            _task_fallbacks = resolve_task_candidates(
                fallback_url=endpoint_url,
                fallback_model=model,
                fallback_headers=headers,
                owner=task.owner or None,
            )[1:]
        except Exception:
            _task_fallbacks = []
        async for event_str in stream_agent_loop(
@@ -1741,21 +1759,22 @@ class TaskScheduler:
        # asking it to summarize what it did. Guarantees output.
        if not full_text.strip():
            try:
-                from src.llm_core import llm_call_async_with_fallback
+                from src.task_endpoint import task_llm_call_async
                from src.endpoint_resolver import resolve_utility_fallback_candidates
                grace_context = "You ran out of steps. "
                if tool_results:
                    grace_context += "Here's what your tools returned:\n" + "\n".join(tool_results[-5:])
                else:
                    grace_context += "No tool results were captured."
                grace_context += "\n\nSummarize what you accomplished and what's still pending. Be concise."
-                _grace_candidates = [(endpoint_url, model, headers)] + resolve_utility_fallback_candidates(owner=task.owner or None)
+                full_text = await task_llm_call_async(
                full_text = await llm_call_async_with_fallback(
                    _grace_candidates,
                    messages=[
                        {"role": "system", "content": system_content},
                        {"role": "user", "content": grace_context},
                    ],
                    fallback_url=endpoint_url,
                    fallback_model=model,
                    fallback_headers=headers,
                    owner=task.owner or None,
                    timeout=30,
                )
                full_text = (full_text or "").strip()
@@ -1268,8 +1268,8 @@ async def do_manage_settings(content: str, owner: Optional[str] = None) -> Dict:
            _ALIASES = {
                "shell": ["bash"],
                "terminal": ["bash"],
-                "search": ["web_search"],
+                "search": ["web_search", "web_fetch"],
-                "web": ["web_search"],
+                "web": ["web_search", "web_fetch"],
                "browser": ["builtin_browser"],
                "documents": ["create_document", "edit_document", "update_document", "suggest_document"],
                "doc": ["create_document", "edit_document", "update_document", "suggest_document"],
@@ -1281,7 +1281,7 @@ async def do_manage_settings(content: str, owner: Optional[str] = None) -> Dict:
                "notes": ["manage_notes"],
                "calendar": ["manage_calendar"],
                "email": ["mcp__email__list_emails", "mcp__email__read_email", "mcp__email__send_email"],
-                "research": ["web_search"],  # research is a per-request flag, not a tool — closest analog
+                "research": ["web_search", "web_fetch"],  # research is a per-request flag, not a tool — closest analog
            }
            if action == "list_tools":
@@ -2863,13 +2863,25 @@ async def do_serve_model(content: str, owner: Optional[str] = None) -> Dict:
                endpoint_added=endpoint_added, endpoint_id=endpoint_id or "",
            )
            note = "" if registered else " (state-write failed — task may not show in UI)"
            where = host or "local"
            log_path = f"/tmp/odysseus-tmux/{sid}.log"
            return {
-                "output": f"Serving {repo_id} (session: {sid}){note}",
+                "output": (
                    f"Serving {repo_id} on {where} (session: {sid}){note}\n"
                    f"Next required check: call list_served_models. If this task is not ready, "
                    f"call tail_serve_output with session_id={sid} and tail=400 before answering. "
                    f"Do not tell the user to check logs; you have the log tool."
                ),
                "session_id": sid,
                "task_type": "serve",
                "phase": "running",
                "host": host,
                "endpoint_id": endpoint_id,
                "log_path": log_path,
                "next_tools": [
                    {"name": "list_served_models", "arguments": {}},
                    {"name": "tail_serve_output", "arguments": {"session_id": sid, "tail": 400}},
                ],
                "exit_code": 0,
            }
        # FastAPI HTTPException puts the message under `detail`, not `error`.
@@ -3216,8 +3228,17 @@ async def do_tail_serve_output(content: str, owner: Optional[str] = None) -> Dic
        MAX_CHARS = 8000
        if len(output_text) > MAX_CHARS:
            output_text = "…(earlier output truncated)…\n" + output_text[-MAX_CHARS:]
        if not output_text:
            output_text = (
                f"No log output captured yet for {session_id} on {host_label}. "
                "This usually means the tmux wrapper has started but the model process "
                "has not printed anything yet. Do not stop here: call list_served_models "
                "again to check whether it is still loading, ready, or crashed; if it is "
                "still not ready, call tail_serve_output again with a larger tail after "
                "the next status check."
            )
        return {
-            "output": output_text or "(empty pane)",
+            "output": output_text,
            "session_id": session_id,
            "host": host_label,
            "tail_lines": tail,
@@ -39,6 +39,10 @@ _XML_TOOL_CALL_RE = re.compile(
    r"<(?:[\w]+:)?(?:tool_call|function_call)>\s*([\s\S]*?)</(?:[\w]+:)?(?:tool_call|function_call)>",
    re.IGNORECASE,
 )
 _XML_OPEN_TOOL_CALL_RE = re.compile(
    r"<(?:[\w]+:)?(?:tool_call|function_call)>\s*([\s\S]*)\Z",
    re.IGNORECASE,
 )
 _XML_INVOKE_RE = re.compile(
    r'<invoke\s+name=["\'](\w+)["\']>\s*([\s\S]*?)</invoke>',
    re.IGNORECASE,
@@ -47,6 +51,21 @@ _XML_PARAM_RE = re.compile(
    r'<parameter\s+name=["\'](\w+)["\']>([\s\S]*?)</parameter>',
    re.IGNORECASE,
 )
 _XML_DIRECT_TOOL_RE = re.compile(
    r"<\s*([A-Za-z_][\w-]*)\s*>([\s\S]*?)</\s*\1\s*>",
    re.IGNORECASE,
 )
 # Pattern 3b: StepFun Step-3.x native tool-call tokens. The tokenizer defines:
 #   <｜tool▁calls▁begin｜> ... <｜tool▁calls▁end｜>
 #   <｜tool▁call▁begin｜>tool_name<｜tool▁sep｜>{...}<｜tool▁call▁end｜>
 # These can leak as text through llama.cpp/Ollama-style endpoints when the
 # engine does not return structured OpenAI tool_calls.
 _STEPFUN_CALL_BEGIN = "<｜tool▁call▁begin｜>"
 _STEPFUN_CALL_SEP = "<｜tool▁sep｜>"
 _STEPFUN_CALL_END = "<｜tool▁call▁end｜>"
 _STEPFUN_CALLS_BEGIN = "<｜tool▁calls▁begin｜>"
 _STEPFUN_CALLS_END = "<｜tool▁calls▁end｜>"
 # Pattern 4: <tool_code> blocks (MiniMax-M2.5 style)
 # {tool => 'tool_name', args => '<param>value</param>'}
@@ -446,6 +465,138 @@ def _parse_xml_invoke(inv_match) -> Optional[ToolBlock]:
    return function_call_to_tool_block(tool_name, json.dumps(params))
 def _parse_xml_direct_tool(tool_match) -> Optional[ToolBlock]:
    """Parse a direct XML tool tag found inside <tool_call>.

    Some local models emit:
      <tool_call><web_search>query</web_search></tool_call>
    instead of the invoke/parameter shape:
      <tool_call><invoke name="web_search"><parameter name="query">query</parameter></invoke></tool_call>

    This stays a thin adapter over the canonical function-call converter so
    aliases and per-tool argument formatting live in one place.

    Args:
        tool_match: regex match whose group(1) is the tag name and group(2)
            is the tag body.

    Returns:
        The result of function_call_to_tool_block for the mapped tool, or
        None when the tag is a structural wrapper (invoke/parameter/
        tool_call/function_call), names an unknown tool, or has an empty body.
    """
    tool_name = tool_match.group(1).lower().replace("-", "_")
    if tool_name in {"invoke", "parameter", "tool_call", "function_call"}:
        return None
    mapped = _TOOL_NAME_MAP.get(tool_name) or (tool_name if tool_name in TOOL_TAGS else None)
    if not mapped:
        return None
    body = tool_match.group(2).strip()
    if not body:
        return None
    try:
        params = json.loads(body)
    except json.JSONDecodeError:
        params = None
    if not isinstance(params, dict):
        # The body is either plain text or valid JSON that is not an object
        # (a bare number, quoted string, list, ...). Treat both the same way:
        # the raw body becomes the tool's primary argument. Previously a
        # non-object JSON body such as "2024" was replaced by {} and the
        # argument was silently lost.
        raw_body_key = {
            "web_search": "query",
            "web_fetch": "url",
            "bash": "command",
            "python": "code",
            "read_file": "path",
            "write_file": "path",
        }.get(mapped, "content")
        params = {raw_body_key: body}
    from src.tool_schemas import function_call_to_tool_block
    return function_call_to_tool_block(mapped, json.dumps(params))
 def _iter_stepfun_tool_calls(text: str):
    """Yield (tool_name, body) pairs for StepFun native tool-call tokens.

    Scans with literal str.find calls only (no regex). Each call is delimited
    by the call-begin token, the separator token, and the call-end token, in
    that order. Scanning stops at the first call that lacks a separator or an
    end token. Names that are empty after stripping, or longer than 128
    characters, are skipped without stopping the scan.
    """
    begin_len = len(_STEPFUN_CALL_BEGIN)
    sep_len = len(_STEPFUN_CALL_SEP)
    end_len = len(_STEPFUN_CALL_END)
    cursor = 0
    while True:
        opened = text.find(_STEPFUN_CALL_BEGIN, cursor)
        if opened == -1:
            break
        name_from = opened + begin_len
        sep_at = text.find(_STEPFUN_CALL_SEP, name_from)
        if sep_at == -1:
            break
        body_from = sep_at + sep_len
        closed = text.find(_STEPFUN_CALL_END, body_from)
        if closed == -1:
            break
        name = text[name_from:sep_at].strip()
        if 0 < len(name) <= 128:
            yield name, text[body_from:closed].strip()
        cursor = closed + end_len
 def _strip_stepfun_tool_markup(text: str) -> str:
    """Drop StepFun tool-call blocks and their outer wrapper tokens.

    Every complete call-begin ... call-end span is removed using literal
    scans. A call-begin token with no matching call-end leaves the rest of
    the text in place. Afterwards the calls-begin / calls-end wrapper tokens
    are deleted wherever they remain.
    """
    kept = []
    cursor = 0
    while True:
        opened = text.find(_STEPFUN_CALL_BEGIN, cursor)
        if opened >= 0:
            closed = text.find(_STEPFUN_CALL_END, opened + len(_STEPFUN_CALL_BEGIN))
        else:
            closed = -1
        if closed < 0:
            # No further complete block: keep the remainder verbatim.
            kept.append(text[cursor:])
            break
        kept.append(text[cursor:opened])
        cursor = closed + len(_STEPFUN_CALL_END)
    result = "".join(kept)
    for wrapper in (_STEPFUN_CALLS_BEGIN, _STEPFUN_CALLS_END):
        result = result.replace(wrapper, "")
    return result
 def _strip_bare_invoke_markup(text: str) -> str:
    """Remove bare <invoke ...>...</invoke> blocks without regex backtracking."""
    out = []
    pos = 0
    while True:
        start = text.lower().find("<invoke", pos)
        if start < 0:
            out.append(text[pos:])
            break
        tag_end = text.find(">", start)
        if tag_end < 0:
            out.append(text[pos:])
            break
        close = text.lower().find("</invoke>", tag_end + 1)
        if close < 0:
            out.append(text[pos:])
            break
        out.append(text[pos:start])
        pos = close + len("</invoke>")
    return "".join(out)
 def _parse_stepfun_tool_call(tool_name: str, body: str) -> Optional[ToolBlock]:
    """Parse StepFun native tool-call tokens into an Odysseus ToolBlock.

    Args:
        tool_name: tool name as emitted by the model; it is lowercased and
            "-" / "." are replaced with "_" before lookup.
        body: text found after the separator token, normally a JSON object
            of arguments.

    Returns:
        The result of function_call_to_tool_block for the mapped tool, or
        None when the tool name is unknown or the body is empty.
    """
    tool_name = tool_name.lower().replace("-", "_").replace(".", "_")
    mapped = _TOOL_NAME_MAP.get(tool_name) or (tool_name if tool_name in TOOL_TAGS else None)
    if not mapped:
        return None
    body = (body or "").strip()
    if not body:
        return None
    try:
        params = json.loads(body)
    except json.JSONDecodeError:
        params = None
    if not isinstance(params, dict):
        # The body is either plain text or valid JSON that is not an object
        # (a bare number, quoted string, list, ...). Treat both the same way:
        # the raw body becomes the tool's primary argument. Previously a
        # non-object JSON body was replaced by {} and the argument was lost.
        raw_body_key = {
            "web_search": "query",
            "web_fetch": "url",
            "bash": "command",
            "python": "code",
            "read_file": "path",
            "write_file": "path",
        }.get(mapped, "content")
        params = {raw_body_key: body}
    from src.tool_schemas import function_call_to_tool_block
    return function_call_to_tool_block(mapped, json.dumps(params))
 def _parse_tool_code_block(raw: str) -> Optional[ToolBlock]:
    """Parse a <tool_code>{tool => 'name', args => '...'}</tool_code> block (MiniMax style)."""
    # Extract tool name
@@ -511,8 +662,9 @@ def parse_tool_blocks(text: str, skip_fenced: bool = False) -> List[ToolBlock]:
    2. [TOOL_CALL] ... [/TOOL_CALL] blocks (some models)
    3. XML-style <tool_call>/<invoke> blocks
    4. <tool_code> blocks (MiniMax-M2.5 style)
-    5. DeepSeek DSML markup (normalized to <invoke> first)
+    5. StepFun Step-3 native <｜tool▁call▁begin｜> tokens
-    6. Non-native local model fallback: prose mentioning web_search followed by
+    6. DeepSeek DSML markup (normalized to <invoke> first)
    7. Non-native local model fallback: prose mentioning web_search followed by
       bare JSON args, e.g. {"query":"...", "time_filter":"week"}
    `skip_fenced`: when True, Pattern 1 (fenced ```bash/```python/```json code
@@ -567,12 +719,38 @@ def parse_tool_blocks(text: str, skip_fenced: bool = False) -> List[ToolBlock]:
    # Pattern 3: XML-style <tool_call>/<invoke> blocks
    if not blocks:
        for tool_name, body in _iter_stepfun_tool_calls(text):
            block = _parse_stepfun_tool_call(tool_name, body)
            if block:
                blocks.append(block)
        if blocks:
            return blocks
        # Try wrapped: <tool_call><invoke ...>...</invoke></tool_call>
        for m in _XML_TOOL_CALL_RE.finditer(text):
            for inv in _XML_INVOKE_RE.finditer(m.group(1)):
                block = _parse_xml_invoke(inv)
                if block:
                    blocks.append(block)
            if not blocks:
                for direct in _XML_DIRECT_TOOL_RE.finditer(m.group(1)):
                    block = _parse_xml_direct_tool(direct)
                    if block:
                        blocks.append(block)
        # Some local models stream an opening <tool_call> wrapper and a
        # complete inner tool tag, but forget the closing </tool_call>.
        if not blocks:
            for m in _XML_OPEN_TOOL_CALL_RE.finditer(text):
                body = m.group(1)
                for inv in _XML_INVOKE_RE.finditer(body):
                    block = _parse_xml_invoke(inv)
                    if block:
                        blocks.append(block)
                if blocks:
                    break
                for direct in _XML_DIRECT_TOOL_RE.finditer(body):
                    block = _parse_xml_direct_tool(direct)
                    if block:
                        blocks.append(block)
        # Try bare <invoke> without wrapper
        if not blocks:
            for inv in _XML_INVOKE_RE.finditer(text):
@@ -614,7 +792,9 @@ def strip_tool_blocks(text: str, skip_fenced: bool = False) -> str:
    text = _normalize_dsml(text)
    cleaned = text if skip_fenced else _TOOL_BLOCK_RE.sub('', text)
    cleaned = _TOOL_CALL_RE.sub('', cleaned)
    cleaned = _strip_stepfun_tool_markup(cleaned)
    cleaned = _XML_TOOL_CALL_RE.sub('', cleaned)
    cleaned = _XML_OPEN_TOOL_CALL_RE.sub('', cleaned)
    cleaned = _TOOL_CODE_RE.sub('', cleaned)
    if not skip_fenced:
        raw_web_json = _parse_raw_web_json_lookup(cleaned)
@@ -622,6 +802,6 @@ def strip_tool_blocks(text: str, skip_fenced: bool = False) -> str:
            _, (start, end) = raw_web_json
            cleaned = cleaned[:start] + cleaned[end:]
    # Strip bare <invoke> blocks not wrapped in <tool_call>
-    cleaned = re.sub(r'<invoke\s+name=["\'].*?</invoke>', '', cleaned, flags=re.DOTALL | re.IGNORECASE)
+    cleaned = _strip_bare_invoke_markup(cleaned)
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    return cleaned.strip()
@@ -879,7 +879,7 @@
          <span class="grow">Library</span>
          <button type="button" class="list-item-plus-btn" id="library-new-doc-btn" title="New document">
            <svg class="list-item-plus-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round" style="width:11px;height:11px;"><line x1="12" y1="5" x2="12" y2="19"/><line x1="5" y1="12" x2="19" y2="12"/></svg>
-            <span class="list-item-plus-label">new</span>
+            <span class="list-item-plus-label">document</span>
          </button>
        </div>
        <div class="list-item" id="tool-notes-btn">
@@ -1005,7 +1005,12 @@
          <button type="button" class="model-picker-btn" id="model-picker-btn" title="Switch model"><span id="model-picker-label">Select model</span> <svg width="10" height="10" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="3" stroke-linecap="round" stroke-linejoin="round"><polyline points="6 15 12 9 18 15"/></svg></button>
          <div class="model-picker-menu hidden" id="model-picker-menu">
            <div class="model-picker-search-row">
-              <input type="text" id="model-picker-search" placeholder="Search models..." autocomplete="off" aria-label="Search models">
+              <div class="model-picker-search-wrap">
                <input type="text" id="model-picker-search" placeholder="Search models..." autocomplete="off" aria-label="Search models">
                <button type="button" class="model-picker-refresh-btn" id="model-picker-refresh-btn" title="Refresh model picker" aria-label="Refresh model picker">
                  <svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.4" stroke-linecap="round" stroke-linejoin="round"><polyline points="23 4 23 10 17 10"/><path d="M20.49 15a9 9 0 1 1-2.12-9.36L23 10"/></svg>
                </button>
              </div>
              <button type="button" class="model-picker-action-btn primary" id="model-picker-add-models-btn" title="Add model endpoints" aria-label="Add model endpoints">
                <svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.4" stroke-linecap="round" stroke-linejoin="round"><path d="M12 5v14"/><path d="M5 12h14"/></svg>
              </button>
@@ -571,6 +571,24 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
    let timeoutId = null;
    let responseTimeoutCleared = false;
    let clearResponseTimeout = () => {};
    let firstTokenWaitTimers = [];
    // Cancel every pending "still waiting" spinner update and forget the handles.
    const clearFirstTokenWaitTimers = () => {
      for (const handle of firstTokenWaitTimers) {
        try {
          clearTimeout(handle);
        } catch (_) {
          // Best-effort: a failed clear must not stop the remaining timers
          // from being cancelled.
        }
      }
      firstTokenWaitTimers = [];
    };
    // Arm the escalating spinner messages shown while no token has arrived.
    // Any previously armed timers are cancelled first.
    const scheduleFirstTokenWaitMessages = () => {
      clearFirstTokenWaitTimers();

      // Update the spinner only if the request is still waiting: nothing has
      // been accumulated, the spinner is still mounted, and the request has
      // not been aborted.
      const showIfStillWaiting = (message) => {
        if (accumulated) return;
        if (!spinner || !spinner.element) return;
        if (currentAbort && currentAbort.signal.aborted) return;
        spinner.updateMessage(message);
      };

      const schedule = [
        { delayMs: 20000, message: 'Still waiting for first token' },
        { delayMs: 60000, message: 'Large local model is pre-filling context' },
        { delayMs: 120000, message: 'Still working - no tokens yet from the model' },
      ];

      firstTokenWaitTimers = schedule.map(({ delayMs, message }) =>
        setTimeout(() => showIfStillWaiting(message), delayMs));
    };
    const clearProcessingProbe = () => {
      if (processingProbeTimer) {
        clearTimeout(processingProbeTimer);
@@ -921,56 +939,7 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
        setTimeout(() => spinner.updateMessage('Analyzing sources'), 1500);
      } else {
        spinner.updateMessage('Processing request');
-        const endpointUrlForProbe = sessionModule.getCurrentEndpointUrl ? sessionModule.getCurrentEndpointUrl() : null;
+        scheduleFirstTokenWaitMessages();
        if (endpointUrlForProbe && modelName) {
          processingProbeTimer = setTimeout(async () => {
            processingProbeTimer = null;
            if (accumulated || !spinner || !spinner.element || (currentAbort && currentAbort.signal.aborted)) return;
            processingProbeAbort = new AbortController();
            try {
              spinner.updateMessage('Checking model endpoint');
              const status = await _probeCurrentEndpointStatus(endpointUrlForProbe, processingProbeAbort.signal);
              if (accumulated || !spinner || !spinner.element || (currentAbort && currentAbort.signal.aborted)) return;
              if (!status) {
                spinner.updateMessage('Still waiting for model');
              } else if (status.alive) {
                const latency = status.latency_ms ? ` (${status.latency_ms}ms)` : '';
                spinner.updateMessage(`Endpoint online${latency}; waiting for first token`);
              } else {
                // Probe confirms the endpoint isn't responding. Don't
                // sit on a hung fetch — give the user 5s to read the
                // status, then auto-abort with reason='offline' so the
                // catch handler shows a clean "switch model" message
                // instead of leaving the spinner spinning forever.
                if (status.error) console.warn('Model endpoint probe failed:', status.error);
                let _countdown = 5;
                spinner.updateMessage(`Endpoint offline — cancelling in ${_countdown}s`);
                const _tick = setInterval(() => {
                  _countdown--;
                  if (!spinner || !spinner.element || (currentAbort && currentAbort.signal.aborted) || accumulated) {
                    clearInterval(_tick);
                    return;
                  }
                  if (_countdown > 0) {
                    spinner.updateMessage(`Endpoint offline — cancelling in ${_countdown}s`);
                  } else {
                    clearInterval(_tick);
                    if (currentAbort && !currentAbort.signal.aborted) {
                      currentAbort._reason = 'offline';
                      currentAbort.abort();
                    }
                  }
                }, 1000);
              }
            } catch (e) {
              if (e && e.name !== 'AbortError' && spinner && spinner.element && !accumulated) {
                spinner.updateMessage('Still waiting for model');
              }
            } finally {
              processingProbeAbort = null;
            }
          }, 10000);
        }
      }
      const researchBtn = el('research-toggle-btn');
@@ -1150,6 +1119,11 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
        uiModule.scrollHistory();
      }
      // Swap the thinking spinner for one carrying `label`: calls
      // _removeThinkingSpinner() and then _showThinkingSpinner(label).
      function _replaceThinkingSpinner(label) {
        // Remove first — presumably so two spinners are never stacked; confirm
        // against _showThinkingSpinner.
        _removeThinkingSpinner();
        _showThinkingSpinner(label);
      }
      // Auto-show thinking spinner after text stops streaming
      let _textPauseTimer = null;
      function _scheduleThinkingSpinner() {
@@ -1173,10 +1147,24 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
      let _liveThinkHeader = null;
      let _liveThinkSpinnerSlot = null;
      let _liveThinkTimerEl = null;
      let _liveThinkTokenCount = 0;
      let _liveThinkToggle = null;
      let _liveThinkDomId = null;
      // Rough token estimate for thinking text: one token per four characters
      // of the trimmed text, rounded up. Empty or whitespace-only text is 0.
      function _estimateThinkingTokens(text) {
        const trimmedLength = (text || '').trim().length;
        if (trimmedLength === 0) return 0;
        const roughTokens = Math.ceil(trimmedLength / 4);
        return roughTokens < 1 ? 1 : roughTokens;
      }
      // Build the thinking-header stats label, e.g. "3.2s · 120 tok".
      // A falsy `seconds` or `tokenCount` drops that part; with both missing
      // the result is the empty string.
      function _formatThinkStats(seconds, tokenCount) {
        const pieces = [];
        if (seconds) pieces.push(seconds + 's');
        if (tokenCount) pieces.push(tokenCount + ' tok');
        return pieces.join(' · ');
      }
      function _replyAfterClosedThinking(text) {
        text = markdownModule.normalizeThinkingMarkup(text || '');
        const closeRe = /<\/(?:think(?:ing)?|thought)>|<channel\|>/gi;
        let match = null;
        let last = null;
@@ -1187,7 +1175,7 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
      // Direct render helper for streaming text
      _renderStream = () => {
-        let dt = stripToolBlocks(roundText);
+        let dt = markdownModule.normalizeThinkingMarkup(stripToolBlocks(roundText));
        const bodyEl = roundHolder.querySelector('.body');
        const contentEl = _ensureStreamLayout(bodyEl);
@@ -1277,6 +1265,12 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
      let _nextIsError = false;
      let _streamSawDone = false;
      let _firstVisibleOutputSeen = false;
      // One-shot latch: the first call records that output has been seen and
      // cancels the pending first-token wait messages; later calls do nothing.
      const markFirstVisibleOutput = () => {
        if (!_firstVisibleOutputSeen) {
          _firstVisibleOutputSeen = true;
          clearFirstTokenWaitTimers();
        }
      };
      while (true) {
        const { done, value } = await reader.read();
@@ -1296,6 +1290,7 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
          }
          if (line.startsWith('data: ')) {
            const data = line.slice(6);
            if (data && data !== '[DONE]') markFirstVisibleOutput();
            // (thinking spinner removal is handled in agent_step / tool_start / content handlers)
@@ -1357,7 +1352,7 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
                if (_liveThinkHeader) _liveThinkHeader.textContent = 'View thinking process';
                if (_liveThinkSpinnerSlot) _liveThinkSpinnerSlot.remove();
                if (_liveThinkTimerEl && _elapsedDone) {
-                  _liveThinkTimerEl.textContent = _elapsedDone + 's';
+                  _liveThinkTimerEl.textContent = _formatThinkStats(_elapsedDone, _liveThinkTokenCount);
                  _liveThinkTimerEl.style.marginLeft = 'auto';
                  _liveThinkTimerEl.style.marginRight = '5px';
                  var _hdrDone = _liveThinkTimerEl.closest('.thinking-header');
@@ -1399,9 +1394,17 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
                typewriterInto(roundHolder.querySelector('.body'), errMsg);
                break;
              }
-              if (json.delta || json.type === 'tool_start' || json.type === 'tool_output' || json.type === 'tool_progress' || json.type === 'agent_step' || json.type === 'doc_stream_open' || json.type === 'doc_stream_delta' || json.type === 'research_progress') {
+              if (json.delta || json.type === 'agent_prep' || json.type === 'tool_start' || json.type === 'tool_output' || json.type === 'tool_progress' || json.type === 'agent_step' || json.type === 'doc_stream_open' || json.type === 'doc_stream_delta' || json.type === 'research_progress') {
                clearResponseTimeout();
                clearProcessingProbe();
                clearFirstTokenWaitTimers();
              }
              if (json.type === 'agent_prep') {
                if (!_isBg) {
                  _cancelThinkingTimer();
                  _replaceThinkingSpinner('Preparing agent');
                }
                continue;
              }
              if (json.delta) {
                _cancelThinkingTimer();
@@ -1464,12 +1467,13 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
                // 1. Normal: <think>...no closing tag yet
                // 2. Malformed: <think></think>\n...text but no second </think> yet
                // 3. Qwen3.5: "Thinking Process:" without <think> tags
-                let hasUnclosedThink = markdownModule.hasUnclosedThinkTag(roundText);
+                const normalizedRoundText = markdownModule.normalizeThinkingMarkup(roundText);
                let hasUnclosedThink = markdownModule.hasUnclosedThinkTag(normalizedRoundText);
                // Detect non-tag thinking patterns: "Thinking:", "Thinking Process:", Gemma-style reasoning
                // These patterns don't use <think> tags, so we simulate unclosed thinking during streaming
                const _replyPrefixes = ['Hey', 'Hi ', 'Hi!', 'Hello', 'Sure', 'Yes', 'No ', 'No,', 'Yo', 'OK', 'Here', 'Absolutely', 'Of course', 'Great', 'Alright', 'Thanks', 'Welcome', 'Good ', "I'm happy", "I'd be"];
-                if (!hasUnclosedThink && !/<(?:think(?:ing)?|thought)(?:\s+[^>]*)?>|<\|channel>thought/i.test(roundText)) {
+                if (!hasUnclosedThink && !/<(?:think(?:ing)?|thought)(?:\s+[^>]*)?>|<\|channel>thought/i.test(normalizedRoundText)) {
-                  const _trimmedRT = roundText.trimStart();
+                  const _trimmedRT = normalizedRoundText.trimStart();
                  const _isReasoning = markdownModule.startsWithReasoningPrefix(_trimmedRT);
                  if (_isReasoning) {
                    // Check if we can see a reply boundary yet (newline then reply pattern)
@@ -1494,9 +1498,9 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
                    }
                  }
                }
-                if (!hasUnclosedThink && /^<(?:think(?:ing)?|thought)(?:\s+[^>]*)?>\s*<\/(?:think(?:ing)?|thought)>/i.test(roundText)) {
+                if (!hasUnclosedThink && /^<(?:think(?:ing)?|thought)(?:\s+[^>]*)?>\s*<\/(?:think(?:ing)?|thought)>/i.test(normalizedRoundText)) {
                  // Empty <think></think> — the model likely put thinking outside the tags
-                  const afterEmpty = roundText.replace(/^<(?:think(?:ing)?|thought)(?:\s+[^>]*)?>\s*<\/(?:think(?:ing)?|thought)>/i, '').trim();
+                  const afterEmpty = normalizedRoundText.replace(/^<(?:think(?:ing)?|thought)(?:\s+[^>]*)?>\s*<\/(?:think(?:ing)?|thought)>/i, '').trim();
                  const closeTags = (afterEmpty.match(/<\/(?:think(?:ing)?|thought)>/gi) || []).length;
                  if (closeTags === 0 && afterEmpty.length > 0) {
                    hasUnclosedThink = true; // still waiting for real closing tag
@@ -1506,10 +1510,10 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
                // Only applies when there's a second </think> later (model leaked thinking outside tags)
                // Do NOT trigger if the text after </think> contains tool calls (that's real content)
                if (!hasUnclosedThink && isThinking) {
-                  const _thinkMatch = roundText.match(/<(?:think(?:ing)?|thought)(?:\s+[^>]*)?>([\s\S]*?)<\/(?:think(?:ing)?|thought)>/i);
+                  const _thinkMatch = normalizedRoundText.match(/<(?:think(?:ing)?|thought)(?:\s+[^>]*)?>([\s\S]*?)<\/(?:think(?:ing)?|thought)>/i);
                  const _thinkLen = _thinkMatch ? _thinkMatch[1].trim().length : 0;
                  if (_thinkLen < 20) {
-                    const _afterClose = roundText.replace(/<(?:think(?:ing)?|thought)(?:\s+[^>]*)?>([\s\S]*?)<\/(?:think(?:ing)?|thought)>/i, '').trim();
+                    const _afterClose = normalizedRoundText.replace(/<(?:think(?:ing)?|thought)(?:\s+[^>]*)?>([\s\S]*?)<\/(?:think(?:ing)?|thought)>/i, '').trim();
                    // Only keep waiting if there's trailing text that looks like thinking (not tool calls)
                    const _hasToolCall = /```(?:bash|python|web_search|read_file|write_file|create_document|edit_document|manage_|generate_image)/i.test(_afterClose);
                    const _hasOrphanClose = /<\/(?:think(?:ing)?|thought)>/i.test(_afterClose);
@@ -1554,7 +1558,7 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
                  function _tickThinkTimer() {
                    if (!_liveThinkTimerEl || !_liveThinkTimerEl.isConnected) return;
                    var s = ((Date.now() - _thinkTimerStart) / 1000).toFixed(1);
-                    _liveThinkTimerEl.textContent = s + 's';
+                    _liveThinkTimerEl.textContent = _formatThinkStats(s, _liveThinkTokenCount);
                    _thinkTimerRAF = requestAnimationFrame(_tickThinkTimer);
                  }
                  _thinkTimerRAF = requestAnimationFrame(_tickThinkTimer);
@@ -1570,13 +1574,18 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
                } else if (hasUnclosedThink && isThinking) {
                  if (_liveThinkInner) {
                    // Extract raw thinking text (strip known thinking wrappers and prefixes)
-                    var thinkText = roundText
+                    var thinkText = markdownModule.normalizeThinkingMarkup(roundText)
                      .replace(/<\/?(?:think(?:ing)?|thought)(?:\s+[^>]*)?>/gi, '')
                      .replace(/<\|channel>thought\s*\n?/gi, '')
                      .replace(/<\|channel>response\s*\n?/gi, '')
                      .replace(/<channel\|>/gi, '');
                    thinkText = thinkText.replace(/^\s*Thinking(?:\s+Process)?:\s*/i, '');
                    _liveThinkTokenCount = _estimateThinkingTokens(thinkText);
                    _liveThinkInner.innerHTML = markdownModule.mdToHtml(thinkText);
                    if (_liveThinkTimerEl) {
                      var _elapsedLive = thinkingStartTime ? ((Date.now() - thinkingStartTime) / 1000).toFixed(1) : '';
                      _liveThinkTimerEl.textContent = _formatThinkStats(_elapsedLive, _liveThinkTokenCount);
                    }
                    // Keep thinking box scrolled to bottom, but let user scroll up
                    var thinkBox = _liveThinkInner.closest('.thinking-content');
                    if (thinkBox) {
@@ -1600,6 +1609,7 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
                    _liveThinkHeader = null;
                    _liveThinkSpinnerSlot = null;
                    _liveThinkTimerEl = null;
                    _liveThinkTokenCount = 0;
                    _liveThinkToggle = null;
                    _liveThinkDomId = null;
                    // Fall through to normal streaming
@@ -1622,7 +1632,7 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
                  if (_liveThinkSpinnerSlot) _liveThinkSpinnerSlot.remove();
                  // Move timer to right side of header
                  if (_liveThinkTimerEl && elapsed) {
-                    _liveThinkTimerEl.textContent = elapsed + 's';
+                    _liveThinkTimerEl.textContent = _formatThinkStats(elapsed, _liveThinkTokenCount);
                    _liveThinkTimerEl.style.marginLeft = 'auto';
                    _liveThinkTimerEl.style.marginRight = '5px';
                    var _hdrRow = _liveThinkTimerEl.closest('.thinking-header');
@@ -2023,7 +2033,7 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
                  cancelAnimationFrame(_thinkTimerRAF);
                  var _elapsed2 = thinkingStartTime ? ((Date.now() - thinkingStartTime) / 1000).toFixed(1) : null;
                  if (_liveThinkHeader) _liveThinkHeader.textContent = 'View thinking process';
-                  if (_liveThinkTimerEl) _liveThinkTimerEl.textContent = _elapsed2 ? _elapsed2 + 's' : '';
+                  if (_liveThinkTimerEl) _liveThinkTimerEl.textContent = _elapsed2 ? _formatThinkStats(_elapsed2, _liveThinkTokenCount) : '';
                  if (_liveThinkSpinnerSlot) _liveThinkSpinnerSlot.remove();
                  // Assign stable IDs
                  var _thinkId2 = 'think-' + Date.now();
@@ -2037,7 +2047,7 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
                if (!roundFinalized) {
                  roundFinalized = true;
                  if (spinner && spinner.element) spinner.destroy();
-                  const dt = stripToolBlocks(roundText);
+                  const dt = markdownModule.normalizeThinkingMarkup(stripToolBlocks(roundText));
                  if (dt.trim()) {
                    var _body3 = roundHolder.querySelector('.body');
                    var _contentEl3 = _ensureStreamLayout(_body3);
@@ -3018,6 +3028,7 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
    } finally {
      clearResponseTimeout();
      clearProcessingProbe();
      clearFirstTokenWaitTimers();
      // Streaming done — let screen readers announce the settled response.
      const _chatLogDone = document.getElementById('chat-history');
      if (_chatLogDone) _chatLogDone.setAttribute('aria-busy', 'false');
@@ -3396,7 +3407,7 @@ import { wireArrowUpRecall, getLastUserMessageFromChatHistory } from './composer
    };
    const renderDelta = () => {
-      const dt = stripToolBlocks(roundText);
+      const dt = markdownModule.normalizeThinkingMarkup(stripToolBlocks(roundText));
      contentDiv.innerHTML = markdownModule.mdToHtml(markdownModule.squashOutsideCode(dt));
      uiModule.scrollHistory();
    };
@@ -73,6 +73,45 @@ function isCompareActive() {
  return state.isActive;
 }
 // Noun phrase (with leading space) describing what the current compare mode
 // compares; falls back to ' models' for any mode without a dedicated label.
 function _compareModeLabel() {
  const labels = {
    search: ' search providers',
    agent: ' agents',
    research: ' research models',
  };
  const label = labels[state._compareMode];
  return label ? label : ' models';
 }
 // Persist the agent/chat toolbar mode and reflect it in the toolbar DOM.
 // Any `mode` other than 'agent' is treated as 'chat'. When `syncModeTools`
 // is true (default: compare is not active) the [data-mode-tool] buttons are
 // shown for agent mode and hidden for chat mode.
 function _setToolbarMode(mode, syncModeTools = !state.isActive) {
  const isAgent = mode === 'agent';
  const target = isAgent ? 'agent' : 'chat';

  // Persist the choice in the stored toggle state.
  const persisted = Storage.loadToggleState();
  persisted.mode = target;
  Storage.saveToggleState(persisted);

  const agentBtn = document.getElementById('mode-agent-btn');
  const chatBtn = document.getElementById('mode-chat-btn');
  // Prefer the toggle that wraps one of the buttons; otherwise take the
  // first .mode-toggle in the document.
  const modeToggle = (agentBtn && agentBtn.closest('.mode-toggle'))
    || (chatBtn && chatBtn.closest('.mode-toggle'))
    || document.querySelector('.mode-toggle');

  if (agentBtn && chatBtn) {
    agentBtn.classList.toggle('active', isAgent);
    chatBtn.classList.toggle('active', !isAgent);
    agentBtn.setAttribute('aria-pressed', isAgent ? 'true' : 'false');
    chatBtn.setAttribute('aria-pressed', isAgent ? 'false' : 'true');
  }

  if (modeToggle) {
    ['mode-chat', 'mode-right'].forEach((cls) => {
      modeToggle.classList.toggle(cls, !isAgent);
    });
  }

  if (!syncModeTools) return;
  const toolDisplay = isAgent ? '' : 'none';
  document.querySelectorAll('[data-mode-tool]').forEach((toolBtn) => {
    toolBtn.style.display = toolDisplay;
  });
 }
 // While compare is active, adopt the toolbar's agent/chat choice as the
 // compare mode, then refresh the toolbar, the compare header label and the
 // eval prompt picker. Does nothing when compare is not active.
 function _syncCompareModeFromToolbar(mode) {
  if (!state.isActive) return;

  const nextMode = mode === 'agent' ? 'agent' : 'chat';
  state._compareMode = nextMode;
  // Mode-tool buttons are left alone (syncModeTools = false).
  _setToolbarMode(nextMode, false);

  const headerLabel = document.querySelector('.compare-header-label');
  if (headerLabel) {
    const modeLabel = _compareModeLabel();
    const blindSuffix = state._blindMode ? ' (blind)' : '';
    headerLabel.textContent = 'Comparing' + modeLabel + blindSuffix + ' · ' + state._timeout + 's timeout';
  }

  // The eval picker exposes its re-render hook on the wrapper element.
  const evalWrap = document.getElementById('cmp-eval-wrap');
  if (!evalWrap) return;
  if (typeof evalWrap._renderItems === 'function') evalWrap._renderItems();
 }
 // ────────────────────────────────────────────────────────────────────────────
 // ── closeCompare ──
 // ────────────────────────────────────────────────────────────────────────────
@@ -170,12 +209,7 @@ async function deactivate(teardown) {
  });
  // Restore agent/chat mode to what it was before compare
-  const _ts = Storage.loadToggleState();
+  _setToolbarMode(state._savedMode, true);
  _ts.mode = state._savedMode;
  Storage.saveToggleState(_ts);
  const _ab2 = document.getElementById('mode-agent-btn'), _cb2 = document.getElementById('mode-chat-btn');
  if (_ab2 && _cb2) { _ab2.classList.toggle('active', state._savedMode === 'agent'); _cb2.classList.toggle('active', state._savedMode === 'chat'); }
  document.querySelectorAll('[data-mode-tool]').forEach(b => { b.style.display = state._savedMode === 'agent' ? '' : 'none'; });
  // Delete unsaved sessions, then reload
  if (teardown) {
@@ -258,19 +292,30 @@ async function _buildCompareUI() {
    if (el) state._savedIndicatorDisplay[id] = el.style.display;
  });
-  // 5. Save current mode and lock to the right one for this compare type
+  // 5. Save current mode and seed the toolbar for this compare type.
  const _toggleState = Storage.loadToggleState();
  state._savedMode = _toggleState.mode || 'chat';
  const _targetMode = (state._compareMode === 'agent') ? 'agent' : 'chat';
-  _toggleState.mode = _targetMode;
+  _setToolbarMode(_targetMode, false);
  Storage.saveToggleState(_toggleState);
  const _ab = document.getElementById('mode-agent-btn'), _cb = document.getElementById('mode-chat-btn');
  let _modeCleanup = null;
  // Click handler for the agent/chat toolbar buttons while compare is open:
  // routes the click to the compare-mode sync instead of other click handlers.
  const _onCompareModeClick = (ev) => {
    // Stop the event from travelling further and from reaching any other
    // listener registered on the same button.
    ev.stopPropagation();
    ev.stopImmediatePropagation();
    // The agent button maps to 'agent'; any other target is treated as 'chat'.
    _syncCompareModeFromToolbar(ev.currentTarget === _ab ? 'agent' : 'chat');
  };
  if (_ab && _cb) {
-    _ab.classList.toggle('active', _targetMode === 'agent');
+    _ab.addEventListener('click', _onCompareModeClick, true);
-    _cb.classList.toggle('active', _targetMode === 'chat');
+    _cb.addEventListener('click', _onCompareModeClick, true);
    _modeCleanup = document.createElement('span');
    _modeCleanup.style.display = 'none';
    _modeCleanup._cleanup = () => {
      _ab.removeEventListener('click', _onCompareModeClick, true);
      _cb.removeEventListener('click', _onCompareModeClick, true);
    };
  }
  const _modeToggle = document.querySelector('.mode-toggle');
-  if (_modeToggle) { _modeToggle.style.pointerEvents = 'none'; _modeToggle.style.opacity = '0.4'; }
+  if (_modeToggle) { _modeToggle.style.pointerEvents = ''; _modeToggle.style.opacity = ''; }
  // 6. Force tool toggles per compare mode
  disableToolToggles();
@@ -289,6 +334,7 @@ async function _buildCompareUI() {
  // 7. Hide existing chat container children (preserves event listeners)
  const container = document.getElementById('chat-container');
  state._compareElements = [];
  if (_modeCleanup) state._compareElements.push(_modeCleanup);
  Array.from(container.children).forEach(child => {
    if (child.style.display === 'none') return;
    child.dataset.cmpHidden = '1';
@@ -302,9 +348,9 @@ async function _buildCompareUI() {
  headerBar.className = 'compare-header-bar';
  headerBar.style.cssText = 'display:flex;align-items:center;justify-content:space-between;padding:6px 10px;flex-shrink:0;';
  const headerLabel = document.createElement('span');
  headerLabel.className = 'compare-header-label';
  headerLabel.style.cssText = 'font-size:10px;font-weight:400;color:var(--fg);white-space:nowrap;overflow:hidden;text-overflow:ellipsis;min-width:0;';
-  const _modeLabel = ({ search: ' search providers', agent: ' agents', research: ' research models' }[state._compareMode] || ' models');
+  headerLabel.textContent = 'Comparing' + _compareModeLabel() + (state._blindMode ? ' (blind)' : '') + ' · ' + state._timeout + 's timeout';
  headerLabel.textContent = 'Comparing' + _modeLabel + (state._blindMode ? ' (blind)' : '') + ' · ' + state._timeout + 's timeout';
  // Left side: the Compare tool icon (two side-by-side panes, matching the
  // rail/sidebar icon) + the label. Other tool headers carry their icon; this
  // one was missing it.
@@ -475,7 +521,7 @@ async function _buildCompareUI() {
  }
  const msgTA = document.getElementById('message');
  if (msgTA) {
-    msgTA.placeholder = 'Enter prompt for all models...';
+    msgTA.placeholder = window.matchMedia('(max-width: 767px)').matches ? '' : 'Enter prompt for all models...';
    requestAnimationFrame(() => msgTA.focus());
  }
@@ -891,8 +937,7 @@ async function _executeCompare(message) {
    let sharedSearchContext = null;
    let sharedSearchSources = null;
    const webChk = document.getElementById('web-toggle');
-    const toggleState = Storage.loadToggleState();
+    const isAgentMode = state._compareMode === 'agent';
    const isAgentMode = (toggleState.mode || 'chat') === 'agent';
    const webOn = webChk && webChk.checked;
    // In agent mode, web_search is a tool (handled per-pane); in chat mode, pre-search and share
    if (webOn && !isAgentMode) {
@@ -1198,6 +1243,15 @@ function _setupEvalPicker() {
  function _renderItems() {
    const mode = state._compareMode || 'chat';
    const label = btn.querySelector('.cmp-eval-label');
    if (label) {
      label.textContent = ({
        agent: 'Agent prompts',
        chat: 'Chat prompts',
        search: 'Search prompts',
        research: 'Research prompts'
      }[mode] || 'Eval prompts');
    }
    // research/html aren't first-class compare types — fall back gracefully
    const key = EVAL_PROMPTS[mode] ? mode
      : (mode === 'research' ? 'search' : 'chat');
@@ -1258,8 +1312,10 @@ function _setupEvalPicker() {
  };
  document.addEventListener('click', _onDocClick);
  _renderItems();
  wrap.appendChild(btn);
  wrap.appendChild(menu);
  wrap._renderItems = _renderItems;
  inputTop.appendChild(wrap);
  // Expected-answer chip — placed above the chat-input-bar (outside it), so
@@ -551,23 +551,46 @@ async function streamToPane(paneIdx, sessionId, message, aiMsgEl, opts) {
      footer.className = 'msg-footer';
      const span = document.createElement('span');
      span.className = 'response-metrics';
-      let text = metrics.output_tokens + ' tokens | ' + metrics.tokens_per_second + ' tok/s';
+      const outputTokens = metrics.output_tokens;
      const responseTime = metrics.response_time ?? metrics.total_time;
      const explicitTps = metrics.tokens_per_second ?? metrics.gen_tps ?? metrics.tps;
      const numericOutput = Number(outputTokens);
      const numericTime = Number(responseTime);
      const numericTps = Number(explicitTps);
      const derivedTps = Number.isFinite(numericTps)
        ? numericTps
        : (Number.isFinite(numericOutput) && Number.isFinite(numericTime) && numericTime > 0)
          ? numericOutput / numericTime
          : null;
      const tpsLabel = derivedTps != null
        ? (derivedTps >= 100 ? String(Math.round(derivedTps)) : derivedTps.toFixed(2).replace(/\.?0+$/, ''))
        : null;
      const parts = [];
      if (outputTokens != null && outputTokens !== 'undefined') {
        parts.push(outputTokens + ' tokens');
      }
      if (tpsLabel != null) {
        parts.push(tpsLabel + ' tok/s');
      }
      if (responseTime != null && responseTime !== 'undefined' && parts.length === 0) {
        parts.push(responseTime + 's');
      }
      // Add per-request cost and cost per 1000
      const _model = metrics.model || (state._selectedModels[paneIdx] && state._selectedModels[paneIdx].model) || '';
      const _cost = getModelCost(_model, metrics.input_tokens || 0, metrics.output_tokens || 0);
      // Build the metrics span with optional cost and context
-      span.textContent = text;
+      span.textContent = parts.join(' | ');
      if (_cost !== null) {
        const _cost1k = _cost * 1000;
        const costSpan = document.createElement('span');
        costSpan.style.color = 'var(--color-success, #4caf50)';
        costSpan.title = 'Estimated cost per 1,000 responses like this one';
-        costSpan.textContent = ' | $' + (_cost1k < 1 ? _cost1k.toFixed(2) : _cost1k.toFixed(0)) + '/1k';
+        costSpan.textContent = (span.textContent ? ' | ' : '') + '$' + (_cost1k < 1 ? _cost1k.toFixed(2) : _cost1k.toFixed(0)) + '/1k';
        span.appendChild(costSpan);
      }
      if (metrics.context_percent > 0) {
        const ctx = document.createElement('span');
-        ctx.textContent = ' | ' + metrics.context_percent + '% ctx';
+        ctx.textContent = (span.textContent ? ' | ' : '') + metrics.context_percent + '% ctx';
        if (metrics.context_percent >= 85) ctx.style.color = 'var(--color-error)';
        else if (metrics.context_percent >= 70) ctx.style.color = '#ff9900';
        span.appendChild(ctx);
@@ -181,7 +181,7 @@ function handleVote(winnerIdx) {
    let html = '';
    const caret = ' <span class="pane-title-caret">&#x25BE;</span>';
-    if (isWinner) html = '<span style="color:var(--red);margin-right:4px;">&#x2605;</span><strong>' + escapeHtml(name) + '</strong> <span style="color:var(--red);font-size:0.82em;font-weight:800;text-transform:uppercase;letter-spacing:1px;position:relative;top:-2px;">Winner!</span>' + caret;
+    if (isWinner) html = '<span style="color:var(--green, #50fa7b);margin-right:4px;">&#x2605;</span><strong>' + escapeHtml(name) + '</strong> <span style="color:var(--green, #50fa7b);font-size:0.82em;font-weight:800;text-transform:uppercase;letter-spacing:1px;position:relative;top:0;">Winner!</span>' + caret;
    else if (isTie) html = '<span style="opacity:0.5;margin-right:4px;">=</span><strong>' + escapeHtml(name) + '</strong>' + caret;
    else html = '<strong>' + escapeHtml(name) + '</strong>' + caret;
    el.innerHTML = html;
@@ -461,6 +461,40 @@ export const ERROR_PATTERNS = [
      { label: 'Copy install command', action: () => _copyText('curl -fsSL https://ollama.com/install.sh | sh') },
    ],
  },
  // System build deps must be checked BEFORE the llama-server catch-all:
  // a `cmake: command not found` failure ALSO produces `llama-server:
  // command not found` later in the script (the build aborts then the
  // run line fails) — pattern order is first-match-wins, so without
  // these specific entries the user gets the misleading "install
  // llama-cpp-python[server]" suggestion when the actual blocker is a
  // missing OS-package toolchain that pip can't ship.
  {
    pattern: /cmake: command not found|cmake.*not found.*Could not/i,
    message: 'cmake is required to compile llama.cpp from source, but it is not installed on this server.',
    suggestion: 'Suggested action: install cmake via the OS package manager — apt: cmake build-essential / pacman: cmake base-devel / dnf: cmake gcc-c++ make / brew: cmake. Cookbook can do this automatically on the next launch if your user has passwordless sudo for apt/pacman/dnf.',
    fixes: [
      { label: 'Open Dependencies', action: () => _openCookbookDependencies('llama_cpp') },
      { label: 'Copy apt install', action: () => _copyText('sudo apt install -y cmake build-essential git') },
      { label: 'Copy pacman install', action: () => _copyText('sudo pacman -Sy --needed cmake base-devel git') },
      { label: 'Copy dnf install', action: () => _copyText('sudo dnf install -y cmake gcc gcc-c++ make git') },
    ],
  },
  {
    pattern: /^(make|g\+\+|gcc): command not found|Could not find C\+\+ compiler/i,
    message: 'A C/C++ compiler (build-essential / base-devel) is required to compile llama.cpp.',
    fixes: [
      { label: 'Open Dependencies', action: () => _openCookbookDependencies('llama_cpp') },
      { label: 'Copy apt install', action: () => _copyText('sudo apt install -y build-essential') },
    ],
  },
  {
    pattern: /^git: command not found/i,
    message: 'git is required to clone the llama.cpp source tree.',
    fixes: [
      { label: 'Open Dependencies', action: () => _openCookbookDependencies('llama_cpp') },
      { label: 'Copy apt install', action: () => _copyText('sudo apt install -y git') },
    ],
  },
  {
    pattern: /llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'/i,
    message: 'llama-cpp-python server is not installed. Run: pip install "llama-cpp-python[server]"',
@@ -714,11 +748,15 @@ export function _showDiagnosis(panel, diagnosis, sourceText) {
  copyBtn.addEventListener('click', async (e) => {
    e.stopPropagation();
    const bundle = _diagnosisCopyBundle(task, diagnosis, sourceText, suggestionText);
-    try {
+    // Use the shared helper which falls back to execCommand('copy') on
-      await navigator.clipboard.writeText(bundle);
+    // non-HTTPS origins (Tailscale IPs, LAN IPs, etc.) — navigator.clipboard
    // is silently a no-op on those, which is why the button appeared dead
    // for users on http://100.113.161.2:7011 over Tailscale/mobile.
    const ok = await _copyText(bundle);
    if (ok) {
      copyBtn.classList.add('copied');
      setTimeout(() => { if (copyBtn.isConnected) copyBtn.classList.remove('copied'); }, 1200);
-    } catch (_) {}
+    }
  });
  const dismissBtn = document.createElement('button');
@@ -578,7 +578,9 @@ export async function _hwfitFetch(fresh = false) {
  const _cached = fresh ? null : _readScanCache(_sig);
  const wp = spinnerModule.createWhirlpool(18);
  if (_cached) {
-    _hwfitCache = _cached;
+    // Tag the restored cache with its host too (scan-sig keys cache per
    // host, so a hit here is always for the current remoteHost).
    _hwfitCache = { ..._cached, _scannedHost: remoteHost || '' };
    _hwfitRenderHw(hw, _cached.system);
    if (!remoteHost && _cached.system && _cached.system.platform) {
      _envState.platform = _cached.system.platform;
@@ -750,7 +752,11 @@ export async function _hwfitFetch(fresh = false) {
        : _olRows;
      data.models = (data.models || []).concat(_olFiltered);
    }
-    _hwfitCache = data;
+    // Tag the cache with the host this scan was for, so downstream
    // code (_gpuEnvVarName, backend-aware command builders) can avoid
    // trusting a stale scan when the user switches the server picker
    // to a different target without re-running hwfit.
    _hwfitCache = { ...data, _scannedHost: remoteHost || '' };
    _hwfitRenderHw(hw, data.system);
    // Propagate local platform from hardware probe so _isWindows(task) works
    // for local tasks (menu items, shell commands, etc.).
@@ -1415,23 +1421,11 @@ export function _expandModelRow(row, modelData) {
  const dlSource = _downloadSourceRepo(modelData, backend);
  const hfUrl = `https://huggingface.co/${dlSource.repo}`;
  // Official vendor recipe deep-links. These point to vLLM / SGLang's curated
  // hardware-specific launch-command pages. They 404 for uncatalogued models \u2014
  // a known tradeoff; user just gets the vendor's "model not found" page.
  const _recipeRepo = modelData.name || '';
  const _vllmUrl = _recipeRepo ? `https://recipes.vllm.ai/${_recipeRepo}` : '';
  const _sglangUrl = _recipeRepo ? `https://docs.sglang.io/cookbook/autoregressive/${_recipeRepo}${_sglangHashFor(modelData)}` : '';
  let html = `<div class="hwfit-action-panel" data-model-name="${esc(modelData.name)}">`;
  html += `<div class="hwfit-panel-header">`;
  html += `<span class="hwfit-panel-model">${esc(modelData.name)}${dlSource.kind ? ` <span style="opacity:0.5;font-size:10px;">(${esc(dlSource.kind)} ${esc(modelData.quant || '')})</span>` : (modelData.quant_repo ? ` <span style="opacity:0.5;font-size:10px;">(${esc(modelData.quant)})</span>` : '')}</span>`;
  html += `<span class="hwfit-panel-badge">${esc(label)}</span>`;
  html += `<a href="${esc(hfUrl)}" target="_blank" rel="noopener" class="hwfit-panel-hf-link" title="View download source on HuggingFace">HF \u2197</a>`;
  if (backend === 'vllm' && _vllmUrl) {
    html += `<a href="${esc(_vllmUrl)}" target="_blank" rel="noopener" class="hwfit-panel-hf-link" title="vLLM official recipe (curated launch command). 404s if this model isn't in vLLM's recipes catalog.">vLLM \u2197</a>`;
  }
  if (backend === 'sglang' && _sglangUrl) {
    html += `<a href="${esc(_sglangUrl)}" target="_blank" rel="noopener" class="hwfit-panel-hf-link" title="SGLang cookbook (hash pre-filled with your detected hardware). 404s if this model isn't in SGLang's cookbook catalog.">SGLang \u2197</a>`;
  }
  html += `</div>`;
  html += `<div class="hwfit-panel-actions">`;
  html += `<button class="cookbook-btn hwfit-dl-btn">Download</button>`;
@@ -1679,7 +1673,7 @@ export function _expandModelRow(row, modelData) {
      } else if (runBackend === 'llamacpp') {
        const dir = `"$HOME/.cache/huggingface/hub/models--${modelData.name.replace(/\//g, '--')}/snapshots"`;
        const ggufPath = `$({ find ${dir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${dir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`;
-        cmd = `MODEL_FILE=${ggufPath} && { [ -n "$MODEL_FILE" ] && [ -f "$MODEL_FILE" ]; } || { echo "ERROR: No GGUF found on this host. Download a GGUF quant or switch backend."; exit 1; } && llama-server --model "$MODEL_FILE" --host 0.0.0.0 --port 8080 -ngl 99 -c ${maxCtx} || python3 -m llama_cpp.server --model "$MODEL_FILE" --host 0.0.0.0 --port 8080 --n_gpu_layers 99 --n_ctx ${maxCtx}`;
+        cmd = `llama-server --model "${ggufPath}" --host 0.0.0.0 --port 8080 -ngl 99 -c ${maxCtx} --flash-attn auto`;
      } else {
        cmd = `vllm serve ${modelData.name} --host 0.0.0.0 --port ${port}`;
        cmd += ` --tensor-parallel-size ${tp}`;
@@ -85,6 +85,22 @@ function _ggufIncludePattern(model, source) {
  return '*.gguf';
 }
 // Derive a short display label from a GGUF include glob. Wildcards are
 // dropped first; if the last two path segments contain a quant tag
 // (e.g. Q4_K_M) that tag is returned upper-cased without a leading "UD-",
 // otherwise the file name minus its ".gguf" suffix and shard counter.
 function _ggufDisplayPartFromInclude(include) {
  const stripped = String(include || '').replace(/\*/g, '');
  const segments = stripped.split('/').filter(Boolean);
  const count = segments.length;
  // With no usable segment, fall back to the wildcard-free string itself.
  const file = count ? segments[count - 1] : stripped;
  const dir = count > 1 ? segments[count - 2] : '';
  const quantMatch = `${dir} ${file}`.match(/\b(?:UD-)?(?:IQ[1-8]_[A-Z0-9]+|Q[2-8]_K_[MLS]|Q[2-8]_[0-9A-Z]+|Q[2-8])\b/i);
  if (!quantMatch) {
    return file.replace(/\.gguf$/i, '').replace(/-\d{5}-of-\d{5}$/i, '');
  }
  return quantMatch[0].toUpperCase().replace(/^UD-/, '');
 }
 // Task name for a download: the short model name, suffixed with the GGUF
 // part label when the payload carries an include pattern that yields one.
 function _downloadTaskName(shortName, payload) {
  const include = payload?.include || '';
  if (!include) return shortName;
  const label = _ggufDisplayPartFromInclude(include);
  if (!label) return shortName;
  return `${shortName} · ${label}`;
 }
 function _missingGgufMessage(model) {
  const name = model?.name || 'this model';
  if (/\bnvfp4\b/i.test(name)) {
@@ -519,6 +535,7 @@ export async function _runModelDownload(panel, model, backend, hostOverride) {
  }
  const shortName = (model.name || repo).split('/').pop();
  const taskName = _downloadTaskName(shortName, payload);
  const targetHost = host || 'local';
  const tasks = _loadTasks();
@@ -576,7 +593,7 @@ export async function _runModelDownload(panel, model, backend, hostOverride) {
  if (activeOnHost) {
    const queueId = `queue-${Date.now().toString(36)}`;
    const allTasks = _loadTasks();
-    allTasks.push({ id: queueId, sessionId: queueId, name: shortName, type: 'download', status: 'queued', output: '', ts: Date.now(), payload, remoteHost: host });
+    allTasks.push({ id: queueId, sessionId: queueId, name: taskName, type: 'download', status: 'queued', output: '', ts: Date.now(), payload, remoteHost: host });
    _saveTasks(allTasks);
    _renderRunningTab();
    uiModule.showToast(`Queued ${shortName} — waiting for current download`);
@@ -601,8 +618,8 @@ export async function _runModelDownload(panel, model, backend, hostOverride) {
      uiModule.showToast('Download failed: ' + (data.error || ''), 9000);
      return;
    }
-    _addTask(data.session_id, shortName, 'download', payload);
+    _addTask(data.session_id, taskName, 'download', payload);
-    uiModule.showToast(`Downloading ${shortName}...`);
+    uiModule.showToast(`Downloading ${taskName}...`);
  } catch (e) {
    uiModule.showToast('Download failed: ' + e.message, 9000);
  }
@@ -27,6 +27,9 @@ function _statusLabel(status, type) {
 // "cookbook-task-status" ('' = the neutral loading style).
 function _taskBadge(task) {
  if (task._unreachable && task.status === 'running') return { text: 'unreachable', cls: 'cookbook-task-error' };
  if (task.type === 'download' && task.status === 'running') {
    return { text: _statusLabel(task.status, task.type), cls: 'cookbook-task-downloading' };
  }
  if (task.type === 'serve' && task.status === 'running' && task.progress) {
    // Same green "running" pill — just with dynamic phase text, so it doesn't
    // read as a different status while the server is coming up.
@@ -35,6 +38,47 @@ function _taskBadge(task) {
  return { text: _statusLabel(task.status, task.type), cls: 'cookbook-task-' + task.status };
 }
 // Derive a short display label from a GGUF file path: the quant tag found
 // in the last two path segments (upper-cased, leading "UD-" removed), or
 // else the file name without its ".gguf" suffix and shard counter.
 function _ggufDisplayPartFromPath(path) {
  const segments = String(path || '').split('/').filter(Boolean);
  const count = segments.length;
  const file = count ? segments[count - 1] : '';
  const dir = count > 1 ? segments[count - 2] : '';
  const haystack = `${dir} ${file}`;
  const quantMatch = haystack.match(/\b(?:UD-)?(?:IQ[1-8]_[A-Z0-9]+|Q[2-8]_K_[MLS]|Q[2-8]_[0-9A-Z]+|Q[2-8])\b/i);
  if (!quantMatch) {
    return file.replace(/\.gguf$/i, '').replace(/-\d{5}-of-\d{5}$/i, '');
  }
  return quantMatch[0].toUpperCase().replace(/^UD-/, '');
 }
 // Display name for a download task. Appends the GGUF part label derived
 // from the payload's include pattern, unless there is no include or the
 // name already contains a " · " separator.
 function _downloadDisplayName(name, task) {
  const include = task?.payload?.include || '';
  const alreadyLabelled = String(name || '').includes(' · ');
  if (!include || alreadyLabelled) return name;
  const label = _ggufDisplayPartFromPath(include.replace(/\*/g, ''));
  if (!label) return name;
  return `${name} · ${label}`;
 }
 // Human-readable name for a task card. Downloads delegate to
 // _downloadDisplayName; serve tasks get the GGUF part label appended when a
 // gguf_file is recorded and the name is not already labelled; every other
 // task type returns its trimmed name unchanged.
 function _taskDisplayName(task) {
  const label = String(task?.name || '').trim();
  const kind = task?.type;
  if (kind === 'download') return _downloadDisplayName(label, task);
  if (kind === 'serve') {
    const payload = task?.payload;
    const ggufFile = payload?._fields?.gguf_file || payload?.gguf_file || '';
    if (ggufFile && !label.includes(' · ')) {
      const part = _ggufDisplayPartFromPath(ggufFile);
      if (part) return `${label} · ${part}`;
    }
  }
  return label;
 }
 // True when the task is a download in a 'done'/'completed' state that
 // still identifies a model (payload.repo_id or a task name).
 function _canLaunchDownloadedTask(task) {
  if (task?.type !== 'download') return false;
  const finished = ['done', 'completed'].includes(task.status || '');
  if (!finished) return false;
  return Boolean(task.payload?.repo_id || task.name);
 }
 // Serve-panel field overrides for launching a finished download. Returns
 // null when the download recorded no include pattern; otherwise forces the
 // llamacpp backend and passes the trimmed include on as the preferred GGUF.
 function _downloadServeFields(task) {
  const pattern = String(task?.payload?.include || '').trim();
  if (pattern === '') return null;
  const fields = { backend: 'llamacpp', _forceBackend: true };
  fields._preferredGgufInclude = pattern;
  return fields;
 }
 // A download task whose tmux output still shows an active per-shard line
 // (e.g. "model-00012-of-00082.safetensors: 56%|") is NOT actually finished —
 // the cookbook just lost track. The clear pill becomes a "reconnect" affordance
@@ -52,13 +96,13 @@ function _downloadOutputLooksActive(task) {
 function _canClearTask(task) {
  if (!task || task.status === 'running') return false;
-  if (task.type === 'serve' && (task.status === 'ready' || task._serveReady)) return false;
+  if (task.type === 'serve' && (task.status === 'ready' || (task._serveReady && !['stopped', 'error', 'crashed', 'failed', 'completed'].includes(task.status)))) return false;
  // If the tmux output still shows an in-flight download, the task isn't
  // actually finished — hide the clear/check pill so it doesn't show on a
  // task that's still doing work. (The next render will reflect this and
  // ideally the self-heal flips status back to running.)
  if (_downloadOutputLooksActive(task)) return false;
-  return ['done', 'stopped', 'error', 'crashed', 'failed'].includes(task.status);
+  return ['done', 'completed', 'stopped', 'error', 'crashed', 'failed'].includes(task.status);
 }
 function _clearPillLabel(task) {
@@ -66,6 +110,13 @@ function _clearPillLabel(task) {
  return 'clear';
 }
 // Reduce a venv-related path to the venv root: trims whitespace and
 // trailing slashes, then drops a trailing /bin/<activate|python*|vllm|pip*>
 // component if present. Empty or missing input yields ''.
 function _venvRootFromPath(path) {
  const normalized = (path || '').toString().trim().replace(/\/+$/, '');
  if (normalized === '') return '';
  return normalized.replace(/\/bin\/(?:activate|python(?:3(?:\.\d+)?)?|vllm|pip(?:3)?)$/i, '');
 }
 // A pip dependency/driver install (payload._dep) reports success with the
 // runner's "=== Process exited with code 0 ===" sentinel and pip's
 // "Successfully installed" line — never the HuggingFace download markers
@@ -141,6 +192,13 @@ async function _openDownloadForGgufTask(task) {
 function _terminalServeDiagnosis(task, outputText) {
  const out = String(outputText || task?.output || '');
  if (!task || task.type !== 'serve' || !['stopped', 'error', 'crashed', 'failed'].includes(task.status) || !out.trim()) return null;
  // Suppress the crash diagnosis when the output proves the server
  // actually became reachable — e.g. an early `exit 127` from a failed
  // build attempt was followed by the shim/Python fallback successfully
  // starting Uvicorn. Without this, the user sees a confusing "build
  // stopped before the server became reachable" toast while the server
  // is right there serving requests.
  if (_serveOutputLooksReady(task)) return null;
  // Pip tasks (Reinstall vLLM, Upgrade torch, etc.) ride on the serve task
  // type so they get a tmux session + show up in Running tab — but they are
  // NOT serve invocations. Their output is pip's own; the generic
@@ -256,6 +314,7 @@ let _copyText;
 let _persistEnvState;
 let _refreshDependencies;
 let _serverByVal;
 let _serverKey;
 let _selectedServer;
 let modelLogo;
 let esc;
@@ -264,6 +323,40 @@ let _detectToolParser;
 let _detectModelOptimizations;
 let _buildServeCmd;
 // Resolve which server profile a task belongs to. Lookup order: the task's
 // saved server key, then its host via _serverByVal, then a host match in
 // _envState.servers. Returns { host, server, key }; `server` is null when
 // nothing matched, and `key` falls back to the saved key, the host, or
 // 'local' in that case.
 function _taskServerSelection(task) {
  const host = task?.remoteHost || task?.payload?.remote_host || '';
  const savedKey = task?.remoteServerKey || task?.payload?.remote_server_key || '';
  let server = savedKey ? _serverByVal(savedKey) : null;
  if (!server && host) {
    server = _serverByVal(host) || _envState.servers.find(s => s.host === host);
  }
  server = server || null;
  let key;
  if (!server) {
    key = savedKey || host || 'local';
  } else if (_serverKey) {
    key = _serverKey(server);
  } else {
    key = savedKey;
  }
  return { host, server, key };
 }
 // Make the server a task ran on the active selection. Writes the resolved
 // host/key (and, when known, env/envPath/platform) into _envState, points
 // the four hwfit server <select> dropdowns at it, and returns the resolved
 // { host, server, key } from _taskServerSelection.
 function _selectTaskServer(task) {
  const { host, server, key } = _taskServerSelection(task);
  _envState.remoteHost = host;
  // The 'local' key is stored as an empty remoteServerKey.
  _envState.remoteServerKey = key === 'local' ? '' : key;
  // Env fields come from the matched profile; with no profile AND no host
  // they are reset to defaults. A host without a matching profile leaves the
  // current env/envPath/platform untouched.
  if (server) {
    _envState.env = server.env || 'none';
    _envState.envPath = server.envPath || '';
    _envState.platform = server.platform || '';
  } else if (!host) {
    _envState.env = 'none';
    _envState.envPath = '';
    _envState.platform = '';
  }
  document.querySelectorAll('#hwfit-server-select, #hwfit-dl-server, #hwfit-cache-server, #hwfit-deps-server').forEach(sel => {
    if (!sel || sel.tagName !== 'SELECT') return;
    const wanted = key || (host || 'local');
    // Prefer an option whose value is the key, then one whose value is the
    // raw host; otherwise assign the key (remote) or 'local' directly.
    if ([...sel.options].some(o => o.value === wanted)) sel.value = wanted;
    else if (host && [...sel.options].some(o => o.value === host)) sel.value = host;
    else sel.value = host ? wanted : 'local';
  });
  return { host, server, key };
 }
 // When a new action is started (download / dependency / serve), this holds the
 // new task's id so the next render collapses every other card and leaves only
 // the new one open. Consumed (cleared) by _renderRunningTab.
@@ -526,7 +619,7 @@ async function _startQueuedDownload(task) {
      if (t.sessionId === data.session_id) return false;
      return !(key && t.type === 'download' && t.status === 'queued' && _downloadDedupeKey(t) === key);
    });
-    if (!found) tasks.push(_stripTaskSecrets(launchedTask));
+    if (!found) tasks.push(_redactTaskForStorage(launchedTask));
    _saveTasks(tasks);
    _renderRunningTab();
    _startBackgroundMonitor();
@@ -636,28 +729,53 @@ function _loadPrunedTasks() {
 const _REMOVED_KEY = 'cookbook-removed-tasks';
 const _TOMBSTONE_TTL_MS = 24 * 3600 * 1000;
 function _loadTombstones() {
-  try { return JSON.parse(localStorage.getItem(_REMOVED_KEY)) || {}; }
+  try {
    const tomb = JSON.parse(localStorage.getItem(_REMOVED_KEY)) || {};
    const now = Date.now();
    let changed = false;
    for (const k in tomb) {
      if (now - tomb[k] > _TOMBSTONE_TTL_MS) {
        delete tomb[k];
        changed = true;
      }
    }
    if (changed) localStorage.setItem(_REMOVED_KEY, JSON.stringify(tomb));
    return tomb;
  }
  catch { return {}; }
 }
 // Persist the tombstone map to localStorage under _REMOVED_KEY; a missing
 // map is stored as an empty object.
 function _saveTombstones(tomb) {
  const serialized = JSON.stringify(tomb || {});
  localStorage.setItem(_REMOVED_KEY, serialized);
 }
 function _tombstoneTask(id) {
  if (!id) return;
  const tomb = _loadTombstones();
  const now = Date.now();
  tomb[id] = now;
  for (const k in tomb) { if (now - tomb[k] > _TOMBSTONE_TTL_MS) delete tomb[k]; }
-  localStorage.setItem(_REMOVED_KEY, JSON.stringify(tomb));
+  _saveTombstones(tomb);
 }
 // True when `id` has a tombstone whose age is within _TOMBSTONE_TTL_MS.
 function _isTombstoned(id) {
  const removedAt = _loadTombstones()[id];
  if (removedAt == null) return false;
  const age = Date.now() - removedAt;
  return age <= _TOMBSTONE_TTL_MS;
 }
-function _stripTaskSecrets(task) {
+function _redactStoredText(value) {
  return String(value || '')
    .replace(/hf_[A-Za-z0-9]{20,}/g, '[redacted-token]')
    .replace(/((?:api[_-]?key|token|authorization|password|passwd|secret)\s*[=:]\s*)(["']?)[^\s"']+/gi, '$1$2[redacted]');
 }
 // Return a shallow copy of a task that is safe to persist: the output and
 // the payload's _cmd/cmd strings are passed through _redactStoredText, and
 // hf_token/hfToken are removed from the (copied) payload. Non-object input
 // is returned unchanged; the original task and payload are not mutated.
 function _redactTaskForStorage(task) {
  if (!task || typeof task !== 'object') return task;
  const copy = { ...task };
  if (typeof copy.output === 'string') copy.output = _redactStoredText(copy.output);
  const payload = copy.payload;
  if (!payload || typeof payload !== 'object') return copy;
  const cleaned = { ...payload };
  delete cleaned.hf_token;
  delete cleaned.hfToken;
  for (const field of ['_cmd', 'cmd']) {
    if (typeof cleaned[field] === 'string') cleaned[field] = _redactStoredText(cleaned[field]);
  }
  copy.payload = cleaned;
  return copy;
 }
@@ -666,23 +784,24 @@ function _stripStateSecrets(state) {
  const safe = { ...state };
  if (safe.env && typeof safe.env === 'object') {
    const { hfToken, ...env } = safe.env;
    if (hfToken) env.hfToken = hfToken;
    safe.env = env;
  }
-  if (Array.isArray(safe.tasks)) safe.tasks = safe.tasks.map(_stripTaskSecrets);
+  if (Array.isArray(safe.tasks)) safe.tasks = safe.tasks.map(_redactTaskForStorage);
  return safe;
 }
 export function _saveTasks(tasks) {
-  localStorage.setItem(TASKS_KEY, JSON.stringify((tasks || []).map(_stripTaskSecrets)));
+  localStorage.setItem(TASKS_KEY, JSON.stringify((tasks || []).map(_redactTaskForStorage)));
  _syncToServer();
 }
 export function _addTask(sessionId, name, type, payload) {
  let tasks = _loadTasks();
  const remoteHost = (payload && payload.remote_host) || _envState.remoteHost || '';
-  const sshPort = (payload && payload.ssh_port) || _getPort(remoteHost) || '';
+  const remoteServerKey = (payload && payload.remote_server_key) || '';
-  const platform = (payload && payload.platform) || _getPlatform(remoteHost) || '';
+  const remoteServerName = (payload && payload.remote_server_name) || '';
  const sshPort = (payload && payload.ssh_port) || _getPort(remoteServerKey || remoteHost) || '';
  const platform = (payload && payload.platform) || _getPlatform(remoteServerKey || remoteHost) || '';
  // Serving a model supersedes its finished download — clear the matching
  // finished download card (covers serving directly from the Serve tab, not just
  // via the download card's "Serve →" button).
@@ -697,7 +816,7 @@ export function _addTask(sessionId, name, type, payload) {
      return !(key && t.type === 'download' && t.status === 'queued' && _downloadDedupeKey(t) === key);
    });
  }
-  const task = _stripTaskSecrets({ id: sessionId, sessionId, name, type, status: 'running', output: '', ts: Date.now(), payload: payload || null, remoteHost, sshPort, platform });
+  const task = _redactTaskForStorage({ id: sessionId, sessionId, name, type, status: 'running', output: '', ts: Date.now(), payload: payload || null, remoteHost, remoteServerKey, remoteServerName, sshPort, platform });
  tasks.push(task);
  _saveTasks(tasks);
  // New action → collapse all other cards, leave only this one open.
@@ -992,14 +1111,24 @@ function _presetEnvFields(task) {
  };
 }
 // Return a shallow copy of a preset that is safe to persist: cmd/command
 // strings go through _redactStoredText and hf_token/hfToken are dropped.
 // Non-object input is returned unchanged.
 function _redactPresetForStorage(preset) {
  if (!preset || typeof preset !== 'object') return preset;
  const copy = { ...preset };
  for (const field of ['cmd', 'command']) {
    if (typeof copy[field] === 'string') copy[field] = _redactStoredText(copy[field]);
  }
  delete copy.hf_token;
  delete copy.hfToken;
  return copy;
 }
 function _saveTaskAsPreset(task, label) {
  const host = task.remoteHost || 'localhost';
  const portMatch = task.payload?._cmd?.match(/--port\s+(\d+)/);
  const port = portMatch ? portMatch[1] : '8000';
  const presets = _loadPresets();
  if (presets.some(p => p.cmd === task.payload._cmd)) return false;
-  presets.push({ name: task.name, model: task.payload.repo_id, backend: 'vllm', host, port, cmd: task.payload._cmd, remoteHost: task.remoteHost || '', label: label || task.name, ..._presetEnvFields(task) });
+  presets.push(_redactPresetForStorage({ name: task.name, model: task.payload.repo_id, backend: 'vllm', host, port, cmd: task.payload._cmd, remoteHost: task.remoteHost || '', label: label || task.name, ..._presetEnvFields(task) }));
-  _savePresets(presets);
+  _savePresets(presets.map(_redactPresetForStorage));
  return true;
 }
@@ -1042,7 +1171,7 @@ function _autoSaveWorkingConfig(task) {
  const existing = presets.find(p => p.cmd === cmd);
  if (existing) {
    task._autoSaved = true;
-    if (!existing.confirmedWorking) { existing.confirmedWorking = true; _savePresets(presets); }
+    if (!existing.confirmedWorking) { existing.confirmedWorking = true; _savePresets(presets.map(_redactPresetForStorage)); }
    return;   // already saved → just confirm it, no duplicate, no toast
  }
  // Respect the per-model cap the manual save flow uses (max 5).
@@ -1050,13 +1179,13 @@ function _autoSaveWorkingConfig(task) {
  const host = task.remoteHost || 'localhost';
  const portMatch = cmd.match(/--port[=\s]+(\d+)/);
  const port = portMatch ? portMatch[1] : '8000';
-  presets.push({
+  presets.push(_redactPresetForStorage({
    name: task.name, model, backend: 'vllm', host, port,
    cmd, remoteHost: task.remoteHost || '',
    label: _autoConfigLabel(task), confirmedWorking: true, autoSaved: true,
    ..._presetEnvFields(task),
-  });
+  }));
-  _savePresets(presets);
+  _savePresets(presets.map(_redactPresetForStorage));
  task._autoSaved = true;
  uiModule.showToast('Saved working config');
 }
@@ -1078,6 +1207,7 @@ function _syncToServer() {
      if (!_envState || !Array.isArray(_envState.servers) || _envState.servers.length === 0) return;
      const state = {
        tasks: _loadTasks(),
        removedTasks: _loadTombstones(),
        presets: _loadPresets(),
        env: _envState,
        serveState: null,
@@ -1126,15 +1256,22 @@ export async function _syncFromServer() {
    const localTasks = _loadTasks();
    const serverTasks = state.tasks || [];
    const serverTombstones = (state.removedTasks && typeof state.removedTasks === 'object') ? state.removedTasks : {};
    const localTombstones = _loadTombstones();
    const mergedTombstones = { ...serverTombstones, ...localTombstones };
    for (const [id, ts] of Object.entries(serverTombstones)) {
      if (localTombstones[id] == null || Number(ts) > Number(localTombstones[id])) mergedTombstones[id] = ts;
    }
    _saveTombstones(mergedTombstones);
    const localIds = new Set(localTasks.map(t => t.sessionId));
-    const merged = [...localTasks];
+    const merged = localTasks.filter(t => !_isTombstoned(t.sessionId));
    for (const t of serverTasks) {
      if (!localIds.has(t.sessionId) && !_isTombstoned(t.sessionId)) {
        merged.push(t);
      }
    }
-    localStorage.setItem(TASKS_KEY, JSON.stringify(merged.map(_stripTaskSecrets)));
+    localStorage.setItem(TASKS_KEY, JSON.stringify(merged.map(_redactTaskForStorage)));
    if (state.env) {
      // The active server selection (remoteHost + its env/path/platform) is a
@@ -1145,6 +1282,18 @@ export async function _syncFromServer() {
      const { remoteHost: _rh, env: _e, envPath: _ep, platform: _pf, ...settings } = state.env;
      delete settings.hfToken;
      Object.assign(_envState, settings);
      const selected = (_envState.remoteServerKey && _serverByVal?.(_envState.remoteServerKey))
        || (_envState.remoteHost ? (_envState.servers || []).find(s => s.host === _envState.remoteHost) : null);
      if (selected) {
        _envState.env = selected.env || 'none';
        _envState.envPath = selected.envPath || '';
        _envState.platform = selected.platform || '';
      } else if (!_envState.remoteHost) {
        const local = (_envState.servers || []).find(s => !s.host || s.host === 'local');
        _envState.env = local?.env || 'none';
        _envState.envPath = local?.envPath || '';
        _envState.platform = local?.platform || '';
      }
      const { hfToken, ...safeState } = _envState;
      localStorage.setItem('cookbook-last-state', JSON.stringify(safeState));
    }
@@ -1154,6 +1303,7 @@ export async function _syncFromServer() {
    if (state.serveState) {
      localStorage.setItem(SERVE_STATE_KEY, JSON.stringify(state.serveState));
    }
    document.dispatchEvent(new CustomEvent('cookbook:state-synced', { detail: state }));
    return true;
  } catch { return false; }
 }
@@ -1312,17 +1462,11 @@ async function _openServeEditForTask(task, cmdOverride, fieldOverrides = null) {
  if (fieldOverrides && typeof fieldOverrides === 'object') {
    fields = { ...(fields || {}), ...fieldOverrides };
  }
-  // Switch the active server to the one this serve ran on (mirrors _openEdit).
+  fields = { ...(fields || {}), _replaceTaskId: task.sessionId };
-  const _tHost = task.remoteHost || '';
+  // Switch the active server to the exact profile this serve ran on. The
-  _envState.remoteHost = _tHost;
+  // dropdown stores stable srv: keys, not raw host strings, so preserving only
-  const _tSrv = _serverByVal(_envState.remoteServerKey || _tHost)
+  // task.remoteHost can relaunch against the local container by accident.
-    || _envState.servers.find(s => s.host === _tHost);
+  _selectTaskServer(task);
  if (_tSrv) { _envState.env = _tSrv.env || 'none'; _envState.envPath = _tSrv.envPath || ''; _envState.platform = _tSrv.platform || ''; }
  else if (!_tHost) { _envState.env = 'none'; _envState.envPath = ''; _envState.platform = ''; }
  document.querySelectorAll('#hwfit-server-select, #hwfit-dl-server, #hwfit-cache-server, #hwfit-deps-server').forEach(sel => {
    if (!sel || sel.tagName !== 'SELECT') return;
    sel.value = _tHost || 'local';
  });
  try {
    const { openServePanelForRepo } = await import('./cookbookServe.js');
    await openServePanelForRepo(repo, fields);
@@ -1520,15 +1664,33 @@ function _parseServeCmdToFields(cmd) {
  return fields;
 }
-export async function _launchServeTask(shortName, repo, cmd, fields, hostOverride) {
+export async function _launchServeTask(shortName, repo, cmd, fields, hostOverride, targetMeta = null) {
  // Host resolution mirrors the download path: when the caller passes an explicit
  // host (resolved from the dropdown the user actually picked), use it and look
  // up that server's port/platform from the shared servers list. Only fall back
  // to _envState.remoteHost for legacy callers (diagnosis/pip-update).
  const _host = (hostOverride !== undefined) ? (hostOverride || '') : (_envState.remoteHost || '');
-  const _hsrv = _serverByVal(_envState.remoteServerKey || _host)
+  const _targetKey = targetMeta?.serverKey || '';
  const _hsrv = (_targetKey && _targetKey !== 'local' ? _serverByVal(_targetKey) : null)
    || (hostOverride === undefined ? _serverByVal(_envState.remoteServerKey || _host) : null)
    || _envState.servers.find(s => s.host === _host) || {};
  const _serverMetaKey = _targetKey || (_hsrv && _serverKey ? _serverKey(_hsrv) : '') || (_host || 'local');
  const _serverMetaName = targetMeta?.serverName || _hsrv.name || (_host ? _host : 'Local');
  const _hplatform = _host ? (_hsrv.platform || '') : (_envState.platform || '');
  const _replaceTaskId = fields?._replaceTaskId || '';
  if (_replaceTaskId) {
    try {
      const _old = _loadTasks().find(t => t.sessionId === _replaceTaskId);
      if (_old && _old.type === 'serve') {
        await fetch('/api/shell/exec', {
          method: 'POST', credentials: 'same-origin',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({ command: _tmuxGracefulKill(_old) }),
        });
        _removeTask(_old.sessionId);
      }
    } catch {}
  }
  // Replace any serve already targeting this same host:port — you can't run two
  // servers on one port, so re-serving (or retrying) should stop & remove the
@@ -1572,7 +1734,7 @@ export async function _launchServeTask(shortName, repo, cmd, fields, hostOverrid
    }
  } else {
    if (_envState.env === 'venv' && _envState.envPath) {
-      const p = _envState.envPath;
+      const p = _venvRootFromPath(_envState.envPath);
      envPrefix = 'source ' + (p.endsWith('/bin/activate') ? p : p + '/bin/activate');
    } else if (_envState.env === 'conda' && _envState.envPath) {
      envPrefix = 'eval "$(conda shell.bash hook)" && conda activate ' + _envState.envPath;
@@ -1583,7 +1745,7 @@ export async function _launchServeTask(shortName, repo, cmd, fields, hostOverrid
    repo_id: repo,
    cmd: cmd,
    remote_host: _host || undefined,
-    ssh_port: _getPort(_host) || undefined,
+    ssh_port: _getPort(_serverMetaKey || _host) || undefined,
    env_prefix: envPrefix || undefined,
    hf_token: _envState.hfToken || undefined,
    gpus: _envState.gpus || undefined,
@@ -1607,11 +1769,11 @@ export async function _launchServeTask(shortName, repo, cmd, fields, hostOverrid
      return;
    }
-    const _sp = _getPort(_host);
+    const _sp = _getPort(_serverMetaKey || _host);
    // _fields = the exact structured serve-form values used for this launch,
    // so the "Edit / relaunch" button can re-open the Serve panel pre-filled
    // with these precise settings (not just the last-used-for-repo state).
-    const payload = { repo_id: repo, remote_host: _host || undefined, ssh_port: _sp || undefined, _cmd: cmd, _fields: fields || undefined, _env: _usedEnv, _envPath: _usedEnvPath, _gpus: _usedGpus };
+    const payload = { repo_id: repo, remote_host: _host || undefined, remote_server_key: _serverMetaKey || undefined, remote_server_name: _serverMetaName || undefined, ssh_port: _sp || undefined, _cmd: cmd, _fields: fields || undefined, _env: _usedEnv, _envPath: _usedEnvPath, _gpus: _usedGpus };
    _addTask(data.session_id, shortName, 'serve', payload);
    uiModule.showToast(`Serving ${shortName}...`);
    // Auto-register may have enabled an existing (offline) endpoint for this
@@ -1726,7 +1888,7 @@ export function _renderRunningTab() {
      '<div style="display:flex;align-items:baseline;gap:8px;margin-bottom:2px;">' +
      '<h2 style="margin:0;padding:0;line-height:1;">Active <span id="running-count" class="memory-count" style="font-size:0.6em;opacity:0.6;font-weight:normal">' + activeCount + '</span></h2>' +
      '</div>' +
-      '<p class="memory-desc doclib-desc" style="margin-top:6px;">Active downloads and serving processes.</p>' +
+      '<p class="memory-desc doclib-desc" style="margin-top:6px;">Active downloads, installs and model launches.</p>' +
      '</div>';
    const firstGroup = body.querySelector('.cookbook-group');
    if (firstGroup) body.insertBefore(group, firstGroup);
@@ -1760,16 +1922,25 @@ export function _renderRunningTab() {
  }
  // Group tasks by server
-  const _serverName = (host) => {
+  const _taskServerKey = (task) => task?.remoteServerKey || task?.remoteHost || '';
-    if (!host) return 'Local';
+  const _serverName = (keyOrTask) => {
-    const srv = _serverByVal(_envState.remoteServerKey || host)
+    if (keyOrTask && typeof keyOrTask === 'object') {
-      || _envState.servers.find(s => s.host === host);
+      const task = keyOrTask;
-    return srv?.name || host;
+      if (task.remoteServerName) return task.remoteServerName;
      const srv = task.remoteServerKey ? _serverByVal(task.remoteServerKey) : null;
      if (srv?.name) return srv.name;
      if (!task.remoteHost) return 'Local';
      return (_envState.servers.find(s => s.host === task.remoteHost)?.name) || task.remoteHost;
    }
    const key = keyOrTask || '';
    if (!key || key === 'local') return 'Local';
    const srv = _serverByVal(key);
    return srv?.name || key;
  };
  const serverGroups = {};
  for (const t of tasks) {
-    const key = t.remoteHost || '';
+    const key = _taskServerKey(t);
-    if (!serverGroups[key]) serverGroups[key] = { name: _serverName(key), serve: [], download: [] };
+    if (!serverGroups[key]) serverGroups[key] = { name: _serverName(t), serve: [], download: [] };
    serverGroups[key][t.type === 'serve' ? 'serve' : 'download'].push(t);
  }
@@ -1816,12 +1987,12 @@ export function _renderRunningTab() {
      e.stopPropagation();  // don't toggle the section collapse (was an inline onclick, blocked by CSP)
      const host = btn.dataset.clearServer;
      const allTasks = _loadTasks();
-      const toRemove = allTasks.filter(t => (t.remoteHost || '') === host && _canClearTask(t));
+      const toRemove = allTasks.filter(t => _taskServerKey(t) === host && _canClearTask(t));
      // Bail with a clear message instead of silently doing nothing when
      // every task on this server is still running (nothing finished to
      // clear yet) — the previous behavior looked like the button was dead.
      if (!toRemove.length) {
-        const stillRunning = allTasks.filter(t => (t.remoteHost || '') === host && t.status === 'running').length;
+        const stillRunning = allTasks.filter(t => _taskServerKey(t) === host && t.status === 'running').length;
        const _msg = stillRunning
          ? `No finished tasks on ${_serverName(host)} — ${stillRunning} still running. Stop them first to clear.`
          : `No finished tasks on ${_serverName(host)}.`;
@@ -1830,7 +2001,8 @@ export function _renderRunningTab() {
        return;
      }
      if (!await window.styledConfirm(`Clear ${toRemove.length} finished task${toRemove.length === 1 ? '' : 's'} on ${_serverName(host)}?`, { confirmText: 'Clear' })) return;
-      const remaining = allTasks.filter(t => (t.remoteHost || '') !== host || !_canClearTask(t));
+      toRemove.forEach(t => _tombstoneTask(t.sessionId));
      const remaining = allTasks.filter(t => _taskServerKey(t) !== host || !_canClearTask(t));
      _saveTasks(remaining);
      // Fade/slide each finished card out (same exit as the per-card clear)
      // instead of yanking them instantly.
@@ -1864,7 +2036,7 @@ export function _renderRunningTab() {
    btn.addEventListener('click', async (e) => {
      e.stopPropagation();  // don't toggle the section collapse
      const host = btn.dataset.stopServer;
-      const running = _loadTasks().filter(t => (t.remoteHost || '') === host && t.status === 'running');
+      const running = _loadTasks().filter(t => _taskServerKey(t) === host && t.status === 'running');
      if (!running.length) { uiModule.showToast(`Nothing running on ${_serverName(host)}`); return; }
      if (!await window.styledConfirm(`Stop ${running.length} running task${running.length > 1 ? 's' : ''} on ${_serverName(host)}?`, { confirmText: 'Stop all' })) return;
      // Mark every task as user-stopped BEFORE firing the kills so that the
@@ -1967,11 +2139,12 @@ export function _renderRunningTab() {
    const _bdg = _taskBadge(task);
    const _bdgTitle = (task._unreachable && task.status === 'running') ? ' title="Server not responding — it may have crashed"' : '';
    const displayName = _taskDisplayName(task);
    el.innerHTML = `
      <div class="cookbook-task-header">
        <span class="cookbook-task-type${(task.status === 'done' && task.type === 'download') ? ' cookbook-task-type-done' : ''}" data-type="${esc(task.type)}">${esc((task.status === 'done' && task.type === 'download') ? 'finished' : task.type)}</span>
-        <span class="cookbook-task-name">${modelLogo(task.name)}${esc(task.name)}</span>
+        <span class="cookbook-task-name">${modelLogo(task.name)}${esc(displayName)}</span>
-        <span class="cookbook-task-indicator"><span class="cookbook-task-wave" style="display:${task.status === 'running' ? '' : 'none'}"></span><span class="cookbook-task-check" title="Clear" style="display:${_canClearTask(task) ? '' : 'none'}"><svg class="cookbook-task-check-ico" width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="#50fa7b" stroke-width="3" stroke-linecap="round" stroke-linejoin="round"><polyline points="20 6 9 17 4 12"/></svg><svg class="cookbook-task-clear-ico" width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="3" stroke-linecap="round" stroke-linejoin="round"><line x1="18" y1="6" x2="6" y2="18"/><line x1="6" y1="6" x2="18" y2="18"/></svg><span class="cookbook-task-done-label">${esc(_clearPillLabel(task))}</span><span class="cookbook-task-clear-label">clear</span></span></span>
+        <span class="cookbook-task-indicator"><span class="cookbook-task-wave" style="display:${task.status === 'running' ? '' : 'none'}"></span>${_canLaunchDownloadedTask(task) ? '<button type="button" class="cookbook-task-serve-btn" title="Open in Launch"><svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.4" stroke-linecap="round" stroke-linejoin="round"><polygon points="13 2 3 14 12 14 11 22 21 10 12 10 13 2"/></svg><span>Launch</span></button>' : ''}<span class="cookbook-task-check" title="Clear" style="display:${_canClearTask(task) ? '' : 'none'}"><svg class="cookbook-task-check-ico" width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="#50fa7b" stroke-width="3" stroke-linecap="round" stroke-linejoin="round"><polyline points="20 6 9 17 4 12"/></svg><svg class="cookbook-task-clear-ico" width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="3" stroke-linecap="round" stroke-linejoin="round"><line x1="18" y1="6" x2="6" y2="18"/><line x1="6" y1="6" x2="18" y2="18"/></svg><span class="cookbook-task-done-label">${esc(_clearPillLabel(task))}</span><span class="cookbook-task-clear-label">clear</span></span></span>
        <button type="button" class="cookbook-task-start-now" title="Start this queued download now" style="display:${(task.type === 'download' && task.status === 'queued') ? '' : 'none'}"><svg width="11" height="11" viewBox="0 0 24 24" fill="currentColor" aria-hidden="true"><polygon points="8 5 19 12 8 19 8 5"/></svg><span>start now</span></button>
        <span class="cookbook-task-status ${_bdg.cls}"${_bdgTitle}>${esc(_bdg.text)}</span>
        <button class="cookbook-task-menu-btn" title="Actions">&#8942;</button>
@@ -2043,19 +2216,11 @@ export function _renderRunningTab() {
          e.stopPropagation();
          const repo = task.payload?.repo_id || task.name;
          if (!repo) { uiModule.showToast('No model info on this task'); return; }
-          // Point the active server at the one it downloaded to.
+          // Point the active server at the exact profile it downloaded to.
-          const _tHost = task.remoteHost || '';
+          _selectTaskServer(task);
          _envState.remoteHost = _tHost;
          const _tSrv = _serverByVal(_envState.remoteServerKey || _tHost)
            || _envState.servers.find(s => s.host === _tHost);
          if (_tSrv) { _envState.env = _tSrv.env || 'none'; _envState.envPath = _tSrv.envPath || ''; _envState.platform = _tSrv.platform || ''; }
          else if (!_tHost) { _envState.env = 'none'; _envState.envPath = ''; _envState.platform = ''; }
          document.querySelectorAll('#hwfit-server-select, #hwfit-dl-server, #hwfit-cache-server, #hwfit-deps-server').forEach(sel => {
            if (sel && sel.tagName === 'SELECT') sel.value = _tHost || 'local';
          });
          try {
            const { openServePanelForRepo } = await import('./cookbookServe.js');
-            await openServePanelForRepo(repo);
+            await openServePanelForRepo(repo, _downloadServeFields(task));
            // Serving it supersedes the finished download — clear the card from
            // the Running tab (smooth exit) now that we've jumped to Serve.
            _animateOutThenRemove(el, task.sessionId);
@@ -2177,9 +2342,6 @@ export function _renderRunningTab() {
        if (task.status !== 'running' && task.status !== 'queued') {
          items.push({ group: 'run', label: 'Reconnect tmux', action: 'reconnect' });
        }
        if (task.status === 'running') {
          items.push({ group: 'run', label: 'Stop', action: 'stop', danger: true });
        }
        items.push({ group: 'run', label: 'Restart', action: 'retry' });
        // ── Edit section ────────────────────────────────────────────
        // Merged "Edit & relaunch" — opens the structured serve panel
@@ -2539,7 +2701,7 @@ export function _renderRunningTab() {
    });
    // Route to the right server section body
-    const serverBodyId = `server-body-${(task.remoteHost || 'local').replace(/[^a-zA-Z0-9-]/g, '_')}`;
+    const serverBodyId = `server-body-${(_taskServerKey(task) || 'local').replace(/[^a-zA-Z0-9-]/g, '_')}`;
    const targetBody = document.getElementById(serverBodyId);
    if (targetBody) targetBody.appendChild(el);
    else group.appendChild(el);
@@ -3393,7 +3555,8 @@ function _refreshServerDots() {
  let tasks;
  try { tasks = _loadTasks(); } catch { return; }
  const byKey = {};
-  for (const t of tasks) { (byKey[t.remoteHost || ''] = byKey[t.remoteHost || ''] || []).push(t); }
+  const _taskServerKeyForDot = (task) => task?.remoteServerKey || task?.remoteHost || '';
  for (const t of tasks) { (byKey[_taskServerKeyForDot(t)] = byKey[_taskServerKeyForDot(t)] || []).push(t); }
  document.querySelectorAll('.cookbook-section-header').forEach(header => {
    const dot = header.querySelector('.cookbook-srv-status');
    if (!dot) return;
@@ -3527,7 +3690,9 @@ async function _probeEndpointUntilOnline(epId, host, port) {
    try {
      // Hit the probe endpoint — it re-probes server-side and updates
      // cached_models. We consume (and discard) the SSE stream.
-      await fetch(`/api/model-endpoints/${epId}/probe`, { credentials: 'same-origin' }).then(r => r.text()).catch(() => {});
+      const probeRes = await fetch(`/api/model-endpoints/${epId}/probe`, { credentials: 'same-origin' }).catch(() => null);
      if (probeRes && probeRes.status === 404) return;
      if (probeRes) await probeRes.text().catch(() => {});
      const eps = await fetch('/api/model-endpoints', { credentials: 'same-origin' }).then(r => r.json()).catch(() => []);
      const ep = (eps || []).find(e => e.id === epId);
      if (ep && (ep.models || []).length) {
@@ -3565,7 +3730,7 @@ async function _pollBackgroundStatus() {
            }
          }
          if (added > 0) {
-            localStorage.setItem(TASKS_KEY, JSON.stringify(merged.map(_stripTaskSecrets)));
+            localStorage.setItem(TASKS_KEY, JSON.stringify(merged.map(_redactTaskForStorage)));
            _renderRunningTab();
          }
        }
@@ -3798,6 +3963,7 @@ export function initRunning(shared) {
  _persistEnvState = shared._persistEnvState;
  _refreshDependencies = shared._refreshDependencies;
  _serverByVal = shared._serverByVal;
  _serverKey = shared._serverKey;
  _selectedServer = shared._selectedServer;
  modelLogo = shared.modelLogo;
  esc = shared.esc;
@@ -24,6 +24,7 @@ import * as Modals from './modalManager.js';
  let _autoDetectDebounce = null;
  let _autoTitleDebounce = null;
  let _autoSaveDebounce = null;
  let _lastAutoSaveErrorAt = 0;
  let _animationInProgress = false;
  let _animationCancel = null;      // function to cancel current animation
  let _htmlPreviewActive = false;   // true when inline HTML preview iframe is showing
@@ -154,6 +155,20 @@ import * as Modals from './modalManager.js';
      addDocToTabs,
      syncDocIndicator: _syncDocIndicator,
    });
    const sidebarNewDocBtn = document.getElementById('library-new-doc-btn');
    if (sidebarNewDocBtn && !sidebarNewDocBtn.dataset.docNewWired) {
      sidebarNewDocBtn.dataset.docNewWired = '1';
      sidebarNewDocBtn.addEventListener('click', async (e) => {
        e.preventDefault();
        e.stopPropagation();
        try {
          await newDocument();
        } catch (err) {
          console.error('Failed to create document from sidebar button:', err);
          if (uiModule) uiModule.showError('Failed to create document');
        }
      });
    }
    _maybeOpenDocFromHash();
    window.addEventListener('hashchange', _maybeOpenDocFromHash);
  }
@@ -2686,6 +2701,104 @@ import * as Modals from './modalManager.js';
    await _uploadComposeFiles(files);
  }
  /**
   * Decide whether a picked / pasted / dropped file counts as an image.
   *
   * A MIME type starting with "image/" (case-insensitive) is accepted;
   * otherwise the file name must end in one of the listed image extensions.
   *
   * @param {{type?: string, name?: string}|null|undefined} file
   * @returns {boolean}
   */
  function _isMarkdownImageFile(file) {
    if (!file) return false;
    const mime = (file.type || '').toLowerCase();
    const hasImageExtension = /\.(avif|bmp|gif|jpe?g|png|svg|webp)$/i.test(file.name || '');
    return mime.startsWith('image/') || hasImageExtension;
  }
  /**
   * Derive Markdown-safe alt text from an uploaded file name.
   *
   * The final extension is dropped, then characters that would terminate or
   * corrupt the `![alt](url)` syntax are replaced with spaces: square
   * brackets, line breaks, and backslashes (a trailing `\` would otherwise
   * escape the closing `]`). Whitespace runs are collapsed and the result is
   * trimmed. Falls back to "image" whenever nothing usable remains.
   *
   * @param {string|null|undefined} name  Original file name.
   * @returns {string} Non-empty alt text.
   */
  function _markdownImageAlt(name) {
    const base = String(name || 'image').replace(/\.[^.]+$/, '').trim() || 'image';
    return base.replace(/[\[\]\\\n\r]/g, ' ').replace(/\s+/g, ' ').trim() || 'image';
  }
  /**
   * Lower-cased language of the active document.
   *
   * Prefers the `language` stored on the active entry in `docs`; when that is
   * missing it falls back to the value of the #doc-language-select element,
   * and finally to the empty string.
   *
   * @returns {string}
   */
  function _activeDocLanguage() {
    const current = activeDocId && docs.get(activeDocId);
    const stored = current && current.language;
    const language = stored || document.getElementById('doc-language-select')?.value || '';
    return language.toLowerCase();
  }
  /**
   * Refresh editor chrome after a programmatic edit and re-arm the debounced
   * follow-ups: syntax highlighting (80 ms), auto-title (600 ms) and a silent
   * save (800 ms).
   *
   * @param {HTMLTextAreaElement} ta  The editor textarea that was just modified.
   */
  function _scheduleMarkdownImageAutosave(ta) {
    updateLineNumbers(ta.value);

    // Mirror the text into the highlight layer unless it is flagged as
    // carrying a diff (dataset.hasDiff), in which case it is left untouched.
    const mirror = document.getElementById('doc-editor-code');
    const canMirror = mirror ? !mirror.dataset.hasDiff : false;
    if (canMirror) {
      mirror.textContent = `${ta.value}\n`;
      mirror.style.minHeight = `${ta.scrollHeight}px`;
    }

    clearTimeout(_hlDebounce);
    _hlDebounce = setTimeout(syncHighlighting, 80);

    clearTimeout(_autoTitleDebounce);
    // Reads ta.value when the timer fires, not now.
    _autoTitleDebounce = setTimeout(() => autoTitleFromContent(ta.value), 600);

    clearTimeout(_autoSaveDebounce);
    _autoSaveDebounce = setTimeout(() => {
      saveDocument({ silent: true });
    }, 800);
  }
  /**
   * Insert Markdown image references for uploaded files at the editor's
   * current selection, replacing any selected text.
   *
   * Each entry contributes `![alt](/api/upload/<id>)`; entries without an
   * `id` / `file_id` are ignored. References are separated by a blank line,
   * and a newline is added before/after the inserted text when the
   * surrounding text does not already provide one. Afterwards the caret is
   * placed at the end of the insertion, the textarea is focused, and the
   * autosave / preview refresh helpers are invoked.
   *
   * @param {Array<{id?: string, file_id?: string, name?: string, filename?: string}>} uploadedFiles
   */
  function _insertMarkdownImages(uploadedFiles) {
    const editor = document.getElementById('doc-editor-textarea');
    if (!editor) return;
    if (!Array.isArray(uploadedFiles) || uploadedFiles.length === 0) return;

    const selStart = editor.selectionStart || 0;
    const selEnd = editor.selectionEnd || selStart;
    const head = editor.value.slice(0, selStart);
    const tail = editor.value.slice(selEnd);

    const snippets = [];
    uploadedFiles.forEach((entry) => {
      const encodedId = encodeURIComponent(entry.id || entry.file_id || '');
      const altText = _markdownImageAlt(entry.name || entry.filename);
      if (encodedId) snippets.push(`![${altText}](/api/upload/${encodedId})`);
    });
    if (snippets.length === 0) return;

    // Keep the images on their own lines relative to the surrounding text.
    const leadingBreak = head && !head.endsWith('\n') ? '\n' : '';
    const trailingBreak = tail && !tail.startsWith('\n') ? '\n' : '';
    const block = leadingBreak + snippets.join('\n\n') + trailingBreak;

    _replaceRange(editor, selStart, selEnd, block);

    const caretPos = selStart + block.length;
    editor.selectionStart = caretPos;
    editor.selectionEnd = caretPos;
    editor.focus();

    _scheduleMarkdownImageAutosave(editor);
    _refreshMarkdownPreviewIfVisible(activeDocId, editor.value);
  }
  /**
   * Upload image files and insert them into the active markdown document.
   *
   * Non-image entries are dropped first. Nothing is uploaded (and an error is
   * shown) when no image remains or when the active document's language is
   * not "markdown". On success the files reported by the server are passed to
   * `_insertMarkdownImages` and a toast is shown; any failure is logged and
   * surfaced as a generic error.
   *
   * @param {FileList|File[]|null|undefined} files
   * @returns {Promise<void>}
   */
  async function _uploadMarkdownImages(files) {
    const images = Array.from(files || []).filter((candidate) => _isMarkdownImageFile(candidate));
    if (images.length === 0) {
      if (uiModule) uiModule.showError('Choose an image file');
      return;
    }
    if (_activeDocLanguage() !== 'markdown') {
      if (uiModule) uiModule.showError('Switch the document to markdown before inserting images');
      return;
    }

    const form = new FormData();
    for (const image of images) form.append('files', image);

    try {
      const response = await fetch(`${API_BASE}/api/upload`, {
        method: 'POST',
        credentials: 'same-origin',
        body: form,
      });

      // The body may not be JSON (e.g. on an error page); treat that as "no payload".
      let payload = null;
      try {
        payload = await response.json();
      } catch (_) {}

      if (!response.ok) {
        const reason = (payload && (payload.error || payload.detail)) || `HTTP ${response.status}`;
        throw new Error(reason);
      }

      const uploaded = Array.isArray(payload?.files) ? payload.files : [];
      if (uploaded.length === 0) throw new Error('No uploaded files returned');

      _insertMarkdownImages(uploaded);
      if (uiModule) {
        const message = images.length === 1 ? 'Image inserted' : 'Images inserted';
        uiModule.showToast(message);
      }
    } catch (err) {
      console.error('Failed to insert markdown image:', err);
      if (uiModule) uiModule.showError('Failed to insert image');
    }
  }
  /**
   * `change` handler for the hidden markdown image <input type="file">.
   *
   * The selected files are copied into a plain array BEFORE the input is
   * reset. `input.files` is the input's own FileList, and resetting `value`
   * clears the selection; holding only a reference to that list risks handing
   * an emptied list to the uploader. The reset itself is kept so picking the
   * same file twice in a row still fires `change`.
   *
   * @param {Event} e  Change event whose target is the file input.
   * @returns {Promise<void>}
   */
  async function _handleMarkdownImageUpload(e) {
    const input = e.target;
    const picked = Array.from(input.files || []);
    input.value = '';
    await _uploadMarkdownImages(picked);
  }
  function _renderComposeAttachments() {
    const container = document.getElementById('doc-email-compose-atts');
    if (!container) return;
@@ -3752,9 +3865,12 @@ import * as Modals from './modalManager.js';
      const res = await fetch(`${API_BASE}/api/document`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        credentials: 'same-origin',
        body: JSON.stringify({ session_id: sessionId, title: '', content }),
      });
      if (!res.ok) throw new Error(`Document create failed: HTTP ${res.status}`);
      const doc = await res.json();
      if (!doc || !doc.id) throw new Error('Document create failed: missing id');
      addDocToTabs(doc, sessionId);
      // Set the content into the map so switchToDoc preserves it
      const d = docs.get(doc.id);
@@ -3981,6 +4097,7 @@ import * as Modals from './modalManager.js';
        <input type="hidden" id="doc-email-source-folder" />
        <input type="file" id="doc-email-file-input" multiple style="display:none" />
      </div>
      <input type="file" id="doc-md-image-input" accept="image/*" multiple style="display:none" />
      <div class="doc-md-toolbar" id="doc-md-toolbar" style="display:none">
        <div class="md-toolbar-items" id="md-toolbar-items">
          <span class="md-view-toggle" id="doc-md-view-toggle" style="display:none" role="group" aria-label="Edit or preview">
@@ -4003,7 +4120,7 @@ import * as Modals from './modalManager.js';
          <button type="button" class="md-dd-toggle" data-dd="list" title="List"><span style="font-variant-numeric:tabular-nums;">1.</span><svg width="8" height="8" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="3.5" stroke-linecap="round" stroke-linejoin="round"><polyline points="6 9 12 15 18 9"/></svg></button>
          <span class="md-toolbar-sep"></span>
          <button type="button" data-md="link" title="Link"><svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M10 13a5 5 0 0 0 7.54.54l3-3a5 5 0 0 0-7.07-7.07l-1.72 1.71"/><path d="M14 11a5 5 0 0 0-7.54-.54l-3 3a5 5 0 0 0 7.07 7.07l1.71-1.71"/></svg></button>
-          <button type="button" id="md-toolbar-attach-btn" class="md-toolbar-attach-btn" title="Attach files"><svg width="13" height="13" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="m21.44 11.05-9.19 9.19a6 6 0 0 1-8.49-8.49l8.57-8.57A4 4 0 1 1 17.93 8.8l-8.59 8.57a2 2 0 0 1-2.83-2.83l8.49-8.48"/></svg></button>
+          <button type="button" id="md-toolbar-attach-btn" class="md-toolbar-attach-btn" title="Insert image"><svg width="13" height="13" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="m21.44 11.05-9.19 9.19a6 6 0 0 1-8.49-8.49l8.57-8.57A4 4 0 1 1 17.93 8.8l-8.59 8.57a2 2 0 0 1-2.83-2.83l8.49-8.48"/></svg></button>
          <button type="button" class="md-dd-toggle md-toolbar-email-hide" data-dd="code" title="Code">\`<svg width="8" height="8" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="3.5" stroke-linecap="round" stroke-linejoin="round"><polyline points="6 9 12 15 18 9"/></svg></button>
          <button type="button" data-md="hr" title="Horizontal rule">—</button>
          <span class="md-toolbar-sep"></span>
@@ -4602,9 +4719,14 @@ import * as Modals from './modalManager.js';
      document.getElementById('doc-email-file-input')?.click();
    });
    document.getElementById('md-toolbar-attach-btn')?.addEventListener('click', () => {
-      document.getElementById('doc-email-file-input')?.click();
+      if (_activeDocLanguage() === 'email') {
        document.getElementById('doc-email-file-input')?.click();
      } else {
        document.getElementById('doc-md-image-input')?.click();
      }
    });
    document.getElementById('doc-email-file-input')?.addEventListener('change', _handleAttachUpload);
    document.getElementById('doc-md-image-input')?.addEventListener('change', _handleMarkdownImageUpload);
    // Cc/Bcc toggle
    document.getElementById('doc-email-show-cc')?.addEventListener('click', () => {
@@ -4840,6 +4962,26 @@ import * as Modals from './modalManager.js';
        clearTimeout(_autoSaveDebounce);
        _autoSaveDebounce = setTimeout(() => { saveDocument({ silent: true }); }, 2000);
      });
      ta.addEventListener('paste', (e) => {
        if (_activeDocLanguage() !== 'markdown') return;
        const files = Array.from(e.clipboardData?.files || []).filter(_isMarkdownImageFile);
        if (!files.length) return;
        e.preventDefault();
        _uploadMarkdownImages(files);
      });
      ta.addEventListener('dragover', (e) => {
        if (_activeDocLanguage() !== 'markdown') return;
        const items = Array.from(e.dataTransfer?.items || []);
        if (!items.some(item => item.kind === 'file' && /^image\//i.test(item.type || ''))) return;
        e.preventDefault();
      });
      ta.addEventListener('drop', (e) => {
        if (_activeDocLanguage() !== 'markdown') return;
        const files = Array.from(e.dataTransfer?.files || []).filter(_isMarkdownImageFile);
        if (!files.length) return;
        e.preventDefault();
        _uploadMarkdownImages(files);
      });
      ta.addEventListener('scroll', () => {
        const code = document.getElementById('doc-editor-code');
        if (code) code.style.minHeight = ta.scrollHeight + 'px';
@@ -5548,7 +5690,7 @@ import * as Modals from './modalManager.js';
    // any dropdown that just opened. Preventing the default mousedown keeps the
    // textarea focused, so formatting hits the live selection and menus stay up.
    toolbar.addEventListener('mousedown', (e) => {
-      if (e.target.closest('[data-md], .md-dd-toggle, .emoji-picker-btn')) e.preventDefault();
+      if (e.target.closest('[data-md], .md-dd-toggle, .emoji-picker-btn, .md-toolbar-attach-btn')) e.preventDefault();
    });
    toolbar.addEventListener('click', (e) => {
@@ -5976,6 +6118,7 @@ import * as Modals from './modalManager.js';
      const res = await fetch(`${API_BASE}/api/document`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        credentials: 'same-origin',
        body: JSON.stringify({
          session_id: sessionId,
          title: '',
@@ -5983,7 +6126,9 @@ import * as Modals from './modalManager.js';
          language: 'markdown',
        }),
      });
      if (!res.ok) throw new Error(`Document create failed: HTTP ${res.status}`);
      const doc = await res.json();
      if (!doc || !doc.id) throw new Error('Document create failed: missing id');
      addDocToTabs(doc, sessionId);
      if (!isOpen) openPanel();
      // Re-enable editor if it was in empty state
@@ -8266,8 +8411,10 @@ import * as Modals from './modalManager.js';
      const res = await fetch(`${API_BASE}/api/document/${activeDocId}`, {
        method: 'PUT',
        headers: { 'Content-Type': 'application/json' },
        credentials: 'same-origin',
        body: JSON.stringify({ content: textarea.value }),
      });
      if (!res.ok) throw new Error(`Document save failed: HTTP ${res.status}`);
      const doc = await res.json();
      const badge = document.getElementById('doc-version-badge');
      if (badge) { const _v = doc.version_count || 1; badge.textContent = `v${_v}`; badge.style.display = _v > 1 ? '' : 'none'; }
@@ -8280,7 +8427,11 @@ import * as Modals from './modalManager.js';
      if (!silent && uiModule) uiModule.showToast('Document saved');
    } catch (e) {
      console.error('Failed to save document:', e);
-      if (!silent && uiModule) uiModule.showError('Failed to save document');
+      const now = Date.now();
      if (uiModule && (!silent || now - _lastAutoSaveErrorAt > 10000)) {
        uiModule.showError(silent ? 'Autosave failed' : 'Failed to save document');
        _lastAutoSaveErrorAt = now;
      }
    }
  }
@@ -2936,6 +2936,20 @@ function _createCard(em) {
    titleRow.appendChild(att);
  }
  const tags = Array.isArray(em.tags) ? em.tags : [];
  if (tags.length || em.is_spam_verdict) {
    const tagWrap = document.createElement('span');
    tagWrap.className = 'email-tags email-card-tags';
    tagWrap.innerHTML = tags.map(t => {
      const tag = String(t || '').trim().toLowerCase().replace(/_/g, '-');
      return tag ? `<span class="email-tag email-tag-${_esc(tag)}">${_esc(tag)}</span>` : '';
    }).join('');
    if (em.is_spam_verdict) {
      tagWrap.insertAdjacentHTML('beforeend', '<span class="email-tag email-tag-spam">spam</span>');
    }
    titleRow.appendChild(tagWrap);
  }
  // Done check + unread dot stay next to the subject on the left.
  const isSentFolder = /sent/i.test(state._libFolder);
  if (!isSentFolder) {
@@ -4560,11 +4574,12 @@ function _wireAttachmentHandlers(reader, folder) {
      const uid = openBtn.dataset.openUid;
      const index = openBtn.dataset.openIndex;
      const name = openBtn.dataset.openName || `attachment-${index}`;
      const sourceFolder = openBtn.dataset.openFolder || useFolder;
      if (!uid || index == null) return;
      const orig = openBtn.style.opacity;
      openBtn.style.opacity = '0.4';
      try {
-        const folderQs = encodeURIComponent(useFolder);
+        const folderQs = encodeURIComponent(sourceFolder);
        const res = await fetch(
          `${API_BASE}/api/email/attachment-as-doc/${encodeURIComponent(uid)}/${encodeURIComponent(index)}?folder=${folderQs}${_acct()}`,
          { method: 'POST', credentials: 'same-origin' }
@@ -4618,8 +4633,9 @@ function _wireAttachmentHandlers(reader, folder) {
      const uid = chip.dataset.attUid;
      const index = chip.dataset.attIndex;
      const name = chip.dataset.attName || `attachment-${index}`;
      const sourceFolder = chip.dataset.attFolder || useFolder;
      if (!uid || index == null) return;
-      const url = `${API_BASE}/api/email/attachment/${encodeURIComponent(uid)}/${encodeURIComponent(index)}?folder=${encodeURIComponent(useFolder)}${_acct()}`;
+      const url = `${API_BASE}/api/email/attachment/${encodeURIComponent(uid)}/${encodeURIComponent(index)}?folder=${encodeURIComponent(sourceFolder)}${_acct()}`;
      if (_isMobileUA) {
        window.open(url, '_blank');
        return;
@@ -4698,25 +4714,50 @@ function _isLikelySignatureImage(a) {
 // Build the attachments header+chips HTML for an email read response. Pulled
 // out so both the initial-open and the swap-reader paths can render it.
 function _buildAttsHtmlFor(uid, data) {
-  if (!data || !data.attachments || !data.attachments.length) return '';
+  if (!data) return '';
-  const _OPENABLE_RE = /\.(pdf|docx|txt|md|markdown)$/i;
+  const _OPENABLE_RE = /\.(pdf|docx|txt|md|markdown|eml)$/i;
-  const visible = data.attachments.filter(a => !_isLikelySignatureImage(a));
+  const currentAttachments = Array.isArray(data.attachments) ? data.attachments : [];
-  if (!visible.length) return '';
+  const relatedAttachments = Array.isArray(data.related_attachments) ? data.related_attachments : [];
-  const chips = visible.map(a => {
+  if (!currentAttachments.length && !relatedAttachments.length) return '';
  const visible = currentAttachments.filter(a => !_isLikelySignatureImage(a));
  const hidden = currentAttachments.filter(a => _isLikelySignatureImage(a));
  const related = relatedAttachments.filter(a => !_isLikelySignatureImage(a));
  const renderChip = (a, extraClass = '') => {
    const openable = _OPENABLE_RE.test(a.filename || '');
    const chipUid = a.source_uid || a.uid || uid;
    const chipFolder = a.source_folder || data.folder || state._libFolder || 'INBOX';
    const openBtn = openable
-      ? `<span class="email-attachment-open" title="Open in document editor" data-open-uid="${_esc(uid)}" data-open-index="${a.index}" data-open-name="${_esc(a.filename)}"><svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M14 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V8z"/><polyline points="14 2 14 8 20 8"/><line x1="8" y1="13" x2="16" y2="13"/><line x1="8" y1="17" x2="16" y2="17"/><line x1="8" y1="9" x2="10" y2="9"/></svg><span class="email-attachment-open-label">Open</span></span>`
+      ? `<span class="email-attachment-open" title="Open in document editor" data-open-uid="${_esc(chipUid)}" data-open-index="${a.index}" data-open-name="${_esc(a.filename)}" data-open-folder="${_esc(chipFolder)}"><svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M14 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V8z"/><polyline points="14 2 14 8 20 8"/><line x1="8" y1="13" x2="16" y2="13"/><line x1="8" y1="17" x2="16" y2="17"/><line x1="8" y1="9" x2="10" y2="9"/></svg><span class="email-attachment-open-label">Open</span></span>`
      : '';
-    return `<button type="button" class="email-attachment-chip" data-att-uid="${_esc(uid)}" data-att-index="${a.index}" data-att-name="${_esc(a.filename)}"><svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="m21.44 11.05-9.19 9.19a6 6 0 0 1-8.49-8.49l8.57-8.57A4 4 0 1 1 17.93 8.8l-8.59 8.57a2 2 0 0 1-2.83-2.83l8.49-8.48"/></svg><span>${_esc(a.filename)}</span><span class="att-size">${Math.round((a.size||0)/1024)} KB</span>${openBtn}</button>`;
+    return `<button type="button" class="email-attachment-chip${extraClass}" data-att-uid="${_esc(chipUid)}" data-att-index="${a.index}" data-att-name="${_esc(a.filename)}" data-att-folder="${_esc(chipFolder)}"><svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="m21.44 11.05-9.19 9.19a6 6 0 0 1-8.49-8.49l8.57-8.57A4 4 0 1 1 17.93 8.8l-8.59 8.57a2 2 0 0 1-2.83-2.83l8.49-8.48"/></svg><span>${_esc(a.filename)}</span><span class="att-size">${Math.round((a.size||0)/1024)} KB</span>${openBtn}</button>`;
-  }).join('');
+  };
  const chips = visible.map(a => renderChip(a)).join('');
  const hiddenChips = hidden.map(a => renderChip(a, ' email-attachment-chip-muted')).join('');
  const relatedChips = related.map(a => renderChip(a, ' email-attachment-chip-related')).join('');
  const visibleSection = visible.length
    ? '<div class="email-reader-atts">' + chips + '</div>'
    : '';
  const relatedSection = related.length
    ? '<div class="email-reader-atts-hidden-note">From earlier in this thread</div><div class="email-reader-atts email-reader-atts-related">' + relatedChips + '</div>'
    : '';
  const hiddenSection = hidden.length
    ? '<div class="email-reader-atts-hidden-note">Filtered inline images / signature files</div><div class="email-reader-atts email-reader-atts-hidden">' + hiddenChips + '</div>'
    : '';
  const label = visible.length
    ? `Attachments (${visible.length + related.length})`
    : related.length
      ? `Thread attachments (${related.length})`
      : `Hidden inline attachments (${hidden.length})`;
  return (
    '<div class="email-reader-atts-wrap collapsed">'
    +   '<div class="email-reader-atts-header email-summary-toggle" role="button" tabindex="0">'
    +     '<svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="m21.44 11.05-9.19 9.19a6 6 0 0 1-8.49-8.49l8.57-8.57A4 4 0 1 1 17.93 8.8l-8.59 8.57a2 2 0 0 1-2.83-2.83l8.49-8.48"/></svg>'
-    +     `<span>Attachments (${data.attachments.length})</span>`
+    +     `<span>${label}</span>`
    +     '<svg class="email-summary-chevron" width="10" height="10" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round" style="margin-left:auto;transition:transform .15s ease;"><polyline points="6 9 12 15 18 9"/></svg>'
    +   '</div>'
-    +   '<div class="email-reader-atts">' + chips + '</div>'
+    +   visibleSection
    +   relatedSection
    +   hiddenSection
    + '</div>'
  );
 }
@@ -36,6 +36,14 @@ function linkHtml(text, url) {
  return `<a href="${escapeHtml(safeUrl)}" target="_blank" rel="noopener noreferrer">${safeText}</a>`;
 }
 // Render a markdown image as an <img> tag. The URL goes through safeLinkUrl
 // first; when that yields nothing, or a fragment-only ("#...") target, only
 // the escaped alt text is returned instead of an image element.
 function imageHtml(alt, url, title) {
  const src = safeLinkUrl(url);
  const altText = escapeHtml(alt || '');
  if (!src || src.startsWith('#')) return altText;
  // The title attribute is optional; omit it entirely when no title was given.
  const titleAttr = title ? ` title="${escapeHtml(title)}"` : '';
  return `<img src="${escapeHtml(src)}" alt="${altText}"${titleAttr} loading="lazy" decoding="async">`;
 }
 function _isModelEndpointUrl(rawUrl) {
  try {
    const parsed = new URL(String(rawUrl || ''), window.location.origin);
@@ -146,7 +154,7 @@ function sanitizeAllowedHtml(html) {
 * Check if text has unclosed think tag
 */
 export function hasUnclosedThinkTag(text) {
-  text = text || '';
+  text = normalizeThinkingMarkup(text || '');
  const openCount =
    (text.match(/<(?:think(?:ing)?|thought)(?:\s+[^>]*)?>/gi) || []).length
    + (text.match(/<\|channel>thought/gi) || []).length;
@@ -163,6 +171,10 @@ export function startsWithReasoningPrefix(text) {
 export function normalizeThinkingMarkup(text) {
  if (!text) return text;
  let normalized = text;
  // MiniMax M-series can emit namespaced reasoning tags like
  // <mm:think>...</mm:think>. Normalize them into the shared thinking parser.
  normalized = normalized.replace(/<mm:think(\s+[^>]*)?>/gi, (_m, attrs = '') => `<think${attrs || ''}>`);
  normalized = normalized.replace(/<\/mm:think>/gi, '</think>');
  normalized = normalized.replace(/<thought(\s+[^>]*)?>/gi, (_m, attrs = '') => `<think${attrs || ''}>`);
  normalized = normalized.replace(/<\/thought>/gi, '</think>');
  normalized = normalized.replace(/<\|channel>thought\s*\n?([\s\S]*?)<channel\|>\s*/gi, (_m, content = '') => {
@@ -535,6 +547,12 @@ export function mdToHtml(src, opts) {
    '$1[#$2](#$2)',
  );
  // Convert markdown images before links so ![alt](url) does not become
  // literal "!" plus a normal link.
  s = s.replace(/!\[([^\]\n]*)\]\(([^)\s]+)(?:\s+"([^"]*)")?\)/g, (match, alt, url, title) => {
    return imageHtml(alt, url, title);
  });
  // Convert markdown links [text](url) to clickable links
  // Internal #hash links navigate in-page; external links open in new tab
  s = s.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (match, text, url) => {
@@ -573,8 +591,9 @@ export function mdToHtml(src, opts) {
    return placeholder;
  });
-  // ALSO preserve <a> tags the same way (they're now in the HTML from markdown conversion)
+  // ALSO preserve <a>/<img> tags the same way (they're now in the HTML from
-  s = s.replace(/<a\s+[^>]*>.*?<\/a>/gi, (match) => {
+  // markdown conversion)
  s = s.replace(/<(?:a\s+[^>]*>.*?<\/a|img\s+[^>]*?)>/gi, (match) => {
    const placeholder = `___ALLOWED_HTML_${allowedHtmlBlocks.length}___`;
    allowedHtmlBlocks.push(sanitizeAllowedHtml(match));
    return placeholder;
@@ -77,6 +77,7 @@ function _handlePickerKeydown(e, listEl, itemSelector, closeFn) {
 // Dependencies injected via initModelPicker()
 let _deps = null;
 let _autoSelectingDefault = false;
 let _defaultChatPickInFlight = false;
 function _modelExists(modelId, url) {
  if (!modelId || !window.modelsModule || !window.modelsModule.getCachedItems) return false;
@@ -91,6 +92,43 @@ function _modelExists(modelId, url) {
  });
 }
 // Seed the pending (not-yet-created) chat with a model when none is chosen.
 // Prefers the answer from /api/default-chat; when that yields no usable
 // endpoint/model pair, falls back to the first model of the first cached
 // endpoint that is not flagged offline. Does nothing when a session is
 // current, a pending model is already set, or another call is in flight.
 async function _ensureDefaultPendingChat() {
  if (!_deps || _defaultChatPickInFlight) return;
  // True while there is neither a current session nor a pending model.
  const stillNeedsDefault = () => {
    if (_deps.getCurrentSessionId && _deps.getCurrentSessionId()) return false;
    const pending = _deps.getPendingChat && _deps.getPendingChat();
    return !(pending && pending.modelId);
  };
  if (!stillNeedsDefault()) return;
  _defaultChatPickInFlight = true;
  try {
    let dc = null;
    try {
      const res = await fetch(`${API_BASE}/api/default-chat`, { credentials: 'same-origin' });
      if (res.ok) dc = await res.json();
    } catch (_) {
      // Best-effort lookup: a failed request just means "no configured default".
    }
    // The await above yielded control, so a session or model may have been
    // chosen in the meantime. Re-check so an explicit choice is not clobbered.
    if (!stillNeedsDefault()) return;
    if (dc && dc.endpoint_url && dc.model) {
      _deps.setPendingChat({
        url: dc.endpoint_url,
        modelId: dc.model,
        endpointId: dc.endpoint_id || '',
      });
      try { window.__odysseusDefaultChat = dc; } catch (_) {}
      // Re-entrancy is blocked by _defaultChatPickInFlight, which is still set.
      updateModelPicker();
      return;
    }
    // No configured default: preserve the old convenience fallback.
    if (window.modelsModule && window.modelsModule.getCachedItems) {
      const items = window.modelsModule.getCachedItems() || [];
      const first = items.find(item => !item.offline && ((item.models || []).length || (item.models_extra || []).length));
      if (first) {
        const models = (first.models || []).concat(first.models_extra || []);
        _deps.setPendingChat({ url: first.url, modelId: models[0], endpointId: first.endpoint_id });
        updateModelPicker();
      }
    }
  } finally {
    _defaultChatPickInFlight = false;
  }
 }
 /**
 * Initialize the model picker dropdown.
 * @param {Object} deps
@@ -112,6 +150,7 @@ function _initModelPickerDropdown() {
  const search = document.getElementById('model-picker-search');
  const listEl = document.getElementById('model-picker-list');
  const searchRow = menu ? menu.querySelector('.model-picker-search-row') : null;
  const refreshBtn = document.getElementById('model-picker-refresh-btn');
  if (!wrap || !btn || !menu || !search || !listEl) return;
  function _close() {
@@ -608,6 +647,26 @@ function _initModelPickerDropdown() {
  search.addEventListener('input', () => _populate(search.value));
  search.addEventListener('click', (e) => e.stopPropagation());
  if (refreshBtn) {
    refreshBtn.addEventListener('click', async (e) => {
      e.stopPropagation();
      refreshBtn.disabled = true;
      refreshBtn.classList.add('spinning');
      try {
        if (window.modelsModule && window.modelsModule.refreshModels) {
          await window.modelsModule.refreshModels(true);
        }
        await _refreshLocalProbe();
        if (!menu.classList.contains('hidden')) _populate(search.value || '');
        updateModelPicker();
      } catch (_) {
        uiModule.showToast('Model refresh failed');
      } finally {
        refreshBtn.disabled = false;
        refreshBtn.classList.remove('spinning');
      }
    });
  }
  search.addEventListener('keydown', (e) => {
    _handlePickerKeydown(e, listEl, '.model-switch-item', _close);
  });
@@ -689,25 +748,7 @@ export function updateModelPicker() {
    }
  }
  if (!modelId && !_autoSelectingDefault && window.modelsModule && window.modelsModule.getCachedItems) {
-    const items = window.modelsModule.getCachedItems();
+    _ensureDefaultPendingChat();
    const first = items.find(item => !item.offline && ((item.models || []).length || (item.models_extra || []).length));
    if (first) {
      const models = (first.models || []).concat(first.models_extra || []);
      modelId = models[0];
      if (!currentSessionId) {
        _deps.setPendingChat({ url: first.url, modelId, endpointId: first.endpoint_id });
      } else {
        if (s) { s.model = modelId; s.endpoint_url = first.url; }
        _autoSelectingDefault = true;
        const fd = new FormData();
        fd.append('model', modelId);
        fd.append('endpoint_url', first.url || '');
        if (first.endpoint_id) fd.append('endpoint_id', first.endpoint_id);
        fetch(`${API_BASE}/api/session/${currentSessionId}`, { method: 'PATCH', body: fd })
          .catch(() => {})
          .finally(() => { _autoSelectingDefault = false; });
      }
    }
  }
  const displayName = modelId ? modelId.split('/').pop() : 'Select model';
@@ -608,7 +608,7 @@ function _isNoteFullyDone(note) {
 // A "checklist note" — todo or goal — has structured items[] that the cards
 // render as checkboxes and that "fully done" / progress logic reads from.
 function _hasItems(note) {
-  return note && (note.note_type === 'todo' || note.note_type === 'goal');
+  return note && (note.note_type === 'todo' || note.note_type === 'goal' || note.note_type === 'checklist');
 }
 // Compact " N/M" progress string for a goal's checklist. Empty when the goal
@@ -1120,8 +1120,6 @@ export function openPanel() {
  }
  _open = true;
  _editingId = null;
  // Reset the search filter — the rebuilt pane's search input renders empty, so a
  // stale _searchQuery would silently hide non-matching notes after a reopen.
  _searchQuery = '';
  _clearViewedReminderGlows();
  _firedDotDismissedAt = Date.now();
@@ -1822,10 +1820,20 @@ function _renderNotes() {
      for (let i = 0; i < note.items.length; i++) {
        const item = note.items[i];
        const doneClass = item.done ? ' done' : '';
        const agentStatus = (item.agent_status || '').toLowerCase();
        const agentDoneClass = agentStatus === 'stream_complete' ? ' is-agent-stream-complete' : '';
        const agentTitle = agentStatus === 'stream_complete'
          ? 'Agent stream finished for this todo'
          : (agentStatus === 'running' ? 'Agent is working on this todo' : 'Solve this todo with the agent');
        const agentSessionAttr = item.agent_session_id ? ` data-session-id="${_attrEsc(item.agent_session_id)}"` : '';
        const agentMenuTitle = item.agent_session_title || `Agent: ${(item.text || '').slice(0, 40)}`;
        const indent = Math.min(item.indent || 0, 3);
        contentHtml += `<div class="note-checkbox${doneClass}" data-note-id="${note.id}" data-idx="${i}" style="padding-left:${indent * 16}px">
          <span class="note-check-dot" title="Mark done"></span>
          <span class="note-check-text">${_linkify(item.text)}</span>
          <button class="note-checkbox-agent${agentDoneClass}" data-note-id="${_attrEsc(note.id)}" data-idx="${i}"${agentSessionAttr} data-agent-title="${_attrEsc(agentMenuTitle)}" title="${_attrEsc(agentTitle)}">
            <svg width="10" height="10" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 8V4H8"/><rect x="4" y="8" width="16" height="12" rx="2"/><path d="M2 14h2M20 14h2M15 13v2M9 13v2"/></svg>
          </button>
          <button class="note-checkbox-rm" data-note-id="${note.id}" data-idx="${i}" title="Delete item">
            <svg width="10" height="10" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5" stroke-linecap="round"><line x1="18" y1="6" x2="6" y2="18"/><line x1="6" y1="6" x2="18" y2="18"/></svg>
          </button>
@@ -1889,10 +1897,6 @@ function _renderNotes() {
      ${_hasItems(note) ? `<div class="note-cl-quickadd"><input type="text" class="note-cl-quickadd-input" placeholder="+ Add item" data-note-id="${note.id}" /></div>` : ''}
      ${reminderTagHtml}
      ${noteTags.length ? `<div class="note-card-label">${noteTags.map(t => `<button type="button" class="note-card-label-chip" data-note-label-filter="${_esc(t)}" title="Filter #${_esc(t)}">#${_esc(t)}</button>`).join(' ')}</div>` : ''}
      ${note.agent_session_id ? `<button class="note-agent-tag" data-note-id="${note.id}" data-session-id="${_esc(note.agent_session_id)}" title="Open the agent's chat for this note">
        <svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 8V4H8"/><rect x="4" y="8" width="16" height="12" rx="2"/><path d="M2 14h2M20 14h2M15 13v2M9 13v2"/></svg>
        <span>Agent</span>
      </button>` : ''}
      <div class="note-card-actions">
        <div class="note-card-colors">${colorDots}</div>
        <span style="flex:1"></span>
@@ -2177,7 +2181,7 @@ function _bindCardEvents(body) {
  // Click empty area of checklist preview (not on checkbox/X) — edit
  body.querySelectorAll('.note-checklist-preview').forEach(el => {
    el.addEventListener('click', (e) => {
-      if (e.target.closest('.note-checkbox, .note-checkbox-rm, .note-cl-quickadd, input')) return;
+      if (e.target.closest('.note-checkbox, .note-checkbox-rm, .note-checkbox-agent, .note-cl-quickadd, input')) return;
      e.stopPropagation();
      tapToEditOrSelect(el.closest('.note-card'));
    });
@@ -2203,7 +2207,7 @@ function _bindCardEvents(body) {
  // title / content preview triggered edit, so padding + empty gutters were
  // dead zones that felt broken on mobile.
  if (_isNotesMobileMode() && !_selectMode) {
-    const _INTERACTIVE = 'button, a, input, label, .note-card-color-dot, .note-checkbox, .note-checkbox-rm, .note-cl-quickadd, .note-agent-tag, .note-card-pin, .note-card-corner-trash, .note-card-corner-menu, .note-card-corner-unarchive, .note-card-edit-corner, .note-card-reminder, .note-card-cb';
+    const _INTERACTIVE = 'button, a, input, label, .note-card-color-dot, .note-checkbox, .note-checkbox-rm, .note-checkbox-agent, .note-cl-quickadd, .note-agent-tag, .note-card-pin, .note-card-corner-trash, .note-card-corner-menu, .note-card-corner-unarchive, .note-card-edit-corner, .note-card-reminder, .note-card-cb';
    body.querySelectorAll('.note-card').forEach(card => {
      card.addEventListener('click', (e) => {
        if (e.target.closest(_INTERACTIVE)) return;
@@ -2297,16 +2301,6 @@ function _bindCardEvents(body) {
      _openNoteCornerMenu(btn);
    });
  });
  // Agent tag — opens the chat session the agent ran for this note.
  body.querySelectorAll('.note-agent-tag').forEach(tag => {
    tag.addEventListener('click', (e) => {
      e.preventDefault();
      e.stopPropagation();
      const sid = tag.dataset.sessionId;
      const _sm = window.sessionModule;
      if (sid && _sm && _sm.selectSession) { closePanel(); _sm.selectSession(sid); }
    });
  });
  body.querySelectorAll('.note-card-label-chip').forEach(chip => {
    chip.addEventListener('click', (e) => {
      e.preventDefault();
@@ -2523,6 +2517,18 @@ function _bindCardEvents(body) {
    });
  });
  // Per-item agent solve (hover button next to the X). Scoped to one todo
  // item — uses the note title as context if present, but only the single
  // item's text as the work. Mirrors the per-note _agentSolveNote pattern.
  body.querySelectorAll('.note-checkbox-agent').forEach(btn => {
    btn.addEventListener('click', (e) => {
      e.preventDefault();
      e.stopPropagation();
      if (_selectMode) return;
      _openTodoAgentMenu(btn);
    });
  });
  // Quick-add new checklist item (hover input at bottom of todo cards)
  body.querySelectorAll('.note-cl-quickadd-input').forEach(input => {
    input.addEventListener('click', (e) => e.stopPropagation());
@@ -4342,6 +4348,54 @@ function _openNoteCornerMenu(btn) {
  menu.querySelector('[data-act="agent"]').addEventListener('click', () => { menu.remove(); _agentSolveNote(id); });
 }
 // Attach `menu` to <body> and pin it (position:fixed) next to `btn`:
 // right-aligned to the button and kept 8px inside the viewport horizontally,
 // opening below the button unless there is too little room below and enough
 // above. A capture-phase document click outside the menu removes it.
 function _positionNoteMenu(menu, btn, width = 196) {
  // The menu has to be in the DOM before its offsetHeight can be measured.
  document.body.appendChild(menu);
  const anchor = btn.getBoundingClientRect();
  const left = Math.max(8, Math.min(anchor.right - width, window.innerWidth - width - 8));
  const menuHeight = menu.offsetHeight || 112;
  const roomBelow = window.innerHeight - anchor.bottom;
  const flipAbove = roomBelow < menuHeight + 8 && anchor.top > menuHeight + 8;
  const top = flipAbove ? (anchor.top - menuHeight - 4) : (anchor.bottom + 4);
  menu.style.cssText += `position:fixed;z-index:11000;top:${Math.round(top)}px;left:${Math.round(left)}px;min-width:${width}px;`;
  function dismiss(ev) {
    // Clicks inside the menu are left to the menu's own item handlers.
    if (ev && menu.contains(ev.target)) return;
    menu.remove();
    document.removeEventListener('click', dismiss, true);
  }
  // Registered on the next tick rather than synchronously, so a click event
  // already being dispatched cannot reach this listener.
  setTimeout(() => document.addEventListener('click', dismiss, true), 0);
 }
 // Open the per-item agent menu for a checklist item's agent button.
 // Reads the note id, item index and (optional) agent session id from the
 // button's data-* attributes. Shows "Open" only when a session id is present;
 // the run entry is labelled "Run again" in that case and "Run Agent" otherwise.
 function _openTodoAgentMenu(btn) {
  // Only one corner/agent dropdown at a time.
  document.querySelectorAll('.note-corner-menu-dropdown').forEach(d => d.remove());
  const noteId = btn.dataset.noteId;
  // Explicit radix: data-idx is always a decimal array index.
  const idx = parseInt(btn.dataset.idx, 10);
  const sid = btn.dataset.sessionId || '';
  const menu = document.createElement('div');
  menu.className = 'note-corner-menu-dropdown note-agent-item-menu';
  menu.innerHTML = `
    ${sid ? `<button type="button" class="ncm-item" data-act="open">
      <svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M15 3h6v6"/><path d="M10 14L21 3"/><path d="M18 13v6a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V8a2 2 0 0 1 2-2h6"/></svg>
      <span>Open</span>
    </button>` : ''}
    <button type="button" class="ncm-item" data-act="run">
      <svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 8V4H8"/><rect x="4" y="8" width="16" height="12" rx="2"/><path d="M2 14h2M20 14h2M15 13v2M9 13v2"/></svg>
      <span>${sid ? 'Run again' : 'Run Agent'}</span>
    </button>`;
  _positionNoteMenu(menu, btn);
  // The "open" entry only exists when the item already has a session.
  const openBtn = menu.querySelector('[data-act="open"]');
  if (openBtn) {
    openBtn.addEventListener('click', () => {
      menu.remove();
      const _sm = window.sessionModule;
      if (sid && _sm && _sm.selectSession) { closePanel(); _sm.selectSession(sid); }
    });
  }
  menu.querySelector('[data-act="run"]').addEventListener('click', () => {
    menu.remove();
    _agentSolveTodoItem(noteId, idx);
  });
 }
 // Build the prompt the agent gets from a note: title + body, plus any
 // not-yet-done checklist items.
 function _noteToAgentPrompt(note) {
@@ -4353,7 +4407,7 @@ function _noteToAgentPrompt(note) {
      .forEach(it => parts.push('- ' + it.text.trim()));
  }
  const body = parts.join('\n');
-  return body ? `Help me get this done:\n\n${body}` : '';
+  return body ? `Help me get this done:\n\n${body}\n\nThe source note is read-only. Do not edit, replace, or update it.` : '';
 }
 // Agent-solve: create a chat session server-side, kick off an agent run
@@ -4395,6 +4449,7 @@ async function _agentSolveNote(id) {
    fd.append('message', prompt);
    fd.append('session', sid);
    fd.append('mode', 'agent');
    fd.append('disabled_tools', JSON.stringify(['manage_notes']));
    fetch(`${API_BASE}/api/chat_stream`, { method: 'POST', credentials: 'same-origin', body: fd })
      .then(async (res) => {
        if (!res.ok || !res.body) return;
@@ -4413,6 +4468,86 @@ async function _agentSolveNote(id) {
  }
 }
 // Per-item version of _agentSolveNote. Scoped to a single checklist item;
 // the note title (if any) is included as context, but only this one item's
 // text is the work the agent is asked to do.
 //
 // Flow: look up the default chat endpoint/model, create a session named
 // "Agent: <first 40 chars of the item>", record the session on the item
 // (status 'running') and on the parent note, then start an agent-mode
 // /api/chat_stream request with the manage_notes tool disabled. The stream is
 // drained in the background; when it ends the item is marked
 // 'stream_complete', and when it fails the 'running' status is cleared so
 // the item does not claim to be in progress forever.
 async function _agentSolveTodoItem(noteId, idx) {
  const note = _notes.find(n => n.id === noteId);
  if (!note || !Array.isArray(note.items)) return;
  const item = note.items[idx];
  const itemText = (item && (item.text || '').trim()) || '';
  if (!itemText) {
    uiModule.showToast('Nothing to solve — item is empty');
    return;
  }
  const titleCtx = (note.title || '').trim();
  const prompt = titleCtx
    ? `Context (from note "${titleCtx}").\n\nHelp me with this todo: ${itemText}\n\nThe source note is read-only. Do not edit, replace, or update it.`
    : `Help me with this todo: ${itemText}\n\nThe source note is read-only. Do not edit, replace, or update it.`;
  try {
    const dc = await (await fetch(`${API_BASE}/api/default-chat`, { credentials: 'same-origin' })).json();
    if (!dc.endpoint_url || !dc.model) { uiModule.showError('No default chat model configured'); return; }
    const label = itemText.slice(0, 40);
    const csFd = new FormData();
    csFd.append('name', 'Agent: ' + label);
    csFd.append('endpoint_url', dc.endpoint_url);
    csFd.append('model', dc.model);
    if (dc.endpoint_id) csFd.append('endpoint_id', dc.endpoint_id);
    csFd.append('skip_validation', 'true');
    const csRes = await fetch(`${API_BASE}/api/session`, { method: 'POST', credentials: 'same-origin', body: csFd });
    if (!csRes.ok) { uiModule.showError('Could not create agent session'); return; }
    const sess = await csRes.json();
    const sid = sess.id;
    const sessionTitle = 'Agent: ' + label;
    // Re-resolve the note: _notes may have changed during the awaits above.
    const n = _notes.find(x => x.id === noteId);
    if (n) {
      // Latest started run wins at the note level.
      n.agent_session_id = sid;
      if (Array.isArray(n.items) && n.items[idx]) {
        n.items[idx].agent_session_id = sid;
        n.items[idx].agent_session_title = sessionTitle;
        n.items[idx].agent_status = 'running';
        n.items[idx].agent_stream_completed_at = '';
      }
    }
    _renderNotes();
    _patchNote(noteId, { items: n && Array.isArray(n.items) ? n.items : note.items, agent_session_id: sid }).catch(() => {});

    // Write the final status for THIS run. The item is located by its
    // session id rather than by `idx`: items can be removed or the same item
    // re-run while the stream is open, and a positional write would then
    // mark the wrong item or overwrite the newer run.
    const settleItem = (status, completedAt) => {
      const liveNote = _notes.find(x => x.id === noteId);
      if (!liveNote || !Array.isArray(liveNote.items)) return;
      const liveItem = liveNote.items.find(it => it && it.agent_session_id === sid);
      if (!liveItem) return;
      liveItem.agent_session_title = sessionTitle;
      liveItem.agent_status = status;
      liveItem.agent_stream_completed_at = completedAt;
      _renderNotes();
      // Keep whatever session the note currently points at (a newer run may
      // have replaced it); fall back to this run's id if it is unset.
      _patchNote(noteId, { items: liveNote.items, agent_session_id: liveNote.agent_session_id || sid }).catch(() => {});
    };

    const fd = new FormData();
    fd.append('message', prompt);
    fd.append('session', sid);
    fd.append('mode', 'agent');
    fd.append('disabled_tools', JSON.stringify(['manage_notes']));
    // Fire-and-forget: the stream is drained in the background so the caller
    // returns as soon as the run has been started.
    (async () => {
      let streamed = false;
      try {
        const res = await fetch(`${API_BASE}/api/chat_stream`, { method: 'POST', credentials: 'same-origin', body: fd });
        if (res.ok && res.body) {
          const reader = res.body.getReader();
          // The payload itself is discarded; we only wait for end-of-stream.
          while (true) { const { done } = await reader.read(); if (done) break; }
          streamed = true;
        }
      } catch {
        // Network/read failure: handled below by clearing the running state.
      }
      if (streamed && window.sessionModule && window.sessionModule.markStreamComplete) {
        try { window.sessionModule.markStreamComplete(sid); } catch {}
      }
      if (streamed) settleItem('stream_complete', new Date().toISOString());
      else settleItem('', '');
    })().catch(() => {});
    uiModule.showToast('Agent working on this item — tap the Agent tag when ready');
  } catch (e) {
    uiModule.showError('Agent failed: ' + (e.message || e));
  }
 }
 async function _copyNote(noteId, btnEl) {
  const note = _notes.find(n => n.id === noteId);
  if (!note) return false;
@@ -366,20 +366,13 @@ function _buildPanelHTML() {
    <div class="modal-body research-pane-body" data-no-swipe-dismiss>
      <div class="research-new-job">
        <div style="display:flex;align-items:center;gap:8px;margin-bottom:2px;">
-          <h2 style="margin:0;padding:0;line-height:1;display:inline-flex;align-items:center;gap:6px;"><svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="var(--accent, var(--red))" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="flex-shrink:0;"><path d="M6 18h8"/><path d="M3 22h18"/><path d="M14 22a7 7 0 1 0 0-14h-1"/><path d="M9 14h2"/><path d="M9 12a2 2 0 0 1-2-2V6h4v4a2 2 0 0 1-2 2Z"/><path d="M12 6V3a1 1 0 0 0-1-1H9a1 1 0 0 0-1 1v3"/></svg>Research <span id="research-stats" class="memory-count" style="font-size:0.6em;opacity:0.6;font-weight:normal;position:relative;top:4px;"></span></h2>
+          <h2 style="margin:0;padding:0;line-height:1;display:inline-flex;align-items:center;gap:6px;"><svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="var(--accent, var(--red))" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="flex-shrink:0;"><path d="M6 18h8"/><path d="M3 22h18"/><path d="M14 22a7 7 0 1 0 0-14h-1"/><path d="M9 14h2"/><path d="M9 12a2 2 0 0 1-2-2V6h4v4a2 2 0 0 1-2 2Z"/><path d="M12 6V3a1 1 0 0 0-1-1H9a1 1 0 0 0-1 1v3"/></svg>Research <span id="research-stats" class="memory-count" style="font-size:0.6em;opacity:0.6;font-weight:normal"></span></h2>
        </div>
        <p class="memory-desc doclib-desc" style="margin-top:2px;display:flex;align-items:center;gap:6px;flex-wrap:wrap;">
          <span>Multi-step web research with an LLM-in-the-loop agent</span>
-          <span id="research-no-past-hint" style="display:none;font-size:11px;opacity:0.7;position:relative;top:-4px;">— past runs in <button type="button" class="research-library-link" style="background:none;border:none;padding:0;font:inherit;color:var(--accent, var(--red));cursor:pointer;text-decoration:underline;">Library, Research</button></span>
+          <span id="research-no-past-hint" style="display:none;font:inherit;opacity:1;position:static;">— past runs in <button type="button" class="research-library-link" style="background:none;border:none;padding:0;font:inherit;color:var(--accent, var(--red));cursor:pointer;text-decoration:underline;">Library, Research</button></span>
        </p>
        <textarea id="research-query" class="research-query" placeholder="${_pickResearchHint()}" rows="4"></textarea>
        <div class="research-category-row" id="research-category-row">
          <button class="research-cat active" data-cat="" title="LLM auto-detects the best format">Auto</button>
          <button class="research-cat" data-cat="product">Product</button>
          <button class="research-cat" data-cat="comparison">Compare</button>
          <button class="research-cat" data-cat="howto">How-to</button>
          <button class="research-cat" data-cat="factcheck">Fact-check</button>
        </div>
        <button id="research-settings-toggle" class="research-settings-toggle${chevronCls}">
          <svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align:-2px;margin-right:4px;opacity:0.85;flex-shrink:0;"><circle cx="12" cy="12" r="3"/><path d="M19.4 15a1.65 1.65 0 0 0 .33 1.82l.06.06a2 2 0 0 1 0 2.83 2 2 0 0 1-2.83 0l-.06-.06a1.65 1.65 0 0 0-1.82-.33 1.65 1.65 0 0 0-1 1.51V21a2 2 0 0 1-2 2 2 2 0 0 1-2-2v-.09A1.65 1.65 0 0 0 9 19.4a1.65 1.65 0 0 0-1.82.33l-.06.06a2 2 0 0 1-2.83 0 2 2 0 0 1 0-2.83l.06-.06a1.65 1.65 0 0 0 .33-1.82 1.65 1.65 0 0 0-1.51-1H3a2 2 0 0 1-2-2 2 2 0 0 1 2-2h.09A1.65 1.65 0 0 0 4.6 9a1.65 1.65 0 0 0-.33-1.82l-.06-.06a2 2 0 0 1 0-2.83 2 2 0 0 1 2.83 0l.06.06a1.65 1.65 0 0 0 1.82.33H9a1.65 1.65 0 0 0 1-1.51V3a2 2 0 0 1 2-2 2 2 0 0 1 2 2v.09a1.65 1.65 0 0 0 1 1.51 1.65 1.65 0 0 0 1.82-.33l.06-.06a2 2 0 0 1 2.83 0 2 2 0 0 1 0 2.83l-.06.06a1.65 1.65 0 0 0-.33 1.82V9a1.65 1.65 0 0 0 1.51 1H21a2 2 0 0 1 2 2 2 2 0 0 1-2 2h-.09a1.65 1.65 0 0 0-1.51 1z"/></svg>Settings<span class="research-settings-chevron">${_chevronIcon}</span>
        </button>
@@ -787,6 +780,21 @@ function _renderJobs() {
      +   '<span class="research-section-dot' + (dotPulse ? ' pulsing' : '') + '" style="background:' + dotColor + ';"></span>'
      +   '<svg class="research-section-chevron" width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5" stroke-linecap="round"><polyline points="6 9 12 15 18 9"/></svg>'
      + '</span>';
    if (key === 'past') {
      const hint = document.createElement('span');
      hint.className = 'research-library-hint';
      hint.innerHTML = '<span>Multi-step web research with an LLM-in-the-loop agent</span> <button type="button" class="research-library-link">Library, Research</button>';
      hint.querySelector('.research-library-link').addEventListener('click', (e) => {
        e.stopPropagation();
        // Close the research panel first so the Library opens ABOVE it on mobile
        // (otherwise it stacks under the full-screen panel).
        closePanel();
        if (window.documentModule && window.documentModule.openLibrary) {
          window.documentModule.openLibrary({ tab: 'research' });
        }
      });
      header.appendChild(hint);
    }
    header.addEventListener('click', () => {
      const nowCollapsed = sec.classList.toggle('collapsed');
      if (nowCollapsed) _collapsedSections.add(key); else _collapsedSections.delete(key);
@@ -803,27 +811,6 @@ function _renderJobs() {
    });
    const body = document.createElement('div');
    body.className = 'research-section-body';
    // Past Research header: link goes INLINE next to the title instead
    // of on a second row. Append it to the title span as a small chip.
    if (key === 'past') {
      const titleEl = header.querySelector('.research-section-title');
      if (titleEl) {
        const hint = document.createElement('span');
        hint.className = 'research-library-hint research-library-hint-inline';
        hint.style.cssText = 'margin-left:8px;font-size:10.5px;opacity:0.65;font-weight:normal;';
        hint.innerHTML = '— all in <button type="button" class="research-library-link" style="background:none;border:none;padding:0;font:inherit;color:var(--accent, var(--red));cursor:pointer;text-decoration:underline;">Library, Research</button>';
        hint.querySelector('.research-library-link').addEventListener('click', (e) => {
          e.stopPropagation();
          // Close the research panel first so the Library opens ABOVE it on mobile
          // (otherwise it stacks under the full-screen panel).
          closePanel();
          if (window.documentModule && window.documentModule.openLibrary) {
            window.documentModule.openLibrary({ tab: 'research' });
          }
        });
        titleEl.appendChild(hint);
      }
    }
    arr.forEach(j => body.appendChild(_buildJobCard(j)));
    sec.appendChild(header);
    sec.appendChild(body);
@@ -1014,9 +1001,9 @@ function _buildJobCard(job) {
      </div>
      ${failNote}
      <div class="research-job-actions">
        <button class="research-job-action" data-action="copy" title="Copy report to clipboard">${_copyIcon}</button>
        <button class="research-job-action" data-action="chat" title="Open follow-up chat with this research as context">${_chatIcon} Discuss</button>
        <button class="research-job-action research-job-action-report" data-action="report" title="Visual report">${_externalIcon} Visual Report</button>
        <button class="research-job-action" data-action="chat" title="Open follow-up chat with this research as context">${_chatIcon} Discuss</button>
        <button class="research-job-action research-job-action-dim" data-action="copy" title="Copy report to clipboard">${_copyIcon}</button>
        <button class="research-job-action research-job-action-dim" data-action="dismiss" title="Clear from list">${_cancelIcon}</button>
        <button class="research-job-action research-job-action-dim" data-action="delete" title="Delete from disk">${_trashIcon} Delete</button>
      </div>
@@ -103,7 +103,6 @@ export function initSidebarLayout(Storage, opts) {
  });
  // Hamburger cycles: full sidebar → mini → off → full
  // Shift-click swaps sidebar side
  let _userToggledSidebar = false;
  let _wasAutoCollapsed = false;
@@ -122,8 +121,7 @@ export function initSidebarLayout(Storage, opts) {
    if (window.innerWidth < 768 && cc && cc.classList.contains('compare-active')) return;
    _userToggledSidebar = true;
    // Optionally place the sidebar on a specific edge (the swipe gesture passes
-    // the direction). Persist it + re-anchor the doc panel, same as a
+    // the direction). Persist it + re-anchor the doc panel.
    // shift-click on the hamburger.
    if (side === 'left' || side === 'right') {
      const wantRight = side === 'right';
      if (sidebar.classList.contains('right-side') !== wantRight) {
@@ -143,13 +141,6 @@ export function initSidebarLayout(Storage, opts) {
    hamburgerBtn.addEventListener('click', (e) => {
      e.stopPropagation();
      const sidebar = document.getElementById('sidebar');
      if (e.shiftKey) {
        sidebar.classList.toggle('right-side');
        Storage.set(Storage.KEYS.SIDEBAR_SIDE, sidebar.classList.contains('right-side') ? 'right' : 'left');
        syncRailSide();
        if (documentModule && documentModule.swapSide) documentModule.swapSide();
        return;
      }
      _userToggledSidebar = true;
      const isSidebarVisible = !sidebar.classList.contains('hidden');
@@ -17,9 +17,16 @@ let _tasksFetched = false;   // first-fetch sentinel — `false` → show loadin
 let _escHandler = null;
 let _viewingRuns = null; // task id when viewing run history
 let _clockInterval = null;
 let _taskFailurePending = false;
 const DAYS_OF_WEEK = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'];
 /**
  * Remember whether a task failure is still waiting for the user's attention
  * and mirror that state onto the toolbar and rail task buttons through the
  * `task-failure-pending` class. Buttons that are not in the DOM are skipped.
  */
 function _setTaskFailurePending(active) {
  const pending = Boolean(active);
  _taskFailurePending = pending;
  for (const id of ['tool-tasks-btn', 'rail-tasks']) {
    const btn = document.getElementById(id);
    if (btn) btn.classList.toggle('task-failure-pending', pending);
  }
 }
 // ---- API ----
 async function _fetchTasks() {
@@ -2238,6 +2245,9 @@ function _renderActivityEntry(entry) {
    status = _classifyResult(entry.result);
  }
  const statusDot = `<span class="task-log-status task-log-status-${status}" title="${status}"></span>`;
  const failedTag = status === 'error'
    ? '<span class="task-log-failed-tag">(failed)</span>'
    : '';
  // Render the result through markdown so code blocks, lists, links look right.
  let resultHtml;
  const _isRunning = entry.status === 'running' || entry.status === 'queued';
@@ -2361,7 +2371,7 @@ function _renderActivityEntry(entry) {
      <div class="task-log-row-head">
        ${statusDot}
        <span class="task-log-task-icon">${_taskIcon({ action: entry.action, task_type: entry.kind })}</span>
-        <span class="task-log-name">${_escHtml(entry.taskName)}</span>${_taskAiMark(entry)}
+        <span class="task-log-name">${_escHtml(entry.taskName)}</span>${failedTag}${_taskAiMark(entry)}
        ${repeatBadge}
        <span style="flex:1"></span>
        ${rightHtml}
@@ -2502,8 +2512,11 @@ function _renderMainView() {
 export function openTasks(focusId, opts) {
  const o = opts || {};
  const openActivityForFailure = _taskFailurePending && !focusId && o.filter === undefined;
  _setTaskFailurePending(false);
  if (_open) {
    // Already open — just focus the requested task / apply filter.
    if (openActivityForFailure) _switchTab('activity');
    if (o.filter !== undefined) { _taskFilter = o.filter; _renderList(); }
    if (focusId) _focusTask(focusId);
    return;
@@ -2610,7 +2623,7 @@ export function openTasks(focusId, opts) {
  // of an empty modal-body that fills in after the fetch resolves — that delay
  // was visible as a "flicker" right after opening.
  _activeTab = 'tasks';
-  _switchTab('tasks');
+  _switchTab(openActivityForFailure ? 'activity' : 'tasks');
  _fetchTasks().then(() => {
    // Re-render so the list swaps the Loading row for real cards.
    _renderList();
@@ -2704,7 +2717,13 @@ async function _pollTaskNotifications() {
      const msg = `Task ${ok ? 'finished' : 'failed'}: ${n.task_name}`;
      if (!uiModule) continue;
      if (ok) uiModule.showToast(msg, { duration: 5000 });
-      else uiModule.showError(msg);
+      else {
        _setTaskFailurePending(true);
        uiModule.showError(msg);
        if (_open && document.querySelector('.tasks-tab.active[data-tab="activity"]')) {
          _renderActivityView();
        }
      }
    }
  } catch (e) {
    // Silently ignore — server may be unreachable
@@ -26,7 +26,7 @@ export const THEMES = {
  gpt:        { bg:'#212121', fg:'#ececec', panel:'#171717', border:'#424242', red:'#949494',
                advanced: { sendBtnBg: '#949494', sendBtnHover: '#7f7f7f',
                            userBubbleBg: '#2f2f2f', aiBubbleBg: '#171717',
-                            inputBg: '#2f2f2f' } },
+                            inputBg: '#2f2f2f', brandColor: '#ffffff', brandMixTo: '#ffffff' } },
  claude:     { bg:'#262624', fg:'#f5f4f0', panel:'#30302e', border:'#4a4a47', red:'#c6613f' },
  cute:       { bg:'#fff0f5', fg:'#d4608a', panel:'#fff8fa', border:'#f0c0d0', red:'#ff6b9d' },
 };
@@ -184,6 +184,7 @@ const ADV_KEYS = [
  { key: 'bubbleBorder',       css: '--bubble-border',     label: 'Border Chat Bubble', group: 'Chat Bubbles' },
  { key: 'sidebarBg',          css: '--sidebar-bg',        label: 'Sidebar Bg',       group: 'Sidebar' },
  { key: 'brandColor',         css: '--brand-color',       label: 'Odysseus Logo',    group: 'Sidebar' },
  { key: 'brandMixTo',         css: '--brand-mix-to',      label: 'Logo Gradient End', group: 'Sidebar' },
  { key: 'hamburgerColor',     css: '--hamburger-color',   label: 'Hamburger Menu',   group: 'Sidebar' },
  { key: 'inputBg',            css: '--input-bg',          label: 'Input Bg',         group: 'Chat Input / Prompt Area' },
  { key: 'inputBorder',        css: '--input-border',      label: 'Input Border',     group: 'Chat Input / Prompt Area' },
@@ -203,6 +204,7 @@ function computeAdvancedDefaults(colors) {
    bubbleBorder: colors.border,
    sidebarBg: colors.panel,
    brandColor: red,
    brandMixTo: colors.fg,
    hamburgerColor: colors.fg,
    inputBg: colors.panel,
    inputBorder: colors.border,
@@ -43,7 +43,8 @@ def test_background_session_sort_uses_owner_task_endpoint():
 def test_scheduler_fallbacks_and_research_headers_are_owner_scoped():
    src = _src("src/task_scheduler.py")
-    assert "resolve_utility_fallback_candidates(owner=task.owner or None)" in src
+    assert "resolve_task_candidates(" in src
    assert "owner=task.owner or None" in src
    assert 'resolve_endpoint(\n                    "research",' in src
    assert "owner=task.owner or None" in src
    assert "headers_from_resolver = False" in src
@@ -51,23 +51,19 @@ class _Db:
        self.closed = True
-def _resolver_spy(monkeypatch, utility_result=("", "", {}), default_result=("http://llm", "model", {})):
+def _resolver_spy(monkeypatch, candidates=None):
-    from src import endpoint_resolver
+    from src import task_endpoint
    calls = []
    fallback_calls = []
-    def fake_resolve(kind, *args, **kwargs):
+    def fake_candidates(*args, **kwargs):
-        calls.append((kind, kwargs.get("owner")))
+        calls.append(kwargs.get("owner"))
-        return utility_result if kind == "utility" else default_result
+        if candidates is None:
            return [("http://llm", "model", {})]
        return list(candidates)
-    def fake_fallbacks(*args, **kwargs):
+    monkeypatch.setattr(task_endpoint, "resolve_task_candidates", fake_candidates)
-        fallback_calls.append(kwargs.get("owner"))
+    return calls
        return []
    monkeypatch.setattr(endpoint_resolver, "resolve_endpoint", fake_resolve)
    monkeypatch.setattr(endpoint_resolver, "resolve_utility_fallback_candidates", fake_fallbacks)
    return calls, fallback_calls
@pytest.mark.asyncio
@@ -88,7 +84,7 @@ async def test_classify_events_resolves_llm_for_task_owner(monkeypatch):
        location="",
    )
    db = _Db({FakeCalendarEvent: [event]})
-    calls, _fallback_calls = _resolver_spy(monkeypatch, utility_result=("http://llm", "model", {}))
+    calls = _resolver_spy(monkeypatch)
    monkeypatch.setattr(database, "CalendarEvent", FakeCalendarEvent)
    monkeypatch.setattr(database, "SessionLocal", lambda: db)
@@ -97,7 +93,7 @@ async def test_classify_events_resolves_llm_for_task_owner(monkeypatch):
    assert ok is True
    assert "Scanned 1 upcoming event" in message
-    assert calls == [("utility", "alice")]
+    assert calls == ["alice"]
    assert db.closed is True
@@ -122,7 +118,7 @@ async def test_learn_sender_signatures_resolves_llm_for_task_owner(monkeypatch):
        def logout(self):
            return None
-    calls, _fallback_calls = _resolver_spy(monkeypatch, utility_result=("", "", {}), default_result=("", "", {}))
+    calls = _resolver_spy(monkeypatch, candidates=[])
    imap_owners = []
    def fake_imap_connect(_account_id=None, owner=""):
@@ -135,14 +131,14 @@ async def test_learn_sender_signatures_resolves_llm_for_task_owner(monkeypatch):
    assert ok is False
    assert message == "No LLM endpoint available"
-    assert calls == [("utility", "alice"), ("default", "alice")]
+    assert calls == ["alice"]
    assert imap_owners == ["alice"]
@pytest.mark.asyncio
 async def test_learn_sender_signatures_writes_owner_scoped_cache(monkeypatch, tmp_path):
    from routes import email_helpers
-    from src import endpoint_resolver, llm_core
+    from src import llm_core, task_endpoint
    from src.builtin_actions import action_learn_sender_signatures
    db_path = tmp_path / "scheduled_emails.db"
@@ -205,15 +201,15 @@ async def test_learn_sender_signatures_writes_owner_scoped_cache(monkeypatch, tm
    monkeypatch.setattr(email_helpers, "_imap_connect", fake_imap_connect)
    monkeypatch.setattr(
-        endpoint_resolver,
+        task_endpoint,
-        "resolve_endpoint",
+        "resolve_task_candidates",
-        lambda kind, *args, **kwargs: ("http://llm", "alice-model", {}),
+        lambda *args, **kwargs: [("http://llm", "alice-model", {})],
    )
-    async def fake_llm_call_async(**_kwargs):
+    async def fake_llm_call_async(_candidates, **_kwargs):
        return "Writer Example\nExample Co.\nwriter@example.com"
-    monkeypatch.setattr(llm_core, "llm_call_async", fake_llm_call_async)
+    monkeypatch.setattr(llm_core, "llm_call_async_with_fallback", fake_llm_call_async)
    message, ok = await action_learn_sender_signatures("alice")
@@ -253,7 +249,7 @@ async def test_check_email_urgency_resolves_llm_candidates_for_task_owner(monkey
        from_address = _Column()
    db = _Db({FakeEmailAccount: []})
-    calls, fallback_calls = _resolver_spy(monkeypatch, utility_result=("http://llm", "model", {}))
+    calls = _resolver_spy(monkeypatch)
    monkeypatch.chdir(tmp_path)
    monkeypatch.setattr(database, "EmailAccount", FakeEmailAccount)
@@ -262,6 +258,5 @@ async def test_check_email_urgency_resolves_llm_candidates_for_task_owner(monkey
    with pytest.raises(TaskNoop, match="no email accounts configured"):
        await action_check_email_urgency("alice")
-    assert calls == [("utility", "alice")]
+    assert calls == ["alice"]
    assert fallback_calls == ["alice"]
    assert db.closed is True
@@ -29,8 +29,8 @@ def _read_memories(data_dir):
@pytest.mark.asyncio
 async def test_consolidate_memory_empty_owner_treats_each_owner_separately(monkeypatch, tmp_path):
    from src import constants
    from src import endpoint_resolver
    from src import llm_core
    from src import task_endpoint
    action_consolidate_memory = _import_consolidate_action()
    long_alice_text = "Alice private project context. " + ("A" * 2200)
@@ -44,11 +44,15 @@ async def test_consolidate_memory_empty_owner_treats_each_owner_separately(monke
        ],
    )
    monkeypatch.setattr(constants, "DATA_DIR", str(data_dir))
-    monkeypatch.setattr(endpoint_resolver, "resolve_endpoint", lambda *args, **kwargs: ("http://llm", "model", {}))
+    monkeypatch.setattr(
        task_endpoint,
        "resolve_task_candidates",
        lambda *args, **kwargs: [("http://llm", "model", {})],
    )
    prompts = []
-    async def fake_llm_call_async(**kwargs):
+    async def fake_llm_call_async(_candidates, **kwargs):
        prompt = kwargs["messages"][0]["content"]
        prompts.append(prompt)
        if "alice-long" in prompt:
@@ -71,7 +75,7 @@ async def test_consolidate_memory_empty_owner_treats_each_owner_separately(monke
            }
        )
-    monkeypatch.setattr(llm_core, "llm_call_async", fake_llm_call_async)
+    monkeypatch.setattr(llm_core, "llm_call_async_with_fallback", fake_llm_call_async)
    message, ok = await action_consolidate_memory("")
@@ -29,24 +29,24 @@ class _FakeMM:
 def test_omitted_memory_survives_only_explicit_drop(monkeypatch):
    import src.memory
    import src.endpoint_resolver
    import src.llm_core
    import src.task_endpoint
    _FakeMM.saved = None
    monkeypatch.setattr(src.memory, "MemoryManager", _FakeMM)
    monkeypatch.setattr(
-        src.endpoint_resolver, "resolve_endpoint",
+        src.task_endpoint, "resolve_task_candidates",
-        lambda kind, owner=None: ("http://x/v1", "model", {}),
+        lambda owner=None: [("http://x/v1", "model", {})],
    )
-    async def fake_llm(**kwargs):
+    async def fake_llm(_candidates, **kwargs):
        # Model keeps 'a', drops 'b', and OMITS 'c' entirely.
        return json.dumps({
            "keep": [{"id": "a", "text": "Likes dark roast coffee", "category": "preference"}],
            "drop": [{"id": "b", "reason": "duplicate of a"}],
        })
-    monkeypatch.setattr(src.llm_core, "llm_call_async", fake_llm)
+    monkeypatch.setattr(src.llm_core, "llm_call_async_with_fallback", fake_llm)
    msg, ok = asyncio.run(ba.action_consolidate_memory("alice"))
@@ -16,6 +16,7 @@ from pathlib import Path
 SRC = Path(__file__).resolve().parent.parent / "static/js/cookbook.js"
 SERVE_SRC = Path(__file__).resolve().parent.parent / "static/js/cookbookServe.js"
 ROUTES_SRC = Path(__file__).resolve().parent.parent / "routes/cookbook_routes.py"
 def test_cpu_only_drops_gpu_only_flags():
@@ -54,3 +55,32 @@ def test_windows_diffusers_uses_python_not_python3():
    assert "const diffusersPy = _isWindows() ? 'python' : _py3Bin;" in text
    assert "cmd += `${diffusersPy} scripts/diffusion_server.py" in text
    assert "cmd += `python3 scripts/diffusion_server.py" not in text
 def test_vllm_blank_swap_omits_swap_space_flag():
    """The vLLM command builder only appends ``--swap-space`` for a real value.

    Verifies that the source read from ``SRC`` normalises the swap field,
    recognises the disabling values, and guards the flag append on both.
    """
    cookbook_js = SRC.read_text(encoding="utf-8")
    expected_snippets = (
        "const _swapRaw = (f.swap ?? '').toString().trim().toLowerCase();",
        "['0', 'off', 'none', 'false'].includes(_swapRaw)",
        "if (_swapRaw && !['0', 'off', 'none', 'false'].includes(_swapRaw)) cmd += ` --swap-space ${_swapRaw}`;",
    )
    for snippet in expected_snippets:
        assert snippet in cookbook_js
 def test_serve_preflight_uses_selected_server_not_stale_env_host():
    """Serve preflight code must derive its hosts from ``launchTarget``.

    Checks the source read from ``SERVE_SRC`` for the target-selection helper
    and for each host lookup being taken from ``launchTarget.host``.
    """
    serve_js = SERVE_SRC.read_text(encoding="utf-8")
    expected_snippets = (
        "function _selectedServeTarget(panel) {",
        "const _hostStr = launchTarget.host || '';",
        "(t.remoteHost || '') === _hostStr",
        "const _probeHost = (launchTarget.host || '').trim();",
        "const _portHost = (launchTarget.host || '').trim();",
    )
    for snippet in expected_snippets:
        assert snippet in serve_js
 def test_vllm_route_strips_swap_space_when_runtime_rejects_it():
    """The cookbook route probes vLLM help output before passing the swap flag.

    Checks the source read from ``ROUTES_SRC`` for both user-facing messages,
    the help-command variable, the help probe, and the final serve invocation.
    """
    routes_py = ROUTES_SRC.read_text(encoding="utf-8")
    expected_snippets = (
        "Setting vLLM --swap-space 0 so the runtime does not reserve CPU swap per GPU.",
        "vLLM serve does not expose --swap-space; removing the flag and patching the runtime default to 0.",
        "ODYSSEUS_VLLM_HELP_CMD",
        "print(shlex.join(parts[:serve_i + 1] + [\"--help\"]))",
        "eval \"$ODYSSEUS_VLLM_HELP_CMD\" 2>&1 | grep -q -- \"--swap-space\"",
        "eval \"$ODYSSEUS_SERVE_CMD\"",
    )
    for snippet in expected_snippets:
        assert snippet in routes_py
@@ -348,7 +348,7 @@ def test_serve_pip_install_normalizes_llama_cpp_alias_and_adds_wheel_index():
    src = (pathlib.Path(__file__).resolve().parent.parent
        / "routes" / "cookbook_routes.py").read_text(encoding="utf-8")
-    assert "re.sub(r\"(?<![A-Za-z0-9_.-])llama_cpp(?![A-Za-z0-9_.-])\", \"llama-cpp-python[server]\", req.cmd)" in src
+    assert "re.sub(r\"(?<![A-Za-z0-9_.\\-/])llama_cpp(?![A-Za-z0-9_.\\-/])\", \"llama-cpp-python[server]\", req.cmd)" in src
    assert "if \"llama-cpp-python\" in req.cmd and \"--extra-index-url\" not in req.cmd:" in src
    assert "https://abetlen.github.io/llama-cpp-python/whl/cpu" in src
@@ -626,7 +626,7 @@ def test_llama_cpp_linux_bootstrap_prefers_rocm_before_cuda():
    script = "\n".join(runner_lines)
    assert "mkdir -p ~/bin" in script
-    assert script.index("mkdir -p ~/bin") < script.index("cd ~/llama.cpp && rm -rf build")
+    assert script.index("mkdir -p ~/bin") < script.index("cd ~/llama.cpp")
    assert 'command -v hipconfig &>/dev/null || [ -d /opt/rocm ] || [ -n "$ROCM_PATH" ] || [ -n "$HIP_PATH" ]' in script
    assert 'cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_HIP=ON' in script
    assert 'cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON' in script
@@ -676,7 +676,7 @@ def test_llama_cpp_linux_bootstrap_nvcc_without_cudart_warns_and_falls_back():
    # outer else that handles no-GPU-toolchain). Verify it appears at least once
    # before the outer "no HIP/CUDA toolchain" warning.
    cpu_cmake = 'cmake -B build -DCMAKE_BUILD_TYPE=Release &&'
-    no_toolchain_warn = 'WARNING: no HIP/CUDA toolchain found'
+    no_toolchain_warn = 'WARNING: no HIP/CUDA/Vulkan toolchain found'
    assert cpu_cmake in script
    assert script.index(cpu_cmake) < script.index(no_toolchain_warn)
@@ -693,8 +693,8 @@ def test_llama_cpp_linux_bootstrap_keeps_cpu_fallback_when_no_gpu_toolchain():
    _append_llama_cpp_linux_accel_build_lines(runner_lines)
    script = "\n".join(runner_lines)
-    assert 'WARNING: no HIP/CUDA toolchain found — building llama-server for CPU only.' in script
+    assert 'WARNING: no HIP/CUDA/Vulkan toolchain found — building llama-server for CPU only.' in script
-    assert 'Install ROCm for AMD GPUs or vLLM/CUDA tooling for NVIDIA' in script
+    assert 'Install Vulkan (libvulkan-dev) / ROCm for AMD GPUs or CUDA tooling for NVIDIA' in script
 def test_llama_cpp_rebuild_cmd_clears_cached_build_paths():
@@ -50,14 +50,14 @@ def test_serve_launch_preflights_use_selected_target_and_port():
    assert "if (launchTarget.port) _probeParams.set('ssh_port', launchTarget.port);" in SERVE
    assert "const _portHost = (launchTarget.host || '').trim();" in SERVE
    assert "StrictHostKeyChecking=no ${_sshPrefix(launchTarget.port)}${_portHost}" in SERVE
-    assert "let serveHost = launchTarget.host || '';" in SERVE
+    assert "const serveHost = launchTarget.host || '';" in SERVE
    assert SERVE.index(launch_target) < SERVE.index("const _runningMod = await import('./cookbookRunning.js');")
 def test_running_tab_resolves_profile_key_not_first_host():
-    assert "_serverByVal(_envState.remoteServerKey || _tHost)" in RUNNING
+    assert "_serverByVal(_targetKey)" in RUNNING
    assert "_serverByVal(_envState.remoteServerKey || _host)" in RUNNING
-    assert "_serverByVal(_envState.remoteServerKey || host)" in RUNNING
+    assert "_serverByVal(savedKey)" in RUNNING
    assert "_serverByVal = shared._serverByVal;" in RUNNING
    assert "_selectedServer = shared._selectedServer;" in RUNNING
@@ -126,6 +126,27 @@ def test_plain_reply_copy_text_is_unchanged(node_available):
    assert out["content"] == raw
 def test_minimax_namespaced_thinking_is_extracted(node_available):
    """A ``<mm:think>`` block is split out of the reply into thinkingBlocks."""
    reasoning = 'The user said "idk" - just casual.'
    reply = "Haha fair. Well, I'm here whenever you figure it out."
    result = _extract_thinking_blocks(f"<mm:think>{reasoning}</mm:think>{reply}")
    assert result["thinkingBlocks"] == [reasoning]
    visible = result["content"]
    assert visible == reply
    # The namespaced tag itself must not leak into the visible text.
    assert "mm:think" not in visible
 def test_minimax_orphan_closing_tag_drops_leaked_reasoning(node_available):
    """A stray ``</mm:think>`` yields no thinking blocks and is stripped."""
    greeting = "Hi! What can I do for you?"
    result = _extract_thinking_blocks("</mm:think>" + greeting)
    assert result["thinkingBlocks"] == []
    visible = result["content"]
    assert visible == greeting
    assert "mm:think" not in visible
 def test_thinking_only_message_yields_empty_content(node_available):
    # The copy handler falls back to the raw text in this case so the button
    # still copies something for turns interrupted mid-thinking.
@@ -59,8 +59,8 @@ def test_docker_entrypoint_does_not_resolve_root_commands_from_app_local_path():
    path_export = script.index('export PATH="/app/.local/bin:$PATH"')
    gosu_capture = script.index('GOSU_BIN="$(command -v gosu)"')
    python_capture = script.index('PYTHON_BIN="$(command -v python)"')
-    setup_call = script.index('"$GOSU_BIN" "$PUID:$PGID" "$PYTHON_BIN" /app/setup.py')
+    setup_call = script.index('"$GOSU_BIN" "$ODY_USER" "$PYTHON_BIN" /app/setup.py')
-    final_exec = script.index('exec "$GOSU_BIN" "$PUID:$PGID" "$@"')
+    final_exec = script.index('exec "$GOSU_BIN" "$ODY_USER" "$@"')
    assert gosu_capture < path_export < setup_call
    assert python_capture < path_export < setup_call
@@ -221,6 +221,60 @@ def test_skip_fenced_still_recovers_xml_invoke_markup():
    assert "latest python release" in blocks[0].content
 def test_stepfun_native_tool_tokens_are_executed_even_when_fenced_fallback_is_skipped():
    """Native tool-call tokens still parse when ``skip_fenced=True``.

    The wrapped call must produce exactly one ``web_search`` block carrying
    the query, and stripping the same text must leave an empty string.
    """
    token_stream = "".join([
        "<｜tool▁calls▁begin｜>",
        "<｜tool▁call▁begin｜>web_search<｜tool▁sep｜>",
        '{"query":"Sweden news today"}',
        "<｜tool▁call▁end｜>",
        "<｜tool▁calls▁end｜>",
    ])
    parsed = parse_tool_blocks(token_stream, skip_fenced=True)
    assert len(parsed) == 1
    call = parsed[0]
    assert call.tool_type == "web_search"
    assert "Sweden news today" in call.content
    remainder = strip_tool_blocks(token_stream, skip_fenced=True)
    assert remainder == ""
 def test_stepfun_native_tool_tokens_accept_plain_web_query():
    """A native tool call whose payload is plain text (not JSON) is parsed."""
    token_stream = "".join([
        "<｜tool▁call▁begin｜>web_search<｜tool▁sep｜>",
        "Sweden news today",
        "<｜tool▁call▁end｜>",
    ])
    parsed = parse_tool_blocks(token_stream, skip_fenced=True)
    assert len(parsed) == 1
    call = parsed[0]
    assert call.tool_type == "web_search"
    assert "Sweden news today" in call.content
 def test_skip_fenced_still_recovers_direct_xml_tool_markup():
    """Direct ``<tool_call><web_search>`` markup parses with ``skip_fenced=True``.

    The prose before the markup must survive stripping; the markup must not.
    """
    prose = "I'll search now."
    query = "News in Sweden today 2026-06-22"
    message = f"{prose}\n<tool_call><web_search>{query}</web_search></tool_call>"
    parsed = parse_tool_blocks(message, skip_fenced=True)
    assert len(parsed) == 1
    call = parsed[0]
    assert call.tool_type == "web_search"
    assert query in call.content
    assert strip_tool_blocks(message, skip_fenced=True) == prose
 def test_skip_fenced_recovers_direct_xml_tool_markup_with_unclosed_wrapper():
    """An unclosed ``<tool_call>`` wrapper around a tool tag is still parsed.

    The text has no closing ``</tool_call>``; parsing must still yield one
    ``web_search`` block and stripping must leave only the leading prose.
    """
    prose = "I'll search now."
    query = "Sweden news today 2026-06-22"
    message = "\n".join([prose, "<tool_call>", "<web_search>", query, "</web_search>"])
    parsed = parse_tool_blocks(message, skip_fenced=True)
    assert len(parsed) == 1
    call = parsed[0]
    assert call.tool_type == "web_search"
    assert query in call.content
    assert strip_tool_blocks(message, skip_fenced=True) == prose
 def test_skip_fenced_still_recovers_dsml_markup():
    dsml = (
        "Let me search for that.\n"
@@ -124,9 +124,9 @@ def test_nvidia_odysseus_adds_only_overlay(base):
        {"driver": "nvidia", "count": "all", "capabilities": ["gpu"]}
    ]
-    # No AMD-only keys leaked in.
+    # Base Docker socket group is preserved; no AMD-only keys leaked in.
    assert "devices" not in svc
-    assert "group_add" not in svc
+    assert svc["group_add"] == base_svc["group_add"]
 def test_amd_odysseus_adds_only_overlay(base):
@@ -137,11 +137,10 @@ def test_amd_odysseus_adds_only_overlay(base):
    # Environment is unchanged from base for AMD.
    assert svc["environment"] == base_svc["environment"]
-    # devices and group_add are new and match the overlay exactly.
+    # devices are new; group_add preserves the base Docker group and appends AMD groups.
    assert "devices" not in base_svc
    assert "group_add" not in base_svc
    assert svc["devices"] == ["/dev/kfd", "/dev/dri"]
-    assert svc["group_add"] == ["video", "${RENDER_GID:-render}"]
+    assert svc["group_add"] == base_svc["group_add"] + ["video", "${RENDER_GID:-render}"]
    # No NVIDIA-only keys leaked in.
    assert "deploy" not in svc
@@ -18,7 +18,7 @@ def _compute_is_api_model(model: str, endpoint_url: str, endpoint_supports=None)
    model_supports_tools = any(kw in model_lc for kw in (
        "gpt-4", "gpt-5", "gpt-o", "claude", "gemini", "gemma",
        "qwen3", "qwen2.5", "mixtral", "mistral", "llama-3.1", "llama-3.2",
-        "llama-3.3", "llama-4",
+        "llama-3.3", "llama-4", "llama3.1", "llama3.2", "llama3.3", "llama4",
        "minimax", "kimi", "yi-", "phi-3", "phi-4", "command-r",
        "glm-4", "internlm", "hermes",
        "deepseek-v", "deepseek-chat",
@@ -19,7 +19,12 @@ from pathlib import Path
 import pytest
 from fastapi import APIRouter
 from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.pool import NullPool
 import core.database as cdb
 from core.database import GalleryImage
 from src.upload_handler import count_recent_uploads, UploadHandler
 import routes.upload_routes as up
@@ -82,6 +87,10 @@ def _files(n):
    return [types.SimpleNamespace(filename=f"f{i}.txt") for i in range(n)]
 def _image_upload(name="photo.png", content=b"not really png but enough for route metadata"):
    return types.SimpleNamespace(filename=name, file=io.BytesIO(content))
@pytest.fixture(autouse=True)
 def _reset_router(monkeypatch):
    # Module-level router accumulates routes across setup calls; reset it.
@@ -163,3 +172,64 @@ def test_six_file_batch_is_not_rate_limited(tmp_path):
        assert meta and meta.get("id")
        saved += 1
    assert saved == 6
 async def test_chat_image_upload_is_added_to_gallery(tmp_path, monkeypatch):
    """Upload an image through the route and check the gallery row it produces.
    A per-test SQLite database and image directory under ``tmp_path`` are
    patched into the upload routes module before the endpoint is called.
    """
    # Throwaway SQLite database; NullPool means no connection is kept pooled.
    db_engine = create_engine(
        f"sqlite:///{tmp_path / 'gallery.db'}",
        connect_args={"check_same_thread": False},
        poolclass=NullPool,
    )
    cdb.Base.metadata.create_all(db_engine)
    session_factory = sessionmaker(bind=db_engine, autoflush=False, autocommit=False)
    images_dir = tmp_path / "generated_images"
    # Point the routes module at the temporary session factory and directory.
    monkeypatch.setattr(up, "SessionLocal", session_factory)
    monkeypatch.setattr(up, "GENERATED_IMAGES_DIR", str(images_dir))
    handler = UploadHandler(base_dir=str(tmp_path), upload_dir=str(tmp_path / "uploads"))
    up.setup_upload_routes(handler)
    upload_endpoint = _endpoint(up.router)
    response = await upload_endpoint(_request(user="alice"), [_image_upload()])
    first_file = response["files"][0]
    assert first_file["gallery_id"]
    # Read the row back through a fresh session and verify each recorded field.
    session = session_factory()
    try:
        matching = session.query(GalleryImage).filter(GalleryImage.id == first_file["gallery_id"])
        row = matching.one()
        assert row.owner == "alice"
        assert row.model == "chat-upload"
        assert row.prompt == "photo.png"
        assert row.file_hash == first_file["hash"]
        stored_copy = images_dir / row.filename
        assert stored_copy.exists()
    finally:
        session.close()
 async def test_non_image_chat_upload_is_not_added_to_gallery(tmp_path, monkeypatch):
    """Upload a ``.txt`` file through the route and check no gallery row appears.
    Uses the same per-test SQLite database and patched image directory as the
    image-upload test, but sends a plain-text payload.
    """
    # Throwaway SQLite database; NullPool means no connection is kept pooled.
    db_engine = create_engine(
        f"sqlite:///{tmp_path / 'gallery.db'}",
        connect_args={"check_same_thread": False},
        poolclass=NullPool,
    )
    cdb.Base.metadata.create_all(db_engine)
    session_factory = sessionmaker(bind=db_engine, autoflush=False, autocommit=False)
    # Point the routes module at the temporary session factory and directory.
    monkeypatch.setattr(up, "SessionLocal", session_factory)
    monkeypatch.setattr(up, "GENERATED_IMAGES_DIR", str(tmp_path / "generated_images"))
    handler = UploadHandler(base_dir=str(tmp_path), upload_dir=str(tmp_path / "uploads"))
    up.setup_upload_routes(handler)
    upload_endpoint = _endpoint(up.router)
    text_upload = types.SimpleNamespace(
        filename="notes.txt",
        file=io.BytesIO(b"plain text upload"),
    )
    response = await upload_endpoint(_request(user="alice"), [text_upload])
    first_file = response["files"][0]
    assert "gallery_id" not in first_file
    # The gallery table must still be empty afterwards.
    session = session_factory()
    try:
        row_count = session.query(GalleryImage).count()
        assert row_count == 0
    finally:
        session.close()