From fa8c93ec0ae8a960ba29190eb34f713c76cbd7de Mon Sep 17 00:00:00 2001
From: pewdiepie-archdaemon <pewdiepie-archdaemon@users.noreply.github.com>
Date: Mon, 8 Jun 2026 22:38:49 +0900
Subject: [PATCH] Cookbook UI: Ollama browser, advanced serve fold, API tokens
 form, diagnosis toolbar, polish
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Surface a lot of accumulated cookbook + UI work as a single non-agent
commit so the agent rework lands cleanly.

Highlights:
- Ollama as a first-class backend in the Cookbook:
  * Download input accepts ollama-style names (name:tag) → backend=ollama
  * /api/cookbook/ollama/library (cached scrape of ollama.com + curated
    fallback so classic models like qwen2.5 stay reachable)
  * "Browse Ollama library" toggle below Download with size chips
  * Engine=Ollama in hwfit toolbar merges the Ollama library into the
    main scan list as per-tag rows with the same Fit/Param/Quant/VRAM
    columns; click → fills Download input
- API Tokens form added to the Integrations panel (the already-wired
  loadTokens()/initTokenForm() functions had no matching HTML)
- Serve panel polish: Advanced fold tightening (-8px nudges on vLLM
  checks, Extra args, Spec row), n_cpu_moe + Split Mode controls
  pulled up 8px to align with the row's checkboxes, GGUF File dropdown
  exposed for Ollama backend, GPU re-render on Edit serve restore,
  _forceBackend flag so saved serveState wins over backend detection,
  cookbook:servers-changed CustomEvent so panels don't need refresh
- Models page redesign: Add Models row (URL + hidden API key reveal +
  Type select + Scan/Ollama/Key/Test/Add icon buttons), Probe All +
  Clear-offline buttons in Added Models toolbar, offline-pill removed
  (opacity already conveys state), Engine dropdown gains Ollama option
- _ping_endpoint probes /v1/models, then the base URL, and accepts 4xx
  as reachable (vLLM returns 404 on bare /v1, so fully working
  endpoints were showing as offline)
- Diagnosis card: × dismiss + Copy bundle buttons restored on the
  serve error feedback card
- Orphan tmux sweep re-enabled behind a 60s rate-limit + background
  Thread (off the main event loop) so dead serves get discovered
- cookbook_routes auto-register watchdog: drops the endpoint if the
  serve session exits non-zero within the first ~3min
- ollama-rocm sidecar awareness in download wrapper (`docker exec
  ollama-rocm ollama pull` when host ollama isn't installed)
- Skill extractor sets initial_status="published" when
  auto_approve_skills pref is on (audit demotes later)
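- Skill list / model list / cookbook scan misc polish
- Email MCP server: draft_email, draft_email_reply and
  ai_draft_email_reply tools create reviewable compose documents
  instead of sending immediately
- API tokens accept cookbook:read and cookbook:launch scopes
- remote_host validation accepts a bare host / SSH alias as well as
  user@host
- app.py registers the terminal agent router
  (routes/terminal_agent_routes.py is not part of this patch)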
---
 app.py                             |    4 +
 mcp_servers/email_server.py        |  533 ++++++++++++-
 routes/api_token_routes.py         |    2 +
 routes/cookbook_helpers.py         |    8 +-
 routes/cookbook_routes.py          | 1157 ++++++++++++++++++++--------
 routes/hwfit_routes.py             |   19 +-
 routes/model_routes.py             |  276 +++----
 services/hwfit/data/hf_models.json |   25 +-
 services/memory/skill_extractor.py |   15 +
 src/tool_implementations.py        |  150 +++-
 static/index.html                  |  134 +++-
 static/js/admin.js                 |  156 +++-
 static/js/chatRenderer.js          |   22 +
 static/js/cookbook-diagnosis.js    |   43 +-
 static/js/cookbook-hwfit.js        |  167 +++-
 static/js/cookbook.js              |  512 ++++++------
 static/js/cookbookDownload.js      |   12 +-
 static/js/cookbookRunning.js       |    9 +
 static/js/cookbookSchedule.js      |   36 +-
 static/js/cookbookServe.js         |  301 ++++----
 static/js/documentLibrary.js       |    7 +-
 static/js/emailLibrary.js          |    2 +-
 static/js/markdown.js              |  151 +++-
 static/js/modelPicker.js           |   11 +-
 static/js/models.js                |    9 +-
 static/js/settings.js              |   10 +-
 static/js/skills.js                |    8 +-
 static/style.css                   |  280 ++++++-
 28 files changed, 3033 insertions(+), 1026 deletions(-)

diff --git a/app.py b/app.py
index 97906bd46..0af6b18ea 100644
--- a/app.py
+++ b/app.py
@@ -650,6 +650,10 @@ app.include_router(calendar_router)
 from routes.shell_routes import setup_shell_routes
 app.include_router(setup_shell_routes())
 
+# Terminal agents (tmux-backed Codex/Claude/shell sessions)
+from routes.terminal_agent_routes import setup_terminal_agent_routes
+app.include_router(setup_terminal_agent_routes())
+
 # Cookbook (model download/serve/cache, cookbook state sync)
 from routes.cookbook_routes import setup_cookbook_routes
 app.include_router(setup_cookbook_routes())
diff --git a/mcp_servers/email_server.py b/mcp_servers/email_server.py
index d1c2ac07e..db731ec0f 100644
--- a/mcp_servers/email_server.py
+++ b/mcp_servers/email_server.py
@@ -22,6 +22,7 @@ import os
 import os.path
 from pathlib import Path
 from datetime import datetime, timedelta
+import uuid
 
 from mcp.server import Server
 from mcp.server.stdio import stdio_server
@@ -67,6 +68,59 @@ def _db_path() -> Path:
     return Path(APP_DB)
 
 
+def _load_email_writing_style() -> str:
+    """Return the Settings > Email > Writing Style value from settings.json ("" if unset or unreadable)."""
+    try:
+        settings_path = DATA_DIR / "settings.json"
+        if not settings_path.exists():
+            return ""
+        settings = json.loads(settings_path.read_text(encoding="utf-8"))
+        return str(settings.get("email_writing_style") or "").strip()
+    except Exception:
+        return ""
+
+
+def _writing_style_guidance() -> str:
+    style = _load_email_writing_style()
+    if not style:
+        return (
+            "No saved writing style is configured in Settings > Email > Writing Style. "
+            "Use a concise, natural tone and do not invent facts."
+        )
+    return (
+        "Use this saved writing style from Settings > Email > Writing Style when "
+        "drafting the body. It overrides generic tone guidance:\n"
+        f"{style}"
+    )
+
+
+def _default_document_owner() -> str | None:
+    """Best-effort owner for documents created through MCP tools.
+
+    MCP stdio tools do not receive the browser request's authenticated user, but
+    the document library is owner-filtered, so drafts need an owner to be visible.
+    ODYSSEUS_DOCUMENT_OWNER wins when set; otherwise auth.json's first is_admin
+    user (else its first user) is used. Returns None if neither can be determined.
+    """
+    configured = os.environ.get("ODYSSEUS_DOCUMENT_OWNER", "").strip()
+    if configured:
+        return configured
+    try:
+        auth_file = DATA_DIR / "auth.json"
+        if not auth_file.exists():
+            return None
+        users = json.loads(auth_file.read_text(encoding="utf-8")).get("users") or {}
+        if not (isinstance(users, dict) and users):
+            return None
+        # The first admin in mapping order wins; with no admins, use the first user.
+        for name, data in users.items():
+            if isinstance(data, dict) and data.get("is_admin"):
+                return name
+        return next(iter(users))
+    except Exception:
+        return None
+
+
 def _list_accounts_raw() -> list:
     """Return list of dicts from the email_accounts table. Empty list if table
     missing or empty. Never raises."""
@@ -896,6 +950,340 @@ def _send_email(to, subject, body, in_reply_to=None, references=None, cc=None, b
     }
 
 
+def _build_email_document_content(
+    to,
+    subject,
+    body,
+    *,
+    cc=None,
+    bcc=None,
+    in_reply_to=None,
+    references=None,
+    source_uid=None,
+    source_folder=None,
+):
+    """Render compose-document text: header lines, a `---` line, then the body.
+
+    `To:` and `Subject:` are always written; other headers only when truthy.
+    Line breaks inside a header value (e.g. a folded References header) are
+    collapsed to spaces so a value cannot add lines to or end the header block.
+    """
+    def _flat(value):
+        return " ".join(p.strip() for p in str(value).splitlines() if p.strip())
+
+    rows = [("To", to or ""), ("Cc", cc), ("Bcc", bcc), ("Subject", subject or ""),
+            ("In-Reply-To", in_reply_to), ("References", references),
+            ("X-Source-UID", source_uid), ("X-Source-Folder", source_folder)]
+    always = {"To", "Subject"}
+    head = [f"{k}: {_flat(v)}" for k, v in rows if k in always or v]
+    return "\n".join(head) + "\n---\n" + (body or "")
+
+
+def _merge_email_reply_body(existing_content: str, reply_body: str) -> str:
+    """Swap in a new reply body while keeping the headers and any quoted chain.
+
+    Content without a `---` separator line is treated as having no headers, and
+    the reply body alone is returned.
+    """
+    separator = "\n---\n"
+    if separator not in (existing_content or ""):
+        return reply_body or ""
+    headers, old_body = existing_content.split(separator, 1)
+    markers = (
+        "---------- Previous message ----------",
+        "-----Original Message-----",
+        "----- Original Message -----",
+    )
+    hits = [pos for pos in map(old_body.find, markers) if pos != -1]
+    quoted = old_body[min(hits):].strip() if hits else ""
+    fresh = (reply_body or "").strip()
+    parts = [p for p in (fresh, quoted) if p]
+    return f"{headers}{separator}" + "\n\n".join(parts)
+
+
+def _create_email_draft_document(
+    *,
+    to,
+    subject,
+    body,
+    title=None,
+    cc=None,
+    bcc=None,
+    in_reply_to=None,
+    references=None,
+    source_uid=None,
+    source_folder=None,
+    account=None,
+    source_message_id=None,
+):
+    """Create an Odysseus email compose document for user review. Does not send."""
+    from core.database import SessionLocal, Document, DocumentVersion
+    try:
+        from src.event_bus import fire_event
+    except Exception:
+        fire_event = None
+
+    cfg = _load_config(account) if account else _load_config(None)
+    content = _build_email_document_content(
+        to,
+        subject,
+        body,
+        cc=cc,
+        bcc=bcc,
+        in_reply_to=in_reply_to,
+        references=references,
+        source_uid=source_uid,
+        source_folder=source_folder,
+    )
+    doc_id = str(uuid.uuid4())
+    ver_id = str(uuid.uuid4())
+    doc_title = (title or subject or "Email draft").strip() or "Email draft"
+    doc_owner = _default_document_owner()
+
+    db = SessionLocal()
+    try:
+        if source_uid and source_folder:
+            existing = (
+                db.query(Document)
+                .filter(Document.is_active == True)
+                .filter(Document.language == "email")
+                .filter(Document.owner == doc_owner)
+                .filter(Document.source_email_uid == str(source_uid))
+                .filter(Document.source_email_folder == source_folder)
+                .order_by(Document.updated_at.desc())
+                .first()
+            )
+            if existing and "\n---\n" in (existing.current_content or ""):
+                existing.current_content = _merge_email_reply_body(existing.current_content, body or "")
+                existing.version_count = (existing.version_count or 0) + 1
+                ver = DocumentVersion(
+                    id=ver_id,
+                    document_id=existing.id,
+                    version_number=existing.version_count,
+                    content=existing.current_content,
+                    summary="Updated by email MCP draft tool",
+                    source="ai",
+                )
+                db.add(ver)
+                db.commit()
+                if fire_event:
+                    try:
+                        fire_event("document_updated", doc_owner)
+                    except Exception:
+                        pass
+                return {
+                    "draft": True,
+                    "updated": True,
+                    "doc_id": existing.id,
+                    "title": existing.title,
+                    "language": existing.language,
+                    "account": cfg.get("account_name"),
+                    "account_id": cfg.get("account_id"),
+                    "to": to,
+                    "subject": subject,
+                }
+
+        doc = Document(
+            id=doc_id,
+            session_id=None,
+            title=doc_title,
+            language="email",
+            current_content=content,
+            version_count=1,
+            is_active=True,
+            owner=doc_owner,
+            source_email_uid=source_uid,
+            source_email_folder=source_folder,
+            source_email_account_id=cfg.get("account_id"),
+            source_email_message_id=source_message_id,
+        )
+        ver = DocumentVersion(
+            id=ver_id,
+            document_id=doc_id,
+            version_number=1,
+            content=content,
+            summary="Created by email MCP draft tool",
+            source="ai",
+        )
+        db.add(doc)
+        db.add(ver)
+        db.commit()
+        if fire_event:
+            try:
+                fire_event("document_created", doc_owner)
+            except Exception:
+                pass
+        return {
+            "draft": True,
+            "doc_id": doc_id,
+            "title": doc_title,
+            "language": "email",
+            "account": cfg.get("account_name"),
+            "account_id": cfg.get("account_id"),
+            "to": to,
+            "subject": subject,
+        }
+    finally:
+        db.close()
+
+
+def _draft_reply_to_email(uid, body, folder="INBOX", reply_all=False, account=None, title=None):
+    """Create a threaded Odysseus reply draft document. Does not send.
+
+    Fetches the original by UID to derive the recipient, subject and threading
+    headers. Returns the draft creator's result, or {"error": ...} if the fetch fails.
+    """
+    conn = _imap_connect(account)
+    try:
+        conn.select(folder, readonly=True)
+        status, msg_data = conn.uid("FETCH", _b(uid), "(RFC822)")
+    finally:
+        # Log out even when select/fetch raises so the IMAP session is not leaked.
+        conn.logout()
+    if status != "OK" or not msg_data or not msg_data[0]:
+        return {"error": f"Failed to fetch email UID {uid}"}
+    orig = email.message_from_bytes(msg_data[0][1])
+
+    orig_subject = _decode_header(orig.get("Subject", ""))
+    reply_subject = orig_subject if orig_subject.lower().startswith("re:") else f"Re: {orig_subject}"
+    orig_message_id = orig.get("Message-ID", "")
+    orig_references = orig.get("References", "")
+    new_references = (orig_references + " " + orig_message_id).strip() if orig_references else orig_message_id
+
+    _, sender_addr = email.utils.parseaddr(_decode_header(orig.get("From", "")))
+
+    cc = None
+    if reply_all:
+        cc_addrs = []
+        cfg = _load_config(account)
+        own_addrs = {(cfg.get(key) or "").strip().lower() for key in ("imap_user", "from_address")}
+        for header_name in ("To", "Cc"):
+            for _, addr in email.utils.getaddresses([orig.get(header_name, "")]):
+                if addr and addr != sender_addr and addr.strip().lower() not in own_addrs:
+                    cc_addrs.append(addr)
+        if cc_addrs:
+            cc = ", ".join(dict.fromkeys(cc_addrs))
+
+    return _create_email_draft_document(
+        to=sender_addr,
+        subject=reply_subject,
+        body=body,
+        title=title or reply_subject,
+        cc=cc,
+        in_reply_to=orig_message_id,
+        references=new_references,
+        source_uid=uid,
+        source_folder=folder,
+        account=account,
+        source_message_id=orig_message_id,
+    )
+
+
+async def _ai_draft_reply_to_email(uid, folder="INBOX", reply_all=False, account=None, title=None):
+    """Generate a reply with Odysseus' AI-reply prompt/style, then create a compose doc."""
+    read_result = _read_email(uid=uid, folder=folder, account=account)
+    if "error" in read_result:
+        return read_result
+
+    to_addr = read_result.get("from_address") or email.utils.parseaddr(read_result.get("from") or "")[1]
+    subject = read_result.get("subject") or ""
+    reply_subject = subject if subject.lower().startswith("re:") else f"Re: {subject}"
+    original_body = read_result.get("body") or ""
+    message_id = read_result.get("message_id") or ""
+
+    if not original_body.strip():
+        return {"error": "No email body available for AI reply"}
+
+    try:
+        from routes.email_helpers import (
+            _EMAIL_REPLY_SYS_PROMPT_BASE,
+            _apply_email_style_mechanics,
+            _extract_reply,
+            _load_settings,
+        )
+        from src.endpoint_resolver import (
+            resolve_endpoint,
+            resolve_utility_fallback_candidates,
+            resolve_chat_fallback_candidates,
+        )
+        from src.llm_core import llm_call_async_with_fallback
+    except Exception as exc:
+        return {"error": f"AI reply helpers unavailable: {exc}"}
+
+    settings = _load_settings()
+    style = settings.get("email_writing_style", "")
+    system_prompt = _EMAIL_REPLY_SYS_PROMPT_BASE
+    if style:
+        system_prompt += f"\n\nWRITING STYLE TO MATCH:\n{style}"
+
+    user_msg = (
+        f"Recipient: {to_addr}\nSubject: {reply_subject}\n\n"
+        f"Original email and any current draft:\n{original_body[:6000]}\n\n"
+        "Draft a reply. Return only the reply body text."
+    )
+
+    candidates = []
+    seen = set()
+
+    def _add(url, model, headers):
+        key = (url or "", model or "")
+        if not url or not model or key in seen:
+            return
+        seen.add(key)
+        candidates.append((url, model, headers))
+
+    try:
+        _add(*resolve_endpoint("utility", owner=None))
+    except Exception:
+        pass
+    try:
+        _add(*resolve_endpoint("default", owner=None))
+    except Exception:
+        pass
+    try:
+        utility_fallbacks = resolve_utility_fallback_candidates(owner=None) or []
+    except TypeError:
+        utility_fallbacks = resolve_utility_fallback_candidates() or []
+    for cand in utility_fallbacks:
+        _add(*cand)
+    try:
+        chat_fallbacks = resolve_chat_fallback_candidates(owner=None) or []
+    except TypeError:
+        chat_fallbacks = resolve_chat_fallback_candidates() or []
+    for cand in chat_fallbacks:
+        _add(*cand)
+
+    if not candidates:
+        return {"error": "No LLM endpoint configured for AI reply"}
+
+    try:
+        raw_reply = await llm_call_async_with_fallback(
+            candidates,
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_msg},
+            ],
+            temperature=0.7,
+            max_tokens=1024,
+            timeout=60,
+        )
+    except Exception as exc:
+        return {"error": f"AI reply generation failed: {exc}"}
+
+    reply = _apply_email_style_mechanics(_extract_reply(raw_reply or ""))
+    if not reply:
+        return {"error": "AI reply generation returned an empty response"}
+
+    return _draft_reply_to_email(
+        uid=uid,
+        body=reply,
+        folder=folder,
+        reply_all=reply_all,
+        account=account,
+        title=title or reply_subject,
+    )
+
+
 def _reply_to_email(uid, body, folder="INBOX", reply_all=False, account=None):
     """Reply to an existing email by UID. Threads via In-Reply-To/References."""
     conn = None
@@ -1189,6 +1577,8 @@ async def list_tools() -> list[Tool]:
             name="send_email",
             description=(
                 "Send a new email via SMTP. Provide recipient(s), subject, and body. "
+                "This sends immediately; for normal assistant-written email, prefer "
+                "draft_email so the user can review and send from Odysseus. "
                 "For replying to an existing thread, use reply_to_email instead. "
                 "Pass `account` to send from a non-default mailbox."
             ),
@@ -1205,10 +1595,35 @@ async def list_tools() -> list[Tool]:
                 "required": ["to", "subject", "body"],
             },
         ),
+        Tool(
+            name="draft_email",
+            description=(
+                "Create a new Odysseus email compose draft document. This DOES NOT send. "
+                "Use this as the default way to write an email for the user: it opens "
+                "a reviewable email document with To/Cc/Bcc/Subject/body, and the user "
+                "can edit or press Send in Odysseus. "
+                f"{_writing_style_guidance()}"
+            ),
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "to": {"type": "string", "description": "Recipient email address(es), comma-separated"},
+                    "subject": {"type": "string", "description": "Email subject line"},
+                    "body": {"type": "string", "description": "Draft body"},
+                    "cc": {"type": "string", "description": "CC address(es), comma-separated (optional)"},
+                    "bcc": {"type": "string", "description": "BCC address(es), comma-separated (optional)"},
+                    "title": {"type": "string", "description": "Optional Odysseus document title"},
+                    **ACCOUNT_PROP,
+                },
+                "required": ["to", "subject", "body"],
+            },
+        ),
         Tool(
             name="reply_to_email",
             description=(
-                "Reply to an existing email by UID. Automatically threads the reply with "
+                "Reply to an existing email by UID. This sends immediately; for normal "
+                "assistant-written replies, prefer draft_email_reply so the user can "
+                "review and send from Odysseus. Automatically threads the reply with "
                 "In-Reply-To and References headers, prefixes 'Re:' on the subject, and "
                 "uses the original sender as the recipient. Set reply_all=true to also CC "
                 "the original To/Cc recipients. For follow-up 'reply ...' requests, use "
@@ -1226,6 +1641,49 @@ async def list_tools() -> list[Tool]:
                 "required": ["uid", "body"],
             },
         ),
+        Tool(
+            name="draft_email_reply",
+            description=(
+                "Create an Odysseus email reply draft document for an existing email UID. "
+                "This DOES NOT send. It threads the draft with In-Reply-To/References, "
+                "prefills the recipient and subject, and stores source email metadata so "
+                "the user can review and send from the normal email composer. "
+                f"{_writing_style_guidance()}"
+            ),
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "uid": {"type": "string", "description": "Exact Email UID from list_emails/read_email; never invent UID 1"},
+                    "body": {"type": "string", "description": "Draft reply body text"},
+                    "folder": {"type": "string", "description": "IMAP folder (default: INBOX)", "default": "INBOX"},
+                    "reply_all": {"type": "boolean", "description": "Reply to all recipients (default: false)", "default": False},
+                    "title": {"type": "string", "description": "Optional Odysseus document title"},
+                    **ACCOUNT_PROP,
+                },
+                "required": ["uid", "body"],
+            },
+        ),
+        Tool(
+            name="ai_draft_email_reply",
+            description=(
+                "Generate an AI reply using Odysseus' existing AI Reply behavior, "
+                "including Settings > Email > Writing Style, then create an email "
+                "compose document for review. This DOES NOT send and does NOT save "
+                "to the mailbox Drafts folder. Use this when the user asks you to "
+                "write or draft a reply to an email without dictating the exact body."
+            ),
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "uid": {"type": "string", "description": "Exact Email UID from list_emails/read_email; never invent UID 1"},
+                    "folder": {"type": "string", "description": "IMAP folder (default: INBOX)", "default": "INBOX"},
+                    "reply_all": {"type": "boolean", "description": "Reply to all recipients (default: false)", "default": False},
+                    "title": {"type": "string", "description": "Optional Odysseus document title"},
+                    **ACCOUNT_PROP,
+                },
+                "required": ["uid"],
+            },
+        ),
         Tool(
             name="archive_email",
             description="Move an email out of the inbox into the Archive folder. Use after handling an email you want to keep but no longer need in the inbox.",
@@ -1552,6 +2010,31 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]:
             acct_note = f" (from {result['account']})" if result.get("account") else ""
             return [TextContent(type="text", text=f"Sent email to {result['to']} with subject '{result['subject']}'{acct_note}.")]
 
+        elif name == "draft_email":
+            to = arguments.get("to")
+            subject = arguments.get("subject")
+            body = arguments.get("body")
+            if not to or not subject or body is None:
+                return [TextContent(type="text", text="Error: to, subject, and body are required")]
+            result = _create_email_draft_document(
+                to=to,
+                subject=subject,
+                body=body,
+                title=arguments.get("title"),
+                cc=arguments.get("cc"),
+                bcc=arguments.get("bcc"),
+                account=acct,
+            )
+            acct_note = f" from {result['account']}" if result.get("account") else ""
+            return [TextContent(
+                type="text",
+                text=(
+                    f"Created Odysseus email draft `{result['title']}` "
+                    f"(document ID: {result['doc_id']}){acct_note}. "
+                    "It has not been sent; open the document in Odysseus to review and send."
+                ),
+            )]
+
         elif name == "reply_to_email":
             uid = arguments.get("uid")
             body = arguments.get("body")
@@ -1573,6 +2056,54 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]:
                 pass
             return [TextContent(type="text", text=f"Replied to UID {uid}: '{result['subject']}' → {result['to']}")]
 
+        elif name == "draft_email_reply":
+            uid = arguments.get("uid")
+            body = arguments.get("body")
+            if not uid or body is None:
+                return [TextContent(type="text", text="Error: uid and body are required")]
+            result = _draft_reply_to_email(
+                uid=uid,
+                body=body,
+                folder=arguments.get("folder", "INBOX"),
+                reply_all=bool(arguments.get("reply_all", False)),
+                account=acct,
+                title=arguments.get("title"),
+            )
+            if "error" in result:
+                return [TextContent(type="text", text=f"Error: {result['error']}")]
+            acct_note = f" from {result['account']}" if result.get("account") else ""
+            return [TextContent(
+                type="text",
+                text=(
+                    f"Created Odysseus reply draft `{result['title']}` for UID {uid} "
+                    f"(document ID: {result['doc_id']}){acct_note}. "
+                    "It has not been sent; open the document in Odysseus to review and send."
+                ),
+            )]
+
+        elif name == "ai_draft_email_reply":
+            uid = arguments.get("uid")
+            if not uid:
+                return [TextContent(type="text", text="Error: uid is required")]
+            result = await _ai_draft_reply_to_email(
+                uid=uid,
+                folder=arguments.get("folder", "INBOX"),
+                reply_all=bool(arguments.get("reply_all", False)),
+                account=acct,
+                title=arguments.get("title"),
+            )
+            if "error" in result:
+                return [TextContent(type="text", text=f"Error: {result['error']}")]
+            acct_note = f" from {result['account']}" if result.get("account") else ""
+            return [TextContent(
+                type="text",
+                text=(
+                    f"Generated AI reply and created Odysseus compose draft "
+                    f"`{result['title']}` for UID {uid} (document ID: {result['doc_id']}){acct_note}. "
+                    "It has not been sent; open the document in Odysseus to review and send."
+                ),
+            )]
+
         elif name == "archive_email":
             uid = arguments.get("uid")
             if not uid:
diff --git a/routes/api_token_routes.py b/routes/api_token_routes.py
index 97c576d15..05806e420 100644
--- a/routes/api_token_routes.py
+++ b/routes/api_token_routes.py
@@ -25,6 +25,8 @@ ALLOWED_SCOPES = {
     "calendar:write",
     "memory:read",
     "memory:write",
+    "cookbook:read",
+    "cookbook:launch",
 }
 TOKEN_PROFILES = {
     "chat": ["chat"],
diff --git a/routes/cookbook_helpers.py b/routes/cookbook_helpers.py
index 39a18f715..a450278be 100644
--- a/routes/cookbook_helpers.py
+++ b/routes/cookbook_helpers.py
@@ -30,8 +30,9 @@ _LOCAL_MODEL_ID_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*$")
 _OLLAMA_MODEL_ID_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._:/-]{0,200}$")
 # Include pattern is a glob: allow typical safe glyphs only.
 _INCLUDE_RE = re.compile(r"^[A-Za-z0-9._\-*?/\[\]]+$")
-# Remote host: user@host (optionally with :port-free hostname parts).
-_REMOTE_HOST_RE = re.compile(r"^[A-Za-z0-9._-]+@[A-Za-z0-9._-]+$")
+# Remote host: `user@host` or a plain `host` / SSH config alias. A leading `-` is
+# rejected so the value cannot be mistaken for a command-line option.
+_REMOTE_HOST_RE = re.compile(r"^(?!-)(?:[A-Za-z0-9._-]+@)?[A-Za-z0-9._-]+$")
 # HF tokens and API tokens are url-safe base64-like.
 _TOKEN_RE = re.compile(r"^[A-Za-z0-9._~+/=-]+$")
 # Session IDs we mint look like "cookbook-deadbeef" or "serve-deadbeef".
@@ -81,7 +82,7 @@ def _validate_remote_host(v: str | None) -> str | None:
     if v is None or v == "":
         return None
     if not _REMOTE_HOST_RE.match(v):
-        raise HTTPException(400, "Invalid remote_host — must be user@host, no SSH option syntax")
+        raise HTTPException(400, "Invalid remote_host — must be host or user@host, no SSH option syntax")
     return v
 
 
@@ -787,6 +788,7 @@ def _llama_cpp_rebuild_cmd() -> str:
 
 class ModelDownloadRequest(BaseModel):
     repo_id: str
+    backend: str | None = None  # "hf" (default) or "ollama"
     include: str | None = None  # glob pattern e.g. "*Q4_K_M*"
     hf_token: str | None = None
     env_prefix: str | None = None  # e.g. "source ~/venv/bin/activate"
diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py
index 7a1ee85c6..ba950f4b7 100644
--- a/routes/cookbook_routes.py
+++ b/routes/cookbook_routes.py
@@ -15,26 +15,19 @@ from pathlib import Path
 from fastapi import APIRouter, HTTPException, Request, Depends
 
 from src.auth_helpers import require_user
-from src.constants import COOKBOOK_STATE_FILE
 from pydantic import BaseModel
 
 from core.middleware import require_admin
 from core.platform_compat import (
     IS_WINDOWS,
-    SSH_PATH_OVERRIDE,
-    NVIDIA_PATH_CANDIDATES,
     detached_popen_kwargs,
     find_bash,
-    git_bash_path,
     kill_process_tree,
     pid_alive,
     safe_chmod,
     which_tool,
-    translate_path,
-    get_wsl_windows_user_profile,
 )
 from routes.shell_routes import TMUX_LOG_DIR
-from src.constants import COOKBOOK_STATE_FILE
 
 logger = logging.getLogger(__name__)
 
@@ -45,10 +38,8 @@ from routes.cookbook_helpers import (
     _ps_squote, _bash_squote, _validate_serve_cmd, _parse_serve_phase,
     _safe_env_prefix, _local_tooling_path_export, _append_serve_preflight_exit_lines,
     _append_serve_exit_code_lines, _append_llama_cpp_linux_accel_build_lines, _cached_model_scan_script,
-    _append_vllm_linux_preflight_lines, _ollama_bind_from_cmd, _pip_install_fallback_chain,
-    _pip_install_no_cache, _user_shell_path_bootstrap, _venv_safe_local_pip_install_cmd,
-    _append_pip_install_runner_lines,
-    _diagnose_serve_output, run_ssh_command_async,
+    _ollama_bind_from_cmd, _pip_install_fallback_chain, _pip_install_no_cache,
+    _user_shell_path_bootstrap, _venv_safe_local_pip_install_cmd,
     ModelDownloadRequest, ServeRequest,
 )
 
@@ -63,7 +54,7 @@ _HF_TOKEN_STATUS_SNIPPET = (
 
 def setup_cookbook_routes() -> APIRouter:
     router = APIRouter(tags=["cookbook"])
-    _cookbook_state_path = Path(COOKBOOK_STATE_FILE)
+    _cookbook_state_path = Path(os.environ.get("DATA_DIR", "data")) / "cookbook_state.json"
 
     def _mask_secret(value: str) -> str:
         if not value:
@@ -90,6 +81,127 @@ def setup_cookbook_routes() -> APIRouter:
                     task["payload"].pop("hf_token", None)
         return state
 
+    def _diagnose_serve_output(text: str) -> dict | None:
+        """Server-side mirror of the Cookbook UI's common serve diagnoses.
+
+        The browser uses cookbook-diagnosis.js for clickable fixes; this gives the agent/tool path the same signal.
+        Only the last 6000 characters of ``text`` are scanned, case-insensitively, and patterns are tried in order.
+        Returns ``{"message": str, "suggestions": list[dict]}`` for the first match, or ``None`` if nothing matches.
+        """
+        if not text:
+            return None
+        tail = text[-6000:]
+        patterns = [
+            (
+                r"No available memory for the cache blocks|Available KV cache memory:.*-",
+                "No GPU memory left for KV cache after loading model.",
+                [
+                    {"label": "retry with GPU memory utilization 0.95", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.95"},
+                    {"label": "retry with context 2048", "op": "replace", "flag": "--max-model-len", "value": "2048"},
+                ],
+            ),
+            (
+                r"CUDA out of memory|torch\.cuda\.OutOfMemoryError|CUDA error: out of memory|warming up sampler|max_num_seqs.*gpu_memory_utilization",
+                "GPU ran out of memory during startup or warmup.",
+                [
+                    {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
+                    {"label": "retry with GPU memory utilization 0.80", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.80"},
+                    {"label": "retry with --enforce-eager", "op": "append", "arg": "--enforce-eager"},
+                ],
+            ),
+            (
+                r"not divisib|must be divisible|attention heads.*divisible",
+                "Tensor parallel size is incompatible with the model.",
+                [
+                    {"label": "retry with tensor parallel size 1", "op": "replace", "flag": "--tensor-parallel-size", "value": "1"},
+                    {"label": "retry with tensor parallel size 2", "op": "replace", "flag": "--tensor-parallel-size", "value": "2"},
+                ],
+            ),
+            (
+                r"KV cache.*too (small|large)|max_model_len.*exceeds|maximum.*context",
+                "Context length is too large for available GPU memory.",
+                [
+                    {"label": "retry with context 8192", "op": "replace", "flag": "--max-model-len", "value": "8192"},
+                    {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
+                ],
+            ),
+            (
+                r"enable-auto-tool-choice requires --tool-call-parser",
+                "Auto tool choice requires an explicit tool call parser.",
+                [{"label": "retry with Hermes tool parser", "op": "append", "arg": "--tool-call-parser hermes"}],
+            ),
+            (
+                r"Please pass.*trust.remote.code=True|contains custom code which must be executed to correctly load|does not recognize this architecture|model type.*but Transformers does not",
+                "Model requires custom code or newer model support.",
+                [{"label": "retry with --trust-remote-code", "op": "append", "arg": "--trust-remote-code"}],
+            ),
+            (
+                r"Either a revision or a version must be specified|transformers\.integrations\.hub_kernels|kernels/layer",
+                "vLLM/Transformers kernel package mismatch.",
+                [{"label": "update vLLM, Transformers, and kernels on this server", "op": "dependency", "package": "vllm transformers kernels"}],
+            ),
+            (
+                r"Address already in use|bind.*address.*in use",
+                "Port is already in use.",
+                [{"label": "retry on port 8001", "op": "replace", "flag": "--port", "value": "8001"}],
+            ),
+            (
+                r"No CUDA GPUs are available|no GPU.*found|CUDA_VISIBLE_DEVICES.*invalid",
+                "No GPUs are visible to the serve process.",
+                [{"label": "clear Cookbook GPU selection or choose available GPUs", "op": "settings", "field": "gpus", "value": ""}],
+            ),
+            (
+                r"Failed to infer device type|NVML Shared Library Not Found|No module named 'amdsmi'|platform is not available",
+                "vLLM could not find a supported GPU (CUDA or ROCm). "
+                "This machine may have integrated or unsupported graphics only.",
+                [
+                    {"label": "switch to llama.cpp (CPU/Metal, works without a discrete GPU)", "op": "manual"},
+                    {"label": "switch to Ollama (CPU/Metal, works without a discrete GPU)", "op": "manual"},
+                ],
+            ),
+            (
+                r"vllm.*command not found|No module named '?vllm|ERROR: vLLM is not installed",  # quote optional: `python -m` prints the bare name, ImportError quotes it
+                "vLLM is not installed or not in PATH on this server.",
+                [{"label": "install vLLM in Cookbook Dependencies", "op": "dependency", "package": "vllm"}],
+            ),
+            (
+                r"sglang.*command not found|No module named '?sglang|SGLang is not installed",  # quote optional, same reason as the vLLM pattern
+                "SGLang is not installed or not in PATH on this server.",
+                [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}],
+            ),
+            (
+                r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found",
+                "llama.cpp / llama-cpp-python dependencies are missing.",
+                [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}],
+            ),
+            (
+                r"No GGUF found on this host|no \.gguf file|No GGUF file found",
+                "No GGUF file found for this model on this host. The llama.cpp backend needs a .gguf file.",
+                [{"label": "download a GGUF build of this model (repo name usually ends in -GGUF, file like Q4_K_M.gguf)", "op": "manual"}],
+            ),
+            (
+                r"No module named 'torch'|No module named torch|No module named 'diffusers'|No module named diffusers",
+                "Diffusion serving requires PyTorch and diffusers.",
+                [{"label": "install diffusers[torch] in Cookbook Dependencies", "op": "dependency", "package": "diffusers[torch]"}],
+            ),
+            (
+                r"403 Forbidden|401 Unauthorized|Access to model.*is restricted|gated repo|not in the authorized list|awaiting a review",
+                "Model access is gated or unauthorized.",
+                [{"label": "set HF token and request model access on HuggingFace", "op": "manual"}],
+            ),
+        ]
+        for pattern, message, suggestions in patterns:
+            if re.search(pattern, tail, re.I):
+                return {"message": message, "suggestions": suggestions}
+        if re.search(r"Traceback \(most recent call last\)", tail, re.I) and not re.search(
+            r"Application startup complete|GET /v1/|Uvicorn running on", tail, re.I
+        ):
+            return {
+                "message": "Python traceback detected during serve startup.",
+                "suggestions": [{"label": "inspect traceback and retry with adjusted backend/settings", "op": "manual"}],
+            }
+        return None
+
     def _state_for_client(state):
         """Return cookbook state without raw secrets for browser clients."""
         _strip_task_secrets(state)
@@ -183,7 +295,6 @@ def setup_cookbook_routes() -> APIRouter:
         safe_chmod(key_path.with_suffix(".pub"), 0o644)
         return {"ok": True, "public_key": _read_cookbook_public_key()}
 
-
     def _needs_binary(cmd: str, binary: str) -> bool:
         return bool(re.search(rf"(^|[\s;&|()]){re.escape(binary)}($|[\s;&|()])", cmd or ""))
 
@@ -244,8 +355,8 @@ def setup_cookbook_routes() -> APIRouter:
             # POSIX form + shell-quoting so drive paths / spaces survive.
             inner = TMUX_LOG_DIR / f"{session_id}_run.sh"
             inner.write_text("\n".join(bash_lines) + "\n", encoding="utf-8")
-            lp = shlex.quote(git_bash_path(log_path))
-            ip = shlex.quote(git_bash_path(inner))
+            lp = shlex.quote(log_path.as_posix())
+            ip = shlex.quote(inner.as_posix())
             script_path = TMUX_LOG_DIR / f"{session_id}.sh"
             script_path.write_text(
                 f"bash {ip} > {lp} 2>&1\n",
@@ -286,24 +397,33 @@ def setup_cookbook_routes() -> APIRouter:
         require_admin(request)
         # Defence-in-depth: even though this endpoint is admin-gated, refuse
         # values that would land in shell contexts with metacharacters.
-        _validate_repo_id(req.repo_id)
-        _validate_include(req.include)
+        backend = (req.backend or "").strip().lower()
+        is_ollama_download = backend == "ollama" or ("/" not in req.repo_id and ":" in req.repo_id)
+        if is_ollama_download:
+            _validate_serve_model_id(req.repo_id)
+            req.include = None
+            req.local_dir = None
+        else:
+            _validate_repo_id(req.repo_id)
+            _validate_include(req.include)
         _validate_remote_host(req.remote_host)
         req.ssh_port = _validate_ssh_port(req.ssh_port)
         req.local_dir = _validate_local_dir(req.local_dir)
-        req.hf_token = req.hf_token or _load_stored_hf_token()
+        req.hf_token = "" if is_ollama_download else (req.hf_token or _load_stored_hf_token())
         _validate_token(req.hf_token)
         TMUX_LOG_DIR.mkdir(parents=True, exist_ok=True)
         session_id = f"cookbook-{uuid.uuid4().hex[:8]}"
         wrapper_script = TMUX_LOG_DIR / f"{session_id}.sh"
 
-        # When a download directory is set, target a per-model subfolder under it
-        # (<dir>/<name>) so the flat-directory cache scan lists it as its own
-        # model. Without it, hf/snapshot_download falls back to the HF cache.
-        _dl_short = req.repo_id.split("/")[-1] if "/" in req.repo_id else req.repo_id
-        _dl_base = (req.local_dir.rstrip("/") + "/" + _dl_short) if req.local_dir else None
-        _dl_shell = _shell_path(_dl_base) if _dl_base else None      # for hf CLI / bash
-        _dl_pyarg = (", local_dir=os.path.expanduser(" + repr(_dl_base) + ")") if _dl_base else ""
+        # Custom download dir: point the HF cache at <dir>/hub via env vars
+        # (HF_HOME + HUGGINGFACE_HUB_CACHE) instead of --local-dir. local_dir
+        # produces a flat layout (<dir>/<name>/<file>) and the local-dir
+        # bookkeeping files (.cache/huggingface/.gitignore.lock), and it
+        # also breaks robust resume on flaky transfers — the blob-based hub
+        # cache survives SSL ReadError mid-stream by reusing <sha>.incomplete,
+        # local_dir does not. See issue #2722.
+        _dl_hf_home_shell = _shell_path(req.local_dir.rstrip("/")) if req.local_dir else None
+        _dl_pyarg = ""  # snapshot_download honors the env vars too — no kwarg needed
 
         # Build the hf download command. Redirection to suppress the interactive
         # "update available? [Y/n]" prompt is added per-platform further down
@@ -311,8 +431,7 @@ def setup_cookbook_routes() -> APIRouter:
         hf_cmd = f"hf download {req.repo_id}"
         if req.include:
             hf_cmd += f" --include '{req.include}'"
-        if _dl_shell:
-            hf_cmd += f" --local-dir {_dl_shell}"
+        ollama_cmd = f"ollama pull {shlex.quote(req.repo_id)}"
 
         # Build the shell wrapper — runs hf download directly in tmux (which is a TTY)
         # No script/tee needed — we'll use tmux capture-pane to read output
@@ -320,8 +439,15 @@ def setup_cookbook_routes() -> APIRouter:
         lines.extend(_user_shell_path_bootstrap())
         if req.hf_token:
             lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'")
+        if _dl_hf_home_shell and not is_ollama_download:
+            # Make hf download / snapshot_download honor the chosen dir via the
+            # standard HF cache (gives us the models--org--name/blobs/... layout
+            # with resumable .incomplete blobs).
+            lines.append(f"export HF_HOME={_dl_hf_home_shell}")
+            lines.append(f"export HUGGINGFACE_HUB_CACHE={_dl_hf_home_shell}/hub")
+            lines.append(f"export HF_HUB_CACHE={_dl_hf_home_shell}/hub")
         # Ensure pip-user scripts (e.g. hf CLI installed via --user) are on PATH
-        lines.append('export PATH="$HOME/.local/bin:$PATH"')
+        lines.append('export PATH="$HOME/.local/bin:$HOME/bin:/opt/homebrew/bin:/usr/local/bin:$PATH"')
         # When Odysseus runs from a venv (e.g. native macOS install), put its bin
         # on PATH so the tmux shell finds the bundled `hf`/`python3` without an
         # activated venv. Local bash runs only — meaningless over SSH.
@@ -332,14 +458,25 @@ def setup_cookbook_routes() -> APIRouter:
         # throughput. Retries set disable_hf_transfer to fall back to the plain,
         # slower-but-reliable downloader (resumes cleanly from the .incomplete files).
         # Use `python3 -m pip` not `pip` — macOS has no bare `pip` command.
-        lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', upgrade=True)}")
-        if req.disable_hf_transfer:
-            lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
-            lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
+        if is_ollama_download:
+            lines.append('if command -v ollama >/dev/null 2>&1; then')
+            lines.append(f'  ODYSSEUS_OLLAMA_PULL_CMD={shlex.quote(ollama_cmd)}')
+            lines.append('elif command -v docker >/dev/null 2>&1; then')
+            lines.append('  ODYSSEUS_OLLAMA_CONTAINER="$(docker ps --format \'{{.Names}}\' 2>/dev/null | grep -E \'^(ollama-rocm|ollama-test)$\' | head -1)"')
+            lines.append('  if [ -n "$ODYSSEUS_OLLAMA_CONTAINER" ]; then')
+            lines.append(f'    ODYSSEUS_OLLAMA_PULL_CMD={shlex.quote("docker exec ${ODYSSEUS_OLLAMA_CONTAINER} " + ollama_cmd)}')
+            lines.append('  fi')
+            lines.append('fi')
+            lines.append('if [ -z "$ODYSSEUS_OLLAMA_PULL_CMD" ]; then echo "ERROR: Ollama not found on this server. Install Ollama or start an ollama-rocm/ollama-test container."; exit 127; fi')
         else:
-            lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer')}")
-            lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
-            lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
+            lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', upgrade=True)}")
+            if req.disable_hf_transfer:
+                lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
+                lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
+            else:
+                lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer')}")
+                lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
+                lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
 
         remote = req.remote_host  # None for local
         is_windows = req.platform == "windows"
@@ -361,37 +498,48 @@ def setup_cookbook_routes() -> APIRouter:
             ps_lines = []
             ps_lines.append('$sessionDir = "$env:TEMP\\odysseus-sessions"')
             ps_lines.append('New-Item -ItemType Directory -Force -Path $sessionDir | Out-Null')
-            ps_lines.append('$env:PYTHONIOENCODING = "utf-8"')
-            ps_lines.append('$env:PYTHONUTF8 = "1"')
             if req.hf_token:
                 ps_lines.append(f"$env:HF_TOKEN = '{_ps_squote(req.hf_token)}'")
+            if req.local_dir and not is_ollama_download:
+                # Mirror the bash branch — point the HF cache at the user's dir
+                # via env vars instead of --local-dir, so resume works on flaky
+                # transfers (issue #2722).
+                _dl_ps = _ps_squote(req.local_dir.rstrip("/"))
+                ps_lines.append(f"$env:HF_HOME = '{_dl_ps}'")
+                ps_lines.append(f"$env:HUGGINGFACE_HUB_CACHE = '{_dl_ps}/hub'")
+                ps_lines.append(f"$env:HF_HUB_CACHE = '{_dl_ps}/hub'")
             if req.env_prefix:
                 ps_lines.append(_safe_env_prefix(req.env_prefix))
-            # Try hf CLI, fall back to Python huggingface_hub, then auto-install
-            ps_lines.append('try {{')
-            ps_lines.append('  $hfPath = Get-Command hf -ErrorAction SilentlyContinue')
-            ps_lines.append('  if ($hfPath) {{')
-            # Pipe $null to stdin to suppress interactive "update available? [Y/n]" prompt
-            ps_lines.append(f'    $null | {hf_cmd}')
-            ps_lines.append('  }} else {{')
-            ps_lines.append('    python -c "import huggingface_hub" 2>$null')
-            ps_lines.append('    if ($LASTEXITCODE -eq 0) {{')
-            ps_lines.append('      Write-Host "hf CLI not found, using Python huggingface_hub..."')
-            ps_lines.append('      python -m pip install -q hf_transfer 2>$null')
-            ps_lines.append('      $env:HF_HUB_ENABLE_HF_TRANSFER = "1"')
-            ps_lines.append(f"      python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers=8)\"")
-            ps_lines.append('    }} else {{')
-            ps_lines.append('      Write-Host "Installing huggingface-hub..."')
-            ps_lines.append('      python -m pip install -q huggingface-hub hf_transfer')
-            ps_lines.append('      $env:HF_HUB_ENABLE_HF_TRANSFER = "1"')
-            ps_lines.append(f"      python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers=8)\"")
-            ps_lines.append('    }}')
-            ps_lines.append('  }}')
-            ps_lines.append('  if ($LASTEXITCODE -eq 0) {{ Write-Host ""; Write-Host "DOWNLOAD_OK" }}')
-            ps_lines.append('  else {{ Write-Host ""; Write-Host "DOWNLOAD_FAILED (exit $LASTEXITCODE)" }}')
-            ps_lines.append('}} catch {{')
-            ps_lines.append('  Write-Host ""; Write-Host "DOWNLOAD_FAILED ($_)"')
-            ps_lines.append('}}')
+            if is_ollama_download:
+                ps_lines.append('if (-not (Get-Command ollama -ErrorAction SilentlyContinue)) { Write-Host "ERROR: Ollama not found. Install from https://ollama.com/download/windows"; exit 127 }')
+                ps_lines.append(f"$null | ollama pull '{_ps_squote(req.repo_id)}'")
+                ps_lines.append('if ($LASTEXITCODE -eq 0) { Write-Host ""; Write-Host "DOWNLOAD_OK" } else { Write-Host ""; Write-Host "DOWNLOAD_FAILED (exit $LASTEXITCODE)" }')
+            else:
+                # Try hf CLI, fall back to Python huggingface_hub, then auto-install
+                ps_lines.append('try {{')
+                ps_lines.append('  $hfPath = Get-Command hf -ErrorAction SilentlyContinue')
+                ps_lines.append('  if ($hfPath) {{')
+                # Pipe $null to stdin to suppress interactive "update available? [Y/n]" prompt
+                ps_lines.append(f'    $null | {hf_cmd}')
+                ps_lines.append('  }} else {{')
+                ps_lines.append('    python -c "import huggingface_hub" 2>$null')
+                ps_lines.append('    if ($LASTEXITCODE -eq 0) {{')
+                ps_lines.append('      Write-Host "hf CLI not found, using Python huggingface_hub..."')
+                ps_lines.append('      python -m pip install -q hf_transfer 2>$null')
+                ps_lines.append('      $env:HF_HUB_ENABLE_HF_TRANSFER = "1"')
+                ps_lines.append(f"      python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers=8)\"")
+                ps_lines.append('    }} else {{')
+                ps_lines.append('      Write-Host "Installing huggingface-hub..."')
+                ps_lines.append('      python -m pip install -q huggingface-hub hf_transfer')
+                ps_lines.append('      $env:HF_HUB_ENABLE_HF_TRANSFER = "1"')
+                ps_lines.append(f"      python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers=8)\"")
+                ps_lines.append('    }}')
+                ps_lines.append('  }}')
+                ps_lines.append('  if ($LASTEXITCODE -eq 0) {{ Write-Host ""; Write-Host "DOWNLOAD_OK" }}')
+                ps_lines.append('  else {{ Write-Host ""; Write-Host "DOWNLOAD_FAILED (exit $LASTEXITCODE)" }}')
+                ps_lines.append('}} catch {{')
+                ps_lines.append('  Write-Host ""; Write-Host "DOWNLOAD_FAILED ($_)"')
+                ps_lines.append('}}')
             ps_lines.append(f'Remove-Item -Force "$HOME\\{remote_runner}" -ErrorAction SilentlyContinue')
             runner_path = TMUX_LOG_DIR / f"{session_id}_run.ps1"
             runner_path.write_text("\r\n".join(ps_lines) + "\r\n", encoding="utf-8")
@@ -422,6 +570,10 @@ def setup_cookbook_routes() -> APIRouter:
             runner_lines.append("deactivate 2>/dev/null; hash -r")
             if req.hf_token:
                 runner_lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'")
+            if _dl_hf_home_shell and not is_ollama_download:
+                runner_lines.append(f"export HF_HOME={_dl_hf_home_shell}")
+                runner_lines.append(f"export HUGGINGFACE_HUB_CACHE={_dl_hf_home_shell}/hub")
+                runner_lines.append(f"export HF_HUB_CACHE={_dl_hf_home_shell}/hub")
             if req.env_prefix:
                 runner_lines.append(_safe_env_prefix(req.env_prefix))
             else:
@@ -432,42 +584,67 @@ def setup_cookbook_routes() -> APIRouter:
                     'done'
                 )
             # Ensure pip-user scripts (e.g. hf CLI installed via --user) are on PATH
-            runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
+            runner_lines.append('export PATH="$HOME/.local/bin:$HOME/bin:/opt/homebrew/bin:/usr/local/bin:$PATH"')
             # Install hf CLI + optional hf_transfer best-effort. Retries disable
             # hf_transfer because the Rust parallel path is fast but has been
             # flaky near the end of very large multi-file downloads.
-            # The helper tries active pip first, then guarded user-site fallbacks.
-            runner_lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', python_cmd='pip', upgrade=True)}")
-            if req.disable_hf_transfer:
-                runner_lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
-                runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
+            # Use --break-system-packages on PEP-668 systems (Arch, newer Debian) so it doesn't bail.
+            if is_ollama_download:
+                runner_lines.append('if command -v ollama >/dev/null 2>&1; then')
+                runner_lines.append(f'  ODYSSEUS_OLLAMA_PULL_CMD={shlex.quote(ollama_cmd)}')
+                runner_lines.append('elif command -v docker >/dev/null 2>&1; then')
+                runner_lines.append('  ODYSSEUS_OLLAMA_CONTAINER="$(docker ps --format \'{{.Names}}\' 2>/dev/null | grep -E \'^(ollama-rocm|ollama-test)$\' | head -1)"')
+                runner_lines.append('  if [ -n "$ODYSSEUS_OLLAMA_CONTAINER" ]; then')
+                runner_lines.append(f'    ODYSSEUS_OLLAMA_PULL_CMD={shlex.quote("docker exec ${ODYSSEUS_OLLAMA_CONTAINER} " + ollama_cmd)}')
+                runner_lines.append('  fi')
+                runner_lines.append('fi')
+                runner_lines.append('if [ -z "$ODYSSEUS_OLLAMA_PULL_CMD" ]; then echo "ERROR: Ollama not found on this server. Install Ollama or start an ollama-rocm/ollama-test container."; exit 127; fi')
             else:
-                runner_lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer', python_cmd='pip')}")
-                runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
-                runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
-            # Surface whether the HF token actually reached THIS server, so a gated
-            # download's "not authorized" failure can be told apart from a missing
-            # token (the token is masked — we only print applied / not-set).
-            runner_lines.append(_HF_TOKEN_STATUS_SNIPPET)
-            # Try hf CLI first, fall back to Python huggingface_hub, then auto-install
-            runner_lines.append('if command -v hf &>/dev/null; then')
-            # < /dev/null suppresses interactive "update available? [Y/n]" prompt
-            runner_lines.append(f'  {hf_cmd} < /dev/null')
-            runner_lines.append('elif python3 -c "import huggingface_hub" 2>/dev/null; then')
-            runner_lines.append('  echo "hf CLI not found, using Python huggingface_hub..."')
-            runner_lines.append(f'  python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={4 if req.disable_hf_transfer else 8})"')
-            runner_lines.append('else')
-            runner_lines.append('  echo "Installing huggingface-hub and dependencies..."')
-            runner_lines.append('  pip install --no-deps -q huggingface-hub 2>/dev/null')
-            if req.disable_hf_transfer:
-                runner_lines.append('  pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests 2>/dev/null')
-                runner_lines.append('  export HF_HUB_ENABLE_HF_TRANSFER=0')
+                runner_lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', python_cmd='pip', upgrade=True)}")
+                if req.disable_hf_transfer:
+                    runner_lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
+                    runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
+                else:
+                    runner_lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer', python_cmd='pip')}")
+                    runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
+                    runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
+                # Surface whether the HF token actually reached THIS server, so a gated
+                # download's "not authorized" failure can be told apart from a missing
+                # token (the token is masked — we only print applied / not-set).
+                runner_lines.append(_HF_TOKEN_STATUS_SNIPPET)
+            # Wrap the download in a retry loop. Large HF/Ollama transfers can
+            # hit transient network failures; both backends resume cached partials.
+            mw = 4 if req.disable_hf_transfer else 8
+            runner_lines.append('_max_retries=10; _attempt=0; _ec=0')
+            runner_lines.append('while [ $_attempt -lt $_max_retries ]; do')
+            runner_lines.append('  _attempt=$((_attempt+1))')
+            if is_ollama_download:
+                runner_lines.append('  eval "$ODYSSEUS_OLLAMA_PULL_CMD" < /dev/null')
             else:
-                runner_lines.append('  pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests hf_transfer 2>/dev/null')
-                runner_lines.append("  python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
-            runner_lines.append(f'  python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={4 if req.disable_hf_transfer else 8})"')
-            runner_lines.append('fi')
-            runner_lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
+                runner_lines.append('  if command -v hf &>/dev/null; then')
+                runner_lines.append(f'    {hf_cmd} < /dev/null')
+                runner_lines.append('  elif python3 -c "import huggingface_hub" 2>/dev/null; then')
+                runner_lines.append('    [ $_attempt -eq 1 ] && echo "hf CLI not found, using Python huggingface_hub..."')
+                runner_lines.append(f'    python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={mw})"')
+                runner_lines.append('  else')
+                runner_lines.append('    echo "Installing huggingface-hub and dependencies..."')
+                runner_lines.append('    pip install --no-deps -q huggingface-hub 2>/dev/null')
+                if req.disable_hf_transfer:
+                    runner_lines.append('    pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests 2>/dev/null')
+                    runner_lines.append('    export HF_HUB_ENABLE_HF_TRANSFER=0')
+                else:
+                    runner_lines.append('    pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests hf_transfer 2>/dev/null')
+                    runner_lines.append("    python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
+                runner_lines.append(f'    python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={mw})"')
+                runner_lines.append('  fi')
+            runner_lines.append('  _ec=$?')
+            runner_lines.append('  if [ $_ec -eq 0 ]; then break; fi')
+            runner_lines.append('  if [ $_attempt -lt $_max_retries ]; then')
+            runner_lines.append('    echo ""; echo "Download attempt $_attempt failed (exit $_ec) — retrying in 30s..."')
+            runner_lines.append('    sleep 30')
+            runner_lines.append('  fi')
+            runner_lines.append('done')
+            runner_lines.append('if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec after $_attempt attempts)"; fi')
             runner_lines.append(f"rm -f {remote_runner}")
             runner_lines.append('exec "${SHELL:-/bin/bash}"')
             runner_path = TMUX_LOG_DIR / f"{session_id}_run.sh"
@@ -493,23 +670,30 @@ def setup_cookbook_routes() -> APIRouter:
                 lines.append("deactivate 2>/dev/null; hash -r")
             # Show whether the HF token reached this run (masked) — tells a gated
             # "not authorized" failure apart from a missing token.
-            lines.append(_HF_TOKEN_STATUS_SNIPPET)
-            if IS_WINDOWS:
-                # Detached path: no controlling TTY, so skip `< /dev/null`
-                # (handled by Popen stdin=DEVNULL) and don't keep a shell open.
-                lines.append(hf_cmd)
-                lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
-            else:
-                # < /dev/null suppresses interactive "update available? [Y/n]" prompt
-                lines.append(f"{hf_cmd} < /dev/null")
-                lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
+            if not is_ollama_download:
+                lines.append(_HF_TOKEN_STATUS_SNIPPET)
+            # Retry loop — same rationale as the remote-bash path. Issue #2722.
+            _hf_invoke = 'eval "$ODYSSEUS_OLLAMA_PULL_CMD" < /dev/null' if is_ollama_download else (hf_cmd if IS_WINDOWS else f"{hf_cmd} < /dev/null")
+            lines.append('_max_retries=10; _attempt=0; _ec=0')
+            lines.append('while [ $_attempt -lt $_max_retries ]; do')
+            lines.append('  _attempt=$((_attempt+1))')
+            lines.append(f'  {_hf_invoke}')
+            lines.append('  _ec=$?')
+            lines.append('  if [ $_ec -eq 0 ]; then break; fi')
+            lines.append('  if [ $_attempt -lt $_max_retries ]; then')
+            lines.append('    echo ""; echo "Download attempt $_attempt failed (exit $_ec) — retrying in 30s..."')
+            lines.append('    sleep 30')
+            lines.append('  fi')
+            lines.append('done')
+            lines.append('if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec after $_attempt attempts)"; fi')
+            if not IS_WINDOWS:
                 lines.append(f"rm -f '{wrapper_script}'")
                 lines.append('exec "${SHELL:-/bin/bash}"')
                 wrapper_script.write_text("\n".join(lines) + "\n", encoding="utf-8")
                 wrapper_script.chmod(0o755)
             setup_cmd = None if IS_WINDOWS else f"tmux new-session -d -s {session_id} {shlex.quote(str(wrapper_script))}"
 
-        logger.info(f"Model download: {req.repo_id} (include={req.include}, session={session_id}, remote={remote})")
+        logger.info(f"Model download: {req.repo_id} (backend={'ollama' if is_ollama_download else 'hf'}, include={req.include}, session={session_id}, remote={remote})")
         logger.info(f"Download setup_cmd: {setup_cmd}")
 
         if setup_cmd is None:
@@ -564,35 +748,24 @@ def setup_cookbook_routes() -> APIRouter:
             for d in model_dir.split(','):
                 d = d.strip()
                 if d:
-                    translated_d = translate_path(d) if not host else d
-                    model_dirs.append(translated_d)
-        win_hf_hub = None
-        if not host:
-            win_profile = get_wsl_windows_user_profile()
-            win_hf_hub = os.path.join(win_profile, ".cache", "huggingface", "hub") if win_profile else None
-            
-        paths_code = _cached_model_scan_script(model_dirs, win_hf_hub)
+                    model_dirs.append(d)
+        paths_code = _cached_model_scan_script(model_dirs)
 
         scan_py = TMUX_LOG_DIR / "scan_cache.py"
         scan_py.write_text(paths_code, encoding="utf-8")
-        scan_payload = scan_py.read_bytes()
 
         if host:
+            _pf = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else ""
             if platform == "windows":
-                remote_cmd = "python -"
+                # Windows: use 'python' and pipe via stdin with double-quote wrapping
+                cmd = f'ssh {_pf}{host} "python -" < \'{scan_py}\''
             else:
-                # POSIX: use 'python3' if available, fall back to 'python'; throw if neither is found.
-                remote_cmd = (
-                    "if command -v python3 >/dev/null 2>&1; then python3 -; "
-                    "elif command -v python >/dev/null 2>&1; then python -; "
-                    "else echo \"python3/python not found\" >&2; exit 127; fi"
-                )
-            rc, stdout_b, stderr_b = await run_ssh_command_async(
-                host,
-                ssh_port,
-                remote_cmd,
-                timeout=60,
-                stdin_data=scan_payload,
+                cmd = f"ssh {_pf}{host} 'python3 -' < '{scan_py}'"
+            proc = await asyncio.create_subprocess_shell(
+                cmd,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+                cwd=str(Path.home()),
             )
         else:
             # LOCAL scan: use sys.executable (the venv Python Odysseus is already
@@ -612,7 +785,7 @@ def setup_cookbook_routes() -> APIRouter:
                 stderr=asyncio.subprocess.PIPE,
                 cwd=str(Path.home()),
             )
-            stdout_b, stderr_b = await asyncio.wait_for(proc.communicate(), timeout=60)
+        stdout_b, stderr_b = await asyncio.wait_for(proc.communicate(), timeout=60)
 
         models = []
         try:
@@ -752,6 +925,100 @@ def setup_cookbook_routes() -> APIRouter:
                     return p
         return None
 
+    async def _serve_crash_watchdog(
+        endpoint_id: str,
+        session_id: str,
+        remote: str | None,
+        ssh_port: str | None,
+        is_windows: bool,
+    ) -> None:
+        """Drop a freshly-registered endpoint when the cookbook serve dies early.
+
+        The runner script always emits ``=== Process exited with code N ===``
+        when the launched cmd terminates (success or failure). We poll the
+        tmux pane periodically; on a non-zero exit detected within the watch
+        window, the endpoint row is deleted so the picker doesn't keep a
+        dead model around. A zero exit (rare for a long-running serve, but
+        possible for fast-failing builds that the runner reports as code 0)
+        and "missing exit marker" both leave the endpoint alone — that's
+        the loading-but-not-yet-bound state, which the probe-marks-offline
+        logic already handles.
+
+        Times are picked to outlast realistic vLLM load times (Qwen3.5-122B
+        takes ~3 min to load) without burning resources on a stuck-forever
+        wait. After the last check, the watchdog gives up — the picker's
+        per-endpoint probe takes over from there.
+        """
+        # Cumulative wait points: 25 s, 60 s, 2 min, 5 min.
+        _waits = [25, 35, 60, 180]
+        # Tmux capture-pane equivalent of the polling path used elsewhere in
+        # this file. Build it once and reuse on each tick. Skip the watchdog
+        # entirely on native-Windows local runs (no tmux). The Windows
+        # detached-process path writes its log to a known file and has its
+        # own lifecycle tracking; punting here keeps the code simple.
+        if is_windows and not remote:
+            return
+        capture_cmd = ["tmux", "capture-pane", "-t", session_id, "-p", "-S", "-200"]
+        if remote:
+            port_args = ["-p", str(ssh_port)] if ssh_port and ssh_port != "22" else []
+            capture_cmd = ["ssh", *port_args, remote, *capture_cmd]
+
+        _exit_re = re.compile(r"=== Process exited with code (-?\d+) ===")
+        for wait_s in _waits:
+            await asyncio.sleep(wait_s)
+            proc = None
+            try:
+                proc = await asyncio.create_subprocess_exec(
+                    *capture_cmd,
+                    stdout=asyncio.subprocess.PIPE,
+                    stderr=asyncio.subprocess.DEVNULL,
+                )
+                stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=8)
+                output = stdout.decode("utf-8", errors="replace")
+            except Exception as e:
+                # Timed-out capture (e.g. hung ssh) is still alive — kill it, don't leak it.
+                if proc is not None and proc.returncode is None:
+                    proc.kill()
+                logger.debug(f"crash-watchdog: capture-pane failed (will retry): {e!r}")
+                continue
+            # Last occurrence wins — a serve that exits/restarts under the
+            # runner's "exec bash -i" trail will emit multiple markers; the
+            # most-recent code is the one that matters.
+            matches = list(_exit_re.finditer(output))
+            if not matches:
+                continue
+            try:
+                exit_code = int(matches[-1].group(1))
+            except (ValueError, IndexError):
+                continue
+            if exit_code == 0:
+                # Exit 0 on a long-running serve is unusual (a normal "loaded
+                # then ready" path keeps the process alive) but it happens for
+                # commands like "ollama pull" the user might launch through
+                # the same form. Don't drop the endpoint on a clean exit;
+                # let the probe layer mark it offline if nothing's listening.
+                logger.info(f"crash-watchdog: serve {session_id} exited cleanly (0); leaving endpoint {endpoint_id}")
+                return
+            # Non-zero exit — drop the endpoint.
+            try:
+                from core.database import SessionLocal as _SL, ModelEndpoint as _ME
+                db = _SL()
+                try:
+                    ep = db.query(_ME).filter(_ME.id == endpoint_id).first()
+                    if ep:
+                        logger.info(
+                            f"crash-watchdog: dropping endpoint {endpoint_id} "
+                            f"({ep.name} @ {ep.base_url}) — serve exited {exit_code}"
+                        )
+                        db.delete(ep)
+                        db.commit()
+                finally:
+                    db.close()
+            except Exception as e:
+                logger.warning(f"crash-watchdog: endpoint cleanup failed: {e!r}")
+            return
+        logger.debug(f"crash-watchdog: no exit marker for {session_id} within window; leaving endpoint {endpoint_id}")
+
     def _auto_register_llm_endpoint(req: ServeRequest, remote: str | None) -> str | None:
         """Register a freshly-served LLM as a model endpoint so it appears in the
         model picker without a manual /setup step — the text-model sibling of
@@ -763,6 +1030,10 @@ def setup_cookbook_routes() -> APIRouter:
         probing /v1/models and dims the endpoint until the server is reachable,
         so registering immediately (before the server finishes loading) is safe.
         """
+        logger.info(
+            f"_auto_register_llm_endpoint: ENTRY repo_id={req.repo_id!r} "
+            f"remote={remote!r} cmd_prefix={req.cmd[:80]!r}"
+        )
         import re
         from core.database import SessionLocal, ModelEndpoint
 
@@ -787,16 +1058,20 @@ def setup_cookbook_routes() -> APIRouter:
         else:
             port = 8080  # llama.cpp's llama-server default — the Apple Silicon path
 
-        # Determine host (mirrors the image path: SSH alias for remote serves).
-        # For local serves while Odysseus runs inside Docker, "localhost"
-        # resolves to the container itself — useless. Use host.docker.internal
-        # which compose maps to the actual host, matching what /setup adds
-        # for Ollama by hand.
+        # Determine host. The cookbook tmux for `local=true` serves runs INSIDE
+        # the odysseus container — so the right URL for the in-container
+        # backend to reach it is `localhost`, NOT `host.docker.internal`
+        # (the latter points at the docker HOST, which doesn't have a server
+        # on that port). The previous host.docker.internal fallback only made
+        # sense for /setup-added external services like systemd Ollama on the
+        # host — and those go through manual setup, not this auto-register
+        # code path. For remote serves we still use the SSH host alias.
         if remote:
             host = remote.split("@")[-1] if "@" in remote else remote
+        elif re.search(r"\bdocker\s+exec\s+(?:ollama-rocm|ollama-test)\b", req.cmd or ""):
+            host = "host.docker.internal"
         else:
-            from routes.model_routes import _docker_host_gateway_reachable
-            host = "host.docker.internal" if _docker_host_gateway_reachable() else "localhost"
+            host = "localhost"
 
         base_url = f"http://{host}:{port}/v1"
 
@@ -805,7 +1080,9 @@ def setup_cookbook_routes() -> APIRouter:
 
         # If the serve command opts models into OpenAI tool-calling, record it so
         # agent_loop trusts emitted tool_calls instead of the name heuristic.
+        is_ollama_endpoint = "ollama" in (req.cmd or "").lower()
         supports_tools = True if "--enable-auto-tool-choice" in req.cmd else None
+        pinned_models = [req.repo_id] if is_ollama_endpoint and req.repo_id else []
 
         db = SessionLocal()
         try:
@@ -815,14 +1092,43 @@ def setup_cookbook_routes() -> APIRouter:
                 existing.is_enabled = True
                 existing.model_type = "llm"
                 existing.name = display_name
+                if is_ollama_endpoint:
+                    existing.endpoint_kind = "ollama"
+                    if pinned_models:
+                        existing.cached_models = json.dumps(pinned_models)
+                        existing.pinned_models = json.dumps(pinned_models)
                 if supports_tools is not None:
                     existing.supports_tools = supports_tools
-                # Wipe stale model lists so the picker re-probes and discovers
-                # the newly-served model instead of showing the old one.
-                existing.cached_models = None
-                existing.hidden_models = None
                 db.commit()
                 logger.info(f"Updated existing local model endpoint: {base_url}")
+                # Re-probe so cached_models matches what the server actually
+                # serves right now (the URL may have stayed the same but the
+                # model behind it changed across launches).
+                try:
+                    from routes.model_routes import _probe_endpoint
+                    import json as _json2
+                    probed = _probe_endpoint(base_url, existing.api_key, timeout=5)
+                    if probed:
+                        existing.cached_models = _json2.dumps(probed)
+                        db.commit()
+                except Exception as _pe:
+                    logger.warning(f"Re-probe failed for {base_url}: {_pe!r}")
+                # Sweep stale dupes: other endpoints with the same display name
+                # at DIFFERENT URLs (likely failed earlier-attempt ports) get
+                # deleted so the picker doesn't show an offline ghost next to
+                # the working one. Only sweeps endpoints whose id starts with
+                # `local-` so we never touch a user's hand-added DeepSeek/OpenAI/
+                # etc. entry with a coincidentally matching name.
+                stale = (db.query(ModelEndpoint)
+                         .filter(ModelEndpoint.name == display_name)
+                         .filter(ModelEndpoint.base_url != base_url)
+                         .filter(ModelEndpoint.id.like("local-%"))
+                         .all())
+                for s in stale:
+                    logger.info(f"Sweeping stale local endpoint {s.id} ({s.base_url})")
+                    db.delete(s)
+                if stale:
+                    db.commit()
                 return existing.id
 
             ep_id = f"local-{uuid.uuid4().hex[:8]}"
@@ -833,11 +1139,42 @@ def setup_cookbook_routes() -> APIRouter:
                 api_key=None,
                 is_enabled=True,
                 model_type="llm",
+                endpoint_kind="ollama" if is_ollama_endpoint else "auto",
+                cached_models=json.dumps(pinned_models) if pinned_models else None,
+                pinned_models=json.dumps(pinned_models) if pinned_models else None,
                 supports_tools=supports_tools,
             )
             db.add(ep)
             db.commit()
             logger.info(f"Auto-registered local model endpoint: {display_name} @ {base_url}")
+            # Same sweep on first-register path: drop any pre-existing local-*
+            # endpoints with this display name pointed elsewhere.
+            stale = (db.query(ModelEndpoint)
+                     .filter(ModelEndpoint.name == display_name)
+                     .filter(ModelEndpoint.id != ep_id)
+                     .filter(ModelEndpoint.id.like("local-%"))
+                     .all())
+            for s in stale:
+                logger.info(f"Sweeping stale local endpoint {s.id} ({s.base_url})")
+                db.delete(s)
+            if stale:
+                db.commit()
+            # Probe /v1/models NOW and write cached_models so the chat
+            # picker actually shows the model on the next /api/models
+            # call. Without this immediate probe, the endpoint has empty
+            # cached_models until the next background refresh fires (up
+            # to a minute later) and the picker shows nothing — even
+            # though the endpoint is in the DB and the server is up.
+            try:
+                from routes.model_routes import _probe_endpoint
+                import json as _json2
+                probed = _probe_endpoint(base_url, None, timeout=5)
+                if probed:
+                    ep.cached_models = _json2.dumps(probed)
+                    db.commit()
+                    logger.info(f"Auto-register: probed {len(probed)} models @ {base_url}")
+            except Exception as _pe:
+                logger.warning(f"Auto-register: probe-after-create failed for {base_url}: {_pe!r}")
             return ep_id
         except Exception as e:
             logger.error(f"Failed to auto-register local model endpoint: {e}")
@@ -877,27 +1214,11 @@ def setup_cookbook_routes() -> APIRouter:
             in_venv=sys.prefix != sys.base_prefix,
         )
         is_pip_install = bool(req.cmd and "pip install" in req.cmd)
-        remote = req.remote_host
-        is_windows = req.platform == "windows"
-        local_windows = IS_WINDOWS and not remote
-        if is_windows or local_windows:
-            if req.cmd.startswith("python3 "):
-                req.cmd = "python " + req.cmd[len("python3 "):]
-        if is_pip_install and ("llama-cpp-python" in req.cmd or "llama_cpp" in req.cmd) and (is_windows or local_windows):
-            if "--extra-index-url" not in req.cmd:
-                req.cmd += " --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu"
-
         if is_pip_install:
             # Keep big dependency wheel builds (vLLM, …) off the home filesystem's
             # pip cache so they don't fail mid-build with "No space left" (#1219)
             # and leave the dep installed-but-unusable (#1459).
             req.cmd = _pip_install_no_cache(req.cmd)
-            # Accept common aliases and enforce server extras for llama-cpp so
-            # `python -m llama_cpp.server` has all runtime dependencies.
-            req.cmd = re.sub(r"(?<![A-Za-z0-9_.-])llama_cpp(?![A-Za-z0-9_.-])", "llama-cpp-python[server]", req.cmd)
-            req.cmd = re.sub(r"(?<![A-Za-z0-9_.-])llama-cpp-python(?!\[)", "llama-cpp-python[server]", req.cmd)
-            if "llama-cpp-python" in req.cmd and "--extra-index-url" not in req.cmd:
-                req.cmd += " --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu"
             # PEP-508-style package spec — letters, digits, `.-_` for the
             # name; `[` `]` for extras; `<>=!~,` for version specifiers.
             # v2 review HIGH-14: tightened from the previous regex which
@@ -920,7 +1241,12 @@ def setup_cookbook_routes() -> APIRouter:
         # Otherwise the runner script picks one at runtime and `_auto_register`
         # below still registers the stale 11434 default — which on a host with
         # a systemd ollama lands on the wrong (unreachable-from-docker) service.
-        if "ollama" in req.cmd and "OLLAMA_HOST=" not in req.cmd:
+        # Match "ollama serve" as a phrase (with optional flags after), not
+        # any substring containing "ollama" — otherwise commands like
+        # `docker exec ollama-test ollama-import …` get wrapped as if they
+        # were native `ollama serve`, prepending OLLAMA_HOST=… and then
+        # running the ollama-not-found preflight which exits 127.
+        if re.search(r"\bollama\s+serve\b", req.cmd) and "OLLAMA_HOST=" not in req.cmd:
             _ollama_bind_host = "0.0.0.0" if remote else "127.0.0.1"
             _ollama_chosen_port = _pick_free_port_for_ollama(
                 remote, req.ssh_port, start_port=11434, max_offset=10,
@@ -950,8 +1276,6 @@ def setup_cookbook_routes() -> APIRouter:
             ps_lines = []
             ps_lines.append('$sessionDir = "$env:TEMP\\odysseus-sessions"')
             ps_lines.append('New-Item -ItemType Directory -Force -Path $sessionDir | Out-Null')
-            ps_lines.append('$env:PYTHONIOENCODING = "utf-8"')
-            ps_lines.append('$env:PYTHONUTF8 = "1"')
             if req.hf_token:
                 ps_lines.append(f"$env:HF_TOKEN = '{_ps_squote(req.hf_token)}'")
             if req.gpus:
@@ -970,7 +1294,7 @@ def setup_cookbook_routes() -> APIRouter:
                 ps_lines.append('try { python -c "import llama_cpp" 2>$null } catch {}')
                 ps_lines.append('if ($LASTEXITCODE -ne 0) {')
                 ps_lines.append('  Write-Host "Installing llama-cpp-python..."')
-                ps_lines.append('  python -m pip install llama-cpp-python[server] --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu')
+                ps_lines.append('  python -m pip install llama-cpp-python[server]')
                 ps_lines.append('}')
             elif "vllm" in req.cmd:
                 ps_lines.append('Write-Host "ERROR: vLLM is not supported on Windows. Use Ollama or llama.cpp instead."')
@@ -1045,58 +1369,46 @@ def setup_cookbook_routes() -> APIRouter:
                 # ollama is found (otherwise macOS falls back to a slow source build).
                 # /opt/homebrew = Apple Silicon, /usr/local = Intel; harmless on Linux.
                 runner_lines.append('export PATH="$HOME/.local/bin:$HOME/bin:$HOME/llama.cpp/build/bin:/opt/homebrew/bin:/usr/local/bin:$PATH"')
-                if local_windows:
-                    # LOCAL Windows: no native source compilation (no cmake/compiler on Git Bash).
-                    # Just check python bindings (using native `python` binary) and fall back to pip install.
-                    runner_lines.append('if ! command -v llama-server &>/dev/null && ! python -c "import llama_cpp" 2>/dev/null; then')
-                    runner_lines.append('  echo "llama-server not found — installing Python bindings..."')
-                    runner_lines.append(f"  {_pip_install_fallback_chain('llama-cpp-python[server]', python_cmd='python')} || true")
-                    runner_lines.append('fi')
-                    runner_lines.append('if ! command -v llama-server &>/dev/null && ! python -c "import llama_cpp" 2>/dev/null; then')
-                    runner_lines.append('  echo "ERROR: llama.cpp serving is not available after install attempts."')
-                    runner_lines.append('  ODYSSEUS_PREFLIGHT_EXIT=127')
-                    runner_lines.append('fi')
-                else:
-                    runner_lines.append('if [ -d /data/data/com.termux ]; then')
-                    runner_lines.append('  # Termux: no native build — use the Python bindings (CPU).')
-                    runner_lines.append('  if ! python3 -c "import llama_cpp" 2>/dev/null; then')
-                    runner_lines.append('    pkg install -y cmake 2>/dev/null')
-                    runner_lines.append('    pip install numpy diskcache jinja2 2>/dev/null')
-                    runner_lines.append('    CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_LLAMAFILE=OFF" pip install \'llama-cpp-python[server]\' --no-build-isolation --no-cache-dir 2>&1 || true')
-                    runner_lines.append('  fi')
-                    runner_lines.append('elif ! command -v llama-server &>/dev/null; then')
-                    runner_lines.append('  echo "Native llama-server not found — building from source (one-time, may take a few minutes)..."')
-                    runner_lines.append('  mkdir -p ~/bin')
-                    runner_lines.append('  cd ~ && [ -d llama.cpp ] || git clone --depth 1 https://github.com/ggml-org/llama.cpp')
-                    # Build with the right accelerator: Metal on macOS (llama.cpp
-                    # enables it automatically, no flag), CUDA on Linux when present,
-                    # else a plain CPU build. nproc is Linux-only — fall back to
-                    # `sysctl hw.ncpu` on macOS. (Tip: `brew install llama.cpp` ships
-                    # a prebuilt llama-server and skips this whole source build.)
-                    runner_lines.append('  NPROC="$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)"')
-                    runner_lines.append('  if [ "$(uname -s)" = "Darwin" ]; then')
-                    runner_lines.append('    command -v cmake >/dev/null 2>&1 || echo "WARNING: cmake not found — install it with: brew install cmake (or: brew install llama.cpp for a prebuilt llama-server)."')
-                    # Start from a clean cache: a prior failed configure (e.g. a CUDA
-                    # attempt) poisons build/CMakeCache.txt, so a plain `cmake -B build`
-                    # would reuse the bad settings and fail again. CMAKE_BUILD_TYPE is
-                    # explicit so the binary is optimized (Metal auto-enables on macOS).
-                    runner_lines.append('    cd ~/llama.cpp && rm -rf build && cmake -B build -DCMAKE_BUILD_TYPE=Release \\')
-                    runner_lines.append('      && cmake --build build -j"$NPROC" --target llama-server \\')
-                    runner_lines.append('      && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
-                    runner_lines.append('  else')
-                    _append_llama_cpp_linux_accel_build_lines(runner_lines)
-                    runner_lines.append('  fi')
-                    # If the native build failed, fall back to the Python bindings.
-                    runner_lines.append('  if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
-                    runner_lines.append('    echo "llama-server build failed — installing Python bindings as fallback..."')
-                    runner_lines.append(f"    {_pip_install_fallback_chain('llama-cpp-python[server]', python_cmd='pip')} || true")
-                    runner_lines.append('  fi')
-                    runner_lines.append('  if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
-                    runner_lines.append('    echo "ERROR: llama.cpp serving is not available after install/build attempts."')
-                    runner_lines.append('    ODYSSEUS_PREFLIGHT_EXIT=127')
-                    runner_lines.append('  fi')
-                    runner_lines.append('fi')
-            elif "ollama" in req.cmd:
+                runner_lines.append('if [ -d /data/data/com.termux ]; then')
+                runner_lines.append('  # Termux: no native build — use the Python bindings (CPU).')
+                runner_lines.append('  if ! python3 -c "import llama_cpp" 2>/dev/null; then')
+                runner_lines.append('    pkg install -y cmake 2>/dev/null')
+                runner_lines.append('    pip install numpy diskcache jinja2 2>/dev/null')
+                runner_lines.append('    CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_LLAMAFILE=OFF" pip install \'llama-cpp-python[server]\' --no-build-isolation --no-cache-dir 2>&1 || true')
+                runner_lines.append('  fi')
+                runner_lines.append('elif ! command -v llama-server &>/dev/null; then')
+                runner_lines.append('  echo "Native llama-server not found — building from source (one-time, may take a few minutes)..."')
+                runner_lines.append('  mkdir -p ~/bin')
+                runner_lines.append('  cd ~ && [ -d llama.cpp ] || git clone --depth 1 https://github.com/ggml-org/llama.cpp')
+                # Build with the right accelerator: Metal on macOS (llama.cpp
+                # enables it automatically, no flag), CUDA on Linux when present,
+                # else a plain CPU build. nproc is Linux-only — fall back to
+                # `sysctl hw.ncpu` on macOS. (Tip: `brew install llama.cpp` ships
+                # a prebuilt llama-server and skips this whole source build.)
+                runner_lines.append('  NPROC="$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)"')
+                runner_lines.append('  if [ "$(uname -s)" = "Darwin" ]; then')
+                runner_lines.append('    command -v cmake >/dev/null 2>&1 || echo "WARNING: cmake not found — install it with: brew install cmake (or: brew install llama.cpp for a prebuilt llama-server)."')
+                # Start from a clean cache: a prior failed configure (e.g. a CUDA
+                # attempt) poisons build/CMakeCache.txt, so a plain `cmake -B build`
+                # would reuse the bad settings and fail again. CMAKE_BUILD_TYPE is
+                # explicit so the binary is optimized (Metal auto-enables on macOS).
+                runner_lines.append('    cd ~/llama.cpp && rm -rf build && cmake -B build -DCMAKE_BUILD_TYPE=Release \\')
+                runner_lines.append('      && cmake --build build -j"$NPROC" --target llama-server \\')
+                runner_lines.append('      && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
+                runner_lines.append('  else')
+                _append_llama_cpp_linux_accel_build_lines(runner_lines)
+                runner_lines.append('  fi')
+                runner_lines.append('  # If the native build failed, fall back to the Python bindings.')
+                runner_lines.append('  if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
+                runner_lines.append('    echo "llama-server build failed — installing Python bindings as fallback..."')
+                runner_lines.append(f"    {_pip_install_fallback_chain('llama-cpp-python[server]', python_cmd='pip')} || true")
+                runner_lines.append('  fi')
+                runner_lines.append('  if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
+                runner_lines.append('    echo "ERROR: llama.cpp serving is not available after install/build attempts."')
+                runner_lines.append('    ODYSSEUS_PREFLIGHT_EXIT=127')
+                runner_lines.append('  fi')
+                runner_lines.append('fi')
+            elif re.search(r"\bollama\s+serve\b", req.cmd):
                 handled_ollama_serve = True
                 _ollama_default_host = "0.0.0.0" if remote else "127.0.0.1"
                 _ollama_host, _ollama_port = _ollama_bind_from_cmd(
@@ -1117,23 +1429,13 @@ def setup_cookbook_routes() -> APIRouter:
                 runner_lines.append('    ODYSSEUS_OLLAMA_PORT="$_ody_try_port"')
                 runner_lines.append('    break')
                 runner_lines.append('  fi')
-                runner_lines.append('  echo "[odysseus] Ollama API ready on port ${ODYSSEUS_OLLAMA_PORT}: ${ODYSSEUS_OLLAMA_URL}"')
-                runner_lines.append('  echo "[odysseus] This task is monitoring an existing Ollama server; stopping it here will not stop an external Docker/system service."')
-                if local_windows:
-                    # Windows detached process has no TTY; exec bash -i crashes.
-                    # Keep the monitoring task alive with a sleep loop.
-                    runner_lines.append('  while true; do sleep 60; done')
-                else:
-                    runner_lines.append('  exec bash -i')
-                runner_lines.append('fi')
+                runner_lines.append('  exec 3<&-; exec 3>&-')
+                runner_lines.append('done')
                 runner_lines.append('if ! command -v ollama &>/dev/null; then')
                 runner_lines.append('  echo "ERROR: Ollama not found on this server. Install it from https://ollama.com/download or `curl -fsSL https://ollama.com/install.sh | sh`."')
                 runner_lines.append('  echo')
                 runner_lines.append('  echo "=== Process exited with code 127 ==="')
-                if local_windows:
-                    runner_lines.append('  exit 127')
-                else:
-                    runner_lines.append('  exec bash -i')
+                runner_lines.append('  exec bash -i')
                 runner_lines.append('fi')
                 runner_lines.append('ODYSSEUS_OLLAMA_URL="http://${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}"')
                 if remote and _ollama_host in ("0.0.0.0", "::"):
@@ -1141,20 +1443,24 @@ def setup_cookbook_routes() -> APIRouter:
                     runner_lines.append('echo "[odysseus] Ollama has no built-in authentication; expose this only on a trusted LAN/VPN or provide an explicit OLLAMA_HOST with your own access controls."')
                 runner_lines.append('echo "Starting ollama server on ${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}..."')
                 runner_lines.append('OLLAMA_HOST="${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}" ollama serve')
-                if local_windows:
-                    _append_serve_exit_code_lines(runner_lines, keep_shell_open=False)
-                else:
-                    runner_lines.append('_ody_exit=$?')
-                    runner_lines.append('echo')
-                    runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="')
-                    runner_lines.append('exec bash -i')
+                runner_lines.append('_ody_exit=$?')
+                runner_lines.append('echo')
+                runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="')
+                runner_lines.append('exec bash -i')
             elif "vllm serve" in req.cmd:
                 # vLLM is CUDA/ROCm-only and does not run on macOS at all.
                 runner_lines.append('if [ "$(uname -s)" = "Darwin" ]; then')
                 runner_lines.append('  echo "ERROR: vLLM does not run on macOS. Use Ollama or llama.cpp (Metal) instead."')
                 runner_lines.append('  ODYSSEUS_PREFLIGHT_EXIT=1')
                 runner_lines.append('fi')
-                _append_vllm_linux_preflight_lines(runner_lines)
+                # Put ~/.local/bin on PATH first — without a venv, vllm installs
+                # there via --user and the non-login serve shell otherwise can't
+                # find the `vllm` CLI ("command not found"). Mirrors llama.cpp above.
+                runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
+                runner_lines.append('if ! command -v vllm &>/dev/null; then')
+                runner_lines.append('  echo "ERROR: vLLM is not installed."')
+                runner_lines.append('  ODYSSEUS_PREFLIGHT_EXIT=127')
+                runner_lines.append('fi')
             elif "sglang.launch_server" in req.cmd:
                 runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
                 runner_lines.append('if ! command -v sglang &>/dev/null; then')
@@ -1173,15 +1479,30 @@ def setup_cookbook_routes() -> APIRouter:
                 runner_lines.append('  ODYSSEUS_PREFLIGHT_EXIT=127')
                 runner_lines.append('fi')
 
-            if not handled_ollama_serve:
+            handled_ollama_sidecar_probe = False
+            if (not handled_ollama_serve
+                and re.search(r"\bdocker\s+exec\s+(?:ollama-rocm|ollama-test)\s+ollama\s+show\b", req.cmd or "")):
+                handled_ollama_sidecar_probe = True
                 _append_serve_preflight_exit_lines(
                     runner_lines,
                     keep_shell_open=not local_windows,
                 )
-                if is_pip_install:
-                    _append_pip_install_runner_lines(runner_lines, req.cmd)
-                else:
-                    runner_lines.append(req.cmd)
+                runner_lines.append(req.cmd)
+                runner_lines.append('_ody_exit=$?')
+                runner_lines.append('echo')
+                runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="')
+                runner_lines.append('if [ "$_ody_exit" -eq 0 ]; then')
+                runner_lines.append('  echo "[odysseus] Ollama sidecar model is available; keeping Cookbook task attached to the persistent Ollama daemon."')
+                runner_lines.append('  while true; do sleep 3600; done')
+                runner_lines.append('fi')
+                runner_lines.append('exec bash -i')
+
+            if not handled_ollama_serve and not handled_ollama_sidecar_probe:
+                _append_serve_preflight_exit_lines(
+                    runner_lines,
+                    keep_shell_open=not local_windows,
+                )
+                runner_lines.append(req.cmd)
                 if local_windows:
                     # Detached background process — no interactive shell to keep open.
                     # Print the exit marker the status poller looks for, then stop.
@@ -1263,6 +1584,26 @@ def setup_cookbook_routes() -> APIRouter:
         elif not is_pip_install:
             endpoint_id = _auto_register_llm_endpoint(req, remote)
 
+        # Crash watchdog: the auto-register above writes the endpoint row
+        # IMMEDIATELY (before the server has even bound its port) so the
+        # picker shows the model as it warms up. When the serve process
+        # crashes right at startup (missing module, bad cmd, port collision,
+        # ModuleNotFoundError on llama_cpp, etc.), the endpoint is left
+        # dangling — every subsequent chat returns 503 or an empty response.
+        # Schedule a background task to read the tmux output for the
+        # "=== Process exited with code N ===" marker the runner emits;
+        # if N != 0 within the watch window, delete the endpoint we just
+        # created. Skipped for diffusion (different image-endpoint cleanup
+        # path) and pip-install tasks (no endpoint to drop).
+        if endpoint_id and not is_diffusion and not is_pip_install:
+            asyncio.create_task(_serve_crash_watchdog(
+                endpoint_id=endpoint_id,
+                session_id=session_id,
+                remote=remote,
+                ssh_port=req.ssh_port,
+                is_windows=is_windows,
+            ))
+
         # Log to assistant
         try:
             from src.assistant_log import log_to_assistant
@@ -1342,8 +1683,8 @@ def setup_cookbook_routes() -> APIRouter:
             cmd = f"ssh {pf}{host} '{setup_script}'"
         else:
             # Linux: auto-install tmux (via whichever package manager is available)
-            # and huggingface_hub + hf_transfer (falling back to --user, then
-            # guarded --break-system-packages on PEP-668 locked distros).
+            # and huggingface_hub + hf_transfer (falling back to --user/--break-system-packages
+            # on PEP-668 locked distros like Arch / newer Debian).
             setup_script = (
                 # Install tmux if missing — try common package managers; skip if no sudo
                 "if ! command -v tmux >/dev/null 2>&1; then "
@@ -1355,15 +1696,10 @@ def setup_cookbook_routes() -> APIRouter:
                 "  fi; "
                 "fi; "
                 "command -v tmux >/dev/null 2>&1 || echo 'WARNING: tmux missing and auto-install failed (need passwordless sudo). Install manually.'; "
-                # Install Python bits. Try system install first; fall back to --user,
-                # then use --break-system-packages only when pip supports it.
+                # Install Python bits. Try system install first; fall back to --user --break-system-packages on PEP 668 systems.
                 "pip install -q huggingface_hub hf_transfer 2>/dev/null || "
-                "pip install --user -q huggingface_hub hf_transfer 2>/dev/null || "
-                "( pip install --help 2>/dev/null | grep -q -- --break-system-packages && "
-                "pip install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null ) || "
-                "pip3 install --user -q huggingface_hub hf_transfer 2>/dev/null || "
-                "( pip3 install --help 2>/dev/null | grep -q -- --break-system-packages && "
-                "pip3 install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null ); "
+                "pip install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null || "
+                "pip3 install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null; "
                 "python3 -c 'from huggingface_hub import snapshot_download; print(\"OK\")'"
             )
             cmd = f"ssh {pf}{host} '{setup_script}'"
@@ -1386,38 +1722,11 @@ def setup_cookbook_routes() -> APIRouter:
     async def _run_nvidia_smi(query: str, host: str | None, ssh_port: str | None, timeout: int = 8):
         """Run nvidia-smi locally or over SSH. Returns (stdout, error_or_None)."""
         if host:
-            candidates = [query]
-            stripped = query.strip()
-            if stripped.startswith("nvidia-smi "):
-                args = stripped[len("nvidia-smi "):]
-                candidates.append(
-                    "bash -lc "
-                    + shlex.quote(
-                        f"{SSH_PATH_OVERRIDE}"
-                        f"nvidia-smi {args}"
-                    )
-                )
-                for nvidia_path in NVIDIA_PATH_CANDIDATES:
-                    candidates.append(f"{nvidia_path} {args}")
-
-            last_err = "nvidia-smi failed"
-            for candidate in candidates:
-                try:
-                    rc, stdout, stderr = await run_ssh_command_async(
-                        host,
-                        ssh_port,
-                        candidate,
-                        connect_timeout=5,
-                        timeout=timeout,
-                    )
-                except asyncio.TimeoutError:
-                    return None, "nvidia-smi timed out"
-                if rc == 0:
-                    return stdout.decode("utf-8", errors="replace"), None
-                err = (stderr.decode("utf-8", errors="replace") or "").strip()[:200]
-                if err:
-                    last_err = err
-            return None, last_err
+            pf = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else ""
+            cmd = f"ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no {pf}{host} '{query}'"
+            proc = await asyncio.create_subprocess_shell(
+                cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+            )
         else:
             proc = await asyncio.create_subprocess_exec(
                 *shlex.split(query),
@@ -1996,30 +2305,58 @@ def setup_cookbook_routes() -> APIRouter:
 
         return {"models": out}
 
-    # Rate-limit for the orphan-tmux adoption sweep. The UI polls
-    # tasks/status every ~3s; we don't want to SSH every host on every
-    # poll. 20s is fast enough that a model the agent launched in the
-    # background shows up "almost immediately" in the UI without being
-    # wasteful.
+    # Rate-limit for the orphan-tmux adoption sweep. 60s interval so SSH
+    # work is genuinely sparse even on an actively-polled cookbook page.
     _last_orphan_sweep_ts = [0.0]
-    _ORPHAN_SWEEP_MIN_INTERVAL_S = 20.0
+    _ORPHAN_SWEEP_MIN_INTERVAL_S = 60.0
+    # Concurrency guard so two requests racing don't both spawn a sweep.
+    _orphan_sweep_inflight = [False]
 
     def _maybe_sweep_orphans(tasks: list, state: dict) -> None:
         """Scan each configured cookbook server for `serve-*` tmux sessions
         the cookbook doesn't know about and adopt them into state.tasks.
 
-        Writes are conditional: if no orphans are found, nothing is touched.
-        Rate-limited so polling UIs don't trigger SSH on every refresh.
+        Heavy SSH work runs in a daemon thread (threading.Thread) so it never
+        blocks the request that triggered it; the call is a no-op while a
+        sweep is in flight or inside the 60s rate limit. Was previously
+        disabled because the sync implementation pegged uvicorn CPU during
+        active cookbook polling — re-enabled now with the work off-thread.
         """
         import time as _time
-        import subprocess
-        logger.info(f"_maybe_sweep_orphans: entered, last_ts={_last_orphan_sweep_ts[0]}")
         now = _time.monotonic()
+        if _orphan_sweep_inflight[0]:
+            return
         if now - _last_orphan_sweep_ts[0] < _ORPHAN_SWEEP_MIN_INTERVAL_S:
-            logger.info(f"_maybe_sweep_orphans: rate-limited, {now - _last_orphan_sweep_ts[0]:.1f}s since last")
             return
         _last_orphan_sweep_ts[0] = now
+        _orphan_sweep_inflight[0] = True
+        # Shallow-copy the task list for the worker; `state` is handed over as-is (not copied).
+        try:
+            tasks_snap = list(tasks or [])
+        except Exception:
+            tasks_snap = []
+        state_snap = state if isinstance(state, dict) else {}
 
+        # Caller is _cookbook_tasks_status_sync (sync context, no event
+        # loop). Use a plain background thread — no asyncio needed.
+        import threading
+        def _run_sweep() -> None:
+            try:
+                _sync_sweep_orphans(tasks_snap, state_snap)
+            except Exception as _e:
+                logger.warning(f"orphan sweep thread failed: {_e!r}")
+            finally:
+                _orphan_sweep_inflight[0] = False
+        try:
+            threading.Thread(target=_run_sweep, daemon=True, name="orphan-sweep").start()
+        except Exception as _e:
+            logger.warning(f"orphan sweep thread spawn failed: {_e!r}")
+            _orphan_sweep_inflight[0] = False
+        return
+
+    def _sync_sweep_orphans(tasks: list, state: dict) -> None:
+        """The actual sync sweep — never call this on the event loop."""
+        import subprocess
         env = state.get("env") if isinstance(state, dict) else {}
         servers = env.get("servers") if isinstance(env, dict) else []
         logger.info(f"orphan sweep starting: {len(servers) if isinstance(servers, list) else 0} server(s), known_sids={len([t for t in tasks if isinstance(t, dict) and t.get('sessionId')])}")
@@ -2143,6 +2480,121 @@ def setup_cookbook_routes() -> APIRouter:
             except Exception as e:
                 logger.warning(f"orphan sweep: state write failed: {e}")
 
+    # In-memory cache for the Ollama library scrape. ollama.com is a public
+    # site, but it doesn't expose a stable JSON listing — we fetch the HTML
+    # search page and regex out the model cards. Cached for 1 h so a busy
+    # cookbook view doesn't hammer the site on every render.
+    _ollama_library_cache: dict = {"models": [], "fetched_at": 0.0, "error": None}
+
+    _OLLAMA_FALLBACK_LIBRARY = [
+        {"name": "qwen2.5", "description": "Qwen2.5 series — strong general/coding model from Alibaba.", "sizes": ["0.5b", "1.5b", "3b", "7b", "14b", "32b", "72b"]},
+        {"name": "qwen2.5-coder", "description": "Code-specialized Qwen2.5 family.", "sizes": ["0.5b", "1.5b", "3b", "7b", "14b", "32b"]},
+        {"name": "qwen3", "description": "Qwen3 — newer Alibaba family with hybrid reasoning.", "sizes": ["0.6b", "1.7b", "4b", "8b", "14b", "32b"]},
+        {"name": "llama3.2", "description": "Meta Llama 3.2 instruct (and tiny / vision variants).", "sizes": ["1b", "3b", "11b", "90b"]},
+        {"name": "llama3.1", "description": "Meta Llama 3.1 instruct.", "sizes": ["8b", "70b", "405b"]},
+        {"name": "llama3.3", "description": "Meta Llama 3.3 70B instruct.", "sizes": ["70b"]},
+        {"name": "gemma3", "description": "Google Gemma 3 — multimodal capable open-weights.", "sizes": ["1b", "4b", "12b", "27b"]},
+        {"name": "gemma2", "description": "Google Gemma 2 instruct.", "sizes": ["2b", "9b", "27b"]},
+        {"name": "mistral", "description": "Mistral 7B instruct — small, fast generalist.", "sizes": ["7b"]},
+        {"name": "mistral-nemo", "description": "Mistral NeMo 12B instruct.", "sizes": ["12b"]},
+        {"name": "mistral-small", "description": "Mistral Small 22B / 24B instruct.", "sizes": ["22b", "24b"]},
+        {"name": "mixtral", "description": "Mistral MoE 8x7B / 8x22B.", "sizes": ["8x7b", "8x22b"]},
+        {"name": "phi3", "description": "Microsoft Phi-3 small / medium.", "sizes": ["mini", "medium"]},
+        {"name": "phi4", "description": "Microsoft Phi-4 14B.", "sizes": ["14b"]},
+        {"name": "deepseek-r1", "description": "DeepSeek R1 reasoning model (distilled variants).", "sizes": ["1.5b", "7b", "8b", "14b", "32b", "70b"]},
+        {"name": "deepseek-v3", "description": "DeepSeek V3 MoE 671B (huge — needs serious VRAM).", "sizes": ["671b"]},
+        {"name": "codellama", "description": "Meta Code Llama instruct family.", "sizes": ["7b", "13b", "34b", "70b"]},
+        {"name": "starcoder2", "description": "BigCode StarCoder2 — code completion.", "sizes": ["3b", "7b", "15b"]},
+        {"name": "deepseek-coder-v2", "description": "DeepSeek Coder V2 — code MoE.", "sizes": ["16b", "236b"]},
+        {"name": "nomic-embed-text", "description": "Embedding model — text vector encoder.", "sizes": ["latest"]},
+        {"name": "mxbai-embed-large", "description": "Embedding model — Mixedbread large.", "sizes": ["latest"]},
+        {"name": "llava", "description": "LLaVA multimodal vision-language model.", "sizes": ["7b", "13b", "34b"]},
+        {"name": "minicpm-v", "description": "MiniCPM-V multimodal.", "sizes": ["8b"]},
+        {"name": "command-r", "description": "Cohere Command R — RAG-oriented.", "sizes": ["35b"]},
+        {"name": "command-r-plus", "description": "Cohere Command R+ — larger RAG model.", "sizes": ["104b"]},
+        {"name": "qwq", "description": "Qwen QwQ reasoning preview.", "sizes": ["32b"]},
+        {"name": "smollm2", "description": "HuggingFaceTB SmolLM2 — tiny capable models.", "sizes": ["135m", "360m", "1.7b"]},
+        {"name": "granite3.1-dense", "description": "IBM Granite 3.1 dense instruct.", "sizes": ["2b", "8b"]},
+        {"name": "nemotron", "description": "NVIDIA Nemotron 70B.", "sizes": ["70b"]},
+        {"name": "olmo2", "description": "AI2 OLMo 2 open-weights.", "sizes": ["7b", "13b"]},
+    ]
+
+    @router.get("/api/cookbook/ollama/library")
+    async def ollama_library(refresh: int = 0, request: Request = None, owner: str = Depends(require_user)):
+        """List Ollama library models for the Browse picker.
+
+        Scrapes ollama.com's popular-search page (result cached in memory for
+        one hour) and merges in the curated fallback list, so the picker
+        always renders something even when the fetch or the parse fails.
+
+        ``refresh`` (any non-zero value) bypasses the cache. Returns a dict
+        with ``models`` (name / description / sizes entries), ``fetched_at``
+        (epoch seconds of the last attempt) and ``error`` (``None`` when the
+        live scrape produced results, otherwise a short reason string)."""
+        import time as _time
+        import httpx as _httpx
+        TTL = 3600.0
+        now = _time.time()
+        if refresh or (now - _ollama_library_cache["fetched_at"]) > TTL or not _ollama_library_cache["models"]:
+            models: list[dict] = []
+            err = None
+            try:
+                async with _httpx.AsyncClient(timeout=8, follow_redirects=True) as client:
+                    resp = await client.get(
+                        "https://ollama.com/search?sort=popular",
+                        headers={"User-Agent": "odysseus-cookbook/1.0"},
+                    )
+                if resp.status_code == 200:
+                    html = resp.text
+                    # ollama.com renders each model card as a single anchor:
+                    #   <a href="/library/<name>" class="group w-full"> … </a>
+                    # The description + sizes live inside that anchor. Pull
+                    # the whole block then extract pieces individually.
+                    block_re = re.compile(
+                        r'<a[^>]*href="/library/([A-Za-z0-9._-]+)"[^>]*>(.*?)</a>',
+                        re.DOTALL,
+                    )
+                    desc_re = re.compile(r'<p[^>]*>([^<]{4,400})</p>', re.DOTALL)
+                    # Size tags on ollama.com cards look like "0.5b", "14b",
+                    # "8x7b", "27b". Pulled from short <span>-wrapped chips.
+                    size_re = re.compile(r'>\s*(\d+(?:\.\d+)?(?:x\d+)?[bBmM])\s*<')
+                    seen: set[str] = set()
+                    for bm in block_re.finditer(html):
+                        name = bm.group(1).strip()
+                        if name in seen:
+                            continue
+                        seen.add(name)
+                        body = bm.group(2)
+                        dm = desc_re.search(body)
+                        desc = (dm.group(1).strip() if dm else "").replace("\n", " ")
+                        # Lower-case the size chips and dedup them, keeping
+                        # first-seen order (dict keys preserve insertion order).
+                        sizes = list(dict.fromkeys(s.lower() for s in size_re.findall(body)))
+                        models.append({"name": name, "description": desc, "sizes": sizes})
+                        if len(models) >= 80:
+                            break
+                else:
+                    err = f"HTTP {resp.status_code}"
+            except Exception as e:
+                err = str(e)[:160]
+            # Flag an empty scrape BEFORE merging the fallback; after the merge
+            # `models` is never empty, so the check could not fire there.
+            if not models and err is None:
+                err = "parsed 0 results — using fallback list"
+            # Merge curated fallback so classics (qwen2.5, llama3, deepseek-r1,
+            # …) stay reachable even when ollama.com's front page is dominated
+            # by brand-new releases the user might not be looking for.
+            live_names = {m["name"] for m in models}
+            models.extend(fb for fb in _OLLAMA_FALLBACK_LIBRARY if fb["name"] not in live_names)
+            _ollama_library_cache["models"] = models
+            _ollama_library_cache["fetched_at"] = now
+            _ollama_library_cache["error"] = err
+        return {
+            "models": _ollama_library_cache["models"],
+            "fetched_at": _ollama_library_cache["fetched_at"],
+            "error": _ollama_library_cache["error"],
+        }
+
     @router.get("/api/cookbook/tasks/status")
     async def cookbook_tasks_status(request: Request):
         """Check status of all active cookbook tmux sessions.
@@ -2180,13 +2632,39 @@ def setup_cookbook_routes() -> APIRouter:
                 "inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));"
                 "sys.exit(0 if ok and not inc else 1)"
             )
-            if remote_host:
-                cmd = ["python3", "-c", py, repo_id]
-            else:
-                # Local Windows: python3 can hit the Microsoft Store stub. Use the
-                # real Python Odysseus is running under (guaranteed to exist).
-                import sys as _sys_local
-                cmd = [_sys_local.executable, "-c", py, repo_id]
+            cmd = ["python3", "-c", py, repo_id]
+            try:
+                if remote_host:
+                    ssh_base = ["ssh"]
+                    if ssh_port and ssh_port != "22":
+                        ssh_base.extend(["-p", str(ssh_port)])
+                    shell_cmd = " ".join(shlex.quote(x) for x in cmd)
+                    proc = subprocess.run(ssh_base + [remote_host, shell_cmd], timeout=12, capture_output=True)
+                else:
+                    proc = subprocess.run(cmd, timeout=12, capture_output=True)
+                return proc.returncode == 0
+            except Exception:
+                return False
+
+        def _download_cache_incomplete(repo_id: str, remote_host: str = "", ssh_port: str = "") -> bool:
+            """Best-effort check for resumable HF partial blobs.
+
+            A lost SSH/tmux session can leave a real download still incomplete.
+            Treat any *.incomplete blob as stronger evidence than stale
+            "100%" lines in the captured pane output.
+            """
+            if not repo_id or "/" not in repo_id:
+                return False
+            py = (
+                "import os,sys;"
+                "repo=sys.argv[1];"
+                "base=os.environ.get('HUGGINGFACE_HUB_CACHE') or os.path.join(os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface')), 'hub');"
+                "d=os.path.join(base,'models--'+repo.replace('/','--'));"
+                "blobs=os.path.join(d,'blobs');"
+                "inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));"
+                "sys.exit(0 if inc else 1)"
+            )
+            cmd = ["python3", "-c", py, repo_id]
             try:
                 if remote_host:
                     ssh_base = ["ssh"]
@@ -2333,28 +2811,43 @@ def setup_cookbook_routes() -> APIRouter:
                 except Exception:
                     pass
             else:
-                try:
-                    alive = subprocess.run(check_cmd, timeout=10, capture_output=True)
-                    is_alive = alive.returncode == 0
-                except Exception:
+                # Skip the live SSH check entirely for tasks already in a
+                # terminal state — they won't change, and 10s timeouts
+                # stacked per task were the dominant cost of this whole
+                # status endpoint (3+ minute stalls with ~8 accumulated
+                # stopped tasks). The agent's `list_served_models` call
+                # was blocking the chat stream every time.
+                _task_status = (task.get("status") or "").lower()
+                if _task_status in {"stopped", "done", "completed",
+                                    "crashed", "error", "failed",
+                                    "ended", "killed"}:
                     is_alive = False
-
-                # Capture last lines for progress. Prefer the "Downloading" line
-                # (real aggregate bytes) over "Fetching N files" (whole-file count that
-                # lags with hf_transfer). Falls back to the true last line otherwise.
-                if is_alive:
+                    # Keep the persisted output_tail for the UI — it's
+                    # what the agent uses to diagnose past failures.
+                    full_snapshot = (task.get("output") or "")[-12000:]
+                else:
                     try:
-                        cap = subprocess.run(capture_cmd, timeout=10, capture_output=True, text=True)
-                        if cap.returncode == 0:
-                            full_snapshot = cap.stdout.strip()
-                            lines = [l.strip() for l in full_snapshot.split('\n') if l.strip()]
-                            downloading_lines = [l for l in lines if l.startswith("Downloading")]
-                            if downloading_lines:
-                                progress_text = downloading_lines[-1]
-                            elif lines:
-                                progress_text = lines[-1]
+                        alive = subprocess.run(check_cmd, timeout=4, capture_output=True)
+                        is_alive = alive.returncode == 0
                     except Exception:
-                        pass
+                        is_alive = False
+
+                    # Capture last lines for progress. Prefer the "Downloading" line
+                    # (real aggregate bytes) over "Fetching N files" (whole-file count that
+                    # lags with hf_transfer). Falls back to the true last line otherwise.
+                    if is_alive:
+                        try:
+                            cap = subprocess.run(capture_cmd, timeout=4, capture_output=True, text=True)
+                            if cap.returncode == 0:
+                                full_snapshot = cap.stdout.strip()
+                                lines = [l.strip() for l in full_snapshot.split('\n') if l.strip()]
+                                downloading_lines = [l for l in lines if l.startswith("Downloading")]
+                                if downloading_lines:
+                                    progress_text = downloading_lines[-1]
+                                elif lines:
+                                    progress_text = lines[-1]
+                        except Exception:
+                            pass
 
             # Determine status. For the local-Windows detached model the log file
             # persists after the process exits, so a finished download still has a
@@ -2362,6 +2855,16 @@ def setup_cookbook_routes() -> APIRouter:
             # when the PID is gone instead of blindly reporting "stopped".
             download_zero_files = False
             status = "unknown"
+            download_has_ok = task_type == "download" and "DOWNLOAD_OK" in full_snapshot
+            download_has_failed = task_type == "download" and "DOWNLOAD_FAILED" in full_snapshot
+            download_has_incomplete_evidence = (
+                task_type == "download"
+                and (
+                    ".incomplete" in full_snapshot
+                    or bool(re.search(r'model-\d+-of-\d+\.[A-Za-z0-9_.-]+:\s+(?:[0-9]|[1-8][0-9])%', full_snapshot))
+                    or _download_cache_incomplete(_payload.get("repo_id") or model, remote, str(_tport or ""))
+                )
+            )
             if is_alive or (local_win_task and full_snapshot):
                 lower = full_snapshot.lower()
                 exit_match = re.search(r"=== process exited with code\s+(-?\d+)", full_snapshot, re.I)
@@ -2374,20 +2877,24 @@ def setup_cookbook_routes() -> APIRouter:
                 elif has_exit and task_type == "download":
                     # Dependency installs are tracked as download tasks but only
                     # emit the generic runner exit marker, not HF download markers.
-                    status = "completed" if exit_code == 0 else "error"
+                    if download_has_incomplete_evidence and not download_has_ok:
+                        status = "running" if is_alive else "stopped"
+                    else:
+                        status = "completed" if exit_code == 0 else "error"
                 elif has_exit and "unrecognized arguments" in lower:
                     status = "error"
                 elif has_error and not ("application startup complete" in lower):
                     status = "error"
-                elif task_type == "download" and ("100%" in full_snapshot or "DOWNLOAD_OK" in full_snapshot):
-                    # Only download tasks treat 100% as "completed".
-                    # Serve tasks log 100%|██████| during inference progress
-                    # (diffusion sampling, etc.) — that's "running", not done.
+                elif task_type == "download" and download_has_ok:
                     if re.search(r"Fetching\s+0\s+files", full_snapshot, re.IGNORECASE):
                         status = "error"
                         download_zero_files = True
                     else:
                         status = "completed"
+                elif task_type == "download" and download_has_failed:
+                    status = "error"
+                elif task_type == "download" and download_has_incomplete_evidence:
+                    status = "running" if is_alive else "stopped"
                 elif "application startup complete" in lower:
                     status = "ready"
                 elif not is_alive:
@@ -2397,7 +2904,11 @@ def setup_cookbook_routes() -> APIRouter:
                     status = "running"
             else:
                 # Session is dead — check if it completed or crashed
-                if task_type == "download" and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or "")):
+                if (
+                    task_type == "download"
+                    and not download_has_incomplete_evidence
+                    and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or ""))
+                ):
                     status = "completed"
                     if not progress_text:
                         progress_text = "Download complete"
@@ -2407,12 +2918,12 @@ def setup_cookbook_routes() -> APIRouter:
                     status = "stopped"
 
             # Parse structured phase info — single source of truth for the UI
-            phase_info = _parse_serve_phase(full_snapshot, task_type) if (task_type == "serve" and status == "running" and full_snapshot) else {}
+            phase_info = _parse_serve_phase(full_snapshot, task_type) if (task_type == "serve" and full_snapshot) else {}
             if phase_info.get("status") == "ready":
                 status = "ready"
             serve_phase = phase_info.get("phase", "")
             diagnosis = _diagnose_serve_output(full_snapshot) if task_type == "serve" and full_snapshot else None
-            if diagnosis and status in {"running", "unknown", "stopped"}:
+            if diagnosis and status in {"running", "unknown", "stopped"} and phase_info.get("status") != "ready":
                 status = "error"
             if download_zero_files:
                 diagnosis = {"message": "No matching files were downloaded. The model repo or filename/quant pattern may be wrong (for example a ':Q4_K_M' tag that does not exist in the repo). Check the repo and the include/quant pattern."}
diff --git a/routes/hwfit_routes.py b/routes/hwfit_routes.py
index a7af18b04..eb408ac9d 100644
--- a/routes/hwfit_routes.py
+++ b/routes/hwfit_routes.py
@@ -196,7 +196,24 @@ def setup_hwfit_routes():
         if target_context is not None:
             target_context = max(1024, min(target_context, 1000000))
 
-        results = rank_models(system, use_case=use_case or None, limit=limit, search=search or None, sort=sort, quant=quant or None, target_context=target_context, fit_only=fit_only)
+        rank_kwargs = {
+            "use_case": use_case or None,
+            "limit": limit,
+            "search": search or None,
+            "sort": sort,
+            "quant": quant or None,
+            "fit_only": fit_only,
+        }
+        if target_context is not None:
+            rank_kwargs["target_context"] = target_context
+        try:
+            import inspect
+            supported = set(inspect.signature(rank_models).parameters)
+            rank_kwargs = {k: v for k, v in rank_kwargs.items() if k in supported}
+        except Exception:
+            rank_kwargs.pop("target_context", None)
+            rank_kwargs.pop("fit_only", None)
+        results = rank_models(system, **rank_kwargs)
         return {"system": system, "models": results}
 
     @router.get("/profiles")
diff --git a/routes/model_routes.py b/routes/model_routes.py
index 995705d75..6b76dc71f 100644
--- a/routes/model_routes.py
+++ b/routes/model_routes.py
@@ -5,7 +5,6 @@ import re
 import uuid
 import json
 import socket
-import hashlib
 import time as _time
 import logging
 import httpx
@@ -283,11 +282,8 @@ _HOST_TO_CURATED = (
     ("fireworks.ai", "fireworks"),
     ("googleapis.com", "google"),
     ("x.ai", "xai"),
-
     ("openrouter.ai", "openrouter"),
     ("ollama.com", "ollama"),
-    ("opencode.ai/zen/go", "opencode-go"),
-    ("opencode.ai/zen", "opencode-zen"),
 )
 
 
@@ -494,8 +490,6 @@ _NON_CHAT_EXACT_PREFIXES = (
 def _is_chat_model(model_id: str) -> bool:
     """Return True if the model ID looks like a chat/completions-capable model."""
     mid = model_id.lower()
-    if mid in {"gpt-5.1-codex"}:
-        return True
     for prefix in _NON_CHAT_PREFIXES:
         if mid.startswith(prefix):
             return False
@@ -508,67 +502,9 @@ def _is_chat_model(model_id: str) -> bool:
     return True
 
 
-def _delete_orphaned_provider_auth(db, auth_id: Optional[str], exclude_ep_id: Optional[str] = None) -> bool:
-    """Delete a ProviderAuthSession once no endpoint still references it.
-
-    Subscription providers (e.g. ChatGPT Subscription) keep their refresh token
-    in ProviderAuthSession rather than ModelEndpoint.api_key. When the last
-    endpoint backed by that auth row is removed, the stored credentials should
-    be cleared instead of lingering. Returns True if a row was deleted.
-    ``exclude_ep_id`` drops the endpoint currently being deleted from the
-    reference count so it does not keep its own auth alive.
-    """
-    if not auth_id:
-        return False
-    from core.database import ProviderAuthSession
-    still_referenced = db.query(ModelEndpoint.id).filter(
-        ModelEndpoint.provider_auth_id == auth_id,
-        ModelEndpoint.id != exclude_ep_id,
-    ).first()
-    if still_referenced is not None:
-        return False
-    auth_row = db.query(ProviderAuthSession).filter(ProviderAuthSession.id == auth_id).first()
-    if auth_row is None:
-        return False
-    db.delete(auth_row)
-    return True
-
-
-def _is_discovery_only_provider(provider: str) -> bool:
-    """Provider that only supports model discovery, not live probing.
-
-    ChatGPT Subscription speaks the Responses/Codex API and has no
-    chat-completions or general health endpoint, so completion probes and
-    reachability pings are skipped — status is derived from cached models.
-    """
-    return provider == "chatgpt-subscription"
-
-
-def _resolve_probe_key(ep) -> Optional[str]:
-    """API key/bearer to probe an endpoint with.
-
-    Delegates to ``resolve_endpoint_runtime``, which already returns the static
-    ``ModelEndpoint.api_key`` for keyed endpoints and resolves (and refreshes)
-    the runtime bearer for session-backed providers (e.g. ChatGPT Subscription).
-    Returns None if resolution fails (e.g. re-auth required) so probing skips
-    rather than raising. Reads only already-loaded scalar attributes of ``ep``.
-    """
-    try:
-        from src.endpoint_resolver import resolve_endpoint_runtime
-        _base, key = resolve_endpoint_runtime(ep, owner=getattr(ep, "owner", None))
-        return key
-    except Exception as e:
-        logger.warning("Probe key resolution failed for %s: %s", getattr(ep, "id", "?"), e)
-        return None
-
-
-def _probe_single_model(base: str, api_key: Optional[str], model_id: str, timeout: int = 10, with_tools: bool = False) -> dict:
+def _probe_single_model(base: str, api_key: str, model_id: str, timeout: int = 10, with_tools: bool = False) -> dict:
     """Send a realistic completion request to a single model. Returns {status, latency_ms, error?}."""
     provider = _detect_provider(base)
-    if _is_discovery_only_provider(provider):
-        # Responses/Codex API, not chat-completions: a completion probe would
-        # 400 and the re-probe flow would then hide every model. Discovery-only.
-        return {"status": "ok", "latency_ms": 0, "skipped": True}
     messages = [
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": "Say OK"},
@@ -682,11 +618,6 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
     For Anthropic, queries their /v1/models API, falling back to hardcoded list."""
     from src.endpoint_resolver import resolve_url
     base = resolve_url(_normalize_base(base_url))
-    if _detect_provider(base) == "chatgpt-subscription":
-        from src.chatgpt_subscription import fetch_available_models
-        if api_key:
-            return fetch_available_models(api_key, timeout=timeout)
-        return []
     if _detect_provider(base) == "anthropic":
         # Try Anthropic's /v1/models endpoint first
         url = build_models_url(base)
@@ -713,10 +644,6 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
             logger.warning(f"Anthropic /v1/models failed, using hardcoded list: {e}")
         return list(ANTHROPIC_MODELS)
     url = build_models_url(base)
-    if not url:
-        curated_key = _match_provider_curated(base, None)
-        fallback = _PROVIDER_CURATED.get(curated_key) if curated_key else None
-        return list(fallback or [])
     headers = build_headers(api_key, base)
     try:
         r = httpx.get(url, headers=headers, timeout=timeout, verify=llm_verify())
@@ -770,6 +697,7 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
         return list(fallback)
     return []
 
+
 def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) -> Dict[str, Any]:
     """Reachability probe that does not require installed/listed models."""
     from src.endpoint_resolver import resolve_url
@@ -785,10 +713,6 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) ->
         or "ollama" in (parsed_base.hostname or "").lower()
     )
 
-    # APFEL-specific detection
-    host = (parsed_base.hostname or "").lower()
-    looks_like_apfel = "apfel" in host or parsed_base.port == 11435
-
     def _result_from_response(r) -> Dict[str, Any]:
         if 300 <= r.status_code < 400:
             loc = r.headers.get("location", "")
@@ -810,23 +734,7 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) ->
     last_error: Optional[str] = None
 
     try:
-        # APFEL does not behave like Ollama; use its health endpoint.
-        if looks_like_apfel:
-            root = base
-            for suffix in ("/v1", "/api"):
-                if root.endswith(suffix):
-                    root = root[: -len(suffix)].rstrip("/")
-                    break
-            try:
-                r = httpx.get(root + "/health", timeout=timeout, verify=llm_verify())
-                result = _result_from_response(r)
-                if result["reachable"]:
-                    return result
-                last_error = result.get("error")
-            except Exception as e:
-                last_error = str(e)[:120]
-
-        elif looks_like_ollama:
+        if looks_like_ollama:
             root = base
             for suffix in ("/v1", "/api"):
                 if root.endswith(suffix):
@@ -844,33 +752,44 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) ->
     except Exception:
         pass
 
+    # OpenAI-compatible servers (vLLM, llama.cpp, SGLang, lmdeploy, …) expose
+    # /v1/models but return 404 on the bare /v1 root, so probing the base URL
+    # first can make a fully working endpoint read as offline. Try the models
+    # URL first; fall back to the base URL whenever that probe raises or its
+    # response is not classified as reachable.
+    models_url = build_models_url(base)
+    try:
+        r = httpx.get(models_url, headers=headers, timeout=timeout, verify=llm_verify())
+        result = _result_from_response(r)
+        if result["reachable"]:
+            return result
+        last_error = result.get("error")
+    except Exception as e:
+        last_error = str(e)[:120]
+
     try:
         r = httpx.get(base, headers=headers, timeout=timeout, verify=llm_verify())
         result = _result_from_response(r)
-        # If the bare base URL returns a non-auth 4xx (e.g. 404), try /models
-        # as a fallback. OpenAI-compatible servers like llama-swap return 404
-        # on the base /v1 prefix but 200 on /v1/models.  Auth failures (401/403)
-        # are definitive — probing /models would just repeat the same rejection.
-        if (
-            not result["reachable"]
-            and result.get("status_code") is not None
-            and 400 <= result["status_code"] < 500
-            and result["status_code"] not in (401, 403)
-        ):
-            models_url = build_models_url(base)
-            try:
-                r2 = httpx.get(models_url, headers=headers, timeout=timeout, verify=llm_verify())
-                result2 = _result_from_response(r2)
-                if result2["reachable"]:
-                    return result2
-            except Exception:
-                pass
-        return result
+        if result["reachable"]:
+            return result
+        # 4xx from a reachable HTTP server (404 /v1, 401/403 missing key) is
+        # still proof the upstream is alive, except 407/408/421/425/429.
+        # Those, connection failures, 5xx, and redirect-to-/login stay offline.
+        sc = result.get("status_code") or 0
+        if 400 <= sc < 500 and sc not in (407, 408, 421, 425, 429):
+            return {
+                "reachable": True,
+                "status_code": sc,
+                "error": None,
+            }
+        last_error = result.get("error") or last_error
     except Exception as e:
         last_error = str(e)[:120]
 
     return {"reachable": False, "status_code": None, "error": last_error}
 
+
+
 def _model_endpoint_error_message(base_url: str, ping: Dict[str, Any] = None) -> str:
     """Return a provider-aware error message for failed endpoint probes."""
     ping = ping or {}
@@ -959,14 +878,6 @@ def _visible_models(cached_models, hidden_models, pinned_models=None):
     return [m for m in merged if m not in hidden]
 
 
-def _api_key_fingerprint(api_key: Optional[str]) -> str:
-    """Stable, non-secret label for distinguishing same-URL credentials."""
-    key = (api_key or "").strip()
-    if not key:
-        return ""
-    return hashlib.sha256(key.encode("utf-8")).hexdigest()[:8]
-
-
 def setup_model_routes(model_discovery):
     router = APIRouter(prefix="/api")
 
@@ -1068,17 +979,6 @@ def setup_model_routes(model_discovery):
                         ok, info = _should_refresh_endpoint(ep, now, force=force)
                         if not ok:
                             continue
-                        if getattr(ep, "provider_auth_id", None):
-                            try:
-                                from src.endpoint_resolver import resolve_endpoint_runtime
-                                info["base"], info["api_key"] = resolve_endpoint_runtime(
-                                    ep,
-                                    owner=getattr(ep, "owner", None),
-                                )
-                                info["key"] = _refresh_key(info["base"], info["api_key"])
-                            except Exception as e:
-                                logger.warning("Skipping model refresh for %s: could not resolve provider auth: %s", getattr(ep, "name", ep.id), e)
-                                continue
                         groups.setdefault(info["key"], {
                             "base": info["base"],
                             "api_key": info["api_key"],
@@ -1232,9 +1132,8 @@ def setup_model_routes(model_discovery):
                 raise HTTPException(401, "Not authenticated")
         except HTTPException:
             raise
-        except Exception as e:
-            logger.error('Auth gate error in GET /api/models, failing closed: %s', e)
-            raise HTTPException(status_code=500, detail='Internal error')
+        except Exception:
+            pass
         # Admins see every endpoint (they manage the global pool); regular
         # users get the owner-scoped view.
         _is_admin = False
@@ -1298,7 +1197,14 @@ def setup_model_routes(model_discovery):
             t0 = _time.time()
             try:
                 import asyncio as _asyncio
-                ping = await _asyncio.to_thread(_ping_endpoint, data["base"], data.get("api_key"), 1.5)
+                # Bumped 1.5s → 3.5s. The previous 1.5s budget was clipping
+                # local vLLM endpoints on Tailscale links where the model
+                # server is still loading (Qwen3.5-122B takes 2–3 min to
+                # warm); /v1/models can take 500–2500 ms on a busy box,
+                # which pushed _ping_endpoint's full path-discovery sweep
+                # past the cap and marked the row offline despite the
+                # user actively chatting with it.
+                ping = await _asyncio.to_thread(_ping_endpoint, data["base"], data.get("api_key"), 3.5)
                 lat = round((_time.time() - t0) * 1000)
                 return {
                     "alive": bool(ping.get("reachable")),
@@ -1348,20 +1254,12 @@ def setup_model_routes(model_discovery):
                 "endpoint_kind": kind,
             }
             try:
-                if _is_discovery_only_provider(provider):
-                    # No general health endpoint — an unauthenticated GET just
-                    # 401s. Report status from cached models instead of pinging.
-                    entry["latency_ms"] = None
-                    entry["status"] = "online" if cached_count else "offline"
-                    entry["error"] = None
-                    entry["model_count"] = cached_count
-                else:
-                    t0 = _time.time()
-                    ping = _ping_endpoint(base, ep.api_key, timeout=1.5)
-                    entry["latency_ms"] = round((_time.time() - t0) * 1000)
-                    entry["status"] = "online" if ping.get("reachable") or cached_count else "offline"
-                    entry["error"] = ping.get("error")
-                    entry["model_count"] = cached_count or (len(ANTHROPIC_MODELS) if provider == "anthropic" else 0)
+                t0 = _time.time()
+                ping = _ping_endpoint(base, ep.api_key, timeout=1.5)
+                entry["latency_ms"] = round((_time.time() - t0) * 1000)
+                entry["status"] = "online" if ping.get("reachable") or cached_count else "offline"
+                entry["error"] = ping.get("error")
+                entry["model_count"] = cached_count or (len(ANTHROPIC_MODELS) if provider == "anthropic" else 0)
             except Exception as e:
                 entry["latency_ms"] = None
                 entry["status"] = "online" if cached_count else "offline"
@@ -1394,7 +1292,7 @@ def setup_model_routes(model_discovery):
                 if ep_id and ep_id not in endpoints_cache:
                     ep = db.query(ModelEndpoint).filter(ModelEndpoint.id == ep_id).first()
                     if ep:
-                        endpoints_cache[ep_id] = {"base_url": ep.base_url, "api_key": _resolve_probe_key(ep)}
+                        endpoints_cache[ep_id] = {"base_url": ep.base_url, "api_key": ep.api_key}
                 ep_data = endpoints_cache.get(ep_id)
                 if not ep_data:
                     # Try to find by base_url from the model's endpoint field
@@ -1433,7 +1331,7 @@ def setup_model_routes(model_discovery):
                     "id": ep.id,
                     "name": ep.name,
                     "base_url": ep.base_url,
-                    "api_key": _resolve_probe_key(ep),
+                    "api_key": ep.api_key,
                 })
         finally:
             db.close()
@@ -1522,21 +1420,43 @@ def setup_model_routes(model_discovery):
                 # Endpoint counts as reachable if it has any model — including
                 # admin-pinned IDs that a probe would never surface.
                 status = "online" if (all_models or pinned) else "offline"
-                base = _normalize_base(r.base_url)
                 ping = None
-                # Discovery-only providers have no health endpoint — an
-                # unauthenticated ping just 401s, so don't bother.
-                if not all_models and not pinned and r.is_enabled and not _is_discovery_only_provider(_detect_provider(base)):
-                    ping = _ping_endpoint(r.base_url, r.api_key, timeout=1.0)
+                # When cached_models is empty, do a quick reachability probe.
+                # Bumped 1.0s → 3.5s because the user reported endpoints they
+                # were ACTIVELY chatting with showed "offline" — the previous
+                # 1s timeout was clipping live cloud endpoints (DeepSeek can
+                # take 1.5–2.5s on /v1/models when their region is under load,
+                # vLLM on a remote GPU box behind SSH can also push past 1s).
+                # 3.5s still keeps the picker render snappy in the common
+                # "everything's already cached" path because this branch only
+                # runs for endpoints with an empty cached_models.
+                if not all_models and not pinned and r.is_enabled:
+                    ping = _ping_endpoint(r.base_url, r.api_key, timeout=3.5)
                     if ping.get("reachable"):
                         status = "empty"
+                        # Best-effort: the ping succeeded, so probe the model
+                        # list inline (blocking, up to 5s) and persist it to
+                        # cached_models. On success this response already
+                        # reports "online"; on failure the status stays
+                        # "empty", the error is only logged at debug level,
+                        # and a later refresh can still fill the cache.
+                        try:
+                            probed = _probe_endpoint(r.base_url, r.api_key, timeout=5)
+                            if probed:
+                                r.cached_models = json.dumps(probed)
+                                db.commit()
+                                all_models = probed
+                                visible = _visible_models(all_models, r.hidden_models, pinned)
+                                status = "online"
+                        except Exception as _refill_err:
+                            logger.debug(f"opportunistic cached_models refill failed for {r.id}: {_refill_err!r}")
+                base = _normalize_base(r.base_url)
                 kind = _effective_endpoint_kind(r, base)
                 results.append({
                     "id": r.id,
                     "name": r.name,
                     "base_url": r.base_url,
                     "has_key": bool(r.api_key),
-                    "api_key_fingerprint": _api_key_fingerprint(r.api_key),
                     "is_enabled": r.is_enabled,
                     "models": visible,
                     "pinned_models": pinned,
@@ -1603,34 +1523,21 @@ def setup_model_routes(model_discovery):
         )
         explicit_timeout = _explicit_model_list_timeout(base_url, requested_kind, refresh_timeout)
 
-        # Dedupe: if an endpoint with the same base_url and compatible
-        # credentials already exists and is reachable by the caller (shared or
-        # owned by them), return it instead of creating a duplicate row. Keep
-        # same-url/different-key rows distinct so users can group the same
-        # provider URL under multiple credentials.
+        # Dedupe: if an endpoint with the same base_url already exists and
+        # is reachable by the caller (shared or owned by them), return it
+        # instead of creating a duplicate row. Fixes "Scan for Servers"
+        # re-adding manually-added endpoints under their host:port name.
         from src.auth_helpers import get_current_user as _gcu_dedup
         _caller = _gcu_dedup(request) or None
-        _incoming_api_key = api_key.strip()
         _db_dedup = SessionLocal()
         try:
-            _same_url_rows = (
+            existing = (
                 _db_dedup.query(ModelEndpoint)
                 .filter(ModelEndpoint.base_url == base_url)
                 .filter((ModelEndpoint.owner.is_(None)) | (ModelEndpoint.owner == _caller))
                 .order_by(ModelEndpoint.owner.desc())  # prefer owned over shared
-                .all()
+                .first()
             )
-            existing = None
-            _empty_key_existing = None
-            for _candidate in _same_url_rows:
-                _candidate_key = (getattr(_candidate, "api_key", None) or "").strip()
-                if _candidate_key == _incoming_api_key:
-                    existing = _candidate
-                    break
-                if _incoming_api_key and not _candidate_key and _empty_key_existing is None:
-                    _empty_key_existing = _candidate
-            if existing is None and _incoming_api_key and _empty_key_existing is not None:
-                existing = _empty_key_existing
             if existing:
                 changed = False
                 # Persist any incoming pinned IDs onto the existing row. An
@@ -1679,8 +1586,6 @@ def setup_model_routes(model_discovery):
                     "id": existing.id,
                     "name": existing.name,
                     "base_url": existing.base_url,
-                    "has_key": bool(existing.api_key),
-                    "api_key_fingerprint": _api_key_fingerprint(existing.api_key),
                     "models": _visible_models(
                         existing_models,
                         getattr(existing, "hidden_models", None),
@@ -1754,8 +1659,6 @@ def setup_model_routes(model_discovery):
             "id": ep_id,
             "name": name.strip(),
             "base_url": base_url,
-            "has_key": bool(api_key.strip()),
-            "api_key_fingerprint": _api_key_fingerprint(api_key),
             "models": _merge_model_ids(model_ids, _pinned),
             "pinned_models": _pinned,
             "online": bool(model_ids) or bool(_pinned) or bool(ping.get("reachable")),
@@ -1805,7 +1708,7 @@ def setup_model_routes(model_discovery):
             ep = db.query(ModelEndpoint).filter(ModelEndpoint.id == ep_id).first()
             if not ep:
                 raise HTTPException(404, "Endpoint not found")
-            ep_data = {"id": ep.id, "name": ep.name, "base_url": ep.base_url, "api_key": _resolve_probe_key(ep)}
+            ep_data = {"id": ep.id, "name": ep.name, "base_url": ep.base_url, "api_key": ep.api_key}
         finally:
             db.close()
 
@@ -1869,7 +1772,7 @@ def setup_model_routes(model_discovery):
                 category = _classify_endpoint(base, kind)
                 timeout = _manual_refresh_timeout(ep, category, refresh_timeout)
                 try:
-                    probed = _probe_endpoint(base, _resolve_probe_key(ep), timeout=timeout)
+                    probed = _probe_endpoint(base, ep.api_key, timeout=timeout)
                 except Exception as exc:
                     logger.warning("Manual model refresh failed for endpoint %s at %s: %s", ep_id, base, exc)
                     probed = []
@@ -2105,8 +2008,6 @@ def setup_model_routes(model_discovery):
                 "name": ep.name,
                 "model_type": ep.model_type,
                 "base_url": ep.base_url,
-                "has_key": bool(ep.api_key),
-                "api_key_fingerprint": _api_key_fingerprint(ep.api_key),
                 "pinned_models": _normalize_model_ids(getattr(ep, "pinned_models", None)),
                 "endpoint_kind": getattr(ep, "endpoint_kind", None) or "auto",
                 "model_refresh_mode": getattr(ep, "model_refresh_mode", None) or "auto",
@@ -2208,9 +2109,7 @@ def setup_model_routes(model_discovery):
             cleared_user_preferences = _clear_user_prefs_for_endpoint(ep_id)
             cleared_sessions = _clear_sessions_for_endpoint(db, ep.base_url)
             cleared_loaded_sessions = _clear_loaded_sessions_for_endpoint(ep.base_url)
-            auth_id = getattr(ep, "provider_auth_id", None)
             db.delete(ep)
-            cleared_provider_auth = _delete_orphaned_provider_auth(db, auth_id, exclude_ep_id=ep_id)
             db.commit()
             _invalidate_models_cache()
             _local_probe_cache["data"] = None
@@ -2220,7 +2119,6 @@ def setup_model_routes(model_discovery):
                 "cleared_user_preferences": cleared_user_preferences,
                 "cleared_sessions": cleared_sessions,
                 "cleared_loaded_sessions": cleared_loaded_sessions,
-                "cleared_provider_auth": cleared_provider_auth,
             }
         finally:
             db.close()
diff --git a/services/hwfit/data/hf_models.json b/services/hwfit/data/hf_models.json
index e73cc26dc..35b55d9a9 100644
--- a/services/hwfit/data/hf_models.json
+++ b/services/hwfit/data/hf_models.json
@@ -14036,6 +14036,29 @@
    "vision"
   ]
  },
+ {
+  "name": "google/gemma-4-12B",
+  "provider": "Google",
+  "parameter_count": "12.0B",
+  "parameters_raw": 12000000000,
+  "min_ram_gb": 24.0,
+  "recommended_ram_gb": 32.0,
+  "min_vram_gb": 24.0,
+  "quantization": "BF16",
+  "context_length": 131072,
+  "use_case": "General purpose, multimodal",
+  "is_moe": false,
+  "num_experts": null,
+  "active_experts": null,
+  "active_parameters": null,
+  "architecture": "gemma4",
+  "pipeline_tag": "image-text-to-text",
+  "release_date": "2026-04-01",
+  "gguf_sources": [],
+  "capabilities": [
+   "vision"
+  ]
+ },
  {
   "name": "google/gemma-4-31B-it",
   "provider": "Google",
@@ -19121,4 +19144,4 @@
   ],
   "_discovered": true
  }
-]
\ No newline at end of file
+]
diff --git a/services/memory/skill_extractor.py b/services/memory/skill_extractor.py
index e763bca4c..79e4c67c2 100644
--- a/services/memory/skill_extractor.py
+++ b/services/memory/skill_extractor.py
@@ -243,6 +243,20 @@ async def maybe_extract_skill(
             logger.debug("[skill-extract] '%s' already exists — dropped as duplicate", title)
             return None
 
+        # Auto-publish gate: if the user has `auto_approve_skills` on, the
+        # newly-extracted skill is created `published` immediately rather
+        # than waiting for the next audit batch. The audit still runs later
+        # and can demote it back to `draft` (or delete) on failure. Default
+        # ON matches the UI label "Auto-approve skills".
+        _initial_status = "draft"
+        try:
+            from routes.prefs_routes import _load_for_user as _load_prefs
+            _prefs = _load_prefs(owner) or {}
+            if _prefs.get("auto_approve_skills", True):
+                _initial_status = "published"
+        except Exception:
+            pass
+
         entry = skills_manager.add_skill(
             title=title,
             problem=data.get("problem", ""),
@@ -253,6 +267,7 @@ async def maybe_extract_skill(
             confidence=data.get("confidence", 0.7),
             session_id=getattr(session, "session_id", None),
             owner=owner,
+            status=_initial_status,
         )
         try:
             from src.event_bus import fire_event
diff --git a/src/tool_implementations.py b/src/tool_implementations.py
index 548f6f0f5..5e62e686c 100644
--- a/src/tool_implementations.py
+++ b/src/tool_implementations.py
@@ -664,6 +664,17 @@ async def do_manage_skills(content: str, owner: Optional[str] = None) -> Dict:
             proc = args.get("steps") or []
         if not proc and not args.get("body_extra") and not args.get("solution"):
             return {"error": "procedure (or solution body) is required", "exit_code": 1}
+        # Same auto-publish gate as the extractor path — when the user
+        # has auto_approve_skills on and the caller didn't pin an explicit
+        # status, publish immediately. Audit later demotes/removes on fail.
+        _status_arg = args.get("status")
+        if not _status_arg:
+            try:
+                from routes.prefs_routes import _load_for_user as _load_prefs
+                _prefs = _load_prefs(owner) or {}
+                _status_arg = "published" if _prefs.get("auto_approve_skills", True) else "draft"
+            except Exception:
+                _status_arg = "draft"
         entry = sm.add_skill(
             name=args.get("name"),
             description=(args.get("description") or args.get("title") or "").strip(),
@@ -677,7 +688,7 @@ async def do_manage_skills(content: str, owner: Optional[str] = None) -> Dict:
             procedure=proc,
             pitfalls=args.get("pitfalls") or [],
             verification=args.get("verification") or [],
-            status=args.get("status") or "draft",
+            status=_status_arg,
             version=args.get("version") or "1.0.0",
             confidence=args.get("confidence", 0.8),
             source=args.get("source", "learned"),
@@ -2621,8 +2632,90 @@ async def _cookbook_env_for_host(host: str) -> Dict[str, Any]:
     }
 
 
-async def _cookbook_register_task(session_id: str, model: str, host: str,
-                                  cmd: str, task_type: str = "serve") -> bool:
+def _infer_serve_port(cmd: str) -> int:
+    """Infer the likely listen port from a serve command.
+
+    Prefers an explicit ``--port N`` / ``--port=N`` flag, then the port of
+    an ``OLLAMA_HOST=host:N`` assignment; otherwise returns 11434 when the
+    command mentions ``ollama`` and 8080 for anything else (or no command).
+    """
+    if not cmd:
+        return 8080
+    # Single backslashes on purpose: in a raw string ``\\s`` / ``\\d`` match
+    # a literal backslash, so the patterns would never hit a real command.
+    for pattern in (r"--port(?:=|\s+)(\d+)", r"OLLAMA_HOST=[^\s]*?:(\d+)"):
+        m = re.search(pattern, cmd)
+        if m:
+            try:
+                return int(m.group(1))
+            except ValueError:
+                pass
+    return 11434 if "ollama" in cmd else 8080
+
+
+def _infer_serve_host(host: str | None) -> tuple[str, bool]:
+    """Return ``("localhost", True)`` for a blank host, else ``(host minus any "user@", False)``."""
+    if not host or not host.strip():
+        return "localhost", True
+    _user, at_sign, remainder = host.partition("@")
+    return (remainder if at_sign else host), False
+
+
+async def _ensure_served_endpoint(
+    *,
+    model: str,
+    cmd: str,
+    host: str | None,
+) -> Dict[str, Any]:
+    """POST the served model to ``/api/model-endpoints`` as a best effort.
+
+    Returns ``added``, ``endpoint_id`` and ``base_url``, plus ``data`` (the
+    parsed reply) normally or ``error`` on HTTP >= 400 / any exception.
+    """
+    import httpx
+    target_host, is_local = _infer_serve_host(host)
+    base_url = f"http://{target_host}:{_infer_serve_port(cmd)}/v1"
+    leaf = model.rsplit("/", 1)[-1]
+    image_server = "diffusion_server.py" in (cmd or "")
+    form = {
+        "name": f"{leaf} (image)" if image_server else leaf,
+        "base_url": base_url,
+        "skip_probe": "true",
+        "model_type": "image" if image_server else "llm",
+        "container_local": "true" if is_local else "false",
+    }
+    # Fields shared by both failure results returned below.
+    failed = {"added": False, "endpoint_id": "", "base_url": base_url}
+    try:
+        async with httpx.AsyncClient(timeout=30) as client:
+            resp = await client.post(
+                f"{_COOKBOOK_BASE}/api/model-endpoints",
+                data=form,
+                headers=_internal_headers(),
+            )
+            # A reply that is not JSON is treated as an empty payload.
+            is_json = resp.headers.get("content-type", "").startswith("application/json")
+            data = resp.json() if is_json else {}
+        if resp.status_code >= 400:
+            logger.debug(f"ensure endpoint failed for {model!r}: status={resp.status_code} data={data}")
+            return {**failed, "error": data}
+        ep_id = data.get("id") if isinstance(data, dict) else None
+        return {"added": bool(ep_id), "endpoint_id": ep_id or "", "base_url": base_url, "data": data}
+    except Exception as e:
+        logger.debug(f"ensure endpoint exception for {model!r}: {e}")
+        return {**failed, "error": str(e)}
+
+
+async def _cookbook_register_task(
+    session_id: str,
+    model: str,
+    host: str,
+    cmd: str,
+    task_type: str = "serve",
+    *,
+    endpoint_added: bool = False,
+    endpoint_id: str = "",
+) -> bool:
     """Append a task entry to cookbook_state.json after the agent
     launches via /api/model/serve or /api/model/download. The route
     spawns tmux but leaves state-writing to the UI; the agent needs to
@@ -2672,7 +2765,8 @@ async def _cookbook_register_task(session_id: str, model: str, host: str,
         "sshPort": "",
         "platform": "linux",
         "_serveReady": False,
-        "_endpointAdded": False,
+        "_endpointAdded": bool(endpoint_added),
+        "_endpointId": endpoint_id or "",
     })
     state["tasks"] = tasks
     try:
@@ -3008,7 +3102,12 @@ async def do_download_model(content: str, owner: Optional[str] = None) -> Dict:
         if _servers.get("default_host"):
             host = _servers["default_host"]
             _host_defaulted = True
+    backend = (args.get("backend") or "").strip().lower()
+    if not backend and "/" not in repo_id and ":" in repo_id:
+        backend = "ollama"
     payload = {"repo_id": repo_id}
+    if backend:
+        payload["backend"] = backend
     if host:
         payload["remote_host"] = host
     if args.get("include"):
@@ -3028,12 +3127,20 @@ async def do_download_model(content: str, owner: Optional[str] = None) -> Dict:
             sid = data.get("session_id", "?")
             registered = await _cookbook_register_task(
                 session_id=sid, model=repo_id, host=host,
-                cmd=f"hf download {repo_id}", task_type="download",
+                cmd=(f"ollama pull {repo_id}" if backend == "ollama" else f"hf download {repo_id}"),
+                task_type="download",
             )
             note = "" if registered else " (state-write failed — download may not show in UI)"
             where = host or "local"
             default_note = " (defaulted to the cookbook's selected server — pass host= or local=true to override)" if _host_defaulted else ""
-            return {"output": f"Download started: {repo_id} on {where} (session: {sid}){note}{default_note}", "session_id": sid, "host": host, "exit_code": 0}
+            return {
+                "output": f"Download started: {repo_id} on {where} (session: {sid}){note}{default_note}",
+                "session_id": sid,
+                "host": host,
+                "task_type": "download",
+                "phase": "running",
+                "exit_code": 0,
+            }
         return {"error": data.get("error", "Download failed"), "exit_code": 1}
     except Exception as e:
         return {"error": str(e), "exit_code": 1}
@@ -3102,12 +3209,28 @@ async def do_serve_model(content: str, owner: Optional[str] = None) -> Dict:
             data = resp.json()
         if data.get("ok"):
             sid = data.get("session_id", "?")
+            endpoint_id = data.get("endpoint_id") or ""
+            if endpoint_id:
+                endpoint_added = True
+            else:
+                endpoint_meta = await _ensure_served_endpoint(model=repo_id, cmd=cmd, host=host)
+                endpoint_added = bool(endpoint_meta.get("added"))
+                endpoint_id = endpoint_meta.get("endpoint_id", "") or endpoint_id
             registered = await _cookbook_register_task(
                 session_id=sid, model=repo_id,
                 host=host, cmd=cmd, task_type="serve",
+                endpoint_added=endpoint_added, endpoint_id=endpoint_id or "",
             )
             note = "" if registered else " (state-write failed — task may not show in UI)"
-            return {"output": f"Serving {repo_id} (session: {sid}){note}", "session_id": sid, "exit_code": 0}
+            return {
+                "output": f"Serving {repo_id} (session: {sid}){note}",
+                "session_id": sid,
+                "task_type": "serve",
+                "phase": "running",
+                "host": host,
+                "endpoint_id": endpoint_id,
+                "exit_code": 0,
+            }
         # FastAPI HTTPException puts the message under `detail`, not `error`.
         # Surface BOTH so the agent sees "Invalid characters in cmd" (from
         # _validate_serve_cmd rejecting `&&`/`source`/`cd`) instead of
@@ -3804,7 +3927,8 @@ async def do_serve_preset(content: str, owner: Optional[str] = None) -> Dict:
     if env_cfg.get("gpus"):       payload["gpus"]       = env_cfg["gpus"]
     if env_cfg.get("hf_token"):   payload["hf_token"]   = env_cfg["hf_token"]
     if env_cfg.get("platform"):   payload["platform"]   = env_cfg["platform"]
-    if env_cfg.get("ssh_port"):   payload["ssh_port"]   = env_cfg["ssh_port"]
+    if env_cfg.get("ssh_port"):
+        payload["ssh_port"] = env_cfg["ssh_port"]
 
     try:
         async with httpx.AsyncClient(timeout=30) as client:
@@ -3813,12 +3937,20 @@ async def do_serve_preset(content: str, owner: Optional[str] = None) -> Dict:
             data = resp.json()
         if data.get("ok"):
             sid = data.get("session_id", "?")
+            endpoint_id = data.get("endpoint_id") or ""
+            if endpoint_id:
+                endpoint_added = True
+            else:
+                endpoint_meta = await _ensure_served_endpoint(model=repo_id, cmd=cmd, host=host)
+                endpoint_added = bool(endpoint_meta.get("added"))
+                endpoint_id = endpoint_meta.get("endpoint_id", "") or endpoint_id
             registered = await _cookbook_register_task(
                 session_id=sid, model=repo_id, host=host,
                 cmd=cmd, task_type="serve",
+                endpoint_added=endpoint_added, endpoint_id=endpoint_id or "",
             )
             note = "" if registered else " (state-write failed — task may not show in UI)"
-            return {"output": f"Launched preset {chosen.get('name')!r}: {repo_id} on {host or 'local'} (session: {sid}){note}", "session_id": sid, "exit_code": 0}
+            return {"output": f"Launched preset {chosen.get('name')!r}: {repo_id} on {host or 'local'} (session: {sid}){note}", "session_id": sid, "host": host, "endpoint_id": endpoint_id, "exit_code": 0}
         return {"error": data.get("error", "Serve failed"), "exit_code": 1}
     except Exception as e:
         return {"error": str(e), "exit_code": 1}
diff --git a/static/index.html b/static/index.html
index ec4af199f..ae3092659 100644
--- a/static/index.html
+++ b/static/index.html
@@ -1492,21 +1492,7 @@
               <div id="set-researchMsg" style="font-size:11px;color:color-mix(in srgb, var(--fg) 45%, transparent);"></div>
             </div>
           </div>
-          <div class="admin-card">
-            <h2><svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align:-2px;margin-right:5px;opacity:0.6"><path d="M14.7 6.3a1 1 0 0 0 0 1.4l1.6 1.6a1 1 0 0 0 1.4 0l3.77-3.77a6 6 0 0 1-7.94 7.94l-6.91 6.91a2.12 2.12 0 0 1-3-3l6.91-6.91a6 6 0 0 1 7.94-7.94l-3.76 3.76z"/></svg>Agent</h2>
-            <div class="admin-toggle-sub" style="margin-bottom:8px">Controls for the agent tool loop.</div>
-            <div class="settings-col">
-              <div class="settings-row">
-                <label class="settings-label">Tool call limit</label>
-                <input id="set-agentMaxTools" type="text" inputmode="numeric" placeholder="0 = unlimited" class="settings-select" style="width:120px;">
-              </div>
-              <div class="settings-row">
-                <label class="settings-label">Max steps per message</label>
-                <input id="set-agentMaxRounds" type="text" inputmode="numeric" placeholder="20" class="settings-select" style="width:120px;">
-              </div>
-              <div id="set-agentMsg" style="font-size:11px;color:color-mix(in srgb, var(--fg) 45%, transparent);"></div>
-            </div>
-          </div>
+          <!-- Agent card moved to the Agent Tools tab. -->
           <!-- Image Generation removed — only inpaint remains in this build,
                and inpaint is configured via the gallery editor not this card.
                Keeping the DOM (hidden) so JS wiring against the inputs
@@ -2048,30 +2034,37 @@
               <div class="admin-model-form">
                 <div class="admin-model-form-row">
                   <input id="adm-epLocalUrl" type="text" placeholder="Paste endpoint URL, e.g. http://localhost:11434/v1" style="flex:1">
-                  <select id="adm-epLocalType" style="padding:5px;width:72px;flex-shrink:0;">
-                    <option value="llm">LLM</option>
-                    <option value="image">Image</option>
-                  </select>
                 </div>
-                <div class="admin-model-form-row">
+                <!-- API key row stays in the DOM but is collapsed until the
+                     user clicks the Key button on the action row. Local
+                     endpoints rarely need a key; hiding it by default keeps
+                     the form a single visual line. -->
+                <div class="admin-model-form-row" id="adm-epLocalApiKey-row" style="display:none;">
                   <input id="adm-epLocalApiKey" type="password" placeholder="API key (optional — for protected local endpoints)" autocomplete="off" style="flex:1">
                 </div>
+                <!-- Action row: LLM/Image type, Quickstart buttons (Scan,
+                     Ollama), Key reveal toggle, Test, Add — all inline so
+                     the Quickstart fold is gone and Type sits with the
+                     primary actions. -->
                 <div class="admin-model-form-row">
+                  <label style="display:inline-flex;align-items:center;gap:4px;font-size:11px;opacity:0.6;flex-shrink:0;">Type:<select id="adm-epLocalType" style="padding:5px;width:72px;flex-shrink:0;">
+                    <option value="llm" selected>LLM</option>
+                    <option value="image">Image</option>
+                  </select></label>
+                  <button class="admin-btn-sm" id="adm-epDiscoverBtn" title="Scan your network for running model servers" style="display:inline-flex;align-items:center;gap:4px;">
+                    <svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round"><circle cx="11" cy="11" r="8"/><line x1="21" y1="21" x2="16.65" y2="16.65"/></svg>Scan
+                  </button>
+                  <button class="admin-btn-sm" id="adm-epOllamaBtn" title="Fill the default Ollama endpoint" style="display:inline-flex;align-items:center;gap:5px;"><span class="adm-ollama-logo" style="display:inline-flex;width:13px;height:13px;"></span>Ollama</button>
                   <span style="flex:1"></span>
-                  <button class="admin-btn-sm" id="adm-epLocalTestBtn" style="width:55px;text-align:center;">Test</button>
-                  <button class="admin-btn-add" id="adm-epLocalAddBtn" style="width:55px;text-align:center;">Add</button>
-                </div>
-                <div class="adm-quickstart-section collapsed" id="adm-add-local-quickstart">
-                  <div class="adm-quickstart-toggle" role="button" tabindex="0" aria-expanded="false">
-                    <span>Quickstart</span>
-                    <svg class="adm-section-caret" width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round"><polyline points="6 9 12 15 18 9"/></svg>
-                  </div>
-                  <div class="adm-quickstart-body">
-                    <button class="admin-btn-sm" id="adm-epDiscoverBtn" title="Scan your network for running model servers">
-                      <svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" style="vertical-align:-1px;margin-right:4px;"><circle cx="11" cy="11" r="8"/><line x1="21" y1="21" x2="16.65" y2="16.65"/></svg>Scan for Servers
-                    </button>
-                    <button class="admin-btn-sm" id="adm-epOllamaBtn" title="Fill the default Ollama endpoint">Ollama</button>
-                  </div>
+                  <button class="admin-btn-sm" id="adm-epLocalKeyBtn" title="Show / hide the API key field" aria-expanded="false" aria-controls="adm-epLocalApiKey-row" style="opacity:0.75;display:inline-flex;align-items:center;gap:4px;">
+                    <svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 2l-9.6 9.6"/><circle cx="7.5" cy="15.5" r="5.5"/><path d="M15.5 7.5l3 3"/></svg>API
+                  </button>
+                  <button class="admin-btn-sm" id="adm-epLocalTestBtn" style="min-width:55px;text-align:center;display:inline-flex;align-items:center;justify-content:center;gap:4px;">
+                    <svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round"><polygon points="5 3 19 12 5 21 5 3"/></svg>Test
+                  </button>
+                  <button class="admin-btn-add" id="adm-epLocalAddBtn" style="min-width:55px;text-align:center;display:inline-flex;align-items:center;justify-content:center;gap:4px;">
+                    <svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="3" stroke-linecap="round" stroke-linejoin="round"><polyline points="20 6 9 17 4 12"/></svg>Add
+                  </button>
                 </div>
                 <div id="adm-epLocalMsg" class="adm-ep-inline-msg"></div>
               </div>
@@ -2116,19 +2109,33 @@
                   <option value="https://opencode.ai/zen/go/v1" data-logo="opencode">OpenCode Go</option>
                   <option value="https://api.z.ai/api/coding/paas/v4" data-logo="zhipu">Z.AI Coding Plan</option>
                 </select>
-                <div class="admin-model-form-row">
-                  <input id="adm-epApiKey" type="password" placeholder="API key">
+                <!-- API key row stays in DOM, hidden until Key button is
+                     clicked. Mirrors the Local section pattern: most users
+                     paste a key via the provider preset flow rather than
+                     typing it free-form, so the row only appears on demand. -->
+                <div class="admin-model-form-row" id="adm-epApiKey-row" style="display:none;">
+                  <input id="adm-epApiKey" type="password" placeholder="API key" autocomplete="off" style="flex:1">
+                </div>
+                <div class="admin-model-form-row" style="margin-top:-4px;">
                   <select id="adm-epKind" style="padding:5px;width:82px;">
                     <option value="proxy">Proxy</option>
                     <option value="api">API</option>
                   </select>
-                  <select id="adm-epType" style="padding:5px;width:80px;">
-                    <option value="llm">LLM</option>
+                  <label style="display:inline-flex;align-items:center;gap:4px;font-size:11px;opacity:0.6;flex-shrink:0;">Type:<select id="adm-epType" style="padding:5px;width:80px;flex-shrink:0;">
+                    <option value="llm" selected>LLM</option>
                     <option value="image">Image</option>
-                  </select>
-                  <button class="admin-btn-sm" id="adm-epApiTestBtn" style="width:55px;text-align:center;">Test</button>
+                  </select></label>
+                  <span style="flex:1"></span>
+                  <button class="admin-btn-sm" id="adm-epApiKeyBtn" title="Show / hide the API key field" aria-expanded="false" aria-controls="adm-epApiKey-row" style="opacity:0.75;display:inline-flex;align-items:center;gap:4px;">
+                    <svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 2l-9.6 9.6"/><circle cx="7.5" cy="15.5" r="5.5"/><path d="M15.5 7.5l3 3"/></svg>API
+                  </button>
+                  <button class="admin-btn-sm" id="adm-epApiTestBtn" style="min-width:55px;text-align:center;display:inline-flex;align-items:center;justify-content:center;gap:4px;">
+                    <svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round"><polygon points="5 3 19 12 5 21 5 3"/></svg>Test
+                  </button>
                   <button class="admin-btn-sm hidden" id="adm-epApiCancelTestBtn" style="width:62px;text-align:center;">Cancel</button>
-                  <button class="admin-btn-add" id="adm-epAddBtn" style="width:55px;text-align:center;">Add</button>
+                  <button class="admin-btn-add" id="adm-epAddBtn" style="min-width:55px;text-align:center;display:inline-flex;align-items:center;justify-content:center;gap:4px;">
+                    <svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="3" stroke-linecap="round" stroke-linejoin="round"><polyline points="20 6 9 17 4 12"/></svg>Add
+                  </button>
                 </div>
                 <div id="adm-epApiMsg" class="adm-ep-inline-msg"></div>
                 <div id="adm-deviceAuthStatus" class="adm-ep-inline-msg"></div>
@@ -2136,7 +2143,15 @@
             </div>
           </div>
           <div class="admin-card">
-            <h2><svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align:-2px;margin-right:5px;opacity:0.6"><rect x="2" y="3" width="20" height="14" rx="2"/><line x1="8" y1="21" x2="16" y2="21"/><line x1="12" y1="17" x2="12" y2="21"/></svg>Added Models <span style="opacity:0.45;font-weight:normal;font-size:0.82em">(Endpoints)</span></h2>
+            <h2 style="display:flex;align-items:center;gap:8px;"><svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align:-2px;margin-right:5px;opacity:0.6"><rect x="2" y="3" width="20" height="14" rx="2"/><line x1="8" y1="21" x2="16" y2="21"/><line x1="12" y1="17" x2="12" y2="21"/></svg>Added Models <span style="opacity:0.45;font-weight:normal;font-size:0.82em">(Endpoints)</span>
+              <span style="flex:1"></span>
+              <button class="admin-btn-sm" id="adm-epProbeAllBtn" title="Re-test every endpoint and refresh online status" style="font-size:11px;font-weight:normal;display:inline-flex;align-items:center;gap:4px;">
+                <svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.4" stroke-linecap="round" stroke-linejoin="round"><polyline points="23 4 23 10 17 10"/><polyline points="1 20 1 14 7 14"/><path d="M3.51 9a9 9 0 0 1 14.85-3.36L23 10M1 14l4.64 4.36A9 9 0 0 0 20.49 15"/></svg>Probe
+              </button>
+              <button class="admin-btn-sm" id="adm-epClearOfflineBtn" title="Remove all endpoints currently marked offline" style="font-size:11px;font-weight:normal;display:inline-flex;align-items:center;gap:4px;opacity:0.85;">
+                <svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.4" stroke-linecap="round" stroke-linejoin="round"><polyline points="3 6 5 6 21 6"/><path d="M19 6l-1 14a2 2 0 0 1-2 2H8a2 2 0 0 1-2-2L5 6"/></svg>Clear offline <span id="adm-epOfflineCount" style="opacity:0.6;margin-left:2px;"></span>
+              </button>
+            </h2>
             <div class="admin-toggle-sub" style="margin-bottom:10px">Manage the endpoints you've added.</div>
             <div class="adm-ep-section">
               <div class="adm-ep-section-head">
@@ -2167,10 +2182,45 @@
               <button type="button" class="admin-btn-sm" id="unified-intg-add-btn" style="display:inline-flex;align-items:center;gap:6px;">+ Add Integration<svg width="13" height="13" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="opacity:0.7;"><path d="M10 13a5 5 0 0 0 7.54.54l3-3a5 5 0 0 0-7.07-7.07l-1.72 1.71"/><path d="M14 11a5 5 0 0 0-7.54-.54l-3 3a5 5 0 0 0 7.07 7.07l1.71-1.71"/></svg></button>
             </div>
           </div>
+          <div class="admin-card admin-only" style="margin-top:12px;">
+            <h2><svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align:-2px;margin-right:5px;opacity:0.6"><path d="M21 2l-2 2m-7.61 7.61a5.5 5.5 0 1 1-7.778 7.778 5.5 5.5 0 0 1 7.777-7.777zm0 0L15.5 7.5m0 0l3 3L22 7l-3-3m-3.5 3.5L19 4"/></svg>API Tokens</h2>
+            <div class="admin-toggle-sub" style="margin-bottom:8px">Bearer tokens for external integrations (scripts, Codex, headless agent runs). Token value shown ONCE on create — copy it then.</div>
+            <div id="adm-tokenList" style="margin-bottom:8px;"></div>
+            <div style="display:flex;gap:6px;flex-wrap:wrap;align-items:flex-start;">
+              <input type="text" id="adm-tokenName" placeholder="Token name (e.g. agent-test)" class="settings-select" style="flex:1;min-width:160px;">
+              <input type="text" id="adm-tokenScopes" placeholder="scopes (comma-separated, blank = chat)" class="settings-select" style="flex:2;min-width:220px;" title="Allowed: chat, cookbook:read, cookbook:launch, documents:read|write, todos:read|write, email:read|draft|send, calendar:read|write, memory:read|write">
+              <button class="admin-btn-add" id="adm-tokenAddBtn">Create token</button>
+            </div>
+            <div id="adm-tokenMsg" style="font-size:11px;margin-top:6px;"></div>
+            <div id="adm-tokenReveal" style="display:none;margin-top:8px;padding:8px 10px;background:color-mix(in srgb, var(--accent, var(--red)) 12%, transparent);border:1px solid color-mix(in srgb, var(--accent, var(--red)) 35%, transparent);border-radius:6px;">
+              <div style="font-size:11px;font-weight:600;margin-bottom:4px;">Copy now — this is the only time you'll see it:</div>
+              <code id="adm-tokenValue" style="font-family:'Berkeley Mono','SF Mono','Fira Code',monospace;font-size:11px;word-break:break-all;display:block;background:var(--bg);padding:6px 8px;border-radius:4px;margin-bottom:6px;user-select:all;"></code>
+              <button class="admin-btn-sm" id="adm-tokenCopyBtn">Copy</button>
+            </div>
+          </div>
         </div>
 
         <!-- ═══ TOOLS TAB ═══ -->
         <div data-settings-panel="tools" class="hidden">
+          <div class="admin-card" style="margin-bottom:12px;">
+            <h2><svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align:-2px;margin-right:5px;opacity:0.6"><path d="M14.7 6.3a1 1 0 0 0 0 1.4l1.6 1.6a1 1 0 0 0 1.4 0l3.77-3.77a6 6 0 0 1-7.94 7.94l-6.91 6.91a2.12 2.12 0 0 1-3-3l6.91-6.91a6 6 0 0 1 7.94-7.94l-3.76 3.76z"/></svg>Agent</h2>
+            <div class="admin-toggle-sub" style="margin-bottom:8px">Controls for the agent tool loop.</div>
+            <div class="settings-col">
+              <div class="settings-row">
+                <label class="settings-label">Tool call limit</label>
+                <input id="set-agentMaxTools" type="text" inputmode="numeric" placeholder="0 = unlimited" class="settings-select" style="width:120px;">
+              </div>
+              <div class="settings-row">
+                <label class="settings-label">Max steps per message</label>
+                <input id="set-agentMaxRounds" type="text" inputmode="numeric" placeholder="20" class="settings-select" style="width:120px;">
+              </div>
+              <div id="set-agentMsg" style="font-size:11px;color:color-mix(in srgb, var(--fg) 45%, transparent);"></div>
+            </div>
+          </div>
+          <div class="admin-card" style="margin-bottom:12px;">
+            <h2 style="display:flex;align-items:center;gap:6px;"><svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="margin-right:1px;opacity:0.6;flex-shrink:0"><path d="M9 11l3 3L22 4"/><path d="M21 12v7a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V5a2 2 0 0 1 2-2h11"/></svg>Agent loop<span style="flex:1"></span><label class="admin-switch" title="On a failing effectful turn, climb verify → different-method → teacher → stop-and-summarize instead of silently quitting." style="flex-shrink:0"><input type="checkbox" id="set-agentSupervisorLadder"><span class="admin-slider"></span></label></h2>
+            <div class="admin-toggle-sub" style="margin-bottom:8px">Supervisor ladder. When on, every effectful agent turn that claims done is verified; on FAIL the ladder escalates verify → different method → teacher → stop-with-blocker, each rung visible in chat. Teacher rung requires <code>teacher_model</code> to be set.</div>
+          </div>
           <div class="admin-card" style="margin-bottom:12px;">
             <h2><svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align:-2px;margin-right:5px;opacity:0.6"><path d="M14.7 6.3a1 1 0 0 0 0 1.4l1.6 1.6a1 1 0 0 0 1.4 0l3.77-3.77a6 6 0 0 1-7.94 7.94l-6.91 6.91a2.12 2.12 0 0 1-3-3l6.91-6.91a6 6 0 0 1 7.94-7.94l-3.76 3.76z"/></svg>Built-in Tools</h2>
             <div class="admin-toggle-sub" style="margin-bottom:8px">Enable or disable tools available to the AI agent.</div>
diff --git a/static/js/admin.js b/static/js/admin.js
index e4a39adf3..82b90b737 100644
--- a/static/js/admin.js
+++ b/static/js/admin.js
@@ -1149,6 +1149,144 @@ function initEndpointForm() {
     }
   }
 
+  // API Key reveal toggle. The key inputs are hidden by default so the Add
+  // form reads as a single action row; the Key button toggles the input row
+  // and flips aria-expanded for screen readers / CSS pseudo-classes.
+  const _wireKeyToggle = (btnId, rowId) => {
+    const toggleBtn = el(btnId), keyRow = el(rowId);
+    if (!(toggleBtn && keyRow)) return;
+    toggleBtn.addEventListener('click', () => {
+      // Visibility is tracked via the row's inline display style.
+      const wasHidden = keyRow.style.display === 'none';
+      keyRow.style.display = wasHidden ? '' : 'none';
+      toggleBtn.setAttribute('aria-expanded', wasHidden ? 'true' : 'false');
+      toggleBtn.style.opacity = wasHidden ? '1' : '0.75';
+      if (!wasHidden) return;
+      // Row was just revealed: put the caret in its input, if it has one.
+      const keyInput = keyRow.querySelector('input');
+      if (keyInput) keyInput.focus();
+    });
+  };
+  _wireKeyToggle('adm-epLocalKeyBtn', 'adm-epLocalApiKey-row');
+  _wireKeyToggle('adm-epApiKeyBtn', 'adm-epApiKey-row');
+
+  // ── Added Models toolbar: Probe + Clear offline ────────────────────
+  // Both buttons act over the currently-rendered endpoint list. loadEndpoints()
+  // stamps the online/offline marker as [data-adm-ep-online] on each row's
+  // delete button ([data-adm-del-ep]), so both buttons just iterate the DOM
+  // without re-fetching anything they don't already have.
+  const _refreshOfflineCount = () => {
+    const countLabel = el('adm-epOfflineCount');
+    if (!countLabel) return;
+    const offline = document.querySelectorAll('[data-adm-ep-id] [data-adm-ep-online="0"]').length;
+    countLabel.textContent = offline ? `(${offline})` : '';
+    // The clear button is only dimmed here, not disabled, when nothing is
+    // offline — a click on the empty case fires a toast instead of feeling dead.
+    const clearBtn = el('adm-epClearOfflineBtn');
+    if (clearBtn) clearBtn.style.opacity = offline ? '0.85' : '0.55';
+  };
+  // Re-run after every loadEndpoints() render without patching the render
+  // path itself — simplest path: MutationObserver on the two list containers.
+  const _obsRoots = ['adm-epList-local', 'adm-epList-api']
+    .map(id => el(id)).filter(Boolean);
+  if (_obsRoots.length) {
+    const mo = new MutationObserver(_refreshOfflineCount);
+    _obsRoots.forEach(r => mo.observe(r, { childList: true, subtree: true }));
+    _refreshOfflineCount();
+  }
+
+  const probeAllBtn = el('adm-epProbeAllBtn');
+  if (probeAllBtn) {
+    probeAllBtn.addEventListener('click', async () => {
+      const savedMarkup = probeAllBtn.innerHTML;
+      probeAllBtn.disabled = true;
+      probeAllBtn.innerHTML = '<span style="opacity:0.7;">Probing…</span>';
+      try {
+        // Bulk local probe first (same one the model picker uses); a
+        // failure here is ignored so the per-endpoint pass still runs.
+        await fetch('/api/model-endpoints/probe-local', { credentials: 'same-origin' }).catch(() => {});
+        // Then per-endpoint /probe for every rendered row so API/cloud endpoints refresh too.
+        const pending = Array.from(document.querySelectorAll('[data-adm-ep-id]'))
+          .map(rowEl => rowEl.getAttribute('data-adm-ep-id'))
+          .filter(Boolean);
+        // Worker pool: at most 6 probes in flight, each pulling ids off the shared queue.
+        const drain = async () => {
+          while (pending.length) {
+            const epId = pending.shift();
+            if (!epId) continue;
+            try { await fetch(`/api/model-endpoints/${epId}/probe`, { credentials: 'same-origin' }); } catch (_) {}
+          }
+        };
+        const poolSize = Math.min(6, pending.length);
+        await Promise.all(Array.from({ length: poolSize }, drain));
+        await loadEndpoints();
+        if (uiModule && uiModule.showToast) uiModule.showToast('Endpoint status refreshed', 1800);
+      } finally {
+        probeAllBtn.innerHTML = savedMarkup;
+        probeAllBtn.disabled = false;
+      }
+    });
+  }
+
+  const clearOfflineBtn = el('adm-epClearOfflineBtn');
+  if (clearOfflineBtn) {
+    clearOfflineBtn.addEventListener('click', async () => {
+      const offlineBtns = Array.from(document.querySelectorAll('[data-adm-del-ep][data-adm-ep-online="0"]'));
+      const ids = offlineBtns.map(b => b.getAttribute('data-adm-del-ep')).filter(Boolean);
+      if (!ids.length) {
+        if (uiModule && uiModule.showToast) uiModule.showToast('No offline endpoints — nothing to clear', 1800);
+        return;
+      }
+      const confirmMsg = ids.length === 1 ? 'Remove 1 offline endpoint?' : `Remove ${ids.length} offline endpoints?`;
+      if (uiModule && uiModule.styledConfirm) {
+        const ok = await uiModule.styledConfirm(confirmMsg, { confirmText: 'Remove', danger: true });
+        if (!ok) return;
+      } else if (!confirm(confirmMsg)) {
+        return;
+      }
+      clearOfflineBtn.disabled = true;
+      try {
+        // Optimistic UI: pull rows immediately, then fire the DELETEs.
+        offlineBtns.forEach(b => b.closest('[data-adm-ep-id]')?.remove());
+        // Failed DELETEs are swallowed; loadEndpoints() below refreshes the list afterwards.
+        await Promise.all(ids.map(id =>
+          fetch('/api/model-endpoints/' + id, { method: 'DELETE', credentials: 'same-origin' }).catch(() => {})
+        ));
+        try { await loadEndpoints(); } catch (_) {}
+        _refreshOfflineCount();
+        if (uiModule && uiModule.showToast) uiModule.showToast(`Removed ${ids.length} offline endpoint${ids.length === 1 ? '' : 's'}`, 1800);
+      } finally {
+        // Nothing else in this handler re-enables the button, so without this
+        // it would stay disabled after the first clear until a page reload.
+        clearOfflineBtn.disabled = false;
+      }
+    });
+  }
+
+  // Clear-on-focus for the API key inputs. The fields are type=password so the
+  // value is masked; users can't see what's there to edit it in place, so the
+  // expected gesture is "click in, type new key". Wiping on focus removes the
+  // select-all-and-delete dance.
+  const _wireClearOnFocus = (id) => {
+    // A missing input is skipped silently via optional chaining.
+    el(id)?.addEventListener('focus', (evt) => {
+      const field = evt.currentTarget;
+      if (field.value !== '') field.value = '';
+    });
+  };
+  _wireClearOnFocus('adm-epLocalApiKey');
+  _wireClearOnFocus('adm-epApiKey');
+
+  // Drop the Ollama provider logo into the Ollama Quickstart button. Reuses
+  // the same SVG the provider picker uses, so brand parity stays free.
+  try {
+    const _ollamaLogoSlot = document.querySelector('#adm-epOllamaBtn .adm-ollama-logo');
+    if (_ollamaLogoSlot) {
+      const svg = providerLogo('ollama') || '';
+      if (svg) _ollamaLogoSlot.innerHTML = svg;
+    }
+  } catch (_) {}
+
   // Local "Add" button — sibling form for self-hosted base URLs.
   const localAddBtn = el('adm-epLocalAddBtn');
   const localTestBtn = el('adm-epLocalTestBtn');
@@ -2073,17 +2211,28 @@ async function loadTokens() {
 }
 
 function initTokenForm() {
-  el('adm-tokenAddBtn').addEventListener('click', async () => {
+  const addBtn = el('adm-tokenAddBtn');
+  if (!addBtn || addBtn.dataset.bound) return;
+  addBtn.dataset.bound = '1';
+  addBtn.addEventListener('click', async () => {
     const msg = el('adm-tokenMsg');
     const reveal = el('adm-tokenReveal');
     msg.textContent = ''; msg.className = ''; reveal.style.display = 'none';
     const name = el('adm-tokenName').value.trim();
     if (!name) { msg.textContent = 'Token name is required'; msg.className = 'admin-error'; return; }
     const fd = new FormData(); fd.append('name', name);
+    const scopes = (el('adm-tokenScopes')?.value || '').trim();
+    if (scopes) fd.append('scopes', scopes);
     try {
       const res = await fetch('/api/tokens', { method: 'POST', body: fd, credentials: 'same-origin' });
       const data = await res.json();
-      if (res.ok) { el('adm-tokenValue').textContent = data.token; reveal.style.display = ''; el('adm-tokenName').value = ''; loadTokens(); }
+      if (res.ok) {
+        el('adm-tokenValue').textContent = data.token;
+        reveal.style.display = '';
+        el('adm-tokenName').value = '';
+        if (el('adm-tokenScopes')) el('adm-tokenScopes').value = '';
+        loadTokens();
+      }
       else { msg.textContent = data.detail || 'Failed'; msg.className = 'admin-error'; }
     } catch (e) { msg.textContent = 'Request failed'; msg.className = 'admin-error'; }
   });
@@ -2344,7 +2493,7 @@ function initDangerZone() {
    ═══════════════════════════════════════════ */
 function initAll() {
   modalEl = el('settings-modal');
-  const inits = [initSignupToggle, initAddUser, initEndpointForm, initMcpForm, initCalDAV, initBackup, initDangerZone, () => settingsModule.initIntegrations()];
+  const inits = [initSignupToggle, initAddUser, initEndpointForm, initMcpForm, initCalDAV, initBackup, initDangerZone, initTokenForm, () => settingsModule.initIntegrations()];
   for (const fn of inits) {
     try { fn(); } catch (e) { console.error('Admin init error in', fn.name || 'anonymous', e); }
   }
@@ -2357,6 +2506,7 @@ function refreshAll() {
   loadEndpoints();
   loadBuiltinTools();
   loadMcpServers();
+  loadTokens();
 }
 
 /* ═══════════════════════════════════════════
diff --git a/static/js/chatRenderer.js b/static/js/chatRenderer.js
index fc7ed1aeb..9a5c6f78b 100644
--- a/static/js/chatRenderer.js
+++ b/static/js/chatRenderer.js
@@ -2118,6 +2118,28 @@ export function addMessage(role, content, modelName, metadata) {
       return lastWrap;
     }
 
+    // --- Wake-task / supervisor system check-in ---
+    // The self-wake mechanism injects "Did you finish?" as a user message
+    // (or persisted history shows a "[Task] Self-check: <id>" envelope)
+    // so the agent loop re-enters and re-checks status. These messages
+    // are an internal control signal rather than conversation content,
+    // so they are detected here (via metadata flags or the persisted
+    // "[Task] Self-check:" prefix) and not rendered at all — no bubble
+    // and no system chip.
+    let _isWakeCheck = !!(metadata?.wake_check_in || metadata?.hidden_from_user_view);
+    if (!_isWakeCheck && typeof textRaw === 'string') {
+      // Also catch historical messages persisted as "[Task] Self-check: <sid>"
+      // (older wake tasks that didn't set wake_check_in metadata).
+      if (/^\s*\[Task\]\s+Self-check:/i.test(textRaw)) {
+        _isWakeCheck = true;
+      }
+    }
+    if (_isWakeCheck) {
+      // Supervisor self-check messages are an internal control signal —
+      // skip rendering entirely so they don't show up in the conversation.
+      return null;
+    }
+
     // --- Standard single-bubble message ---
     const wrap = document.createElement('div');
     wrap.className = 'msg ' + (role === 'user' ? 'msg-user' : 'msg-ai');
diff --git a/static/js/cookbook-diagnosis.js b/static/js/cookbook-diagnosis.js
index 19512ab50..24d5770e7 100644
--- a/static/js/cookbook-diagnosis.js
+++ b/static/js/cookbook-diagnosis.js
@@ -610,12 +610,47 @@ export function _showDiagnosis(panel, diagnosis, sourceText) {
     ? `Suggested action: ${fixes[0].label}.`
     : 'Suggested action: copy the error and adjust the serve settings.');
 
-  // Simplified diagnosis card: just the error message + suggestion + fix
-  // button(s). Removed the fold toggle, copy button, and × dismiss — they
-  // made the card noisy without earning their keep. _diagCollapsed is kept
-  // as a stub so callers don't have to change.
   panel._diagCollapsed = false;
 
+  // Top-right toolbar: Copy bundle + × dismiss. Restored after user feedback
+  // — without them there's no way to quietly close a stale diagnosis or grab
+  // the full error+context for a forum/discord paste.
+  const toolbar = document.createElement('div');
+  toolbar.className = 'cookbook-diag-toolbar';
+  toolbar.style.cssText = 'display:flex;justify-content:flex-end;align-items:center;gap:4px;margin-bottom:-2px;';
+
+  const copyBtn = document.createElement('button');
+  copyBtn.type = 'button';
+  copyBtn.className = 'cookbook-diag-copy';
+  copyBtn.title = 'Copy diagnosis details';
+  copyBtn.setAttribute('aria-label', 'Copy diagnosis');
+  copyBtn.innerHTML = '<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect x="9" y="9" width="13" height="13" rx="2" ry="2"/><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"/></svg>';
+  copyBtn.addEventListener('click', async (e) => {
+    e.stopPropagation();
+    const bundle = _diagnosisCopyBundle(task, diagnosis, sourceText, suggestionText);
+    try {
+      await navigator.clipboard.writeText(bundle);
+      copyBtn.classList.add('copied');
+      setTimeout(() => { if (copyBtn.isConnected) copyBtn.classList.remove('copied'); }, 1200);
+    } catch (_) {}
+  });
+
+  const dismissBtn = document.createElement('button');
+  dismissBtn.type = 'button';
+  dismissBtn.className = 'cookbook-diag-dismiss';
+  dismissBtn.title = 'Dismiss diagnosis';
+  dismissBtn.setAttribute('aria-label', 'Dismiss');
+  dismissBtn.textContent = '×';
+  dismissBtn.addEventListener('click', (e) => {
+    e.stopPropagation();
+    panel._diagDismissed = diagnosis.message;
+    _clearDiagnosis(panel);
+  });
+
+  toolbar.appendChild(copyBtn);
+  toolbar.appendChild(dismissBtn);
+  diag.appendChild(toolbar);
+
   const body = document.createElement('div');
   body.className = 'cookbook-diag-body';
   const msg = document.createElement('div');
diff --git a/static/js/cookbook-hwfit.js b/static/js/cookbook-hwfit.js
index 74571bae9..d8652d02e 100644
--- a/static/js/cookbook-hwfit.js
+++ b/static/js/cookbook-hwfit.js
@@ -416,9 +416,11 @@ function _hwfitShowError(list, host, detail) {
   if (rb) rb.addEventListener('click', () => { _resetGpuToggleState(); _hwfitFetch(true); });
 }
 
-// Client-side "Engine" filter (llama.cpp / vLLM / SGLang). Empty = show all.
-// Uses the same _detectBackend() the serve commands use, so what you filter to
-// is exactly what would be launched. Pure view filter — no refetch needed.
+// Client-side "Engine" filter (llama.cpp / vLLM / SGLang / Ollama). Empty =
+// show all. Uses the same _detectBackend() the serve commands use, so what you
+// filter to is exactly what would be launched. Pure view filter — no refetch
+// needed. Ollama rows are merged into the main list (see _ensureOllamaLib +
+// _ollamaToHwfitRows below) so the filter handles all engines uniformly.
 function _applyEngineFilter(models) {
   const want = document.getElementById('hwfit-engine')?.value || '';
   if (!want || !Array.isArray(models)) return models || [];
@@ -427,6 +429,86 @@ function _applyEngineFilter(models) {
   });
 }
 
+// Ollama library cache (per-page). Filled lazily on first _hwfitFetch; holds
+// the `models` array from /api/cookbook/ollama/library. Only a successful
+// response is cached: a failed fetch resolves to [] for this call but leaves
+// the cache unset so the next scan retries instead of hiding Ollama rows.
+let _ollamaLibCache = null;
+async function _ensureOllamaLib() {
+  if (_ollamaLibCache) return _ollamaLibCache;
+  try {
+    const res = await fetch('/api/cookbook/ollama/library');
+    const data = res.ok ? await res.json() : null;
+    if (Array.isArray(data?.models)) _ollamaLibCache = data.models;
+  } catch { /* network or JSON parse failure — retried on the next call */ }
+  return _ollamaLibCache || [];
+}
+
+// Convert an Ollama library entry's sizes into per-tag hwfit rows. Shape
+// matches what _hwfitRenderList expects (fit_level, parameter_count,
+// required_gb, score, …) so the rows render identically to HF results.
+function _olParseSize(s) {
+  // Returns billions of parameters, or null when the tag carries no size:
+  // "14b" → 14, "1.5b" → 1.5, "8x7b" → 56 (rough), "135m" → 0.135, "latest" → null
+  if (!s) return null;
+  const tag = s.toLowerCase();
+  const moe = /^(\d+(?:\.\d+)?)x(\d+(?:\.\d+)?)b$/.exec(tag);
+  if (moe) return parseFloat(moe[1]) * parseFloat(moe[2]);
+  const billions = /^(\d+(?:\.\d+)?)b$/.exec(tag);
+  if (billions) return parseFloat(billions[1]);
+  const millions = /^(\d+(?:\.\d+)?)m$/.exec(tag);
+  return millions ? parseFloat(millions[1]) / 1000 : null;
+}
+function _ollamaToHwfitRows(libModels, vramAvail, ramAvail) {
+  if (!Array.isArray(libModels)) return [];
+  // Fit tiers: ≤60% of VRAM → perfect, ≤ VRAM → good, spills but fits RAM →
+  // marginal, otherwise too_tight. Unknown size (0 GB) is always no_fit.
+  const fitFor = (needGb) => {
+    if (!needGb) return 'no_fit';
+    const fitsRam = !!ramAvail && needGb <= ramAvail;
+    if (!vramAvail) return fitsRam ? 'marginal' : 'no_fit';
+    if (needGb <= vramAvail * 0.6) return 'perfect';
+    if (needGb <= vramAvail) return 'good';
+    return fitsRam ? 'marginal' : 'too_tight';
+  };
+  const labelFor = (b) => {
+    if (!b) return '?';
+    if (b < 1) return (b * 1000).toFixed(0) + 'M';
+    return b.toFixed(b >= 10 ? 0 : 1) + 'B';
+  };
+  const rows = [];
+  libModels.forEach((entry) => {
+    const tags = (Array.isArray(entry.sizes) && entry.sizes.length) ? entry.sizes : ['latest'];
+    tags.forEach((sizeTag) => {
+      const billions = _olParseSize(sizeTag);
+      // Ollama default GGUF is ~Q4_K_M. Rough VRAM estimate: 0.6 GB / B.
+      const needGb = billions ? billions * 0.6 : 0;
+      const fullTag = `${entry.name}:${sizeTag}`;
+      rows.push({
+        name: fullTag,
+        repo_id: fullTag,
+        quant: 'Q4_K_M',
+        parameter_count: labelFor(billions),
+        params_b: billions || 0,
+        required_gb: needGb,
+        fit_level: fitFor(needGb),
+        // Modest score (25 when size unknown, else 30 + 0.3 per B, capped at
+        // 60), intended to keep these rows below well-scored HF results.
+        score: billions ? Math.min(30 + billions * 0.3, 60) : 25,
+        speed_tps: 0,
+        context: 0,
+        is_gguf: true,
+        backend: 'ollama',
+        _isOllama: true,
+        _olName: entry.name,
+        _olSize: sizeTag,
+        _description: entry.description || '',
+      });
+    });
+  });
+  return rows;
+}
+
 export async function _hwfitFetch(fresh = false) {
   const _tk = ++_hwfitFetchToken;
   const useCase = document.getElementById('hwfit-usecase')?.value || '';
@@ -475,7 +557,12 @@ export async function _hwfitFetch(fresh = false) {
     _setLastCacheHost(remoteKey);
     const _cacheSrv = _serverByVal(_envState.remoteServerKey || remoteHost);
     const _cachePort = _cacheSrv?.port || '';
-    const _cacheParams = new URLSearchParams({ host: remoteHost }); if (_cachePort) _cacheParams.set('ssh_port', _cachePort); if (_cacheSrv?.platform) _cacheParams.set('platform', _cacheSrv.platform);
+    const _cacheParams = new URLSearchParams();
+    if (remoteHost) {
+      _cacheParams.set('host', remoteHost);
+      if (_cachePort) _cacheParams.set('ssh_port', _cachePort);
+      if (_cacheSrv?.platform) _cacheParams.set('platform', _cacheSrv.platform);
+    }
     fetch(`/api/model/cached?${_cacheParams}`, { credentials: 'same-origin' })
       .then(r => r.json())
       .then(d => {
@@ -543,7 +630,18 @@ export async function _hwfitFetch(fresh = false) {
     // A newer scan started while this one was in flight (user switched servers
     // mid-probe) — drop this stale response so it can't clobber the new one.
     if (_tk !== _hwfitFetchToken) { try { wp.destroy(); } catch {} return; }
-    if (!res.ok) throw new Error(res.statusText);
+    if (!res.ok) {
+      const body = await res.text().catch(() => '');
+      let msg = '';
+      try {
+        const payload = JSON.parse(body);
+        msg = payload && (payload.detail || payload.error || payload.message);
+      } catch {
+        msg = body;
+      }
+      msg = typeof msg === 'string' ? msg.trim() : '';
+      throw new Error(`HTTP ${res.status} ${res.statusText}${msg ? `: ${msg}` : ''}`);
+    }
     let data = await res.json();
     if (_tk !== _hwfitFetchToken) { try { wp.destroy(); } catch {} return; }
     if (!isImageMode && quantPref && !data.error && Array.isArray(data.models) && data.models.length === 0) {
@@ -583,6 +681,23 @@ export async function _hwfitFetch(fresh = false) {
       if (!_cached) { _hwfitShowError(list, remoteHost, data.error); if (hw) hw.innerHTML = ''; }
       return;
     }
+    // Merge Ollama library rows into the main list so they appear with the
+    // same Fit/Param/Quant/VRAM/Mode columns as HF results and respond to the
+    // Engine filter. Skipped in image-gen mode (Ollama doesn't serve diffusers).
+    if (!isImageMode) {
+      const _vramAvail = data.system?.gpu_vram_gb || 0;
+      const _ramAvail = data.system?.total_ram_gb || 0;
+      const _lib = await _ensureOllamaLib();
+      const _olRows = _ollamaToHwfitRows(_lib, _vramAvail, _ramAvail);
+      // Search filter on Ollama rows: HF API already filters by search; do the
+      // same client-side over Ollama name + description so the search box
+      // works consistently across both sources.
+      const _s = (search || '').trim().toLowerCase();
+      const _olFiltered = _s
+        ? _olRows.filter(r => r.name.toLowerCase().includes(_s) || (r._description || '').toLowerCase().includes(_s))
+        : _olRows;
+      data.models = (data.models || []).concat(_olFiltered);
+    }
     _hwfitCache = data;
     _hwfitRenderHw(hw, data.system);
     // Propagate local platform from hardware probe so _isWindows(task) works
@@ -964,14 +1079,36 @@ export function _hwfitRenderList(el, models) {
     html += `</div>`;
   }
   el.innerHTML = html;
-  // Click row → expand inline action panel
+  // Click row → expand inline action panel. Exception: Ollama rows skip the
+  // expand panel (no HF metadata to power it) and just fill the Download
+  // input with the `<name>:<size>` tag — one click → ready to pull.
   el.querySelectorAll('.hwfit-row:not(.hwfit-header)').forEach(row => {
     row.addEventListener('click', () => {
       const name = row.dataset.model;
       if (!name) return;
-      // Find model data from cache
       const modelData = (_hwfitCache?.models || []).find(m => m.name === name);
       if (!modelData) return;
+      if (modelData._isOllama) {
+        // Force-open the Download card if it's been collapsed — otherwise
+        // filling the (hidden) input silently swallows the click.
+        const dlBody = document.getElementById('cookbook-download-card-body');
+        const dlArrow = document.getElementById('cookbook-download-card-arrow');
+        if (dlBody && dlBody.style.display === 'none') {
+          dlBody.style.display = 'block';
+          if (dlArrow) dlArrow.style.transform = 'rotate(90deg)';
+        }
+        const dlInput = document.getElementById('cookbook-dl-repo');
+        if (dlInput) {
+          dlInput.value = modelData.name;
+          dlInput.focus();
+          // Briefly highlight so the user sees what got filled even when the
+          // download card sits far above the (long) hwfit list.
+          dlInput.classList.add('cookbook-dl-flash');
+          setTimeout(() => dlInput.classList.remove('cookbook-dl-flash'), 800);
+          dlInput.scrollIntoView({ behavior: 'smooth', block: 'center' });
+        }
+        return;
+      }
       _expandModelRow(row, modelData);
     });
   });
@@ -1297,7 +1434,7 @@ export function _hwfitInit() {
   if (sort) sort.addEventListener('change', () => _hwfitFetch());
   if (qpref) qpref.addEventListener('change', () => _hwfitFetch());
   // Engine filter is a pure client-side view filter over the already-fetched
-  // list, so just re-render from cache instead of re-probing hardware.
+  // list (HF + Ollama merged), so just re-render from cache.
   const engine = document.getElementById('hwfit-engine');
   if (engine) engine.addEventListener('change', () => {
     const list = document.getElementById('hwfit-list');
@@ -1694,6 +1831,15 @@ export function _hwfitInit() {
       saveBtn.addEventListener('click', () => {
         _syncServers();
         _rebuildServerSelect();
+        // Broadcast for anything outside the settings tab that depends on
+        // the server list (Serve dialog host picker, Running tasks, etc.).
+        // Without this the user had to hard-refresh to see the new entry
+        // in those other places.
+        try {
+          document.dispatchEvent(new CustomEvent('cookbook:servers-changed', {
+            detail: { servers: _envState.servers.slice() },
+          }));
+        } catch (_) {}
         saveBtn.classList.add('saved');
         saveBtn.innerHTML = '<svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="#50fa7b" stroke-width="2.6" stroke-linecap="round" stroke-linejoin="round" style="margin-right:4px;flex-shrink:0;"><polyline points="20 6 9 17 4 12"/></svg>Saved';
       });
@@ -1713,6 +1859,11 @@ export function _hwfitInit() {
       entry.remove();
       _syncServers();
       _rebuildServerSelect();
+      try {
+        document.dispatchEvent(new CustomEvent('cookbook:servers-changed', {
+          detail: { servers: _envState.servers.slice() },
+        }));
+      } catch (_) {}
       _hwfitCache = null;
       _hwfitFetch();
     });
diff --git a/static/js/cookbook.js b/static/js/cookbook.js
index c1395179c..03319d0de 100644
--- a/static/js/cookbook.js
+++ b/static/js/cookbook.js
@@ -72,7 +72,7 @@ function _platformIcon(platform) {
   return '';
 }
 
-export let _envState = { env: 'none', envPath: '', hfToken: '', hfTokenConfigured: false, hfTokenMasked: '', gpus: '', remoteHost: '', remoteServerKey: '', servers: [], modelPaths: [], platform: '', defaultServer: '' };
+export let _envState = { env: 'none', envPath: '', hfToken: '', hfTokenConfigured: false, hfTokenMasked: '', gpus: '', remoteHost: '', servers: [], modelPaths: [], platform: '', defaultServer: '' };
 let _lastCacheHostVal = null;
 let _cookbookOpeningSpinners = [];
 export function _lastCacheHost() { return _lastCacheHostVal; }
@@ -89,8 +89,8 @@ function _setCookbookOpening(on) {
   ].filter(Boolean);
   if (!on) {
     _cookbookOpeningSpinners.forEach(({ spinner, wrap, target }) => {
-      try { spinner?.stop?.(); } catch { }
-      try { wrap?.remove?.(); } catch { }
+      try { spinner?.stop?.(); } catch {}
+      try { wrap?.remove?.(); } catch {}
       target?.classList?.remove('cookbook-opening');
     });
     _cookbookOpeningSpinners = [];
@@ -114,44 +114,18 @@ function _setCookbookOpening(on) {
 // True for the local server entry (empty / "local" / "localhost" host).
 function _isLocalEntry(s) { return !s || !s.host || s.host === 'local' || s.host.toLowerCase() === 'localhost'; }
 
-// Resolve a dropdown option value to a server entry. New option values are
-// stable per-profile keys, so same-host SSH profiles stay distinguishable.
-// Host strings and numeric indices remain accepted for stale saved state.
-export function _serverKey(s) {
-  if (_isLocalEntry(s)) return 'local';
-  return 'srv:' + [
-    s?.name || '',
-    s?.host || '',
-    s?.port || '',
-    s?.envPath || '',
-    s?.platform || '',
-  ].map(v => encodeURIComponent(String(v).trim())).join('|');
-}
-
+// Resolve a dropdown option value to a server entry. Option values are the
+// stable HOST string ('local' for the local box) — NOT array indices — because
+// `_envState.servers` gets deduped/reordered, which made index-based selection
+// silently resolve to the wrong (or local) server. Accepts a numeric index too
+// for backwards-compat with any stale value.
 function _serverByVal(val) {
   if (val == null || val === 'local' || val === '') return null;
-  const raw = String(val);
-  let s = _envState.servers.find(x => _serverKey(x) === raw);
-  if (!s) s = _envState.servers.find(x => x.host === raw);
+  let s = _envState.servers.find(x => x.host === val);
   if (!s && /^\d+$/.test(String(val))) s = _envState.servers[parseInt(val)];
   return s || null;
 }
 
-export function _selectedServer() {
-  if (_envState.remoteServerKey) {
-    const keyed = _serverByVal(_envState.remoteServerKey);
-    if (keyed) return keyed;
-  }
-  if (_envState.remoteHost) return _envState.servers.find(s => s.host === _envState.remoteHost) || null;
-  return null;
-}
-
-export function _currentServerValue() {
-  const selected = _selectedServer();
-  if (selected) return _serverKey(selected);
-  return _envState.remoteHost || 'local';
-}
-
 function _buildServerOpts(excludeLocal = false) {
   // The local server is ALWAYS represented by the synthetic value="local" option
   // (showing its custom name from the "server name" feature). We must therefore
@@ -160,20 +134,13 @@ function _buildServerOpts(excludeLocal = false) {
   const _localSrv = _localIdx >= 0 ? _envState.servers[_localIdx] : null;
   const _localLabel = (_localSrv && _localSrv.name) ? _localSrv.name : 'Local';
   let html = `<option value="local"${!_envState.remoteHost ? ' selected' : ''}>${esc(_localLabel)}</option>`;
-  const selectedKey = _envState.remoteServerKey || '';
-  let legacyHostSelected = false;
   for (let i = 0; i < _envState.servers.length; i++) {
     const s = _envState.servers[i];
     if (i === _localIdx) continue;                 // already the synthetic "local" option
     if (excludeLocal && _isLocalEntry(s)) continue;
     const label = s.name || s.host || `Server ${i + 1}`;
-    const value = _serverKey(s);
-    let selected = selectedKey ? value === selectedKey : false;
-    if (!selectedKey && _envState.remoteHost === s.host && !legacyHostSelected) {
-      selected = true;
-      legacyHostSelected = true;
-    }
-    html += `<option value="${esc(value)}"${selected ? ' selected' : ''}>${esc(label)}</option>`;
+    const selected = _envState.remoteHost === s.host ? ' selected' : '';
+    html += `<option value="${esc(s.host)}"${selected}>${esc(label)}</option>`;
   }
   return html;
 }
@@ -187,41 +154,16 @@ export function _sshCmd(host, cmd, port) {
 /** Get SSH port for a given host (or task object) */
 function _getPort(hostOrTask) {
   if (!hostOrTask) return '';
-  if (typeof hostOrTask === 'object') return hostOrTask.sshPort || _getPort(hostOrTask.remoteServerKey || hostOrTask.remoteHost);
-  const selected = hostOrTask === _envState.remoteHost ? _selectedServer() : null;
-  const srv = selected || _serverByVal(hostOrTask);
+  if (typeof hostOrTask === 'object') return hostOrTask.sshPort || _getPort(hostOrTask.remoteHost);
+  const srv = _envState.servers.find(s => s.host === hostOrTask);
   return srv?.port || '';
 }
 
 /** Get platform for a given host (or task object). Returns 'windows', 'termux', 'linux', or '' */
 export function _getPlatform(hostOrTask) {
-  const isWinBrowser = (window.navigator.userAgent || window.navigator.platform || '').toLowerCase().includes('win');
-  // The browser's OS is NOT the server's OS when the UI is opened remotely —
-  // e.g. a Windows browser driving a Mac/Linux homeserver. Trusting the
-  // user-agent there makes the serve builder emit the Windows python-only
-  // shape (`python -m llama_cpp.server`, no `llama-server ||` fallback), which
-  // then fails on the actual Unix server. The local hardware probe is
-  // authoritative: it reports a backend (metal/cuda/rocm/cpu_*) for any Unix
-  // server and carries platform:"windows" for local Windows (which sets
-  // _envState.platform, short-circuiting below). So only fall back to the
-  // browser hint when we have no server-side signal at all.
-  const localPlatform = () => {
-    if (_envState.platform) return _envState.platform;
-    if (String(_hwfitCache?.system?.backend || '')) return '';
-    return isWinBrowser ? 'windows' : '';
-  };
-  if (!hostOrTask || hostOrTask === 'local') {
-    return localPlatform();
-  }
-  if (typeof hostOrTask === 'object') {
-    const h = hostOrTask.remoteHost;
-    if (!h || h === 'local') {
-      return hostOrTask.platform || localPlatform();
-    }
-    return hostOrTask.platform || _getPlatform(hostOrTask.remoteServerKey || h);
-  }
-  const selected = hostOrTask === _envState.remoteHost ? _selectedServer() : null;
-  const srv = selected || _serverByVal(hostOrTask);
+  if (!hostOrTask) return _envState.platform || '';
+  if (typeof hostOrTask === 'object') return hostOrTask.platform || _getPlatform(hostOrTask.remoteHost);
+  const srv = _envState.servers.find(s => s.host === hostOrTask);
   return srv?.platform || '';
 }
 
@@ -237,19 +179,6 @@ export function _isMetal() {
   return ['metal', 'mps', 'apple'].includes(String(_hwfitCache?.system?.backend || '').toLowerCase());
 }
 
-const GEMMA4_THINKING_CHAT_TEMPLATE = `{% for message in messages %}{% if message['role'] == 'system' %}<|turn>system\n<|think|>{{ message['content'] }}<turn|>\n{% elif message['role'] == 'user' %}<|turn>user\n{{ message['content'] }}<turn|>\n{% elif message['role'] == 'assistant' %}<|turn>model\n{{ message['content'] }}<turn|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|turn>model\n<|channel>thought{% endif %}`;
-
-function _isGemma4ThinkingModel(modelName) {
-  const n = (modelName || '').toLowerCase();
-  return n.includes('gemma-4') || n.includes('gemma4');
-}
-
-function _gemma4ThinkingChatTemplateArg(modelName) {
-  return _isGemma4ThinkingModel(modelName)
-    ? _shellQuote(GEMMA4_THINKING_CHAT_TEMPLATE)
-    : '';
-}
-
 /** Detect model-specific vLLM optimizations */
 function _detectModelOptimizations(modelName) {
   const n = (modelName || '').toLowerCase();
@@ -326,7 +255,10 @@ export function _detectToolParser(modelName) {
 // ── Backend detection ──
 
 export function _detectBackend(model) {
-  if (model?.backend === 'ollama' || model?.is_ollama) {
+  const _ollamaName = String(model?.repo_id || model?.name || model?.id || '').trim();
+  const _ollamaMeta = `${model?.backend || ''} ${model?.endpoint_kind || ''} ${model?.provider || ''} ${model?.source || ''}`.toLowerCase();
+  const _looksLikeOllamaTag = /^[A-Za-z0-9][A-Za-z0-9._-]*(?::[A-Za-z0-9][A-Za-z0-9._-]*)$/.test(_ollamaName);
+  if (model?.backend === 'ollama' || model?.is_ollama || _ollamaMeta.includes('ollama') || _looksLikeOllamaTag) {
     return { backend: 'ollama', label: 'Ollama' };
   }
   const q = (model.quant || '').toUpperCase();
@@ -450,8 +382,6 @@ export function _buildServeCmd(f, modelName, backend) {
     const _extraEnv = (f.extra_env ?? '').toString().replace(/\s+/g, ' ').trim();
     if (_extraEnv) cmd += _extraEnv + ' ';
     cmd += `${_vllmBin} serve ${modelName} --host 0.0.0.0 --port ${f.port || '8000'}`;
-    const _gemma4ChatTemplate = _gemma4ThinkingChatTemplateArg(modelName);
-    if (_gemma4ChatTemplate) cmd += ` --chat-template ${_gemma4ChatTemplate}`;
     cmd += ` --tensor-parallel-size ${f.tp || '1'}`;
     cmd += ` --max-model-len ${f.ctx || '8192'}`;
     cmd += ` --gpu-memory-utilization ${f.gpu_mem || '0.90'}`;
@@ -482,8 +412,6 @@ export function _buildServeCmd(f, modelName, backend) {
     const _extraEnv = (f.extra_env ?? '').toString().replace(/\s+/g, ' ').trim();
     if (_extraEnv) cmd += _extraEnv + ' ';
     cmd += `${_py3Bin} -m sglang.launch_server --model-path ${modelName} --host 0.0.0.0 --port ${f.port || '30000'}`;
-    const _gemma4ChatTemplate = _gemma4ThinkingChatTemplateArg(modelName);
-    if (_gemma4ChatTemplate) cmd += ` --chat-template ${_gemma4ChatTemplate}`;
     if (f.tp && f.tp !== '1') cmd += ` --tp ${f.tp}`;
     if (f.ctx) cmd += ` --context-length ${f.ctx}`;
     if (f.gpu_mem && f.gpu_mem !== '0.90') cmd += ` --mem-fraction-static ${f.gpu_mem}`;
@@ -585,9 +513,34 @@ export function _buildServeCmd(f, modelName, backend) {
     }
   } else if (backend === 'ollama') {
     const ollamaPort = f.port || '11434';
-    const bindHost = _envState.remoteHost ? '0.0.0.0' : '127.0.0.1';
-    const hostEnv = ollamaPort !== '11434' ? `OLLAMA_HOST=${bindHost}:${ollamaPort} ` : '';
-    cmd = `${hostEnv}ollama serve`;
+    // GGUF + Ollama: delegate to the iGPU-bound ollama-test container via
+    // its /usr/local/bin/ollama-import helper. Plain `ollama serve` errors
+    // 127 on hosts where ollama isn't on PATH (and even when it is, it
+    // doesn't import the GGUF — it just starts the daemon). Args are all
+    // literal so the cookbook validator (which bans &&/||/;/$() ) is
+    // happy: `docker exec ollama-test ollama-import <repo> <name> <ctx>
+    // <file>`. The helper handles the find/Modelfile/preload dance.
+    if (modelName.includes('/') && (f.gguf_file || /-GGUF$/i.test(modelName))) {
+      // HF-GGUF repo → import + preload + tail
+      const _name = (modelName.split('/').pop() || modelName)
+        .replace(/-GGUF$/i, '')
+        .toLowerCase()
+        .replace(/[^a-z0-9._:-]+/g, '-')
+        .replace(/^-+|-+$/g, '');
+      const _ctx = f.ctx || '8192';
+      const _file = (f.gguf_file || '').split('/').pop() || '';
+      // Trailing GGUF_FILE is optional; helper picks the first match if empty.
+      cmd = `docker exec ollama-test ollama-import ${modelName} ${_name} ${_ctx}${_file ? ' ' + _file : ''}`;
+    } else if (!modelName.includes('/') && modelName) {
+      // Already-pulled Ollama tag (e.g. `qwen2.5:7b`). On kierkegaard the
+      // runtime is the ROCm Ollama sidecar; this quick command verifies the
+      // tag exists, then the backend auto-registers http://host.docker.internal:11434/v1.
+      cmd = `docker exec ollama-rocm ollama show ${modelName}`;
+    } else {
+      const bindHost = _envState.remoteHost ? '0.0.0.0' : '127.0.0.1';
+      const hostEnv = ollamaPort !== '11434' ? `OLLAMA_HOST=${bindHost}:${ollamaPort} ` : '';
+      cmd = `${hostEnv}ollama serve`;
+    }
   } else if (backend === 'diffusers') {
     const gpuStr = f.gpus?.trim();
     if (gpuStr) cmd += `CUDA_VISIBLE_DEVICES=${gpuStr} `;
@@ -630,7 +583,7 @@ function _fallbackCopy(text) {
   ta.style.cssText = 'position:fixed;left:-9999px;top:-9999px';
   document.body.appendChild(ta);
   ta.select();
-  try { document.execCommand('copy'); } catch (_) { }
+  try { document.execCommand('copy'); } catch (_) {}
   document.body.removeChild(ta);
   return Promise.resolve();
 }
@@ -663,7 +616,7 @@ function _readStoredEnvState() {
 
 export function _persistEnvState() {
   try { localStorage.setItem(LAST_STATE_KEY, JSON.stringify(_envStateForStorage())); }
-  catch (_) { }
+  catch (_) {}
   _saveTasks(_loadTasks());
 }
 
@@ -712,24 +665,22 @@ async function _fetchDependencies() {
     const data = await resp.json();
     const pkgs = data.packages || [];
     if (!pkgs.length) { list.innerHTML = '<div class="hwfit-loading">No packages found</div>'; return; }
-    const _winUnsupported = new Set(['vllm', 'rembg', 'gfpgan']);
+    const _winUnsupported = new Set(['diffusers', 'hf_transfer', 'vllm', 'rembg', 'gfpgan']);
 
     const _statusTag = (pkg, isLocal, isSystemDep, winBlocked) => {
       if (winBlocked) return `<span class="cookbook-dep-tag cookbook-dep-na">N/A</span>`;
-      const hasCustomInstall = !!pkg.install_cmd;
-      const hasCustomUpdate = !!pkg.update_cmd;
-      if (pkg.installed && isSystemDep && !hasCustomUpdate) return `<span class="cookbook-dep-tag cookbook-dep-installed" title="Found on selected server">Installed</span>`;
-      if (pkg.installed && pkg.pip_update_available === false && !hasCustomUpdate) {
+      if (pkg.installed && isSystemDep) return `<span class="cookbook-dep-tag cookbook-dep-installed" title="Found on selected server">Installed</span>`;
+      if (pkg.installed && pkg.pip_update_available === false) {
         const tip = esc(pkg.update_note || pkg.status_note || 'Found externally; update outside Odysseus.');
         return `<span class="cookbook-dep-tag cookbook-dep-installed" title="${tip}">Installed</span>`;
       }
       if (pkg.installed) return `<button class="cookbook-dep-tag cookbook-dep-installed cookbook-dep-installed-btn" title="Installed — click for actions"><span class="cookbook-dep-installed-label">Installed</span><span class="cookbook-dep-caret">&#9662;</span></button>`;
-      if (isSystemDep && !hasCustomInstall) {
+      if (isSystemDep) {
         const depTip = esc(pkg.install_hint || 'Install this OS package on the selected server.');
         const depLabel = pkg.applicable === false ? 'N/A ?' : 'Missing';
         return `<span class="cookbook-dep-tag cookbook-dep-na" title="${depTip}">${depLabel}</span>`;
       }
-      return `<button class="cookbook-dep-tag cookbook-dep-install" data-dep-pip="${esc(pkg.pip || '')}" data-dep-install-cmd="${esc(pkg.install_cmd || '')}" data-dep-update-cmd="${esc(pkg.update_cmd || '')}" data-dep-target="${isLocal ? 'local' : 'remote'}">Install</button>`;
+      return `<button class="cookbook-dep-tag cookbook-dep-install" data-dep-pip="${esc(pkg.pip || '')}" data-dep-target="${isLocal ? 'local' : 'remote'}">Install</button>`;
     };
 
     const _depRow = (pkg) => {
@@ -752,7 +703,7 @@ async function _fetchDependencies() {
       } else if (pkg.name === 'sglang' && pkg.installed) {
         _rebuildBtn = `<button type="button" class="cookbook-dep-tag cookbook-dep-rebuild cookbook-dep-reinstall" data-reinstall-pkg="sglang" title="Force-reinstall SGLang (pulls a matching torch). Runs as a tmux task in the Running tab.">Reinstall</button>`;
       }
-      return `<div class="cookbook-dep-row${winBlocked ? ' cookbook-dep-blocked' : ''}" data-pkg-name="${esc(pkg.name)}" data-dep-pip="${esc(pkg.pip || '')}" data-dep-install-cmd="${esc(pkg.install_cmd || '')}" data-dep-update-cmd="${esc(pkg.update_cmd || '')}" data-dep-target="${isLocal ? 'local' : 'remote'}" data-dep-kind="${esc(pkg.kind || 'python')}">`
+      return `<div class="cookbook-dep-row${winBlocked ? ' cookbook-dep-blocked' : ''}" data-pkg-name="${esc(pkg.name)}" data-dep-pip="${esc(pkg.pip || '')}" data-dep-target="${isLocal ? 'local' : 'remote'}" data-dep-kind="${esc(pkg.kind || 'python')}">`
         + `<div class="cookbook-dep-info">`
         + `<div class="memory-item-title">${esc(pkg.name)}</div>`
         + `<div class="memory-item-meta" style="font-size:10px;opacity:0.5;margin-top:2px;">${esc(pkg.desc)}</div>`
@@ -782,7 +733,7 @@ async function _fetchDependencies() {
     // Shared install/update routine — used by the Install button and the
     // "Update" item in an installed package's ⋮ menu. `upgrade` adds pip -U;
     // `statusEl`, when given, shows "Installing…/Updating…" and is disabled.
-    async function _installDep(pipName, pkgName, isLocalOnly, upgrade, statusEl, actionCmd = '') {
+    async function _installDep(pipName, pkgName, isLocalOnly, upgrade, statusEl) {
       if (isLocalOnly) {
         _envState.remoteHost = '';
         _envState.env = 'none';
@@ -827,43 +778,6 @@ async function _fetchDependencies() {
           envPrefix = 'eval "$(conda shell.bash hook)" && conda activate ' + _shellQuote(_envState.envPath);
         }
       }
-
-      if (actionCmd) {
-        const shellCmd = envPrefix ? `${envPrefix} ${actionCmd}` : actionCmd;
-        const fullCmd = (!isLocalOnly && _envState.remoteHost)
-          ? _sshCmd(_envState.remoteHost, shellCmd, _getPort(_envState.remoteHost))
-          : shellCmd;
-        try {
-          if (statusEl) { statusEl.textContent = upgrade ? 'Updating...' : 'Installing...'; statusEl.disabled = true; }
-          const res = await fetch('/api/shell/stream', {
-            method: 'POST', credentials: 'same-origin',
-            headers: { 'Content-Type': 'application/json' },
-            body: JSON.stringify({ command: fullCmd }),
-          });
-          uiModule.showToast(`${upgrade ? 'Updating' : 'Installing'} ${pkgName} on ${targetHost}...`);
-          const body = await res.text();
-          if (!res.ok) throw new Error(`HTTP ${res.status}`);
-          const exitMatches = [...body.matchAll(/"exit_code":\s*(-?\d+)/g)].map(m => Number(m[1]));
-          const exitCode = exitMatches.length ? exitMatches[exitMatches.length - 1] : 0;
-          if (exitCode !== 0) {
-            throw new Error((body.slice(-500).trim() || `${pkgName} command failed`) + ` (exit ${exitCode})`);
-          }
-
-          if (upgrade) { uiModule.showToast(`Successfully updated ${pkgName} on ${targetHost}.`); } else { uiModule.showToast(`Successfully installed ${pkgName} on ${targetHost}.`); }
-          await _fetchDependencies();
-          return;
-        } catch (err) {
-          if (statusEl) { statusEl.textContent = 'Install'; statusEl.disabled = false; }
-          uiModule.showToast(`${upgrade ? 'Update' : 'Install'} failed: ` + err.message);
-          return;
-        }
-      }
-
-      // Always go through `python -m pip` so the leading token is `python`
-      // — matches the /api/model/serve allow-list (bare `pip` is blocked).
-      // Inside a venv/conda env, `--user` is invalid (pip refuses), so we
-      // only add `--user --break-system-packages` when there's no env —
-      // for PEP-668-locked system pythons (Arch, newer Debian).
       try {
         const reqBody = {
           repo_id: pipName,
@@ -902,9 +816,8 @@ async function _fetchDependencies() {
       btn.addEventListener('click', async (e) => {
         e.stopPropagation();
         const pipName = btn.dataset.depPip;
-        const installCmd = btn.dataset.depInstallCmd || '';
         const pkgName = btn.closest('.cookbook-dep-row')?.querySelector('.memory-item-title')?.textContent || pipName;
-        await _installDep(pipName, pkgName, btn.dataset.depTarget === 'local', !!btn.dataset.upgrade, btn, installCmd);
+        await _installDep(pipName, pkgName, btn.dataset.depTarget === 'local', !!btn.dataset.upgrade, btn);
       });
     });
 
@@ -927,12 +840,11 @@ async function _fetchDependencies() {
       const it = document.createElement('div');
       it.className = 'dropdown-item-compact';
       it.innerHTML = `<span class="dropdown-icon">${upIco}</span><span>Update</span>`;
-      it.title = row.dataset.depUpdateCmd ? `Update ${pkgName} using its custom command` : `Update ${pkgName} to the latest version (pip install -U)`;
+      it.title = `Update ${pkgName} to the latest version (pip install -U)`;
       it.addEventListener('click', async (e) => {
         e.stopPropagation();
         dropdown.remove();
-        const updateCmd = row.dataset.depUpdateCmd || '';
-        await _installDep(pipName, pkgName, isLocalOnly, true, null, updateCmd);
+        await _installDep(pipName, pkgName, isLocalOnly, true, null);
       });
       dropdown.appendChild(it);
       document.body.appendChild(dropdown);
@@ -964,7 +876,6 @@ async function _fetchDependencies() {
 function _applyServerSelection(val) {
   if (val === 'local') {
     _envState.remoteHost = '';
-    _envState.remoteServerKey = '';
     _envState.env = 'none';
     _envState.envPath = '';
     _envState.platform = '';
@@ -972,7 +883,6 @@ function _applyServerSelection(val) {
     const s = _serverByVal(val);
     if (s) {
       _envState.remoteHost = s.host;
-      _envState.remoteServerKey = _serverKey(s);
       _envState.env = s.env || 'none';
       _envState.envPath = s.envPath || '';
       _envState.platform = s.platform || '';
@@ -983,9 +893,10 @@ function _applyServerSelection(val) {
   // bug: the Download/Cache/Deps dropdowns set the host but never saved it, so
   // it silently reverted and downloads/scans hit the wrong server).
   _persistEnvState();
-  const _want = _currentServerValue();
+  const _want = _envState.remoteHost || 'local';
   document.querySelectorAll('#hwfit-server-select, #hwfit-dl-server, #hwfit-cache-server, #hwfit-deps-server').forEach(sel => {
     if (!sel || sel.tagName !== 'SELECT') return;
+    // Option values are host strings now ('local' for the local box).
     sel.value = _want;
     // If the host isn't among this select's current options (stale options after
     // the server list changed), the browser leaves the box BLANK/grey even though
@@ -993,7 +904,7 @@ function _applyServerSelection(val) {
     // re-apply; fall back to 'local' only if it's genuinely gone.
     if (sel.selectedIndex < 0) {
       sel.innerHTML = _buildServerOpts(sel.id === 'hwfit-dl-server');
-      sel.value = _currentServerValue();
+      sel.value = _want;
       if (sel.selectedIndex < 0) sel.value = 'local';
     }
   });
@@ -1031,7 +942,7 @@ function _wireTabEvents(body) {
       // Ignore swipes that start in a horizontally-scrollable tag row — those
       // should scroll the chips, not flip the tab.
       if (window.innerWidth > 768 || e.touches.length !== 1
-        || e.target.closest('input, textarea, select, .doclib-lang-chips')) { _sx = null; return; }
+          || e.target.closest('input, textarea, select, .doclib-lang-chips')) { _sx = null; return; }
       _sx = e.touches[0].clientX; _sy = e.touches[0].clientY;
     }, { passive: true });
     body.addEventListener('touchend', (e) => {
@@ -1081,13 +992,11 @@ function _wireTabEvents(body) {
       const remotes = servers.filter(s => !_isLocalEntry(s));
       if (remotes.length === 1) {
         _envState.remoteHost = remotes[0].host;
-        _envState.remoteServerKey = _serverKey(remotes[0]);
         _envState.env = remotes[0].env || 'none';
         _envState.envPath = remotes[0].envPath || '';
       }
     }
-    const activeSrv = _selectedServer();
-    if (activeSrv) _envState.remoteServerKey = _serverKey(activeSrv);
+    const activeSrv = servers.find(s => s.host === _envState.remoteHost);
     _envState.platform = activeSrv?.platform || '';
     localStorage.setItem('cookbook-last-state', JSON.stringify(_envStateForStorage()));
     _saveTasks(_loadTasks());
@@ -1095,7 +1004,7 @@ function _wireTabEvents(body) {
     // UI matches the resolved host. Done in a microtask so the dropdowns
     // exist by the time we set their .value.
     Promise.resolve().then(() => {
-      const _want = _currentServerValue();
+      const _want = _envState.remoteHost || 'local';
       document.querySelectorAll('#hwfit-server-select, #hwfit-dl-server, #hwfit-cache-server, #hwfit-deps-server').forEach(sel => {
         if (sel && sel.tagName === 'SELECT') sel.value = _want;
       });
@@ -1361,14 +1270,28 @@ function _wireTabEvents(body) {
       if (!m) return { repo: raw, include: null };
       return { repo: m[1], include: `*${m[2]}*` };
     }
+    // Ollama-library name. Matches `qwen2.5:14b`, `llama3:latest`, and the
+    // (rare) `library/<name>:<tag>` form which we normalize by stripping the
+    // namespace. The backend's _is_ollama_download check expects the same
+    // shape (no slash + has a colon).
+    function _ollamaName(raw) {
+      // Drop the optional `library/` namespace before checking the shape.
+      const candidate = raw.replace(/^library\//, '');
+      const tagShape = /^[A-Za-z0-9][A-Za-z0-9._-]{0,200}:[A-Za-z0-9][A-Za-z0-9._-]{0,200}$/;
+      // `name:tag` → the normalized name; any other input → null.
+      return tagShape.test(candidate) ? candidate : null;
+    }
     const triggerDownload = () => {
       const rawRepo = _stripHfUrl(dlInput.value);
       if (!rawRepo) return;
-      const { repo, include: autoInclude } = _splitRepoTag(rawRepo);
+      const ollamaName = _ollamaName(rawRepo);
+      const { repo, include: autoInclude } = ollamaName ? { repo: ollamaName, include: null } : _splitRepoTag(rawRepo);
       // HuggingFace repo IDs must be `org/model`. A bare model name would 404
       // at snapshot_download time with a raw traceback, so reject it up front.
-      if (!/^[^\s/]+\/[^\s/]+$/.test(repo)) {
-        uiModule.showToast('Enter a full HuggingFace repo ID like "org/model-name" (or paste the full HF URL).');
+      // Ollama names (single-segment with a tag) skip this check — they go
+      // through `ollama pull` server-side, not snapshot_download.
+      if (!ollamaName && !/^[^\s/]+\/[^\s/]+$/.test(repo)) {
+        uiModule.showToast('Enter a full HuggingFace repo ID like "org/model-name", or an Ollama name like "qwen2.5:14b".');
         dlInput.focus();
         return;
       }
@@ -1383,12 +1306,13 @@ function _wireTabEvents(body) {
       if (srvVal !== 'local') {
         host = _serverByVal(srvVal)?.host || '';
       }
-      const _hsrv = srvVal !== 'local' ? (_serverByVal(srvVal) || {}) : {};
+      const _hsrv = _envState.servers.find(sv => sv.host === host) || {};
       let env = host ? (_hsrv.env || 'none') : _envState.env;
       let envPath = host ? (_hsrv.envPath || '') : _envState.envPath;
       const payload = { repo_id: repo };
+      if (ollamaName) payload.backend = 'ollama';
       if (autoInclude) payload.include = autoInclude;
-      if (_envState.hfToken) payload.hf_token = _envState.hfToken;
+      if (_envState.hfToken && !ollamaName) payload.hf_token = _envState.hfToken;
       if (host) { payload.remote_host = host; const _sp3 = _getPort(host); if (_sp3) payload.ssh_port = _sp3; }
       const srvPlatform = _getPlatform(host);
       if (srvPlatform) payload.platform = srvPlatform;
@@ -1432,7 +1356,7 @@ function _wireTabEvents(body) {
       // the section is collapsed (the body's content normally provides
       // separation; with no body visible, the line gives the h2 definition).
       dlFold.classList.toggle('is-folded', !folded);
-      try { localStorage.setItem('cookbook_dl_tab_folded_v1', folded ? '0' : '1'); } catch { }
+      try { localStorage.setItem('cookbook_dl_tab_folded_v1', folded ? '0' : '1'); } catch {}
     });
   }
   const hfToggle = document.getElementById('cookbook-hf-latest-toggle');
@@ -1478,7 +1402,7 @@ function _wireTabEvents(body) {
           _hwCache[cacheKey] = hw;
           return hw;
         }
-      } catch { }
+      } catch {}
       _hwCache[cacheKey] = { vram: 0, backend: '' };
       return _hwCache[cacheKey];
     }
@@ -1591,6 +1515,84 @@ function _wireTabEvents(body) {
     document.getElementById('hwfit-server-select')?.addEventListener('change', _onServerChange);
   }
 
+  // Browse Ollama library — popular models from ollama.com via cached backend
+  // proxy. Click a row → fills the download input with `<name>:<size>` so the
+  // existing Download button kicks off `ollama pull`.
+  const olToggle = document.getElementById('cookbook-ollama-toggle');
+  const olArrow = document.getElementById('cookbook-ollama-arrow');
+  const olList = document.getElementById('cookbook-ollama-list');
+  const olRefresh = document.getElementById('cookbook-ollama-refresh');
+  if (olToggle && olList) {
+    let _olLoaded = false;
+    // Fetch the library list (adds ?refresh=1 when `refresh` is set) and render cards with size chips.
+    async function _loadOllama(refresh = false) {
+      olList.innerHTML = '<div class="hwfit-loading" style="opacity:0.5;font-size:11px;text-align:center;padding:12px;">Loading…</div>';
+      // Put `<name>:<size>` into the Download input (when it exists) and focus it.
+      const fillInput = (name, size) => {
+        if (!dlInput) return;
+        dlInput.value = `${name}:${size}`;
+        dlInput.focus();
+      };
+      try {
+        const res = await fetch(`/api/cookbook/ollama/library${refresh ? '?refresh=1' : ''}`);
+        // fetch() resolves on HTTP errors too; route them to the failure path below.
+        if (!res.ok) throw new Error(`HTTP ${res.status}`);
+        const data = await res.json();
+        const models = data.models || [];
+        if (!models.length) {
+          olList.innerHTML = '<div class="hwfit-loading">No models</div>';
+          return;
+        }
+        let html = '';
+        for (const m of models) {
+          const sizes = Array.isArray(m.sizes) && m.sizes.length ? m.sizes : ['latest'];
+          const sizeChips = sizes.map(s => `<button type="button" class="memory-toolbar-btn cookbook-ol-size" data-name="${esc(m.name)}" data-size="${esc(s)}" style="height:20px;padding:0 6px;font-size:10px;border-radius:3px;">${esc(s)}</button>`).join('');
+          html += `<div class="doclib-card memory-item cookbook-ollama-card" data-name="${esc(m.name)}">`;
+          html += `<div style="flex:1;min-width:0;">`;
+          html += `<div class="memory-item-title">${esc(m.name)} <a href="https://ollama.com/library/${esc(m.name)}" target="_blank" rel="noopener" class="cookbook-hf-link">ollama ↗</a></div>`;
+          if (m.description) html += `<div class="memory-item-meta" style="font-size:10px;opacity:0.55;margin-top:2px;">${esc(m.description)}</div>`;
+          html += `<div style="display:flex;flex-wrap:wrap;gap:3px;margin-top:4px;">${sizeChips}</div>`;
+          html += `</div></div>`;
+        }
+        olList.innerHTML = html;
+        olList.querySelectorAll('.cookbook-ol-size').forEach(btn => {
+          btn.addEventListener('click', (e) => {
+            e.stopPropagation();
+            fillInput(btn.dataset.name, btn.dataset.size);
+          });
+        });
+        // Clicking the card body (not a size chip / link) → default to first size
+        olList.querySelectorAll('.cookbook-ollama-card').forEach(card => {
+          card.addEventListener('click', (e) => {
+            if (e.target.closest('a') || e.target.closest('.cookbook-ol-size')) return;
+            fillInput(card.dataset.name, card.querySelector('.cookbook-ol-size')?.dataset.size || 'latest');
+          });
+        });
+      } catch (e) {
+        _olLoaded = false; // failed load: let the next open of the fold retry instead of sticking on the error
+        olList.innerHTML = '<div class="hwfit-loading">Failed to load</div>';
+      }
+    }
+    olToggle.addEventListener('click', () => {
+      const isOpen = olList.style.display !== 'none';
+      olList.style.display = isOpen ? 'none' : 'flex';
+      if (olArrow) olArrow.style.transform = isOpen ? 'rotate(0deg)' : 'rotate(90deg)';
+      if (!isOpen && !_olLoaded) {
+        _olLoaded = true;
+        _loadOllama(false);
+      }
+    });
+    if (olRefresh) olRefresh.addEventListener('click', (e) => {
+      e.stopPropagation();
+      _olLoaded = true;
+      _loadOllama(true);
+      if (olList.style.display === 'none') {
+        olList.style.display = 'flex';
+        if (olArrow) olArrow.style.transform = 'rotate(90deg)';
+      }
+    });
+  }
+
   // Server add button, row removal, model-dir add/remove, and per-row wiring
   // are ALL owned by cookbook-hwfit.js's _hwfitInit / _wireServerEntry.
   // A duplicate add handler used to live here and fired alongside the hwfit
@@ -1603,7 +1605,7 @@ function _wireTabEvents(body) {
     hfInput.addEventListener('change', async () => {
       const val = hfInput.value.trim();
       _envState.hfToken = val;
-      try { await _persistEnvState(); } catch { }
+      try { await _persistEnvState(); } catch {}
       if (val) {
         _envState.hfTokenConfigured = true;
         const masked = val.length > 6 ? val.slice(0, 3) + '…' + val.slice(-3) : '••••';
@@ -1643,9 +1645,8 @@ export function _serverEntryHtml(s, i, defaultServer, forceRemote, isNew) {
   let html = '';
   html += `<div class="cookbook-server-entry" data-idx="${i}" data-platform="${esc(s.platform || '')}">`;
   const _srvTitle = s.name || (isLocal ? 'Local' : (s.host || `Server ${i + 1}`));
-  const _srvKey = isLocal ? 'local' : _serverKey(s);
-  const _legacyDefault = !String(defaultServer || '').startsWith('srv:') && !isLocal && (defaultServer || '') === (s.host || '');
-  const _isDefaultSrv = (defaultServer || '') === _srvKey || _legacyDefault;
+  const _srvKey = isLocal ? 'local' : (s.host || '');
+  const _isDefaultSrv = (defaultServer || '') === _srvKey;
   const _pIco = _platformIcon(s.platform);
   const _keyBtn = `<button class="cookbook-server-key-btn" title="Set up SSH key for this server" style="height:22px;box-sizing:border-box;display:inline-flex;align-items:center;position:relative;top:-2px;"><svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="margin-right:4px;flex-shrink:0;"><circle cx="7.5" cy="15.5" r="5.5"/><path d="M12 11l8-8"/><path d="M17 6l3 3"/></svg>Key</button>`;
   const _checkBtn = `<button class="cookbook-server-check-btn" title="Check SSH connection" style="height:22px;box-sizing:border-box;display:inline-flex;align-items:center;position:relative;top:-2px;"><svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.2" stroke-linecap="round" stroke-linejoin="round" style="margin-right:4px;flex-shrink:0;"><polyline points="20 6 9 17 4 12"/></svg>Check</button>`;
@@ -1775,11 +1776,24 @@ function _renderRecipes() {
   html += `<button class="memory-toolbar-btn cookbook-dl-add-server" title="Add server in Settings" style="height:28px;">add server</button>`;
   html += `</div>`;
   html += `<div class="cookbook-dl-input" style="margin-top:0;">`;
-  html += `<input type="text" class="cookbook-dl-repo" id="cookbook-dl-repo" placeholder="org/model-name, HF URL, or org/model:QUANT_TAG" />`;
+  html += `<input type="text" class="cookbook-dl-repo" id="cookbook-dl-repo" placeholder="org/model-name, qwen2.5:14b, or HF URL" />`;
   html += `<button class="cookbook-btn cookbook-dl-btn" id="cookbook-dl-btn">Download</button>`;
   html += `</div>`;
+  // Browse Ollama library — fetches popular models from ollama.com via the
+  // /api/cookbook/ollama/library cached proxy, click → fills the input with
+  // `<name>:<size>` so the existing Download button kicks off `ollama pull`.
+  html += `<div style="margin-top:5px;position:relative;top:-3px;">`;
+  html += `<div style="display:flex;gap:4px;align-items:center;">`;
+  html += `<button type="button" class="memory-toolbar-btn" id="cookbook-ollama-toggle" style="flex:1;text-align:left;height:26px;display:flex;align-items:center;gap:6px;border-radius:4px;">`;
+  html += `<span id="cookbook-ollama-arrow" style="display:inline-block;transition:transform 0.15s;pointer-events:none;">▸</span>`;
+  html += `<span style="pointer-events:none;">Browse Ollama library</span>`;
+  html += `</button>`;
+  html += `<button type="button" class="memory-toolbar-btn" id="cookbook-ollama-refresh" title="Refresh" style="height:26px;width:26px;padding:0;border-radius:4px;">↻</button>`;
+  html += `</div>`;
+  html += `<div id="cookbook-ollama-list" style="display:none;margin-top:4px;max-height:320px;overflow-y:auto;flex-direction:column;gap:4px;"></div>`;
+  html += `</div>`;
   // Latest HF models that fit — collapsible card list
-  html += `<div style="margin-top:5px;position:relative;top:-7px;">`;
+  html += `<div style="margin-top:5px;position:relative;top:-3px;">`;
   html += `<div style="display:flex;gap:4px;align-items:center;">`;
   html += `<button type="button" class="memory-toolbar-btn" id="cookbook-hf-latest-toggle" style="flex:1;text-align:left;height:26px;display:flex;align-items:center;gap:6px;border-radius:4px;">`;
   html += `<span id="cookbook-hf-latest-arrow" style="display:inline-block;transition:transform 0.15s;pointer-events:none;">\u25B8</span>`;
@@ -1804,7 +1818,7 @@ function _renderRecipes() {
   html += '<option value="general" selected>Standard</option><option value="coding">Coding</option>';
   html += '<option value="reasoning">Reasoning</option><option value="chat">Chat</option>';
   // Image tab removed — text→image gen is gone from this build (only inpaint
-  // remains, which uses its own settings panel). Vision (multimodal) stays.
+  // remains, which uses its own settings panel). Vision (multimodal) stays.
   html += '<option value="multimodal">Vision</option></select>';
   // Engine sits next to the type filter so the "what category / which serving
   // path" filters live together; Quant + Context are storage-format and budget
@@ -1813,6 +1827,7 @@ function _renderRecipes() {
   html += '<select class="cookbook-field-input hwfit-engine" id="hwfit-engine" style="height:28px;" title="Filter by serving engine">';
   html += '<option value="">Engine</option>';
   html += '<option value="llamacpp">llama.cpp</option>';
+  html += '<option value="ollama">Ollama</option>';
   html += '<option value="vllm">vLLM</option>';
   html += '<option value="sglang">SGLang</option>';
   html += '</select>';
@@ -1869,13 +1884,13 @@ function _renderRecipes() {
   // Footer: link to the public discussion where users can request additions
   // to the curated model list. Sits below the list so it reads as a callout
   // after browsing, not a header.
-  html += '<div class="hwfit-list-footer" style="margin-top:8px;padding-top:6px;border-top:1px solid color-mix(in srgb, var(--border) 50%, transparent);font-size:9.5px;opacity:0.65;text-align:right;">'
-    + 'Don\'t see a model? '
-    + '<a href="https://github.com/pewdiepie-archdaemon/odysseus/discussions/1962" target="_blank" rel="noopener" style="color:var(--accent,var(--red));text-decoration:none;display:inline-flex;align-items:center;gap:4px;vertical-align:middle;">'
-    + 'Request it →'
-    + '<svg width="11" height="11" viewBox="0 0 16 16" fill="currentColor" aria-hidden="true" style="flex-shrink:0;"><path d="M8 0C3.58 0 0 3.58 0 8a8 8 0 0 0 5.47 7.59c.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0 0 16 8c0-4.42-3.58-8-8-8z"/></svg>'
-    + '</a>'
-    + '</div>';
+  html += '<div class="hwfit-list-footer" style="display:none;">'
+       + 'Don\'t see a model? '
+       + '<a href="https://github.com/pewdiepie-archdaemon/odysseus/discussions/1962" target="_blank" rel="noopener" style="color:var(--accent,var(--red));text-decoration:none;display:inline-flex;align-items:center;gap:4px;vertical-align:middle;position:relative;top:-1px;">'
+       + 'Request it →'
+       + '<svg width="11" height="11" viewBox="0 0 16 16" fill="currentColor" aria-hidden="true" style="flex-shrink:0;"><path d="M8 0C3.58 0 0 3.58 0 8a8 8 0 0 0 5.47 7.59c.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0 0 16 8c0-4.42-3.58-8-8-8z"/></svg>'
+       + '</a>'
+       + '</div>';
 
   html += '</div></div>';
 
@@ -1885,7 +1900,7 @@ function _renderRecipes() {
   html += '<div style="display:flex;align-items:baseline;gap:8px;margin-bottom:2px;">';
   html += '<h2 style="margin:0;padding:0;line-height:1;">Serve <span id="serve-stats" class="memory-count" style="font-size:0.6em;opacity:0.6;font-weight:normal"></span></h2>';
   html += '</div>';
-  const _selSrv = _selectedServer() || _es.servers[0] || {};
+  const _selSrv = _es.servers.find(s => s.host === _es.remoteHost) || _es.servers[0] || {};
   const _srvDirs = (Array.isArray(_selSrv.modelDirs) ? _selSrv.modelDirs : [_selSrv.modelDir || '~/.cache/huggingface/hub']).map(d => d.replaceAll('✕', '').replaceAll('✖', '').trim()).filter(Boolean);
   html += '<div class="cookbook-serve-dirs" style="margin-top:6px;">';
   html += _srvDirs.map(d => `<span class="cookbook-serve-dir-pill">${esc(d)}</span>`).join('');
@@ -1909,7 +1924,7 @@ function _renderRecipes() {
   html += '<label class="memory-bulk-check-all"><input type="checkbox" id="serve-select-all"> All</label>';
   html += '<span id="serve-bulk-count" style="font-size:10px;opacity:0.5;">0 selected</span>';
   html += '<button class="memory-toolbar-btn danger" id="serve-bulk-delete" style="position:relative;top:-3px;"><svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align:-1px;margin-right:3px;"><polyline points="3 6 5 6 21 6"/><path d="M19 6l-1 14a2 2 0 0 1-2 2H8a2 2 0 0 1-2-2L5 6"/><path d="M10 11v6"/><path d="M14 11v6"/></svg>Delete</button>';
-  html += '<button class="memory-toolbar-btn" id="serve-bulk-cancel" title="Cancel (Esc)" style="margin-left:4px;padding:3px 6px;position:relative;top:-3px;"><svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5" stroke-linecap="round"><line x1="18" y1="6" x2="6" y2="18"/><line x1="6" y1="6" x2="18" y2="18"/></svg></button>';
+  html += '<button class="memory-toolbar-btn" id="serve-bulk-cancel" title="Cancel (Esc)" style="margin-left:4px;padding:3px 6px;position:relative;top:-7px;"><svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5" stroke-linecap="round"><line x1="18" y1="6" x2="6" y2="18"/><line x1="6" y1="6" x2="18" y2="18"/></svg></button>';
   html += '</div>';
 
   html += '<div class="doclib-grid hwfit-cached-list" id="hwfit-cached-list"></div>';
@@ -1963,7 +1978,7 @@ function _renderRecipes() {
   html += '<div style="display:flex;align-items:baseline;gap:8px;margin-bottom:2px;margin-top:-4px;">';
   html += '<h2 style="margin:0;padding:0;line-height:1;">Servers</h2>';
   // Reuse the calendar +New pill: spinning plus, label fades in idea uses
-  // the same `.cal-add-btn-text` rules, so styling stays consistent.
+  // the same `.cal-add-btn-text` rules, so styling stays consistent.
   html += '<button class="cal-add-btn cal-add-btn-text" id="cookbook-server-add" title="Add server" style="margin-left:auto;"><span class="cal-add-plus">+</span><span class="cal-add-label">Add</span></button>';
   html += '</div>';
   html += '<p class="memory-desc doclib-desc">Configure SSH servers, install Odysseus keys, choose model directories, and set the default server. Local is this machine.</p>';
@@ -2059,73 +2074,73 @@ export async function open(opts) {
   }
   _setCookbookOpening(true);
   try {
-    // Invalidate any pending close() animation handlers so they won't re-hide us
-    _closeGen++;
-    // Clear any leftover inline styles from a previous swipe-dismiss or close animation
-    const _content = modal.querySelector('.modal-content');
-    if (_content) {
-      _content.classList.remove('modal-closing', 'sheet-ready', 'cookbook-modal-entering');
-      _content.style.transform = '';
-      _content.style.transition = '';
-      _content.style.animation = '';
-      _content.style.opacity = '';
+  // Invalidate any pending close() animation handlers so they won't re-hide us
+  _closeGen++;
+  // Clear any leftover inline styles from a previous swipe-dismiss or close animation
+  const _content = modal.querySelector('.modal-content');
+  if (_content) {
+    _content.classList.remove('modal-closing', 'sheet-ready', 'cookbook-modal-entering');
+    _content.style.transform = '';
+    _content.style.transition = '';
+    _content.style.animation = '';
+    _content.style.opacity = '';
+  }
+  modal.style.display = '';
+  Modals.register('cookbook-modal', {
+    railBtnId: 'rail-cookbook',
+    sidebarBtnId: 'tool-cookbook-btn',
+    closeFn: () => _doClose(),
+    restoreFn: () => { _renderRunningTab(); },
+  });
+  _wireCookbookDrag(modal);
+  await _syncFromServer();
+  // `_syncFromServer` lives in cookbookRunning.js and populates *its* _envState
+  // (a different object reference than this module's), then mirrors the merged
+  // state to localStorage. So ALWAYS hydrate our _envState from that mirror —
+  // on a successful sync it holds the freshly-fetched servers; on failure it
+  // holds the last-known state. Gating this on `!synced` left the render's
+  // _envState empty whenever sync succeeded → "servers don't show".
+  try { Object.assign(_envState, _readStoredEnvState()); } catch {}
+  // Honour a user-set default server: always land on it when Cookbook opens, so
+  // every dropdown (scan/download/serve/cache/deps) starts on the same machine.
+  if (_envState.defaultServer) {
+    const _dk = _envState.defaultServer;
+    if (_dk === 'local') {
+      _envState.remoteHost = ''; _envState.env = 'none'; _envState.envPath = ''; _envState.platform = '';
+    } else {
+      const _ds = (_envState.servers || []).find(s => s.host === _dk);
+      if (_ds) { _envState.remoteHost = _ds.host; _envState.env = _ds.env || 'none'; _envState.envPath = _ds.envPath || ''; _envState.platform = _ds.platform || ''; }
     }
-    modal.style.display = '';
-    Modals.register('cookbook-modal', {
-      railBtnId: 'rail-cookbook',
-      sidebarBtnId: 'tool-cookbook-btn',
-      closeFn: () => _doClose(),
-      restoreFn: () => { _renderRunningTab(); },
-    });
-    _wireCookbookDrag(modal);
-    await _syncFromServer();
-    // `_syncFromServer` lives in cookbookRunning.js and populates *its* _envState
-    // (a different object reference than this module's), then mirrors the merged
-    // state to localStorage. So ALWAYS hydrate our _envState from that mirror —
-    // on a successful sync it holds the freshly-fetched servers; on failure it
-    // holds the last-known state. Gating this on `!synced` left the render's
-    // _envState empty whenever sync succeeded → "servers don't show".
-    try { Object.assign(_envState, _readStoredEnvState()); } catch { }
-    // Honour a user-set default server: always land on it when Cookbook opens, so
-    // every dropdown (scan/download/serve/cache/deps) starts on the same machine.
-    if (_envState.defaultServer) {
-      const _dk = _envState.defaultServer;
-      if (_dk === 'local') {
-        _envState.remoteHost = ''; _envState.remoteServerKey = ''; _envState.env = 'none'; _envState.envPath = ''; _envState.platform = '';
-      } else {
-        const _ds = _serverByVal(_dk);
-        if (_ds) { _envState.remoteHost = _ds.host; _envState.remoteServerKey = _serverKey(_ds); _envState.env = _ds.env || 'none'; _envState.envPath = _ds.envPath || ''; _envState.platform = _ds.platform || ''; }
-      }
-    }
-    // Re-render on every open AFTER sync so the freshly-fetched state (servers,
-    // HF token, presets) is always reflected. Gating this to once-per-page used
-    // to freeze a stale/empty servers list whenever the first sync raced or
-    // returned before hydration — and since close/reopen doesn't reset the page,
-    // only a full reload recovered it. Re-rendering is cheap and the in-progress
-    // Running tab is rendered separately just below.
-    _renderRecipes();
-    _rendered = true;
-    _clearCookbookNotif();
-    _renderRunningTab();
-    // Self-heal: revive any download tasks whose tmux session is still alive
-    // but were persisted as done/error (covers the "restarted server while a
-    // big multi-shard download was in flight" case — the task survived in
-    // tmux, the cookbook just lost track of it).
-    try { _selfHealStaleTasks({ oneShot: true }); } catch { }
-    if (_content) {
-      // Put the panel in its entering state before it becomes visible. On
-      // mobile, showing first and adding the class a frame later can paint the
-      // sheet at its final position, which makes the slide-up look like a snap.
-      _content.classList.add('cookbook-modal-entering');
-    }
-    modal.classList.remove('hidden');
-    if (_content) {
-      void _content.offsetWidth;
-      _content.addEventListener('animationend', () => {
-        _content.classList.remove('cookbook-modal-entering');
-      }, { once: true });
-    }
-    setTimeout(_applyIntent, 0);
+  }
+  // Re-render on every open AFTER sync so the freshly-fetched state (servers,
+  // HF token, presets) is always reflected. Gating this to once-per-page used
+  // to freeze a stale/empty servers list whenever the first sync raced or
+  // returned before hydration — and since close/reopen doesn't reset the page,
+  // only a full reload recovered it. Re-rendering is cheap and the in-progress
+  // Running tab is rendered separately just below.
+  _renderRecipes();
+  _rendered = true;
+  _clearCookbookNotif();
+  _renderRunningTab();
+  // Self-heal: revive any download tasks whose tmux session is still alive
+  // but were persisted as done/error (covers the "restarted server while a
+  // big multi-shard download was in flight" case — the task survived in
+  // tmux, the cookbook just lost track of it).
+  try { _selfHealStaleTasks({ oneShot: true }); } catch {}
+  if (_content) {
+    // Put the panel in its entering state before it becomes visible. On
+    // mobile, showing first and adding the class a frame later can paint the
+    // sheet at its final position, which makes the slide-up look like a snap.
+    _content.classList.add('cookbook-modal-entering');
+  }
+  modal.classList.remove('hidden');
+  if (_content) {
+    void _content.offsetWidth;
+    _content.addEventListener('animationend', () => {
+      _content.classList.remove('cookbook-modal-entering');
+    }, { once: true });
+  }
+  setTimeout(_applyIntent, 0);
   } finally {
     _setCookbookOpening(false);
   }
@@ -2217,9 +2232,6 @@ const shared = {
   _getPort,
   _sshPrefix,
   _getPlatform,
-  _serverByVal,
-  _selectedServer,
-  _currentServerValue,
   _isWindows,
   _isMetal,
   _buildEnvPrefix,
diff --git a/static/js/cookbookDownload.js b/static/js/cookbookDownload.js
index 6c155c8d7..6ea07cc85 100644
--- a/static/js/cookbookDownload.js
+++ b/static/js/cookbookDownload.js
@@ -242,11 +242,7 @@ export function _wirePanelEvents(panel, model, backend) {
   const dlBtn = panel.querySelector('.hwfit-dl-btn');
   if (dlBtn) {
     dlBtn.addEventListener('click', () => {
-      if (backend === 'ollama') {
-        _runPanelCmd(panel, _buildDownloadCmd(model, backend), { timeout: 0 });
-      } else {
-        _runModelDownload(panel, model, backend);
-      }
+      _runModelDownload(panel, model, backend)
     });
   }
 
@@ -459,7 +455,9 @@ export async function _runModelDownload(panel, model, backend, hostOverride) {
     uiModule.showToast(_missingGgufMessage(model));
     return;
   }
-  const repo = ggufSource?.repo || model.quant_repo || model.name;
+  const repo = backend === 'ollama'
+    ? (model.ollama || model.ollama_name || model.name)
+    : (ggufSource?.repo || model.quant_repo || model.name);
   const include = backend === 'llamacpp' ? _ggufIncludePattern(model, ggufSource) : null;
 
   _syncEnvFromPanel(panel);
@@ -494,7 +492,7 @@ export async function _runModelDownload(panel, model, backend, hostOverride) {
   const platform = host ? (srv.platform || '') : (_envState.platform || '');
   const isWin = host ? (platform === 'windows') : _isWindows();
 
-  const payload = { repo_id: repo };
+  const payload = { repo_id: repo, backend };
   if (include) payload.include = include;
   // Large downloads are where hf_transfer most often dies near the end. Use the
   // plain HuggingFace downloader up front for big model files; it is slower, but
diff --git a/static/js/cookbookRunning.js b/static/js/cookbookRunning.js
index a4e7b83eb..b13856c08 100644
--- a/static/js/cookbookRunning.js
+++ b/static/js/cookbookRunning.js
@@ -1564,6 +1564,10 @@ export async function _launchServeTask(shortName, repo, cmd, fields, hostOverrid
     const payload = { repo_id: repo, remote_host: _host || undefined, ssh_port: _sp || undefined, _cmd: cmd, _fields: fields || undefined, _env: _usedEnv, _envPath: _usedEnvPath, _gpus: _usedGpus };
     _addTask(data.session_id, shortName, 'serve', payload);
     uiModule.showToast(`Serving ${shortName}...`);
+    // Auto-register may have enabled an existing (offline) endpoint for this
+    // host:port. Refresh the picker so the row is no longer dimmed, and the
+    // user doesn't see "offline" on a serve they just started.
+    try { _refreshModelsAfterEndpointChange(); } catch (_) {}
   } catch (e) {
     uiModule.showToast('Failed: ' + e.message);
   }
@@ -3032,6 +3036,11 @@ async function _reconnectTask(el, task) {
             if (info.status === 'ready' && !task._serveReady) {
               task._serveReady = true;
               _updateTask(task.sessionId, { _serveReady: true });
+              // The auto-registered endpoint was marked offline while the
+              // server was coming up. Now that it's reachable, nudge the
+              // picker to re-probe so its offline state clears without the
+              // user having to reopen Settings or refresh the page.
+              try { _refreshModelsAfterEndpointChange(); } catch (_) {}
             }
             if (info.phase) {
               badge.textContent = info.phase;
diff --git a/static/js/cookbookSchedule.js b/static/js/cookbookSchedule.js
index a26de5dbc..69f28a6b5 100644
--- a/static/js/cookbookSchedule.js
+++ b/static/js/cookbookSchedule.js
@@ -129,7 +129,7 @@ try { (function () {
           </label>
         </div>
 
-        <div class="hwfit-schedule-row">
+        <div class="hwfit-schedule-row hwfit-schedule-when-row">
           <label class="hwfit-schedule-field">
             <span>From</span>
             <input type="time" class="hwfit-sched-start cookbook-field-input" value="09:00" />
@@ -138,24 +138,24 @@ try { (function () {
             <span>Until</span>
             <input type="time" class="hwfit-sched-end cookbook-field-input" value="17:00" />
           </label>
-        </div>
-
-        <div class="hwfit-schedule-row hwfit-schedule-days-row">
-          <span class="hwfit-schedule-label">Days</span>
-          <div class="hwfit-sched-days">
-            ${DAYS.map(d => `
-              <button type="button" class="hwfit-sched-day-chip${WEEKDAYS.has(d.k) ? " is-on" : ""}" data-day="${d.k}">${d.l}</button>
-            `).join("")}
+          <label class="hwfit-schedule-field hwfit-schedule-days-field">
+            <span>Days</span>
+            <div class="hwfit-sched-days">
+              ${DAYS.map(d => `
+                <button type="button" class="hwfit-sched-day-chip${WEEKDAYS.has(d.k) ? " is-on" : ""}" data-day="${d.k}">${d.l}</button>
+              `).join("")}
+            </div>
+          </label>
+          <div class="hwfit-schedule-actions-inline">
+            <button type="button" class="cookbook-btn hwfit-sched-cancel" title="Cancel">
+              <svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.4" stroke-linecap="round" stroke-linejoin="round" style="vertical-align:-1px;margin-right:5px;flex-shrink:0;"><line x1="18" y1="6" x2="6" y2="18"/><line x1="6" y1="6" x2="18" y2="18"/></svg>
+              <span>Cancel</span>
+            </button>
+            <button type="button" class="cookbook-btn hwfit-sched-save" title="Save schedule" aria-label="Save schedule">
+              <svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align:-1px;margin-right:5px;flex-shrink:0;"><rect x="3" y="4" width="18" height="18" rx="2"/><line x1="16" y1="2" x2="16" y2="6"/><line x1="8" y1="2" x2="8" y2="6"/><line x1="3" y1="10" x2="21" y2="10"/></svg>
+              <span>Save</span>
+            </button>
           </div>
-          <span class="hwfit-schedule-actions-spacer"></span>
-          <button type="button" class="cookbook-btn hwfit-sched-cancel" title="Cancel">
-            <svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.4" stroke-linecap="round" stroke-linejoin="round" style="vertical-align:-1px;margin-right:5px;flex-shrink:0;"><line x1="18" y1="6" x2="6" y2="18"/><line x1="6" y1="6" x2="18" y2="18"/></svg>
-            <span>Cancel</span>
-          </button>
-          <button type="button" class="cookbook-btn hwfit-sched-save" title="Save schedule" aria-label="Save schedule">
-            <svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align:-1px;margin-right:5px;flex-shrink:0;"><rect x="3" y="4" width="18" height="18" rx="2"/><line x1="16" y1="2" x2="16" y2="6"/><line x1="8" y1="2" x2="8" y2="6"/><line x1="3" y1="10" x2="21" y2="10"/></svg>
-            <span>Save</span>
-          </button>
         </div>
 
         <div class="hwfit-sched-err"></div>
diff --git a/static/js/cookbookServe.js b/static/js/cookbookServe.js
index 3f7e53916..d06477baf 100644
--- a/static/js/cookbookServe.js
+++ b/static/js/cookbookServe.js
@@ -14,7 +14,6 @@ import { bindMenuDismiss, dismissOrRemove } from './escMenuStack.js';
 let _envState;
 let _sshCmd;
 let _getPort;
-let _serverByVal;
 let _sshPrefix;
 let _getPlatform;
 let _isWindows;
@@ -98,14 +97,14 @@ function _selectedServeTarget(panel) {
   const select = document.getElementById('hwfit-server-select') || document.getElementById('hwfit-dl-server');
   const servers = Array.isArray(_envState.servers) ? _envState.servers : [];
   let host = _envState.remoteHost || '';
-  let server = host ? (_serverByVal?.(_envState.remoteServerKey || host) || servers.find(s => s.host === host)) : null;
+  let server = host ? servers.find(s => s.host === host) : null;
   if (select && select.value != null) {
     if (select.value === 'local') {
       host = '';
       server = servers.find(s => !s.host || s.host === 'local') || null;
     } else {
       const idx = /^\d+$/.test(String(select.value)) ? parseInt(select.value, 10) : -1;
-      server = _serverByVal?.(select.value) || (idx >= 0 ? servers[idx] : null) || null;
+      server = servers.find(s => s.host === select.value) || (idx >= 0 ? servers[idx] : null) || null;
       host = server?.host || '';
     }
   }
@@ -115,7 +114,7 @@ function _selectedServeTarget(panel) {
     : (server?.name || 'local server');
   return {
     host,
-    port: host ? (server?.port || _getPort(host) || '') : '',
+    port: host ? (_getPort(host) || server?.port || '') : '',
     venv,
     label,
   };
@@ -243,21 +242,6 @@ function _shellPathExpr(path) {
 function _selectedGgufExpr(model, repo, relPath) {
   const rel = String(relPath || '').replace(/^\/+/, '');
   if (!rel) return '';
-  if (_isWindows()) {
-    // PowerShell: plain path — no bash $() syntax (backend validator rejects
-    // $( ) in non-prelude commands, and PowerShell doesn't have printf).
-    const relW = rel.replace(/\//g, '\\');
-    if (model.is_local_dir && model.path) {
-      const base = String(model.path || '').replace(/\/+$/, '').replace(/\//g, '\\');
-      return `${base}\\${repo.replace(/\//g, '\\')}\\${relW}`;
-    }
-    if (model.path) {
-      const base = String(model.path || '').replace(/\/+$/, '').replace(/\//g, '\\');
-      return `${base}\\models--${repo.replace(/\//g, '--')}\\snapshots\\${relW}`;
-    }
-    const cacheRepo = repo.replace(/\//g, '--');
-    return `$env:USERPROFILE\\.cache\\huggingface\\hub\\models--${cacheRepo}\\snapshots\\${relW}`;
-  }
   if (model.is_local_dir && model.path) {
     const base = String(model.path || '').replace(/\/+$/, '');
     return `$(printf %s ${_shellPathExpr(`${base}/${repo}/${rel}`)})`;
@@ -271,15 +255,6 @@ function _selectedGgufExpr(model, repo, relPath) {
 }
 
 function _ggufSearchDirExpr(model, repo) {
-  if (_isWindows()) {
-    if (model.is_local_dir && model.path) {
-      return `${String(model.path || '').replace(/\/+$/, '').replace(/\//g, '\\')}\\${repo.replace(/\//g, '\\')}`;
-    }
-    if (model.path) {
-      return `${String(model.path || '').replace(/\/+$/, '').replace(/\//g, '\\')}\\models--${repo.replace(/\//g, '--')}\\snapshots`;
-    }
-    return `$env:USERPROFILE\\.cache\\huggingface\\hub\\models--${repo.replace(/\//g, '--')}\\snapshots`;
-  }
   if (model.is_local_dir && model.path) return _shellQuote(`${String(model.path || '').replace(/\/+$/, '')}/${repo}`);
   if (model.path) return _shellQuote(`${String(model.path || '').replace(/\/+$/, '')}/models--${repo.replace(/\//g, '--')}/snapshots`);
   return `"$HOME/.cache/huggingface/hub/models--${repo.replace(/\//g, '--')}/snapshots"`;
@@ -537,7 +512,7 @@ function _rerenderCachedModels() {
       // The venv set per-server in Settings (server.envPath). Used as the venv
       // field default when the global active env path isn't carrying it, so a
       // configured server venv shows up without re-typing it.
-      const _selSrv = _serverByVal?.(_es.remoteServerKey || _es.remoteHost || '') || {};
+      const _selSrv = (_es.servers || []).find(s => s.host === (_es.remoteHost || '')) || {};
       const _srvVenv = _selSrv.envPath || '';
       // Serve state schema: { _byRepo: { <repo>: {...} }, _lastUsed: {...} }.
       // Loading priority: this-repo's saved settings → last-used (from any
@@ -600,7 +575,7 @@ function _rerenderCachedModels() {
         + `<button type="button" class="cookbook-slot-btn cookbook-saved-arrow" title="${esc(_arrowTitle)}">${_arrowLabel}</button>`
         + `</div>`;
 
-      let panelHtml = `<div class="hwfit-serve-panel">${_slotsHtml}`;
+      let panelHtml = `<div class="hwfit-serve-panel">`;
       // Warn when serving a model whose download hasn't fully completed —
       // the user CAN still hit Launch (vLLM/llama-server will start, then
       // crash trying to read missing shards), but they should know.
@@ -633,26 +608,48 @@ function _rerenderCachedModels() {
         _gpuBtnsHtml += `<button type="button" class="cookbook-gpu-btn${on ? ' active' : ''}" data-gpu="${i}">${i}</button>`;
       }
       panelHtml += `<label>${_l('GPUs','Toggle which GPUs to use')}<div class="cookbook-gpu-group">${_gpuBtnsHtml}</div><input type="hidden" class="hwfit-sf" data-field="gpus" value="${esc(defaultGpus)}" /></label>`;
+      // Save / saved-configs split button — moved into Row 1 (next to GPUs)
+      // so it shares the same baseline as the rest of the top controls.
+      panelHtml += _slotsHtml;
       panelHtml += `</div>`;
       panelHtml += `<div class="hwfit-serve-runtime-note" style="display:none;font-size:11px;line-height:1.35;color:var(--fg-muted);margin-top:-4px;"></div>`;
       if (_ggufChoices.length > 1) {
-        panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
-        panelHtml += `<label class="hwfit-backend-llamacpp">${_l('GGUF File','Choose the exact GGUF artifact to serve from this cached model folder.')}<select class="hwfit-sf hwfit-sf-wide" data-field="gguf_file">${_ggufOptions}</select></label>`;
+        // Show the GGUF File dropdown for BOTH llama.cpp and Ollama — Ollama
+        // also needs to know which exact .gguf to import via the new
+        // `docker exec ollama-test ollama-import` auto-fill (otherwise the
+        // helper falls back to "first sorted gguf", which may not match what
+        // the user picked).
+        panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp hwfit-backend-ollama">`;
+        panelHtml += `<label class="hwfit-backend-llamacpp hwfit-backend-ollama">${_l('GGUF File','Choose the exact GGUF artifact to serve from this cached model folder.')}<select class="hwfit-sf hwfit-sf-wide" data-field="gguf_file">${_ggufOptions}</select></label>`;
         panelHtml += `</div>`;
       } else if (_defaultGguf) {
         panelHtml += `<input type="hidden" class="hwfit-sf" data-field="gguf_file" value="${esc(_defaultGguf)}" />`;
       }
-      // Row 2: Core settings
-      panelHtml += `<div class="hwfit-serve-row hwfit-backend-vllm hwfit-backend-sglang hwfit-backend-llamacpp">`;
+      // Row 2: Core settings — the handful you actually touch every launch.
+      // TP / Context / GPU / GPU Mem / Max Seqs / Dtype. Everything else
+      // (Swap, KV Cache, Attention backend, Env vars, llama.cpp batch/ubatch)
+      // moved to the Advanced fold below to keep this row scannable.
+      panelHtml += `<div class="hwfit-serve-row hwfit-backend-vllm hwfit-backend-sglang hwfit-backend-llamacpp hwfit-backend-ollama">`;
       panelHtml += `<label class="hwfit-backend-vllm hwfit-backend-sglang">${_l('TP','Tensor Parallelism — split model across N GPUs')}<select class="hwfit-sf" data-field="tp">${tpOpts}</select></label>`;
       // ctx resets to the model's max on every panel open (the real ctx slider
       // lives in the Scan/Download toolbar — see cookbook.js .hwfit-ctx-control).
       panelHtml += `<label>${_l('Context','Max tokens per request — resets to the model max on every open. Lower = less VRAM')}<input type="text" class="hwfit-sf" data-field="ctx" value="${esc(m.context_length || m.context || '20000')}" /></label>`;
       panelHtml += `<label>${_l('GPU','Which GPU to use. Leave empty for default')}<input type="text" class="hwfit-sf" data-field="gpu_id" value="${esc(sv('gpu_id', ''))}" placeholder="auto" style="width:50px;" /></label>`;
       panelHtml += `<label class="hwfit-backend-vllm hwfit-backend-sglang">${_l('GPU Mem','Fraction of GPU memory (0.0–1.0). Lower if OOM')}<input type="text" class="hwfit-sf" data-field="gpu_mem" value="${esc(sv('gpu_mem', '0.90'))}" /></label>`;
-      panelHtml += `<label class="hwfit-backend-vllm">${_l('Swap','CPU swap space in GB. Leave empty to omit (removed in newer vLLM)')}<input type="text" class="hwfit-sf" data-field="swap" value="${esc(sv('swap', ''))}" placeholder="off" /></label>`;
       panelHtml += `<label class="hwfit-backend-vllm hwfit-backend-sglang">${_l('Max Seqs','Maximum concurrent requests. Lower = less memory. Default 4 — prosumer GPUs often OOM on vLLM default 256 during CUDA graph capture.')}<input type="text" class="hwfit-sf" data-field="max_seqs" value="${esc(sv('max_seqs', '4'))}" placeholder="4" /></label>`;
       panelHtml += `<label>${_l('Dtype','Data type for weights. auto picks best for GPU')}<select class="hwfit-sf" data-field="dtype">${dtypeOpts}</select></label>`;
+      panelHtml += `</div>`;
+      // ── Advanced (collapsed by default) ──
+      // Everything below the fold is tuning users only touch occasionally:
+      // vLLM kernel/env knobs, llama.cpp fit/cache/split controls, the
+      // GGUF batch sizes, the speculative-decoding row, and the live VRAM
+      // monitor. Wrapped in a native <details> so toggle state survives
+      // re-renders cheaply and a closed fold doesn't trigger any layout
+      // work for the dozens of nested inputs.
+      panelHtml += `<details class="hwfit-serve-advanced">`;
+      panelHtml += `<summary class="hwfit-serve-advanced-summary">Advanced</summary>`;
+      // Advanced vLLM/SGLang row (KV Cache, Attention, Swap, Env)
+      panelHtml += `<div class="hwfit-serve-row hwfit-backend-vllm hwfit-backend-sglang">`;
       panelHtml += `<label class="hwfit-backend-vllm">${_l('KV Cache','vLLM --kv-cache-dtype. auto uses the model/runtime default; fp8 reduces KV memory for long context.')}<select class="hwfit-sf" data-field="vllm_kv_cache_dtype" style="height:32px;">${vllmKvCacheOpts}</select></label>`;
       // Attention backend selector — pin the kernel impl. Default `auto` lets
       // vLLM pick FlashInfer (which JITs on first use and breaks on older
@@ -662,6 +659,7 @@ function _rerenderCachedModels() {
       const vllmAttnBackendOpts = ['auto', 'FLASH_ATTN', 'XFORMERS', 'FLASHINFER', 'TORCH_SDPA']
         .map(b => `<option value="${b === 'auto' ? '' : b}"${(sv('vllm_attn_backend','') === (b === 'auto' ? '' : b)) ? ' selected' : ''}>${b}</option>`).join('');
       panelHtml += `<label class="hwfit-backend-vllm">${_l('Attention','vLLM VLLM_ATTENTION_BACKEND. auto = vLLM picks (often FLASHINFER, which JITs and can fail on old nvcc). FLASH_ATTN skips the JIT entirely.')}<select class="hwfit-sf" data-field="vllm_attn_backend" style="height:32px;">${vllmAttnBackendOpts}</select></label>`;
+      panelHtml += `<label class="hwfit-backend-vllm">${_l('Swap','CPU swap space in GB. Leave empty to omit (removed in newer vLLM)')}<input type="text" class="hwfit-sf" data-field="swap" value="${esc(sv('swap', ''))}" placeholder="off" /></label>`;
       // Free-text env-vars field. Anything pasted here is prepended to the
       // launch command verbatim. Use for CUDACXX, PATH overrides, NCCL_*
       // tuning, or any other KEY=VALUE pair that doesn't have a dedicated
@@ -669,6 +667,12 @@ function _rerenderCachedModels() {
       // already exported so they expand correctly here.
       panelHtml += `<label class="hwfit-backend-vllm hwfit-backend-sglang" style="flex:1 1 100%;">${_l('Env','Extra KEY=VALUE env-var pairs prepended to the launch (space-separated). Example: CUDACXX=$VIRTUAL_ENV/lib/python3.10/site-packages/nvidia/cuda_nvcc/bin/nvcc — points flashinfer at the venv-bundled nvcc when the system one is too old for your GPU.')}<input type="text" class="hwfit-sf" data-field="extra_env" value="${esc(sv('extra_env',''))}" placeholder="CUDACXX=/path/to/nvcc NCCL_P2P_DISABLE=1" style="width:100%;" /></label>`;
       panelHtml += `</div>`;
+      // Advanced llama.cpp row (Batch / UBatch — moved out of Core for the
+      // same "rarely touched" reason as the vLLM extras above).
+      panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
+      panelHtml += `<label class="hwfit-backend-llamacpp">${_l('Batch','llama.cpp prompt batch size. Leave blank for llama.cpp default.')}<input type="text" class="hwfit-sf" data-field="llama_batch_size" value="${esc(sv('llama_batch_size', ''))}" placeholder="2048" /></label>`;
+      panelHtml += `<label class="hwfit-backend-llamacpp">${_l('UBatch','llama.cpp physical micro-batch size. Leave blank for llama.cpp default.')}<input type="text" class="hwfit-sf" data-field="llama_ubatch_size" value="${esc(sv('llama_ubatch_size', ''))}" placeholder="512" /></label>`;
+      panelHtml += `</div>`;
       // Row 2b: Diffusers settings
       const diffDtypeOpts = ['bfloat16','float16','float32'].map(d => `<option value="${d}"${sv('diff_dtype','bfloat16')===d?' selected':''}>${d}</option>`).join('');
       const deviceMapOpts = ['balanced','auto','sequential'].map(d => `<option value="${d}"${sv('diff_device_map','balanced')===d?' selected':''}>${d}</option>`).join('');
@@ -691,7 +695,7 @@ function _rerenderCachedModels() {
       const llamaFitOpts = ['', 'off', 'on'].map(d => `<option value="${d}"${sv('llama_fit','')===d?' selected':''}>${d||'default'}</option>`).join('');
       const llamaSplitModeOpts = ['', 'layer', 'tensor', 'row', 'none'].map(d => `<option value="${d}"${sv('llama_split_mode','')===d?' selected':''}>${d||'default'}</option>`).join('');
       panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
-      panelHtml += `<label>${_l('CPU MoE','n-cpu-moe: number of MoE expert layers to run on CPU when the model is bigger than VRAM. 0 = all on GPU. Set automatically by the Auto profiles below.')}<input type="text" class="hwfit-sf" data-field="n_cpu_moe" value="${esc(sv('n_cpu_moe',''))}" placeholder="0" style="width:54px;" /></label>`;
+      panelHtml += `<label>${_l('CPU MoE','n-cpu-moe: number of MoE expert layers to run on CPU when the model is bigger than VRAM. 0 = all on GPU.')}<input type="text" class="hwfit-sf" data-field="n_cpu_moe" value="${esc(sv('n_cpu_moe',''))}" placeholder="0" style="width:54px;position:relative;top:-8px;" /></label>`; // top:-8px pulls the input up to align with the row's checkboxes
       panelHtml += `<label>${_l('KV Cache','cache-type-k/v: quantize the KV cache. q4_0 = smallest (more context), q8_0 = sharp long-context, f16 = full. Blank = llama.cpp default.')}<select class="hwfit-sf" data-field="cache_type">${_kvOpts}</select></label>`;
       panelHtml += `<label class="hwfit-sf-cb" style="align-self:end;"><input type="checkbox" class="hwfit-sf" data-field="flash_attn"${sv('flash_attn',false)?' checked':''} /> Flash Attn${_h('--flash-attn on: faster attention + needed for quantized KV cache.')}</label>`;
       panelHtml += `<label class="hwfit-sf-cb" style="align-self:end;"><input type="checkbox" class="hwfit-sf" data-field="vision"${sv('vision',false)?' checked':''} /> Vision${_h('Serve with the vision encoder so the model can read images. Auto-finds an mmproj-*.gguf next to the model (download one into the model folder). Adds ~1 GB VRAM + a small per-image cost.')}</label>`;
@@ -701,19 +705,16 @@ function _rerenderCachedModels() {
       // explicit overrides for known-good advanced presets; blank keeps
       // llama.cpp/profile defaults.
       panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp">`;
-      panelHtml += `<label>${_l('Split Mode','llama.cpp GPU placement. layer is the usual default; tensor splits weights and KV across GPUs.')}<select class="hwfit-sf" data-field="llama_split_mode">${llamaSplitModeOpts}</select></label>`;
+      panelHtml += `<label>${_l('Split Mode','llama.cpp GPU placement. layer is the usual default; tensor splits weights and KV across GPUs.')}<select class="hwfit-sf" data-field="llama_split_mode" style="position:relative;top:-8px;">${llamaSplitModeOpts}</select></label>`;
       panelHtml += `<label>${_l('Tensor Split','GPU proportions for llama.cpp, e.g. 50,50 across two visible GPUs. Leave blank for auto.')}<input type="text" class="hwfit-sf" data-field="llama_tensor_split" value="${esc(sv('llama_tensor_split', ''))}" placeholder="50,50" /></label>`;
       panelHtml += `<label>${_l('Main GPU','llama.cpp --main-gpu index inside the visible GPU set. Mostly useful for split mode none/row.')}<input type="text" class="hwfit-sf" data-field="llama_main_gpu" value="${esc(sv('llama_main_gpu', ''))}" placeholder="auto" /></label>`;
       panelHtml += `<label>${_l('Parallel','llama.cpp parallel slots. Leave blank for llama.cpp default; 1 matches single-lane presets.')}<input type="text" class="hwfit-sf" data-field="llama_parallel" value="${esc(sv('llama_parallel', ''))}" placeholder="1" /></label>`;
-      panelHtml += `<label>${_l('Batch','llama.cpp prompt batch size. Leave blank for llama.cpp default.')}<input type="text" class="hwfit-sf" data-field="llama_batch_size" value="${esc(sv('llama_batch_size', ''))}" placeholder="2048" /></label>`;
-      panelHtml += `<label>${_l('UBatch','llama.cpp physical micro-batch size. Leave blank for llama.cpp default.')}<input type="text" class="hwfit-sf" data-field="llama_ubatch_size" value="${esc(sv('llama_ubatch_size', ''))}" placeholder="512" /></label>`;
-      panelHtml += `</div>`;
-      // Row 2d: Auto profiles — computed from detected hardware (see profiles.py).
-      // Buttons are injected after the panel mounts (needs an async fetch).
-      panelHtml += `<div class="hwfit-serve-row hwfit-backend-llamacpp hwfit-serve-profiles" style="align-items:center;gap:8px;">`;
-      panelHtml += `<span style="opacity:0.7;font-size:11px;">Auto profiles:</span>`;
-      panelHtml += `<span class="hwfit-profile-btns" style="display:flex;gap:6px;flex-wrap:wrap;"><span style="opacity:0.5;font-size:11px;">computing…</span></span>`;
       panelHtml += `</div>`;
+      // Auto-profile chips row removed — visual fit with the rest of the
+      // serve panel was off, and the manual ctx/n_cpu_moe/cache controls
+      // above are already sufficient. The hwfit profile API
+      // (/api/hwfit/profiles) is still fetched further down, but only to
+      // read model_ctx_max for the ctx clamp.
       // Live VRAM / RAM-spillover monitor for the serve target's GPU. Polls
       // /api/cookbook/gpus while the panel is open so you can SEE whether the
       // config fits VRAM (fast) or spills to system RAM (slow). Populated after mount.
@@ -745,7 +746,7 @@ function _rerenderCachedModels() {
       // even for models the auto-detector doesn't recognize. Expert-parallel,
       // reasoning-parser and MoE-env still only appear when auto-detected.
       const _opts2 = _detectModelOptimizations(repo);
-      panelHtml += `<div class="hwfit-serve-checks hwfit-backend-vllm" style="margin-top:2px;">`;
+      panelHtml += `<div class="hwfit-serve-checks hwfit-backend-vllm">`;
       if (_opts2.flags.includes('--enable-expert-parallel')) panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="expert_parallel" /> Expert Parallel</label>`;
       if (_opts2.flags.some(f => f.includes('--reasoning-parser'))) { const rp = _opts2.flags.find(f => f.includes('--reasoning-parser')).split(' ')[1]; panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="reasoning_parser" data-parser="${rp}" /> Reasoning Parser <span class="hwfit-parser-tag">${rp}</span></label>`; }
       {
@@ -764,6 +765,8 @@ function _rerenderCachedModels() {
       }
       if (_opts2.envVars.length) panelHtml += `<label class="hwfit-sf-cb"><input type="checkbox" class="hwfit-sf" data-field="moe_env" /> MoE Env Vars</label>`;
       panelHtml += `</div>`;
+      // ── End Advanced fold ──
+      panelHtml += `</details>`;
       // Command preview + actions. Wrap the textarea so a floating Copy
       // button can sit at its top-right corner — same pattern as the chat
       // run-output panel.
@@ -825,27 +828,17 @@ function _rerenderCachedModels() {
           // model the file lives under "<path>/<repo>" — search there just like we
           // search the HF snapshots dir, so serving a GGUF from a custom dir works
           // instead of handing llama.cpp a directory (which fails).
-          const _ldir = m.path
-            ? (_isWindows() ? `${m.path.replace(/\//g, '\\')}\\${repo.replace(/\//g, '\\')}` : _shellQuote(`${m.path}/${repo}`))
-            : (_isWindows() ? '' : '""');
-          if (selectedGguf) {
-            f._gguf_path = _selectedGgufExpr(m, repo, selectedGguf.rel_path);
-          } else if (_isWindows()) {
-            // Windows fallback: no bash $() available; validator rejects it.
-            // Return empty so the serve fails with a clear message.
-            f._gguf_path = '';
-          } else if (m.is_local_dir && m.path) {
-            f._gguf_path = `$({ find ${_ldir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${_ldir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`;
-          } else {
-            f._gguf_path = `$({ find ${dir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${dir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`;
-          }
+          const _ldir = m.path ? _shellQuote(`${m.path}/${repo}`) : '""';
+          f._gguf_path = selectedGguf
+            ? _selectedGgufExpr(m, repo, selectedGguf.rel_path)
+            : m.is_local_dir && m.path
+            ? `$({ find ${_ldir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${_ldir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`
+            : `$({ find ${dir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${dir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`;
           // Vision: auto-find the mmproj (CLIP/projector) file in the same dir.
           // Resolved at runtime so the toggle just works if an mmproj-*.gguf is
           // present (downloaded alongside the model). Empty if none → cmd omits it.
           const _vsearchdir = (m.is_local_dir && m.path) ? _ldir : dir;
-          f._mmproj_path = _isWindows()
-            ? (_vsearchdir ? `${_vsearchdir}\\mmproj*.gguf` : '')
-            : `$(find ${_vsearchdir} -iname 'mmproj*.gguf' 2>/dev/null | sort | head -1)`;
+          f._mmproj_path = `$(find ${_vsearchdir} -iname 'mmproj*.gguf' 2>/dev/null | sort | head -1)`;
         }
         if (f.reasoning_parser) {
           const _rpEl2 = panel.querySelector('[data-field="reasoning_parser"]');
@@ -886,72 +879,29 @@ function _rerenderCachedModels() {
         _clampCtx(false);   // fix any stale/preset value already present
       }
 
-      // Auto profiles — fetch hardware-computed llama.cpp profiles and render
-      // them as clickable chips. Clicking one fills the ctx/CPU-MoE/KV/flash
-      // fields and rebuilds the command. Computed from detected VRAM (see
-      // services/hwfit/profiles.py); rough on t/s, accurate on fit.
-      async function _loadServeProfiles() {
-        const wrap = panel.querySelector('.hwfit-profile-btns');
-        if (!wrap) return;
+      // Tighten the ctx slider's upper bound to the model's trained limit.
+      // Asking llama.cpp for ctx > n_ctx_train overflows and, with a quantized
+      // KV cache, can crash the GPU (radv ErrorDeviceLost). The auto-profile
+      // chip row that used to also live here was removed — visual fit with
+      // the rest of the serve panel was off — but this clamp is essential.
+      (async () => {
         try {
           const host = (_es.remoteHost || '').trim();
-          const selected = _serverByVal?.(_es.remoteServerKey || host);
           const params = new URLSearchParams({ model: repo });
           if (host) {
             params.set('host', host);
-            const _sp = selected?.port;
+            const _sp = (_es.servers || []).find(s => s.host === host)?.port;
             if (_sp) params.set('ssh_port', _sp);
           }
-          // SERVE mode: this is a specific GGUF file already on disk, so its quant
-          // is fixed — tell the profiler the file's real size + quant so it varies
-          // only the serving knobs (KV/ctx/offload), not the quant. Parse the size
-          // from m.size (e.g. "20.6 GB") and the quant from the file/repo name.
-          const _sizeMatch = String(m.size || '').match(/([\d.]+)\s*GB/i);
-          if (_sizeMatch) params.set('serve_weights_gb', _sizeMatch[1]);
-          const _qMatch = String(repo).match(/(Q\d[\w]*|IQ\d[\w]*|F16|BF16|FP8)/i);
-          if (_qMatch) params.set('serve_quant', _qMatch[1]);
           const res = await fetch(`/api/hwfit/profiles?${params}`);
           const data = await res.json();
-          // Remember the model's trained context limit and clamp the ctx field
-          // to it — asking llama.cpp for ctx > n_ctx_train overflows and, with a
-          // quantized KV cache, can crash the GPU (radv ErrorDeviceLost).
           const ctxMax = Number(data && data.model_ctx_max) || 0;
           if (ctxMax > 0) {
-            panel._modelCtxMax = ctxMax;   // tighten the clamp to the real limit
-            _clampCtx(false);              // re-apply now that we know the model's max
+            panel._modelCtxMax = ctxMax;
+            _clampCtx(false);
           }
-          const profs = (data && Array.isArray(data.profiles)) ? data.profiles : [];
-          if (!profs.length) { wrap.innerHTML = `<span style="opacity:0.5;font-size:11px;">no auto profile for this model</span>`; return; }
-          wrap.innerHTML = '';
-          for (const p of profs) {
-            const b = document.createElement('button');
-            b.type = 'button';
-            b.className = 'cookbook-btn hwfit-profile-chip';
-            b.style.cssText = 'height:24px;padding:0 9px;font-size:11px;';
-            const off = p.offloads ? `, ncm${p.n_cpu_moe}` : ', all-GPU';
-            b.textContent = `${p.label} · ${p.quant} · ${Math.round(p.ctx/1024)}k${off}`;
-            b.title = `${p.note}\nKV ${p.cache_type}, ~${p.est_vram_gb} GB VRAM`;
-            b.addEventListener('click', () => {
-              const set = (field, val) => {
-                const el = panel.querySelector(`[data-field="${field}"]`);
-                if (!el) return;
-                if (el.type === 'checkbox') el.checked = !!val; else el.value = val;
-              };
-              set('ctx', p.ctx);
-              set('n_cpu_moe', p.n_cpu_moe || '');
-              set('cache_type', p.cache_type || '');
-              set('flash_attn', true);   // required for a quantized KV cache
-              wrap.querySelectorAll('.hwfit-profile-chip').forEach(x => x.classList.remove('cookbook-btn-active'));
-              b.classList.add('cookbook-btn-active');
-              updateCmd();
-            });
-            wrap.appendChild(b);
-          }
-        } catch {
-          wrap.innerHTML = `<span style="opacity:0.5;font-size:11px;">profile compute failed</span>`;
-        }
-      }
-      _loadServeProfiles();
+        } catch { /* clamp falls back to the static default */ }
+      })();
 
       // Live GPU-memory monitor: poll /api/cookbook/gpus and show VRAM usage +
       // RAM-spillover, with a plain-language health/speed hint. Lets you tell at
@@ -962,11 +912,10 @@ function _rerenderCachedModels() {
         if (!el || !document.body.contains(el)) return false;  // panel closed → stop
         try {
           const host = (_es.remoteHost || '').trim();
-          const selected = _serverByVal?.(_es.remoteServerKey || host);
           const params = new URLSearchParams();
           if (host) {
             params.set('host', host);
-            const _sp = selected?.port;
+            const _sp = (_es.servers || []).find(s => s.host === host)?.port;
             if (_sp) params.set('ssh_port', _sp);
           }
           const res = await fetch('/api/cookbook/gpus' + (params.toString() ? '?' + params : ''));
@@ -1535,6 +1484,38 @@ function _rerenderCachedModels() {
           }
           panel._gpuProbe.byIdx = new Map(data.gpus.map(g => [g.index, g]));
           panel._gpuProbe.host = remoteHost;
+          // If the probe found more GPUs than the panel originally
+          // rendered (e.g. host switched from a 1-iGPU local box to an
+          // 8-GPU remote), append buttons for the missing indexes so the
+          // user can actually toggle them. Reuse the parent <div> from
+          // the first existing button as the insertion target.
+          try {
+            const _existing = Array.from(panel.querySelectorAll('.cookbook-gpu-btn'));
+            const _grp = _existing[0] && _existing[0].parentElement;
+            if (_grp) {
+              const _have = new Set(_existing.map(b => parseInt(b.dataset.gpu, 10)));
+              const _activeStr = (panel.querySelector('[data-field="gpus"]')?.value || '').split(',').map(s => s.trim());
+              data.gpus.forEach(g => {
+                if (_have.has(g.index)) return;
+                const _b = document.createElement('button');
+                _b.type = 'button';
+                _b.className = 'cookbook-gpu-btn' + (_activeStr.includes(String(g.index)) ? ' active' : '');
+                _b.dataset.gpu = String(g.index);
+                _b.textContent = String(g.index);
+                _grp.appendChild(_b);
+                // Re-wire the click handler the same way the panel did
+                // on first render. Toggles active + rewrites the hidden
+                // gpus input from the live set of active buttons.
+                _b.addEventListener('click', () => {
+                  _b.classList.toggle('active');
+                  const activeBtns = [...panel.querySelectorAll('.cookbook-gpu-btn.active')];
+                  const ids = activeBtns.map(x => x.dataset.gpu).sort((a, b) => +a - +b).join(',');
+                  const hidden = panel.querySelector('[data-field="gpus"]');
+                  if (hidden) { hidden.value = ids; hidden.dispatchEvent(new Event('change', { bubbles: true })); }
+                });
+              });
+            }
+          } catch (_) {}
           panel.querySelectorAll('.cookbook-gpu-btn').forEach(b => {
             const idx = parseInt(b.dataset.gpu);
             const g = panel._gpuProbe.byIdx.get(idx);
@@ -1790,7 +1771,7 @@ function _rerenderCachedModels() {
             const _probeParams = new URLSearchParams();
             if (_probeHost) {
               _probeParams.set('host', _probeHost);
-              const _sp = (_serverByVal?.(_envState.remoteServerKey || _probeHost) || {}).port;
+              const _sp = (_envState.servers || []).find(s => s.host === _probeHost)?.port;
               if (_sp) _probeParams.set('ssh_port', _sp);
             }
             const _probeRes = await fetch('/api/cookbook/gpus' + (_probeParams.toString() ? '?' + _probeParams : ''), { credentials: 'same-origin' });
@@ -1861,12 +1842,20 @@ function _rerenderCachedModels() {
         }
         // Save in the { _byRepo, _lastUsed } schema — no legacy flat keys at
         // the root so per-model state doesn't leak between models.
+        // Stamp `_forceBackend: true` so the next open of this model defaults
+        // to the launched configuration end-to-end, even when the detector
+        // would have picked a different backend. Without this flag, the
+        // `savedMatchesBackend` gate inside sv() throws away every saved
+        // value when the detected backend doesn't match — the user opens
+        // Serve again and the panel looks like a fresh form despite a
+        // known-good prior launch.
         try {
           let cur = {};
           try { cur = JSON.parse(localStorage.getItem(SERVE_STATE_KEY)) || {}; } catch {}
           const byRepo = (cur && cur._byRepo && typeof cur._byRepo === 'object') ? cur._byRepo : {};
-          byRepo[repo] = serveState;
-          localStorage.setItem(SERVE_STATE_KEY, JSON.stringify({ _byRepo: byRepo, _lastUsed: serveState }));
+          const _saved = { ...serveState, _forceBackend: true };
+          byRepo[repo] = _saved;
+          localStorage.setItem(SERVE_STATE_KEY, JSON.stringify({ _byRepo: byRepo, _lastUsed: _saved }));
         } catch {}
         const origEnv = _envState.env;
         const origEnvPath = _envState.envPath;
@@ -1882,7 +1871,8 @@ function _rerenderCachedModels() {
         if (_ssEl && _ssEl.value != null) {
           if (_ssEl.value === 'local') serveHost = '';
           else {
-            const _srv = _serverByVal?.(_ssEl.value) || _envState.servers[parseInt(_ssEl.value)];
+            // Values are host strings now; resolve by host, falling back to a list index only for all-digit values.
+            const _srv = _envState.servers.find(s => s.host === _ssEl.value) || (/^\d+$/.test(String(_ssEl.value)) ? _envState.servers[parseInt(_ssEl.value, 10)] : undefined);
             if (_srv) {
               serveHost = _srv.host;
               _srvEnv = _srv.env || '';
@@ -1938,10 +1928,24 @@ function _rerenderCachedModels() {
 function _resolveCacheHost() {
   let host = _envState.remoteHost || '';
   const cacheSrv = document.getElementById('hwfit-cache-server');
+
+  // Map a #hwfit-cache-server value to a server entry: host match first,
+  // then an all-digit index, then name; null for 'local' or no match.
+  function _serverByCacheValue(val) {
+    if (val === 'local') return null;
+    const idx = /^\d+$/.test(String(val)) ? parseInt(val, 10) : -1;
+    const list = _envState.servers;
+    return list.find(x => x.host === val) || (idx >= 0 ? list[idx] : null) || list.find(x => x.name === val) || null;
+  }
+
   if (cacheSrv) {
     const val = cacheSrv.value;
-    if (val === 'local') host = '';
-    else { const s = _serverByVal?.(val) || _envState.servers[parseInt(val)]; if (s) host = s.host; }
+    // 'local' clears the host. Any other value only overrides the
+    // remoteHost default when it resolves to a known server entry;
+    // an unresolvable value leaves that default untouched.
+    const srv = _serverByCacheValue(val);
+    if (val === 'local') host = '';
+    else if (srv) host = srv.host;
   }
   return host;
 }
@@ -2071,8 +2075,12 @@ export async function openServePanelForRepo(repo, fields) {
       let cur = {};
       try { cur = JSON.parse(localStorage.getItem(SERVE_STATE_KEY)) || {}; } catch {}
       const byRepo = (cur && cur._byRepo && typeof cur._byRepo === 'object') ? cur._byRepo : {};
-      byRepo[repo] = fields;
-      localStorage.setItem(SERVE_STATE_KEY, JSON.stringify({ _byRepo: byRepo, _lastUsed: fields }));
+      // Mirror the launch-time save: stamp _forceBackend so the panel's
+      // sv() helper treats these seeded fields as authoritative, not as
+      // overridable defaults.
+      const _seeded = { ...fields, _forceBackend: true };
+      byRepo[repo] = _seeded;
+      localStorage.setItem(SERVE_STATE_KEY, JSON.stringify({ _byRepo: byRepo, _lastUsed: _seeded }));
     } catch {}
   }
   // Switch to the Serve tab (its click handler triggers _fetchCachedModels).
@@ -2099,7 +2107,18 @@ export async function openServePanelForRepo(repo, fields) {
              .find(el => (el.dataset.repo || '').split('/').pop() === _short);
     }
     if (card) {
-      if (!card.classList.contains('doclib-card-expanded')) card.click();
+      // If we were given fields to restore, force a fresh render of the
+      // serve panel so it reads the just-written _byRepo[repo] values
+      // from localStorage. Without this, an already-expanded card kept
+      // its stale form and the "Edit serve" → previous settings round-
+      // trip looked broken from the user's side.
+      if (fields && card.classList.contains('doclib-card-expanded')) {
+        card.click();
+        await new Promise(r => setTimeout(r, 40));
+        card.click();
+      } else if (!card.classList.contains('doclib-card-expanded')) {
+        card.click();
+      }
       try { card.scrollIntoView({ behavior: 'smooth', block: 'center' }); } catch {}
       return true;
     }
@@ -2130,6 +2149,14 @@ export async function _fetchCachedModels() {
   try {
     let host = _envState.remoteHost || '';
     let selectedServer = null;
+    // Map a #hwfit-cache-server value to a server entry: host match, then an
+    // all-digit index, then name; null for 'local' or when nothing matches.
+    const _serverByCacheValue = (val) => {
+      if (val === 'local') return null;
+      const idx = /^\d+$/.test(String(val)) ? parseInt(val, 10) : -1;
+      return _envState.servers.find(x => x.host === val) || (idx >= 0 ? _envState.servers[idx] : null) || _envState.servers.find(x => x.name === val) || null;
+    };
+
     const cacheSrv = document.getElementById('hwfit-cache-server');
     if (cacheSrv) {
       const val = cacheSrv.value;
@@ -2137,11 +2164,11 @@ export async function _fetchCachedModels() {
         host = '';
         selectedServer = _envState.servers.find(s => !s.host || s.host === 'local') || _envState.servers[0];
       } else {
-        const s = _serverByVal?.(val) || _envState.servers[parseInt(val)];
+        const s = _serverByCacheValue(val);
         if (s) { host = s.host; selectedServer = s; }
       }
     } else {
-      selectedServer = _serverByVal?.(_envState.remoteServerKey || host) || _envState.servers[0];
+      selectedServer = _envState.servers.find(s => s.host === host) || _envState.servers[0];
     }
     // Read extra model dirs from the SELECTED server's modelDirs (canonical source)
     const modelDirs = [];
@@ -2171,7 +2198,18 @@ export async function _fetchCachedModels() {
     if (modelDirs.length) qp.set('model_dir', modelDirs.join(','));
     const params = qp.toString() ? `?${qp}` : '';
     const res = await fetch(`/api/model/cached${params}`);
-    if (!res.ok) throw new Error(res.statusText);
+    if (!res.ok) {
+      const body = await res.text().catch(() => '');
+      let msg = '';
+      try {
+        const payload = JSON.parse(body);
+        msg = payload && (payload.detail || payload.error || payload.message);
+      } catch {
+        msg = body;
+      }
+      msg = typeof msg === 'string' ? msg.trim() : '';
+      throw new Error(`HTTP ${res.status} ${res.statusText}${msg ? `: ${msg}` : ''}`);
+    }
     const data = await res.json();
     _dlWp.destroy();
 
@@ -2268,7 +2306,6 @@ export function initServe(shared) {
   _envState = shared._envState;
   _sshCmd = shared._sshCmd;
   _getPort = shared._getPort;
-  _serverByVal = shared._serverByVal;
   _sshPrefix = shared._sshPrefix;
   _getPlatform = shared._getPlatform;
   _isWindows = shared._isWindows;
diff --git a/static/js/documentLibrary.js b/static/js/documentLibrary.js
index 642a91faa..8c632a3a9 100644
--- a/static/js/documentLibrary.js
+++ b/static/js/documentLibrary.js
@@ -578,13 +578,12 @@ let _libraryArchivedView = false;   // Documents tab showing archived docs?
     const pieces = [];
     if (doc.session_name) pieces.push(`<span>${_esc(doc.session_name)}</span>`);
     if (doc.language && doc.language !== 'text') {
-      const ic = langIcon(doc.language, 11, { style: 'vertical-align:-2px;flex-shrink:0;opacity:0.65;color:currentColor;' });
-      pieces.push(`<span style="display:inline-flex;align-items:center;gap:3px;">${ic}${_esc(doc.language)}</span>`);
+      // Per-language icon lives in the title row above; just the language
+      // name here keeps the meta line scannable without duplicating the icon.
+      pieces.push(`<span>${_esc(doc.language)}</span>`);
     }
     pieces.push(`<span>${_esc(libraryRelativeTime(doc.updated_at))}</span>`);
     meta.innerHTML = pieces.join('<span style="opacity:0.5;">\u00b7</span>');
-    // Strip the per-language icon from the meta line \u2014 it now sits next to the
-    // title above, so duplicating it here was redundant.
     content.appendChild(meta);
     card.appendChild(content);
 
diff --git a/static/js/emailLibrary.js b/static/js/emailLibrary.js
index a294ca010..4dd2f720d 100644
--- a/static/js/emailLibrary.js
+++ b/static/js/emailLibrary.js
@@ -788,7 +788,7 @@ export function openEmailLibrary(opts = {}) {
         <div class="admin-card" style="flex:1;flex-direction:column;display:flex;overflow:hidden;">
           <p class="memory-desc doclib-desc">All emails. Click to open as a document.</p>
           <div class="email-accounts-row">
-            <div id="email-lib-accounts" style="display:flex;gap:4px;flex-wrap:wrap;flex:1;"></div>
+            <div id="email-lib-accounts" style="display:flex;gap:4px;flex:1;min-width:0;"></div>
             <button class="memory-toolbar-btn email-compose-jiggle" id="email-lib-compose-btn">
               <svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" style="vertical-align:-2px;margin-right:3px;"><rect x="2" y="4" width="20" height="16" rx="2"/><path d="m22 7-8.97 5.7a1.94 1.94 0 0 1-2.06 0L2 7"/></svg>
               New
diff --git a/static/js/markdown.js b/static/js/markdown.js
index 61ac069b5..41a62b3d2 100644
--- a/static/js/markdown.js
+++ b/static/js/markdown.js
@@ -36,6 +36,17 @@ function linkHtml(text, url) {
   return `<a href="${escapeHtml(safeUrl)}" target="_blank" rel="noopener noreferrer">${safeText}</a>`;
 }
 
+// True when rawUrl (resolved against the page origin) is an http(s) URL whose
+// path is exactly /v1, ignoring trailing slashes; false if it cannot be parsed.
+function _isModelEndpointUrl(rawUrl) {
+  let parsed;
+  try {
+    parsed = new URL(String(rawUrl || ''), window.location.origin);
+  } catch (_) { return false; }
+  if (!['http:', 'https:'].includes(parsed.protocol)) return false;
+  return parsed.pathname.replace(/\/+$/, '') === '/v1';
+}
+
 /**
  * Sanitize the raw-HTML fragments that mdToHtml deliberately preserves from
  * the source text — <details> blocks (collapsible agent output) and <a> tags
@@ -327,6 +338,17 @@ function createThinkingSection(thinkingContent, index = 0, thinkingTime = null)
   `;
 }
 
+function createTaskCompletedMarker() { // Static HTML for the "Task completed" status marker (check icon + label).
+  return `
+    <div class="task-completed-marker" role="status" aria-label="Task completed">
+      <span class="task-completed-icon" aria-hidden="true">
+        <svg viewBox="0 0 24 24" width="14" height="14" fill="none" stroke="currentColor" stroke-width="2.6" stroke-linecap="round" stroke-linejoin="round"><polyline points="20 6 9 17 4 12"/></svg>
+      </span>
+      <span>Task completed</span>
+    </div>
+  `;
+}
+
 /**
  * Process text and render with thinking sections
  */
@@ -422,6 +444,9 @@ export function processWithThinking(text) {
   const { thinkingBlocks, content, thinkingTime } = extractThinkingBlocks(text);
 
   let html = '';
+  let visibleContent = content || '';
+  const doneOnly = /^\s*\[DONE\]\s*$/i.test(visibleContent);
+  const hadTrailingDone = !doneOnly && /(?:^|\n)\s*\[DONE\]\s*$/i.test(visibleContent);
 
   // Add thinking sections (collapsed by default)
   thinkingBlocks.forEach((block, index) => {
@@ -429,8 +454,12 @@ export function processWithThinking(text) {
   });
 
   // Add the actual content
-  if (content) {
-    html += mdToHtml(content);
+  if (doneOnly) {
+    html += createTaskCompletedMarker();
+  } else {
+    if (hadTrailingDone) visibleContent = visibleContent.replace(/\n?\s*\[DONE\]\s*$/i, '').trimEnd();
+    if (visibleContent) html += mdToHtml(visibleContent);
+    if (hadTrailingDone) html += createTaskCompletedMarker();
   }
 
   return _useSvgEmoji() ? svgifyEmoji(html) : html;
@@ -885,3 +914,121 @@ document.addEventListener('click', function(e) {
     start();
   }
 })();
+
+// Display name for an endpoint URL: its host (incl. port), else a generic label.
+function _endpointNameFromUrl(url) {
+  const fallback = 'Model endpoint';
+  try {
+    const { host, hostname } = new URL(url, window.location.origin);
+    return host || hostname || fallback;
+  } catch (_) { return fallback; }
+}
+
+// Insert an "Add to model picker" button after each not-yet-checked
+// <a href> at or under `root` whose href _isModelEndpointUrl accepts.
+function _appendEndpointAddButtons(root) {
+  if (!root?.querySelectorAll) return;
+  const links = root.matches?.('a[href]') ? [root] : Array.from(root.querySelectorAll('a[href]'));
+  links.forEach((link) => {
+    if (link.dataset.endpointAddChecked === '1') return;
+    link.dataset.endpointAddChecked = '1';
+    const target = link.getAttribute('href') || '';
+    if (!_isModelEndpointUrl(target)) return;
+    if (link.nextElementSibling?.classList?.contains('model-endpoint-add-btn')) return;
+    const addBtn = Object.assign(document.createElement('button'), {
+      type: 'button',
+      className: 'model-endpoint-add-btn',
+      title: 'Add this OpenAI-compatible endpoint to the model picker',
+      innerHTML: '<span aria-hidden="true">+</span><span>Add to model picker</span>',
+    });
+    addBtn.dataset.endpointUrl = new URL(target, window.location.origin).href.replace(/\/+$/, '');
+    link.insertAdjacentElement('afterend', addBtn);
+  });
+}
+
+/**
+ * Register the URL on an "Add to model picker" button via /api/model-endpoints.
+ * After the add settles, picker-refresh errors are logged, not shown as a failed add.
+ */
+async function _registerEndpointFromButton(btn) {
+  const baseUrl = String(btn?.dataset?.endpointUrl || '').trim();
+  if (!baseUrl || !_isModelEndpointUrl(baseUrl)) return;
+  const original = btn.innerHTML;
+  // Flip the button to its final state, then refresh dependents best-effort.
+  const settle = async (label, toast) => {
+    btn.classList.add('added');
+    btn.innerHTML = `<span aria-hidden="true">✓</span><span>${label}</span>`;
+    try {
+      window.dispatchEvent(new CustomEvent('ge:model-endpoints-updated', { detail: { baseUrl } }));
+      if (window.modelsModule?.refreshModels) await window.modelsModule.refreshModels(true);
+      if (window.sessionModule?.updateModelPicker) window.sessionModule.updateModelPicker();
+      uiModule.showToast?.(toast);
+    } catch (err) {
+      console.warn('Model endpoint saved, but refreshing the model picker failed:', err);
+    }
+  };
+  btn.disabled = true;
+  btn.innerHTML = '<span aria-hidden="true">...</span><span>Adding</span>';
+  try {
+    const existingRes = await fetch('/api/model-endpoints', { credentials: 'same-origin' });
+    if (existingRes.ok) {
+      const endpoints = await existingRes.json();
+      const existing = Array.isArray(endpoints)
+        ? endpoints.find((ep) => String(ep.base_url || '').replace(/\/+$/, '') === baseUrl)
+        : null;
+      if (existing) {
+        await settle('Already added', `Already in model picker: ${existing.name || _endpointNameFromUrl(baseUrl)}`);
+        return;
+      }
+    }
+    const parsed = new URL(baseUrl, window.location.origin);
+    const fd = new FormData();
+    fd.append('base_url', baseUrl);
+    fd.append('name', _endpointNameFromUrl(baseUrl));
+    fd.append('model_type', 'llm');
+    fd.append('endpoint_kind', 'auto');
+    fd.append('skip_probe', 'true');
+    if (/^(localhost|127\.0\.0\.1|0\.0\.0\.0)$/i.test(parsed.hostname)) fd.append('container_local', 'true');
+    const res = await fetch('/api/model-endpoints', { method: 'POST', credentials: 'same-origin', body: fd });
+    if (!res.ok) {
+      const body = await res.text().catch(() => '');
+      throw new Error(`HTTP ${res.status}${body ? ': ' + body.slice(0, 160) : ''}`);
+    }
+    await settle('Added', `Model endpoint added: ${_endpointNameFromUrl(baseUrl)}`);
+  } catch (err) {
+    btn.disabled = false;
+    btn.innerHTML = original;
+    uiModule.showError?.(`Add endpoint failed: ${err.message || err}`);
+  }
+}
+
+// One-time wiring: a delegated click handler for the add buttons, plus a
+// MutationObserver that decorates endpoint links as elements enter the DOM.
+(function _watchModelEndpointLinks() {
+  if (window._modelEndpointLinkWatcherWired) return;
+  window._modelEndpointLinkWatcherWired = true;
+
+  document.addEventListener('click', (evt) => {
+    const addBtn = evt.target.closest?.('.model-endpoint-add-btn');
+    if (!addBtn) return;
+    evt.preventDefault();
+    evt.stopPropagation();
+    _registerEndpointFromButton(addBtn);
+  });
+
+  const onMutations = (records) => {
+    for (const { addedNodes } of records) {
+      // nodeType 1 is an element; other node kinds are ignored.
+      addedNodes.forEach((n) => { if (n.nodeType === 1) _appendEndpointAddButtons(n); });
+    }
+  };
+
+  const begin = () => {
+    const body = document.body;
+    if (!body) return;
+    _appendEndpointAddButtons(body);
+    new MutationObserver(onMutations).observe(body, { childList: true, subtree: true });
+  };
+  if (document.readyState !== 'loading') begin();
+  else document.addEventListener('DOMContentLoaded', begin, { once: true });
+})();
diff --git a/static/js/modelPicker.js b/static/js/modelPicker.js
index 84656c7d0..f486c2335 100644
--- a/static/js/modelPicker.js
+++ b/static/js/modelPicker.js
@@ -327,13 +327,10 @@ function _initModelPickerDropdown() {
       // hover so the suffix/variant tag is still discoverable (#1982).
       nameSpan.title = m.display;
       row.appendChild(nameSpan);
-      if (m.stale) {
-        const badge = document.createElement('span');
-        badge.className = 'model-switch-stale-badge';
-        badge.textContent = 'offline';
-        badge.style.cssText = 'font-size:10px;opacity:0.7;padding:1px 6px;border:1px solid var(--border);border-radius:8px;margin-left:6px;';
-        row.appendChild(badge);
-      }
+      // Offline state is already conveyed by the row's reduced opacity —
+      // a redundant "offline" pill on top of that just added clutter.
+      // (Class kept on `row` so the opacity rule still applies; the text
+      // badge is gone.)
       const epSpan = document.createElement('span');
       epSpan.className = 'model-switch-ep';
       // Don't show endpoint name if it matches the model name (local self-hosted)
diff --git a/static/js/models.js b/static/js/models.js
index cf569c28f..c66876ce0 100644
--- a/static/js/models.js
+++ b/static/js/models.js
@@ -178,7 +178,14 @@ export async function refreshModels(force = false) {
     _loadingSpinner.start();
     try {
       if (!_fetchInflight) {
-        _fetchInflight = fetch(`${API_BASE}/api/models`, { credentials: 'same-origin' })
+        // Pass ?refresh=true on forced refreshes so the BACKEND's 30s
+        // per-user cache also gets bypassed. Without this, `force=true`
+        // only clears the frontend cache and the same stale list comes
+        // back — newly-served endpoints don't appear until the cache
+        // ages out. (Bug repro: serve a model, picker is empty for ~30s
+        // even though the endpoint is in the DB and online.)
+        const _url = `${API_BASE}/api/models` + (force ? '?refresh=true' : '');
+        _fetchInflight = fetch(_url, { credentials: 'same-origin' })
           .then(async (res) => {
             if (!res.ok) throw new Error(`HTTP ${res.status}`);
             return res.json();
diff --git a/static/js/settings.js b/static/js/settings.js
index c6a1d1836..8f39e44a5 100644
--- a/static/js/settings.js
+++ b/static/js/settings.js
@@ -1559,6 +1559,7 @@ async function initResearchSearchSettings() {
 async function initAgentSettings() {
   var toolsInput = el('set-agentMaxTools');
   var roundsInput = el('set-agentMaxRounds');
+  var supInput = el('set-agentSupervisorLadder');
   var msg = el('set-agentMsg');
   if (!toolsInput) return;
 
@@ -1567,6 +1568,7 @@ async function initAgentSettings() {
     var settings = await res.json();
     if (settings.agent_max_tool_calls) toolsInput.value = settings.agent_max_tool_calls;
     if (roundsInput && settings.agent_max_rounds) roundsInput.value = settings.agent_max_rounds;
+    if (supInput) supInput.checked = !!settings.agent_supervisor_ladder;
   } catch (e) {}
 
   // Clamp + coerce a raw input to an int in [lo, hi]; falls back to `dflt`
@@ -1584,23 +1586,27 @@ async function initAgentSettings() {
     if (roundsInput) roundsInput.value = rounds;
     var payload = { agent_max_tool_calls: tools };
     if (rounds != null) payload.agent_max_rounds = rounds;
+    if (supInput) payload.agent_supervisor_ladder = !!supInput.checked;
     try {
       await fetch('/api/auth/settings', { method: 'POST', credentials: 'same-origin',
         headers: { 'Content-Type': 'application/json' },
         body: JSON.stringify(payload)
       });
       msg.textContent = (tools > 0 ? 'Limit: ' + tools + ' tool calls' : 'Unlimited tool calls') +
-        (rounds != null ? ' · ' + rounds + ' steps/message' : '');
+        (rounds != null ? ' · ' + rounds + ' steps/message' : '') +
+        (supInput && supInput.checked ? ' · supervisor on' : '');
       msg.style.color = 'var(--fg)';
     } catch (e) { msg.textContent = 'Failed to save'; msg.style.color = 'var(--red)'; }
   }
 
   toolsInput.addEventListener('change', save);
   if (roundsInput) roundsInput.addEventListener('change', save);
+  if (supInput) supInput.addEventListener('change', save);
   var cur = parseInt(toolsInput.value, 10) || 0;
   var curR = roundsInput ? (parseInt(roundsInput.value, 10) || 20) : null;
   msg.textContent = (cur > 0 ? 'Limit: ' + cur + ' tool calls' : 'Unlimited tool calls') +
-    (curR != null ? ' · ' + curR + ' steps/message' : '');
+    (curR != null ? ' · ' + curR + ' steps/message' : '') +
+    (supInput && supInput.checked ? ' · supervisor on' : '');
 }
 
 /* ═══════════════════════════════════════════
diff --git a/static/js/skills.js b/static/js/skills.js
index 1a0c9701b..8eac3954c 100644
--- a/static/js/skills.js
+++ b/static/js/skills.js
@@ -890,10 +890,10 @@ function renderSkillsList() {
     });
   }
 
-  // Background-load the visible skills' SKILL.md so expanding any of them is
-  // instant (no first-time async fetch → no jump). Deferred so it never
-  // competes with the render/cascade paint.
-  setTimeout(_preloadVisibleMarkdown, 0);
+  // Do not eager-load every visible SKILL.md. On large skill libraries this
+  // creates dozens of simultaneous /api/skills/<name>/markdown requests during
+  // app startup and can peg uvicorn. Markdown is fetched lazily when a card is
+  // expanded.
 }
 
 // ---- Card expand / edit / actions ----
diff --git a/static/style.css b/static/style.css
index 491652c7a..55a7a0dbc 100644
--- a/static/style.css
+++ b/static/style.css
@@ -2048,12 +2048,64 @@ body.bg-pattern-sparkles {
     .msg-user .body {
       color: var(--fg);
     }
-    .msg-ai .body {
-      color: var(--fg);
-    }
-    .rag-sources {
-      margin-top: 12px;
-      border: 1px solid var(--border);
+.msg-ai .body {
+  color: var(--fg);
+}
+.model-endpoint-add-btn {
+  display: inline-flex;
+  align-items: center;
+  gap: 4px;
+  margin-left: 7px;
+  padding: 2px 7px;
+  border: 1px solid color-mix(in srgb, var(--red) 34%, var(--border));
+  border-radius: 999px;
+  background: color-mix(in srgb, var(--red) 8%, transparent);
+  color: var(--red);
+  font: inherit;
+  font-size: 0.78em;
+  line-height: 1.45;
+  cursor: pointer;
+  vertical-align: 1px;
+}
+.model-endpoint-add-btn:hover {
+  background: color-mix(in srgb, var(--red) 14%, transparent);
+  border-color: color-mix(in srgb, var(--red) 55%, var(--border));
+}
+.model-endpoint-add-btn:disabled {
+  cursor: default;
+  opacity: 0.72;
+}
+.model-endpoint-add-btn.added {
+  color: var(--color-save-green, #4caf50);
+  border-color: color-mix(in srgb, var(--color-save-green, #4caf50) 45%, var(--border));
+  background: color-mix(in srgb, var(--color-save-green, #4caf50) 9%, transparent);
+}
+.task-completed-marker {
+  display: inline-flex;
+  align-items: center;
+  gap: 7px;
+  margin: 7px 0 2px;
+  padding: 5px 9px;
+  border: 1px solid color-mix(in srgb, var(--color-save-green, #4caf50) 42%, var(--border));
+  border-radius: 999px;
+  background: color-mix(in srgb, var(--color-save-green, #4caf50) 9%, transparent);
+  color: var(--color-save-green, #4caf50);
+  font-size: 0.86em;
+  font-weight: 600;
+}
+.task-completed-icon {
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  width: 17px;
+  height: 17px;
+  border-radius: 50%;
+  background: color-mix(in srgb, var(--color-save-green, #4caf50) 18%, transparent);
+  flex: 0 0 auto;
+}
+.rag-sources {
+  margin-top: 12px;
+  border: 1px solid var(--border);
       border-radius: 6px;
       padding: 8px;
       font-size: 12px;
@@ -2182,7 +2234,7 @@ body.bg-pattern-sparkles {
       position: absolute;
       top: 0;
       right: 0;
-      z-index: 2;
+      z-index: 250;
       transform-origin: top right;
       transition: opacity 0.22s ease, transform 0.22s ease;
       will-change: opacity, transform;
@@ -2704,7 +2756,7 @@ body.bg-pattern-sparkles {
       position: absolute;
       bottom: calc(100% + 16px);
       right: 0;
-      z-index: 300;
+      z-index: 250;
       min-width: 260px;
       max-width: 360px;
       background: var(--panel);
@@ -8367,6 +8419,14 @@ body.hide-thinking .thinking-section { display: none !important; }
   transition: background 0.2s ease;
 }
 
+.thinking-header > .token-new {
+  display: none;
+}
+
+.thinking-header > div:last-child {
+  flex-shrink: 0;
+}
+
 .thinking-header:hover {
   background: color-mix(in srgb, var(--red) 12%, transparent);
 }
@@ -8382,6 +8442,7 @@ body.hide-thinking .thinking-section { display: none !important; }
   min-width: 0;
 }
 .thinking-header-left span {
+  display: block;
   overflow: hidden;
   text-overflow: ellipsis;
   white-space: nowrap;
@@ -8760,6 +8821,22 @@ body.hide-thinking .thinking-section { display: none !important; }
 .agent-thread-node + .agent-thread-node {
   margin-top: 2px;
 }
+/* Supervisor ladder cards — same chrome as tool cards but tinted so the
+   user can tell at a glance "this is the agent recovering" vs "this is
+   the agent doing work". Stop rung gets the red accent. */
+.agent-thread-node.supervisor-step .agent-thread-tool {
+  color: color-mix(in srgb, var(--accent, #c08a3e) 80%, var(--fg));
+  font-style: italic;
+}
+.agent-thread-node.supervisor-step .agent-thread-dot {
+  background: color-mix(in srgb, var(--accent, #c08a3e) 60%, transparent);
+}
+.agent-thread-node.supervisor-step[data-rung="stop"] .agent-thread-tool {
+  color: var(--red, #d65a5a);
+}
+.agent-thread-node.supervisor-step[data-rung="stop"] .agent-thread-dot {
+  background: color-mix(in srgb, var(--red, #d65a5a) 60%, transparent);
+}
 .agent-thread-dot {
   position: absolute;
   left: -20px;
@@ -15144,10 +15221,28 @@ body.right-dock-active:not(.email-doc-split-active) .doc-editor-pane {
   }
 }
 
-/* Cookbook's cached-model list should scale with viewport height, not be capped at 400px */
+/* Cookbook's cached-model list: NO inner-scroll cap. Two nested scroll
+   surfaces (this + an outer container) trapped the wheel so an expanded
+   serve panel couldn't be reached on tall content. Let an ancestor
+   (.cookbook-body inside #cookbook-modal) be the single scroll surface. */
 .hwfit-cached-list {
-  max-height: min(75vh, 900px) !important;
-  overflow-y: auto;
+  max-height: none !important;
+  overflow-y: visible !important;
+}
+/* Serve panel specifically: the admin-card inline style is
+   `overflow:hidden` (so the toolbar/header don't drift), and the list
+   inside has overflow:visible. On short windows that combination
+   clipped the cards off the bottom with no scrollbar. Make the list
+   itself the scroll surface so the rest of the card stays put. */
+.cookbook-group[data-backend-group="Serve"] > .admin-card {
+  min-height: 0;
+}
+.cookbook-group[data-backend-group="Serve"] > .admin-card > #hwfit-cached-list,
+.cookbook-group[data-backend-group="Serve"] > .admin-card > .hwfit-cached-list {
+  flex: 1 1 0;
+  min-height: 0;
+  overflow-y: auto !important;
+  overscroll-behavior: contain;
 }
 /* Drag-and-drop visual hint for the email compose pane. Subtle accent
    outline + tinted overlay so it's obvious files will attach if dropped. */
@@ -17924,8 +18019,11 @@ body.gallery-selecting .gallery-dl-btn,
 }
 #cookbook-modal .cookbook-group > .admin-card {
   min-height: 0;
-  overflow-y: auto !important;
-  overflow-x: hidden !important;
+  /* Let .cookbook-body be the SINGLE scroll surface. Nesting another
+     overflow:auto here trapped the wheel inside the cached-list when a
+     serve panel expanded — the page couldn't scroll past the panel's
+     bottom (Launch button got hidden). */
+  overflow: visible !important;
 }
 #cookbook-modal .cookbook-section-body {
   min-height: 0;
@@ -18733,6 +18831,13 @@ body.gallery-selecting .gallery-dl-btn,
   justify-content: flex-end;
   margin-bottom: 4px;
 }
+/* When the Save split sits inside Row 1 (next to GPUs), align it with the
+   input baseline (the row's grid cells stretch top-down; without this the
+   Save buttons sit above the GPU button group). */
+.hwfit-serve-row .cookbook-serve-slots {
+  align-self: end;
+  margin-bottom: 4px;
+}
 .cookbook-slot-btn {
   min-width: 22px; height: 22px;
   padding: 0 6px;
@@ -20207,6 +20312,21 @@ body.gallery-selecting .gallery-dl-btn,
   background: color-mix(in srgb, var(--color-error) 8%, transparent);
   border: 1px solid color-mix(in srgb, var(--color-error) 30%, transparent);
   border-radius: 6px;
+  /* The diagnosis body can carry traceback fragments and long unbroken
+     paths (e.g. /home/.../snapshots/<sha>/<file>.gguf). Without these,
+     a single long token pushes the card wider than the cookbook modal,
+     scrolling the row right and clipping the action buttons. */
+  min-width: 0;
+  max-width: 100%;
+  overflow-wrap: anywhere;
+  word-break: break-word;
+}
+.cookbook-diagnosis pre,
+.cookbook-diagnosis code {
+  white-space: pre-wrap;
+  word-break: break-word;
+  overflow-wrap: anywhere;
+  max-width: 100%;
 }
 .cookbook-diag-header {
   display: flex;
@@ -20400,6 +20520,14 @@ body.gallery-selecting .gallery-dl-btn,
   opacity: 0.5;
   font-family: inherit;
 }
+/* Brief border+glow flash when an Ollama row in the hwfit list autofills the
+   Download input — helps the user see what landed when the input is offscreen
+   or above a tall list. */
+.cookbook-dl-repo.cookbook-dl-flash {
+  border-color: var(--red) !important;
+  box-shadow: 0 0 0 3px color-mix(in srgb, var(--red) 25%, transparent) !important;
+  transition: border-color 0.2s, box-shadow 0.2s;
+}
 .cookbook-dl-btn {
   background: var(--accent, var(--red));
   color: #fff;
@@ -22446,6 +22574,88 @@ input.settings-select::placeholder { color: color-mix(in srgb, var(--fg) 35%, tr
   text-align: right;
 }
 .settings-fallback-row .settings-select { flex: 1; min-width: 0; }
+/* Cookbook Serve Advanced fold — wraps the rarely-touched tuning rows
+   (KV/Attention/Swap/Env for vLLM, llama.cpp batch/cache/split, VRAM
+   monitor, speculative, extra args). Matches the existing .hwfit-panel-
+   advanced look: muted-gray label, no caps, no letter-spacing, no
+   warning-y opacity. Content flows into the parent's existing scroll
+   surface (no inner max-height) and inner rows reset their margin so
+   stacking gaps don't double when the fold opens. */
+/* Styled to match the Add Models page collapsible sections
+   (.adm-section-toggle) — same border/background/caret pattern, so the
+   two folds across the app read consistently. */
+details.hwfit-serve-advanced {
+  margin-top: 8px;
+  overflow: visible;
+}
+details.hwfit-serve-advanced > summary.hwfit-serve-advanced-summary {
+  cursor: pointer;
+  user-select: none;
+  list-style: none;
+  display: flex;
+  align-items: center;
+  gap: 6px;
+  font-size: 11px;
+  color: var(--fg);
+  opacity: 0.8;
+  border: 1px solid var(--border);
+  border-radius: 6px;
+  padding: 6px 9px;
+  background: color-mix(in srgb, var(--fg) 4%, transparent);
+  transition: border-color 0.12s, background 0.12s, opacity 0.12s, border-radius 0s;
+}
+details.hwfit-serve-advanced > summary.hwfit-serve-advanced-summary::-webkit-details-marker {
+  display: none;
+}
+details.hwfit-serve-advanced > summary.hwfit-serve-advanced-summary:hover {
+  opacity: 1;
+  border-color: var(--red);
+  background: color-mix(in srgb, var(--red) 8%, transparent);
+}
+/* Caret on the right, rotates open/closed. A CSS triangle drawn with
+   borders keeps this glyph-free + crisp at small sizes. */
+details.hwfit-serve-advanced > summary.hwfit-serve-advanced-summary::after {
+  content: '';
+  margin-left: auto;
+  width: 0;
+  height: 0;
+  border-left: 4px solid currentColor;
+  border-top: 3px solid transparent;
+  border-bottom: 3px solid transparent;
+  opacity: 0.6;
+  transform: rotate(90deg);
+  transition: transform 0.18s ease;
+}
+details.hwfit-serve-advanced:not([open]) > summary.hwfit-serve-advanced-summary::after {
+  transform: rotate(0deg);
+}
+/* Body rows below the header — tight rhythm so the fold doesn't
+   feel airy. The cookbook modal's existing .cookbook-body is the
+   scroll surface; nothing inside the fold should add its own scroll. */
+details.hwfit-serve-advanced[open] > summary.hwfit-serve-advanced-summary {
+  margin-bottom: 6px;
+}
+details.hwfit-serve-advanced > .hwfit-serve-row,
+details.hwfit-serve-advanced > .hwfit-serve-checks,
+details.hwfit-serve-advanced > .hwfit-serve-cmd-wrap,
+details.hwfit-serve-advanced > .hwfit-serve-extra {
+  margin-top: 0;
+  margin-bottom: 0;
+}
+/* Pull the vLLM/SGLang checks row, Extra args, and the trailing
+   model-specific (Speculative) checks row up tight against the row
+   above — the previous 4px gap plus per-row baseline padding left a
+   ~8px gap that read as too airy in the Advanced fold. */
+details.hwfit-serve-advanced > .hwfit-serve-checks.hwfit-backend-vllm,
+details.hwfit-serve-advanced > .hwfit-serve-checks.hwfit-backend-sglang,
+details.hwfit-serve-advanced > .hwfit-serve-extra {
+  margin-top: -8px;
+}
+details.hwfit-serve-advanced > .hwfit-serve-row:last-of-type,
+details.hwfit-serve-advanced > .hwfit-serve-checks:last-of-type {
+  margin-bottom: 0;
+}
+
 .settings-fallback-remove {
   flex-shrink: 0;
   margin-right: 4px;
@@ -22463,6 +22673,9 @@ input.settings-select::placeholder { color: color-mix(in srgb, var(--fg) 35%, tr
   transition: border-color 0.12s, color 0.12s, background 0.12s;
   position: relative;
   top: -6px;
+  /* Glyph baseline trim: nudge × up 1px inside the button without moving the
+     button. line-height < 1 lets the glyph float toward the top of its line box. */
+  line-height: 0.85;
 }
 .settings-fallback-remove:hover {
   border-color: var(--red);
@@ -33593,7 +33806,24 @@ button.cal-add-btn.cal-add-btn-text.cal-add-btn-sm:hover .cal-add-label {
 /* Only the direct-child compose button gets pushed right; nested chips
    inside #email-lib-accounts pack to the left as normal flex items. */
 .email-accounts-row > .memory-toolbar-btn { flex-shrink: 0; margin-left: auto; }
-#email-lib-accounts { justify-content: flex-start; }
+#email-lib-accounts { justify-content: flex-start; flex-wrap: wrap; }
+/* Mobile: collapse the account chips to a single horizontally-scrollable
+   strip instead of stacking onto multiple rows. The compose "New" button
+   stays outside the scroller (it's a sibling of #email-lib-accounts inside
+   .email-accounts-row) so it remains pinned on the right. */
+@media (max-width: 768px) {
+  #email-lib-accounts {
+    flex-wrap: nowrap;
+    overflow-x: auto;
+    overflow-y: hidden;
+    scrollbar-width: none;
+    -ms-overflow-style: none;
+    scroll-snap-type: x proximity;
+    -webkit-overflow-scrolling: touch;
+  }
+  #email-lib-accounts::-webkit-scrollbar { display: none; height: 0; }
+  #email-lib-accounts > * { flex-shrink: 0; scroll-snap-align: start; }
+}
 .email-accounts-loading-whirlpool {
   width: 14px;
   height: 14px;
@@ -36198,6 +36428,16 @@ body.theme-frosted .modal {
   justify-content: center;
 }
 
+/* Mobile: drop the inline icons on Launch + Cancel in the serve panel so
+   the buttons are text-only and don't wrap on narrow screens. Icons stay
+   on desktop where horizontal space isn't tight. */
+@media (max-width: 600px) {
+  .hwfit-serve-launch > svg,
+  .hwfit-serve-cancel > svg {
+    display: none !important;
+  }
+}
+
 /* Schedule form — mounted inside the cookbook serve panel. Uses the
    theme tokens (--bg, --panel, --border, --accent, --red) so it
    matches the rest of the cookbook chrome instead of inline whites. */
@@ -36249,6 +36489,18 @@ body.theme-frosted .modal {
   flex-wrap: wrap;
   gap: 5px;
 }
+/* Days field inline with From / Until — push it + the action buttons to
+   the right end of the row so the row reads: From | Until | …gap… | Days | Cancel | Save. */
+.hwfit-schedule-days-field {
+  margin-left: auto;
+}
+.hwfit-schedule-actions-inline {
+  display: inline-flex;
+  align-items: flex-end;
+  gap: 6px;
+  align-self: flex-end;
+  padding-bottom: 1px;
+}
 .hwfit-sched-day-chip {
   width: 32px;
   height: 32px;