diff --git a/app.py b/app.py
index 97906bd46..0af6b18ea 100644
--- a/app.py
+++ b/app.py
@@ -650,6 +650,10 @@ app.include_router(calendar_router)
from routes.shell_routes import setup_shell_routes
app.include_router(setup_shell_routes())
+# Terminal agents (tmux-backed Codex/Claude/shell sessions)
+from routes.terminal_agent_routes import setup_terminal_agent_routes
+app.include_router(setup_terminal_agent_routes())
+
# Cookbook (model download/serve/cache, cookbook state sync)
from routes.cookbook_routes import setup_cookbook_routes
app.include_router(setup_cookbook_routes())
diff --git a/mcp_servers/email_server.py b/mcp_servers/email_server.py
index d1c2ac07e..db731ec0f 100644
--- a/mcp_servers/email_server.py
+++ b/mcp_servers/email_server.py
@@ -22,6 +22,7 @@ import os
import os.path
from pathlib import Path
from datetime import datetime, timedelta
+import uuid
from mcp.server import Server
from mcp.server.stdio import stdio_server
@@ -67,6 +68,59 @@ def _db_path() -> Path:
return Path(APP_DB)
+def _load_email_writing_style() -> str:
+ """Return the existing Settings > Email > Writing Style value."""
+ try:
+ settings_path = DATA_DIR / "settings.json"
+ if not settings_path.exists():
+ return ""
+ settings = json.loads(settings_path.read_text(encoding="utf-8"))
+ return str(settings.get("email_writing_style") or "").strip()
+ except Exception:
+ return ""
+
+
+def _writing_style_guidance() -> str:
+ style = _load_email_writing_style()
+ if not style:
+ return (
+ "No saved writing style is configured in Settings > Email > Writing Style. "
+ "Use a concise, natural tone and do not invent facts."
+ )
+ return (
+ "Use this saved writing style from Settings > Email > Writing Style when "
+ "drafting the body. It overrides generic tone guidance:\n"
+ f"{style}"
+ )
+
+
+def _default_document_owner() -> str | None:
+ """Best-effort owner for MCP-created documents.
+
+ MCP stdio tools do not receive the browser request's authenticated user,
+ but the document library is owner-filtered. Stamp drafts to the configured
+ single/default admin so assistant-created email drafts are visible.
+ """
+ owner = os.environ.get("ODYSSEUS_DOCUMENT_OWNER", "").strip()
+ if owner:
+ return owner
+ try:
+ auth_path = DATA_DIR / "auth.json"
+ if not auth_path.exists():
+ return None
+ users = (json.loads(auth_path.read_text(encoding="utf-8")).get("users") or {})
+ if not isinstance(users, dict) or not users:
+ return None
+ admins = [name for name, data in users.items() if isinstance(data, dict) and data.get("is_admin")]
+ if len(admins) == 1:
+ return admins[0]
+ if len(users) == 1:
+ return next(iter(users))
+ return admins[0] if admins else next(iter(users))
+ except Exception:
+ return None
+
+
def _list_accounts_raw() -> list:
"""Return list of dicts from the email_accounts table. Empty list if table
missing or empty. Never raises."""
@@ -896,6 +950,340 @@ def _send_email(to, subject, body, in_reply_to=None, references=None, cc=None, b
}
+def _build_email_document_content(
+ to,
+ subject,
+ body,
+ *,
+ cc=None,
+ bcc=None,
+ in_reply_to=None,
+ references=None,
+ source_uid=None,
+ source_folder=None,
+):
+ header_lines = [f"To: {to or ''}"]
+ if cc:
+ header_lines.append(f"Cc: {cc}")
+ if bcc:
+ header_lines.append(f"Bcc: {bcc}")
+ header_lines.append(f"Subject: {subject or ''}")
+ if in_reply_to:
+ header_lines.append(f"In-Reply-To: {in_reply_to}")
+ if references:
+ header_lines.append(f"References: {references}")
+ if source_uid:
+ header_lines.append(f"X-Source-UID: {source_uid}")
+ if source_folder:
+ header_lines.append(f"X-Source-Folder: {source_folder}")
+ return "\n".join(header_lines) + "\n---\n" + (body or "")
+
+
+def _merge_email_reply_body(existing_content: str, reply_body: str) -> str:
+ """Preserve email headers and quoted chain while replacing the editable reply body."""
+ if "\n---\n" not in (existing_content or ""):
+ return reply_body or ""
+ head, body = existing_content.split("\n---\n", 1)
+ quote_markers = (
+ "---------- Previous message ----------",
+ "-----Original Message-----",
+ "----- Original Message -----",
+ )
+ quote_index = -1
+ for marker in quote_markers:
+ idx = body.find(marker)
+ if idx != -1 and (quote_index == -1 or idx < quote_index):
+ quote_index = idx
+ quote = body[quote_index:].strip() if quote_index != -1 else ""
+ merged_body = (reply_body or "").strip()
+ if quote:
+ merged_body = f"{merged_body}\n\n{quote}" if merged_body else quote
+ return f"{head}\n---\n{merged_body}"
+
+
+def _create_email_draft_document(
+ *,
+ to,
+ subject,
+ body,
+ title=None,
+ cc=None,
+ bcc=None,
+ in_reply_to=None,
+ references=None,
+ source_uid=None,
+ source_folder=None,
+ account=None,
+ source_message_id=None,
+):
+ """Create an Odysseus email compose document for user review. Does not send."""
+ from core.database import SessionLocal, Document, DocumentVersion
+ try:
+ from src.event_bus import fire_event
+ except Exception:
+ fire_event = None
+
+ cfg = _load_config(account) if account else _load_config(None)
+ content = _build_email_document_content(
+ to,
+ subject,
+ body,
+ cc=cc,
+ bcc=bcc,
+ in_reply_to=in_reply_to,
+ references=references,
+ source_uid=source_uid,
+ source_folder=source_folder,
+ )
+ doc_id = str(uuid.uuid4())
+ ver_id = str(uuid.uuid4())
+ doc_title = (title or subject or "Email draft").strip() or "Email draft"
+ doc_owner = _default_document_owner()
+
+ db = SessionLocal()
+ try:
+ if source_uid and source_folder:
+ existing = (
+ db.query(Document)
+ .filter(Document.is_active == True)
+ .filter(Document.language == "email")
+ .filter(Document.owner == doc_owner)
+ .filter(Document.source_email_uid == str(source_uid))
+ .filter(Document.source_email_folder == source_folder)
+ .order_by(Document.updated_at.desc())
+ .first()
+ )
+ if existing and "\n---\n" in (existing.current_content or ""):
+ existing.current_content = _merge_email_reply_body(existing.current_content, body or "")
+ existing.version_count = (existing.version_count or 0) + 1
+ ver = DocumentVersion(
+ id=ver_id,
+ document_id=existing.id,
+ version_number=existing.version_count,
+ content=existing.current_content,
+ summary="Updated by email MCP draft tool",
+ source="ai",
+ )
+ db.add(ver)
+ db.commit()
+ if fire_event:
+ try:
+ fire_event("document_updated", doc_owner)
+ except Exception:
+ pass
+ return {
+ "draft": True,
+ "updated": True,
+ "doc_id": existing.id,
+ "title": existing.title,
+ "language": existing.language,
+ "account": cfg.get("account_name"),
+ "account_id": cfg.get("account_id"),
+ "to": to,
+ "subject": subject,
+ }
+
+ doc = Document(
+ id=doc_id,
+ session_id=None,
+ title=doc_title,
+ language="email",
+ current_content=content,
+ version_count=1,
+ is_active=True,
+ owner=doc_owner,
+ source_email_uid=source_uid,
+ source_email_folder=source_folder,
+ source_email_account_id=cfg.get("account_id"),
+ source_email_message_id=source_message_id,
+ )
+ ver = DocumentVersion(
+ id=ver_id,
+ document_id=doc_id,
+ version_number=1,
+ content=content,
+ summary="Created by email MCP draft tool",
+ source="ai",
+ )
+ db.add(doc)
+ db.add(ver)
+ db.commit()
+ if fire_event:
+ try:
+ fire_event("document_created", doc_owner)
+ except Exception:
+ pass
+ return {
+ "draft": True,
+ "doc_id": doc_id,
+ "title": doc_title,
+ "language": "email",
+ "account": cfg.get("account_name"),
+ "account_id": cfg.get("account_id"),
+ "to": to,
+ "subject": subject,
+ }
+ finally:
+ db.close()
+
+
+def _draft_reply_to_email(uid, body, folder="INBOX", reply_all=False, account=None, title=None):
+ """Create a threaded Odysseus reply draft document. Does not send."""
+ conn = _imap_connect(account)
+ conn.select(folder, readonly=True)
+ status, msg_data = conn.uid("FETCH", _b(uid), "(RFC822)")
+ conn.logout()
+ if status != "OK" or not msg_data or not msg_data[0]:
+ return {"error": f"Failed to fetch email UID {uid}"}
+ raw = msg_data[0][1]
+ orig = email.message_from_bytes(raw)
+
+ orig_subject = _decode_header(orig.get("Subject", ""))
+ reply_subject = orig_subject if orig_subject.lower().startswith("re:") else f"Re: {orig_subject}"
+ orig_message_id = orig.get("Message-ID", "")
+ orig_references = orig.get("References", "")
+ new_references = (orig_references + " " + orig_message_id).strip() if orig_references else orig_message_id
+
+ sender = _decode_header(orig.get("From", ""))
+ _, sender_addr = email.utils.parseaddr(sender)
+ to_addrs = sender_addr
+
+ cc = None
+ if reply_all:
+ cc_addrs = []
+ cfg = _load_config(account)
+ own_addrs = {
+ (cfg.get("imap_user") or "").strip().lower(),
+ (cfg.get("from_address") or "").strip().lower(),
+ }
+ for header_name in ("To", "Cc"):
+ for _, addr in email.utils.getaddresses([orig.get(header_name, "")]):
+ addr_l = (addr or "").strip().lower()
+ if addr and addr != sender_addr and addr_l not in own_addrs:
+ cc_addrs.append(addr)
+ if cc_addrs:
+ cc = ", ".join(dict.fromkeys(cc_addrs))
+
+ return _create_email_draft_document(
+ to=to_addrs,
+ subject=reply_subject,
+ body=body,
+ title=title or reply_subject,
+ cc=cc,
+ in_reply_to=orig_message_id,
+ references=new_references,
+ source_uid=uid,
+ source_folder=folder,
+ account=account,
+ source_message_id=orig_message_id,
+ )
+
+
+async def _ai_draft_reply_to_email(uid, folder="INBOX", reply_all=False, account=None, title=None):
+ """Generate a reply with Odysseus' AI-reply prompt/style, then create a compose doc."""
+ read_result = _read_email(uid=uid, folder=folder, account=account)
+ if "error" in read_result:
+ return read_result
+
+ to_addr = read_result.get("from_address") or email.utils.parseaddr(read_result.get("from") or "")[1]
+ subject = read_result.get("subject") or ""
+ reply_subject = subject if subject.lower().startswith("re:") else f"Re: {subject}"
+ original_body = read_result.get("body") or ""
+ message_id = read_result.get("message_id") or ""
+
+ if not original_body.strip():
+ return {"error": "No email body available for AI reply"}
+
+ try:
+ from routes.email_helpers import (
+ _EMAIL_REPLY_SYS_PROMPT_BASE,
+ _apply_email_style_mechanics,
+ _extract_reply,
+ _load_settings,
+ )
+ from src.endpoint_resolver import (
+ resolve_endpoint,
+ resolve_utility_fallback_candidates,
+ resolve_chat_fallback_candidates,
+ )
+ from src.llm_core import llm_call_async_with_fallback
+ except Exception as exc:
+ return {"error": f"AI reply helpers unavailable: {exc}"}
+
+ settings = _load_settings()
+ style = settings.get("email_writing_style", "")
+ system_prompt = _EMAIL_REPLY_SYS_PROMPT_BASE
+ if style:
+ system_prompt += f"\n\nWRITING STYLE TO MATCH:\n{style}"
+
+ user_msg = (
+ f"Recipient: {to_addr}\nSubject: {reply_subject}\n\n"
+ f"Original email and any current draft:\n{original_body[:6000]}\n\n"
+ "Draft a reply. Return only the reply body text."
+ )
+
+ candidates = []
+ seen = set()
+
+ def _add(url, model, headers):
+ key = (url or "", model or "")
+ if not url or not model or key in seen:
+ return
+ seen.add(key)
+ candidates.append((url, model, headers))
+
+ try:
+ _add(*resolve_endpoint("utility", owner=None))
+ except Exception:
+ pass
+ try:
+ _add(*resolve_endpoint("default", owner=None))
+ except Exception:
+ pass
+ try:
+ utility_fallbacks = resolve_utility_fallback_candidates(owner=None) or []
+ except TypeError:
+ utility_fallbacks = resolve_utility_fallback_candidates() or []
+ for cand in utility_fallbacks:
+ _add(*cand)
+ try:
+ chat_fallbacks = resolve_chat_fallback_candidates(owner=None) or []
+ except TypeError:
+ chat_fallbacks = resolve_chat_fallback_candidates() or []
+ for cand in chat_fallbacks:
+ _add(*cand)
+
+ if not candidates:
+ return {"error": "No LLM endpoint configured for AI reply"}
+
+ try:
+ raw_reply = await llm_call_async_with_fallback(
+ candidates,
+ messages=[
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": user_msg},
+ ],
+ temperature=0.7,
+ max_tokens=1024,
+ timeout=60,
+ )
+ except Exception as exc:
+ return {"error": f"AI reply generation failed: {exc}"}
+
+ reply = _apply_email_style_mechanics(_extract_reply(raw_reply or ""))
+ if not reply:
+ return {"error": "AI reply generation returned an empty response"}
+
+ return _draft_reply_to_email(
+ uid=uid,
+ body=reply,
+ folder=folder,
+ reply_all=reply_all,
+ account=account,
+ title=title or reply_subject,
+ )
+
+
def _reply_to_email(uid, body, folder="INBOX", reply_all=False, account=None):
"""Reply to an existing email by UID. Threads via In-Reply-To/References."""
conn = None
@@ -1189,6 +1577,8 @@ async def list_tools() -> list[Tool]:
name="send_email",
description=(
"Send a new email via SMTP. Provide recipient(s), subject, and body. "
+ "This sends immediately; for normal assistant-written email, prefer "
+ "draft_email so the user can review and send from Odysseus. "
"For replying to an existing thread, use reply_to_email instead. "
"Pass `account` to send from a non-default mailbox."
),
@@ -1205,10 +1595,35 @@ async def list_tools() -> list[Tool]:
"required": ["to", "subject", "body"],
},
),
+ Tool(
+ name="draft_email",
+ description=(
+ "Create a new Odysseus email compose draft document. This DOES NOT send. "
+ "Use this as the default way to write an email for the user: it opens "
+ "a reviewable email document with To/Cc/Bcc/Subject/body, and the user "
+ "can edit or press Send in Odysseus. "
+ f"{_writing_style_guidance()}"
+ ),
+ inputSchema={
+ "type": "object",
+ "properties": {
+ "to": {"type": "string", "description": "Recipient email address(es), comma-separated"},
+ "subject": {"type": "string", "description": "Email subject line"},
+ "body": {"type": "string", "description": "Draft body"},
+ "cc": {"type": "string", "description": "CC address(es), comma-separated (optional)"},
+ "bcc": {"type": "string", "description": "BCC address(es), comma-separated (optional)"},
+ "title": {"type": "string", "description": "Optional Odysseus document title"},
+ **ACCOUNT_PROP,
+ },
+ "required": ["to", "subject", "body"],
+ },
+ ),
Tool(
name="reply_to_email",
description=(
- "Reply to an existing email by UID. Automatically threads the reply with "
+ "Reply to an existing email by UID. This sends immediately; for normal "
+ "assistant-written replies, prefer draft_email_reply so the user can "
+ "review and send from Odysseus. Automatically threads the reply with "
"In-Reply-To and References headers, prefixes 'Re:' on the subject, and "
"uses the original sender as the recipient. Set reply_all=true to also CC "
"the original To/Cc recipients. For follow-up 'reply ...' requests, use "
@@ -1226,6 +1641,49 @@ async def list_tools() -> list[Tool]:
"required": ["uid", "body"],
},
),
+ Tool(
+ name="draft_email_reply",
+ description=(
+ "Create an Odysseus email reply draft document for an existing email UID. "
+ "This DOES NOT send. It threads the draft with In-Reply-To/References, "
+ "prefills the recipient and subject, and stores source email metadata so "
+ "the user can review and send from the normal email composer. "
+ f"{_writing_style_guidance()}"
+ ),
+ inputSchema={
+ "type": "object",
+ "properties": {
+ "uid": {"type": "string", "description": "Exact Email UID from list_emails/read_email; never invent UID 1"},
+ "body": {"type": "string", "description": "Draft reply body text"},
+ "folder": {"type": "string", "description": "IMAP folder (default: INBOX)", "default": "INBOX"},
+ "reply_all": {"type": "boolean", "description": "Reply to all recipients (default: false)", "default": False},
+ "title": {"type": "string", "description": "Optional Odysseus document title"},
+ **ACCOUNT_PROP,
+ },
+ "required": ["uid", "body"],
+ },
+ ),
+ Tool(
+ name="ai_draft_email_reply",
+ description=(
+ "Generate an AI reply using Odysseus' existing AI Reply behavior, "
+ "including Settings > Email > Writing Style, then create an email "
+ "compose document for review. This DOES NOT send and does NOT save "
+ "to the mailbox Drafts folder. Use this when the user asks you to "
+ "write or draft a reply to an email without dictating the exact body."
+ ),
+ inputSchema={
+ "type": "object",
+ "properties": {
+ "uid": {"type": "string", "description": "Exact Email UID from list_emails/read_email; never invent UID 1"},
+ "folder": {"type": "string", "description": "IMAP folder (default: INBOX)", "default": "INBOX"},
+ "reply_all": {"type": "boolean", "description": "Reply to all recipients (default: false)", "default": False},
+ "title": {"type": "string", "description": "Optional Odysseus document title"},
+ **ACCOUNT_PROP,
+ },
+ "required": ["uid"],
+ },
+ ),
Tool(
name="archive_email",
description="Move an email out of the inbox into the Archive folder. Use after handling an email you want to keep but no longer need in the inbox.",
@@ -1552,6 +2010,31 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]:
acct_note = f" (from {result['account']})" if result.get("account") else ""
return [TextContent(type="text", text=f"Sent email to {result['to']} with subject '{result['subject']}'{acct_note}.")]
+ elif name == "draft_email":
+ to = arguments.get("to")
+ subject = arguments.get("subject")
+ body = arguments.get("body")
+ if not to or not subject or body is None:
+ return [TextContent(type="text", text="Error: to, subject, and body are required")]
+ result = _create_email_draft_document(
+ to=to,
+ subject=subject,
+ body=body,
+ title=arguments.get("title"),
+ cc=arguments.get("cc"),
+ bcc=arguments.get("bcc"),
+ account=acct,
+ )
+ acct_note = f" from {result['account']}" if result.get("account") else ""
+ return [TextContent(
+ type="text",
+ text=(
+ f"Created Odysseus email draft `{result['title']}` "
+ f"(document ID: {result['doc_id']}){acct_note}. "
+ "It has not been sent; open the document in Odysseus to review and send."
+ ),
+ )]
+
elif name == "reply_to_email":
uid = arguments.get("uid")
body = arguments.get("body")
@@ -1573,6 +2056,54 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]:
pass
return [TextContent(type="text", text=f"Replied to UID {uid}: '{result['subject']}' → {result['to']}")]
+ elif name == "draft_email_reply":
+ uid = arguments.get("uid")
+ body = arguments.get("body")
+ if not uid or body is None:
+ return [TextContent(type="text", text="Error: uid and body are required")]
+ result = _draft_reply_to_email(
+ uid=uid,
+ body=body,
+ folder=arguments.get("folder", "INBOX"),
+ reply_all=bool(arguments.get("reply_all", False)),
+ account=acct,
+ title=arguments.get("title"),
+ )
+ if "error" in result:
+ return [TextContent(type="text", text=f"Error: {result['error']}")]
+ acct_note = f" from {result['account']}" if result.get("account") else ""
+ return [TextContent(
+ type="text",
+ text=(
+ f"Created Odysseus reply draft `{result['title']}` for UID {uid} "
+ f"(document ID: {result['doc_id']}){acct_note}. "
+ "It has not been sent; open the document in Odysseus to review and send."
+ ),
+ )]
+
+ elif name == "ai_draft_email_reply":
+ uid = arguments.get("uid")
+ if not uid:
+ return [TextContent(type="text", text="Error: uid is required")]
+ result = await _ai_draft_reply_to_email(
+ uid=uid,
+ folder=arguments.get("folder", "INBOX"),
+ reply_all=bool(arguments.get("reply_all", False)),
+ account=acct,
+ title=arguments.get("title"),
+ )
+ if "error" in result:
+ return [TextContent(type="text", text=f"Error: {result['error']}")]
+ acct_note = f" from {result['account']}" if result.get("account") else ""
+ return [TextContent(
+ type="text",
+ text=(
+ f"Generated AI reply and created Odysseus compose draft "
+ f"`{result['title']}` for UID {uid} (document ID: {result['doc_id']}){acct_note}. "
+ "It has not been sent; open the document in Odysseus to review and send."
+ ),
+ )]
+
elif name == "archive_email":
uid = arguments.get("uid")
if not uid:
diff --git a/routes/api_token_routes.py b/routes/api_token_routes.py
index 97c576d15..05806e420 100644
--- a/routes/api_token_routes.py
+++ b/routes/api_token_routes.py
@@ -25,6 +25,8 @@ ALLOWED_SCOPES = {
"calendar:write",
"memory:read",
"memory:write",
+ "cookbook:read",
+ "cookbook:launch",
}
TOKEN_PROFILES = {
"chat": ["chat"],
diff --git a/routes/cookbook_helpers.py b/routes/cookbook_helpers.py
index 39a18f715..a450278be 100644
--- a/routes/cookbook_helpers.py
+++ b/routes/cookbook_helpers.py
@@ -30,8 +30,9 @@ _LOCAL_MODEL_ID_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*$")
_OLLAMA_MODEL_ID_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._:/-]{0,200}$")
# Include pattern is a glob: allow typical safe glyphs only.
_INCLUDE_RE = re.compile(r"^[A-Za-z0-9._\-*?/\[\]]+$")
-# Remote host: user@host (optionally with :port-free hostname parts).
-_REMOTE_HOST_RE = re.compile(r"^[A-Za-z0-9._-]+@[A-Za-z0-9._-]+$")
+# Remote host: either `user@host` or plain `host` (alias is allowed), where host
+# is a safe DNS-like token or a short SSH config alias.
+_REMOTE_HOST_RE = re.compile(r"^(?:[A-Za-z0-9._-]+@)?[A-Za-z0-9._-]+$")
# HF tokens and API tokens are url-safe base64-like.
_TOKEN_RE = re.compile(r"^[A-Za-z0-9._~+/=-]+$")
# Session IDs we mint look like "cookbook-deadbeef" or "serve-deadbeef".
@@ -81,7 +82,7 @@ def _validate_remote_host(v: str | None) -> str | None:
if v is None or v == "":
return None
if not _REMOTE_HOST_RE.match(v):
- raise HTTPException(400, "Invalid remote_host — must be user@host, no SSH option syntax")
+ raise HTTPException(400, "Invalid remote_host — must be host or user@host, no SSH option syntax")
return v
@@ -787,6 +788,7 @@ def _llama_cpp_rebuild_cmd() -> str:
class ModelDownloadRequest(BaseModel):
repo_id: str
+ backend: str | None = None # "hf" (default) or "ollama"
include: str | None = None # glob pattern e.g. "*Q4_K_M*"
hf_token: str | None = None
env_prefix: str | None = None # e.g. "source ~/venv/bin/activate"
diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py
index 7a1ee85c6..ba950f4b7 100644
--- a/routes/cookbook_routes.py
+++ b/routes/cookbook_routes.py
@@ -15,26 +15,19 @@ from pathlib import Path
from fastapi import APIRouter, HTTPException, Request, Depends
from src.auth_helpers import require_user
-from src.constants import COOKBOOK_STATE_FILE
from pydantic import BaseModel
from core.middleware import require_admin
from core.platform_compat import (
IS_WINDOWS,
- SSH_PATH_OVERRIDE,
- NVIDIA_PATH_CANDIDATES,
detached_popen_kwargs,
find_bash,
- git_bash_path,
kill_process_tree,
pid_alive,
safe_chmod,
which_tool,
- translate_path,
- get_wsl_windows_user_profile,
)
from routes.shell_routes import TMUX_LOG_DIR
-from src.constants import COOKBOOK_STATE_FILE
logger = logging.getLogger(__name__)
@@ -45,10 +38,8 @@ from routes.cookbook_helpers import (
_ps_squote, _bash_squote, _validate_serve_cmd, _parse_serve_phase,
_safe_env_prefix, _local_tooling_path_export, _append_serve_preflight_exit_lines,
_append_serve_exit_code_lines, _append_llama_cpp_linux_accel_build_lines, _cached_model_scan_script,
- _append_vllm_linux_preflight_lines, _ollama_bind_from_cmd, _pip_install_fallback_chain,
- _pip_install_no_cache, _user_shell_path_bootstrap, _venv_safe_local_pip_install_cmd,
- _append_pip_install_runner_lines,
- _diagnose_serve_output, run_ssh_command_async,
+ _ollama_bind_from_cmd, _pip_install_fallback_chain, _pip_install_no_cache,
+ _user_shell_path_bootstrap, _venv_safe_local_pip_install_cmd,
ModelDownloadRequest, ServeRequest,
)
@@ -63,7 +54,7 @@ _HF_TOKEN_STATUS_SNIPPET = (
def setup_cookbook_routes() -> APIRouter:
router = APIRouter(tags=["cookbook"])
- _cookbook_state_path = Path(COOKBOOK_STATE_FILE)
+ _cookbook_state_path = Path(os.environ.get("DATA_DIR", "data")) / "cookbook_state.json"
def _mask_secret(value: str) -> str:
if not value:
@@ -90,6 +81,127 @@ def setup_cookbook_routes() -> APIRouter:
task["payload"].pop("hf_token", None)
return state
+ def _diagnose_serve_output(text: str) -> dict | None:
+ """Server-side mirror of the Cookbook UI's common serve diagnoses.
+
+ The browser uses cookbook-diagnosis.js for clickable fixes. This gives
+ the agent/tool path the same structured signal so it can retry with an
+ adjusted command instead of guessing from raw tmux output.
+ """
+ if not text:
+ return None
+ tail = text[-6000:]
+ patterns = [
+ (
+ r"No available memory for the cache blocks|Available KV cache memory:.*-",
+ "No GPU memory left for KV cache after loading model.",
+ [
+ {"label": "retry with GPU memory utilization 0.95", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.95"},
+ {"label": "retry with context 2048", "op": "replace", "flag": "--max-model-len", "value": "2048"},
+ ],
+ ),
+ (
+ r"CUDA out of memory|torch\.cuda\.OutOfMemoryError|CUDA error: out of memory|warming up sampler|max_num_seqs.*gpu_memory_utilization",
+ "GPU ran out of memory during startup or warmup.",
+ [
+ {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
+ {"label": "retry with GPU memory utilization 0.80", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.80"},
+ {"label": "retry with --enforce-eager", "op": "append", "arg": "--enforce-eager"},
+ ],
+ ),
+ (
+ r"not divisib|must be divisible|attention heads.*divisible",
+ "Tensor parallel size is incompatible with the model.",
+ [
+ {"label": "retry with tensor parallel size 1", "op": "replace", "flag": "--tensor-parallel-size", "value": "1"},
+ {"label": "retry with tensor parallel size 2", "op": "replace", "flag": "--tensor-parallel-size", "value": "2"},
+ ],
+ ),
+ (
+ r"KV cache.*too (small|large)|max_model_len.*exceeds|maximum.*context",
+ "Context length is too large for available GPU memory.",
+ [
+ {"label": "retry with context 8192", "op": "replace", "flag": "--max-model-len", "value": "8192"},
+ {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
+ ],
+ ),
+ (
+ r"enable-auto-tool-choice requires --tool-call-parser",
+ "Auto tool choice requires an explicit tool call parser.",
+ [{"label": "retry with Hermes tool parser", "op": "append", "arg": "--tool-call-parser hermes"}],
+ ),
+ (
+ r"Please pass.*trust.remote.code=True|contains custom code which must be executed to correctly load|does not recognize this architecture|model type.*but Transformers does not",
+ "Model requires custom code or newer model support.",
+ [{"label": "retry with --trust-remote-code", "op": "append", "arg": "--trust-remote-code"}],
+ ),
+ (
+ r"Either a revision or a version must be specified|transformers\.integrations\.hub_kernels|kernels/layer",
+ "vLLM/Transformers kernel package mismatch.",
+ [{"label": "update vLLM, Transformers, and kernels on this server", "op": "dependency", "package": "vllm transformers kernels"}],
+ ),
+ (
+ r"Address already in use|bind.*address.*in use",
+ "Port is already in use.",
+ [{"label": "retry on port 8001", "op": "replace", "flag": "--port", "value": "8001"}],
+ ),
+ (
+ r"No CUDA GPUs are available|no GPU.*found|CUDA_VISIBLE_DEVICES.*invalid",
+ "No GPUs are visible to the serve process.",
+ [{"label": "clear Cookbook GPU selection or choose available GPUs", "op": "settings", "field": "gpus", "value": ""}],
+ ),
+ (
+ r"Failed to infer device type|NVML Shared Library Not Found|No module named 'amdsmi'|platform is not available",
+ "vLLM could not find a supported GPU (CUDA or ROCm). "
+ "This machine may have integrated or unsupported graphics only.",
+ [
+ {"label": "switch to llama.cpp (CPU/Metal, works without a discrete GPU)", "op": "manual"},
+ {"label": "switch to Ollama (CPU/Metal, works without a discrete GPU)", "op": "manual"},
+ ],
+ ),
+ (
+ r"vllm.*command not found|No module named vllm|ERROR: vLLM is not installed",
+ "vLLM is not installed or not in PATH on this server.",
+ [{"label": "install vLLM in Cookbook Dependencies", "op": "dependency", "package": "vllm"}],
+ ),
+ (
+ r"sglang.*command not found|No module named sglang|SGLang is not installed",
+ "SGLang is not installed or not in PATH on this server.",
+ [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}],
+ ),
+ (
+ r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found",
+ "llama.cpp / llama-cpp-python dependencies are missing.",
+ [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}],
+ ),
+ (
+ r"No GGUF found on this host|no \.gguf file|No GGUF file found",
+ "No GGUF file found for this model on this host. The llama.cpp backend needs a .gguf file.",
+ [{"label": "download a GGUF build of this model (repo name usually ends in -GGUF, file like Q4_K_M.gguf)", "op": "manual"}],
+ ),
+ (
+ r"No module named 'torch'|No module named torch|No module named 'diffusers'|No module named diffusers",
+ "Diffusion serving requires PyTorch and diffusers.",
+ [{"label": "install diffusers[torch] in Cookbook Dependencies", "op": "dependency", "package": "diffusers[torch]"}],
+ ),
+ (
+ r"403 Forbidden|401 Unauthorized|Access to model.*is restricted|gated repo|not in the authorized list|awaiting a review",
+ "Model access is gated or unauthorized.",
+ [{"label": "set HF token and request model access on HuggingFace", "op": "manual"}],
+ ),
+ ]
+ for pattern, message, suggestions in patterns:
+ if re.search(pattern, tail, re.I):
+ return {"message": message, "suggestions": suggestions}
+ if re.search(r"Traceback \(most recent call last\)", tail, re.I) and not re.search(
+ r"Application startup complete|GET /v1/|Uvicorn running on", tail, re.I
+ ):
+ return {
+ "message": "Python traceback detected during serve startup.",
+ "suggestions": [{"label": "inspect traceback and retry with adjusted backend/settings", "op": "manual"}],
+ }
+ return None
+
def _state_for_client(state):
"""Return cookbook state without raw secrets for browser clients."""
_strip_task_secrets(state)
@@ -183,7 +295,6 @@ def setup_cookbook_routes() -> APIRouter:
safe_chmod(key_path.with_suffix(".pub"), 0o644)
return {"ok": True, "public_key": _read_cookbook_public_key()}
-
def _needs_binary(cmd: str, binary: str) -> bool:
return bool(re.search(rf"(^|[\s;&|()]){re.escape(binary)}($|[\s;&|()])", cmd or ""))
@@ -244,8 +355,8 @@ def setup_cookbook_routes() -> APIRouter:
# POSIX form + shell-quoting so drive paths / spaces survive.
inner = TMUX_LOG_DIR / f"{session_id}_run.sh"
inner.write_text("\n".join(bash_lines) + "\n", encoding="utf-8")
- lp = shlex.quote(git_bash_path(log_path))
- ip = shlex.quote(git_bash_path(inner))
+ lp = shlex.quote(log_path.as_posix())
+ ip = shlex.quote(inner.as_posix())
script_path = TMUX_LOG_DIR / f"{session_id}.sh"
script_path.write_text(
f"bash {ip} > {lp} 2>&1\n",
@@ -286,24 +397,33 @@ def setup_cookbook_routes() -> APIRouter:
require_admin(request)
# Defence-in-depth: even though this endpoint is admin-gated, refuse
# values that would land in shell contexts with metacharacters.
- _validate_repo_id(req.repo_id)
- _validate_include(req.include)
+ backend = (req.backend or "").strip().lower()
+ is_ollama_download = backend == "ollama" or ("/" not in req.repo_id and ":" in req.repo_id)
+ if is_ollama_download:
+ _validate_serve_model_id(req.repo_id)
+ req.include = None
+ req.local_dir = None
+ else:
+ _validate_repo_id(req.repo_id)
+ _validate_include(req.include)
_validate_remote_host(req.remote_host)
req.ssh_port = _validate_ssh_port(req.ssh_port)
req.local_dir = _validate_local_dir(req.local_dir)
- req.hf_token = req.hf_token or _load_stored_hf_token()
+ req.hf_token = "" if is_ollama_download else (req.hf_token or _load_stored_hf_token())
_validate_token(req.hf_token)
TMUX_LOG_DIR.mkdir(parents=True, exist_ok=True)
session_id = f"cookbook-{uuid.uuid4().hex[:8]}"
wrapper_script = TMUX_LOG_DIR / f"{session_id}.sh"
- # When a download directory is set, target a per-model subfolder under it
- # (
/) so the flat-directory cache scan lists it as its own
- # model. Without it, hf/snapshot_download falls back to the HF cache.
- _dl_short = req.repo_id.split("/")[-1] if "/" in req.repo_id else req.repo_id
- _dl_base = (req.local_dir.rstrip("/") + "/" + _dl_short) if req.local_dir else None
- _dl_shell = _shell_path(_dl_base) if _dl_base else None # for hf CLI / bash
- _dl_pyarg = (", local_dir=os.path.expanduser(" + repr(_dl_base) + ")") if _dl_base else ""
+ # Custom download dir: point the HF cache at /hub via env vars
+ # (HF_HOME + HUGGINGFACE_HUB_CACHE) instead of --local-dir. local_dir
+ # produces a flat layout (//) and the local-dir
+ # bookkeeping files (.cache/huggingface/.gitignore.lock), and it
+ # also breaks robust resume on flaky transfers — the blob-based hub
+ # cache survives SSL ReadError mid-stream by reusing .incomplete,
+ # local_dir does not. See issue #2722.
+ _dl_hf_home_shell = _shell_path(req.local_dir.rstrip("/")) if req.local_dir else None
+ _dl_pyarg = "" # snapshot_download honors the env vars too — no kwarg needed
# Build the hf download command. Redirection to suppress the interactive
# "update available? [Y/n]" prompt is added per-platform further down
@@ -311,8 +431,7 @@ def setup_cookbook_routes() -> APIRouter:
hf_cmd = f"hf download {req.repo_id}"
if req.include:
hf_cmd += f" --include '{req.include}'"
- if _dl_shell:
- hf_cmd += f" --local-dir {_dl_shell}"
+ ollama_cmd = f"ollama pull {shlex.quote(req.repo_id)}"
# Build the shell wrapper — runs hf download directly in tmux (which is a TTY)
# No script/tee needed — we'll use tmux capture-pane to read output
@@ -320,8 +439,15 @@ def setup_cookbook_routes() -> APIRouter:
lines.extend(_user_shell_path_bootstrap())
if req.hf_token:
lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'")
+ if _dl_hf_home_shell and not is_ollama_download:
+ # Make hf download / snapshot_download honor the chosen dir via the
+ # standard HF cache (gives us the models--org--name/blobs/... layout
+ # with resumable .incomplete blobs).
+ lines.append(f"export HF_HOME={_dl_hf_home_shell}")
+ lines.append(f"export HUGGINGFACE_HUB_CACHE={_dl_hf_home_shell}/hub")
+ lines.append(f"export HF_HUB_CACHE={_dl_hf_home_shell}/hub")
# Ensure pip-user scripts (e.g. hf CLI installed via --user) are on PATH
- lines.append('export PATH="$HOME/.local/bin:$PATH"')
+ lines.append('export PATH="$HOME/.local/bin:$HOME/bin:/opt/homebrew/bin:/usr/local/bin:$PATH"')
# When Odysseus runs from a venv (e.g. native macOS install), put its bin
# on PATH so the tmux shell finds the bundled `hf`/`python3` without an
# activated venv. Local bash runs only — meaningless over SSH.
@@ -332,14 +458,25 @@ def setup_cookbook_routes() -> APIRouter:
# throughput. Retries set disable_hf_transfer to fall back to the plain,
# slower-but-reliable downloader (resumes cleanly from the .incomplete files).
# Use `python3 -m pip` not `pip` — macOS has no bare `pip` command.
- lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', upgrade=True)}")
- if req.disable_hf_transfer:
- lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
- lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
+ if is_ollama_download:
+ lines.append('if command -v ollama >/dev/null 2>&1; then')
+ lines.append(f' ODYSSEUS_OLLAMA_PULL_CMD={shlex.quote(ollama_cmd)}')
+ lines.append('elif command -v docker >/dev/null 2>&1; then')
+ lines.append(' ODYSSEUS_OLLAMA_CONTAINER="$(docker ps --format \'{{.Names}}\' 2>/dev/null | grep -E \'^(ollama-rocm|ollama-test)$\' | head -1)"')
+ lines.append(' if [ -n "$ODYSSEUS_OLLAMA_CONTAINER" ]; then')
+ lines.append(f' ODYSSEUS_OLLAMA_PULL_CMD={shlex.quote("docker exec ${ODYSSEUS_OLLAMA_CONTAINER} " + ollama_cmd)}')
+ lines.append(' fi')
+ lines.append('fi')
+ lines.append('if [ -z "$ODYSSEUS_OLLAMA_PULL_CMD" ]; then echo "ERROR: Ollama not found on this server. Install Ollama or start an ollama-rocm/ollama-test container."; exit 127; fi')
else:
- lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer')}")
- lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
- lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
+ lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', upgrade=True)}")
+ if req.disable_hf_transfer:
+ lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
+ lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
+ else:
+ lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer')}")
+ lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
+ lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
remote = req.remote_host # None for local
is_windows = req.platform == "windows"
@@ -361,37 +498,48 @@ def setup_cookbook_routes() -> APIRouter:
ps_lines = []
ps_lines.append('$sessionDir = "$env:TEMP\\odysseus-sessions"')
ps_lines.append('New-Item -ItemType Directory -Force -Path $sessionDir | Out-Null')
- ps_lines.append('$env:PYTHONIOENCODING = "utf-8"')
- ps_lines.append('$env:PYTHONUTF8 = "1"')
if req.hf_token:
ps_lines.append(f"$env:HF_TOKEN = '{_ps_squote(req.hf_token)}'")
+ if req.local_dir and not is_ollama_download:
+ # Mirror the bash branch — point the HF cache at the user's dir
+ # via env vars instead of --local-dir, so resume works on flaky
+ # transfers (issue #2722).
+ _dl_ps = _ps_squote(req.local_dir.rstrip("/"))
+ ps_lines.append(f"$env:HF_HOME = '{_dl_ps}'")
+ ps_lines.append(f"$env:HUGGINGFACE_HUB_CACHE = '{_dl_ps}/hub'")
+ ps_lines.append(f"$env:HF_HUB_CACHE = '{_dl_ps}/hub'")
if req.env_prefix:
ps_lines.append(_safe_env_prefix(req.env_prefix))
- # Try hf CLI, fall back to Python huggingface_hub, then auto-install
- ps_lines.append('try {{')
- ps_lines.append(' $hfPath = Get-Command hf -ErrorAction SilentlyContinue')
- ps_lines.append(' if ($hfPath) {{')
- # Pipe $null to stdin to suppress interactive "update available? [Y/n]" prompt
- ps_lines.append(f' $null | {hf_cmd}')
- ps_lines.append(' }} else {{')
- ps_lines.append(' python -c "import huggingface_hub" 2>$null')
- ps_lines.append(' if ($LASTEXITCODE -eq 0) {{')
- ps_lines.append(' Write-Host "hf CLI not found, using Python huggingface_hub..."')
- ps_lines.append(' python -m pip install -q hf_transfer 2>$null')
- ps_lines.append(' $env:HF_HUB_ENABLE_HF_TRANSFER = "1"')
- ps_lines.append(f" python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers=8)\"")
- ps_lines.append(' }} else {{')
- ps_lines.append(' Write-Host "Installing huggingface-hub..."')
- ps_lines.append(' python -m pip install -q huggingface-hub hf_transfer')
- ps_lines.append(' $env:HF_HUB_ENABLE_HF_TRANSFER = "1"')
- ps_lines.append(f" python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers=8)\"")
- ps_lines.append(' }}')
- ps_lines.append(' }}')
- ps_lines.append(' if ($LASTEXITCODE -eq 0) {{ Write-Host ""; Write-Host "DOWNLOAD_OK" }}')
- ps_lines.append(' else {{ Write-Host ""; Write-Host "DOWNLOAD_FAILED (exit $LASTEXITCODE)" }}')
- ps_lines.append('}} catch {{')
- ps_lines.append(' Write-Host ""; Write-Host "DOWNLOAD_FAILED ($_)"')
- ps_lines.append('}}')
+ if is_ollama_download:
+ ps_lines.append('if (-not (Get-Command ollama -ErrorAction SilentlyContinue)) { Write-Host "ERROR: Ollama not found. Install from https://ollama.com/download/windows"; exit 127 }')
+ ps_lines.append(f"$null | ollama pull '{_ps_squote(req.repo_id)}'")
+ ps_lines.append('if ($LASTEXITCODE -eq 0) { Write-Host ""; Write-Host "DOWNLOAD_OK" } else { Write-Host ""; Write-Host "DOWNLOAD_FAILED (exit $LASTEXITCODE)" }')
+ else:
+ # Try hf CLI, fall back to Python huggingface_hub, then auto-install
+ ps_lines.append('try {{')
+ ps_lines.append(' $hfPath = Get-Command hf -ErrorAction SilentlyContinue')
+ ps_lines.append(' if ($hfPath) {{')
+ # Pipe $null to stdin to suppress interactive "update available? [Y/n]" prompt
+ ps_lines.append(f' $null | {hf_cmd}')
+ ps_lines.append(' }} else {{')
+ ps_lines.append(' python -c "import huggingface_hub" 2>$null')
+ ps_lines.append(' if ($LASTEXITCODE -eq 0) {{')
+ ps_lines.append(' Write-Host "hf CLI not found, using Python huggingface_hub..."')
+ ps_lines.append(' python -m pip install -q hf_transfer 2>$null')
+ ps_lines.append(' $env:HF_HUB_ENABLE_HF_TRANSFER = "1"')
+ ps_lines.append(f" python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers=8)\"")
+ ps_lines.append(' }} else {{')
+ ps_lines.append(' Write-Host "Installing huggingface-hub..."')
+ ps_lines.append(' python -m pip install -q huggingface-hub hf_transfer')
+ ps_lines.append(' $env:HF_HUB_ENABLE_HF_TRANSFER = "1"')
+ ps_lines.append(f" python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers=8)\"")
+ ps_lines.append(' }}')
+ ps_lines.append(' }}')
+ ps_lines.append(' if ($LASTEXITCODE -eq 0) {{ Write-Host ""; Write-Host "DOWNLOAD_OK" }}')
+ ps_lines.append(' else {{ Write-Host ""; Write-Host "DOWNLOAD_FAILED (exit $LASTEXITCODE)" }}')
+ ps_lines.append('}} catch {{')
+ ps_lines.append(' Write-Host ""; Write-Host "DOWNLOAD_FAILED ($_)"')
+ ps_lines.append('}}')
ps_lines.append(f'Remove-Item -Force "$HOME\\{remote_runner}" -ErrorAction SilentlyContinue')
runner_path = TMUX_LOG_DIR / f"{session_id}_run.ps1"
runner_path.write_text("\r\n".join(ps_lines) + "\r\n", encoding="utf-8")
@@ -422,6 +570,10 @@ def setup_cookbook_routes() -> APIRouter:
runner_lines.append("deactivate 2>/dev/null; hash -r")
if req.hf_token:
runner_lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'")
+ if _dl_hf_home_shell and not is_ollama_download:
+ runner_lines.append(f"export HF_HOME={_dl_hf_home_shell}")
+ runner_lines.append(f"export HUGGINGFACE_HUB_CACHE={_dl_hf_home_shell}/hub")
+ runner_lines.append(f"export HF_HUB_CACHE={_dl_hf_home_shell}/hub")
if req.env_prefix:
runner_lines.append(_safe_env_prefix(req.env_prefix))
else:
@@ -432,42 +584,67 @@ def setup_cookbook_routes() -> APIRouter:
'done'
)
# Ensure pip-user scripts (e.g. hf CLI installed via --user) are on PATH
- runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
+ runner_lines.append('export PATH="$HOME/.local/bin:$HOME/bin:/opt/homebrew/bin:/usr/local/bin:$PATH"')
# Install hf CLI + optional hf_transfer best-effort. Retries disable
# hf_transfer because the Rust parallel path is fast but has been
# flaky near the end of very large multi-file downloads.
- # The helper tries active pip first, then guarded user-site fallbacks.
- runner_lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', python_cmd='pip', upgrade=True)}")
- if req.disable_hf_transfer:
- runner_lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
- runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
+ # Use --break-system-packages on PEP-668 systems (Arch, newer Debian) so it doesn't bail.
+ if is_ollama_download:
+ runner_lines.append('if command -v ollama >/dev/null 2>&1; then')
+ runner_lines.append(f' ODYSSEUS_OLLAMA_PULL_CMD={shlex.quote(ollama_cmd)}')
+ runner_lines.append('elif command -v docker >/dev/null 2>&1; then')
+ runner_lines.append(' ODYSSEUS_OLLAMA_CONTAINER="$(docker ps --format \'{{.Names}}\' 2>/dev/null | grep -E \'^(ollama-rocm|ollama-test)$\' | head -1)"')
+ runner_lines.append(' if [ -n "$ODYSSEUS_OLLAMA_CONTAINER" ]; then')
+ runner_lines.append(f' ODYSSEUS_OLLAMA_PULL_CMD={shlex.quote("docker exec ${ODYSSEUS_OLLAMA_CONTAINER} " + ollama_cmd)}')
+ runner_lines.append(' fi')
+ runner_lines.append('fi')
+ runner_lines.append('if [ -z "$ODYSSEUS_OLLAMA_PULL_CMD" ]; then echo "ERROR: Ollama not found on this server. Install Ollama or start an ollama-rocm/ollama-test container."; exit 127; fi')
else:
- runner_lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer', python_cmd='pip')}")
- runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
- runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
- # Surface whether the HF token actually reached THIS server, so a gated
- # download's "not authorized" failure can be told apart from a missing
- # token (the token is masked — we only print applied / not-set).
- runner_lines.append(_HF_TOKEN_STATUS_SNIPPET)
- # Try hf CLI first, fall back to Python huggingface_hub, then auto-install
- runner_lines.append('if command -v hf &>/dev/null; then')
- # < /dev/null suppresses interactive "update available? [Y/n]" prompt
- runner_lines.append(f' {hf_cmd} < /dev/null')
- runner_lines.append('elif python3 -c "import huggingface_hub" 2>/dev/null; then')
- runner_lines.append(' echo "hf CLI not found, using Python huggingface_hub..."')
- runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={4 if req.disable_hf_transfer else 8})"')
- runner_lines.append('else')
- runner_lines.append(' echo "Installing huggingface-hub and dependencies..."')
- runner_lines.append(' pip install --no-deps -q huggingface-hub 2>/dev/null')
- if req.disable_hf_transfer:
- runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests 2>/dev/null')
- runner_lines.append(' export HF_HUB_ENABLE_HF_TRANSFER=0')
+ runner_lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', python_cmd='pip', upgrade=True)}")
+ if req.disable_hf_transfer:
+ runner_lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
+ runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
+ else:
+ runner_lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer', python_cmd='pip')}")
+ runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
+ runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
+ # Surface whether the HF token actually reached THIS server, so a gated
+ # download's "not authorized" failure can be told apart from a missing
+ # token (the token is masked — we only print applied / not-set).
+ runner_lines.append(_HF_TOKEN_STATUS_SNIPPET)
+ # Wrap the download in a retry loop. Large HF/Ollama transfers can
+ # hit transient network failures; both backends resume cached partials.
+ mw = 4 if req.disable_hf_transfer else 8
+ runner_lines.append('_max_retries=10; _attempt=0; _ec=0')
+ runner_lines.append('while [ $_attempt -lt $_max_retries ]; do')
+ runner_lines.append(' _attempt=$((_attempt+1))')
+ if is_ollama_download:
+ runner_lines.append(' eval "$ODYSSEUS_OLLAMA_PULL_CMD" < /dev/null')
else:
- runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests hf_transfer 2>/dev/null')
- runner_lines.append(" python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
- runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={4 if req.disable_hf_transfer else 8})"')
- runner_lines.append('fi')
- runner_lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
+ runner_lines.append(' if command -v hf &>/dev/null; then')
+ runner_lines.append(f' {hf_cmd} < /dev/null')
+ runner_lines.append(' elif python3 -c "import huggingface_hub" 2>/dev/null; then')
+ runner_lines.append(' [ $_attempt -eq 1 ] && echo "hf CLI not found, using Python huggingface_hub..."')
+ runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={mw})"')
+ runner_lines.append(' else')
+ runner_lines.append(' echo "Installing huggingface-hub and dependencies..."')
+ runner_lines.append(' pip install --no-deps -q huggingface-hub 2>/dev/null')
+ if req.disable_hf_transfer:
+ runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests 2>/dev/null')
+ runner_lines.append(' export HF_HUB_ENABLE_HF_TRANSFER=0')
+ else:
+ runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests hf_transfer 2>/dev/null')
+ runner_lines.append(" python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
+ runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={mw})"')
+ runner_lines.append(' fi')
+ runner_lines.append(' _ec=$?')
+ runner_lines.append(' if [ $_ec -eq 0 ]; then break; fi')
+ runner_lines.append(' if [ $_attempt -lt $_max_retries ]; then')
+ runner_lines.append(' echo ""; echo "Download attempt $_attempt failed (exit $_ec) — retrying in 30s..."')
+ runner_lines.append(' sleep 30')
+ runner_lines.append(' fi')
+ runner_lines.append('done')
+ runner_lines.append('if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec after $_attempt attempts)"; fi')
runner_lines.append(f"rm -f {remote_runner}")
runner_lines.append('exec "${SHELL:-/bin/bash}"')
runner_path = TMUX_LOG_DIR / f"{session_id}_run.sh"
@@ -493,23 +670,30 @@ def setup_cookbook_routes() -> APIRouter:
lines.append("deactivate 2>/dev/null; hash -r")
# Show whether the HF token reached this run (masked) — tells a gated
# "not authorized" failure apart from a missing token.
- lines.append(_HF_TOKEN_STATUS_SNIPPET)
- if IS_WINDOWS:
- # Detached path: no controlling TTY, so skip `< /dev/null`
- # (handled by Popen stdin=DEVNULL) and don't keep a shell open.
- lines.append(hf_cmd)
- lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
- else:
- # < /dev/null suppresses interactive "update available? [Y/n]" prompt
- lines.append(f"{hf_cmd} < /dev/null")
- lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
+ if not is_ollama_download:
+ lines.append(_HF_TOKEN_STATUS_SNIPPET)
+ # Retry loop — same rationale as the remote-bash path. Issue #2722.
+ _hf_invoke = 'eval "$ODYSSEUS_OLLAMA_PULL_CMD" < /dev/null' if is_ollama_download else (hf_cmd if IS_WINDOWS else f"{hf_cmd} < /dev/null")
+ lines.append('_max_retries=10; _attempt=0; _ec=0')
+ lines.append('while [ $_attempt -lt $_max_retries ]; do')
+ lines.append(' _attempt=$((_attempt+1))')
+ lines.append(f' {_hf_invoke}')
+ lines.append(' _ec=$?')
+ lines.append(' if [ $_ec -eq 0 ]; then break; fi')
+ lines.append(' if [ $_attempt -lt $_max_retries ]; then')
+ lines.append(' echo ""; echo "Download attempt $_attempt failed (exit $_ec) — retrying in 30s..."')
+ lines.append(' sleep 30')
+ lines.append(' fi')
+ lines.append('done')
+ lines.append('if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec after $_attempt attempts)"; fi')
+ if not IS_WINDOWS:
lines.append(f"rm -f '{wrapper_script}'")
lines.append('exec "${SHELL:-/bin/bash}"')
wrapper_script.write_text("\n".join(lines) + "\n", encoding="utf-8")
wrapper_script.chmod(0o755)
setup_cmd = None if IS_WINDOWS else f"tmux new-session -d -s {session_id} {shlex.quote(str(wrapper_script))}"
- logger.info(f"Model download: {req.repo_id} (include={req.include}, session={session_id}, remote={remote})")
+ logger.info(f"Model download: {req.repo_id} (backend={'ollama' if is_ollama_download else 'hf'}, include={req.include}, session={session_id}, remote={remote})")
logger.info(f"Download setup_cmd: {setup_cmd}")
if setup_cmd is None:
@@ -564,35 +748,24 @@ def setup_cookbook_routes() -> APIRouter:
for d in model_dir.split(','):
d = d.strip()
if d:
- translated_d = translate_path(d) if not host else d
- model_dirs.append(translated_d)
- win_hf_hub = None
- if not host:
- win_profile = get_wsl_windows_user_profile()
- win_hf_hub = os.path.join(win_profile, ".cache", "huggingface", "hub") if win_profile else None
-
- paths_code = _cached_model_scan_script(model_dirs, win_hf_hub)
+ model_dirs.append(d)
+ paths_code = _cached_model_scan_script(model_dirs)
scan_py = TMUX_LOG_DIR / "scan_cache.py"
scan_py.write_text(paths_code, encoding="utf-8")
- scan_payload = scan_py.read_bytes()
if host:
+ _pf = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else ""
if platform == "windows":
- remote_cmd = "python -"
+ # Windows: use 'python' and pipe via stdin with double-quote wrapping
+ cmd = f'ssh {_pf}{host} "python -" < \'{scan_py}\''
else:
- # POSIX: use 'python3' if available, fall back to 'python'; throw if neither is found.
- remote_cmd = (
- "if command -v python3 >/dev/null 2>&1; then python3 -; "
- "elif command -v python >/dev/null 2>&1; then python -; "
- "else echo \"python3/python not found\" >&2; exit 127; fi"
- )
- rc, stdout_b, stderr_b = await run_ssh_command_async(
- host,
- ssh_port,
- remote_cmd,
- timeout=60,
- stdin_data=scan_payload,
+ cmd = f"ssh {_pf}{host} 'python3 -' < '{scan_py}'"
+ proc = await asyncio.create_subprocess_shell(
+ cmd,
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.PIPE,
+ cwd=str(Path.home()),
)
else:
# LOCAL scan: use sys.executable (the venv Python Odysseus is already
@@ -612,7 +785,7 @@ def setup_cookbook_routes() -> APIRouter:
stderr=asyncio.subprocess.PIPE,
cwd=str(Path.home()),
)
- stdout_b, stderr_b = await asyncio.wait_for(proc.communicate(), timeout=60)
+ stdout_b, stderr_b = await asyncio.wait_for(proc.communicate(), timeout=60)
models = []
try:
@@ -752,6 +925,100 @@ def setup_cookbook_routes() -> APIRouter:
return p
return None
+ async def _serve_crash_watchdog(
+ endpoint_id: str,
+ session_id: str,
+ remote: str | None,
+ ssh_port: str | None,
+ is_windows: bool,
+ ) -> None:
+ """Drop a freshly-registered endpoint when the cookbook serve dies early.
+
+ The runner script always emits ``=== Process exited with code N ===``
+ when the launched cmd terminates (success or failure). We poll the
+ tmux pane periodically; on a non-zero exit detected within the watch
+ window, the endpoint row is deleted so the picker doesn't keep a
+ dead model around. A zero exit (rare for a long-running serve, but
+ possible for fast-failing builds that the runner reports as code 0)
+ and "missing exit marker" both leave the endpoint alone — that's
+ the loading-but-not-yet-bound state, which the probe-marks-offline
+ logic already handles.
+
+ Times are picked to outlast realistic vLLM load times (Qwen3.5-122B
+ takes ~3 min to load) without burning resources on a stuck-forever
+ wait. After the last check, the watchdog gives up — the picker's
+ per-endpoint probe takes over from there.
+ """
+ # Cumulative wait points: 25 s, 60 s, 2 min, 5 min.
+ _waits = [25, 35, 60, 180]
+ # Tmux capture-pane equivalent of the polling path used elsewhere in
+ # this file. Build it once and reuse on each tick. Skip the watchdog
+ # entirely on native-Windows local runs (no tmux). The Windows
+ # detached-process path writes its log to a known file and has its
+ # own lifecycle tracking; punting here keeps the code simple.
+ local_win = is_windows and not remote
+ if local_win:
+ return
+ if remote:
+ ssh_args = ["ssh"]
+ if ssh_port and ssh_port != "22":
+ ssh_args.extend(["-p", str(ssh_port)])
+ capture_cmd = ssh_args + [remote, "tmux", "capture-pane", "-t", session_id, "-p", "-S", "-200"]
+ else:
+ capture_cmd = ["tmux", "capture-pane", "-t", session_id, "-p", "-S", "-200"]
+
+ _exit_re = re.compile(r"=== Process exited with code (-?\d+) ===")
+ for wait_s in _waits:
+ await asyncio.sleep(wait_s)
+ try:
+ proc = await asyncio.create_subprocess_exec(
+ *capture_cmd,
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.DEVNULL,
+ )
+ stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=8)
+ output = stdout.decode("utf-8", errors="replace")
+ except Exception as e:
+ logger.debug(f"crash-watchdog: capture-pane failed (will retry): {e!r}")
+ continue
+ # Last occurrence wins — a serve that exits/restarts under the
+ # runner's "exec bash -i" trail will emit multiple markers; the
+ # most-recent code is the one that matters.
+ matches = list(_exit_re.finditer(output))
+ if not matches:
+ continue
+ try:
+ exit_code = int(matches[-1].group(1))
+ except (ValueError, IndexError):
+ continue
+ if exit_code == 0:
+ # Exit 0 on a long-running serve is unusual (a normal "loaded
+ # then ready" path keeps the process alive) but it happens for
+ # commands like "ollama pull" the user might launch through
+ # the same form. Don't drop the endpoint on a clean exit;
+ # let the probe layer mark it offline if nothing's listening.
+ logger.info(f"crash-watchdog: serve {session_id} exited cleanly (0); leaving endpoint {endpoint_id}")
+ return
+ # Non-zero exit — drop the endpoint.
+ try:
+ from core.database import SessionLocal as _SL, ModelEndpoint as _ME
+ db = _SL()
+ try:
+ ep = db.query(_ME).filter(_ME.id == endpoint_id).first()
+ if ep:
+ logger.info(
+ f"crash-watchdog: dropping endpoint {endpoint_id} "
+ f"({ep.name} @ {ep.base_url}) — serve exited {exit_code}"
+ )
+ db.delete(ep)
+ db.commit()
+ finally:
+ db.close()
+ except Exception as e:
+ logger.warning(f"crash-watchdog: endpoint cleanup failed: {e!r}")
+ return
+ logger.debug(f"crash-watchdog: no exit marker for {session_id} within window; leaving endpoint {endpoint_id}")
+
def _auto_register_llm_endpoint(req: ServeRequest, remote: str | None) -> str | None:
"""Register a freshly-served LLM as a model endpoint so it appears in the
model picker without a manual /setup step — the text-model sibling of
@@ -763,6 +1030,10 @@ def setup_cookbook_routes() -> APIRouter:
probing /v1/models and dims the endpoint until the server is reachable,
so registering immediately (before the server finishes loading) is safe.
"""
+ logger.info(
+ f"_auto_register_llm_endpoint: ENTRY repo_id={req.repo_id!r} "
+ f"remote={remote!r} cmd_prefix={req.cmd[:80]!r}"
+ )
import re
from core.database import SessionLocal, ModelEndpoint
@@ -787,16 +1058,20 @@ def setup_cookbook_routes() -> APIRouter:
else:
port = 8080 # llama.cpp's llama-server default — the Apple Silicon path
- # Determine host (mirrors the image path: SSH alias for remote serves).
- # For local serves while Odysseus runs inside Docker, "localhost"
- # resolves to the container itself — useless. Use host.docker.internal
- # which compose maps to the actual host, matching what /setup adds
- # for Ollama by hand.
+ # Determine host. The cookbook tmux for `local=true` serves runs INSIDE
+ # the odysseus container — so the right URL for the in-container
+ # backend to reach it is `localhost`, NOT `host.docker.internal`
+ # (the latter points at the docker HOST, which doesn't have a server
+ # on that port). The previous host.docker.internal fallback only made
+ # sense for /setup-added external services like systemd Ollama on the
+ # host — and those go through manual setup, not this auto-register
+ # code path. For remote serves we still use the SSH host alias.
if remote:
host = remote.split("@")[-1] if "@" in remote else remote
+ elif re.search(r"\bdocker\s+exec\s+(?:ollama-rocm|ollama-test)\b", req.cmd or ""):
+ host = "host.docker.internal"
else:
- from routes.model_routes import _docker_host_gateway_reachable
- host = "host.docker.internal" if _docker_host_gateway_reachable() else "localhost"
+ host = "localhost"
base_url = f"http://{host}:{port}/v1"
@@ -805,7 +1080,9 @@ def setup_cookbook_routes() -> APIRouter:
# If the serve command opts models into OpenAI tool-calling, record it so
# agent_loop trusts emitted tool_calls instead of the name heuristic.
+ is_ollama_endpoint = "ollama" in (req.cmd or "").lower()
supports_tools = True if "--enable-auto-tool-choice" in req.cmd else None
+ pinned_models = [req.repo_id] if is_ollama_endpoint and req.repo_id else []
db = SessionLocal()
try:
@@ -815,14 +1092,43 @@ def setup_cookbook_routes() -> APIRouter:
existing.is_enabled = True
existing.model_type = "llm"
existing.name = display_name
+ if is_ollama_endpoint:
+ existing.endpoint_kind = "ollama"
+ if pinned_models:
+ existing.cached_models = json.dumps(pinned_models)
+ existing.pinned_models = json.dumps(pinned_models)
if supports_tools is not None:
existing.supports_tools = supports_tools
- # Wipe stale model lists so the picker re-probes and discovers
- # the newly-served model instead of showing the old one.
- existing.cached_models = None
- existing.hidden_models = None
db.commit()
logger.info(f"Updated existing local model endpoint: {base_url}")
+ # Re-probe so cached_models matches what the server actually
+ # serves right now (the URL may have stayed the same but the
+ # model behind it changed across launches).
+ try:
+ from routes.model_routes import _probe_endpoint
+ import json as _json2
+ probed = _probe_endpoint(base_url, existing.api_key, timeout=5)
+ if probed:
+ existing.cached_models = _json2.dumps(probed)
+ db.commit()
+ except Exception as _pe:
+ logger.warning(f"Re-probe failed for {base_url}: {_pe!r}")
+ # Sweep stale dupes: other endpoints with the same display name
+ # at DIFFERENT URLs (likely failed earlier-attempt ports) get
+ # deleted so the picker doesn't show an offline ghost next to
+ # the working one. Only sweeps endpoints whose id starts with
+ # `local-` so we never touch a user's hand-added DeepSeek/OpenAI/
+ # etc. entry with a coincidentally matching name.
+ stale = (db.query(ModelEndpoint)
+ .filter(ModelEndpoint.name == display_name)
+ .filter(ModelEndpoint.base_url != base_url)
+ .filter(ModelEndpoint.id.like("local-%"))
+ .all())
+ for s in stale:
+ logger.info(f"Sweeping stale local endpoint {s.id} ({s.base_url})")
+ db.delete(s)
+ if stale:
+ db.commit()
return existing.id
ep_id = f"local-{uuid.uuid4().hex[:8]}"
@@ -833,11 +1139,42 @@ def setup_cookbook_routes() -> APIRouter:
api_key=None,
is_enabled=True,
model_type="llm",
+ endpoint_kind="ollama" if is_ollama_endpoint else "auto",
+ cached_models=json.dumps(pinned_models) if pinned_models else None,
+ pinned_models=json.dumps(pinned_models) if pinned_models else None,
supports_tools=supports_tools,
)
db.add(ep)
db.commit()
logger.info(f"Auto-registered local model endpoint: {display_name} @ {base_url}")
+ # Same sweep on first-register path: drop any pre-existing local-*
+ # endpoints with this display name pointed elsewhere.
+ stale = (db.query(ModelEndpoint)
+ .filter(ModelEndpoint.name == display_name)
+ .filter(ModelEndpoint.id != ep_id)
+ .filter(ModelEndpoint.id.like("local-%"))
+ .all())
+ for s in stale:
+ logger.info(f"Sweeping stale local endpoint {s.id} ({s.base_url})")
+ db.delete(s)
+ if stale:
+ db.commit()
+ # Probe /v1/models NOW and write cached_models so the chat
+ # picker actually shows the model on the next /api/models
+ # call. Without this immediate probe, the endpoint has empty
+ # cached_models until the next background refresh fires (up
+ # to a minute later) and the picker shows nothing — even
+ # though the endpoint is in the DB and the server is up.
+ try:
+ from routes.model_routes import _probe_endpoint
+ import json as _json2
+ probed = _probe_endpoint(base_url, None, timeout=5)
+ if probed:
+ ep.cached_models = _json2.dumps(probed)
+ db.commit()
+ logger.info(f"Auto-register: probed {len(probed)} models @ {base_url}")
+ except Exception as _pe:
+ logger.warning(f"Auto-register: probe-after-create failed for {base_url}: {_pe!r}")
return ep_id
except Exception as e:
logger.error(f"Failed to auto-register local model endpoint: {e}")
@@ -877,27 +1214,11 @@ def setup_cookbook_routes() -> APIRouter:
in_venv=sys.prefix != sys.base_prefix,
)
is_pip_install = bool(req.cmd and "pip install" in req.cmd)
- remote = req.remote_host
- is_windows = req.platform == "windows"
- local_windows = IS_WINDOWS and not remote
- if is_windows or local_windows:
- if req.cmd.startswith("python3 "):
- req.cmd = "python " + req.cmd[len("python3 "):]
- if is_pip_install and ("llama-cpp-python" in req.cmd or "llama_cpp" in req.cmd) and (is_windows or local_windows):
- if "--extra-index-url" not in req.cmd:
- req.cmd += " --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu"
-
if is_pip_install:
# Keep big dependency wheel builds (vLLM, …) off the home filesystem's
# pip cache so they don't fail mid-build with "No space left" (#1219)
# and leave the dep installed-but-unusable (#1459).
req.cmd = _pip_install_no_cache(req.cmd)
- # Accept common aliases and enforce server extras for llama-cpp so
- # `python -m llama_cpp.server` has all runtime dependencies.
- req.cmd = re.sub(r"(?=!~,` for version specifiers.
# v2 review HIGH-14: tightened from the previous regex which
@@ -920,7 +1241,12 @@ def setup_cookbook_routes() -> APIRouter:
# Otherwise the runner script picks one at runtime and `_auto_register`
# below still registers the stale 11434 default — which on a host with
# a systemd ollama lands on the wrong (unreachable-from-docker) service.
- if "ollama" in req.cmd and "OLLAMA_HOST=" not in req.cmd:
+ # Match "ollama serve" as a phrase (with optional flags after), not
+ # any substring containing "ollama" — otherwise commands like
+ # `docker exec ollama-test ollama-import …` get wrapped as if they
+ # were native `ollama serve`, prepending OLLAMA_HOST=… and then
+ # running the ollama-not-found preflight which exits 127.
+ if re.search(r"\bollama\s+serve\b", req.cmd) and "OLLAMA_HOST=" not in req.cmd:
_ollama_bind_host = "0.0.0.0" if remote else "127.0.0.1"
_ollama_chosen_port = _pick_free_port_for_ollama(
remote, req.ssh_port, start_port=11434, max_offset=10,
@@ -950,8 +1276,6 @@ def setup_cookbook_routes() -> APIRouter:
ps_lines = []
ps_lines.append('$sessionDir = "$env:TEMP\\odysseus-sessions"')
ps_lines.append('New-Item -ItemType Directory -Force -Path $sessionDir | Out-Null')
- ps_lines.append('$env:PYTHONIOENCODING = "utf-8"')
- ps_lines.append('$env:PYTHONUTF8 = "1"')
if req.hf_token:
ps_lines.append(f"$env:HF_TOKEN = '{_ps_squote(req.hf_token)}'")
if req.gpus:
@@ -970,7 +1294,7 @@ def setup_cookbook_routes() -> APIRouter:
ps_lines.append('try { python -c "import llama_cpp" 2>$null } catch {}')
ps_lines.append('if ($LASTEXITCODE -ne 0) {')
ps_lines.append(' Write-Host "Installing llama-cpp-python..."')
- ps_lines.append(' python -m pip install llama-cpp-python[server] --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu')
+ ps_lines.append(' python -m pip install llama-cpp-python[server]')
ps_lines.append('}')
elif "vllm" in req.cmd:
ps_lines.append('Write-Host "ERROR: vLLM is not supported on Windows. Use Ollama or llama.cpp instead."')
@@ -1045,58 +1369,46 @@ def setup_cookbook_routes() -> APIRouter:
# ollama is found (otherwise macOS falls back to a slow source build).
# /opt/homebrew = Apple Silicon, /usr/local = Intel; harmless on Linux.
runner_lines.append('export PATH="$HOME/.local/bin:$HOME/bin:$HOME/llama.cpp/build/bin:/opt/homebrew/bin:/usr/local/bin:$PATH"')
- if local_windows:
- # LOCAL Windows: no native source compilation (no cmake/compiler on Git Bash).
- # Just check python bindings (using native `python` binary) and fall back to pip install.
- runner_lines.append('if ! command -v llama-server &>/dev/null && ! python -c "import llama_cpp" 2>/dev/null; then')
- runner_lines.append(' echo "llama-server not found — installing Python bindings..."')
- runner_lines.append(f" {_pip_install_fallback_chain('llama-cpp-python[server]', python_cmd='python')} || true")
- runner_lines.append('fi')
- runner_lines.append('if ! command -v llama-server &>/dev/null && ! python -c "import llama_cpp" 2>/dev/null; then')
- runner_lines.append(' echo "ERROR: llama.cpp serving is not available after install attempts."')
- runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
- runner_lines.append('fi')
- else:
- runner_lines.append('if [ -d /data/data/com.termux ]; then')
- runner_lines.append(' # Termux: no native build — use the Python bindings (CPU).')
- runner_lines.append(' if ! python3 -c "import llama_cpp" 2>/dev/null; then')
- runner_lines.append(' pkg install -y cmake 2>/dev/null')
- runner_lines.append(' pip install numpy diskcache jinja2 2>/dev/null')
- runner_lines.append(' CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_LLAMAFILE=OFF" pip install \'llama-cpp-python[server]\' --no-build-isolation --no-cache-dir 2>&1 || true')
- runner_lines.append(' fi')
- runner_lines.append('elif ! command -v llama-server &>/dev/null; then')
- runner_lines.append(' echo "Native llama-server not found — building from source (one-time, may take a few minutes)..."')
- runner_lines.append(' mkdir -p ~/bin')
- runner_lines.append(' cd ~ && [ -d llama.cpp ] || git clone --depth 1 https://github.com/ggml-org/llama.cpp')
- # Build with the right accelerator: Metal on macOS (llama.cpp
- # enables it automatically, no flag), CUDA on Linux when present,
- # else a plain CPU build. nproc is Linux-only — fall back to
- # `sysctl hw.ncpu` on macOS. (Tip: `brew install llama.cpp` ships
- # a prebuilt llama-server and skips this whole source build.)
- runner_lines.append(' NPROC="$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)"')
- runner_lines.append(' if [ "$(uname -s)" = "Darwin" ]; then')
- runner_lines.append(' command -v cmake >/dev/null 2>&1 || echo "WARNING: cmake not found — install it with: brew install cmake (or: brew install llama.cpp for a prebuilt llama-server)."')
- # Start from a clean cache: a prior failed configure (e.g. a CUDA
- # attempt) poisons build/CMakeCache.txt, so a plain `cmake -B build`
- # would reuse the bad settings and fail again. CMAKE_BUILD_TYPE is
- # explicit so the binary is optimized (Metal auto-enables on macOS).
- runner_lines.append(' cd ~/llama.cpp && rm -rf build && cmake -B build -DCMAKE_BUILD_TYPE=Release \\')
- runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
- runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
- runner_lines.append(' else')
- _append_llama_cpp_linux_accel_build_lines(runner_lines)
- runner_lines.append(' fi')
- # If the native build failed, fall back to the Python bindings.
- runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
- runner_lines.append(' echo "llama-server build failed — installing Python bindings as fallback..."')
- runner_lines.append(f" {_pip_install_fallback_chain('llama-cpp-python[server]', python_cmd='pip')} || true")
- runner_lines.append(' fi')
- runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
- runner_lines.append(' echo "ERROR: llama.cpp serving is not available after install/build attempts."')
- runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
- runner_lines.append(' fi')
- runner_lines.append('fi')
- elif "ollama" in req.cmd:
+ runner_lines.append('if [ -d /data/data/com.termux ]; then')
+ runner_lines.append(' # Termux: no native build — use the Python bindings (CPU).')
+ runner_lines.append(' if ! python3 -c "import llama_cpp" 2>/dev/null; then')
+ runner_lines.append(' pkg install -y cmake 2>/dev/null')
+ runner_lines.append(' pip install numpy diskcache jinja2 2>/dev/null')
+ runner_lines.append(' CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_LLAMAFILE=OFF" pip install \'llama-cpp-python[server]\' --no-build-isolation --no-cache-dir 2>&1 || true')
+ runner_lines.append(' fi')
+ runner_lines.append('elif ! command -v llama-server &>/dev/null; then')
+ runner_lines.append(' echo "Native llama-server not found — building from source (one-time, may take a few minutes)..."')
+ runner_lines.append(' mkdir -p ~/bin')
+ runner_lines.append(' cd ~ && [ -d llama.cpp ] || git clone --depth 1 https://github.com/ggml-org/llama.cpp')
+ # Build with the right accelerator: Metal on macOS (llama.cpp
+ # enables it automatically, no flag), CUDA on Linux when present,
+ # else a plain CPU build. nproc is Linux-only — fall back to
+ # `sysctl hw.ncpu` on macOS. (Tip: `brew install llama.cpp` ships
+ # a prebuilt llama-server and skips this whole source build.)
+ runner_lines.append(' NPROC="$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)"')
+ runner_lines.append(' if [ "$(uname -s)" = "Darwin" ]; then')
+ runner_lines.append(' command -v cmake >/dev/null 2>&1 || echo "WARNING: cmake not found — install it with: brew install cmake (or: brew install llama.cpp for a prebuilt llama-server)."')
+ # Start from a clean cache: a prior failed configure (e.g. a CUDA
+ # attempt) poisons build/CMakeCache.txt, so a plain `cmake -B build`
+ # would reuse the bad settings and fail again. CMAKE_BUILD_TYPE is
+ # explicit so the binary is optimized (Metal auto-enables on macOS).
+ runner_lines.append(' cd ~/llama.cpp && rm -rf build && cmake -B build -DCMAKE_BUILD_TYPE=Release \\')
+ runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
+ runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
+ runner_lines.append(' else')
+ _append_llama_cpp_linux_accel_build_lines(runner_lines)
+ runner_lines.append(' fi')
+ runner_lines.append(' # If the native build failed, fall back to the Python bindings.')
+ runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
+ runner_lines.append(' echo "llama-server build failed — installing Python bindings as fallback..."')
+ runner_lines.append(f" {_pip_install_fallback_chain('llama-cpp-python[server]', python_cmd='pip')} || true")
+ runner_lines.append(' fi')
+ runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
+ runner_lines.append(' echo "ERROR: llama.cpp serving is not available after install/build attempts."')
+ runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
+ runner_lines.append(' fi')
+ runner_lines.append('fi')
+ elif re.search(r"\bollama\s+serve\b", req.cmd):
handled_ollama_serve = True
_ollama_default_host = "0.0.0.0" if remote else "127.0.0.1"
_ollama_host, _ollama_port = _ollama_bind_from_cmd(
@@ -1117,23 +1429,13 @@ def setup_cookbook_routes() -> APIRouter:
runner_lines.append(' ODYSSEUS_OLLAMA_PORT="$_ody_try_port"')
runner_lines.append(' break')
runner_lines.append(' fi')
- runner_lines.append(' echo "[odysseus] Ollama API ready on port ${ODYSSEUS_OLLAMA_PORT}: ${ODYSSEUS_OLLAMA_URL}"')
- runner_lines.append(' echo "[odysseus] This task is monitoring an existing Ollama server; stopping it here will not stop an external Docker/system service."')
- if local_windows:
- # Windows detached process has no TTY; exec bash -i crashes.
- # Keep the monitoring task alive with a sleep loop.
- runner_lines.append(' while true; do sleep 60; done')
- else:
- runner_lines.append(' exec bash -i')
- runner_lines.append('fi')
+ runner_lines.append(' exec 3<&-; exec 3>&-')
+ runner_lines.append('done')
runner_lines.append('if ! command -v ollama &>/dev/null; then')
runner_lines.append(' echo "ERROR: Ollama not found on this server. Install it from https://ollama.com/download or `curl -fsSL https://ollama.com/install.sh | sh`."')
runner_lines.append(' echo')
runner_lines.append(' echo "=== Process exited with code 127 ==="')
- if local_windows:
- runner_lines.append(' exit 127')
- else:
- runner_lines.append(' exec bash -i')
+ runner_lines.append(' exec bash -i')
runner_lines.append('fi')
runner_lines.append('ODYSSEUS_OLLAMA_URL="http://${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}"')
if remote and _ollama_host in ("0.0.0.0", "::"):
@@ -1141,20 +1443,24 @@ def setup_cookbook_routes() -> APIRouter:
runner_lines.append('echo "[odysseus] Ollama has no built-in authentication; expose this only on a trusted LAN/VPN or provide an explicit OLLAMA_HOST with your own access controls."')
runner_lines.append('echo "Starting ollama server on ${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}..."')
runner_lines.append('OLLAMA_HOST="${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}" ollama serve')
- if local_windows:
- _append_serve_exit_code_lines(runner_lines, keep_shell_open=False)
- else:
- runner_lines.append('_ody_exit=$?')
- runner_lines.append('echo')
- runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="')
- runner_lines.append('exec bash -i')
+ runner_lines.append('_ody_exit=$?')
+ runner_lines.append('echo')
+ runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="')
+ runner_lines.append('exec bash -i')
elif "vllm serve" in req.cmd:
# vLLM is CUDA/ROCm-only and does not run on macOS at all.
runner_lines.append('if [ "$(uname -s)" = "Darwin" ]; then')
runner_lines.append(' echo "ERROR: vLLM does not run on macOS. Use Ollama or llama.cpp (Metal) instead."')
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=1')
runner_lines.append('fi')
- _append_vllm_linux_preflight_lines(runner_lines)
+ # Put ~/.local/bin on PATH first — without a venv, vllm installs
+ # there via --user and the non-login serve shell otherwise can't
+ # find the `vllm` CLI ("command not found"). Mirrors llama.cpp above.
+ runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
+ runner_lines.append('if ! command -v vllm &>/dev/null; then')
+ runner_lines.append(' echo "ERROR: vLLM is not installed."')
+ runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
+ runner_lines.append('fi')
elif "sglang.launch_server" in req.cmd:
runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
runner_lines.append('if ! command -v sglang &>/dev/null; then')
@@ -1173,15 +1479,30 @@ def setup_cookbook_routes() -> APIRouter:
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
runner_lines.append('fi')
- if not handled_ollama_serve:
+ handled_ollama_sidecar_probe = False
+ if (not handled_ollama_serve
+ and re.search(r"\bdocker\s+exec\s+(?:ollama-rocm|ollama-test)\s+ollama\s+show\b", req.cmd or "")):
+ handled_ollama_sidecar_probe = True
_append_serve_preflight_exit_lines(
runner_lines,
keep_shell_open=not local_windows,
)
- if is_pip_install:
- _append_pip_install_runner_lines(runner_lines, req.cmd)
- else:
- runner_lines.append(req.cmd)
+ runner_lines.append(req.cmd)
+ runner_lines.append('_ody_exit=$?')
+ runner_lines.append('echo')
+ runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="')
+ runner_lines.append('if [ "$_ody_exit" -eq 0 ]; then')
+ runner_lines.append(' echo "[odysseus] Ollama sidecar model is available; keeping Cookbook task attached to the persistent Ollama daemon."')
+ runner_lines.append(' while true; do sleep 3600; done')
+ runner_lines.append('fi')
+ runner_lines.append('exec bash -i')
+
+ if not handled_ollama_serve and not handled_ollama_sidecar_probe:
+ _append_serve_preflight_exit_lines(
+ runner_lines,
+ keep_shell_open=not local_windows,
+ )
+ runner_lines.append(req.cmd)
if local_windows:
# Detached background process — no interactive shell to keep open.
# Print the exit marker the status poller looks for, then stop.
@@ -1263,6 +1584,26 @@ def setup_cookbook_routes() -> APIRouter:
elif not is_pip_install:
endpoint_id = _auto_register_llm_endpoint(req, remote)
+ # Crash watchdog: the auto-register above writes the endpoint row
+ # IMMEDIATELY (before the server has even bound its port) so the
+ # picker shows the model as it warms up. When the serve process
+ # crashes right at startup (missing module, bad cmd, port collision,
+ # ModuleNotFoundError on llama_cpp, etc.), the endpoint is left
+ # dangling — every subsequent chat returns 503 or an empty response.
+ # Schedule a background task to read the tmux output for the
+ # "=== Process exited with code N ===" marker the runner emits;
+ # if N != 0 within the watch window, delete the endpoint we just
+ # created. Skipped for diffusion (different image-endpoint cleanup
+ # path) and pip-install tasks (no endpoint to drop).
+ if endpoint_id and not is_diffusion and not is_pip_install:
+ asyncio.create_task(_serve_crash_watchdog(
+ endpoint_id=endpoint_id,
+ session_id=session_id,
+ remote=remote,
+ ssh_port=req.ssh_port,
+ is_windows=is_windows,
+ ))
+
# Log to assistant
try:
from src.assistant_log import log_to_assistant
@@ -1342,8 +1683,8 @@ def setup_cookbook_routes() -> APIRouter:
cmd = f"ssh {pf}{host} '{setup_script}'"
else:
# Linux: auto-install tmux (via whichever package manager is available)
- # and huggingface_hub + hf_transfer (falling back to --user, then
- # guarded --break-system-packages on PEP-668 locked distros).
+ # and huggingface_hub + hf_transfer (falling back to --user/--break-system-packages
+ # on PEP-668 locked distros like Arch / newer Debian).
setup_script = (
# Install tmux if missing — try common package managers; skip if no sudo
"if ! command -v tmux >/dev/null 2>&1; then "
@@ -1355,15 +1696,10 @@ def setup_cookbook_routes() -> APIRouter:
" fi; "
"fi; "
"command -v tmux >/dev/null 2>&1 || echo 'WARNING: tmux missing and auto-install failed (need passwordless sudo). Install manually.'; "
- # Install Python bits. Try system install first; fall back to --user,
- # then use --break-system-packages only when pip supports it.
+ # Install Python bits. Try system install first; fall back to --user --break-system-packages on PEP 668 systems.
"pip install -q huggingface_hub hf_transfer 2>/dev/null || "
- "pip install --user -q huggingface_hub hf_transfer 2>/dev/null || "
- "( pip install --help 2>/dev/null | grep -q -- --break-system-packages && "
- "pip install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null ) || "
- "pip3 install --user -q huggingface_hub hf_transfer 2>/dev/null || "
- "( pip3 install --help 2>/dev/null | grep -q -- --break-system-packages && "
- "pip3 install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null ); "
+ "pip install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null || "
+ "pip3 install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null; "
"python3 -c 'from huggingface_hub import snapshot_download; print(\"OK\")'"
)
cmd = f"ssh {pf}{host} '{setup_script}'"
@@ -1386,38 +1722,11 @@ def setup_cookbook_routes() -> APIRouter:
async def _run_nvidia_smi(query: str, host: str | None, ssh_port: str | None, timeout: int = 8):
"""Run nvidia-smi locally or over SSH. Returns (stdout, error_or_None)."""
if host:
- candidates = [query]
- stripped = query.strip()
- if stripped.startswith("nvidia-smi "):
- args = stripped[len("nvidia-smi "):]
- candidates.append(
- "bash -lc "
- + shlex.quote(
- f"{SSH_PATH_OVERRIDE}"
- f"nvidia-smi {args}"
- )
- )
- for nvidia_path in NVIDIA_PATH_CANDIDATES:
- candidates.append(f"{nvidia_path} {args}")
-
- last_err = "nvidia-smi failed"
- for candidate in candidates:
- try:
- rc, stdout, stderr = await run_ssh_command_async(
- host,
- ssh_port,
- candidate,
- connect_timeout=5,
- timeout=timeout,
- )
- except asyncio.TimeoutError:
- return None, "nvidia-smi timed out"
- if rc == 0:
- return stdout.decode("utf-8", errors="replace"), None
- err = (stderr.decode("utf-8", errors="replace") or "").strip()[:200]
- if err:
- last_err = err
- return None, last_err
+ pf = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else ""
+ cmd = f"ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no {pf}{host} '{query}'"
+ proc = await asyncio.create_subprocess_shell(
+ cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+ )
else:
proc = await asyncio.create_subprocess_exec(
*shlex.split(query),
@@ -1996,30 +2305,58 @@ def setup_cookbook_routes() -> APIRouter:
return {"models": out}
- # Rate-limit for the orphan-tmux adoption sweep. The UI polls
- # tasks/status every ~3s; we don't want to SSH every host on every
- # poll. 20s is fast enough that a model the agent launched in the
- # background shows up "almost immediately" in the UI without being
- # wasteful.
+ # Rate-limit for the orphan-tmux adoption sweep. 60s interval so SSH
+ # work is genuinely sparse even on an actively-polled cookbook page.
_last_orphan_sweep_ts = [0.0]
- _ORPHAN_SWEEP_MIN_INTERVAL_S = 20.0
+ _ORPHAN_SWEEP_MIN_INTERVAL_S = 60.0
+ # Concurrency guard so two requests racing don't both spawn a sweep.
+ _orphan_sweep_inflight = [False]
def _maybe_sweep_orphans(tasks: list, state: dict) -> None:
"""Scan each configured cookbook server for `serve-*` tmux sessions
the cookbook doesn't know about and adopt them into state.tasks.
- Writes are conditional: if no orphans are found, nothing is touched.
- Rate-limited so polling UIs don't trigger SSH on every refresh.
+ Heavy SSH work runs in a background thread via asyncio.to_thread so
+ it never blocks the request that triggered it. Was previously
+ disabled because the sync implementation pegged uvicorn CPU during
+ active cookbook polling — re-enabled now with the work pushed off
+ the event loop and a slower (60s) cadence.
"""
import time as _time
- import subprocess
- logger.info(f"_maybe_sweep_orphans: entered, last_ts={_last_orphan_sweep_ts[0]}")
now = _time.monotonic()
+ if _orphan_sweep_inflight[0]:
+ return
if now - _last_orphan_sweep_ts[0] < _ORPHAN_SWEEP_MIN_INTERVAL_S:
- logger.info(f"_maybe_sweep_orphans: rate-limited, {now - _last_orphan_sweep_ts[0]:.1f}s since last")
return
_last_orphan_sweep_ts[0] = now
+ _orphan_sweep_inflight[0] = True
+ # Snapshot inputs so the worker doesn't race with state mutations.
+ try:
+ tasks_snap = list(tasks or [])
+ except Exception:
+ tasks_snap = []
+ state_snap = state if isinstance(state, dict) else {}
+ # Caller is _cookbook_tasks_status_sync (sync context, no event
+ # loop). Use a plain background thread — no asyncio needed.
+ import threading
+ def _run_sweep() -> None:
+ try:
+ _sync_sweep_orphans(tasks_snap, state_snap)
+ except Exception as _e:
+ logger.warning(f"orphan sweep thread failed: {_e!r}")
+ finally:
+ _orphan_sweep_inflight[0] = False
+ try:
+ threading.Thread(target=_run_sweep, daemon=True, name="orphan-sweep").start()
+ except Exception as _e:
+ logger.warning(f"orphan sweep thread spawn failed: {_e!r}")
+ _orphan_sweep_inflight[0] = False
+ return
+
+ def _sync_sweep_orphans(tasks: list, state: dict) -> None:
+ """The actual sync sweep — never call this on the event loop."""
+ import subprocess
env = state.get("env") if isinstance(state, dict) else {}
servers = env.get("servers") if isinstance(env, dict) else []
logger.info(f"orphan sweep starting: {len(servers) if isinstance(servers, list) else 0} server(s), known_sids={len([t for t in tasks if isinstance(t, dict) and t.get('sessionId')])}")
@@ -2143,6 +2480,121 @@ def setup_cookbook_routes() -> APIRouter:
except Exception as e:
logger.warning(f"orphan sweep: state write failed: {e}")
+ # In-memory cache for the Ollama library scrape. ollama.com is a public
+ # site, but it doesn't expose a stable JSON listing — we fetch the HTML
+ # search page and regex out the model cards. Cached for 1 h so a busy
+ # cookbook view doesn't hammer the site on every render.
+ _ollama_library_cache: dict = {"models": [], "fetched_at": 0.0, "error": None}
+
+ _OLLAMA_FALLBACK_LIBRARY = [
+ {"name": "qwen2.5", "description": "Qwen2.5 series — strong general/coding model from Alibaba.", "sizes": ["0.5b", "1.5b", "3b", "7b", "14b", "32b", "72b"]},
+ {"name": "qwen2.5-coder", "description": "Code-specialized Qwen2.5 family.", "sizes": ["0.5b", "1.5b", "3b", "7b", "14b", "32b"]},
+ {"name": "qwen3", "description": "Qwen3 — newer Alibaba family with hybrid reasoning.", "sizes": ["0.6b", "1.7b", "4b", "8b", "14b", "32b"]},
+ {"name": "llama3.2", "description": "Meta Llama 3.2 instruct (and tiny / vision variants).", "sizes": ["1b", "3b", "11b", "90b"]},
+ {"name": "llama3.1", "description": "Meta Llama 3.1 instruct.", "sizes": ["8b", "70b", "405b"]},
+ {"name": "llama3.3", "description": "Meta Llama 3.3 70B instruct.", "sizes": ["70b"]},
+ {"name": "gemma3", "description": "Google Gemma 3 — multimodal capable open-weights.", "sizes": ["1b", "4b", "12b", "27b"]},
+ {"name": "gemma2", "description": "Google Gemma 2 instruct.", "sizes": ["2b", "9b", "27b"]},
+ {"name": "mistral", "description": "Mistral 7B instruct — small, fast generalist.", "sizes": ["7b"]},
+ {"name": "mistral-nemo", "description": "Mistral NeMo 12B instruct.", "sizes": ["12b"]},
+ {"name": "mistral-small", "description": "Mistral Small 22B / 24B instruct.", "sizes": ["22b", "24b"]},
+ {"name": "mixtral", "description": "Mistral MoE 8x7B / 8x22B.", "sizes": ["8x7b", "8x22b"]},
+ {"name": "phi3", "description": "Microsoft Phi-3 small / medium.", "sizes": ["mini", "medium"]},
+ {"name": "phi4", "description": "Microsoft Phi-4 14B.", "sizes": ["14b"]},
+ {"name": "deepseek-r1", "description": "DeepSeek R1 reasoning model (distilled variants).", "sizes": ["1.5b", "7b", "8b", "14b", "32b", "70b"]},
+ {"name": "deepseek-v3", "description": "DeepSeek V3 MoE 671B (huge — needs serious VRAM).", "sizes": ["671b"]},
+ {"name": "codellama", "description": "Meta Code Llama instruct family.", "sizes": ["7b", "13b", "34b", "70b"]},
+ {"name": "starcoder2", "description": "BigCode StarCoder2 — code completion.", "sizes": ["3b", "7b", "15b"]},
+ {"name": "deepseek-coder-v2", "description": "DeepSeek Coder V2 — code MoE.", "sizes": ["16b", "236b"]},
+ {"name": "nomic-embed-text", "description": "Embedding model — text vector encoder.", "sizes": ["latest"]},
+ {"name": "mxbai-embed-large", "description": "Embedding model — Mixedbread large.", "sizes": ["latest"]},
+ {"name": "llava", "description": "LLaVA multimodal vision-language model.", "sizes": ["7b", "13b", "34b"]},
+ {"name": "minicpm-v", "description": "MiniCPM-V multimodal.", "sizes": ["8b"]},
+ {"name": "command-r", "description": "Cohere Command R — RAG-oriented.", "sizes": ["35b"]},
+ {"name": "command-r-plus", "description": "Cohere Command R+ — larger RAG model.", "sizes": ["104b"]},
+ {"name": "qwq", "description": "Qwen QwQ reasoning preview.", "sizes": ["32b"]},
+ {"name": "smollm2", "description": "HuggingFaceTB SmolLM2 — tiny capable models.", "sizes": ["135m", "360m", "1.7b"]},
+ {"name": "granite3.1-dense", "description": "IBM Granite 3.1 dense instruct.", "sizes": ["2b", "8b"]},
+ {"name": "nemotron", "description": "NVIDIA Nemotron 70B.", "sizes": ["70b"]},
+ {"name": "olmo2", "description": "AI2 OLMo 2 open-weights.", "sizes": ["7b", "13b"]},
+ ]
+
+ @router.get("/api/cookbook/ollama/library")
+ async def ollama_library(refresh: int = 0, request: Request = None, owner: str = Depends(require_user)):
+ """List popular Ollama library models for the Browse picker.
+
+ Tries a 1-hour-cached fetch of ollama.com/library, falls back to a
+ curated hard-coded list so the picker always renders something."""
+ import time as _time
+ import httpx as _httpx
+ TTL = 3600.0
+ now = _time.time()
+ if refresh or (now - _ollama_library_cache["fetched_at"]) > TTL or not _ollama_library_cache["models"]:
+ models: list[dict] = []
+ err = None
+ try:
+ async with _httpx.AsyncClient(timeout=8, follow_redirects=True) as client:
+ resp = await client.get(
+ "https://ollama.com/search?sort=popular",
+ headers={"User-Agent": "odysseus-cookbook/1.0"},
+ )
+ if resp.status_code == 200:
+ html = resp.text
+ # ollama.com renders each model card as a single anchor:
+ # …
+ # The description + sizes live inside that anchor. Pull
+ # the whole block then extract pieces individually.
+ block_re = re.compile(
+ r']*href="/library/([A-Za-z0-9._-]+)"[^>]*>(.*?)',
+ re.DOTALL,
+ )
+ desc_re = re.compile(r'
]*>([^<]{4,400})
', re.DOTALL)
+ # Size tags on ollama.com cards look like "0.5b", "14b",
+ # "8x7b", "27b". Pulled from short -wrapped chips.
+ size_re = re.compile(r'>\s*(\d+(?:\.\d+)?(?:x\d+)?[bBmM])\s*<')
+ seen: set[str] = set()
+ for bm in block_re.finditer(html):
+ name = bm.group(1).strip()
+ if name in seen:
+ continue
+ seen.add(name)
+ body = bm.group(2)
+ dm = desc_re.search(body)
+ desc = (dm.group(1).strip() if dm else "").replace("\n", " ")
+ sizes_raw = size_re.findall(body)
+ # Dedup sizes preserving order
+ sizes: list[str] = []
+ for s in sizes_raw:
+ s_low = s.lower()
+ if s_low not in sizes:
+ sizes.append(s_low)
+ models.append({"name": name, "description": desc, "sizes": sizes})
+ if len(models) >= 80:
+ break
+ else:
+ err = f"HTTP {resp.status_code}"
+ except Exception as e:
+ err = str(e)[:160]
+ # Merge curated fallback so classics (qwen2.5, llama3, deepseek-r1,
+ # …) stay reachable even when ollama.com's front page is dominated
+ # by brand-new releases the user might not be looking for.
+ live_names = {m["name"] for m in models}
+ for fb in _OLLAMA_FALLBACK_LIBRARY:
+ if fb["name"] not in live_names:
+ models.append(fb)
+ if not models:
+ models = list(_OLLAMA_FALLBACK_LIBRARY)
+ if err is None:
+ err = "parsed 0 results — using fallback list"
+ _ollama_library_cache["models"] = models
+ _ollama_library_cache["fetched_at"] = now
+ _ollama_library_cache["error"] = err
+ return {
+ "models": _ollama_library_cache["models"],
+ "fetched_at": _ollama_library_cache["fetched_at"],
+ "error": _ollama_library_cache["error"],
+ }
+
@router.get("/api/cookbook/tasks/status")
async def cookbook_tasks_status(request: Request):
"""Check status of all active cookbook tmux sessions.
@@ -2180,13 +2632,39 @@ def setup_cookbook_routes() -> APIRouter:
"inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));"
"sys.exit(0 if ok and not inc else 1)"
)
- if remote_host:
- cmd = ["python3", "-c", py, repo_id]
- else:
- # Local Windows: python3 can hit the Microsoft Store stub. Use the
- # real Python Odysseus is running under (guaranteed to exist).
- import sys as _sys_local
- cmd = [_sys_local.executable, "-c", py, repo_id]
+ cmd = ["python3", "-c", py, repo_id]
+ try:
+ if remote_host:
+ ssh_base = ["ssh"]
+ if ssh_port and ssh_port != "22":
+ ssh_base.extend(["-p", str(ssh_port)])
+ shell_cmd = " ".join(shlex.quote(x) for x in cmd)
+ proc = subprocess.run(ssh_base + [remote_host, shell_cmd], timeout=12, capture_output=True)
+ else:
+ proc = subprocess.run(cmd, timeout=12, capture_output=True)
+ return proc.returncode == 0
+ except Exception:
+ return False
+
+ def _download_cache_incomplete(repo_id: str, remote_host: str = "", ssh_port: str = "") -> bool:
+ """Best-effort check for resumable HF partial blobs.
+
+ A lost SSH/tmux session can leave a real download still incomplete.
+ Treat any *.incomplete blob as stronger evidence than stale
+ "100%" lines in the captured pane output.
+ """
+ if not repo_id or "/" not in repo_id:
+ return False
+ py = (
+ "import os,sys;"
+ "repo=sys.argv[1];"
+ "base=os.environ.get('HUGGINGFACE_HUB_CACHE') or os.path.join(os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface')), 'hub');"
+ "d=os.path.join(base,'models--'+repo.replace('/','--'));"
+ "blobs=os.path.join(d,'blobs');"
+ "inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));"
+ "sys.exit(0 if inc else 1)"
+ )
+ cmd = ["python3", "-c", py, repo_id]
try:
if remote_host:
ssh_base = ["ssh"]
@@ -2333,28 +2811,43 @@ def setup_cookbook_routes() -> APIRouter:
except Exception:
pass
else:
- try:
- alive = subprocess.run(check_cmd, timeout=10, capture_output=True)
- is_alive = alive.returncode == 0
- except Exception:
+ # Skip the live SSH check entirely for tasks already in a
+ # terminal state — they won't change, and 10s timeouts
+ # stacked per task were the dominant cost of this whole
+ # status endpoint (3+ minute stalls with ~8 accumulated
+ # stopped tasks). The agent's `list_served_models` call
+ # was blocking the chat stream every time.
+ _task_status = (task.get("status") or "").lower()
+ if _task_status in {"stopped", "done", "completed",
+ "crashed", "error", "failed",
+ "ended", "killed"}:
is_alive = False
-
- # Capture last lines for progress. Prefer the "Downloading" line
- # (real aggregate bytes) over "Fetching N files" (whole-file count that
- # lags with hf_transfer). Falls back to the true last line otherwise.
- if is_alive:
+ # Keep the persisted output_tail for the UI — it's
+ # what the agent uses to diagnose past failures.
+ full_snapshot = (task.get("output") or "")[-12000:]
+ else:
try:
- cap = subprocess.run(capture_cmd, timeout=10, capture_output=True, text=True)
- if cap.returncode == 0:
- full_snapshot = cap.stdout.strip()
- lines = [l.strip() for l in full_snapshot.split('\n') if l.strip()]
- downloading_lines = [l for l in lines if l.startswith("Downloading")]
- if downloading_lines:
- progress_text = downloading_lines[-1]
- elif lines:
- progress_text = lines[-1]
+ alive = subprocess.run(check_cmd, timeout=4, capture_output=True)
+ is_alive = alive.returncode == 0
except Exception:
- pass
+ is_alive = False
+
+ # Capture last lines for progress. Prefer the "Downloading" line
+ # (real aggregate bytes) over "Fetching N files" (whole-file count that
+ # lags with hf_transfer). Falls back to the true last line otherwise.
+ if is_alive:
+ try:
+ cap = subprocess.run(capture_cmd, timeout=4, capture_output=True, text=True)
+ if cap.returncode == 0:
+ full_snapshot = cap.stdout.strip()
+ lines = [l.strip() for l in full_snapshot.split('\n') if l.strip()]
+ downloading_lines = [l for l in lines if l.startswith("Downloading")]
+ if downloading_lines:
+ progress_text = downloading_lines[-1]
+ elif lines:
+ progress_text = lines[-1]
+ except Exception:
+ pass
# Determine status. For the local-Windows detached model the log file
# persists after the process exits, so a finished download still has a
@@ -2362,6 +2855,16 @@ def setup_cookbook_routes() -> APIRouter:
# when the PID is gone instead of blindly reporting "stopped".
download_zero_files = False
status = "unknown"
+ download_has_ok = task_type == "download" and "DOWNLOAD_OK" in full_snapshot
+ download_has_failed = task_type == "download" and "DOWNLOAD_FAILED" in full_snapshot
+ download_has_incomplete_evidence = (
+ task_type == "download"
+ and (
+ ".incomplete" in full_snapshot
+ or bool(re.search(r'model-\d+-of-\d+\.[A-Za-z0-9_.-]+:\s+(?:[0-9]|[1-8][0-9])%', full_snapshot))
+ or _download_cache_incomplete(_payload.get("repo_id") or model, remote, str(_tport or ""))
+ )
+ )
if is_alive or (local_win_task and full_snapshot):
lower = full_snapshot.lower()
exit_match = re.search(r"=== process exited with code\s+(-?\d+)", full_snapshot, re.I)
@@ -2374,20 +2877,24 @@ def setup_cookbook_routes() -> APIRouter:
elif has_exit and task_type == "download":
# Dependency installs are tracked as download tasks but only
# emit the generic runner exit marker, not HF download markers.
- status = "completed" if exit_code == 0 else "error"
+ if download_has_incomplete_evidence and not download_has_ok:
+ status = "running" if is_alive else "stopped"
+ else:
+ status = "completed" if exit_code == 0 else "error"
elif has_exit and "unrecognized arguments" in lower:
status = "error"
elif has_error and not ("application startup complete" in lower):
status = "error"
- elif task_type == "download" and ("100%" in full_snapshot or "DOWNLOAD_OK" in full_snapshot):
- # Only download tasks treat 100% as "completed".
- # Serve tasks log 100%|██████| during inference progress
- # (diffusion sampling, etc.) — that's "running", not done.
+ elif task_type == "download" and download_has_ok:
if re.search(r"Fetching\s+0\s+files", full_snapshot, re.IGNORECASE):
status = "error"
download_zero_files = True
else:
status = "completed"
+ elif task_type == "download" and download_has_failed:
+ status = "error"
+ elif task_type == "download" and download_has_incomplete_evidence:
+ status = "running" if is_alive else "stopped"
elif "application startup complete" in lower:
status = "ready"
elif not is_alive:
@@ -2397,7 +2904,11 @@ def setup_cookbook_routes() -> APIRouter:
status = "running"
else:
# Session is dead — check if it completed or crashed
- if task_type == "download" and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or "")):
+ if (
+ task_type == "download"
+ and not download_has_incomplete_evidence
+ and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or ""))
+ ):
status = "completed"
if not progress_text:
progress_text = "Download complete"
@@ -2407,12 +2918,12 @@ def setup_cookbook_routes() -> APIRouter:
status = "stopped"
# Parse structured phase info — single source of truth for the UI
- phase_info = _parse_serve_phase(full_snapshot, task_type) if (task_type == "serve" and status == "running" and full_snapshot) else {}
+ phase_info = _parse_serve_phase(full_snapshot, task_type) if (task_type == "serve" and full_snapshot) else {}
if phase_info.get("status") == "ready":
status = "ready"
serve_phase = phase_info.get("phase", "")
diagnosis = _diagnose_serve_output(full_snapshot) if task_type == "serve" and full_snapshot else None
- if diagnosis and status in {"running", "unknown", "stopped"}:
+ if diagnosis and status in {"running", "unknown", "stopped"} and phase_info.get("status") != "ready":
status = "error"
if download_zero_files:
diagnosis = {"message": "No matching files were downloaded. The model repo or filename/quant pattern may be wrong (for example a ':Q4_K_M' tag that does not exist in the repo). Check the repo and the include/quant pattern."}
diff --git a/routes/hwfit_routes.py b/routes/hwfit_routes.py
index a7af18b04..eb408ac9d 100644
--- a/routes/hwfit_routes.py
+++ b/routes/hwfit_routes.py
@@ -196,7 +196,24 @@ def setup_hwfit_routes():
if target_context is not None:
target_context = max(1024, min(target_context, 1000000))
- results = rank_models(system, use_case=use_case or None, limit=limit, search=search or None, sort=sort, quant=quant or None, target_context=target_context, fit_only=fit_only)
+ rank_kwargs = {
+ "use_case": use_case or None,
+ "limit": limit,
+ "search": search or None,
+ "sort": sort,
+ "quant": quant or None,
+ "fit_only": fit_only,
+ }
+ if target_context is not None:
+ rank_kwargs["target_context"] = target_context
+ try:
+ import inspect
+ supported = set(inspect.signature(rank_models).parameters)
+ rank_kwargs = {k: v for k, v in rank_kwargs.items() if k in supported}
+ except Exception:
+ rank_kwargs.pop("target_context", None)
+ rank_kwargs.pop("fit_only", None)
+ results = rank_models(system, **rank_kwargs)
return {"system": system, "models": results}
@router.get("/profiles")
diff --git a/routes/model_routes.py b/routes/model_routes.py
index 995705d75..6b76dc71f 100644
--- a/routes/model_routes.py
+++ b/routes/model_routes.py
@@ -5,7 +5,6 @@ import re
import uuid
import json
import socket
-import hashlib
import time as _time
import logging
import httpx
@@ -283,11 +282,8 @@ _HOST_TO_CURATED = (
("fireworks.ai", "fireworks"),
("googleapis.com", "google"),
("x.ai", "xai"),
-
("openrouter.ai", "openrouter"),
("ollama.com", "ollama"),
- ("opencode.ai/zen/go", "opencode-go"),
- ("opencode.ai/zen", "opencode-zen"),
)
@@ -494,8 +490,6 @@ _NON_CHAT_EXACT_PREFIXES = (
def _is_chat_model(model_id: str) -> bool:
"""Return True if the model ID looks like a chat/completions-capable model."""
mid = model_id.lower()
- if mid in {"gpt-5.1-codex"}:
- return True
for prefix in _NON_CHAT_PREFIXES:
if mid.startswith(prefix):
return False
@@ -508,67 +502,9 @@ def _is_chat_model(model_id: str) -> bool:
return True
-def _delete_orphaned_provider_auth(db, auth_id: Optional[str], exclude_ep_id: Optional[str] = None) -> bool:
- """Delete a ProviderAuthSession once no endpoint still references it.
-
- Subscription providers (e.g. ChatGPT Subscription) keep their refresh token
- in ProviderAuthSession rather than ModelEndpoint.api_key. When the last
- endpoint backed by that auth row is removed, the stored credentials should
- be cleared instead of lingering. Returns True if a row was deleted.
- ``exclude_ep_id`` drops the endpoint currently being deleted from the
- reference count so it does not keep its own auth alive.
- """
- if not auth_id:
- return False
- from core.database import ProviderAuthSession
- still_referenced = db.query(ModelEndpoint.id).filter(
- ModelEndpoint.provider_auth_id == auth_id,
- ModelEndpoint.id != exclude_ep_id,
- ).first()
- if still_referenced is not None:
- return False
- auth_row = db.query(ProviderAuthSession).filter(ProviderAuthSession.id == auth_id).first()
- if auth_row is None:
- return False
- db.delete(auth_row)
- return True
-
-
-def _is_discovery_only_provider(provider: str) -> bool:
- """Provider that only supports model discovery, not live probing.
-
- ChatGPT Subscription speaks the Responses/Codex API and has no
- chat-completions or general health endpoint, so completion probes and
- reachability pings are skipped — status is derived from cached models.
- """
- return provider == "chatgpt-subscription"
-
-
-def _resolve_probe_key(ep) -> Optional[str]:
- """API key/bearer to probe an endpoint with.
-
- Delegates to ``resolve_endpoint_runtime``, which already returns the static
- ``ModelEndpoint.api_key`` for keyed endpoints and resolves (and refreshes)
- the runtime bearer for session-backed providers (e.g. ChatGPT Subscription).
- Returns None if resolution fails (e.g. re-auth required) so probing skips
- rather than raising. Reads only already-loaded scalar attributes of ``ep``.
- """
- try:
- from src.endpoint_resolver import resolve_endpoint_runtime
- _base, key = resolve_endpoint_runtime(ep, owner=getattr(ep, "owner", None))
- return key
- except Exception as e:
- logger.warning("Probe key resolution failed for %s: %s", getattr(ep, "id", "?"), e)
- return None
-
-
-def _probe_single_model(base: str, api_key: Optional[str], model_id: str, timeout: int = 10, with_tools: bool = False) -> dict:
+def _probe_single_model(base: str, api_key: str, model_id: str, timeout: int = 10, with_tools: bool = False) -> dict:
"""Send a realistic completion request to a single model. Returns {status, latency_ms, error?}."""
provider = _detect_provider(base)
- if _is_discovery_only_provider(provider):
- # Responses/Codex API, not chat-completions: a completion probe would
- # 400 and the re-probe flow would then hide every model. Discovery-only.
- return {"status": "ok", "latency_ms": 0, "skipped": True}
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Say OK"},
@@ -682,11 +618,6 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
For Anthropic, queries their /v1/models API, falling back to hardcoded list."""
from src.endpoint_resolver import resolve_url
base = resolve_url(_normalize_base(base_url))
- if _detect_provider(base) == "chatgpt-subscription":
- from src.chatgpt_subscription import fetch_available_models
- if api_key:
- return fetch_available_models(api_key, timeout=timeout)
- return []
if _detect_provider(base) == "anthropic":
# Try Anthropic's /v1/models endpoint first
url = build_models_url(base)
@@ -713,10 +644,6 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
logger.warning(f"Anthropic /v1/models failed, using hardcoded list: {e}")
return list(ANTHROPIC_MODELS)
url = build_models_url(base)
- if not url:
- curated_key = _match_provider_curated(base, None)
- fallback = _PROVIDER_CURATED.get(curated_key) if curated_key else None
- return list(fallback or [])
headers = build_headers(api_key, base)
try:
r = httpx.get(url, headers=headers, timeout=timeout, verify=llm_verify())
@@ -770,6 +697,7 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
return list(fallback)
return []
+
def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) -> Dict[str, Any]:
"""Reachability probe that does not require installed/listed models."""
from src.endpoint_resolver import resolve_url
@@ -785,10 +713,6 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) ->
or "ollama" in (parsed_base.hostname or "").lower()
)
- # APFEL-specific detection
- host = (parsed_base.hostname or "").lower()
- looks_like_apfel = "apfel" in host or parsed_base.port == 11435
-
def _result_from_response(r) -> Dict[str, Any]:
if 300 <= r.status_code < 400:
loc = r.headers.get("location", "")
@@ -810,23 +734,7 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) ->
last_error: Optional[str] = None
try:
- # APFEL does not behave like Ollama; use its health endpoint.
- if looks_like_apfel:
- root = base
- for suffix in ("/v1", "/api"):
- if root.endswith(suffix):
- root = root[: -len(suffix)].rstrip("/")
- break
- try:
- r = httpx.get(root + "/health", timeout=timeout, verify=llm_verify())
- result = _result_from_response(r)
- if result["reachable"]:
- return result
- last_error = result.get("error")
- except Exception as e:
- last_error = str(e)[:120]
-
- elif looks_like_ollama:
+ if looks_like_ollama:
root = base
for suffix in ("/v1", "/api"):
if root.endswith(suffix):
@@ -844,33 +752,44 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) ->
except Exception:
pass
+ # OpenAI-compatible servers (vLLM, llama.cpp, SGLang, lmdeploy, …) expose
+ # /v1/models but return 404 on the bare /v1 root. The probe used to GET
+ # the base URL only, so a fully-working vLLM endpoint (chats fine!) read
+ # as offline because /v1 → 404. Try /models first; fall back to the base
+ # URL only if /models couldn't be reached (TCP-level failure).
+ models_url = build_models_url(base)
+ try:
+ r = httpx.get(models_url, headers=headers, timeout=timeout, verify=llm_verify())
+ result = _result_from_response(r)
+ if result["reachable"]:
+ return result
+ last_error = result.get("error")
+ except Exception as e:
+ last_error = str(e)[:120]
+
try:
r = httpx.get(base, headers=headers, timeout=timeout, verify=llm_verify())
result = _result_from_response(r)
- # If the bare base URL returns a non-auth 4xx (e.g. 404), try /models
- # as a fallback. OpenAI-compatible servers like llama-swap return 404
- # on the base /v1 prefix but 200 on /v1/models. Auth failures (401/403)
- # are definitive — probing /models would just repeat the same rejection.
- if (
- not result["reachable"]
- and result.get("status_code") is not None
- and 400 <= result["status_code"] < 500
- and result["status_code"] not in (401, 403)
- ):
- models_url = build_models_url(base)
- try:
- r2 = httpx.get(models_url, headers=headers, timeout=timeout, verify=llm_verify())
- result2 = _result_from_response(r2)
- if result2["reachable"]:
- return result2
- except Exception:
- pass
- return result
+ if result["reachable"]:
+ return result
+ # 4xx from a reachable HTTP server (404 /v1, 401/403 missing key) is
+ # still proof the upstream is alive. Only treat connection-level
+ # failures, 5xx, and redirect-to-/login as truly offline.
+ sc = result.get("status_code") or 0
+ if 400 <= sc < 500 and sc not in (407, 408, 421, 425, 429):
+ return {
+ "reachable": True,
+ "status_code": sc,
+ "error": None,
+ }
+ last_error = result.get("error") or last_error
except Exception as e:
last_error = str(e)[:120]
return {"reachable": False, "status_code": None, "error": last_error}
+
+
def _model_endpoint_error_message(base_url: str, ping: Dict[str, Any] = None) -> str:
"""Return a provider-aware error message for failed endpoint probes."""
ping = ping or {}
@@ -959,14 +878,6 @@ def _visible_models(cached_models, hidden_models, pinned_models=None):
return [m for m in merged if m not in hidden]
-def _api_key_fingerprint(api_key: Optional[str]) -> str:
- """Stable, non-secret label for distinguishing same-URL credentials."""
- key = (api_key or "").strip()
- if not key:
- return ""
- return hashlib.sha256(key.encode("utf-8")).hexdigest()[:8]
-
-
def setup_model_routes(model_discovery):
router = APIRouter(prefix="/api")
@@ -1068,17 +979,6 @@ def setup_model_routes(model_discovery):
ok, info = _should_refresh_endpoint(ep, now, force=force)
if not ok:
continue
- if getattr(ep, "provider_auth_id", None):
- try:
- from src.endpoint_resolver import resolve_endpoint_runtime
- info["base"], info["api_key"] = resolve_endpoint_runtime(
- ep,
- owner=getattr(ep, "owner", None),
- )
- info["key"] = _refresh_key(info["base"], info["api_key"])
- except Exception as e:
- logger.warning("Skipping model refresh for %s: could not resolve provider auth: %s", getattr(ep, "name", ep.id), e)
- continue
groups.setdefault(info["key"], {
"base": info["base"],
"api_key": info["api_key"],
@@ -1232,9 +1132,8 @@ def setup_model_routes(model_discovery):
raise HTTPException(401, "Not authenticated")
except HTTPException:
raise
- except Exception as e:
- logger.error('Auth gate error in GET /api/models, failing closed: %s', e)
- raise HTTPException(status_code=500, detail='Internal error')
+ except Exception:
+ pass
# Admins see every endpoint (they manage the global pool); regular
# users get the owner-scoped view.
_is_admin = False
@@ -1298,7 +1197,14 @@ def setup_model_routes(model_discovery):
t0 = _time.time()
try:
import asyncio as _asyncio
- ping = await _asyncio.to_thread(_ping_endpoint, data["base"], data.get("api_key"), 1.5)
+ # Bumped 1.5s → 3.5s. The previous 1.5s budget was clipping
+ # local vLLM endpoints on Tailscale links where the model
+ # server is still loading (Qwen3.5-122B takes 2–3 min to
+ # warm); /v1/models can take 500–2500 ms on a busy box,
+ # which pushed _ping_endpoint's full path-discovery sweep
+ # past the cap and marked the row offline despite the
+ # user actively chatting with it.
+ ping = await _asyncio.to_thread(_ping_endpoint, data["base"], data.get("api_key"), 3.5)
lat = round((_time.time() - t0) * 1000)
return {
"alive": bool(ping.get("reachable")),
@@ -1348,20 +1254,12 @@ def setup_model_routes(model_discovery):
"endpoint_kind": kind,
}
try:
- if _is_discovery_only_provider(provider):
- # No general health endpoint — an unauthenticated GET just
- # 401s. Report status from cached models instead of pinging.
- entry["latency_ms"] = None
- entry["status"] = "online" if cached_count else "offline"
- entry["error"] = None
- entry["model_count"] = cached_count
- else:
- t0 = _time.time()
- ping = _ping_endpoint(base, ep.api_key, timeout=1.5)
- entry["latency_ms"] = round((_time.time() - t0) * 1000)
- entry["status"] = "online" if ping.get("reachable") or cached_count else "offline"
- entry["error"] = ping.get("error")
- entry["model_count"] = cached_count or (len(ANTHROPIC_MODELS) if provider == "anthropic" else 0)
+ t0 = _time.time()
+ ping = _ping_endpoint(base, ep.api_key, timeout=1.5)
+ entry["latency_ms"] = round((_time.time() - t0) * 1000)
+ entry["status"] = "online" if ping.get("reachable") or cached_count else "offline"
+ entry["error"] = ping.get("error")
+ entry["model_count"] = cached_count or (len(ANTHROPIC_MODELS) if provider == "anthropic" else 0)
except Exception as e:
entry["latency_ms"] = None
entry["status"] = "online" if cached_count else "offline"
@@ -1394,7 +1292,7 @@ def setup_model_routes(model_discovery):
if ep_id and ep_id not in endpoints_cache:
ep = db.query(ModelEndpoint).filter(ModelEndpoint.id == ep_id).first()
if ep:
- endpoints_cache[ep_id] = {"base_url": ep.base_url, "api_key": _resolve_probe_key(ep)}
+ endpoints_cache[ep_id] = {"base_url": ep.base_url, "api_key": ep.api_key}
ep_data = endpoints_cache.get(ep_id)
if not ep_data:
# Try to find by base_url from the model's endpoint field
@@ -1433,7 +1331,7 @@ def setup_model_routes(model_discovery):
"id": ep.id,
"name": ep.name,
"base_url": ep.base_url,
- "api_key": _resolve_probe_key(ep),
+ "api_key": ep.api_key,
})
finally:
db.close()
@@ -1522,21 +1420,43 @@ def setup_model_routes(model_discovery):
# Endpoint counts as reachable if it has any model — including
# admin-pinned IDs that a probe would never surface.
status = "online" if (all_models or pinned) else "offline"
- base = _normalize_base(r.base_url)
ping = None
- # Discovery-only providers have no health endpoint — an
- # unauthenticated ping just 401s, so don't bother.
- if not all_models and not pinned and r.is_enabled and not _is_discovery_only_provider(_detect_provider(base)):
- ping = _ping_endpoint(r.base_url, r.api_key, timeout=1.0)
+ # When cached_models is empty, do a quick reachability probe.
+ # Bumped 1.0s → 3.5s because the user reported endpoints they
+ # were ACTIVELY chatting with showed "offline" — the previous
+ # 1s timeout was clipping live cloud endpoints (DeepSeek can
+ # take 1.5–2.5s on /v1/models when their region is under load,
+ # vLLM on a remote GPU box behind SSH can also push past 1s).
+ # 3.5s still keeps the picker render snappy in the common
+ # "everything's already cached" path because this branch only
+ # runs for endpoints with an empty cached_models.
+ if not all_models and not pinned and r.is_enabled:
+ ping = _ping_endpoint(r.base_url, r.api_key, timeout=3.5)
if ping.get("reachable"):
status = "empty"
+ # Best-effort: if the probe came back reachable, try
+ # to populate cached_models in the background so the
+ # NEXT picker load shows "online" instead of "empty".
+ # Failure here is silent — we already returned the
+ # "empty" status, and the existing background refresh
+ # path will eventually fill it in too.
+ try:
+ probed = _probe_endpoint(r.base_url, r.api_key, timeout=5)
+ if probed:
+ r.cached_models = json.dumps(probed)
+ db.commit()
+ all_models = probed
+ visible = _visible_models(all_models, r.hidden_models, pinned)
+ status = "online"
+ except Exception as _refill_err:
+ logger.debug(f"opportunistic cached_models refill failed for {r.id}: {_refill_err!r}")
+ base = _normalize_base(r.base_url)
kind = _effective_endpoint_kind(r, base)
results.append({
"id": r.id,
"name": r.name,
"base_url": r.base_url,
"has_key": bool(r.api_key),
- "api_key_fingerprint": _api_key_fingerprint(r.api_key),
"is_enabled": r.is_enabled,
"models": visible,
"pinned_models": pinned,
@@ -1603,34 +1523,21 @@ def setup_model_routes(model_discovery):
)
explicit_timeout = _explicit_model_list_timeout(base_url, requested_kind, refresh_timeout)
- # Dedupe: if an endpoint with the same base_url and compatible
- # credentials already exists and is reachable by the caller (shared or
- # owned by them), return it instead of creating a duplicate row. Keep
- # same-url/different-key rows distinct so users can group the same
- # provider URL under multiple credentials.
+ # Dedupe: if an endpoint with the same base_url already exists and
+ # is reachable by the caller (shared or owned by them), return it
+ # instead of creating a duplicate row. Fixes "Scan for Servers"
+ # re-adding manually-added endpoints under their host:port name.
from src.auth_helpers import get_current_user as _gcu_dedup
_caller = _gcu_dedup(request) or None
- _incoming_api_key = api_key.strip()
_db_dedup = SessionLocal()
try:
- _same_url_rows = (
+ existing = (
_db_dedup.query(ModelEndpoint)
.filter(ModelEndpoint.base_url == base_url)
.filter((ModelEndpoint.owner.is_(None)) | (ModelEndpoint.owner == _caller))
.order_by(ModelEndpoint.owner.desc()) # prefer owned over shared
- .all()
+ .first()
)
- existing = None
- _empty_key_existing = None
- for _candidate in _same_url_rows:
- _candidate_key = (getattr(_candidate, "api_key", None) or "").strip()
- if _candidate_key == _incoming_api_key:
- existing = _candidate
- break
- if _incoming_api_key and not _candidate_key and _empty_key_existing is None:
- _empty_key_existing = _candidate
- if existing is None and _incoming_api_key and _empty_key_existing is not None:
- existing = _empty_key_existing
if existing:
changed = False
# Persist any incoming pinned IDs onto the existing row. An
@@ -1679,8 +1586,6 @@ def setup_model_routes(model_discovery):
"id": existing.id,
"name": existing.name,
"base_url": existing.base_url,
- "has_key": bool(existing.api_key),
- "api_key_fingerprint": _api_key_fingerprint(existing.api_key),
"models": _visible_models(
existing_models,
getattr(existing, "hidden_models", None),
@@ -1754,8 +1659,6 @@ def setup_model_routes(model_discovery):
"id": ep_id,
"name": name.strip(),
"base_url": base_url,
- "has_key": bool(api_key.strip()),
- "api_key_fingerprint": _api_key_fingerprint(api_key),
"models": _merge_model_ids(model_ids, _pinned),
"pinned_models": _pinned,
"online": bool(model_ids) or bool(_pinned) or bool(ping.get("reachable")),
@@ -1805,7 +1708,7 @@ def setup_model_routes(model_discovery):
ep = db.query(ModelEndpoint).filter(ModelEndpoint.id == ep_id).first()
if not ep:
raise HTTPException(404, "Endpoint not found")
- ep_data = {"id": ep.id, "name": ep.name, "base_url": ep.base_url, "api_key": _resolve_probe_key(ep)}
+ ep_data = {"id": ep.id, "name": ep.name, "base_url": ep.base_url, "api_key": ep.api_key}
finally:
db.close()
@@ -1869,7 +1772,7 @@ def setup_model_routes(model_discovery):
category = _classify_endpoint(base, kind)
timeout = _manual_refresh_timeout(ep, category, refresh_timeout)
try:
- probed = _probe_endpoint(base, _resolve_probe_key(ep), timeout=timeout)
+ probed = _probe_endpoint(base, ep.api_key, timeout=timeout)
except Exception as exc:
logger.warning("Manual model refresh failed for endpoint %s at %s: %s", ep_id, base, exc)
probed = []
@@ -2105,8 +2008,6 @@ def setup_model_routes(model_discovery):
"name": ep.name,
"model_type": ep.model_type,
"base_url": ep.base_url,
- "has_key": bool(ep.api_key),
- "api_key_fingerprint": _api_key_fingerprint(ep.api_key),
"pinned_models": _normalize_model_ids(getattr(ep, "pinned_models", None)),
"endpoint_kind": getattr(ep, "endpoint_kind", None) or "auto",
"model_refresh_mode": getattr(ep, "model_refresh_mode", None) or "auto",
@@ -2208,9 +2109,7 @@ def setup_model_routes(model_discovery):
cleared_user_preferences = _clear_user_prefs_for_endpoint(ep_id)
cleared_sessions = _clear_sessions_for_endpoint(db, ep.base_url)
cleared_loaded_sessions = _clear_loaded_sessions_for_endpoint(ep.base_url)
- auth_id = getattr(ep, "provider_auth_id", None)
db.delete(ep)
- cleared_provider_auth = _delete_orphaned_provider_auth(db, auth_id, exclude_ep_id=ep_id)
db.commit()
_invalidate_models_cache()
_local_probe_cache["data"] = None
@@ -2220,7 +2119,6 @@ def setup_model_routes(model_discovery):
"cleared_user_preferences": cleared_user_preferences,
"cleared_sessions": cleared_sessions,
"cleared_loaded_sessions": cleared_loaded_sessions,
- "cleared_provider_auth": cleared_provider_auth,
}
finally:
db.close()
diff --git a/services/hwfit/data/hf_models.json b/services/hwfit/data/hf_models.json
index e73cc26dc..35b55d9a9 100644
--- a/services/hwfit/data/hf_models.json
+++ b/services/hwfit/data/hf_models.json
@@ -14036,6 +14036,29 @@
"vision"
]
},
+ {
+ "name": "google/gemma-4-12B",
+ "provider": "Google",
+ "parameter_count": "12.0B",
+ "parameters_raw": 12000000000,
+ "min_ram_gb": 24.0,
+ "recommended_ram_gb": 32.0,
+ "min_vram_gb": 24.0,
+ "quantization": "BF16",
+ "context_length": 131072,
+ "use_case": "General purpose, multimodal",
+ "is_moe": false,
+ "num_experts": null,
+ "active_experts": null,
+ "active_parameters": null,
+ "architecture": "gemma4",
+ "pipeline_tag": "image-text-to-text",
+ "release_date": "2026-04-01",
+ "gguf_sources": [],
+ "capabilities": [
+ "vision"
+ ]
+ },
{
"name": "google/gemma-4-31B-it",
"provider": "Google",
@@ -19121,4 +19144,4 @@
],
"_discovered": true
}
-]
\ No newline at end of file
+]
diff --git a/services/memory/skill_extractor.py b/services/memory/skill_extractor.py
index e763bca4c..79e4c67c2 100644
--- a/services/memory/skill_extractor.py
+++ b/services/memory/skill_extractor.py
@@ -243,6 +243,20 @@ async def maybe_extract_skill(
logger.debug("[skill-extract] '%s' already exists — dropped as duplicate", title)
return None
+ # Auto-publish gate: if the user has `auto_approve_skills` on, the
+ # newly-extracted skill is created `published` immediately rather
+ # than waiting for the next audit batch. The audit still runs later
+ # and can demote it back to `draft` (or delete) on failure. Default
+ # ON matches the UI label "Auto-approve skills".
+ _initial_status = "draft"
+ try:
+ from routes.prefs_routes import _load_for_user as _load_prefs
+ _prefs = _load_prefs(owner) or {}
+ if _prefs.get("auto_approve_skills", True):
+ _initial_status = "published"
+ except Exception:
+ pass
+
entry = skills_manager.add_skill(
title=title,
problem=data.get("problem", ""),
@@ -253,6 +267,7 @@ async def maybe_extract_skill(
confidence=data.get("confidence", 0.7),
session_id=getattr(session, "session_id", None),
owner=owner,
+ status=_initial_status,
)
try:
from src.event_bus import fire_event
diff --git a/src/tool_implementations.py b/src/tool_implementations.py
index 548f6f0f5..5e62e686c 100644
--- a/src/tool_implementations.py
+++ b/src/tool_implementations.py
@@ -664,6 +664,17 @@ async def do_manage_skills(content: str, owner: Optional[str] = None) -> Dict:
proc = args.get("steps") or []
if not proc and not args.get("body_extra") and not args.get("solution"):
return {"error": "procedure (or solution body) is required", "exit_code": 1}
+ # Same auto-publish gate as the extractor path — when the user
+ # has auto_approve_skills on and the caller didn't pin an explicit
+ # status, publish immediately. Audit later demotes/removes on fail.
+ _status_arg = args.get("status")
+ if not _status_arg:
+ try:
+ from routes.prefs_routes import _load_for_user as _load_prefs
+ _prefs = _load_prefs(owner) or {}
+ _status_arg = "published" if _prefs.get("auto_approve_skills", True) else "draft"
+ except Exception:
+ _status_arg = "draft"
entry = sm.add_skill(
name=args.get("name"),
description=(args.get("description") or args.get("title") or "").strip(),
@@ -677,7 +688,7 @@ async def do_manage_skills(content: str, owner: Optional[str] = None) -> Dict:
procedure=proc,
pitfalls=args.get("pitfalls") or [],
verification=args.get("verification") or [],
- status=args.get("status") or "draft",
+ status=_status_arg,
version=args.get("version") or "1.0.0",
confidence=args.get("confidence", 0.8),
source=args.get("source", "learned"),
@@ -2621,8 +2632,90 @@ async def _cookbook_env_for_host(host: str) -> Dict[str, Any]:
}
-async def _cookbook_register_task(session_id: str, model: str, host: str,
- cmd: str, task_type: str = "serve") -> bool:
+def _infer_serve_port(cmd: str) -> int:
+ """Infer likely listen port from a serve command."""
+ if not cmd:
+ return 8080
+ m = re.search(r"--port\\s+(\\d+)", cmd)
+ if m:
+ try:
+ return int(m.group(1))
+ except Exception:
+ pass
+ m = re.search(r"OLLAMA_HOST=[^\\s]*?:(\\d+)", cmd)
+ if m:
+ try:
+ return int(m.group(1))
+ except Exception:
+ pass
+ if "ollama" in cmd:
+ return 11434
+ return 8080
+
+
+def _infer_serve_host(host: str | None) -> tuple[str, bool]:
+ """Return (host, container_local) for registering a served endpoint."""
+ if not (host or "").strip():
+ return "localhost", True
+ base_host = host.split("@", 1)[-1] if "@" in host else host
+ return base_host, False
+
+
+async def _ensure_served_endpoint(
+ *,
+ model: str,
+ cmd: str,
+ host: str | None,
+) -> Dict[str, Any]:
+ """Register/fetch a model endpoint for a running serve session."""
+ import httpx
+ endpoint_host, container_local = _infer_serve_host(host)
+ port = _infer_serve_port(cmd)
+ base_url = f"http://{endpoint_host}:{port}/v1"
+ short_name = model.split("/")[-1] if "/" in model else model
+ is_image = "diffusion_server.py" in (cmd or "")
+ payload = {
+ "name": short_name if not is_image else f"{short_name} (image)",
+ "base_url": base_url,
+ "skip_probe": "true",
+ "model_type": "image" if is_image else "llm",
+ "container_local": "true" if container_local else "false",
+ }
+ try:
+ async with httpx.AsyncClient(timeout=30) as client:
+ resp = await client.post(
+ f"{_COOKBOOK_BASE}/api/model-endpoints",
+ data=payload,
+ headers=_internal_headers(),
+ )
+ data = resp.json() if resp.headers.get("content-type", "").startswith("application/json") else {}
+ if resp.status_code >= 400:
+ logger.debug(
+ f"ensure endpoint failed for {model!r}: status={resp.status_code} data={data}"
+ )
+ return {"added": False, "endpoint_id": "", "base_url": base_url, "error": data}
+ ep_id = data.get("id") if isinstance(data, dict) else None
+ return {
+ "added": bool(ep_id),
+ "endpoint_id": ep_id or "",
+ "base_url": base_url,
+ "data": data,
+ }
+ except Exception as e:
+ logger.debug(f"ensure endpoint exception for {model!r}: {e}")
+ return {"added": False, "endpoint_id": "", "base_url": base_url, "error": str(e)}
+
+
+async def _cookbook_register_task(
+ session_id: str,
+ model: str,
+ host: str,
+ cmd: str,
+ task_type: str = "serve",
+ *,
+ endpoint_added: bool = False,
+ endpoint_id: str = "",
+) -> bool:
"""Append a task entry to cookbook_state.json after the agent
launches via /api/model/serve or /api/model/download. The route
spawns tmux but leaves state-writing to the UI; the agent needs to
@@ -2672,7 +2765,8 @@ async def _cookbook_register_task(session_id: str, model: str, host: str,
"sshPort": "",
"platform": "linux",
"_serveReady": False,
- "_endpointAdded": False,
+ "_endpointAdded": bool(endpoint_added),
+ "_endpointId": endpoint_id or "",
})
state["tasks"] = tasks
try:
@@ -3008,7 +3102,12 @@ async def do_download_model(content: str, owner: Optional[str] = None) -> Dict:
if _servers.get("default_host"):
host = _servers["default_host"]
_host_defaulted = True
+ backend = (args.get("backend") or "").strip().lower()
+ if not backend and "/" not in repo_id and ":" in repo_id:
+ backend = "ollama"
payload = {"repo_id": repo_id}
+ if backend:
+ payload["backend"] = backend
if host:
payload["remote_host"] = host
if args.get("include"):
@@ -3028,12 +3127,20 @@ async def do_download_model(content: str, owner: Optional[str] = None) -> Dict:
sid = data.get("session_id", "?")
registered = await _cookbook_register_task(
session_id=sid, model=repo_id, host=host,
- cmd=f"hf download {repo_id}", task_type="download",
+ cmd=(f"ollama pull {repo_id}" if backend == "ollama" else f"hf download {repo_id}"),
+ task_type="download",
)
note = "" if registered else " (state-write failed — download may not show in UI)"
where = host or "local"
default_note = " (defaulted to the cookbook's selected server — pass host= or local=true to override)" if _host_defaulted else ""
- return {"output": f"Download started: {repo_id} on {where} (session: {sid}){note}{default_note}", "session_id": sid, "host": host, "exit_code": 0}
+ return {
+ "output": f"Download started: {repo_id} on {where} (session: {sid}){note}{default_note}",
+ "session_id": sid,
+ "host": host,
+ "task_type": "download",
+ "phase": "running",
+ "exit_code": 0,
+ }
return {"error": data.get("error", "Download failed"), "exit_code": 1}
except Exception as e:
return {"error": str(e), "exit_code": 1}
@@ -3102,12 +3209,28 @@ async def do_serve_model(content: str, owner: Optional[str] = None) -> Dict:
data = resp.json()
if data.get("ok"):
sid = data.get("session_id", "?")
+ endpoint_id = data.get("endpoint_id") or ""
+ if endpoint_id:
+ endpoint_added = True
+ else:
+ endpoint_meta = await _ensure_served_endpoint(model=repo_id, cmd=cmd, host=host)
+ endpoint_added = bool(endpoint_meta.get("added"))
+ endpoint_id = endpoint_meta.get("endpoint_id", "") or endpoint_id
registered = await _cookbook_register_task(
session_id=sid, model=repo_id,
host=host, cmd=cmd, task_type="serve",
+ endpoint_added=endpoint_added, endpoint_id=endpoint_id or "",
)
note = "" if registered else " (state-write failed — task may not show in UI)"
- return {"output": f"Serving {repo_id} (session: {sid}){note}", "session_id": sid, "exit_code": 0}
+ return {
+ "output": f"Serving {repo_id} (session: {sid}){note}",
+ "session_id": sid,
+ "task_type": "serve",
+ "phase": "running",
+ "host": host,
+ "endpoint_id": endpoint_id,
+ "exit_code": 0,
+ }
# FastAPI HTTPException puts the message under `detail`, not `error`.
# Surface BOTH so the agent sees "Invalid characters in cmd" (from
# _validate_serve_cmd rejecting `&&`/`source`/`cd`) instead of
@@ -3804,7 +3927,8 @@ async def do_serve_preset(content: str, owner: Optional[str] = None) -> Dict:
if env_cfg.get("gpus"): payload["gpus"] = env_cfg["gpus"]
if env_cfg.get("hf_token"): payload["hf_token"] = env_cfg["hf_token"]
if env_cfg.get("platform"): payload["platform"] = env_cfg["platform"]
- if env_cfg.get("ssh_port"): payload["ssh_port"] = env_cfg["ssh_port"]
+ if env_cfg.get("ssh_port"):
+ payload["ssh_port"] = env_cfg["ssh_port"]
try:
async with httpx.AsyncClient(timeout=30) as client:
@@ -3813,12 +3937,20 @@ async def do_serve_preset(content: str, owner: Optional[str] = None) -> Dict:
data = resp.json()
if data.get("ok"):
sid = data.get("session_id", "?")
+ endpoint_id = data.get("endpoint_id") or ""
+ if endpoint_id:
+ endpoint_added = True
+ else:
+ endpoint_meta = await _ensure_served_endpoint(model=repo_id, cmd=cmd, host=host)
+ endpoint_added = bool(endpoint_meta.get("added"))
+ endpoint_id = endpoint_meta.get("endpoint_id", "") or endpoint_id
registered = await _cookbook_register_task(
session_id=sid, model=repo_id, host=host,
cmd=cmd, task_type="serve",
+ endpoint_added=endpoint_added, endpoint_id=endpoint_id or "",
)
note = "" if registered else " (state-write failed — task may not show in UI)"
- return {"output": f"Launched preset {chosen.get('name')!r}: {repo_id} on {host or 'local'} (session: {sid}){note}", "session_id": sid, "exit_code": 0}
+ return {"output": f"Launched preset {chosen.get('name')!r}: {repo_id} on {host or 'local'} (session: {sid}){note}", "session_id": sid, "host": host, "endpoint_id": endpoint_id, "exit_code": 0}
return {"error": data.get("error", "Serve failed"), "exit_code": 1}
except Exception as e:
return {"error": str(e), "exit_code": 1}
diff --git a/static/index.html b/static/index.html
index ec4af199f..ae3092659 100644
--- a/static/index.html
+++ b/static/index.html
@@ -1492,21 +1492,7 @@
-
-
Agent
-
Controls for the agent tool loop.
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
-
-
-
-
-
- Quickstart
-
-
-
-
-
-
+
+
+
@@ -2116,19 +2109,33 @@
-
-
+
+
+
+
+
-
+
+
+
-
+
@@ -2136,7 +2143,15 @@
-
Added Models (Endpoints)
+
Added Models (Endpoints)
+
+
+
+
Manage the endpoints you've added.
@@ -2167,10 +2182,45 @@
+
+
API Tokens
+
Bearer tokens for external integrations (scripts, Codex, headless agent runs). Token value shown ONCE on create — copy it then.
+
+
+
+
+
+
+
+
+
Copy now — this is the only time you'll see it:
+
+
+
+
+
+
Agent
+
Controls for the agent tool loop.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Agent loop
+
Supervisor ladder. When on, every effectful agent turn that claims done is verified; on FAIL the ladder escalates verify → different method → teacher → stop-with-blocker, each rung visible in chat. Teacher rung requires teacher_model to be set.
+
Built-in Tools
Enable or disable tools available to the AI agent.
diff --git a/static/js/admin.js b/static/js/admin.js
index e4a39adf3..82b90b737 100644
--- a/static/js/admin.js
+++ b/static/js/admin.js
@@ -1149,6 +1149,144 @@ function initEndpointForm() {
}
}
+ // API Key reveal toggle. The key inputs are hidden by default so the Add
+ // form reads as a single action row; the Key button toggles the input row
+ // and flips aria-expanded for screen readers / CSS pseudo-classes.
+ const _wireKeyToggle = (btnId, rowId) => {
+ const btn = el(btnId);
+ const row = el(rowId);
+ if (!btn || !row) return;
+ btn.addEventListener('click', () => {
+ const showing = row.style.display !== 'none';
+ row.style.display = showing ? 'none' : '';
+ btn.setAttribute('aria-expanded', showing ? 'false' : 'true');
+ btn.style.opacity = showing ? '0.75' : '1';
+ if (!showing) {
+ const inp = row.querySelector('input');
+ if (inp) inp.focus();
+ }
+ });
+ };
+ _wireKeyToggle('adm-epLocalKeyBtn', 'adm-epLocalApiKey-row');
+ _wireKeyToggle('adm-epApiKeyBtn', 'adm-epApiKey-row');
+
+ // ── Added Models toolbar: Probe + Clear offline ────────────────────
+ // Both buttons act over the currently-rendered endpoint list. The
+ // online/offline marker is stamped on each row's [data-adm-ep-online]
+ // attribute by loadEndpoints(), so both buttons just iterate the DOM
+ // without re-fetching anything they don't already have.
+ const _refreshOfflineCount = () => {
+ const lbl = el('adm-epOfflineCount');
+ if (!lbl) return;
+ const n = document.querySelectorAll('[data-adm-ep-id] [data-adm-ep-online="0"]').length;
+ lbl.textContent = n > 0 ? `(${n})` : '';
+ // Keep the button enabled even when there are no offline rows — a
+ // click on the empty case fires a toast instead of feeling dead.
+ const btn = el('adm-epClearOfflineBtn');
+ if (btn) btn.style.opacity = n === 0 ? '0.55' : '0.85';
+ };
+ // Wire after every loadEndpoints() run by patching the render hook —
+ // simplest path: MutationObserver on the two list containers.
+ const _obsRoots = ['adm-epList-local', 'adm-epList-api']
+ .map(id => el(id)).filter(Boolean);
+ if (_obsRoots.length) {
+ const mo = new MutationObserver(_refreshOfflineCount);
+ _obsRoots.forEach(r => mo.observe(r, { childList: true, subtree: true }));
+ _refreshOfflineCount();
+ }
+
+ const probeAllBtn = el('adm-epProbeAllBtn');
+ if (probeAllBtn) {
+ probeAllBtn.addEventListener('click', async () => {
+ probeAllBtn.disabled = true;
+ const origHTML = probeAllBtn.innerHTML;
+ probeAllBtn.innerHTML = 'Probing…';
+ try {
+ // Hit the bulk local probe (same one the model picker uses).
+ await fetch('/api/model-endpoints/probe-local', { credentials: 'same-origin' }).catch(() => {});
+ // Then per-endpoint /probe for the rest so API/cloud endpoints
+ // refresh too. Parallel — capped to 6 at a time so we don't
+ // hammer the backend on a big list.
+ const ids = Array.from(document.querySelectorAll('[data-adm-ep-id]')).map(r => r.getAttribute('data-adm-ep-id')).filter(Boolean);
+ const lane = async (id) => {
+ try { await fetch(`/api/model-endpoints/${id}/probe`, { credentials: 'same-origin' }); } catch (_) {}
+ };
+ const queue = [...ids];
+ const workers = Array.from({length: Math.min(6, queue.length)}, () => (async () => {
+ while (queue.length) {
+ const id = queue.shift();
+ if (id) await lane(id);
+ }
+ })());
+ await Promise.all(workers);
+ await loadEndpoints();
+ if (uiModule && uiModule.showToast) uiModule.showToast('Endpoint status refreshed', 1800);
+ } finally {
+ probeAllBtn.innerHTML = origHTML;
+ probeAllBtn.disabled = false;
+ }
+ });
+ }
+
+ const clearOfflineBtn = el('adm-epClearOfflineBtn');
+ if (clearOfflineBtn) {
+ clearOfflineBtn.addEventListener('click', async () => {
+ const offlineBtns = Array.from(document.querySelectorAll('[data-adm-del-ep][data-adm-ep-online="0"]'));
+ const ids = offlineBtns.map(b => b.getAttribute('data-adm-del-ep')).filter(Boolean);
+ if (!ids.length) {
+ if (uiModule && uiModule.showToast) {
+ uiModule.showToast('No offline endpoints — nothing to clear', 1800);
+ }
+ return;
+ }
+ const confirmMsg = ids.length === 1
+ ? 'Remove 1 offline endpoint?'
+ : `Remove ${ids.length} offline endpoints?`;
+ if (uiModule && uiModule.styledConfirm) {
+ const ok = await uiModule.styledConfirm(confirmMsg, { confirmText: 'Remove', danger: true });
+ if (!ok) return;
+ } else if (!confirm(confirmMsg)) {
+ return;
+ }
+ clearOfflineBtn.disabled = true;
+ // Optimistic UI: pull rows immediately, then fire the DELETEs.
+ offlineBtns.forEach(b => {
+ const row = b.closest('[data-adm-ep-id]');
+ if (row) row.remove();
+ });
+ await Promise.all(ids.map(id =>
+ fetch('/api/model-endpoints/' + id, { method: 'DELETE', credentials: 'same-origin' }).catch(() => {})
+ ));
+ try { await loadEndpoints(); } catch (_) {}
+ _refreshOfflineCount();
+ if (uiModule && uiModule.showToast) uiModule.showToast(`Removed ${ids.length} offline endpoint${ids.length === 1 ? '' : 's'}`, 1800);
+ });
+ }
+
+ // Clear-on-focus for the API key inputs. The fields are type=password so the
+ // value is masked; users can't see what's there to edit it in place, so the
+ // expected gesture is "click in, type new key". Wiping on focus removes the
+ // select-all-and-delete dance.
+ const _wireClearOnFocus = (id) => {
+ const inp = el(id);
+ if (!inp) return;
+ inp.addEventListener('focus', () => {
+ if (inp.value) inp.value = '';
+ });
+ };
+ _wireClearOnFocus('adm-epLocalApiKey');
+ _wireClearOnFocus('adm-epApiKey');
+
+ // Drop the Ollama provider logo into the Ollama Quickstart button. Reuses
+ // the same SVG the provider picker uses, so brand parity stays free.
+ try {
+ const _ollamaLogoSlot = document.querySelector('#adm-epOllamaBtn .adm-ollama-logo');
+ if (_ollamaLogoSlot) {
+ const svg = providerLogo('ollama') || '';
+ if (svg) _ollamaLogoSlot.innerHTML = svg;
+ }
+ } catch (_) {}
+
// Local "Add" button — sibling form for self-hosted base URLs.
const localAddBtn = el('adm-epLocalAddBtn');
const localTestBtn = el('adm-epLocalTestBtn');
@@ -2073,17 +2211,28 @@ async function loadTokens() {
}
function initTokenForm() {
- el('adm-tokenAddBtn').addEventListener('click', async () => {
+ const addBtn = el('adm-tokenAddBtn');
+ if (!addBtn || addBtn.dataset.bound) return;
+ addBtn.dataset.bound = '1';
+ addBtn.addEventListener('click', async () => {
const msg = el('adm-tokenMsg');
const reveal = el('adm-tokenReveal');
msg.textContent = ''; msg.className = ''; reveal.style.display = 'none';
const name = el('adm-tokenName').value.trim();
if (!name) { msg.textContent = 'Token name is required'; msg.className = 'admin-error'; return; }
const fd = new FormData(); fd.append('name', name);
+ const scopes = (el('adm-tokenScopes')?.value || '').trim();
+ if (scopes) fd.append('scopes', scopes);
try {
const res = await fetch('/api/tokens', { method: 'POST', body: fd, credentials: 'same-origin' });
const data = await res.json();
- if (res.ok) { el('adm-tokenValue').textContent = data.token; reveal.style.display = ''; el('adm-tokenName').value = ''; loadTokens(); }
+ if (res.ok) {
+ el('adm-tokenValue').textContent = data.token;
+ reveal.style.display = '';
+ el('adm-tokenName').value = '';
+ if (el('adm-tokenScopes')) el('adm-tokenScopes').value = '';
+ loadTokens();
+ }
else { msg.textContent = data.detail || 'Failed'; msg.className = 'admin-error'; }
} catch (e) { msg.textContent = 'Request failed'; msg.className = 'admin-error'; }
});
@@ -2344,7 +2493,7 @@ function initDangerZone() {
═══════════════════════════════════════════ */
function initAll() {
modalEl = el('settings-modal');
- const inits = [initSignupToggle, initAddUser, initEndpointForm, initMcpForm, initCalDAV, initBackup, initDangerZone, () => settingsModule.initIntegrations()];
+ const inits = [initSignupToggle, initAddUser, initEndpointForm, initMcpForm, initCalDAV, initBackup, initDangerZone, initTokenForm, () => settingsModule.initIntegrations()];
for (const fn of inits) {
try { fn(); } catch (e) { console.error('Admin init error in', fn.name || 'anonymous', e); }
}
@@ -2357,6 +2506,7 @@ function refreshAll() {
loadEndpoints();
loadBuiltinTools();
loadMcpServers();
+ loadTokens();
}
/* ═══════════════════════════════════════════
diff --git a/static/js/chatRenderer.js b/static/js/chatRenderer.js
index fc7ed1aeb..9a5c6f78b 100644
--- a/static/js/chatRenderer.js
+++ b/static/js/chatRenderer.js
@@ -2118,6 +2118,28 @@ export function addMessage(role, content, modelName, metadata) {
return lastWrap;
}
+ // --- Wake-task / supervisor system check-in ---
+ // The self-wake mechanism injects "Did you finish?" as a user message
+ // (or persisted history shows a "[Task] Self-check: " envelope)
+ // so the agent loop re-enters and re-checks status. Render as a
+ // normal user-style bubble — same chrome as a real user message,
+ // just with role "Supervisor" and a short summary body — instead of
+ // a slim system chip. Matches chat style and integrates cleanly
+ // into the conversation flow.
+ let _isWakeCheck = !!(metadata?.wake_check_in || metadata?.hidden_from_user_view);
+ if (!_isWakeCheck && typeof textRaw === 'string') {
+ // Also catch historical messages persisted as "[Task] Self-check: "
+ // (older wake tasks that didn't set wake_check_in metadata).
+ if (/^\s*\[Task\]\s+Self-check:/i.test(textRaw)) {
+ _isWakeCheck = true;
+ }
+ }
+ if (_isWakeCheck) {
+ // Supervisor self-check messages are an internal control signal —
+ // skip rendering entirely so they don't show up in the conversation.
+ return null;
+ }
+
// --- Standard single-bubble message ---
const wrap = document.createElement('div');
wrap.className = 'msg ' + (role === 'user' ? 'msg-user' : 'msg-ai');
diff --git a/static/js/cookbook-diagnosis.js b/static/js/cookbook-diagnosis.js
index 19512ab50..24d5770e7 100644
--- a/static/js/cookbook-diagnosis.js
+++ b/static/js/cookbook-diagnosis.js
@@ -610,12 +610,47 @@ export function _showDiagnosis(panel, diagnosis, sourceText) {
? `Suggested action: ${fixes[0].label}.`
: 'Suggested action: copy the error and adjust the serve settings.');
- // Simplified diagnosis card: just the error message + suggestion + fix
- // button(s). Removed the fold toggle, copy button, and × dismiss — they
- // made the card noisy without earning their keep. _diagCollapsed is kept
- // as a stub so callers don't have to change.
panel._diagCollapsed = false;
+ // Top-right toolbar: Copy bundle + × dismiss. Restored after user feedback
+ // — without them there's no way to quietly close a stale diagnosis or grab
+ // the full error+context for a forum/discord paste.
+ const toolbar = document.createElement('div');
+ toolbar.className = 'cookbook-diag-toolbar';
+ toolbar.style.cssText = 'display:flex;justify-content:flex-end;align-items:center;gap:4px;margin-bottom:-2px;';
+
+ const copyBtn = document.createElement('button');
+ copyBtn.type = 'button';
+ copyBtn.className = 'cookbook-diag-copy';
+ copyBtn.title = 'Copy diagnosis details';
+ copyBtn.setAttribute('aria-label', 'Copy diagnosis');
+ copyBtn.innerHTML = '';
+ copyBtn.addEventListener('click', async (e) => {
+ e.stopPropagation();
+ const bundle = _diagnosisCopyBundle(task, diagnosis, sourceText, suggestionText);
+ try {
+ await navigator.clipboard.writeText(bundle);
+ copyBtn.classList.add('copied');
+ setTimeout(() => { if (copyBtn.isConnected) copyBtn.classList.remove('copied'); }, 1200);
+ } catch (_) {}
+ });
+
+ const dismissBtn = document.createElement('button');
+ dismissBtn.type = 'button';
+ dismissBtn.className = 'cookbook-diag-dismiss';
+ dismissBtn.title = 'Dismiss diagnosis';
+ dismissBtn.setAttribute('aria-label', 'Dismiss');
+ dismissBtn.textContent = '×';
+ dismissBtn.addEventListener('click', (e) => {
+ e.stopPropagation();
+ panel._diagDismissed = diagnosis.message;
+ _clearDiagnosis(panel);
+ });
+
+ toolbar.appendChild(copyBtn);
+ toolbar.appendChild(dismissBtn);
+ diag.appendChild(toolbar);
+
const body = document.createElement('div');
body.className = 'cookbook-diag-body';
const msg = document.createElement('div');
diff --git a/static/js/cookbook-hwfit.js b/static/js/cookbook-hwfit.js
index 74571bae9..d8652d02e 100644
--- a/static/js/cookbook-hwfit.js
+++ b/static/js/cookbook-hwfit.js
@@ -416,9 +416,11 @@ function _hwfitShowError(list, host, detail) {
if (rb) rb.addEventListener('click', () => { _resetGpuToggleState(); _hwfitFetch(true); });
}
-// Client-side "Engine" filter (llama.cpp / vLLM / SGLang). Empty = show all.
-// Uses the same _detectBackend() the serve commands use, so what you filter to
-// is exactly what would be launched. Pure view filter — no refetch needed.
+// Client-side "Engine" filter (llama.cpp / vLLM / SGLang / Ollama). Empty =
+// show all. Uses the same _detectBackend() the serve commands use, so what you
+// filter to is exactly what would be launched. Pure view filter — no refetch
+// needed. Ollama rows are merged into the main list (see _ensureOllamaLib +
+// _ollamaToHwfitRows below) so the filter handles all engines uniformly.
function _applyEngineFilter(models) {
const want = document.getElementById('hwfit-engine')?.value || '';
if (!want || !Array.isArray(models)) return models || [];
@@ -427,6 +429,86 @@ function _applyEngineFilter(models) {
});
}
+// Ollama library cache (per-page). Filled lazily on first _hwfitFetch; the raw
+// list is the same shape returned by /api/cookbook/ollama/library, then turned
+// into per-tag hwfit rows so they slot into the main list grid alongside HF
+// scan results.
+let _ollamaLibCache = null;
+async function _ensureOllamaLib() {
+ if (_ollamaLibCache) return _ollamaLibCache;
+ try {
+ const res = await fetch('/api/cookbook/ollama/library');
+ const data = await res.json();
+ _ollamaLibCache = Array.isArray(data?.models) ? data.models : [];
+ } catch { _ollamaLibCache = []; }
+ return _ollamaLibCache;
+}
+
+// Convert an Ollama library entry's sizes into per-tag hwfit rows. Shape
+// matches what _hwfitRenderList expects (fit_level, parameter_count,
+// required_gb, score, …) so the rows render identically to HF results.
+function _olParseSize(s) {
+ // "14b" → 14, "1.5b" → 1.5, "8x7b" → 56 (rough), "135m" → 0.135, "latest" → null
+ if (!s) return null;
+ const low = s.toLowerCase();
+ let m = low.match(/^(\d+(?:\.\d+)?)x(\d+(?:\.\d+)?)b$/);
+ if (m) return parseFloat(m[1]) * parseFloat(m[2]);
+ m = low.match(/^(\d+(?:\.\d+)?)b$/);
+ if (m) return parseFloat(m[1]);
+ m = low.match(/^(\d+(?:\.\d+)?)m$/);
+ if (m) return parseFloat(m[1]) / 1000;
+ return null;
+}
+function _ollamaToHwfitRows(libModels, vramAvail, ramAvail) {
+ const out = [];
+ if (!Array.isArray(libModels)) return out;
+ for (const m of libModels) {
+ const sizes = (Array.isArray(m.sizes) && m.sizes.length) ? m.sizes : ['latest'];
+ for (const sz of sizes) {
+ const params = _olParseSize(sz);
+ // Ollama default GGUF is ~Q4_K_M. Rough VRAM estimate: 0.6 GB / B.
+ const vramGb = params ? params * 0.6 : 0;
+ let fitLevel = 'no_fit';
+ if (vramGb && vramAvail) {
+ if (vramGb <= vramAvail * 0.6) fitLevel = 'perfect';
+ else if (vramGb <= vramAvail) fitLevel = 'good';
+ else if (ramAvail && vramGb <= ramAvail) fitLevel = 'marginal';
+ else fitLevel = 'too_tight';
+ } else if (vramGb && ramAvail && vramGb <= ramAvail) {
+ fitLevel = 'marginal';
+ }
+ const tag = `${m.name}:${sz}`;
+ const paramsLabel = params
+ ? (params >= 1 ? params.toFixed(params >= 10 ? 0 : 1) + 'B' : (params * 1000).toFixed(0) + 'M')
+ : '?';
+ // A modest score so Ollama rows still sort sensibly in the default
+ // score view — bigger models get a slightly higher base, but they
+ // always come in below well-scored HF results. Sort by Fit or VRAM
+ // to surface them more aggressively.
+ const score = params ? Math.min(30 + params * 0.3, 60) : 25;
+ out.push({
+ name: tag,
+ repo_id: tag,
+ quant: 'Q4_K_M',
+ parameter_count: paramsLabel,
+ params_b: params || 0,
+ required_gb: vramGb,
+ fit_level: fitLevel,
+ score,
+ speed_tps: 0,
+ context: 0,
+ is_gguf: true,
+ backend: 'ollama',
+ _isOllama: true,
+ _olName: m.name,
+ _olSize: sz,
+ _description: m.description || '',
+ });
+ }
+ }
+ return out;
+}
+
export async function _hwfitFetch(fresh = false) {
const _tk = ++_hwfitFetchToken;
const useCase = document.getElementById('hwfit-usecase')?.value || '';
@@ -475,7 +557,12 @@ export async function _hwfitFetch(fresh = false) {
_setLastCacheHost(remoteKey);
const _cacheSrv = _serverByVal(_envState.remoteServerKey || remoteHost);
const _cachePort = _cacheSrv?.port || '';
- const _cacheParams = new URLSearchParams({ host: remoteHost }); if (_cachePort) _cacheParams.set('ssh_port', _cachePort); if (_cacheSrv?.platform) _cacheParams.set('platform', _cacheSrv.platform);
+ const _cacheParams = new URLSearchParams();
+ if (remoteHost) {
+ _cacheParams.set('host', remoteHost);
+ if (_cachePort) _cacheParams.set('ssh_port', _cachePort);
+ if (_cacheSrv?.platform) _cacheParams.set('platform', _cacheSrv.platform);
+ }
fetch(`/api/model/cached?${_cacheParams}`, { credentials: 'same-origin' })
.then(r => r.json())
.then(d => {
@@ -543,7 +630,18 @@ export async function _hwfitFetch(fresh = false) {
// A newer scan started while this one was in flight (user switched servers
// mid-probe) — drop this stale response so it can't clobber the new one.
if (_tk !== _hwfitFetchToken) { try { wp.destroy(); } catch {} return; }
- if (!res.ok) throw new Error(res.statusText);
+ if (!res.ok) {
+ const body = await res.text().catch(() => '');
+ let msg = '';
+ try {
+ const payload = JSON.parse(body);
+ msg = payload && (payload.detail || payload.error || payload.message);
+ } catch {
+ msg = body;
+ }
+ msg = typeof msg === 'string' ? msg.trim() : '';
+ throw new Error(`HTTP ${res.status} ${res.statusText}${msg ? `: ${msg}` : ''}`);
+ }
let data = await res.json();
if (_tk !== _hwfitFetchToken) { try { wp.destroy(); } catch {} return; }
if (!isImageMode && quantPref && !data.error && Array.isArray(data.models) && data.models.length === 0) {
@@ -583,6 +681,23 @@ export async function _hwfitFetch(fresh = false) {
if (!_cached) { _hwfitShowError(list, remoteHost, data.error); if (hw) hw.innerHTML = ''; }
return;
}
+ // Merge Ollama library rows into the main list so they appear with the
+ // same Fit/Param/Quant/VRAM/Mode columns as HF results and respond to the
+ // Engine filter. Skipped in image-gen mode (Ollama doesn't serve diffusers).
+ if (!isImageMode) {
+ const _vramAvail = data.system?.gpu_vram_gb || 0;
+ const _ramAvail = data.system?.total_ram_gb || 0;
+ const _lib = await _ensureOllamaLib();
+ const _olRows = _ollamaToHwfitRows(_lib, _vramAvail, _ramAvail);
+ // Search filter on Ollama rows: HF API already filters by search; do the
+ // same client-side over Ollama name + description so the search box
+ // works consistently across both sources.
+ const _s = (search || '').trim().toLowerCase();
+ const _olFiltered = _s
+ ? _olRows.filter(r => r.name.toLowerCase().includes(_s) || (r._description || '').toLowerCase().includes(_s))
+ : _olRows;
+ data.models = (data.models || []).concat(_olFiltered);
+ }
_hwfitCache = data;
_hwfitRenderHw(hw, data.system);
// Propagate local platform from hardware probe so _isWindows(task) works
@@ -964,14 +1079,36 @@ export function _hwfitRenderList(el, models) {
html += `
`;
}
el.innerHTML = html;
- // Click row → expand inline action panel
+ // Click row → expand inline action panel. Exception: Ollama rows skip the
+ // expand panel (no HF metadata to power it) and just fill the Download
+ // input with the `:` tag — one click → ready to pull.
el.querySelectorAll('.hwfit-row:not(.hwfit-header)').forEach(row => {
row.addEventListener('click', () => {
const name = row.dataset.model;
if (!name) return;
- // Find model data from cache
const modelData = (_hwfitCache?.models || []).find(m => m.name === name);
if (!modelData) return;
+ if (modelData._isOllama) {
+ // Force-open the Download card if it's been collapsed — otherwise
+ // filling the (hidden) input silently swallows the click.
+ const dlBody = document.getElementById('cookbook-download-card-body');
+ const dlArrow = document.getElementById('cookbook-download-card-arrow');
+ if (dlBody && dlBody.style.display === 'none') {
+ dlBody.style.display = 'block';
+ if (dlArrow) dlArrow.style.transform = 'rotate(90deg)';
+ }
+ const dlInput = document.getElementById('cookbook-dl-repo');
+ if (dlInput) {
+ dlInput.value = modelData.name;
+ dlInput.focus();
+ // Briefly highlight so the user sees what got filled even when the
+ // download card sits far above the (long) hwfit list.
+ dlInput.classList.add('cookbook-dl-flash');
+ setTimeout(() => dlInput.classList.remove('cookbook-dl-flash'), 800);
+ dlInput.scrollIntoView({ behavior: 'smooth', block: 'center' });
+ }
+ return;
+ }
_expandModelRow(row, modelData);
});
});
@@ -1297,7 +1434,7 @@ export function _hwfitInit() {
if (sort) sort.addEventListener('change', () => _hwfitFetch());
if (qpref) qpref.addEventListener('change', () => _hwfitFetch());
// Engine filter is a pure client-side view filter over the already-fetched
- // list, so just re-render from cache instead of re-probing hardware.
+ // list (HF + Ollama merged), so just re-render from cache.
const engine = document.getElementById('hwfit-engine');
if (engine) engine.addEventListener('change', () => {
const list = document.getElementById('hwfit-list');
@@ -1694,6 +1831,15 @@ export function _hwfitInit() {
saveBtn.addEventListener('click', () => {
_syncServers();
_rebuildServerSelect();
+ // Broadcast for anything outside the settings tab that depends on
+ // the server list (Serve dialog host picker, Running tasks, etc.).
+ // Without this the user had to hard-refresh to see the new entry
+ // in those other places.
+ try {
+ document.dispatchEvent(new CustomEvent('cookbook:servers-changed', {
+ detail: { servers: _envState.servers.slice() },
+ }));
+ } catch (_) {}
saveBtn.classList.add('saved');
saveBtn.innerHTML = 'Saved';
});
@@ -1713,6 +1859,11 @@ export function _hwfitInit() {
entry.remove();
_syncServers();
_rebuildServerSelect();
+ try {
+ document.dispatchEvent(new CustomEvent('cookbook:servers-changed', {
+ detail: { servers: _envState.servers.slice() },
+ }));
+ } catch (_) {}
_hwfitCache = null;
_hwfitFetch();
});
diff --git a/static/js/cookbook.js b/static/js/cookbook.js
index c1395179c..03319d0de 100644
--- a/static/js/cookbook.js
+++ b/static/js/cookbook.js
@@ -72,7 +72,7 @@ function _platformIcon(platform) {
return '';
}
-export let _envState = { env: 'none', envPath: '', hfToken: '', hfTokenConfigured: false, hfTokenMasked: '', gpus: '', remoteHost: '', remoteServerKey: '', servers: [], modelPaths: [], platform: '', defaultServer: '' };
+export let _envState = { env: 'none', envPath: '', hfToken: '', hfTokenConfigured: false, hfTokenMasked: '', gpus: '', remoteHost: '', servers: [], modelPaths: [], platform: '', defaultServer: '' };
let _lastCacheHostVal = null;
let _cookbookOpeningSpinners = [];
export function _lastCacheHost() { return _lastCacheHostVal; }
@@ -89,8 +89,8 @@ function _setCookbookOpening(on) {
].filter(Boolean);
if (!on) {
_cookbookOpeningSpinners.forEach(({ spinner, wrap, target }) => {
- try { spinner?.stop?.(); } catch { }
- try { wrap?.remove?.(); } catch { }
+ try { spinner?.stop?.(); } catch {}
+ try { wrap?.remove?.(); } catch {}
target?.classList?.remove('cookbook-opening');
});
_cookbookOpeningSpinners = [];
@@ -114,44 +114,18 @@ function _setCookbookOpening(on) {
// True for the local server entry (empty / "local" / "localhost" host).
function _isLocalEntry(s) { return !s || !s.host || s.host === 'local' || s.host.toLowerCase() === 'localhost'; }
-// Resolve a dropdown option value to a server entry. New option values are
-// stable per-profile keys, so same-host SSH profiles stay distinguishable.
-// Host strings and numeric indices remain accepted for stale saved state.
-export function _serverKey(s) {
- if (_isLocalEntry(s)) return 'local';
- return 'srv:' + [
- s?.name || '',
- s?.host || '',
- s?.port || '',
- s?.envPath || '',
- s?.platform || '',
- ].map(v => encodeURIComponent(String(v).trim())).join('|');
-}
-
+// Resolve a dropdown option value to a server entry. Option values are the
+// stable HOST string ('local' for the local box) — NOT array indices — because
+// `_envState.servers` gets deduped/reordered, which made index-based selection
+// silently resolve to the wrong (or local) server. Accepts a numeric index too
+// for backwards-compat with any stale value.
function _serverByVal(val) {
if (val == null || val === 'local' || val === '') return null;
- const raw = String(val);
- let s = _envState.servers.find(x => _serverKey(x) === raw);
- if (!s) s = _envState.servers.find(x => x.host === raw);
+ let s = _envState.servers.find(x => x.host === val);
if (!s && /^\d+$/.test(String(val))) s = _envState.servers[parseInt(val)];
return s || null;
}
-export function _selectedServer() {
- if (_envState.remoteServerKey) {
- const keyed = _serverByVal(_envState.remoteServerKey);
- if (keyed) return keyed;
- }
- if (_envState.remoteHost) return _envState.servers.find(s => s.host === _envState.remoteHost) || null;
- return null;
-}
-
-export function _currentServerValue() {
- const selected = _selectedServer();
- if (selected) return _serverKey(selected);
- return _envState.remoteHost || 'local';
-}
-
function _buildServerOpts(excludeLocal = false) {
// The local server is ALWAYS represented by the synthetic value="local" option
// (showing its custom name from the "server name" feature). We must therefore
@@ -160,20 +134,13 @@ function _buildServerOpts(excludeLocal = false) {
const _localSrv = _localIdx >= 0 ? _envState.servers[_localIdx] : null;
const _localLabel = (_localSrv && _localSrv.name) ? _localSrv.name : 'Local';
let html = ``;
- const selectedKey = _envState.remoteServerKey || '';
- let legacyHostSelected = false;
for (let i = 0; i < _envState.servers.length; i++) {
const s = _envState.servers[i];
if (i === _localIdx) continue; // already the synthetic "local" option
if (excludeLocal && _isLocalEntry(s)) continue;
const label = s.name || s.host || `Server ${i + 1}`;
- const value = _serverKey(s);
- let selected = selectedKey ? value === selectedKey : false;
- if (!selectedKey && _envState.remoteHost === s.host && !legacyHostSelected) {
- selected = true;
- legacyHostSelected = true;
- }
- html += ``;
+ const selected = _envState.remoteHost === s.host ? ' selected' : '';
+ html += ``;
}
return html;
}
@@ -187,41 +154,16 @@ export function _sshCmd(host, cmd, port) {
/** Get SSH port for a given host (or task object) */
function _getPort(hostOrTask) {
if (!hostOrTask) return '';
- if (typeof hostOrTask === 'object') return hostOrTask.sshPort || _getPort(hostOrTask.remoteServerKey || hostOrTask.remoteHost);
- const selected = hostOrTask === _envState.remoteHost ? _selectedServer() : null;
- const srv = selected || _serverByVal(hostOrTask);
+ if (typeof hostOrTask === 'object') return hostOrTask.sshPort || _getPort(hostOrTask.remoteHost);
+ const srv = _envState.servers.find(s => s.host === hostOrTask);
return srv?.port || '';
}
/** Get platform for a given host (or task object). Returns 'windows', 'termux', 'linux', or '' */
export function _getPlatform(hostOrTask) {
- const isWinBrowser = (window.navigator.userAgent || window.navigator.platform || '').toLowerCase().includes('win');
- // The browser's OS is NOT the server's OS when the UI is opened remotely —
- // e.g. a Windows browser driving a Mac/Linux homeserver. Trusting the
- // user-agent there makes the serve builder emit the Windows python-only
- // shape (`python -m llama_cpp.server`, no `llama-server ||` fallback), which
- // then fails on the actual Unix server. The local hardware probe is
- // authoritative: it reports a backend (metal/cuda/rocm/cpu_*) for any Unix
- // server and carries platform:"windows" for local Windows (which sets
- // _envState.platform, short-circuiting below). So only fall back to the
- // browser hint when we have no server-side signal at all.
- const localPlatform = () => {
- if (_envState.platform) return _envState.platform;
- if (String(_hwfitCache?.system?.backend || '')) return '';
- return isWinBrowser ? 'windows' : '';
- };
- if (!hostOrTask || hostOrTask === 'local') {
- return localPlatform();
- }
- if (typeof hostOrTask === 'object') {
- const h = hostOrTask.remoteHost;
- if (!h || h === 'local') {
- return hostOrTask.platform || localPlatform();
- }
- return hostOrTask.platform || _getPlatform(hostOrTask.remoteServerKey || h);
- }
- const selected = hostOrTask === _envState.remoteHost ? _selectedServer() : null;
- const srv = selected || _serverByVal(hostOrTask);
+ if (!hostOrTask) return _envState.platform || '';
+ if (typeof hostOrTask === 'object') return hostOrTask.platform || _getPlatform(hostOrTask.remoteHost);
+ const srv = _envState.servers.find(s => s.host === hostOrTask);
return srv?.platform || '';
}
@@ -237,19 +179,6 @@ export function _isMetal() {
return ['metal', 'mps', 'apple'].includes(String(_hwfitCache?.system?.backend || '').toLowerCase());
}
-const GEMMA4_THINKING_CHAT_TEMPLATE = `{% for message in messages %}{% if message['role'] == 'system' %}<|turn>system\n<|think|>{{ message['content'] }}\n{% elif message['role'] == 'user' %}<|turn>user\n{{ message['content'] }}\n{% elif message['role'] == 'assistant' %}<|turn>model\n{{ message['content'] }}\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|turn>model\n<|channel>thought{% endif %}`;
-
-function _isGemma4ThinkingModel(modelName) {
- const n = (modelName || '').toLowerCase();
- return n.includes('gemma-4') || n.includes('gemma4');
-}
-
-function _gemma4ThinkingChatTemplateArg(modelName) {
- return _isGemma4ThinkingModel(modelName)
- ? _shellQuote(GEMMA4_THINKING_CHAT_TEMPLATE)
- : '';
-}
-
/** Detect model-specific vLLM optimizations */
function _detectModelOptimizations(modelName) {
const n = (modelName || '').toLowerCase();
@@ -326,7 +255,10 @@ export function _detectToolParser(modelName) {
// ── Backend detection ──
export function _detectBackend(model) {
- if (model?.backend === 'ollama' || model?.is_ollama) {
+ const _ollamaName = String(model?.repo_id || model?.name || model?.id || '').trim();
+ const _ollamaMeta = `${model?.backend || ''} ${model?.endpoint_kind || ''} ${model?.provider || ''} ${model?.source || ''}`.toLowerCase();
+ const _looksLikeOllamaTag = /^[A-Za-z0-9][A-Za-z0-9._-]*(?::[A-Za-z0-9][A-Za-z0-9._-]*)$/.test(_ollamaName);
+ if (model?.backend === 'ollama' || model?.is_ollama || _ollamaMeta.includes('ollama') || _looksLikeOllamaTag) {
return { backend: 'ollama', label: 'Ollama' };
}
const q = (model.quant || '').toUpperCase();
@@ -450,8 +382,6 @@ export function _buildServeCmd(f, modelName, backend) {
const _extraEnv = (f.extra_env ?? '').toString().replace(/\s+/g, ' ').trim();
if (_extraEnv) cmd += _extraEnv + ' ';
cmd += `${_vllmBin} serve ${modelName} --host 0.0.0.0 --port ${f.port || '8000'}`;
- const _gemma4ChatTemplate = _gemma4ThinkingChatTemplateArg(modelName);
- if (_gemma4ChatTemplate) cmd += ` --chat-template ${_gemma4ChatTemplate}`;
cmd += ` --tensor-parallel-size ${f.tp || '1'}`;
cmd += ` --max-model-len ${f.ctx || '8192'}`;
cmd += ` --gpu-memory-utilization ${f.gpu_mem || '0.90'}`;
@@ -482,8 +412,6 @@ export function _buildServeCmd(f, modelName, backend) {
const _extraEnv = (f.extra_env ?? '').toString().replace(/\s+/g, ' ').trim();
if (_extraEnv) cmd += _extraEnv + ' ';
cmd += `${_py3Bin} -m sglang.launch_server --model-path ${modelName} --host 0.0.0.0 --port ${f.port || '30000'}`;
- const _gemma4ChatTemplate = _gemma4ThinkingChatTemplateArg(modelName);
- if (_gemma4ChatTemplate) cmd += ` --chat-template ${_gemma4ChatTemplate}`;
if (f.tp && f.tp !== '1') cmd += ` --tp ${f.tp}`;
if (f.ctx) cmd += ` --context-length ${f.ctx}`;
if (f.gpu_mem && f.gpu_mem !== '0.90') cmd += ` --mem-fraction-static ${f.gpu_mem}`;
@@ -585,9 +513,34 @@ export function _buildServeCmd(f, modelName, backend) {
}
} else if (backend === 'ollama') {
const ollamaPort = f.port || '11434';
- const bindHost = _envState.remoteHost ? '0.0.0.0' : '127.0.0.1';
- const hostEnv = ollamaPort !== '11434' ? `OLLAMA_HOST=${bindHost}:${ollamaPort} ` : '';
- cmd = `${hostEnv}ollama serve`;
+ // GGUF + Ollama: delegate to the iGPU-bound ollama-test container via
+ // its /usr/local/bin/ollama-import helper. Plain `ollama serve` errors
+ // 127 on hosts where ollama isn't on PATH (and even when it is, it
+ // doesn't import the GGUF — it just starts the daemon). Args are all
+ // literal so the cookbook validator (which bans &&/||/;/$() ) is
+ // happy: `docker exec ollama-test ollama-import
+ // `. The helper handles the find/Modelfile/preload dance.
+ if (modelName.includes('/') && (f.gguf_file || /-GGUF$/i.test(modelName))) {
+ // HF-GGUF repo → import + preload + tail
+ const _name = (modelName.split('/').pop() || modelName)
+ .replace(/-GGUF$/i, '')
+ .toLowerCase()
+ .replace(/[^a-z0-9._:-]+/g, '-')
+ .replace(/^-+|-+$/g, '');
+ const _ctx = f.ctx || '8192';
+ const _file = (f.gguf_file || '').split('/').pop() || '';
+ // Trailing GGUF_FILE is optional; helper picks the first match if empty.
+ cmd = `docker exec ollama-test ollama-import ${modelName} ${_name} ${_ctx}${_file ? ' ' + _file : ''}`;
+ } else if (!modelName.includes('/') && modelName) {
+ // Already-pulled Ollama tag (e.g. `qwen2.5:7b`). On kierkegaard the
+ // runtime is the ROCm Ollama sidecar; this quick command verifies the
+ // tag exists, then the backend auto-registers http://host.docker.internal:11434/v1.
+ cmd = `docker exec ollama-rocm ollama show ${modelName}`;
+ } else {
+ const bindHost = _envState.remoteHost ? '0.0.0.0' : '127.0.0.1';
+ const hostEnv = ollamaPort !== '11434' ? `OLLAMA_HOST=${bindHost}:${ollamaPort} ` : '';
+ cmd = `${hostEnv}ollama serve`;
+ }
} else if (backend === 'diffusers') {
const gpuStr = f.gpus?.trim();
if (gpuStr) cmd += `CUDA_VISIBLE_DEVICES=${gpuStr} `;
@@ -630,7 +583,7 @@ function _fallbackCopy(text) {
ta.style.cssText = 'position:fixed;left:-9999px;top:-9999px';
document.body.appendChild(ta);
ta.select();
- try { document.execCommand('copy'); } catch (_) { }
+ try { document.execCommand('copy'); } catch (_) {}
document.body.removeChild(ta);
return Promise.resolve();
}
@@ -663,7 +616,7 @@ function _readStoredEnvState() {
export function _persistEnvState() {
try { localStorage.setItem(LAST_STATE_KEY, JSON.stringify(_envStateForStorage())); }
- catch (_) { }
+ catch (_) {}
_saveTasks(_loadTasks());
}
@@ -712,24 +665,22 @@ async function _fetchDependencies() {
const data = await resp.json();
const pkgs = data.packages || [];
if (!pkgs.length) { list.innerHTML = '
`;
- html += ``;
+ html += ``;
html += ``;
html += `
`;
+ // Browse Ollama library — fetches popular models from ollama.com via the
+ // /api/cookbook/ollama/library cached proxy, click → fills the input with
+ // `:` so the existing Download button kicks off `ollama pull`.
+ html += `
`;
+ html += `
`;
+ html += ``;
+ html += ``;
+ html += `
`;
+ html += ``;
+ html += `
`;
// Latest HF models that fit — collapsible card list
- html += `
`;
+ html += `
`;
html += `
`;
html += `
';
@@ -1885,7 +1900,7 @@ function _renderRecipes() {
html += '
';
html += _srvDirs.map(d => `${esc(d)}`).join('');
@@ -1909,7 +1924,7 @@ function _renderRecipes() {
html += '';
html += '0 selected';
html += '';
- html += '';
+ html += '';
html += '
';
html += '';
@@ -1963,7 +1978,7 @@ function _renderRecipes() {
html += '
';
html += '
Servers
';
// Reuse the calendar +New pill: spinning plus, label fades in idea uses
- // the same `.cal-add-btn-text` rules, so styling stays consistent.
+ // the same `.cal-add-btn-text` rules, so styling stays consistent.
html += '';
html += '
';
html += '
Configure SSH servers, install Odysseus keys, choose model directories, and set the default server. Local is this machine.
';
@@ -2059,73 +2074,73 @@ export async function open(opts) {
}
_setCookbookOpening(true);
try {
- // Invalidate any pending close() animation handlers so they won't re-hide us
- _closeGen++;
- // Clear any leftover inline styles from a previous swipe-dismiss or close animation
- const _content = modal.querySelector('.modal-content');
- if (_content) {
- _content.classList.remove('modal-closing', 'sheet-ready', 'cookbook-modal-entering');
- _content.style.transform = '';
- _content.style.transition = '';
- _content.style.animation = '';
- _content.style.opacity = '';
+ // Invalidate any pending close() animation handlers so they won't re-hide us
+ _closeGen++;
+ // Clear any leftover inline styles from a previous swipe-dismiss or close animation
+ const _content = modal.querySelector('.modal-content');
+ if (_content) {
+ _content.classList.remove('modal-closing', 'sheet-ready', 'cookbook-modal-entering');
+ _content.style.transform = '';
+ _content.style.transition = '';
+ _content.style.animation = '';
+ _content.style.opacity = '';
+ }
+ modal.style.display = '';
+ Modals.register('cookbook-modal', {
+ railBtnId: 'rail-cookbook',
+ sidebarBtnId: 'tool-cookbook-btn',
+ closeFn: () => _doClose(),
+ restoreFn: () => { _renderRunningTab(); },
+ });
+ _wireCookbookDrag(modal);
+ await _syncFromServer();
+ // `_syncFromServer` lives in cookbookRunning.js and populates *its* _envState
+ // (a different object reference than this module's), then mirrors the merged
+ // state to localStorage. So ALWAYS hydrate our _envState from that mirror —
+ // on a successful sync it holds the freshly-fetched servers; on failure it
+ // holds the last-known state. Gating this on `!synced` left the render's
+ // _envState empty whenever sync succeeded → "servers don't show".
+ try { Object.assign(_envState, _readStoredEnvState()); } catch {}
+ // Honour a user-set default server: always land on it when Cookbook opens, so
+ // every dropdown (scan/download/serve/cache/deps) starts on the same machine.
+ if (_envState.defaultServer) {
+ const _dk = _envState.defaultServer;
+ if (_dk === 'local') {
+ _envState.remoteHost = ''; _envState.env = 'none'; _envState.envPath = ''; _envState.platform = '';
+ } else {
+ const _ds = (_envState.servers || []).find(s => s.host === _dk);
+ if (_ds) { _envState.remoteHost = _ds.host; _envState.env = _ds.env || 'none'; _envState.envPath = _ds.envPath || ''; _envState.platform = _ds.platform || ''; }
}
- modal.style.display = '';
- Modals.register('cookbook-modal', {
- railBtnId: 'rail-cookbook',
- sidebarBtnId: 'tool-cookbook-btn',
- closeFn: () => _doClose(),
- restoreFn: () => { _renderRunningTab(); },
- });
- _wireCookbookDrag(modal);
- await _syncFromServer();
- // `_syncFromServer` lives in cookbookRunning.js and populates *its* _envState
- // (a different object reference than this module's), then mirrors the merged
- // state to localStorage. So ALWAYS hydrate our _envState from that mirror —
- // on a successful sync it holds the freshly-fetched servers; on failure it
- // holds the last-known state. Gating this on `!synced` left the render's
- // _envState empty whenever sync succeeded → "servers don't show".
- try { Object.assign(_envState, _readStoredEnvState()); } catch { }
- // Honour a user-set default server: always land on it when Cookbook opens, so
- // every dropdown (scan/download/serve/cache/deps) starts on the same machine.
- if (_envState.defaultServer) {
- const _dk = _envState.defaultServer;
- if (_dk === 'local') {
- _envState.remoteHost = ''; _envState.remoteServerKey = ''; _envState.env = 'none'; _envState.envPath = ''; _envState.platform = '';
- } else {
- const _ds = _serverByVal(_dk);
- if (_ds) { _envState.remoteHost = _ds.host; _envState.remoteServerKey = _serverKey(_ds); _envState.env = _ds.env || 'none'; _envState.envPath = _ds.envPath || ''; _envState.platform = _ds.platform || ''; }
- }
- }
- // Re-render on every open AFTER sync so the freshly-fetched state (servers,
- // HF token, presets) is always reflected. Gating this to once-per-page used
- // to freeze a stale/empty servers list whenever the first sync raced or
- // returned before hydration — and since close/reopen doesn't reset the page,
- // only a full reload recovered it. Re-rendering is cheap and the in-progress
- // Running tab is rendered separately just below.
- _renderRecipes();
- _rendered = true;
- _clearCookbookNotif();
- _renderRunningTab();
- // Self-heal: revive any download tasks whose tmux session is still alive
- // but were persisted as done/error (covers the "restarted server while a
- // big multi-shard download was in flight" case — the task survived in
- // tmux, the cookbook just lost track of it).
- try { _selfHealStaleTasks({ oneShot: true }); } catch { }
- if (_content) {
- // Put the panel in its entering state before it becomes visible. On
- // mobile, showing first and adding the class a frame later can paint the
- // sheet at its final position, which makes the slide-up look like a snap.
- _content.classList.add('cookbook-modal-entering');
- }
- modal.classList.remove('hidden');
- if (_content) {
- void _content.offsetWidth;
- _content.addEventListener('animationend', () => {
- _content.classList.remove('cookbook-modal-entering');
- }, { once: true });
- }
- setTimeout(_applyIntent, 0);
+ }
+ // Re-render on every open AFTER sync so the freshly-fetched state (servers,
+ // HF token, presets) is always reflected. Gating this to once-per-page used
+ // to freeze a stale/empty servers list whenever the first sync raced or
+ // returned before hydration — and since close/reopen doesn't reset the page,
+ // only a full reload recovered it. Re-rendering is cheap and the in-progress
+ // Running tab is rendered separately just below.
+ _renderRecipes();
+ _rendered = true;
+ _clearCookbookNotif();
+ _renderRunningTab();
+ // Self-heal: revive any download tasks whose tmux session is still alive
+ // but were persisted as done/error (covers the "restarted server while a
+ // big multi-shard download was in flight" case — the task survived in
+ // tmux, the cookbook just lost track of it).
+ try { _selfHealStaleTasks({ oneShot: true }); } catch {}
+ if (_content) {
+ // Put the panel in its entering state before it becomes visible. On
+ // mobile, showing first and adding the class a frame later can paint the
+ // sheet at its final position, which makes the slide-up look like a snap.
+ _content.classList.add('cookbook-modal-entering');
+ }
+ modal.classList.remove('hidden');
+ if (_content) {
+ void _content.offsetWidth;
+ _content.addEventListener('animationend', () => {
+ _content.classList.remove('cookbook-modal-entering');
+ }, { once: true });
+ }
+ setTimeout(_applyIntent, 0);
} finally {
_setCookbookOpening(false);
}
@@ -2217,9 +2232,6 @@ const shared = {
_getPort,
_sshPrefix,
_getPlatform,
- _serverByVal,
- _selectedServer,
- _currentServerValue,
_isWindows,
_isMetal,
_buildEnvPrefix,
diff --git a/static/js/cookbookDownload.js b/static/js/cookbookDownload.js
index 6c155c8d7..6ea07cc85 100644
--- a/static/js/cookbookDownload.js
+++ b/static/js/cookbookDownload.js
@@ -242,11 +242,7 @@ export function _wirePanelEvents(panel, model, backend) {
const dlBtn = panel.querySelector('.hwfit-dl-btn');
if (dlBtn) {
dlBtn.addEventListener('click', () => {
- if (backend === 'ollama') {
- _runPanelCmd(panel, _buildDownloadCmd(model, backend), { timeout: 0 });
- } else {
- _runModelDownload(panel, model, backend);
- }
+ _runModelDownload(panel, model, backend)
});
}
@@ -459,7 +455,9 @@ export async function _runModelDownload(panel, model, backend, hostOverride) {
uiModule.showToast(_missingGgufMessage(model));
return;
}
- const repo = ggufSource?.repo || model.quant_repo || model.name;
+ const repo = backend === 'ollama'
+ ? (model.ollama || model.ollama_name || model.name)
+ : (ggufSource?.repo || model.quant_repo || model.name);
const include = backend === 'llamacpp' ? _ggufIncludePattern(model, ggufSource) : null;
_syncEnvFromPanel(panel);
@@ -494,7 +492,7 @@ export async function _runModelDownload(panel, model, backend, hostOverride) {
const platform = host ? (srv.platform || '') : (_envState.platform || '');
const isWin = host ? (platform === 'windows') : _isWindows();
- const payload = { repo_id: repo };
+ const payload = { repo_id: repo, backend };
if (include) payload.include = include;
// Large downloads are where hf_transfer most often dies near the end. Use the
// plain HuggingFace downloader up front for big model files; it is slower, but
diff --git a/static/js/cookbookRunning.js b/static/js/cookbookRunning.js
index a4e7b83eb..b13856c08 100644
--- a/static/js/cookbookRunning.js
+++ b/static/js/cookbookRunning.js
@@ -1564,6 +1564,10 @@ export async function _launchServeTask(shortName, repo, cmd, fields, hostOverrid
const payload = { repo_id: repo, remote_host: _host || undefined, ssh_port: _sp || undefined, _cmd: cmd, _fields: fields || undefined, _env: _usedEnv, _envPath: _usedEnvPath, _gpus: _usedGpus };
_addTask(data.session_id, shortName, 'serve', payload);
uiModule.showToast(`Serving ${shortName}...`);
+ // Auto-register may have enabled an existing (offline) endpoint for this
+ // host:port. Refresh the picker so the row is no longer dimmed, and the
+ // user doesn't see "offline" on a serve they just started.
+ try { _refreshModelsAfterEndpointChange(); } catch (_) {}
} catch (e) {
uiModule.showToast('Failed: ' + e.message);
}
@@ -3032,6 +3036,11 @@ async function _reconnectTask(el, task) {
if (info.status === 'ready' && !task._serveReady) {
task._serveReady = true;
_updateTask(task.sessionId, { _serveReady: true });
+ // The auto-registered endpoint was marked offline while the
+ // server was coming up. Now that it's reachable, nudge the
+ // picker to re-probe so the offline pill clears without the
+ // user having to reopen Settings or refresh the page.
+ try { _refreshModelsAfterEndpointChange(); } catch (_) {}
}
if (info.phase) {
badge.textContent = info.phase;
diff --git a/static/js/cookbookSchedule.js b/static/js/cookbookSchedule.js
index a26de5dbc..69f28a6b5 100644
--- a/static/js/cookbookSchedule.js
+++ b/static/js/cookbookSchedule.js
@@ -129,7 +129,7 @@ try { (function () {
-
+
-
-
-
- Days
-
- ${DAYS.map(d => `
-
- `).join("")}
+
+
+
+
-
-
-
diff --git a/static/js/cookbookServe.js b/static/js/cookbookServe.js
index 3f7e53916..d06477baf 100644
--- a/static/js/cookbookServe.js
+++ b/static/js/cookbookServe.js
@@ -14,7 +14,6 @@ import { bindMenuDismiss, dismissOrRemove } from './escMenuStack.js';
let _envState;
let _sshCmd;
let _getPort;
-let _serverByVal;
let _sshPrefix;
let _getPlatform;
let _isWindows;
@@ -98,14 +97,14 @@ function _selectedServeTarget(panel) {
const select = document.getElementById('hwfit-server-select') || document.getElementById('hwfit-dl-server');
const servers = Array.isArray(_envState.servers) ? _envState.servers : [];
let host = _envState.remoteHost || '';
- let server = host ? (_serverByVal?.(_envState.remoteServerKey || host) || servers.find(s => s.host === host)) : null;
+ let server = host ? servers.find(s => s.host === host) : null;
if (select && select.value != null) {
if (select.value === 'local') {
host = '';
server = servers.find(s => !s.host || s.host === 'local') || null;
} else {
const idx = /^\d+$/.test(String(select.value)) ? parseInt(select.value, 10) : -1;
- server = _serverByVal?.(select.value) || (idx >= 0 ? servers[idx] : null) || null;
+ server = servers.find(s => s.host === select.value) || (idx >= 0 ? servers[idx] : null) || null;
host = server?.host || '';
}
}
@@ -115,7 +114,7 @@ function _selectedServeTarget(panel) {
: (server?.name || 'local server');
return {
host,
- port: host ? (server?.port || _getPort(host) || '') : '',
+ port: host ? (_getPort(host) || server?.port || '') : '',
venv,
label,
};
@@ -243,21 +242,6 @@ function _shellPathExpr(path) {
function _selectedGgufExpr(model, repo, relPath) {
const rel = String(relPath || '').replace(/^\/+/, '');
if (!rel) return '';
- if (_isWindows()) {
- // PowerShell: plain path — no bash $() syntax (backend validator rejects
- // $( ) in non-prelude commands, and PowerShell doesn't have printf).
- const relW = rel.replace(/\//g, '\\');
- if (model.is_local_dir && model.path) {
- const base = String(model.path || '').replace(/\/+$/, '').replace(/\//g, '\\');
- return `${base}\\${repo.replace(/\//g, '\\')}\\${relW}`;
- }
- if (model.path) {
- const base = String(model.path || '').replace(/\/+$/, '').replace(/\//g, '\\');
- return `${base}\\models--${repo.replace(/\//g, '--')}\\snapshots\\${relW}`;
- }
- const cacheRepo = repo.replace(/\//g, '--');
- return `$env:USERPROFILE\\.cache\\huggingface\\hub\\models--${cacheRepo}\\snapshots\\${relW}`;
- }
if (model.is_local_dir && model.path) {
const base = String(model.path || '').replace(/\/+$/, '');
return `$(printf %s ${_shellPathExpr(`${base}/${repo}/${rel}`)})`;
@@ -271,15 +255,6 @@ function _selectedGgufExpr(model, repo, relPath) {
}
function _ggufSearchDirExpr(model, repo) {
- if (_isWindows()) {
- if (model.is_local_dir && model.path) {
- return `${String(model.path || '').replace(/\/+$/, '').replace(/\//g, '\\')}\\${repo.replace(/\//g, '\\')}`;
- }
- if (model.path) {
- return `${String(model.path || '').replace(/\/+$/, '').replace(/\//g, '\\')}\\models--${repo.replace(/\//g, '--')}\\snapshots`;
- }
- return `$env:USERPROFILE\\.cache\\huggingface\\hub\\models--${repo.replace(/\//g, '--')}\\snapshots`;
- }
if (model.is_local_dir && model.path) return _shellQuote(`${String(model.path || '').replace(/\/+$/, '')}/${repo}`);
if (model.path) return _shellQuote(`${String(model.path || '').replace(/\/+$/, '')}/models--${repo.replace(/\//g, '--')}/snapshots`);
return `"$HOME/.cache/huggingface/hub/models--${repo.replace(/\//g, '--')}/snapshots"`;
@@ -537,7 +512,7 @@ function _rerenderCachedModels() {
// The venv set per-server in Settings (server.envPath). Used as the venv
// field default when the global active env path isn't carrying it, so a
// configured server venv shows up without re-typing it.
- const _selSrv = _serverByVal?.(_es.remoteServerKey || _es.remoteHost || '') || {};
+ const _selSrv = (_es.servers || []).find(s => s.host === (_es.remoteHost || '')) || {};
const _srvVenv = _selSrv.envPath || '';
// Serve state schema: { _byRepo: { : {...} }, _lastUsed: {...} }.
// Loading priority: this-repo's saved settings → last-used (from any
@@ -600,7 +575,7 @@ function _rerenderCachedModels() {
+ ``
+ `
`;
- let panelHtml = `
${_slotsHtml}`;
+ let panelHtml = `
`;
// Warn when serving a model whose download hasn't fully completed —
// the user CAN still hit Launch (vLLM/llama-server will start, then
// crash trying to read missing shards), but they should know.
@@ -633,26 +608,48 @@ function _rerenderCachedModels() {
_gpuBtnsHtml += ``;
}
panelHtml += ``;
+ // Save / saved-configs split button — moved into Row 1 (next to GPUs)
+ // so it shares the same baseline as the rest of the top controls.
+ panelHtml += _slotsHtml;
panelHtml += `
`;
- panelHtml += ``;
+ // Show the GGUF File dropdown for BOTH llama.cpp and Ollama — Ollama
+ // also needs to know which exact .gguf to import via the new
+ // `docker exec ollama-test ollama-import` auto-fill (otherwise the
+ // helper falls back to "first sorted gguf", which may not match what
+ // the user picked).
+ panelHtml += `
`;
+ // Row 2: Core settings — the handful you actually touch every launch.
+ // TP / Context / GPU / GPU Mem / Max Seqs / Dtype. Everything else
+ // (Swap, KV Cache, Attention backend, Env vars, llama.cpp batch/ubatch)
+ // moved to the Advanced fold below to keep this row scannable.
+ panelHtml += `
`;
panelHtml += ``;
// ctx resets to the model's max on every panel open (the real ctx slider
// lives in the Scan/Download toolbar — see cookbook.js .hwfit-ctx-control).
panelHtml += ``;
panelHtml += ``;
panelHtml += ``;
- panelHtml += ``;
panelHtml += ``;
panelHtml += ``;
+ panelHtml += `
`;
+ // ── Advanced (collapsed by default) ──
+ // Everything below the fold is tuning users only touch occasionally:
+ // vLLM kernel/env knobs, llama.cpp fit/cache/split controls, the
+ // GGUF batch sizes, the speculative-decoding row, and the live VRAM
+ // monitor. Wrapped in a native so toggle state survives
+ // re-renders cheaply and a closed fold doesn't trigger any layout
+ // work for the dozens of nested inputs.
+ panelHtml += ``;
+ panelHtml += `Advanced`;
+ // Advanced vLLM/SGLang row (KV Cache, Attention, Swap, Env)
+ panelHtml += `
`;
panelHtml += ``;
// Attention backend selector — pin the kernel impl. Default `auto` lets
// vLLM pick FlashInfer (which JITs on first use and breaks on older
@@ -662,6 +659,7 @@ function _rerenderCachedModels() {
const vllmAttnBackendOpts = ['auto', 'FLASH_ATTN', 'XFORMERS', 'FLASHINFER', 'TORCH_SDPA']
.map(b => ``).join('');
panelHtml += ``;
+ panelHtml += ``;
// Free-text env-vars field. Anything pasted here is prepended to the
// launch command verbatim. Use for CUDACXX, PATH overrides, NCCL_*
// tuning, or any other KEY=VALUE pair that doesn't have a dedicated
@@ -669,6 +667,12 @@ function _rerenderCachedModels() {
// already exported so they expand correctly here.
panelHtml += ``;
panelHtml += `
`;
+ // Advanced llama.cpp row (Batch / UBatch — moved out of Core for the
+ // same "rarely touched" reason as the vLLM extras above).
+ panelHtml += `
`;
- // Row 2d: Auto profiles — computed from detected hardware (see profiles.py).
- // Buttons are injected after the panel mounts (needs an async fetch).
- panelHtml += `
`;
+ // Auto-profile chips row removed — visual fit with the rest of the
+ // serve panel was off, and the manual ctx/n_cpu_moe/cache controls
+ // above are already sufficient. The hwfit profile API
+ // (/api/hwfit/profiles) is still available for any caller that
+ // wants it.
// Live VRAM / RAM-spillover monitor for the serve target's GPU. Polls
// /api/cookbook/gpus while the panel is open so you can SEE whether the
// config fits VRAM (fast) or spills to system RAM (slow). Populated after mount.
@@ -745,7 +746,7 @@ function _rerenderCachedModels() {
// even for models the auto-detector doesn't recognize. Expert-parallel,
// reasoning-parser and MoE-env still only appear when auto-detected.
const _opts2 = _detectModelOptimizations(repo);
- panelHtml += `
`;
+ panelHtml += `
`;
if (_opts2.flags.includes('--enable-expert-parallel')) panelHtml += ``;
if (_opts2.flags.some(f => f.includes('--reasoning-parser'))) { const rp = _opts2.flags.find(f => f.includes('--reasoning-parser')).split(' ')[1]; panelHtml += ``; }
{
@@ -764,6 +765,8 @@ function _rerenderCachedModels() {
}
if (_opts2.envVars.length) panelHtml += ``;
panelHtml += `
`;
+ // ── End Advanced fold ──
+ panelHtml += ``;
// Command preview + actions. Wrap the textarea so a floating Copy
// button can sit at its top-right corner — same pattern as the chat
// run-output panel.
@@ -825,27 +828,17 @@ function _rerenderCachedModels() {
// model the file lives under "/" — search there just like we
// search the HF snapshots dir, so serving a GGUF from a custom dir works
// instead of handing llama.cpp a directory (which fails).
- const _ldir = m.path
- ? (_isWindows() ? `${m.path.replace(/\//g, '\\')}\\${repo.replace(/\//g, '\\')}` : _shellQuote(`${m.path}/${repo}`))
- : (_isWindows() ? '' : '""');
- if (selectedGguf) {
- f._gguf_path = _selectedGgufExpr(m, repo, selectedGguf.rel_path);
- } else if (_isWindows()) {
- // Windows fallback: no bash $() available; validator rejects it.
- // Return empty so the serve fails with a clear message.
- f._gguf_path = '';
- } else if (m.is_local_dir && m.path) {
- f._gguf_path = `$({ find ${_ldir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${_ldir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`;
- } else {
- f._gguf_path = `$({ find ${dir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${dir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`;
- }
+ const _ldir = m.path ? _shellQuote(`${m.path}/${repo}`) : '""';
+ f._gguf_path = selectedGguf
+ ? _selectedGgufExpr(m, repo, selectedGguf.rel_path)
+ : m.is_local_dir && m.path
+ ? `$({ find ${_ldir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${_ldir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`
+ : `$({ find ${dir} -name '*-00001-of-*.gguf' 2>/dev/null | sort; find ${dir} -name '*.gguf' 2>/dev/null | sort; } | head -1)`;
// Vision: auto-find the mmproj (CLIP/projector) file in the same dir.
// Resolved at runtime so the toggle just works if an mmproj-*.gguf is
// present (downloaded alongside the model). Empty if none → cmd omits it.
const _vsearchdir = (m.is_local_dir && m.path) ? _ldir : dir;
- f._mmproj_path = _isWindows()
- ? (_vsearchdir ? `${_vsearchdir}\\mmproj*.gguf` : '')
- : `$(find ${_vsearchdir} -iname 'mmproj*.gguf' 2>/dev/null | sort | head -1)`;
+ f._mmproj_path = `$(find ${_vsearchdir} -iname 'mmproj*.gguf' 2>/dev/null | sort | head -1)`;
}
if (f.reasoning_parser) {
const _rpEl2 = panel.querySelector('[data-field="reasoning_parser"]');
@@ -886,72 +879,29 @@ function _rerenderCachedModels() {
_clampCtx(false); // fix any stale/preset value already present
}
- // Auto profiles — fetch hardware-computed llama.cpp profiles and render
- // them as clickable chips. Clicking one fills the ctx/CPU-MoE/KV/flash
- // fields and rebuilds the command. Computed from detected VRAM (see
- // services/hwfit/profiles.py); rough on t/s, accurate on fit.
- async function _loadServeProfiles() {
- const wrap = panel.querySelector('.hwfit-profile-btns');
- if (!wrap) return;
+ // Tighten the ctx slider's upper bound to the model's trained limit.
+ // Asking llama.cpp for ctx > n_ctx_train overflows and, with a quantized
+ // KV cache, can crash the GPU (radv ErrorDeviceLost). The auto-profile
+ // chip row that used to also live here was removed — visual fit with
+ // the rest of the serve panel was off — but this clamp is essential.
+ (async () => {
try {
const host = (_es.remoteHost || '').trim();
- const selected = _serverByVal?.(_es.remoteServerKey || host);
const params = new URLSearchParams({ model: repo });
if (host) {
params.set('host', host);
- const _sp = selected?.port;
+ const _sp = (_es.servers || []).find(s => s.host === host)?.port;
if (_sp) params.set('ssh_port', _sp);
}
- // SERVE mode: this is a specific GGUF file already on disk, so its quant
- // is fixed — tell the profiler the file's real size + quant so it varies
- // only the serving knobs (KV/ctx/offload), not the quant. Parse the size
- // from m.size (e.g. "20.6 GB") and the quant from the file/repo name.
- const _sizeMatch = String(m.size || '').match(/([\d.]+)\s*GB/i);
- if (_sizeMatch) params.set('serve_weights_gb', _sizeMatch[1]);
- const _qMatch = String(repo).match(/(Q\d[\w]*|IQ\d[\w]*|F16|BF16|FP8)/i);
- if (_qMatch) params.set('serve_quant', _qMatch[1]);
const res = await fetch(`/api/hwfit/profiles?${params}`);
const data = await res.json();
- // Remember the model's trained context limit and clamp the ctx field
- // to it — asking llama.cpp for ctx > n_ctx_train overflows and, with a
- // quantized KV cache, can crash the GPU (radv ErrorDeviceLost).
const ctxMax = Number(data && data.model_ctx_max) || 0;
if (ctxMax > 0) {
- panel._modelCtxMax = ctxMax; // tighten the clamp to the real limit
- _clampCtx(false); // re-apply now that we know the model's max
+ panel._modelCtxMax = ctxMax;
+ _clampCtx(false);
}
- const profs = (data && Array.isArray(data.profiles)) ? data.profiles : [];
- if (!profs.length) { wrap.innerHTML = `no auto profile for this model`; return; }
- wrap.innerHTML = '';
- for (const p of profs) {
- const b = document.createElement('button');
- b.type = 'button';
- b.className = 'cookbook-btn hwfit-profile-chip';
- b.style.cssText = 'height:24px;padding:0 9px;font-size:11px;';
- const off = p.offloads ? `, ncm${p.n_cpu_moe}` : ', all-GPU';
- b.textContent = `${p.label} · ${p.quant} · ${Math.round(p.ctx/1024)}k${off}`;
- b.title = `${p.note}\nKV ${p.cache_type}, ~${p.est_vram_gb} GB VRAM`;
- b.addEventListener('click', () => {
- const set = (field, val) => {
- const el = panel.querySelector(`[data-field="${field}"]`);
- if (!el) return;
- if (el.type === 'checkbox') el.checked = !!val; else el.value = val;
- };
- set('ctx', p.ctx);
- set('n_cpu_moe', p.n_cpu_moe || '');
- set('cache_type', p.cache_type || '');
- set('flash_attn', true); // required for a quantized KV cache
- wrap.querySelectorAll('.hwfit-profile-chip').forEach(x => x.classList.remove('cookbook-btn-active'));
- b.classList.add('cookbook-btn-active');
- updateCmd();
- });
- wrap.appendChild(b);
- }
- } catch {
- wrap.innerHTML = `profile compute failed`;
- }
- }
- _loadServeProfiles();
+ } catch { /* clamp falls back to the static default */ }
+ })();
// Live GPU-memory monitor: poll /api/cookbook/gpus and show VRAM usage +
// RAM-spillover, with a plain-language health/speed hint. Lets you tell at
@@ -962,11 +912,10 @@ function _rerenderCachedModels() {
if (!el || !document.body.contains(el)) return false; // panel closed → stop
try {
const host = (_es.remoteHost || '').trim();
- const selected = _serverByVal?.(_es.remoteServerKey || host);
const params = new URLSearchParams();
if (host) {
params.set('host', host);
- const _sp = selected?.port;
+ const _sp = (_es.servers || []).find(s => s.host === host)?.port;
if (_sp) params.set('ssh_port', _sp);
}
const res = await fetch('/api/cookbook/gpus' + (params.toString() ? '?' + params : ''));
@@ -1535,6 +1484,38 @@ function _rerenderCachedModels() {
}
panel._gpuProbe.byIdx = new Map(data.gpus.map(g => [g.index, g]));
panel._gpuProbe.host = remoteHost;
+ // If the probe found more GPUs than the panel originally
+ // rendered (e.g. host switched from a 1-iGPU local box to an
+ // 8-GPU remote), append buttons for the missing indexes so the
+ // user can actually toggle them. Reuse the parent
from
+ // the first existing button as the insertion target.
+ try {
+ const _existing = Array.from(panel.querySelectorAll('.cookbook-gpu-btn'));
+ const _grp = _existing[0] && _existing[0].parentElement;
+ if (_grp) {
+ const _have = new Set(_existing.map(b => parseInt(b.dataset.gpu, 10)));
+ const _activeStr = (panel.querySelector('[data-field="gpus"]')?.value || '').split(',').map(s => s.trim());
+ data.gpus.forEach(g => {
+ if (_have.has(g.index)) return;
+ const _b = document.createElement('button');
+ _b.type = 'button';
+ _b.className = 'cookbook-gpu-btn' + (_activeStr.includes(String(g.index)) ? ' active' : '');
+ _b.dataset.gpu = String(g.index);
+ _b.textContent = String(g.index);
+ _grp.appendChild(_b);
+ // Re-wire the click handler the same way the panel did
+ // on first render. Toggles active + rewrites the hidden
+ // gpus input from the live set of active buttons.
+ _b.addEventListener('click', () => {
+ _b.classList.toggle('active');
+ const activeBtns = [...panel.querySelectorAll('.cookbook-gpu-btn.active')];
+ const ids = activeBtns.map(x => x.dataset.gpu).sort((a, b) => +a - +b).join(',');
+ const hidden = panel.querySelector('[data-field="gpus"]');
+ if (hidden) { hidden.value = ids; hidden.dispatchEvent(new Event('change', { bubbles: true })); }
+ });
+ });
+ }
+ } catch (_) {}
panel.querySelectorAll('.cookbook-gpu-btn').forEach(b => {
const idx = parseInt(b.dataset.gpu);
const g = panel._gpuProbe.byIdx.get(idx);
@@ -1790,7 +1771,7 @@ function _rerenderCachedModels() {
const _probeParams = new URLSearchParams();
if (_probeHost) {
_probeParams.set('host', _probeHost);
- const _sp = (_serverByVal?.(_envState.remoteServerKey || _probeHost) || {}).port;
+ const _sp = (_envState.servers || []).find(s => s.host === _probeHost)?.port;
if (_sp) _probeParams.set('ssh_port', _sp);
}
const _probeRes = await fetch('/api/cookbook/gpus' + (_probeParams.toString() ? '?' + _probeParams : ''), { credentials: 'same-origin' });
@@ -1861,12 +1842,20 @@ function _rerenderCachedModels() {
}
// Save in the { _byRepo, _lastUsed } schema — no legacy flat keys at
// the root so per-model state doesn't leak between models.
+ // Stamp `_forceBackend: true` so the next open of this model defaults
+ // to the launched configuration end-to-end, even when the detector
+ // would have picked a different backend. Without this flag, the
+ // `savedMatchesBackend` gate inside sv() throws away every saved
+ // value when the detected backend doesn't match — the user opens
+ // Serve again and the panel looks like a fresh form despite a
+ // known-good prior launch.
try {
let cur = {};
try { cur = JSON.parse(localStorage.getItem(SERVE_STATE_KEY)) || {}; } catch {}
const byRepo = (cur && cur._byRepo && typeof cur._byRepo === 'object') ? cur._byRepo : {};
- byRepo[repo] = serveState;
- localStorage.setItem(SERVE_STATE_KEY, JSON.stringify({ _byRepo: byRepo, _lastUsed: serveState }));
+ const _saved = { ...serveState, _forceBackend: true };
+ byRepo[repo] = _saved;
+ localStorage.setItem(SERVE_STATE_KEY, JSON.stringify({ _byRepo: byRepo, _lastUsed: _saved }));
} catch {}
const origEnv = _envState.env;
const origEnvPath = _envState.envPath;
@@ -1882,7 +1871,8 @@ function _rerenderCachedModels() {
if (_ssEl && _ssEl.value != null) {
if (_ssEl.value === 'local') serveHost = '';
else {
- const _srv = _serverByVal?.(_ssEl.value) || _envState.servers[parseInt(_ssEl.value)];
+ // Values are host strings now; resolve by host (numeric fallback).
+ const _srv = _envState.servers.find(s => s.host === _ssEl.value) || _envState.servers[parseInt(_ssEl.value)];
if (_srv) {
serveHost = _srv.host;
_srvEnv = _srv.env || '';
@@ -1938,10 +1928,24 @@ function _rerenderCachedModels() {
function _resolveCacheHost() {
let host = _envState.remoteHost || '';
const cacheSrv = document.getElementById('hwfit-cache-server');
+
+ function _serverByCacheValue(val) {
+ if (val === 'local') return null;
+ const found = _envState.servers.find(x => x.host === val)
+ || (/^\d+$/.test(String(val)) ? _envState.servers[parseInt(val)] : null)
+ || _envState.servers.find(x => x.name === val)
+ || null;
+ return found || null;
+ }
+
if (cacheSrv) {
const val = cacheSrv.value;
- if (val === 'local') host = '';
- else { const s = _serverByVal?.(val) || _envState.servers[parseInt(val)]; if (s) host = s.host; }
+ if (val === 'local') {
+ host = '';
+ } else {
+ const s = _serverByCacheValue(val);
+ if (s) host = s.host;
+ }
}
return host;
}
@@ -2071,8 +2075,12 @@ export async function openServePanelForRepo(repo, fields) {
let cur = {};
try { cur = JSON.parse(localStorage.getItem(SERVE_STATE_KEY)) || {}; } catch {}
const byRepo = (cur && cur._byRepo && typeof cur._byRepo === 'object') ? cur._byRepo : {};
- byRepo[repo] = fields;
- localStorage.setItem(SERVE_STATE_KEY, JSON.stringify({ _byRepo: byRepo, _lastUsed: fields }));
+ // Mirror the launch-time save: stamp _forceBackend so the panel's
+ // sv() helper treats these seeded fields as authoritative, not as
+ // overridable defaults.
+ const _seeded = { ...fields, _forceBackend: true };
+ byRepo[repo] = _seeded;
+ localStorage.setItem(SERVE_STATE_KEY, JSON.stringify({ _byRepo: byRepo, _lastUsed: _seeded }));
} catch {}
}
// Switch to the Serve tab (its click handler triggers _fetchCachedModels).
@@ -2099,7 +2107,18 @@ export async function openServePanelForRepo(repo, fields) {
.find(el => (el.dataset.repo || '').split('/').pop() === _short);
}
if (card) {
- if (!card.classList.contains('doclib-card-expanded')) card.click();
+ // If we were given fields to restore, force a fresh render of the
+ // serve panel so it reads the just-written _byRepo[repo] values
+ // from localStorage. Without this, an already-expanded card kept
+ // its stale form and the "Edit serve" → previous settings round-
+ // trip looked broken from the user's side.
+ if (fields && card.classList.contains('doclib-card-expanded')) {
+ card.click();
+ await new Promise(r => setTimeout(r, 40));
+ card.click();
+ } else if (!card.classList.contains('doclib-card-expanded')) {
+ card.click();
+ }
try { card.scrollIntoView({ behavior: 'smooth', block: 'center' }); } catch {}
return true;
}
@@ -2130,6 +2149,14 @@ export async function _fetchCachedModels() {
try {
let host = _envState.remoteHost || '';
let selectedServer = null;
+ const _serverByCacheValue = (val) => {
+ if (val === 'local') return null;
+ return _envState.servers.find(x => x.host === val)
+ || (/^\d+$/.test(String(val)) ? _envState.servers[parseInt(val)] : null)
+ || _envState.servers.find(x => x.name === val)
+ || null;
+ };
+
const cacheSrv = document.getElementById('hwfit-cache-server');
if (cacheSrv) {
const val = cacheSrv.value;
@@ -2137,11 +2164,11 @@ export async function _fetchCachedModels() {
host = '';
selectedServer = _envState.servers.find(s => !s.host || s.host === 'local') || _envState.servers[0];
} else {
- const s = _serverByVal?.(val) || _envState.servers[parseInt(val)];
+ const s = _serverByCacheValue(val);
if (s) { host = s.host; selectedServer = s; }
}
} else {
- selectedServer = _serverByVal?.(_envState.remoteServerKey || host) || _envState.servers[0];
+ selectedServer = _envState.servers.find(s => s.host === host) || _envState.servers[0];
}
// Read extra model dirs from the SELECTED server's modelDirs (canonical source)
const modelDirs = [];
@@ -2171,7 +2198,18 @@ export async function _fetchCachedModels() {
if (modelDirs.length) qp.set('model_dir', modelDirs.join(','));
const params = qp.toString() ? `?${qp}` : '';
const res = await fetch(`/api/model/cached${params}`);
- if (!res.ok) throw new Error(res.statusText);
+ if (!res.ok) {
+ const body = await res.text().catch(() => '');
+ let msg = '';
+ try {
+ const payload = JSON.parse(body);
+ msg = payload && (payload.detail || payload.error || payload.message);
+ } catch {
+ msg = body;
+ }
+ msg = typeof msg === 'string' ? msg.trim() : '';
+ throw new Error(`HTTP ${res.status} ${res.statusText}${msg ? `: ${msg}` : ''}`);
+ }
const data = await res.json();
_dlWp.destroy();
@@ -2268,7 +2306,6 @@ export function initServe(shared) {
_envState = shared._envState;
_sshCmd = shared._sshCmd;
_getPort = shared._getPort;
- _serverByVal = shared._serverByVal;
_sshPrefix = shared._sshPrefix;
_getPlatform = shared._getPlatform;
_isWindows = shared._isWindows;
diff --git a/static/js/documentLibrary.js b/static/js/documentLibrary.js
index 642a91faa..8c632a3a9 100644
--- a/static/js/documentLibrary.js
+++ b/static/js/documentLibrary.js
@@ -578,13 +578,12 @@ let _libraryArchivedView = false; // Documents tab showing archived docs?
const pieces = [];
if (doc.session_name) pieces.push(`${_esc(doc.session_name)}`);
if (doc.language && doc.language !== 'text') {
- const ic = langIcon(doc.language, 11, { style: 'vertical-align:-2px;flex-shrink:0;opacity:0.65;color:currentColor;' });
- pieces.push(`${ic}${_esc(doc.language)}`);
+ // Per-language icon lives in the title row above; just the language
+ // name here keeps the meta line scannable without duplicating the icon.
+ pieces.push(`${_esc(doc.language)}`);
}
pieces.push(`${_esc(libraryRelativeTime(doc.updated_at))}`);
meta.innerHTML = pieces.join('\u00b7');
- // Strip the per-language icon from the meta line \u2014 it now sits next to the
- // title above, so duplicating it here was redundant.
content.appendChild(meta);
card.appendChild(content);
diff --git a/static/js/emailLibrary.js b/static/js/emailLibrary.js
index a294ca010..4dd2f720d 100644
--- a/static/js/emailLibrary.js
+++ b/static/js/emailLibrary.js
@@ -788,7 +788,7 @@ export function openEmailLibrary(opts = {}) {