diff --git a/app.py b/app.py
index 97906bd46..22b63cc82 100644
--- a/app.py
+++ b/app.py
@@ -529,9 +529,6 @@ upload_cleanup_task = None
from routes.emoji_routes import setup_emoji_routes
app.include_router(setup_emoji_routes())
-from routes.workspace_routes import setup_workspace_routes
-app.include_router(setup_workspace_routes())
-
# Sessions
from routes.session_routes import setup_session_routes
session_config = {"REQUEST_TIMEOUT": REQUEST_TIMEOUT, "OPENAI_API_KEY": OPENAI_API_KEY, "SESSIONS_FILE": SESSIONS_FILE}
diff --git a/mcp_servers/email_server.py b/mcp_servers/email_server.py
index d1c2ac07e..b807937cd 100644
--- a/mcp_servers/email_server.py
+++ b/mcp_servers/email_server.py
@@ -22,6 +22,7 @@ import os
import os.path
from pathlib import Path
from datetime import datetime, timedelta
+import uuid
from mcp.server import Server
from mcp.server.stdio import stdio_server
@@ -67,6 +68,59 @@ def _db_path() -> Path:
return Path(APP_DB)
+def _load_email_writing_style() -> str:
+ """Return the existing Settings > Email > Writing Style value."""
+ try:
+ settings_path = DATA_DIR / "settings.json"
+ if not settings_path.exists():
+ return ""
+ settings = json.loads(settings_path.read_text(encoding="utf-8"))
+ return str(settings.get("email_writing_style") or "").strip()
+ except Exception:
+ return ""
+
+
+def _writing_style_guidance() -> str:
+ style = _load_email_writing_style()
+ if not style:
+ return (
+ "No saved writing style is configured in Settings > Email > Writing Style. "
+ "Use a concise, natural tone and do not invent facts."
+ )
+ return (
+ "Use this saved writing style from Settings > Email > Writing Style when "
+ "drafting the body. It overrides generic tone guidance:\n"
+ f"{style}"
+ )
+
+
+def _default_document_owner() -> str | None:
+ """Best-effort owner for MCP-created documents.
+
+ MCP stdio tools do not receive the browser request's authenticated user,
+ but the document library is owner-filtered. Stamp drafts to the configured
+ single/default admin so assistant-created email drafts are visible.
+ """
+ owner = os.environ.get("ODYSSEUS_DOCUMENT_OWNER", "").strip()
+ if owner:
+ return owner
+ try:
+ auth_path = DATA_DIR / "auth.json"
+ if not auth_path.exists():
+ return None
+ users = (json.loads(auth_path.read_text(encoding="utf-8")).get("users") or {})
+ if not isinstance(users, dict) or not users:
+ return None
+ admins = [name for name, data in users.items() if isinstance(data, dict) and data.get("is_admin")]
+ if len(admins) == 1:
+ return admins[0]
+ if len(users) == 1:
+ return next(iter(users))
+ return admins[0] if admins else next(iter(users))
+ except Exception:
+ return None
+
+
def _list_accounts_raw() -> list:
"""Return list of dicts from the email_accounts table. Empty list if table
missing or empty. Never raises."""
@@ -896,6 +950,340 @@ def _send_email(to, subject, body, in_reply_to=None, references=None, cc=None, b
}
+def _build_email_document_content(
+ to,
+ subject,
+ body,
+ *,
+ cc=None,
+ bcc=None,
+ in_reply_to=None,
+ references=None,
+ source_uid=None,
+ source_folder=None,
+):
+ header_lines = [f"To: {to or ''}"]
+ if cc:
+ header_lines.append(f"Cc: {cc}")
+ if bcc:
+ header_lines.append(f"Bcc: {bcc}")
+ header_lines.append(f"Subject: {subject or ''}")
+ if in_reply_to:
+ header_lines.append(f"In-Reply-To: {in_reply_to}")
+ if references:
+ header_lines.append(f"References: {references}")
+ if source_uid:
+ header_lines.append(f"X-Source-UID: {source_uid}")
+ if source_folder:
+ header_lines.append(f"X-Source-Folder: {source_folder}")
+ return "\n".join(header_lines) + "\n---\n" + (body or "")
+
+
+def _merge_email_reply_body(existing_content: str, reply_body: str) -> str:
+ """Preserve email headers and quoted chain while replacing the editable reply body."""
+ if "\n---\n" not in (existing_content or ""):
+ return reply_body or ""
+ head, body = existing_content.split("\n---\n", 1)
+ quote_markers = (
+ "---------- Previous message ----------",
+ "-----Original Message-----",
+ "----- Original Message -----",
+ )
+ quote_index = -1
+ for marker in quote_markers:
+ idx = body.find(marker)
+ if idx != -1 and (quote_index == -1 or idx < quote_index):
+ quote_index = idx
+ quote = body[quote_index:].strip() if quote_index != -1 else ""
+ merged_body = (reply_body or "").strip()
+ if quote:
+ merged_body = f"{merged_body}\n\n{quote}" if merged_body else quote
+ return f"{head}\n---\n{merged_body}"
+
+
+def _create_email_draft_document(
+ *,
+ to,
+ subject,
+ body,
+ title=None,
+ cc=None,
+ bcc=None,
+ in_reply_to=None,
+ references=None,
+ source_uid=None,
+ source_folder=None,
+ account=None,
+ source_message_id=None,
+):
+ """Create an Odysseus email compose document for user review. Does not send."""
+ from core.database import SessionLocal, Document, DocumentVersion
+ try:
+ from src.event_bus import fire_event
+ except Exception:
+ fire_event = None
+
+ cfg = _load_config(account) if account else _load_config(None)
+ content = _build_email_document_content(
+ to,
+ subject,
+ body,
+ cc=cc,
+ bcc=bcc,
+ in_reply_to=in_reply_to,
+ references=references,
+ source_uid=source_uid,
+ source_folder=source_folder,
+ )
+ doc_id = str(uuid.uuid4())
+ ver_id = str(uuid.uuid4())
+ doc_title = (title or subject or "Email draft").strip() or "Email draft"
+ doc_owner = _default_document_owner()
+
+ db = SessionLocal()
+ try:
+ if source_uid and source_folder:
+ existing = (
+ db.query(Document)
+ .filter(Document.is_active == True)
+ .filter(Document.language == "email")
+ .filter(Document.owner == doc_owner)
+ .filter(Document.source_email_uid == str(source_uid))
+ .filter(Document.source_email_folder == source_folder)
+ .order_by(Document.updated_at.desc())
+ .first()
+ )
+ if existing and "\n---\n" in (existing.current_content or ""):
+ existing.current_content = _merge_email_reply_body(existing.current_content, body or "")
+ existing.version_count = (existing.version_count or 0) + 1
+ ver = DocumentVersion(
+ id=ver_id,
+ document_id=existing.id,
+ version_number=existing.version_count,
+ content=existing.current_content,
+ summary="Updated by email MCP draft tool",
+ source="ai",
+ )
+ db.add(ver)
+ db.commit()
+ if fire_event:
+ try:
+ fire_event("document_updated", doc_owner)
+ except Exception:
+ pass
+ return {
+ "draft": True,
+ "updated": True,
+ "doc_id": existing.id,
+ "title": existing.title,
+ "language": existing.language,
+ "account": cfg.get("account_name"),
+ "account_id": cfg.get("account_id"),
+ "to": to,
+ "subject": subject,
+ }
+
+ doc = Document(
+ id=doc_id,
+ session_id=None,
+ title=doc_title,
+ language="email",
+ current_content=content,
+ version_count=1,
+ is_active=True,
+ owner=doc_owner,
+ source_email_uid=source_uid,
+ source_email_folder=source_folder,
+ source_email_account_id=cfg.get("account_id"),
+ source_email_message_id=source_message_id,
+ )
+ ver = DocumentVersion(
+ id=ver_id,
+ document_id=doc_id,
+ version_number=1,
+ content=content,
+ summary="Created by email MCP draft tool",
+ source="ai",
+ )
+ db.add(doc)
+ db.add(ver)
+ db.commit()
+ if fire_event:
+ try:
+ fire_event("document_created", doc_owner)
+ except Exception:
+ pass
+ return {
+ "draft": True,
+ "doc_id": doc_id,
+ "title": doc_title,
+ "language": "email",
+ "account": cfg.get("account_name"),
+ "account_id": cfg.get("account_id"),
+ "to": to,
+ "subject": subject,
+ }
+ finally:
+ db.close()
+
+
+def _draft_reply_to_email(uid, body, folder="INBOX", reply_all=False, account=None, title=None):
+ """Create a threaded Odysseus reply draft document. Does not send."""
+ conn = _imap_connect(account)
+ conn.select(_q(folder), readonly=True)
+ status, msg_data = conn.uid("FETCH", _b(uid), "(BODY.PEEK[])")
+ conn.logout()
+ if status != "OK" or not msg_data or not msg_data[0]:
+ return {"error": f"Failed to fetch email UID {uid}"}
+ raw = msg_data[0][1]
+ orig = email.message_from_bytes(raw)
+
+ orig_subject = _decode_header(orig.get("Subject", ""))
+ reply_subject = orig_subject if orig_subject.lower().startswith("re:") else f"Re: {orig_subject}"
+ orig_message_id = orig.get("Message-ID", "")
+ orig_references = orig.get("References", "")
+ new_references = (orig_references + " " + orig_message_id).strip() if orig_references else orig_message_id
+
+ sender = _decode_header(orig.get("From", ""))
+ _, sender_addr = email.utils.parseaddr(sender)
+ to_addrs = sender_addr
+
+ cc = None
+ if reply_all:
+ cc_addrs = []
+ cfg = _load_config(account)
+ own_addrs = {
+ (cfg.get("imap_user") or "").strip().lower(),
+ (cfg.get("from_address") or "").strip().lower(),
+ }
+ for header_name in ("To", "Cc"):
+ for _, addr in email.utils.getaddresses([orig.get(header_name, "")]):
+ addr_l = (addr or "").strip().lower()
+ if addr and addr != sender_addr and addr_l not in own_addrs:
+ cc_addrs.append(addr)
+ if cc_addrs:
+ cc = ", ".join(dict.fromkeys(cc_addrs))
+
+ return _create_email_draft_document(
+ to=to_addrs,
+ subject=reply_subject,
+ body=body,
+ title=title or reply_subject,
+ cc=cc,
+ in_reply_to=orig_message_id,
+ references=new_references,
+ source_uid=uid,
+ source_folder=folder,
+ account=account,
+ source_message_id=orig_message_id,
+ )
+
+
+async def _ai_draft_reply_to_email(uid, folder="INBOX", reply_all=False, account=None, title=None):
+ """Generate a reply with Odysseus' AI-reply prompt/style, then create a compose doc."""
+ read_result = _read_email(uid=uid, folder=folder, account=account)
+ if "error" in read_result:
+ return read_result
+
+ to_addr = read_result.get("from_address") or email.utils.parseaddr(read_result.get("from") or "")[1]
+ subject = read_result.get("subject") or ""
+ reply_subject = subject if subject.lower().startswith("re:") else f"Re: {subject}"
+ original_body = read_result.get("body") or ""
+ message_id = read_result.get("message_id") or ""
+
+ if not original_body.strip():
+ return {"error": "No email body available for AI reply"}
+
+ try:
+ from routes.email_helpers import (
+ _EMAIL_REPLY_SYS_PROMPT_BASE,
+ _apply_email_style_mechanics,
+ _extract_reply,
+ _load_settings,
+ )
+ from src.endpoint_resolver import (
+ resolve_endpoint,
+ resolve_utility_fallback_candidates,
+ resolve_chat_fallback_candidates,
+ )
+ from src.llm_core import llm_call_async_with_fallback
+ except Exception as exc:
+ return {"error": f"AI reply helpers unavailable: {exc}"}
+
+ settings = _load_settings()
+ style = settings.get("email_writing_style", "")
+ system_prompt = _EMAIL_REPLY_SYS_PROMPT_BASE
+ if style:
+ system_prompt += f"\n\nWRITING STYLE TO MATCH:\n{style}"
+
+ user_msg = (
+ f"Recipient: {to_addr}\nSubject: {reply_subject}\n\n"
+ f"Original email and any current draft:\n{original_body[:6000]}\n\n"
+ "Draft a reply. Return only the reply body text."
+ )
+
+ candidates = []
+ seen = set()
+
+ def _add(url, model, headers):
+ key = (url or "", model or "")
+ if not url or not model or key in seen:
+ return
+ seen.add(key)
+ candidates.append((url, model, headers))
+
+ try:
+ _add(*resolve_endpoint("utility", owner=None))
+ except Exception:
+ pass
+ try:
+ _add(*resolve_endpoint("default", owner=None))
+ except Exception:
+ pass
+ try:
+ utility_fallbacks = resolve_utility_fallback_candidates(owner=None) or []
+ except TypeError:
+ utility_fallbacks = resolve_utility_fallback_candidates() or []
+ for cand in utility_fallbacks:
+ _add(*cand)
+ try:
+ chat_fallbacks = resolve_chat_fallback_candidates(owner=None) or []
+ except TypeError:
+ chat_fallbacks = resolve_chat_fallback_candidates() or []
+ for cand in chat_fallbacks:
+ _add(*cand)
+
+ if not candidates:
+ return {"error": "No LLM endpoint configured for AI reply"}
+
+ try:
+ raw_reply = await llm_call_async_with_fallback(
+ candidates,
+ messages=[
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": user_msg},
+ ],
+ temperature=0.7,
+ max_tokens=1024,
+ timeout=60,
+ )
+ except Exception as exc:
+ return {"error": f"AI reply generation failed: {exc}"}
+
+ reply = _apply_email_style_mechanics(_extract_reply(raw_reply or ""))
+ if not reply:
+ return {"error": "AI reply generation returned an empty response"}
+
+ return _draft_reply_to_email(
+ uid=uid,
+ body=reply,
+ folder=folder,
+ reply_all=reply_all,
+ account=account,
+ title=title or reply_subject,
+ )
+
+
def _reply_to_email(uid, body, folder="INBOX", reply_all=False, account=None):
"""Reply to an existing email by UID. Threads via In-Reply-To/References."""
conn = None
@@ -1189,6 +1577,8 @@ async def list_tools() -> list[Tool]:
name="send_email",
description=(
"Send a new email via SMTP. Provide recipient(s), subject, and body. "
+ "This sends immediately; for normal assistant-written email, prefer "
+ "draft_email so the user can review and send from Odysseus. "
"For replying to an existing thread, use reply_to_email instead. "
"Pass `account` to send from a non-default mailbox."
),
@@ -1205,10 +1595,35 @@ async def list_tools() -> list[Tool]:
"required": ["to", "subject", "body"],
},
),
+ Tool(
+ name="draft_email",
+ description=(
+ "Create a new Odysseus email compose draft document. This DOES NOT send. "
+ "Use this as the default way to write an email for the user: it opens "
+ "a reviewable email document with To/Cc/Bcc/Subject/body, and the user "
+ "can edit or press Send in Odysseus. "
+ f"{_writing_style_guidance()}"
+ ),
+ inputSchema={
+ "type": "object",
+ "properties": {
+ "to": {"type": "string", "description": "Recipient email address(es), comma-separated"},
+ "subject": {"type": "string", "description": "Email subject line"},
+ "body": {"type": "string", "description": "Draft body"},
+ "cc": {"type": "string", "description": "CC address(es), comma-separated (optional)"},
+ "bcc": {"type": "string", "description": "BCC address(es), comma-separated (optional)"},
+ "title": {"type": "string", "description": "Optional Odysseus document title"},
+ **ACCOUNT_PROP,
+ },
+ "required": ["to", "subject", "body"],
+ },
+ ),
Tool(
name="reply_to_email",
description=(
- "Reply to an existing email by UID. Automatically threads the reply with "
+ "Reply to an existing email by UID. This sends immediately; for normal "
+ "assistant-written replies, prefer draft_email_reply so the user can "
+ "review and send from Odysseus. Automatically threads the reply with "
"In-Reply-To and References headers, prefixes 'Re:' on the subject, and "
"uses the original sender as the recipient. Set reply_all=true to also CC "
"the original To/Cc recipients. For follow-up 'reply ...' requests, use "
@@ -1226,6 +1641,49 @@ async def list_tools() -> list[Tool]:
"required": ["uid", "body"],
},
),
+ Tool(
+ name="draft_email_reply",
+ description=(
+ "Create an Odysseus email reply draft document for an existing email UID. "
+ "This DOES NOT send. It threads the draft with In-Reply-To/References, "
+ "prefills the recipient and subject, and stores source email metadata so "
+ "the user can review and send from the normal email composer. "
+ f"{_writing_style_guidance()}"
+ ),
+ inputSchema={
+ "type": "object",
+ "properties": {
+ "uid": {"type": "string", "description": "Exact Email UID from list_emails/read_email; never invent UID 1"},
+ "body": {"type": "string", "description": "Draft reply body text"},
+ "folder": {"type": "string", "description": "IMAP folder (default: INBOX)", "default": "INBOX"},
+ "reply_all": {"type": "boolean", "description": "Reply to all recipients (default: false)", "default": False},
+ "title": {"type": "string", "description": "Optional Odysseus document title"},
+ **ACCOUNT_PROP,
+ },
+ "required": ["uid", "body"],
+ },
+ ),
+ Tool(
+ name="ai_draft_email_reply",
+ description=(
+ "Generate an AI reply using Odysseus' existing AI Reply behavior, "
+ "including Settings > Email > Writing Style, then create an email "
+ "compose document for review. This DOES NOT send and does NOT save "
+ "to the mailbox Drafts folder. Use this when the user asks you to "
+ "write or draft a reply to an email without dictating the exact body."
+ ),
+ inputSchema={
+ "type": "object",
+ "properties": {
+ "uid": {"type": "string", "description": "Exact Email UID from list_emails/read_email; never invent UID 1"},
+ "folder": {"type": "string", "description": "IMAP folder (default: INBOX)", "default": "INBOX"},
+ "reply_all": {"type": "boolean", "description": "Reply to all recipients (default: false)", "default": False},
+ "title": {"type": "string", "description": "Optional Odysseus document title"},
+ **ACCOUNT_PROP,
+ },
+ "required": ["uid"],
+ },
+ ),
Tool(
name="archive_email",
description="Move an email out of the inbox into the Archive folder. Use after handling an email you want to keep but no longer need in the inbox.",
@@ -1552,6 +2010,31 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]:
acct_note = f" (from {result['account']})" if result.get("account") else ""
return [TextContent(type="text", text=f"Sent email to {result['to']} with subject '{result['subject']}'{acct_note}.")]
+ elif name == "draft_email":
+ to = arguments.get("to")
+ subject = arguments.get("subject")
+ body = arguments.get("body")
+ if not to or not subject or body is None:
+ return [TextContent(type="text", text="Error: to, subject, and body are required")]
+ result = _create_email_draft_document(
+ to=to,
+ subject=subject,
+ body=body,
+ title=arguments.get("title"),
+ cc=arguments.get("cc"),
+ bcc=arguments.get("bcc"),
+ account=acct,
+ )
+ acct_note = f" from {result['account']}" if result.get("account") else ""
+ return [TextContent(
+ type="text",
+ text=(
+ f"Created Odysseus email draft `{result['title']}` "
+ f"(document ID: {result['doc_id']}){acct_note}. "
+ "It has not been sent; open the document in Odysseus to review and send."
+ ),
+ )]
+
elif name == "reply_to_email":
uid = arguments.get("uid")
body = arguments.get("body")
@@ -1573,6 +2056,54 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]:
pass
return [TextContent(type="text", text=f"Replied to UID {uid}: '{result['subject']}' → {result['to']}")]
+ elif name == "draft_email_reply":
+ uid = arguments.get("uid")
+ body = arguments.get("body")
+ if not uid or body is None:
+ return [TextContent(type="text", text="Error: uid and body are required")]
+ result = _draft_reply_to_email(
+ uid=uid,
+ body=body,
+ folder=arguments.get("folder", "INBOX"),
+ reply_all=bool(arguments.get("reply_all", False)),
+ account=acct,
+ title=arguments.get("title"),
+ )
+ if "error" in result:
+ return [TextContent(type="text", text=f"Error: {result['error']}")]
+ acct_note = f" from {result['account']}" if result.get("account") else ""
+ return [TextContent(
+ type="text",
+ text=(
+ f"Created Odysseus reply draft `{result['title']}` for UID {uid} "
+ f"(document ID: {result['doc_id']}){acct_note}. "
+ "It has not been sent; open the document in Odysseus to review and send."
+ ),
+ )]
+
+ elif name == "ai_draft_email_reply":
+ uid = arguments.get("uid")
+ if not uid:
+ return [TextContent(type="text", text="Error: uid is required")]
+ result = await _ai_draft_reply_to_email(
+ uid=uid,
+ folder=arguments.get("folder", "INBOX"),
+ reply_all=bool(arguments.get("reply_all", False)),
+ account=acct,
+ title=arguments.get("title"),
+ )
+ if "error" in result:
+ return [TextContent(type="text", text=f"Error: {result['error']}")]
+ acct_note = f" from {result['account']}" if result.get("account") else ""
+ return [TextContent(
+ type="text",
+ text=(
+ f"Generated AI reply and created Odysseus compose draft "
+ f"`{result['title']}` for UID {uid} (document ID: {result['doc_id']}){acct_note}. "
+ "It has not been sent; open the document in Odysseus to review and send."
+ ),
+ )]
+
elif name == "archive_email":
uid = arguments.get("uid")
if not uid:
diff --git a/routes/api_token_routes.py b/routes/api_token_routes.py
index 97c576d15..05806e420 100644
--- a/routes/api_token_routes.py
+++ b/routes/api_token_routes.py
@@ -25,6 +25,8 @@ ALLOWED_SCOPES = {
"calendar:write",
"memory:read",
"memory:write",
+ "cookbook:read",
+ "cookbook:launch",
}
TOKEN_PROFILES = {
"chat": ["chat"],
diff --git a/routes/chat_routes.py b/routes/chat_routes.py
index a718d3fbe..39c17ec6c 100644
--- a/routes/chat_routes.py
+++ b/routes/chat_routes.py
@@ -452,14 +452,11 @@ def setup_chat_routes(
search_context = form_data.get("search_context") # pre-fetched web search results (compare mode)
compare_mode = str(form_data.get("compare_mode", "")).lower() == "true"
incognito = str(form_data.get("incognito", "")).lower() == "true"
- plan_mode = str(form_data.get("plan_mode", "")).lower() == "true"
+ # Plan mode is not part of the merge-ready UI. Ignore stale clients or
+ # manual form posts that still send plan_mode=true.
+ plan_mode = False
chat_mode = str(form_data.get("mode", "")).lower() # 'chat' or 'agent'
- # Workspace: confine the agent's file/shell tools to this folder. Validate
- # it's a real directory; ignore (no confinement) otherwise.
- workspace = (form_data.get("workspace") or "").strip()
- if workspace:
- _ws_real = os.path.realpath(os.path.expanduser(workspace))
- workspace = _ws_real if os.path.isdir(_ws_real) else ""
+ workspace = ""
# Plan mode is a modifier on agent mode — it only makes sense with tools.
if plan_mode:
chat_mode = "agent"
@@ -1138,7 +1135,7 @@ def setup_chat_routes(
tool_policy=tool_policy,
owner=_user,
fallbacks=_fallback_candidates,
- workspace=workspace or None,
+ workspace=None,
plan_mode=plan_mode,
approved_plan=approved_plan or None,
):
@@ -1270,8 +1267,7 @@ def setup_chat_routes(
# without waiting on the next streamed chunk.
#
# Normal chat/agent streams keep the DETACHED behavior below: they
- # survive the client closing the tab / navigating away (true
- # terminal-agent semantics). The SSE response just subscribes (replay
+ # survive the client closing the tab / navigating away. The SSE response just subscribes (replay
# buffered output + live); dropping the SSE only removes a subscriber —
# the run keeps going and saves the assistant message on completion
# regardless. Reconnect via /api/chat/resume.
diff --git a/routes/cookbook_helpers.py b/routes/cookbook_helpers.py
index 39a18f715..a450278be 100644
--- a/routes/cookbook_helpers.py
+++ b/routes/cookbook_helpers.py
@@ -30,8 +30,9 @@ _LOCAL_MODEL_ID_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*$")
_OLLAMA_MODEL_ID_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._:/-]{0,200}$")
# Include pattern is a glob: allow typical safe glyphs only.
_INCLUDE_RE = re.compile(r"^[A-Za-z0-9._\-*?/\[\]]+$")
-# Remote host: user@host (optionally with :port-free hostname parts).
-_REMOTE_HOST_RE = re.compile(r"^[A-Za-z0-9._-]+@[A-Za-z0-9._-]+$")
+# Remote host: either `user@host` or plain `host` (alias is allowed), where host
+# is a safe DNS-like token or a short SSH config alias.
+_REMOTE_HOST_RE = re.compile(r"^(?:[A-Za-z0-9._-]+@)?[A-Za-z0-9._-]+$")
# HF tokens and API tokens are url-safe base64-like.
_TOKEN_RE = re.compile(r"^[A-Za-z0-9._~+/=-]+$")
# Session IDs we mint look like "cookbook-deadbeef" or "serve-deadbeef".
@@ -81,7 +82,7 @@ def _validate_remote_host(v: str | None) -> str | None:
if v is None or v == "":
return None
if not _REMOTE_HOST_RE.match(v):
- raise HTTPException(400, "Invalid remote_host — must be user@host, no SSH option syntax")
+ raise HTTPException(400, "Invalid remote_host — must be host or user@host, no SSH option syntax")
return v
@@ -787,6 +788,7 @@ def _llama_cpp_rebuild_cmd() -> str:
class ModelDownloadRequest(BaseModel):
repo_id: str
+ backend: str | None = None # "hf" (default) or "ollama"
include: str | None = None # glob pattern e.g. "*Q4_K_M*"
hf_token: str | None = None
env_prefix: str | None = None # e.g. "source ~/venv/bin/activate"
diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py
index 7a1ee85c6..081638cae 100644
--- a/routes/cookbook_routes.py
+++ b/routes/cookbook_routes.py
@@ -15,26 +15,19 @@ from pathlib import Path
from fastapi import APIRouter, HTTPException, Request, Depends
from src.auth_helpers import require_user
-from src.constants import COOKBOOK_STATE_FILE
from pydantic import BaseModel
from core.middleware import require_admin
from core.platform_compat import (
IS_WINDOWS,
- SSH_PATH_OVERRIDE,
- NVIDIA_PATH_CANDIDATES,
detached_popen_kwargs,
find_bash,
- git_bash_path,
kill_process_tree,
pid_alive,
safe_chmod,
which_tool,
- translate_path,
- get_wsl_windows_user_profile,
)
from routes.shell_routes import TMUX_LOG_DIR
-from src.constants import COOKBOOK_STATE_FILE
logger = logging.getLogger(__name__)
@@ -45,10 +38,8 @@ from routes.cookbook_helpers import (
_ps_squote, _bash_squote, _validate_serve_cmd, _parse_serve_phase,
_safe_env_prefix, _local_tooling_path_export, _append_serve_preflight_exit_lines,
_append_serve_exit_code_lines, _append_llama_cpp_linux_accel_build_lines, _cached_model_scan_script,
- _append_vllm_linux_preflight_lines, _ollama_bind_from_cmd, _pip_install_fallback_chain,
- _pip_install_no_cache, _user_shell_path_bootstrap, _venv_safe_local_pip_install_cmd,
- _append_pip_install_runner_lines,
- _diagnose_serve_output, run_ssh_command_async,
+ _ollama_bind_from_cmd, _pip_install_fallback_chain, _pip_install_no_cache,
+ _user_shell_path_bootstrap, _venv_safe_local_pip_install_cmd,
ModelDownloadRequest, ServeRequest,
)
@@ -63,7 +54,7 @@ _HF_TOKEN_STATUS_SNIPPET = (
def setup_cookbook_routes() -> APIRouter:
router = APIRouter(tags=["cookbook"])
- _cookbook_state_path = Path(COOKBOOK_STATE_FILE)
+ _cookbook_state_path = Path(os.environ.get("DATA_DIR", "data")) / "cookbook_state.json"
def _mask_secret(value: str) -> str:
if not value:
@@ -90,6 +81,127 @@ def setup_cookbook_routes() -> APIRouter:
task["payload"].pop("hf_token", None)
return state
+ def _diagnose_serve_output(text: str) -> dict | None:
+ """Server-side mirror of the Cookbook UI's common serve diagnoses.
+
+ The browser uses cookbook-diagnosis.js for clickable fixes. This gives
+ the agent/tool path the same structured signal so it can retry with an
+ adjusted command instead of guessing from raw tmux output.
+ """
+ if not text:
+ return None
+ tail = text[-6000:]
+ patterns = [
+ (
+ r"No available memory for the cache blocks|Available KV cache memory:.*-",
+ "No GPU memory left for KV cache after loading model.",
+ [
+ {"label": "retry with GPU memory utilization 0.95", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.95"},
+ {"label": "retry with context 2048", "op": "replace", "flag": "--max-model-len", "value": "2048"},
+ ],
+ ),
+ (
+ r"CUDA out of memory|torch\.cuda\.OutOfMemoryError|CUDA error: out of memory|warming up sampler|max_num_seqs.*gpu_memory_utilization",
+ "GPU ran out of memory during startup or warmup.",
+ [
+ {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
+ {"label": "retry with GPU memory utilization 0.80", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.80"},
+ {"label": "retry with --enforce-eager", "op": "append", "arg": "--enforce-eager"},
+ ],
+ ),
+ (
+ r"not divisib|must be divisible|attention heads.*divisible",
+ "Tensor parallel size is incompatible with the model.",
+ [
+ {"label": "retry with tensor parallel size 1", "op": "replace", "flag": "--tensor-parallel-size", "value": "1"},
+ {"label": "retry with tensor parallel size 2", "op": "replace", "flag": "--tensor-parallel-size", "value": "2"},
+ ],
+ ),
+ (
+ r"KV cache.*too (small|large)|max_model_len.*exceeds|maximum.*context",
+ "Context length is too large for available GPU memory.",
+ [
+ {"label": "retry with context 8192", "op": "replace", "flag": "--max-model-len", "value": "8192"},
+ {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"},
+ ],
+ ),
+ (
+ r"enable-auto-tool-choice requires --tool-call-parser",
+ "Auto tool choice requires an explicit tool call parser.",
+ [{"label": "retry with Hermes tool parser", "op": "append", "arg": "--tool-call-parser hermes"}],
+ ),
+ (
+ r"Please pass.*trust.remote.code=True|contains custom code which must be executed to correctly load|does not recognize this architecture|model type.*but Transformers does not",
+ "Model requires custom code or newer model support.",
+ [{"label": "retry with --trust-remote-code", "op": "append", "arg": "--trust-remote-code"}],
+ ),
+ (
+ r"Either a revision or a version must be specified|transformers\.integrations\.hub_kernels|kernels/layer",
+ "vLLM/Transformers kernel package mismatch.",
+ [{"label": "update vLLM, Transformers, and kernels on this server", "op": "dependency", "package": "vllm transformers kernels"}],
+ ),
+ (
+ r"Address already in use|bind.*address.*in use",
+ "Port is already in use.",
+ [{"label": "retry on port 8001", "op": "replace", "flag": "--port", "value": "8001"}],
+ ),
+ (
+ r"No CUDA GPUs are available|no GPU.*found|CUDA_VISIBLE_DEVICES.*invalid",
+ "No GPUs are visible to the serve process.",
+ [{"label": "clear Cookbook GPU selection or choose available GPUs", "op": "settings", "field": "gpus", "value": ""}],
+ ),
+ (
+ r"Failed to infer device type|NVML Shared Library Not Found|No module named 'amdsmi'|platform is not available",
+ "vLLM could not find a supported GPU (CUDA or ROCm). "
+ "This machine may have integrated or unsupported graphics only.",
+ [
+ {"label": "switch to llama.cpp (CPU/Metal, works without a discrete GPU)", "op": "manual"},
+ {"label": "switch to Ollama (CPU/Metal, works without a discrete GPU)", "op": "manual"},
+ ],
+ ),
+ (
+ r"vllm.*command not found|No module named vllm|ERROR: vLLM is not installed",
+ "vLLM is not installed or not in PATH on this server.",
+ [{"label": "install vLLM in Cookbook Dependencies", "op": "dependency", "package": "vllm"}],
+ ),
+ (
+ r"sglang.*command not found|No module named sglang|SGLang is not installed",
+ "SGLang is not installed or not in PATH on this server.",
+ [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}],
+ ),
+ (
+ r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found",
+ "llama.cpp / llama-cpp-python dependencies are missing.",
+ [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}],
+ ),
+ (
+ r"No GGUF found on this host|no \.gguf file|No GGUF file found",
+ "No GGUF file found for this model on this host. The llama.cpp backend needs a .gguf file.",
+ [{"label": "download a GGUF build of this model (repo name usually ends in -GGUF, file like Q4_K_M.gguf)", "op": "manual"}],
+ ),
+ (
+ r"No module named 'torch'|No module named torch|No module named 'diffusers'|No module named diffusers",
+ "Diffusion serving requires PyTorch and diffusers.",
+ [{"label": "install diffusers[torch] in Cookbook Dependencies", "op": "dependency", "package": "diffusers[torch]"}],
+ ),
+ (
+ r"403 Forbidden|401 Unauthorized|Access to model.*is restricted|gated repo|not in the authorized list|awaiting a review",
+ "Model access is gated or unauthorized.",
+ [{"label": "set HF token and request model access on HuggingFace", "op": "manual"}],
+ ),
+ ]
+ for pattern, message, suggestions in patterns:
+ if re.search(pattern, tail, re.I):
+ return {"message": message, "suggestions": suggestions}
+ if re.search(r"Traceback \(most recent call last\)", tail, re.I) and not re.search(
+ r"Application startup complete|GET /v1/|Uvicorn running on", tail, re.I
+ ):
+ return {
+ "message": "Python traceback detected during serve startup.",
+ "suggestions": [{"label": "inspect traceback and retry with adjusted backend/settings", "op": "manual"}],
+ }
+ return None
+
def _state_for_client(state):
"""Return cookbook state without raw secrets for browser clients."""
_strip_task_secrets(state)
@@ -183,7 +295,6 @@ def setup_cookbook_routes() -> APIRouter:
safe_chmod(key_path.with_suffix(".pub"), 0o644)
return {"ok": True, "public_key": _read_cookbook_public_key()}
-
def _needs_binary(cmd: str, binary: str) -> bool:
return bool(re.search(rf"(^|[\s;&|()]){re.escape(binary)}($|[\s;&|()])", cmd or ""))
@@ -244,8 +355,8 @@ def setup_cookbook_routes() -> APIRouter:
# POSIX form + shell-quoting so drive paths / spaces survive.
inner = TMUX_LOG_DIR / f"{session_id}_run.sh"
inner.write_text("\n".join(bash_lines) + "\n", encoding="utf-8")
- lp = shlex.quote(git_bash_path(log_path))
- ip = shlex.quote(git_bash_path(inner))
+ lp = shlex.quote(log_path.as_posix())
+ ip = shlex.quote(inner.as_posix())
script_path = TMUX_LOG_DIR / f"{session_id}.sh"
script_path.write_text(
f"bash {ip} > {lp} 2>&1\n",
@@ -286,24 +397,33 @@ def setup_cookbook_routes() -> APIRouter:
require_admin(request)
# Defence-in-depth: even though this endpoint is admin-gated, refuse
# values that would land in shell contexts with metacharacters.
- _validate_repo_id(req.repo_id)
- _validate_include(req.include)
+ backend = (req.backend or "").strip().lower()
+ is_ollama_download = backend == "ollama" or ("/" not in req.repo_id and ":" in req.repo_id)
+ if is_ollama_download:
+ _validate_serve_model_id(req.repo_id)
+ req.include = None
+ req.local_dir = None
+ else:
+ _validate_repo_id(req.repo_id)
+ _validate_include(req.include)
_validate_remote_host(req.remote_host)
req.ssh_port = _validate_ssh_port(req.ssh_port)
req.local_dir = _validate_local_dir(req.local_dir)
- req.hf_token = req.hf_token or _load_stored_hf_token()
+ req.hf_token = "" if is_ollama_download else (req.hf_token or _load_stored_hf_token())
_validate_token(req.hf_token)
TMUX_LOG_DIR.mkdir(parents=True, exist_ok=True)
session_id = f"cookbook-{uuid.uuid4().hex[:8]}"
wrapper_script = TMUX_LOG_DIR / f"{session_id}.sh"
- # When a download directory is set, target a per-model subfolder under it
- # (
/) so the flat-directory cache scan lists it as its own
- # model. Without it, hf/snapshot_download falls back to the HF cache.
- _dl_short = req.repo_id.split("/")[-1] if "/" in req.repo_id else req.repo_id
- _dl_base = (req.local_dir.rstrip("/") + "/" + _dl_short) if req.local_dir else None
- _dl_shell = _shell_path(_dl_base) if _dl_base else None # for hf CLI / bash
- _dl_pyarg = (", local_dir=os.path.expanduser(" + repr(_dl_base) + ")") if _dl_base else ""
+ # Custom download dir: point the HF cache at /hub via env vars
+ # (HF_HOME + HUGGINGFACE_HUB_CACHE) instead of --local-dir. local_dir
+ # produces a flat layout (//) and the local-dir
+ # bookkeeping files (.cache/huggingface/.gitignore.lock), and it
+ # also breaks robust resume on flaky transfers — the blob-based hub
+ # cache survives SSL ReadError mid-stream by reusing .incomplete,
+ # local_dir does not. See issue #2722.
+ _dl_hf_home_shell = _shell_path(req.local_dir.rstrip("/")) if req.local_dir else None
+ _dl_pyarg = "" # snapshot_download honors the env vars too — no kwarg needed
# Build the hf download command. Redirection to suppress the interactive
# "update available? [Y/n]" prompt is added per-platform further down
@@ -311,8 +431,7 @@ def setup_cookbook_routes() -> APIRouter:
hf_cmd = f"hf download {req.repo_id}"
if req.include:
hf_cmd += f" --include '{req.include}'"
- if _dl_shell:
- hf_cmd += f" --local-dir {_dl_shell}"
+ ollama_cmd = f"ollama pull {shlex.quote(req.repo_id)}"
# Build the shell wrapper — runs hf download directly in tmux (which is a TTY)
# No script/tee needed — we'll use tmux capture-pane to read output
@@ -320,8 +439,15 @@ def setup_cookbook_routes() -> APIRouter:
lines.extend(_user_shell_path_bootstrap())
if req.hf_token:
lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'")
+ if _dl_hf_home_shell and not is_ollama_download:
+ # Make hf download / snapshot_download honor the chosen dir via the
+ # standard HF cache (gives us the models--org--name/blobs/... layout
+ # with resumable .incomplete blobs).
+ lines.append(f"export HF_HOME={_dl_hf_home_shell}")
+ lines.append(f"export HUGGINGFACE_HUB_CACHE={_dl_hf_home_shell}/hub")
+ lines.append(f"export HF_HUB_CACHE={_dl_hf_home_shell}/hub")
# Ensure pip-user scripts (e.g. hf CLI installed via --user) are on PATH
- lines.append('export PATH="$HOME/.local/bin:$PATH"')
+ lines.append('export PATH="$HOME/.local/bin:$HOME/bin:/opt/homebrew/bin:/usr/local/bin:$PATH"')
# When Odysseus runs from a venv (e.g. native macOS install), put its bin
# on PATH so the tmux shell finds the bundled `hf`/`python3` without an
# activated venv. Local bash runs only — meaningless over SSH.
@@ -332,14 +458,25 @@ def setup_cookbook_routes() -> APIRouter:
# throughput. Retries set disable_hf_transfer to fall back to the plain,
# slower-but-reliable downloader (resumes cleanly from the .incomplete files).
# Use `python3 -m pip` not `pip` — macOS has no bare `pip` command.
- lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', upgrade=True)}")
- if req.disable_hf_transfer:
- lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
- lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
+ if is_ollama_download:
+ lines.append('if command -v ollama >/dev/null 2>&1; then')
+ lines.append(f' ODYSSEUS_OLLAMA_PULL_CMD={shlex.quote(ollama_cmd)}')
+ lines.append('elif command -v docker >/dev/null 2>&1; then')
+ lines.append(' ODYSSEUS_OLLAMA_CONTAINER="$(docker ps --format \'{{.Names}}\' 2>/dev/null | grep -E \'^(ollama-rocm|ollama-test)$\' | head -1)"')
+ lines.append(' if [ -n "$ODYSSEUS_OLLAMA_CONTAINER" ]; then')
+ lines.append(f' ODYSSEUS_OLLAMA_PULL_CMD={shlex.quote("docker exec ${ODYSSEUS_OLLAMA_CONTAINER} " + ollama_cmd)}')
+ lines.append(' fi')
+ lines.append('fi')
+ lines.append('if [ -z "$ODYSSEUS_OLLAMA_PULL_CMD" ]; then echo "ERROR: Ollama not found on this server. Install Ollama or start an ollama-rocm/ollama-test container."; exit 127; fi')
else:
- lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer')}")
- lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
- lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
+ lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', upgrade=True)}")
+ if req.disable_hf_transfer:
+ lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
+ lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
+ else:
+ lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer')}")
+ lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
+ lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
remote = req.remote_host # None for local
is_windows = req.platform == "windows"
@@ -361,37 +498,48 @@ def setup_cookbook_routes() -> APIRouter:
ps_lines = []
ps_lines.append('$sessionDir = "$env:TEMP\\odysseus-sessions"')
ps_lines.append('New-Item -ItemType Directory -Force -Path $sessionDir | Out-Null')
- ps_lines.append('$env:PYTHONIOENCODING = "utf-8"')
- ps_lines.append('$env:PYTHONUTF8 = "1"')
if req.hf_token:
ps_lines.append(f"$env:HF_TOKEN = '{_ps_squote(req.hf_token)}'")
+ if req.local_dir and not is_ollama_download:
+ # Mirror the bash branch — point the HF cache at the user's dir
+ # via env vars instead of --local-dir, so resume works on flaky
+ # transfers (issue #2722).
+ _dl_ps = _ps_squote(req.local_dir.rstrip("/"))
+ ps_lines.append(f"$env:HF_HOME = '{_dl_ps}'")
+ ps_lines.append(f"$env:HUGGINGFACE_HUB_CACHE = '{_dl_ps}/hub'")
+ ps_lines.append(f"$env:HF_HUB_CACHE = '{_dl_ps}/hub'")
if req.env_prefix:
ps_lines.append(_safe_env_prefix(req.env_prefix))
- # Try hf CLI, fall back to Python huggingface_hub, then auto-install
- ps_lines.append('try {{')
- ps_lines.append(' $hfPath = Get-Command hf -ErrorAction SilentlyContinue')
- ps_lines.append(' if ($hfPath) {{')
- # Pipe $null to stdin to suppress interactive "update available? [Y/n]" prompt
- ps_lines.append(f' $null | {hf_cmd}')
- ps_lines.append(' }} else {{')
- ps_lines.append(' python -c "import huggingface_hub" 2>$null')
- ps_lines.append(' if ($LASTEXITCODE -eq 0) {{')
- ps_lines.append(' Write-Host "hf CLI not found, using Python huggingface_hub..."')
- ps_lines.append(' python -m pip install -q hf_transfer 2>$null')
- ps_lines.append(' $env:HF_HUB_ENABLE_HF_TRANSFER = "1"')
- ps_lines.append(f" python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers=8)\"")
- ps_lines.append(' }} else {{')
- ps_lines.append(' Write-Host "Installing huggingface-hub..."')
- ps_lines.append(' python -m pip install -q huggingface-hub hf_transfer')
- ps_lines.append(' $env:HF_HUB_ENABLE_HF_TRANSFER = "1"')
- ps_lines.append(f" python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers=8)\"")
- ps_lines.append(' }}')
- ps_lines.append(' }}')
- ps_lines.append(' if ($LASTEXITCODE -eq 0) {{ Write-Host ""; Write-Host "DOWNLOAD_OK" }}')
- ps_lines.append(' else {{ Write-Host ""; Write-Host "DOWNLOAD_FAILED (exit $LASTEXITCODE)" }}')
- ps_lines.append('}} catch {{')
- ps_lines.append(' Write-Host ""; Write-Host "DOWNLOAD_FAILED ($_)"')
- ps_lines.append('}}')
+ if is_ollama_download:
+ ps_lines.append('if (-not (Get-Command ollama -ErrorAction SilentlyContinue)) { Write-Host "ERROR: Ollama not found. Install from https://ollama.com/download/windows"; exit 127 }')
+ ps_lines.append(f"$null | ollama pull '{_ps_squote(req.repo_id)}'")
+ ps_lines.append('if ($LASTEXITCODE -eq 0) { Write-Host ""; Write-Host "DOWNLOAD_OK" } else { Write-Host ""; Write-Host "DOWNLOAD_FAILED (exit $LASTEXITCODE)" }')
+ else:
+ # Try hf CLI, fall back to Python huggingface_hub, then auto-install
+ ps_lines.append('try {{')
+ ps_lines.append(' $hfPath = Get-Command hf -ErrorAction SilentlyContinue')
+ ps_lines.append(' if ($hfPath) {{')
+ # Pipe $null to stdin to suppress interactive "update available? [Y/n]" prompt
+ ps_lines.append(f' $null | {hf_cmd}')
+ ps_lines.append(' }} else {{')
+ ps_lines.append(' python -c "import huggingface_hub" 2>$null')
+ ps_lines.append(' if ($LASTEXITCODE -eq 0) {{')
+ ps_lines.append(' Write-Host "hf CLI not found, using Python huggingface_hub..."')
+ ps_lines.append(' python -m pip install -q hf_transfer 2>$null')
+ ps_lines.append(' $env:HF_HUB_ENABLE_HF_TRANSFER = "1"')
+ ps_lines.append(f" python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers=8)\"")
+ ps_lines.append(' }} else {{')
+ ps_lines.append(' Write-Host "Installing huggingface-hub..."')
+ ps_lines.append(' python -m pip install -q huggingface-hub hf_transfer')
+ ps_lines.append(' $env:HF_HUB_ENABLE_HF_TRANSFER = "1"')
+ ps_lines.append(f" python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers=8)\"")
+ ps_lines.append(' }}')
+ ps_lines.append(' }}')
+ ps_lines.append(' if ($LASTEXITCODE -eq 0) {{ Write-Host ""; Write-Host "DOWNLOAD_OK" }}')
+ ps_lines.append(' else {{ Write-Host ""; Write-Host "DOWNLOAD_FAILED (exit $LASTEXITCODE)" }}')
+ ps_lines.append('}} catch {{')
+ ps_lines.append(' Write-Host ""; Write-Host "DOWNLOAD_FAILED ($_)"')
+ ps_lines.append('}}')
ps_lines.append(f'Remove-Item -Force "$HOME\\{remote_runner}" -ErrorAction SilentlyContinue')
runner_path = TMUX_LOG_DIR / f"{session_id}_run.ps1"
runner_path.write_text("\r\n".join(ps_lines) + "\r\n", encoding="utf-8")
@@ -422,6 +570,10 @@ def setup_cookbook_routes() -> APIRouter:
runner_lines.append("deactivate 2>/dev/null; hash -r")
if req.hf_token:
runner_lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'")
+ if _dl_hf_home_shell and not is_ollama_download:
+ runner_lines.append(f"export HF_HOME={_dl_hf_home_shell}")
+ runner_lines.append(f"export HUGGINGFACE_HUB_CACHE={_dl_hf_home_shell}/hub")
+ runner_lines.append(f"export HF_HUB_CACHE={_dl_hf_home_shell}/hub")
if req.env_prefix:
runner_lines.append(_safe_env_prefix(req.env_prefix))
else:
@@ -432,42 +584,67 @@ def setup_cookbook_routes() -> APIRouter:
'done'
)
# Ensure pip-user scripts (e.g. hf CLI installed via --user) are on PATH
- runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
+ runner_lines.append('export PATH="$HOME/.local/bin:$HOME/bin:/opt/homebrew/bin:/usr/local/bin:$PATH"')
# Install hf CLI + optional hf_transfer best-effort. Retries disable
# hf_transfer because the Rust parallel path is fast but has been
# flaky near the end of very large multi-file downloads.
- # The helper tries active pip first, then guarded user-site fallbacks.
- runner_lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', python_cmd='pip', upgrade=True)}")
- if req.disable_hf_transfer:
- runner_lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
- runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
+ # Use --break-system-packages on PEP-668 systems (Arch, newer Debian) so it doesn't bail.
+ if is_ollama_download:
+ runner_lines.append('if command -v ollama >/dev/null 2>&1; then')
+ runner_lines.append(f' ODYSSEUS_OLLAMA_PULL_CMD={shlex.quote(ollama_cmd)}')
+ runner_lines.append('elif command -v docker >/dev/null 2>&1; then')
+ runner_lines.append(' ODYSSEUS_OLLAMA_CONTAINER="$(docker ps --format \'{{.Names}}\' 2>/dev/null | grep -E \'^(ollama-rocm|ollama-test)$\' | head -1)"')
+ runner_lines.append(' if [ -n "$ODYSSEUS_OLLAMA_CONTAINER" ]; then')
+ runner_lines.append(f' ODYSSEUS_OLLAMA_PULL_CMD={shlex.quote("docker exec ${ODYSSEUS_OLLAMA_CONTAINER} " + ollama_cmd)}')
+ runner_lines.append(' fi')
+ runner_lines.append('fi')
+ runner_lines.append('if [ -z "$ODYSSEUS_OLLAMA_PULL_CMD" ]; then echo "ERROR: Ollama not found on this server. Install Ollama or start an ollama-rocm/ollama-test container."; exit 127; fi')
else:
- runner_lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer', python_cmd='pip')}")
- runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
- runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
- # Surface whether the HF token actually reached THIS server, so a gated
- # download's "not authorized" failure can be told apart from a missing
- # token (the token is masked — we only print applied / not-set).
- runner_lines.append(_HF_TOKEN_STATUS_SNIPPET)
- # Try hf CLI first, fall back to Python huggingface_hub, then auto-install
- runner_lines.append('if command -v hf &>/dev/null; then')
- # < /dev/null suppresses interactive "update available? [Y/n]" prompt
- runner_lines.append(f' {hf_cmd} < /dev/null')
- runner_lines.append('elif python3 -c "import huggingface_hub" 2>/dev/null; then')
- runner_lines.append(' echo "hf CLI not found, using Python huggingface_hub..."')
- runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={4 if req.disable_hf_transfer else 8})"')
- runner_lines.append('else')
- runner_lines.append(' echo "Installing huggingface-hub and dependencies..."')
- runner_lines.append(' pip install --no-deps -q huggingface-hub 2>/dev/null')
- if req.disable_hf_transfer:
- runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests 2>/dev/null')
- runner_lines.append(' export HF_HUB_ENABLE_HF_TRANSFER=0')
+ runner_lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', python_cmd='pip', upgrade=True)}")
+ if req.disable_hf_transfer:
+ runner_lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0")
+ runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4")
+ else:
+ runner_lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer', python_cmd='pip')}")
+ runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
+ runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8")
+ # Surface whether the HF token actually reached THIS server, so a gated
+ # download's "not authorized" failure can be told apart from a missing
+ # token (the token is masked — we only print applied / not-set).
+ runner_lines.append(_HF_TOKEN_STATUS_SNIPPET)
+ # Wrap the download in a retry loop. Large HF/Ollama transfers can
+ # hit transient network failures; both backends resume cached partials.
+ mw = 4 if req.disable_hf_transfer else 8
+ runner_lines.append('_max_retries=10; _attempt=0; _ec=0')
+ runner_lines.append('while [ $_attempt -lt $_max_retries ]; do')
+ runner_lines.append(' _attempt=$((_attempt+1))')
+ if is_ollama_download:
+ runner_lines.append(' eval "$ODYSSEUS_OLLAMA_PULL_CMD" < /dev/null')
else:
- runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests hf_transfer 2>/dev/null')
- runner_lines.append(" python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
- runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={4 if req.disable_hf_transfer else 8})"')
- runner_lines.append('fi')
- runner_lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
+ runner_lines.append(' if command -v hf &>/dev/null; then')
+ runner_lines.append(f' {hf_cmd} < /dev/null')
+ runner_lines.append(' elif python3 -c "import huggingface_hub" 2>/dev/null; then')
+ runner_lines.append(' [ $_attempt -eq 1 ] && echo "hf CLI not found, using Python huggingface_hub..."')
+ runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={mw})"')
+ runner_lines.append(' else')
+ runner_lines.append(' echo "Installing huggingface-hub and dependencies..."')
+ runner_lines.append(' pip install --no-deps -q huggingface-hub 2>/dev/null')
+ if req.disable_hf_transfer:
+ runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests 2>/dev/null')
+ runner_lines.append(' export HF_HUB_ENABLE_HF_TRANSFER=0')
+ else:
+ runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests hf_transfer 2>/dev/null')
+ runner_lines.append(" python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1")
+ runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={mw})"')
+ runner_lines.append(' fi')
+ runner_lines.append(' _ec=$?')
+ runner_lines.append(' if [ $_ec -eq 0 ]; then break; fi')
+ runner_lines.append(' if [ $_attempt -lt $_max_retries ]; then')
+ runner_lines.append(' echo ""; echo "Download attempt $_attempt failed (exit $_ec) — retrying in 30s..."')
+ runner_lines.append(' sleep 30')
+ runner_lines.append(' fi')
+ runner_lines.append('done')
+ runner_lines.append('if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec after $_attempt attempts)"; fi')
runner_lines.append(f"rm -f {remote_runner}")
runner_lines.append('exec "${SHELL:-/bin/bash}"')
runner_path = TMUX_LOG_DIR / f"{session_id}_run.sh"
@@ -493,23 +670,30 @@ def setup_cookbook_routes() -> APIRouter:
lines.append("deactivate 2>/dev/null; hash -r")
# Show whether the HF token reached this run (masked) — tells a gated
# "not authorized" failure apart from a missing token.
- lines.append(_HF_TOKEN_STATUS_SNIPPET)
- if IS_WINDOWS:
- # Detached path: no controlling TTY, so skip `< /dev/null`
- # (handled by Popen stdin=DEVNULL) and don't keep a shell open.
- lines.append(hf_cmd)
- lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
- else:
- # < /dev/null suppresses interactive "update available? [Y/n]" prompt
- lines.append(f"{hf_cmd} < /dev/null")
- lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi')
+ if not is_ollama_download:
+ lines.append(_HF_TOKEN_STATUS_SNIPPET)
+ # Retry loop — same rationale as the remote-bash path. Issue #2722.
+ _hf_invoke = 'eval "$ODYSSEUS_OLLAMA_PULL_CMD" < /dev/null' if is_ollama_download else (hf_cmd if IS_WINDOWS else f"{hf_cmd} < /dev/null")
+ lines.append('_max_retries=10; _attempt=0; _ec=0')
+ lines.append('while [ $_attempt -lt $_max_retries ]; do')
+ lines.append(' _attempt=$((_attempt+1))')
+ lines.append(f' {_hf_invoke}')
+ lines.append(' _ec=$?')
+ lines.append(' if [ $_ec -eq 0 ]; then break; fi')
+ lines.append(' if [ $_attempt -lt $_max_retries ]; then')
+ lines.append(' echo ""; echo "Download attempt $_attempt failed (exit $_ec) — retrying in 30s..."')
+ lines.append(' sleep 30')
+ lines.append(' fi')
+ lines.append('done')
+ lines.append('if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec after $_attempt attempts)"; fi')
+ if not IS_WINDOWS:
lines.append(f"rm -f '{wrapper_script}'")
lines.append('exec "${SHELL:-/bin/bash}"')
wrapper_script.write_text("\n".join(lines) + "\n", encoding="utf-8")
wrapper_script.chmod(0o755)
setup_cmd = None if IS_WINDOWS else f"tmux new-session -d -s {session_id} {shlex.quote(str(wrapper_script))}"
- logger.info(f"Model download: {req.repo_id} (include={req.include}, session={session_id}, remote={remote})")
+ logger.info(f"Model download: {req.repo_id} (backend={'ollama' if is_ollama_download else 'hf'}, include={req.include}, session={session_id}, remote={remote})")
logger.info(f"Download setup_cmd: {setup_cmd}")
if setup_cmd is None:
@@ -564,35 +748,24 @@ def setup_cookbook_routes() -> APIRouter:
for d in model_dir.split(','):
d = d.strip()
if d:
- translated_d = translate_path(d) if not host else d
- model_dirs.append(translated_d)
- win_hf_hub = None
- if not host:
- win_profile = get_wsl_windows_user_profile()
- win_hf_hub = os.path.join(win_profile, ".cache", "huggingface", "hub") if win_profile else None
-
- paths_code = _cached_model_scan_script(model_dirs, win_hf_hub)
+ model_dirs.append(d)
+ paths_code = _cached_model_scan_script(model_dirs)
scan_py = TMUX_LOG_DIR / "scan_cache.py"
scan_py.write_text(paths_code, encoding="utf-8")
- scan_payload = scan_py.read_bytes()
if host:
+ _pf = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else ""
if platform == "windows":
- remote_cmd = "python -"
+ # Windows: use 'python' and pipe via stdin with double-quote wrapping
+ cmd = f'ssh {_pf}{host} "python -" < \'{scan_py}\''
else:
- # POSIX: use 'python3' if available, fall back to 'python'; throw if neither is found.
- remote_cmd = (
- "if command -v python3 >/dev/null 2>&1; then python3 -; "
- "elif command -v python >/dev/null 2>&1; then python -; "
- "else echo \"python3/python not found\" >&2; exit 127; fi"
- )
- rc, stdout_b, stderr_b = await run_ssh_command_async(
- host,
- ssh_port,
- remote_cmd,
- timeout=60,
- stdin_data=scan_payload,
+ cmd = f"ssh {_pf}{host} 'python3 -' < '{scan_py}'"
+ proc = await asyncio.create_subprocess_shell(
+ cmd,
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.PIPE,
+ cwd=str(Path.home()),
)
else:
# LOCAL scan: use sys.executable (the venv Python Odysseus is already
@@ -612,7 +785,7 @@ def setup_cookbook_routes() -> APIRouter:
stderr=asyncio.subprocess.PIPE,
cwd=str(Path.home()),
)
- stdout_b, stderr_b = await asyncio.wait_for(proc.communicate(), timeout=60)
+ stdout_b, stderr_b = await asyncio.wait_for(proc.communicate(), timeout=60)
models = []
try:
@@ -752,6 +925,100 @@ def setup_cookbook_routes() -> APIRouter:
return p
return None
+ async def _serve_crash_watchdog(
+ endpoint_id: str,
+ session_id: str,
+ remote: str | None,
+ ssh_port: str | None,
+ is_windows: bool,
+ ) -> None:
+ """Drop a freshly-registered endpoint when the cookbook serve dies early.
+
+ The runner script always emits ``=== Process exited with code N ===``
+ when the launched cmd terminates (success or failure). We poll the
+ tmux pane periodically; on a non-zero exit detected within the watch
+ window, the endpoint row is deleted so the picker doesn't keep a
+ dead model around. A zero exit (rare for a long-running serve, but
+ possible for fast-failing builds that the runner reports as code 0)
+ and "missing exit marker" both leave the endpoint alone — that's
+ the loading-but-not-yet-bound state, which the probe-marks-offline
+ logic already handles.
+
+ Times are picked to outlast realistic vLLM load times (Qwen3.5-122B
+ takes ~3 min to load) without burning resources on a stuck-forever
+ wait. After the last check, the watchdog gives up — the picker's
+ per-endpoint probe takes over from there.
+ """
+ # Cumulative wait points: 25 s, 60 s, 2 min, 5 min.
+ _waits = [25, 35, 60, 180]
+ # Tmux capture-pane equivalent of the polling path used elsewhere in
+ # this file. Build it once and reuse on each tick. Skip the watchdog
+ # entirely on native-Windows local runs (no tmux). The Windows
+ # detached-process path writes its log to a known file and has its
+ # own lifecycle tracking; punting here keeps the code simple.
+ local_win = is_windows and not remote
+ if local_win:
+ return
+ if remote:
+ ssh_args = ["ssh"]
+ if ssh_port and ssh_port != "22":
+ ssh_args.extend(["-p", str(ssh_port)])
+ capture_cmd = ssh_args + [remote, "tmux", "capture-pane", "-t", session_id, "-p", "-S", "-200"]
+ else:
+ capture_cmd = ["tmux", "capture-pane", "-t", session_id, "-p", "-S", "-200"]
+
+ _exit_re = re.compile(r"=== Process exited with code (-?\d+) ===")
+ for wait_s in _waits:
+ await asyncio.sleep(wait_s)
+ try:
+ proc = await asyncio.create_subprocess_exec(
+ *capture_cmd,
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.DEVNULL,
+ )
+ stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=8)
+ output = stdout.decode("utf-8", errors="replace")
+ except Exception as e:
+ logger.debug(f"crash-watchdog: capture-pane failed (will retry): {e!r}")
+ continue
+ # Last occurrence wins — a serve that exits/restarts under the
+ # runner's "exec bash -i" trail will emit multiple markers; the
+ # most-recent code is the one that matters.
+ matches = list(_exit_re.finditer(output))
+ if not matches:
+ continue
+ try:
+ exit_code = int(matches[-1].group(1))
+ except (ValueError, IndexError):
+ continue
+ if exit_code == 0:
+ # Exit 0 on a long-running serve is unusual (a normal "loaded
+ # then ready" path keeps the process alive) but it happens for
+ # commands like "ollama pull" the user might launch through
+ # the same form. Don't drop the endpoint on a clean exit;
+ # let the probe layer mark it offline if nothing's listening.
+ logger.info(f"crash-watchdog: serve {session_id} exited cleanly (0); leaving endpoint {endpoint_id}")
+ return
+ # Non-zero exit — drop the endpoint.
+ try:
+ from core.database import SessionLocal as _SL, ModelEndpoint as _ME
+ db = _SL()
+ try:
+ ep = db.query(_ME).filter(_ME.id == endpoint_id).first()
+ if ep:
+ logger.info(
+ f"crash-watchdog: dropping endpoint {endpoint_id} "
+ f"({ep.name} @ {ep.base_url}) — serve exited {exit_code}"
+ )
+ db.delete(ep)
+ db.commit()
+ finally:
+ db.close()
+ except Exception as e:
+ logger.warning(f"crash-watchdog: endpoint cleanup failed: {e!r}")
+ return
+ logger.debug(f"crash-watchdog: no exit marker for {session_id} within window; leaving endpoint {endpoint_id}")
+
def _auto_register_llm_endpoint(req: ServeRequest, remote: str | None) -> str | None:
"""Register a freshly-served LLM as a model endpoint so it appears in the
model picker without a manual /setup step — the text-model sibling of
@@ -763,6 +1030,10 @@ def setup_cookbook_routes() -> APIRouter:
probing /v1/models and dims the endpoint until the server is reachable,
so registering immediately (before the server finishes loading) is safe.
"""
+ logger.info(
+ f"_auto_register_llm_endpoint: ENTRY repo_id={req.repo_id!r} "
+ f"remote={remote!r} cmd_prefix={req.cmd[:80]!r}"
+ )
import re
from core.database import SessionLocal, ModelEndpoint
@@ -787,16 +1058,20 @@ def setup_cookbook_routes() -> APIRouter:
else:
port = 8080 # llama.cpp's llama-server default — the Apple Silicon path
- # Determine host (mirrors the image path: SSH alias for remote serves).
- # For local serves while Odysseus runs inside Docker, "localhost"
- # resolves to the container itself — useless. Use host.docker.internal
- # which compose maps to the actual host, matching what /setup adds
- # for Ollama by hand.
+ # Determine host. The cookbook tmux for `local=true` serves runs INSIDE
+ # the odysseus container — so the right URL for the in-container
+ # backend to reach it is `localhost`, NOT `host.docker.internal`
+ # (the latter points at the docker HOST, which doesn't have a server
+ # on that port). The previous host.docker.internal fallback only made
+ # sense for /setup-added external services like systemd Ollama on the
+ # host — and those go through manual setup, not this auto-register
+ # code path. For remote serves we still use the SSH host alias.
if remote:
host = remote.split("@")[-1] if "@" in remote else remote
+ elif re.search(r"\bdocker\s+exec\s+(?:ollama-rocm|ollama-test)\b", req.cmd or ""):
+ host = "host.docker.internal"
else:
- from routes.model_routes import _docker_host_gateway_reachable
- host = "host.docker.internal" if _docker_host_gateway_reachable() else "localhost"
+ host = "localhost"
base_url = f"http://{host}:{port}/v1"
@@ -805,7 +1080,9 @@ def setup_cookbook_routes() -> APIRouter:
# If the serve command opts models into OpenAI tool-calling, record it so
# agent_loop trusts emitted tool_calls instead of the name heuristic.
+ is_ollama_endpoint = "ollama" in (req.cmd or "").lower()
supports_tools = True if "--enable-auto-tool-choice" in req.cmd else None
+ pinned_models = [req.repo_id] if is_ollama_endpoint and req.repo_id else []
db = SessionLocal()
try:
@@ -815,14 +1092,43 @@ def setup_cookbook_routes() -> APIRouter:
existing.is_enabled = True
existing.model_type = "llm"
existing.name = display_name
+ if is_ollama_endpoint:
+ existing.endpoint_kind = "ollama"
+ if pinned_models:
+ existing.cached_models = json.dumps(pinned_models)
+ existing.pinned_models = json.dumps(pinned_models)
if supports_tools is not None:
existing.supports_tools = supports_tools
- # Wipe stale model lists so the picker re-probes and discovers
- # the newly-served model instead of showing the old one.
- existing.cached_models = None
- existing.hidden_models = None
db.commit()
logger.info(f"Updated existing local model endpoint: {base_url}")
+ # Re-probe so cached_models matches what the server actually
+ # serves right now (the URL may have stayed the same but the
+ # model behind it changed across launches).
+ try:
+ from routes.model_routes import _probe_endpoint
+ import json as _json2
+ probed = _probe_endpoint(base_url, existing.api_key, timeout=5)
+ if probed:
+ existing.cached_models = _json2.dumps(probed)
+ db.commit()
+ except Exception as _pe:
+ logger.warning(f"Re-probe failed for {base_url}: {_pe!r}")
+ # Sweep stale dupes: other endpoints with the same display name
+ # at DIFFERENT URLs (likely failed earlier-attempt ports) get
+ # deleted so the picker doesn't show an offline ghost next to
+ # the working one. Only sweeps endpoints whose id starts with
+ # `local-` so we never touch a user's hand-added DeepSeek/OpenAI/
+ # etc. entry with a coincidentally matching name.
+ stale = (db.query(ModelEndpoint)
+ .filter(ModelEndpoint.name == display_name)
+ .filter(ModelEndpoint.base_url != base_url)
+ .filter(ModelEndpoint.id.like("local-%"))
+ .all())
+ for s in stale:
+ logger.info(f"Sweeping stale local endpoint {s.id} ({s.base_url})")
+ db.delete(s)
+ if stale:
+ db.commit()
return existing.id
ep_id = f"local-{uuid.uuid4().hex[:8]}"
@@ -833,11 +1139,42 @@ def setup_cookbook_routes() -> APIRouter:
api_key=None,
is_enabled=True,
model_type="llm",
+ endpoint_kind="ollama" if is_ollama_endpoint else "auto",
+ cached_models=json.dumps(pinned_models) if pinned_models else None,
+ pinned_models=json.dumps(pinned_models) if pinned_models else None,
supports_tools=supports_tools,
)
db.add(ep)
db.commit()
logger.info(f"Auto-registered local model endpoint: {display_name} @ {base_url}")
+ # Same sweep on first-register path: drop any pre-existing local-*
+ # endpoints with this display name pointed elsewhere.
+ stale = (db.query(ModelEndpoint)
+ .filter(ModelEndpoint.name == display_name)
+ .filter(ModelEndpoint.id != ep_id)
+ .filter(ModelEndpoint.id.like("local-%"))
+ .all())
+ for s in stale:
+ logger.info(f"Sweeping stale local endpoint {s.id} ({s.base_url})")
+ db.delete(s)
+ if stale:
+ db.commit()
+ # Probe /v1/models NOW and write cached_models so the chat
+ # picker actually shows the model on the next /api/models
+ # call. Without this immediate probe, the endpoint has empty
+ # cached_models until the next background refresh fires (up
+ # to a minute later) and the picker shows nothing — even
+ # though the endpoint is in the DB and the server is up.
+ try:
+ from routes.model_routes import _probe_endpoint
+ import json as _json2
+ probed = _probe_endpoint(base_url, None, timeout=5)
+ if probed:
+ ep.cached_models = _json2.dumps(probed)
+ db.commit()
+ logger.info(f"Auto-register: probed {len(probed)} models @ {base_url}")
+ except Exception as _pe:
+ logger.warning(f"Auto-register: probe-after-create failed for {base_url}: {_pe!r}")
return ep_id
except Exception as e:
logger.error(f"Failed to auto-register local model endpoint: {e}")
@@ -877,16 +1214,6 @@ def setup_cookbook_routes() -> APIRouter:
in_venv=sys.prefix != sys.base_prefix,
)
is_pip_install = bool(req.cmd and "pip install" in req.cmd)
- remote = req.remote_host
- is_windows = req.platform == "windows"
- local_windows = IS_WINDOWS and not remote
- if is_windows or local_windows:
- if req.cmd.startswith("python3 "):
- req.cmd = "python " + req.cmd[len("python3 "):]
- if is_pip_install and ("llama-cpp-python" in req.cmd or "llama_cpp" in req.cmd) and (is_windows or local_windows):
- if "--extra-index-url" not in req.cmd:
- req.cmd += " --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu"
-
if is_pip_install:
# Keep big dependency wheel builds (vLLM, …) off the home filesystem's
# pip cache so they don't fail mid-build with "No space left" (#1219)
@@ -920,7 +1247,12 @@ def setup_cookbook_routes() -> APIRouter:
# Otherwise the runner script picks one at runtime and `_auto_register`
# below still registers the stale 11434 default — which on a host with
# a systemd ollama lands on the wrong (unreachable-from-docker) service.
- if "ollama" in req.cmd and "OLLAMA_HOST=" not in req.cmd:
+ # Match "ollama serve" as a phrase (with optional flags after), not
+ # any substring containing "ollama" — otherwise commands like
+ # `docker exec ollama-test ollama-import …` get wrapped as if they
+ # were native `ollama serve`, prepending OLLAMA_HOST=… and then
+ # running the ollama-not-found preflight which exits 127.
+ if re.search(r"\bollama\s+serve\b", req.cmd) and "OLLAMA_HOST=" not in req.cmd:
_ollama_bind_host = "0.0.0.0" if remote else "127.0.0.1"
_ollama_chosen_port = _pick_free_port_for_ollama(
remote, req.ssh_port, start_port=11434, max_offset=10,
@@ -950,8 +1282,6 @@ def setup_cookbook_routes() -> APIRouter:
ps_lines = []
ps_lines.append('$sessionDir = "$env:TEMP\\odysseus-sessions"')
ps_lines.append('New-Item -ItemType Directory -Force -Path $sessionDir | Out-Null')
- ps_lines.append('$env:PYTHONIOENCODING = "utf-8"')
- ps_lines.append('$env:PYTHONUTF8 = "1"')
if req.hf_token:
ps_lines.append(f"$env:HF_TOKEN = '{_ps_squote(req.hf_token)}'")
if req.gpus:
@@ -970,7 +1300,7 @@ def setup_cookbook_routes() -> APIRouter:
ps_lines.append('try { python -c "import llama_cpp" 2>$null } catch {}')
ps_lines.append('if ($LASTEXITCODE -ne 0) {')
ps_lines.append(' Write-Host "Installing llama-cpp-python..."')
- ps_lines.append(' python -m pip install llama-cpp-python[server] --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu')
+ ps_lines.append(' python -m pip install llama-cpp-python[server]')
ps_lines.append('}')
elif "vllm" in req.cmd:
ps_lines.append('Write-Host "ERROR: vLLM is not supported on Windows. Use Ollama or llama.cpp instead."')
@@ -1045,58 +1375,46 @@ def setup_cookbook_routes() -> APIRouter:
# ollama is found (otherwise macOS falls back to a slow source build).
# /opt/homebrew = Apple Silicon, /usr/local = Intel; harmless on Linux.
runner_lines.append('export PATH="$HOME/.local/bin:$HOME/bin:$HOME/llama.cpp/build/bin:/opt/homebrew/bin:/usr/local/bin:$PATH"')
- if local_windows:
- # LOCAL Windows: no native source compilation (no cmake/compiler on Git Bash).
- # Just check python bindings (using native `python` binary) and fall back to pip install.
- runner_lines.append('if ! command -v llama-server &>/dev/null && ! python -c "import llama_cpp" 2>/dev/null; then')
- runner_lines.append(' echo "llama-server not found — installing Python bindings..."')
- runner_lines.append(f" {_pip_install_fallback_chain('llama-cpp-python[server]', python_cmd='python')} || true")
- runner_lines.append('fi')
- runner_lines.append('if ! command -v llama-server &>/dev/null && ! python -c "import llama_cpp" 2>/dev/null; then')
- runner_lines.append(' echo "ERROR: llama.cpp serving is not available after install attempts."')
- runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
- runner_lines.append('fi')
- else:
- runner_lines.append('if [ -d /data/data/com.termux ]; then')
- runner_lines.append(' # Termux: no native build — use the Python bindings (CPU).')
- runner_lines.append(' if ! python3 -c "import llama_cpp" 2>/dev/null; then')
- runner_lines.append(' pkg install -y cmake 2>/dev/null')
- runner_lines.append(' pip install numpy diskcache jinja2 2>/dev/null')
- runner_lines.append(' CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_LLAMAFILE=OFF" pip install \'llama-cpp-python[server]\' --no-build-isolation --no-cache-dir 2>&1 || true')
- runner_lines.append(' fi')
- runner_lines.append('elif ! command -v llama-server &>/dev/null; then')
- runner_lines.append(' echo "Native llama-server not found — building from source (one-time, may take a few minutes)..."')
- runner_lines.append(' mkdir -p ~/bin')
- runner_lines.append(' cd ~ && [ -d llama.cpp ] || git clone --depth 1 https://github.com/ggml-org/llama.cpp')
- # Build with the right accelerator: Metal on macOS (llama.cpp
- # enables it automatically, no flag), CUDA on Linux when present,
- # else a plain CPU build. nproc is Linux-only — fall back to
- # `sysctl hw.ncpu` on macOS. (Tip: `brew install llama.cpp` ships
- # a prebuilt llama-server and skips this whole source build.)
- runner_lines.append(' NPROC="$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)"')
- runner_lines.append(' if [ "$(uname -s)" = "Darwin" ]; then')
- runner_lines.append(' command -v cmake >/dev/null 2>&1 || echo "WARNING: cmake not found — install it with: brew install cmake (or: brew install llama.cpp for a prebuilt llama-server)."')
- # Start from a clean cache: a prior failed configure (e.g. a CUDA
- # attempt) poisons build/CMakeCache.txt, so a plain `cmake -B build`
- # would reuse the bad settings and fail again. CMAKE_BUILD_TYPE is
- # explicit so the binary is optimized (Metal auto-enables on macOS).
- runner_lines.append(' cd ~/llama.cpp && rm -rf build && cmake -B build -DCMAKE_BUILD_TYPE=Release \\')
- runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
- runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
- runner_lines.append(' else')
- _append_llama_cpp_linux_accel_build_lines(runner_lines)
- runner_lines.append(' fi')
- # If the native build failed, fall back to the Python bindings.
- runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
- runner_lines.append(' echo "llama-server build failed — installing Python bindings as fallback..."')
- runner_lines.append(f" {_pip_install_fallback_chain('llama-cpp-python[server]', python_cmd='pip')} || true")
- runner_lines.append(' fi')
- runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
- runner_lines.append(' echo "ERROR: llama.cpp serving is not available after install/build attempts."')
- runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
- runner_lines.append(' fi')
- runner_lines.append('fi')
- elif "ollama" in req.cmd:
+ runner_lines.append('if [ -d /data/data/com.termux ]; then')
+ runner_lines.append(' # Termux: no native build — use the Python bindings (CPU).')
+ runner_lines.append(' if ! python3 -c "import llama_cpp" 2>/dev/null; then')
+ runner_lines.append(' pkg install -y cmake 2>/dev/null')
+ runner_lines.append(' pip install numpy diskcache jinja2 2>/dev/null')
+ runner_lines.append(' CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_LLAMAFILE=OFF" pip install \'llama-cpp-python[server]\' --no-build-isolation --no-cache-dir 2>&1 || true')
+ runner_lines.append(' fi')
+ runner_lines.append('elif ! command -v llama-server &>/dev/null; then')
+ runner_lines.append(' echo "Native llama-server not found — building from source (one-time, may take a few minutes)..."')
+ runner_lines.append(' mkdir -p ~/bin')
+ runner_lines.append(' cd ~ && [ -d llama.cpp ] || git clone --depth 1 https://github.com/ggml-org/llama.cpp')
+ # Build with the right accelerator: Metal on macOS (llama.cpp
+ # enables it automatically, no flag), CUDA on Linux when present,
+ # else a plain CPU build. nproc is Linux-only — fall back to
+ # `sysctl hw.ncpu` on macOS. (Tip: `brew install llama.cpp` ships
+ # a prebuilt llama-server and skips this whole source build.)
+ runner_lines.append(' NPROC="$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)"')
+ runner_lines.append(' if [ "$(uname -s)" = "Darwin" ]; then')
+ runner_lines.append(' command -v cmake >/dev/null 2>&1 || echo "WARNING: cmake not found — install it with: brew install cmake (or: brew install llama.cpp for a prebuilt llama-server)."')
+ # Start from a clean cache: a prior failed configure (e.g. a CUDA
+ # attempt) poisons build/CMakeCache.txt, so a plain `cmake -B build`
+ # would reuse the bad settings and fail again. CMAKE_BUILD_TYPE is
+ # explicit so the binary is optimized (Metal auto-enables on macOS).
+ runner_lines.append(' cd ~/llama.cpp && rm -rf build && cmake -B build -DCMAKE_BUILD_TYPE=Release \\')
+ runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\')
+ runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server')
+ runner_lines.append(' else')
+ _append_llama_cpp_linux_accel_build_lines(runner_lines)
+ runner_lines.append(' fi')
+ runner_lines.append(' # If the native build failed, fall back to the Python bindings.')
+ runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
+ runner_lines.append(' echo "llama-server build failed — installing Python bindings as fallback..."')
+ runner_lines.append(f" {_pip_install_fallback_chain('llama-cpp-python[server]', python_cmd='pip')} || true")
+ runner_lines.append(' fi')
+ runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then')
+ runner_lines.append(' echo "ERROR: llama.cpp serving is not available after install/build attempts."')
+ runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
+ runner_lines.append(' fi')
+ runner_lines.append('fi')
+ elif re.search(r"\bollama\s+serve\b", req.cmd):
handled_ollama_serve = True
_ollama_default_host = "0.0.0.0" if remote else "127.0.0.1"
_ollama_host, _ollama_port = _ollama_bind_from_cmd(
@@ -1117,23 +1435,13 @@ def setup_cookbook_routes() -> APIRouter:
runner_lines.append(' ODYSSEUS_OLLAMA_PORT="$_ody_try_port"')
runner_lines.append(' break')
runner_lines.append(' fi')
- runner_lines.append(' echo "[odysseus] Ollama API ready on port ${ODYSSEUS_OLLAMA_PORT}: ${ODYSSEUS_OLLAMA_URL}"')
- runner_lines.append(' echo "[odysseus] This task is monitoring an existing Ollama server; stopping it here will not stop an external Docker/system service."')
- if local_windows:
- # Windows detached process has no TTY; exec bash -i crashes.
- # Keep the monitoring task alive with a sleep loop.
- runner_lines.append(' while true; do sleep 60; done')
- else:
- runner_lines.append(' exec bash -i')
- runner_lines.append('fi')
+ runner_lines.append(' exec 3<&-; exec 3>&-')
+ runner_lines.append('done')
runner_lines.append('if ! command -v ollama &>/dev/null; then')
runner_lines.append(' echo "ERROR: Ollama not found on this server. Install it from https://ollama.com/download or `curl -fsSL https://ollama.com/install.sh | sh`."')
runner_lines.append(' echo')
runner_lines.append(' echo "=== Process exited with code 127 ==="')
- if local_windows:
- runner_lines.append(' exit 127')
- else:
- runner_lines.append(' exec bash -i')
+ runner_lines.append(' exec bash -i')
runner_lines.append('fi')
runner_lines.append('ODYSSEUS_OLLAMA_URL="http://${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}"')
if remote and _ollama_host in ("0.0.0.0", "::"):
@@ -1141,20 +1449,24 @@ def setup_cookbook_routes() -> APIRouter:
runner_lines.append('echo "[odysseus] Ollama has no built-in authentication; expose this only on a trusted LAN/VPN or provide an explicit OLLAMA_HOST with your own access controls."')
runner_lines.append('echo "Starting ollama server on ${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}..."')
runner_lines.append('OLLAMA_HOST="${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}" ollama serve')
- if local_windows:
- _append_serve_exit_code_lines(runner_lines, keep_shell_open=False)
- else:
- runner_lines.append('_ody_exit=$?')
- runner_lines.append('echo')
- runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="')
- runner_lines.append('exec bash -i')
+ runner_lines.append('_ody_exit=$?')
+ runner_lines.append('echo')
+ runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="')
+ runner_lines.append('exec bash -i')
elif "vllm serve" in req.cmd:
# vLLM is CUDA/ROCm-only and does not run on macOS at all.
runner_lines.append('if [ "$(uname -s)" = "Darwin" ]; then')
runner_lines.append(' echo "ERROR: vLLM does not run on macOS. Use Ollama or llama.cpp (Metal) instead."')
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=1')
runner_lines.append('fi')
- _append_vllm_linux_preflight_lines(runner_lines)
+ # Put ~/.local/bin on PATH first — without a venv, vllm installs
+ # there via --user and the non-login serve shell otherwise can't
+ # find the `vllm` CLI ("command not found"). Mirrors llama.cpp above.
+ runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
+ runner_lines.append('if ! command -v vllm &>/dev/null; then')
+ runner_lines.append(' echo "ERROR: vLLM is not installed."')
+ runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
+ runner_lines.append('fi')
elif "sglang.launch_server" in req.cmd:
runner_lines.append('export PATH="$HOME/.local/bin:$PATH"')
runner_lines.append('if ! command -v sglang &>/dev/null; then')
@@ -1173,15 +1485,30 @@ def setup_cookbook_routes() -> APIRouter:
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
runner_lines.append('fi')
- if not handled_ollama_serve:
+ handled_ollama_sidecar_probe = False
+ if (not handled_ollama_serve
+ and re.search(r"\bdocker\s+exec\s+(?:ollama-rocm|ollama-test)\s+ollama\s+show\b", req.cmd or "")):
+ handled_ollama_sidecar_probe = True
_append_serve_preflight_exit_lines(
runner_lines,
keep_shell_open=not local_windows,
)
- if is_pip_install:
- _append_pip_install_runner_lines(runner_lines, req.cmd)
- else:
- runner_lines.append(req.cmd)
+ runner_lines.append(req.cmd)
+ runner_lines.append('_ody_exit=$?')
+ runner_lines.append('echo')
+ runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="')
+ runner_lines.append('if [ "$_ody_exit" -eq 0 ]; then')
+ runner_lines.append(' echo "[odysseus] Ollama sidecar model is available; keeping Cookbook task attached to the persistent Ollama daemon."')
+ runner_lines.append(' while true; do sleep 3600; done')
+ runner_lines.append('fi')
+ runner_lines.append('exec bash -i')
+
+ if not handled_ollama_serve and not handled_ollama_sidecar_probe:
+ _append_serve_preflight_exit_lines(
+ runner_lines,
+ keep_shell_open=not local_windows,
+ )
+ runner_lines.append(req.cmd)
if local_windows:
# Detached background process — no interactive shell to keep open.
# Print the exit marker the status poller looks for, then stop.
@@ -1263,6 +1590,26 @@ def setup_cookbook_routes() -> APIRouter:
elif not is_pip_install:
endpoint_id = _auto_register_llm_endpoint(req, remote)
+ # Crash watchdog: the auto-register above writes the endpoint row
+ # IMMEDIATELY (before the server has even bound its port) so the
+ # picker shows the model as it warms up. When the serve process
+ # crashes right at startup (missing module, bad cmd, port collision,
+ # ModuleNotFoundError on llama_cpp, etc.), the endpoint is left
+ # dangling — every subsequent chat returns 503 or an empty response.
+ # Schedule a background task to read the tmux output for the
+ # "=== Process exited with code N ===" marker the runner emits;
+ # if N != 0 within the watch window, delete the endpoint we just
+ # created. Skipped for diffusion (different image-endpoint cleanup
+ # path) and pip-install tasks (no endpoint to drop).
+ if endpoint_id and not is_diffusion and not is_pip_install:
+ asyncio.create_task(_serve_crash_watchdog(
+ endpoint_id=endpoint_id,
+ session_id=session_id,
+ remote=remote,
+ ssh_port=req.ssh_port,
+ is_windows=is_windows,
+ ))
+
# Log to assistant
try:
from src.assistant_log import log_to_assistant
@@ -1342,8 +1689,8 @@ def setup_cookbook_routes() -> APIRouter:
cmd = f"ssh {pf}{host} '{setup_script}'"
else:
# Linux: auto-install tmux (via whichever package manager is available)
- # and huggingface_hub + hf_transfer (falling back to --user, then
- # guarded --break-system-packages on PEP-668 locked distros).
+ # and huggingface_hub + hf_transfer (falling back to --user/--break-system-packages
+ # on PEP-668 locked distros like Arch / newer Debian).
setup_script = (
# Install tmux if missing — try common package managers; skip if no sudo
"if ! command -v tmux >/dev/null 2>&1; then "
@@ -1355,15 +1702,10 @@ def setup_cookbook_routes() -> APIRouter:
" fi; "
"fi; "
"command -v tmux >/dev/null 2>&1 || echo 'WARNING: tmux missing and auto-install failed (need passwordless sudo). Install manually.'; "
- # Install Python bits. Try system install first; fall back to --user,
- # then use --break-system-packages only when pip supports it.
+ # Install Python bits. Try system install first; fall back to --user --break-system-packages on PEP 668 systems.
"pip install -q huggingface_hub hf_transfer 2>/dev/null || "
- "pip install --user -q huggingface_hub hf_transfer 2>/dev/null || "
- "( pip install --help 2>/dev/null | grep -q -- --break-system-packages && "
- "pip install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null ) || "
- "pip3 install --user -q huggingface_hub hf_transfer 2>/dev/null || "
- "( pip3 install --help 2>/dev/null | grep -q -- --break-system-packages && "
- "pip3 install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null ); "
+ "pip install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null || "
+ "pip3 install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null; "
"python3 -c 'from huggingface_hub import snapshot_download; print(\"OK\")'"
)
cmd = f"ssh {pf}{host} '{setup_script}'"
@@ -1386,38 +1728,11 @@ def setup_cookbook_routes() -> APIRouter:
async def _run_nvidia_smi(query: str, host: str | None, ssh_port: str | None, timeout: int = 8):
"""Run nvidia-smi locally or over SSH. Returns (stdout, error_or_None)."""
if host:
- candidates = [query]
- stripped = query.strip()
- if stripped.startswith("nvidia-smi "):
- args = stripped[len("nvidia-smi "):]
- candidates.append(
- "bash -lc "
- + shlex.quote(
- f"{SSH_PATH_OVERRIDE}"
- f"nvidia-smi {args}"
- )
- )
- for nvidia_path in NVIDIA_PATH_CANDIDATES:
- candidates.append(f"{nvidia_path} {args}")
-
- last_err = "nvidia-smi failed"
- for candidate in candidates:
- try:
- rc, stdout, stderr = await run_ssh_command_async(
- host,
- ssh_port,
- candidate,
- connect_timeout=5,
- timeout=timeout,
- )
- except asyncio.TimeoutError:
- return None, "nvidia-smi timed out"
- if rc == 0:
- return stdout.decode("utf-8", errors="replace"), None
- err = (stderr.decode("utf-8", errors="replace") or "").strip()[:200]
- if err:
- last_err = err
- return None, last_err
+ pf = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else ""
+ cmd = f"ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no {pf}{host} '{query}'"
+ proc = await asyncio.create_subprocess_shell(
+ cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+ )
else:
proc = await asyncio.create_subprocess_exec(
*shlex.split(query),
@@ -1996,30 +2311,58 @@ def setup_cookbook_routes() -> APIRouter:
return {"models": out}
- # Rate-limit for the orphan-tmux adoption sweep. The UI polls
- # tasks/status every ~3s; we don't want to SSH every host on every
- # poll. 20s is fast enough that a model the agent launched in the
- # background shows up "almost immediately" in the UI without being
- # wasteful.
+ # Rate-limit for the orphan-tmux adoption sweep. 60s interval so SSH
+ # work is genuinely sparse even on an actively-polled cookbook page.
_last_orphan_sweep_ts = [0.0]
- _ORPHAN_SWEEP_MIN_INTERVAL_S = 20.0
+ _ORPHAN_SWEEP_MIN_INTERVAL_S = 60.0
+ # Concurrency guard so two requests racing don't both spawn a sweep.
+ _orphan_sweep_inflight = [False]
def _maybe_sweep_orphans(tasks: list, state: dict) -> None:
"""Scan each configured cookbook server for `serve-*` tmux sessions
the cookbook doesn't know about and adopt them into state.tasks.
- Writes are conditional: if no orphans are found, nothing is touched.
- Rate-limited so polling UIs don't trigger SSH on every refresh.
+ Heavy SSH work runs in a background thread via asyncio.to_thread so
+ it never blocks the request that triggered it. Was previously
+ disabled because the sync implementation pegged uvicorn CPU during
+ active cookbook polling — re-enabled now with the work pushed off
+ the event loop and a slower (60s) cadence.
"""
import time as _time
- import subprocess
- logger.info(f"_maybe_sweep_orphans: entered, last_ts={_last_orphan_sweep_ts[0]}")
now = _time.monotonic()
+ if _orphan_sweep_inflight[0]:
+ return
if now - _last_orphan_sweep_ts[0] < _ORPHAN_SWEEP_MIN_INTERVAL_S:
- logger.info(f"_maybe_sweep_orphans: rate-limited, {now - _last_orphan_sweep_ts[0]:.1f}s since last")
return
_last_orphan_sweep_ts[0] = now
+ _orphan_sweep_inflight[0] = True
+ # Snapshot inputs so the worker doesn't race with state mutations.
+ try:
+ tasks_snap = list(tasks or [])
+ except Exception:
+ tasks_snap = []
+ state_snap = state if isinstance(state, dict) else {}
+ # Caller is _cookbook_tasks_status_sync (sync context, no event
+ # loop). Use a plain background thread — no asyncio needed.
+ import threading
+ def _run_sweep() -> None:
+ try:
+ _sync_sweep_orphans(tasks_snap, state_snap)
+ except Exception as _e:
+ logger.warning(f"orphan sweep thread failed: {_e!r}")
+ finally:
+ _orphan_sweep_inflight[0] = False
+ try:
+ threading.Thread(target=_run_sweep, daemon=True, name="orphan-sweep").start()
+ except Exception as _e:
+ logger.warning(f"orphan sweep thread spawn failed: {_e!r}")
+ _orphan_sweep_inflight[0] = False
+ return
+
+ def _sync_sweep_orphans(tasks: list, state: dict) -> None:
+ """The actual sync sweep — never call this on the event loop."""
+ import subprocess
env = state.get("env") if isinstance(state, dict) else {}
servers = env.get("servers") if isinstance(env, dict) else []
logger.info(f"orphan sweep starting: {len(servers) if isinstance(servers, list) else 0} server(s), known_sids={len([t for t in tasks if isinstance(t, dict) and t.get('sessionId')])}")
@@ -2143,6 +2486,121 @@ def setup_cookbook_routes() -> APIRouter:
except Exception as e:
logger.warning(f"orphan sweep: state write failed: {e}")
+ # In-memory cache for the Ollama library scrape. ollama.com is a public
+ # site, but it doesn't expose a stable JSON listing — we fetch the HTML
+ # search page and regex out the model cards. Cached for 1 h so a busy
+ # cookbook view doesn't hammer the site on every render.
+ _ollama_library_cache: dict = {"models": [], "fetched_at": 0.0, "error": None}
+
+ _OLLAMA_FALLBACK_LIBRARY = [
+ {"name": "qwen2.5", "description": "Qwen2.5 series — strong general/coding model from Alibaba.", "sizes": ["0.5b", "1.5b", "3b", "7b", "14b", "32b", "72b"]},
+ {"name": "qwen2.5-coder", "description": "Code-specialized Qwen2.5 family.", "sizes": ["0.5b", "1.5b", "3b", "7b", "14b", "32b"]},
+ {"name": "qwen3", "description": "Qwen3 — newer Alibaba family with hybrid reasoning.", "sizes": ["0.6b", "1.7b", "4b", "8b", "14b", "32b"]},
+ {"name": "llama3.2", "description": "Meta Llama 3.2 instruct (and tiny / vision variants).", "sizes": ["1b", "3b", "11b", "90b"]},
+ {"name": "llama3.1", "description": "Meta Llama 3.1 instruct.", "sizes": ["8b", "70b", "405b"]},
+ {"name": "llama3.3", "description": "Meta Llama 3.3 70B instruct.", "sizes": ["70b"]},
+ {"name": "gemma3", "description": "Google Gemma 3 — multimodal capable open-weights.", "sizes": ["1b", "4b", "12b", "27b"]},
+ {"name": "gemma2", "description": "Google Gemma 2 instruct.", "sizes": ["2b", "9b", "27b"]},
+ {"name": "mistral", "description": "Mistral 7B instruct — small, fast generalist.", "sizes": ["7b"]},
+ {"name": "mistral-nemo", "description": "Mistral NeMo 12B instruct.", "sizes": ["12b"]},
+ {"name": "mistral-small", "description": "Mistral Small 22B / 24B instruct.", "sizes": ["22b", "24b"]},
+ {"name": "mixtral", "description": "Mistral MoE 8x7B / 8x22B.", "sizes": ["8x7b", "8x22b"]},
+ {"name": "phi3", "description": "Microsoft Phi-3 small / medium.", "sizes": ["mini", "medium"]},
+ {"name": "phi4", "description": "Microsoft Phi-4 14B.", "sizes": ["14b"]},
+ {"name": "deepseek-r1", "description": "DeepSeek R1 reasoning model (distilled variants).", "sizes": ["1.5b", "7b", "8b", "14b", "32b", "70b"]},
+ {"name": "deepseek-v3", "description": "DeepSeek V3 MoE 671B (huge — needs serious VRAM).", "sizes": ["671b"]},
+ {"name": "codellama", "description": "Meta Code Llama instruct family.", "sizes": ["7b", "13b", "34b", "70b"]},
+ {"name": "starcoder2", "description": "BigCode StarCoder2 — code completion.", "sizes": ["3b", "7b", "15b"]},
+ {"name": "deepseek-coder-v2", "description": "DeepSeek Coder V2 — code MoE.", "sizes": ["16b", "236b"]},
+ {"name": "nomic-embed-text", "description": "Embedding model — text vector encoder.", "sizes": ["latest"]},
+ {"name": "mxbai-embed-large", "description": "Embedding model — Mixedbread large.", "sizes": ["latest"]},
+ {"name": "llava", "description": "LLaVA multimodal vision-language model.", "sizes": ["7b", "13b", "34b"]},
+ {"name": "minicpm-v", "description": "MiniCPM-V multimodal.", "sizes": ["8b"]},
+ {"name": "command-r", "description": "Cohere Command R — RAG-oriented.", "sizes": ["35b"]},
+ {"name": "command-r-plus", "description": "Cohere Command R+ — larger RAG model.", "sizes": ["104b"]},
+ {"name": "qwq", "description": "Qwen QwQ reasoning preview.", "sizes": ["32b"]},
+ {"name": "smollm2", "description": "HuggingFaceTB SmolLM2 — tiny capable models.", "sizes": ["135m", "360m", "1.7b"]},
+ {"name": "granite3.1-dense", "description": "IBM Granite 3.1 dense instruct.", "sizes": ["2b", "8b"]},
+ {"name": "nemotron", "description": "NVIDIA Nemotron 70B.", "sizes": ["70b"]},
+ {"name": "olmo2", "description": "AI2 OLMo 2 open-weights.", "sizes": ["7b", "13b"]},
+ ]
+
+ @router.get("/api/cookbook/ollama/library")
+ async def ollama_library(refresh: int = 0, request: Request = None, owner: str = Depends(require_user)):
+ """List popular Ollama library models for the Browse picker.
+
+ Tries a 1-hour-cached fetch of ollama.com/library, falls back to a
+ curated hard-coded list so the picker always renders something."""
+ import time as _time
+ import httpx as _httpx
+ TTL = 3600.0
+ now = _time.time()
+ if refresh or (now - _ollama_library_cache["fetched_at"]) > TTL or not _ollama_library_cache["models"]:
+ models: list[dict] = []
+ err = None
+ try:
+ async with _httpx.AsyncClient(timeout=8, follow_redirects=True) as client:
+ resp = await client.get(
+ "https://ollama.com/search?sort=popular",
+ headers={"User-Agent": "odysseus-cookbook/1.0"},
+ )
+ if resp.status_code == 200:
+ html = resp.text
+ # ollama.com renders each model card as a single anchor:
+ # …
+ # The description + sizes live inside that anchor. Pull
+ # the whole block then extract pieces individually.
+ block_re = re.compile(
+ r']*href="/library/([A-Za-z0-9._-]+)"[^>]*>(.*?)',
+ re.DOTALL,
+ )
+ desc_re = re.compile(r']*>([^<]{4,400})
', re.DOTALL)
+ # Size tags on ollama.com cards look like "0.5b", "14b",
+ # "8x7b", "27b". Pulled from short -wrapped chips.
+ size_re = re.compile(r'>\s*(\d+(?:\.\d+)?(?:x\d+)?[bBmM])\s*<')
+ seen: set[str] = set()
+ for bm in block_re.finditer(html):
+ name = bm.group(1).strip()
+ if name in seen:
+ continue
+ seen.add(name)
+ body = bm.group(2)
+ dm = desc_re.search(body)
+ desc = (dm.group(1).strip() if dm else "").replace("\n", " ")
+ sizes_raw = size_re.findall(body)
+ # Dedup sizes preserving order
+ sizes: list[str] = []
+ for s in sizes_raw:
+ s_low = s.lower()
+ if s_low not in sizes:
+ sizes.append(s_low)
+ models.append({"name": name, "description": desc, "sizes": sizes})
+ if len(models) >= 80:
+ break
+ else:
+ err = f"HTTP {resp.status_code}"
+ except Exception as e:
+ err = str(e)[:160]
+ # Merge curated fallback so classics (qwen2.5, llama3, deepseek-r1,
+ # …) stay reachable even when ollama.com's front page is dominated
+ # by brand-new releases the user might not be looking for.
+ live_names = {m["name"] for m in models}
+ for fb in _OLLAMA_FALLBACK_LIBRARY:
+ if fb["name"] not in live_names:
+ models.append(fb)
+ if not models:
+ models = list(_OLLAMA_FALLBACK_LIBRARY)
+ if err is None:
+ err = "parsed 0 results — using fallback list"
+ _ollama_library_cache["models"] = models
+ _ollama_library_cache["fetched_at"] = now
+ _ollama_library_cache["error"] = err
+ return {
+ "models": _ollama_library_cache["models"],
+ "fetched_at": _ollama_library_cache["fetched_at"],
+ "error": _ollama_library_cache["error"],
+ }
+
@router.get("/api/cookbook/tasks/status")
async def cookbook_tasks_status(request: Request):
"""Check status of all active cookbook tmux sessions.
@@ -2180,13 +2638,39 @@ def setup_cookbook_routes() -> APIRouter:
"inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));"
"sys.exit(0 if ok and not inc else 1)"
)
- if remote_host:
- cmd = ["python3", "-c", py, repo_id]
- else:
- # Local Windows: python3 can hit the Microsoft Store stub. Use the
- # real Python Odysseus is running under (guaranteed to exist).
- import sys as _sys_local
- cmd = [_sys_local.executable, "-c", py, repo_id]
+ cmd = ["python3", "-c", py, repo_id]
+ try:
+ if remote_host:
+ ssh_base = ["ssh"]
+ if ssh_port and ssh_port != "22":
+ ssh_base.extend(["-p", str(ssh_port)])
+ shell_cmd = " ".join(shlex.quote(x) for x in cmd)
+ proc = subprocess.run(ssh_base + [remote_host, shell_cmd], timeout=12, capture_output=True)
+ else:
+ proc = subprocess.run(cmd, timeout=12, capture_output=True)
+ return proc.returncode == 0
+ except Exception:
+ return False
+
+ def _download_cache_incomplete(repo_id: str, remote_host: str = "", ssh_port: str = "") -> bool:
+ """Best-effort check for resumable HF partial blobs.
+
+ A lost SSH/tmux session can leave a real download still incomplete.
+ Treat any *.incomplete blob as stronger evidence than stale
+ "100%" lines in the captured pane output.
+ """
+ if not repo_id or "/" not in repo_id:
+ return False
+ py = (
+ "import os,sys;"
+ "repo=sys.argv[1];"
+ "base=os.environ.get('HUGGINGFACE_HUB_CACHE') or os.path.join(os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface')), 'hub');"
+ "d=os.path.join(base,'models--'+repo.replace('/','--'));"
+ "blobs=os.path.join(d,'blobs');"
+ "inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));"
+ "sys.exit(0 if inc else 1)"
+ )
+ cmd = ["python3", "-c", py, repo_id]
try:
if remote_host:
ssh_base = ["ssh"]
@@ -2333,28 +2817,43 @@ def setup_cookbook_routes() -> APIRouter:
except Exception:
pass
else:
- try:
- alive = subprocess.run(check_cmd, timeout=10, capture_output=True)
- is_alive = alive.returncode == 0
- except Exception:
+ # Skip the live SSH check entirely for tasks already in a
+ # terminal state — they won't change, and 10s timeouts
+ # stacked per task were the dominant cost of this whole
+ # status endpoint (3+ minute stalls with ~8 accumulated
+ # stopped tasks). The agent's `list_served_models` call
+ # was blocking the chat stream every time.
+ _task_status = (task.get("status") or "").lower()
+ if _task_status in {"stopped", "done", "completed",
+ "crashed", "error", "failed",
+ "ended", "killed"}:
is_alive = False
-
- # Capture last lines for progress. Prefer the "Downloading" line
- # (real aggregate bytes) over "Fetching N files" (whole-file count that
- # lags with hf_transfer). Falls back to the true last line otherwise.
- if is_alive:
+ # Keep the persisted output_tail for the UI — it's
+ # what the agent uses to diagnose past failures.
+ full_snapshot = (task.get("output") or "")[-12000:]
+ else:
try:
- cap = subprocess.run(capture_cmd, timeout=10, capture_output=True, text=True)
- if cap.returncode == 0:
- full_snapshot = cap.stdout.strip()
- lines = [l.strip() for l in full_snapshot.split('\n') if l.strip()]
- downloading_lines = [l for l in lines if l.startswith("Downloading")]
- if downloading_lines:
- progress_text = downloading_lines[-1]
- elif lines:
- progress_text = lines[-1]
+ alive = subprocess.run(check_cmd, timeout=4, capture_output=True)
+ is_alive = alive.returncode == 0
except Exception:
- pass
+ is_alive = False
+
+ # Capture last lines for progress. Prefer the "Downloading" line
+ # (real aggregate bytes) over "Fetching N files" (whole-file count that
+ # lags with hf_transfer). Falls back to the true last line otherwise.
+ if is_alive:
+ try:
+ cap = subprocess.run(capture_cmd, timeout=4, capture_output=True, text=True)
+ if cap.returncode == 0:
+ full_snapshot = cap.stdout.strip()
+ lines = [l.strip() for l in full_snapshot.split('\n') if l.strip()]
+ downloading_lines = [l for l in lines if l.startswith("Downloading")]
+ if downloading_lines:
+ progress_text = downloading_lines[-1]
+ elif lines:
+ progress_text = lines[-1]
+ except Exception:
+ pass
# Determine status. For the local-Windows detached model the log file
# persists after the process exits, so a finished download still has a
@@ -2362,6 +2861,16 @@ def setup_cookbook_routes() -> APIRouter:
# when the PID is gone instead of blindly reporting "stopped".
download_zero_files = False
status = "unknown"
+ download_has_ok = task_type == "download" and "DOWNLOAD_OK" in full_snapshot
+ download_has_failed = task_type == "download" and "DOWNLOAD_FAILED" in full_snapshot
+ download_has_incomplete_evidence = (
+ task_type == "download"
+ and (
+ ".incomplete" in full_snapshot
+ or bool(re.search(r'model-\d+-of-\d+\.[A-Za-z0-9_.-]+:\s+(?:[0-9]|[1-8][0-9])%', full_snapshot))
+ or _download_cache_incomplete(_payload.get("repo_id") or model, remote, str(_tport or ""))
+ )
+ )
if is_alive or (local_win_task and full_snapshot):
lower = full_snapshot.lower()
exit_match = re.search(r"=== process exited with code\s+(-?\d+)", full_snapshot, re.I)
@@ -2374,20 +2883,24 @@ def setup_cookbook_routes() -> APIRouter:
elif has_exit and task_type == "download":
# Dependency installs are tracked as download tasks but only
# emit the generic runner exit marker, not HF download markers.
- status = "completed" if exit_code == 0 else "error"
+ if download_has_incomplete_evidence and not download_has_ok:
+ status = "running" if is_alive else "stopped"
+ else:
+ status = "completed" if exit_code == 0 else "error"
elif has_exit and "unrecognized arguments" in lower:
status = "error"
elif has_error and not ("application startup complete" in lower):
status = "error"
- elif task_type == "download" and ("100%" in full_snapshot or "DOWNLOAD_OK" in full_snapshot):
- # Only download tasks treat 100% as "completed".
- # Serve tasks log 100%|██████| during inference progress
- # (diffusion sampling, etc.) — that's "running", not done.
+ elif task_type == "download" and download_has_ok:
if re.search(r"Fetching\s+0\s+files", full_snapshot, re.IGNORECASE):
status = "error"
download_zero_files = True
else:
status = "completed"
+ elif task_type == "download" and download_has_failed:
+ status = "error"
+ elif task_type == "download" and download_has_incomplete_evidence:
+ status = "running" if is_alive else "stopped"
elif "application startup complete" in lower:
status = "ready"
elif not is_alive:
@@ -2397,7 +2910,11 @@ def setup_cookbook_routes() -> APIRouter:
status = "running"
else:
# Session is dead — check if it completed or crashed
- if task_type == "download" and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or "")):
+ if (
+ task_type == "download"
+ and not download_has_incomplete_evidence
+ and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or ""))
+ ):
status = "completed"
if not progress_text:
progress_text = "Download complete"
@@ -2407,12 +2924,12 @@ def setup_cookbook_routes() -> APIRouter:
status = "stopped"
# Parse structured phase info — single source of truth for the UI
- phase_info = _parse_serve_phase(full_snapshot, task_type) if (task_type == "serve" and status == "running" and full_snapshot) else {}
+ phase_info = _parse_serve_phase(full_snapshot, task_type) if (task_type == "serve" and full_snapshot) else {}
if phase_info.get("status") == "ready":
status = "ready"
serve_phase = phase_info.get("phase", "")
diagnosis = _diagnose_serve_output(full_snapshot) if task_type == "serve" and full_snapshot else None
- if diagnosis and status in {"running", "unknown", "stopped"}:
+ if diagnosis and status in {"running", "unknown", "stopped"} and phase_info.get("status") != "ready":
status = "error"
if download_zero_files:
diagnosis = {"message": "No matching files were downloaded. The model repo or filename/quant pattern may be wrong (for example a ':Q4_K_M' tag that does not exist in the repo). Check the repo and the include/quant pattern."}
diff --git a/routes/hwfit_routes.py b/routes/hwfit_routes.py
index a7af18b04..eb408ac9d 100644
--- a/routes/hwfit_routes.py
+++ b/routes/hwfit_routes.py
@@ -196,7 +196,24 @@ def setup_hwfit_routes():
if target_context is not None:
target_context = max(1024, min(target_context, 1000000))
- results = rank_models(system, use_case=use_case or None, limit=limit, search=search or None, sort=sort, quant=quant or None, target_context=target_context, fit_only=fit_only)
+ rank_kwargs = {
+ "use_case": use_case or None,
+ "limit": limit,
+ "search": search or None,
+ "sort": sort,
+ "quant": quant or None,
+ "fit_only": fit_only,
+ }
+ if target_context is not None:
+ rank_kwargs["target_context"] = target_context
+ try:
+ import inspect
+ supported = set(inspect.signature(rank_models).parameters)
+ rank_kwargs = {k: v for k, v in rank_kwargs.items() if k in supported}
+ except Exception:
+ rank_kwargs.pop("target_context", None)
+ rank_kwargs.pop("fit_only", None)
+ results = rank_models(system, **rank_kwargs)
return {"system": system, "models": results}
@router.get("/profiles")
diff --git a/routes/model_routes.py b/routes/model_routes.py
index 995705d75..864035884 100644
--- a/routes/model_routes.py
+++ b/routes/model_routes.py
@@ -4,8 +4,8 @@ import os
import re
import uuid
import json
-import socket
import hashlib
+import socket
import time as _time
import logging
import httpx
@@ -283,11 +283,8 @@ _HOST_TO_CURATED = (
("fireworks.ai", "fireworks"),
("googleapis.com", "google"),
("x.ai", "xai"),
-
("openrouter.ai", "openrouter"),
("ollama.com", "ollama"),
- ("opencode.ai/zen/go", "opencode-go"),
- ("opencode.ai/zen", "opencode-zen"),
)
@@ -494,8 +491,6 @@ _NON_CHAT_EXACT_PREFIXES = (
def _is_chat_model(model_id: str) -> bool:
"""Return True if the model ID looks like a chat/completions-capable model."""
mid = model_id.lower()
- if mid in {"gpt-5.1-codex"}:
- return True
for prefix in _NON_CHAT_PREFIXES:
if mid.startswith(prefix):
return False
@@ -509,15 +504,7 @@ def _is_chat_model(model_id: str) -> bool:
def _delete_orphaned_provider_auth(db, auth_id: Optional[str], exclude_ep_id: Optional[str] = None) -> bool:
- """Delete a ProviderAuthSession once no endpoint still references it.
-
- Subscription providers (e.g. ChatGPT Subscription) keep their refresh token
- in ProviderAuthSession rather than ModelEndpoint.api_key. When the last
- endpoint backed by that auth row is removed, the stored credentials should
- be cleared instead of lingering. Returns True if a row was deleted.
- ``exclude_ep_id`` drops the endpoint currently being deleted from the
- reference count so it does not keep its own auth alive.
- """
+ """Delete a ProviderAuthSession once no endpoint still references it."""
if not auth_id:
return False
from core.database import ProviderAuthSession
@@ -534,40 +521,52 @@ def _delete_orphaned_provider_auth(db, auth_id: Optional[str], exclude_ep_id: Op
return True
-def _is_discovery_only_provider(provider: str) -> bool:
- """Provider that only supports model discovery, not live probing.
+def _safe_detect_provider(base_url: str) -> str:
+ """Best-effort provider detection that must not break endpoint probing."""
+ try:
+ return _detect_provider(base_url)
+ except Exception as exc:
+ logger.debug("Provider detection failed for %s: %s", base_url, exc)
+ return ""
- ChatGPT Subscription speaks the Responses/Codex API and has no
- chat-completions or general health endpoint, so completion probes and
- reachability pings are skipped — status is derived from cached models.
- """
+
+def _safe_build_models_url(base_url: str) -> str:
+ """Build a /models URL without letting optional provider imports break probes."""
+ try:
+ return build_models_url(base_url)
+ except Exception as exc:
+ logger.debug("Model URL detection failed for %s: %s", base_url, exc)
+ return f"{(base_url or '').rstrip('/')}/models"
+
+
+def _safe_build_headers(api_key: Optional[str], base_url: str) -> dict:
+ """Build auth headers without letting optional provider imports break probes."""
+ try:
+ return build_headers(api_key, base_url)
+ except Exception as exc:
+ logger.debug("Header detection failed for %s: %s", base_url, exc)
+ return {"Authorization": f"Bearer {api_key}"} if api_key else {}
+
+
+def _is_discovery_only_provider(provider: str) -> bool:
return provider == "chatgpt-subscription"
def _resolve_probe_key(ep) -> Optional[str]:
- """API key/bearer to probe an endpoint with.
-
- Delegates to ``resolve_endpoint_runtime``, which already returns the static
- ``ModelEndpoint.api_key`` for keyed endpoints and resolves (and refreshes)
- the runtime bearer for session-backed providers (e.g. ChatGPT Subscription).
- Returns None if resolution fails (e.g. re-auth required) so probing skips
- rather than raising. Reads only already-loaded scalar attributes of ``ep``.
- """
+ """API key/bearer to probe an endpoint with."""
try:
from src.endpoint_resolver import resolve_endpoint_runtime
_base, key = resolve_endpoint_runtime(ep, owner=getattr(ep, "owner", None))
return key
- except Exception as e:
- logger.warning("Probe key resolution failed for %s: %s", getattr(ep, "id", "?"), e)
+ except Exception as exc:
+ logger.warning("Probe key resolution failed for %s: %s", getattr(ep, "id", "?"), exc)
return None
-def _probe_single_model(base: str, api_key: Optional[str], model_id: str, timeout: int = 10, with_tools: bool = False) -> dict:
+def _probe_single_model(base: str, api_key: str, model_id: str, timeout: int = 10, with_tools: bool = False) -> dict:
"""Send a realistic completion request to a single model. Returns {status, latency_ms, error?}."""
- provider = _detect_provider(base)
+ provider = _safe_detect_provider(base)
if _is_discovery_only_provider(provider):
- # Responses/Codex API, not chat-completions: a completion probe would
- # 400 and the re-probe flow would then hide every model. Discovery-only.
return {"status": "ok", "latency_ms": 0, "skipped": True}
messages = [
{"role": "system", "content": "You are a helpful assistant."},
@@ -587,12 +586,12 @@ def _probe_single_model(base: str, api_key: Optional[str], model_id: str, timeou
elif provider == "ollama":
from src.llm_core import _build_ollama_payload
target_url = build_chat_url(base)
- h = build_headers(api_key, base)
+ h = _safe_build_headers(api_key, base)
h["Content-Type"] = "application/json"
payload = _build_ollama_payload(model_id, messages, 0.0, 5, stream=False, tools=_test_tools)
else:
target_url = build_chat_url(base)
- h = build_headers(api_key, base)
+ h = _safe_build_headers(api_key, base)
h["Content-Type"] = "application/json"
from src.llm_core import _uses_max_completion_tokens, _restricts_temperature
_max_key = "max_completion_tokens" if _uses_max_completion_tokens(model_id) else "max_tokens"
@@ -682,14 +681,15 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
For Anthropic, queries their /v1/models API, falling back to hardcoded list."""
from src.endpoint_resolver import resolve_url
base = resolve_url(_normalize_base(base_url))
- if _detect_provider(base) == "chatgpt-subscription":
+ provider = _safe_detect_provider(base)
+ if provider == "chatgpt-subscription":
from src.chatgpt_subscription import fetch_available_models
if api_key:
return fetch_available_models(api_key, timeout=timeout)
return []
- if _detect_provider(base) == "anthropic":
+ if provider == "anthropic":
# Try Anthropic's /v1/models endpoint first
- url = build_models_url(base)
+ url = _safe_build_models_url(base)
headers = {"anthropic-version": "2023-06-01"}
if api_key:
headers["x-api-key"] = api_key
@@ -712,12 +712,8 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
return []
logger.warning(f"Anthropic /v1/models failed, using hardcoded list: {e}")
return list(ANTHROPIC_MODELS)
- url = build_models_url(base)
- if not url:
- curated_key = _match_provider_curated(base, None)
- fallback = _PROVIDER_CURATED.get(curated_key) if curated_key else None
- return list(fallback or [])
- headers = build_headers(api_key, base)
+ url = _safe_build_models_url(base)
+ headers = _safe_build_headers(api_key, base)
try:
r = httpx.get(url, headers=headers, timeout=timeout, verify=llm_verify())
r.raise_for_status()
@@ -770,11 +766,12 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis
return list(fallback)
return []
+
def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) -> Dict[str, Any]:
"""Reachability probe that does not require installed/listed models."""
from src.endpoint_resolver import resolve_url
base = resolve_url(_normalize_base(base_url))
- headers = build_headers(api_key, base)
+ headers = _safe_build_headers(api_key, base)
# Ollama exposes /v1/models (OpenAI-compatible) AND native /api/version,
# /api/tags. Probe native paths for Ollama-style endpoints, but avoid using
@@ -785,10 +782,6 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) ->
or "ollama" in (parsed_base.hostname or "").lower()
)
- # APFEL-specific detection
- host = (parsed_base.hostname or "").lower()
- looks_like_apfel = "apfel" in host or parsed_base.port == 11435
-
def _result_from_response(r) -> Dict[str, Any]:
if 300 <= r.status_code < 400:
loc = r.headers.get("location", "")
@@ -810,23 +803,7 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) ->
last_error: Optional[str] = None
try:
- # APFEL does not behave like Ollama; use its health endpoint.
- if looks_like_apfel:
- root = base
- for suffix in ("/v1", "/api"):
- if root.endswith(suffix):
- root = root[: -len(suffix)].rstrip("/")
- break
- try:
- r = httpx.get(root + "/health", timeout=timeout, verify=llm_verify())
- result = _result_from_response(r)
- if result["reachable"]:
- return result
- last_error = result.get("error")
- except Exception as e:
- last_error = str(e)[:120]
-
- elif looks_like_ollama:
+ if looks_like_ollama:
root = base
for suffix in ("/v1", "/api"):
if root.endswith(suffix):
@@ -847,17 +824,11 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) ->
try:
r = httpx.get(base, headers=headers, timeout=timeout, verify=llm_verify())
result = _result_from_response(r)
- # If the bare base URL returns a non-auth 4xx (e.g. 404), try /models
- # as a fallback. OpenAI-compatible servers like llama-swap return 404
- # on the base /v1 prefix but 200 on /v1/models. Auth failures (401/403)
- # are definitive — probing /models would just repeat the same rejection.
- if (
- not result["reachable"]
- and result.get("status_code") is not None
- and 400 <= result["status_code"] < 500
- and result["status_code"] not in (401, 403)
- ):
- models_url = build_models_url(base)
+ if result["reachable"]:
+ return result
+ sc = result.get("status_code") or 0
+ if 400 <= sc < 500 and sc not in (401, 403):
+ models_url = _safe_build_models_url(base)
try:
r2 = httpx.get(models_url, headers=headers, timeout=timeout, verify=llm_verify())
result2 = _result_from_response(r2)
@@ -865,12 +836,16 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) ->
return result2
except Exception:
pass
- return result
+ if sc:
+ return result
+ last_error = result.get("error") or last_error
except Exception as e:
last_error = str(e)[:120]
return {"reachable": False, "status_code": None, "error": last_error}
+
+
def _model_endpoint_error_message(base_url: str, ping: Dict[str, Any] = None) -> str:
"""Return a provider-aware error message for failed endpoint probes."""
ping = ping or {}
@@ -1068,17 +1043,6 @@ def setup_model_routes(model_discovery):
ok, info = _should_refresh_endpoint(ep, now, force=force)
if not ok:
continue
- if getattr(ep, "provider_auth_id", None):
- try:
- from src.endpoint_resolver import resolve_endpoint_runtime
- info["base"], info["api_key"] = resolve_endpoint_runtime(
- ep,
- owner=getattr(ep, "owner", None),
- )
- info["key"] = _refresh_key(info["base"], info["api_key"])
- except Exception as e:
- logger.warning("Skipping model refresh for %s: could not resolve provider auth: %s", getattr(ep, "name", ep.id), e)
- continue
groups.setdefault(info["key"], {
"base": info["base"],
"api_key": info["api_key"],
@@ -1156,7 +1120,7 @@ def setup_model_routes(model_discovery):
for ep in endpoints:
base = _normalize_base(ep.base_url)
- provider = _detect_provider(base)
+ provider = _safe_detect_provider(base)
# Merge cached + pinned models, then filter out hidden ones
ep_model_type = getattr(ep, "model_type", None) or "llm"
model_ids = _visible_models(
@@ -1233,8 +1197,8 @@ def setup_model_routes(model_discovery):
except HTTPException:
raise
except Exception as e:
- logger.error('Auth gate error in GET /api/models, failing closed: %s', e)
- raise HTTPException(status_code=500, detail='Internal error')
+ logger.error("Auth gate error in GET /api/models, failing closed: %s", e)
+ raise HTTPException(status_code=500, detail="Internal error")
# Admins see every endpoint (they manage the global pool); regular
# users get the owner-scoped view.
_is_admin = False
@@ -1298,7 +1262,14 @@ def setup_model_routes(model_discovery):
t0 = _time.time()
try:
import asyncio as _asyncio
- ping = await _asyncio.to_thread(_ping_endpoint, data["base"], data.get("api_key"), 1.5)
+ # Bumped 1.5s → 3.5s. The previous 1.5s budget was clipping
+ # local vLLM endpoints on Tailscale links where the model
+ # server is still loading (Qwen3.5-122B takes 2–3 min to
+ # warm); /v1/models can take 500–2500 ms on a busy box,
+ # which pushed _ping_endpoint's full path-discovery sweep
+ # past the cap and marked the row offline despite the
+ # user actively chatting with it.
+ ping = await _asyncio.to_thread(_ping_endpoint, data["base"], data.get("api_key"), 3.5)
lat = round((_time.time() - t0) * 1000)
return {
"alive": bool(ping.get("reachable")),
@@ -1336,7 +1307,7 @@ def setup_model_routes(model_discovery):
results = []
for ep in endpoints:
base = _normalize_base(ep.base_url)
- provider = _detect_provider(base)
+ provider = _safe_detect_provider(base)
kind = _effective_endpoint_kind(ep, base)
cached_count = len(_cached_model_ids(ep))
entry = {
@@ -1348,20 +1319,12 @@ def setup_model_routes(model_discovery):
"endpoint_kind": kind,
}
try:
- if _is_discovery_only_provider(provider):
- # No general health endpoint — an unauthenticated GET just
- # 401s. Report status from cached models instead of pinging.
- entry["latency_ms"] = None
- entry["status"] = "online" if cached_count else "offline"
- entry["error"] = None
- entry["model_count"] = cached_count
- else:
- t0 = _time.time()
- ping = _ping_endpoint(base, ep.api_key, timeout=1.5)
- entry["latency_ms"] = round((_time.time() - t0) * 1000)
- entry["status"] = "online" if ping.get("reachable") or cached_count else "offline"
- entry["error"] = ping.get("error")
- entry["model_count"] = cached_count or (len(ANTHROPIC_MODELS) if provider == "anthropic" else 0)
+ t0 = _time.time()
+ ping = _ping_endpoint(base, ep.api_key, timeout=1.5)
+ entry["latency_ms"] = round((_time.time() - t0) * 1000)
+ entry["status"] = "online" if ping.get("reachable") or cached_count else "offline"
+ entry["error"] = ping.get("error")
+ entry["model_count"] = cached_count or (len(ANTHROPIC_MODELS) if provider == "anthropic" else 0)
except Exception as e:
entry["latency_ms"] = None
entry["status"] = "online" if cached_count else "offline"
@@ -1394,7 +1357,7 @@ def setup_model_routes(model_discovery):
if ep_id and ep_id not in endpoints_cache:
ep = db.query(ModelEndpoint).filter(ModelEndpoint.id == ep_id).first()
if ep:
- endpoints_cache[ep_id] = {"base_url": ep.base_url, "api_key": _resolve_probe_key(ep)}
+ endpoints_cache[ep_id] = {"base_url": ep.base_url, "api_key": ep.api_key}
ep_data = endpoints_cache.get(ep_id)
if not ep_data:
# Try to find by base_url from the model's endpoint field
@@ -1433,7 +1396,7 @@ def setup_model_routes(model_discovery):
"id": ep.id,
"name": ep.name,
"base_url": ep.base_url,
- "api_key": _resolve_probe_key(ep),
+ "api_key": ep.api_key,
})
finally:
db.close()
@@ -1522,14 +1485,37 @@ def setup_model_routes(model_discovery):
# Endpoint counts as reachable if it has any model — including
# admin-pinned IDs that a probe would never surface.
status = "online" if (all_models or pinned) else "offline"
- base = _normalize_base(r.base_url)
ping = None
- # Discovery-only providers have no health endpoint — an
- # unauthenticated ping just 401s, so don't bother.
- if not all_models and not pinned and r.is_enabled and not _is_discovery_only_provider(_detect_provider(base)):
- ping = _ping_endpoint(r.base_url, r.api_key, timeout=1.0)
+ # When cached_models is empty, do a quick reachability probe.
+ # Bumped 1.0s → 3.5s because the user reported endpoints they
+ # were ACTIVELY chatting with showed "offline" — the previous
+ # 1s timeout was clipping live cloud endpoints (DeepSeek can
+ # take 1.5–2.5s on /v1/models when their region is under load,
+ # vLLM on a remote GPU box behind SSH can also push past 1s).
+ # 3.5s still keeps the picker render snappy in the common
+ # "everything's already cached" path because this branch only
+ # runs for endpoints with an empty cached_models.
+ if not all_models and not pinned and r.is_enabled:
+ ping = _ping_endpoint(r.base_url, r.api_key, timeout=3.5)
if ping.get("reachable"):
status = "empty"
+ # Best-effort: if the probe came back reachable, try
+ # to populate cached_models in the background so the
+ # NEXT picker load shows "online" instead of "empty".
+ # Failure here is silent — we already returned the
+ # "empty" status, and the existing background refresh
+ # path will eventually fill it in too.
+ try:
+ probed = _probe_endpoint(r.base_url, r.api_key, timeout=5)
+ if probed:
+ r.cached_models = json.dumps(probed)
+ db.commit()
+ all_models = probed
+ visible = _visible_models(all_models, r.hidden_models, pinned)
+ status = "online"
+ except Exception as _refill_err:
+ logger.debug(f"opportunistic cached_models refill failed for {r.id}: {_refill_err!r}")
+ base = _normalize_base(r.base_url)
kind = _effective_endpoint_kind(r, base)
results.append({
"id": r.id,
@@ -1603,11 +1589,10 @@ def setup_model_routes(model_discovery):
)
explicit_timeout = _explicit_model_list_timeout(base_url, requested_kind, refresh_timeout)
- # Dedupe: if an endpoint with the same base_url and compatible
- # credentials already exists and is reachable by the caller (shared or
- # owned by them), return it instead of creating a duplicate row. Keep
- # same-url/different-key rows distinct so users can group the same
- # provider URL under multiple credentials.
+ # Dedupe: if an endpoint with the same base_url already exists and
+ # is reachable by the caller (shared or owned by them), return it
+ # instead of creating a duplicate row. Fixes "Scan for Servers"
+ # re-adding manually-added endpoints under their host:port name.
from src.auth_helpers import get_current_user as _gcu_dedup
_caller = _gcu_dedup(request) or None
_incoming_api_key = api_key.strip()
@@ -1805,7 +1790,7 @@ def setup_model_routes(model_discovery):
ep = db.query(ModelEndpoint).filter(ModelEndpoint.id == ep_id).first()
if not ep:
raise HTTPException(404, "Endpoint not found")
- ep_data = {"id": ep.id, "name": ep.name, "base_url": ep.base_url, "api_key": _resolve_probe_key(ep)}
+ ep_data = {"id": ep.id, "name": ep.name, "base_url": ep.base_url, "api_key": ep.api_key}
finally:
db.close()
@@ -1869,7 +1854,7 @@ def setup_model_routes(model_discovery):
category = _classify_endpoint(base, kind)
timeout = _manual_refresh_timeout(ep, category, refresh_timeout)
try:
- probed = _probe_endpoint(base, _resolve_probe_key(ep), timeout=timeout)
+ probed = _probe_endpoint(base, ep.api_key, timeout=timeout)
except Exception as exc:
logger.warning("Manual model refresh failed for endpoint %s at %s: %s", ep_id, base, exc)
probed = []
@@ -2105,8 +2090,6 @@ def setup_model_routes(model_discovery):
"name": ep.name,
"model_type": ep.model_type,
"base_url": ep.base_url,
- "has_key": bool(ep.api_key),
- "api_key_fingerprint": _api_key_fingerprint(ep.api_key),
"pinned_models": _normalize_model_ids(getattr(ep, "pinned_models", None)),
"endpoint_kind": getattr(ep, "endpoint_kind", None) or "auto",
"model_refresh_mode": getattr(ep, "model_refresh_mode", None) or "auto",
diff --git a/routes/task_routes.py b/routes/task_routes.py
index 57f76d5c6..5734fcb22 100644
--- a/routes/task_routes.py
+++ b/routes/task_routes.py
@@ -519,6 +519,15 @@ def setup_task_routes(task_scheduler) -> APIRouter:
else bool(req.notifications_enabled) if req.notifications_enabled is not None
else True
)
+ # Validate chained task belongs to same owner
+ if req.then_task_id:
+ chain_target = db.query(ScheduledTask).filter(
+ ScheduledTask.id == req.then_task_id
+ ).first()
+ if not chain_target:
+ raise HTTPException(400, "Chained task not found")
+ if chain_target.owner != user:
+ raise HTTPException(403, "Cannot chain to another user's task")
task = ScheduledTask(
id=task_id,
owner=user,
diff --git a/routes/workspace_routes.py b/routes/workspace_routes.py
deleted file mode 100644
index f7b27fbdc..000000000
--- a/routes/workspace_routes.py
+++ /dev/null
@@ -1,56 +0,0 @@
-"""Workspace API — browse server directories to pick a tool workspace folder."""
-import os
-from fastapi import APIRouter, Request, HTTPException, Query
-
-from src.auth_helpers import get_current_user
-from src.tool_security import owner_is_admin_or_single_user
-
-
-def setup_workspace_routes():
- router = APIRouter(prefix="/api/workspace", tags=["workspace"])
-
- @router.get("/browse")
- def browse(request: Request, path: str = Query(default="")):
- """List subdirectories of `path` (default: home) so the UI can navigate
- the server filesystem and pick a workspace folder. Directories only.
-
- ADMIN-ONLY: this enumerates the server filesystem, so it is gated the
- same way the file/shell tools are (read_file/write_file/bash are in
- NON_ADMIN_BLOCKED_TOOLS). A non-admin who can't use those tools must not
- be able to map the host's directory tree either.
- """
- owner = get_current_user(request)
- if not owner_is_admin_or_single_user(owner):
- raise HTTPException(status_code=403, detail="Workspace browsing is admin-only")
-
- # Resolve symlinks so the reported path is canonical and the UI navigates
- # real directories (defends against symlink games in displayed paths).
- target = os.path.realpath(os.path.expanduser(path.strip() or "~"))
- if not os.path.isdir(target):
- target = os.path.realpath(os.path.expanduser("~"))
-
- dirs = []
- try:
- with os.scandir(target) as it:
- for entry in it:
- try:
- # Don't follow symlinks when classifying — a symlinked
- # dir is skipped rather than letting the browser wander
- # off via a link. Hidden entries are omitted.
- if entry.is_dir(follow_symlinks=False) and not entry.name.startswith("."):
- # Build the child path server-side with os.path.join
- # so it's correct on Windows (backslashes) and Linux.
- dirs.append({"name": entry.name, "path": os.path.join(target, entry.name)})
- except OSError:
- continue
- except (PermissionError, OSError):
- dirs = []
-
- parent = os.path.dirname(target)
- return {
- "path": target,
- "parent": parent if parent and parent != target else None,
- "dirs": sorted(dirs, key=lambda d: d["name"].lower()),
- }
-
- return router
diff --git a/services/hwfit/data/hf_models.json b/services/hwfit/data/hf_models.json
index e73cc26dc..35b55d9a9 100644
--- a/services/hwfit/data/hf_models.json
+++ b/services/hwfit/data/hf_models.json
@@ -14036,6 +14036,29 @@
"vision"
]
},
+ {
+ "name": "google/gemma-4-12B",
+ "provider": "Google",
+ "parameter_count": "12.0B",
+ "parameters_raw": 12000000000,
+ "min_ram_gb": 24.0,
+ "recommended_ram_gb": 32.0,
+ "min_vram_gb": 24.0,
+ "quantization": "BF16",
+ "context_length": 131072,
+ "use_case": "General purpose, multimodal",
+ "is_moe": false,
+ "num_experts": null,
+ "active_experts": null,
+ "active_parameters": null,
+ "architecture": "gemma4",
+ "pipeline_tag": "image-text-to-text",
+ "release_date": "2026-04-01",
+ "gguf_sources": [],
+ "capabilities": [
+ "vision"
+ ]
+ },
{
"name": "google/gemma-4-31B-it",
"provider": "Google",
@@ -19121,4 +19144,4 @@
],
"_discovered": true
}
-]
\ No newline at end of file
+]
diff --git a/services/memory/skill_extractor.py b/services/memory/skill_extractor.py
index e763bca4c..79e4c67c2 100644
--- a/services/memory/skill_extractor.py
+++ b/services/memory/skill_extractor.py
@@ -243,6 +243,20 @@ async def maybe_extract_skill(
logger.debug("[skill-extract] '%s' already exists — dropped as duplicate", title)
return None
+ # Auto-publish gate: if the user has `auto_approve_skills` on, the
+ # newly-extracted skill is created `published` immediately rather
+ # than waiting for the next audit batch. The audit still runs later
+ # and can demote it back to `draft` (or delete) on failure. Default
+ # ON matches the UI label "Auto-approve skills".
+ _initial_status = "draft"
+ try:
+ from routes.prefs_routes import _load_for_user as _load_prefs
+ _prefs = _load_prefs(owner) or {}
+ if _prefs.get("auto_approve_skills", True):
+ _initial_status = "published"
+ except Exception:
+ pass
+
entry = skills_manager.add_skill(
title=title,
problem=data.get("problem", ""),
@@ -253,6 +267,7 @@ async def maybe_extract_skill(
confidence=data.get("confidence", 0.7),
session_id=getattr(session, "session_id", None),
owner=owner,
+ status=_initial_status,
)
try:
from src.event_bus import fire_event
diff --git a/services/search/providers.py b/services/search/providers.py
index f2d4a583b..1f8097ad8 100644
--- a/services/search/providers.py
+++ b/services/search/providers.py
@@ -134,9 +134,10 @@ _NEWS_HINTS = ("news", "nyheter", "headlines", "breaking", "latest", "today", "i
_GENERAL_ENGINES = os.environ.get("SEARXNG_GENERAL_ENGINES", "bing,mojeek,presearch")
-def searxng_search_api(query: str, count: int = 10, categories: str = "general",
+def searxng_search_api(query: str, count: Optional[int] = None, categories: str = "general",
time_filter: Optional[str] = None) -> List[dict]:
"""Search using SearXNG JSON API. Returns list of {title, url, snippet}."""
+ count = count if count is not None else _get_result_count()
instance = _get_search_instance()
api_key = ""
headers = {"User-Agent": "Mozilla/5.0"}
@@ -282,8 +283,9 @@ def searxng_search(query, max_results=10):
# ── Brave ──
-def brave_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
+def brave_search(query: str, count: Optional[int] = None, time_filter: Optional[str] = None) -> List[dict]:
"""Search using Brave API with key from admin settings or env var."""
+ count = count if count is not None else _get_result_count()
api_key = _get_provider_key("brave") or os.environ.get("DATA_BRAVE_API_KEY") or ""
return _brave_search_impl(query, count, time_filter, search_config={"brave_api_key": api_key})
@@ -381,9 +383,9 @@ def _resolve_ddg_redirect(raw: str) -> str:
return resolved
-def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
+def duckduckgo_search(query: str, count: Optional[int] = None, time_filter: Optional[str] = None) -> List[dict]:
"""Search using DuckDuckGo via the duckduckgo-search library. No API key needed."""
-
+ count = count if count is not None else _get_result_count()
def _html_fallback() -> List[dict]:
try:
response = httpx.get(
@@ -452,7 +454,7 @@ def duckduckgo_search(query: str, count: int = 10, time_filter: Optional[str] =
# ── Google Programmable Search Engine ──
-def google_pse_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
+def google_pse_search(query: str, count: Optional[int] = None, time_filter: Optional[str] = None) -> List[dict]:
"""Search using Google PSE (Custom Search JSON API).
Requires two keys in settings:
@@ -460,6 +462,7 @@ def google_pse_search(query: str, count: int = 10, time_filter: Optional[str] =
- google_pse_cx: Programmable Search Engine ID (cx)
Or env vars GOOGLE_API_KEY and GOOGLE_PSE_CX.
"""
+ count = count if count is not None else _get_result_count()
settings = _get_search_settings()
api_key = _get_provider_key("google_pse") or os.environ.get("GOOGLE_API_KEY", "")
cx = (settings.get("google_pse_cx") or "").strip() or os.environ.get("GOOGLE_PSE_CX", "")
@@ -522,8 +525,9 @@ def google_pse_search(query: str, count: int = 10, time_filter: Optional[str] =
# ── Tavily ──
-def tavily_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
+def tavily_search(query: str, count: Optional[int] = None, time_filter: Optional[str] = None) -> List[dict]:
"""Search using Tavily API. Requires search_api_key or TAVILY_API_KEY env var."""
+ count = count if count is not None else _get_result_count()
api_key = _get_provider_key("tavily") or os.environ.get("TAVILY_API_KEY", "")
if not api_key:
logger.warning("Tavily: no API key configured")
@@ -580,8 +584,9 @@ def tavily_search(query: str, count: int = 10, time_filter: Optional[str] = None
# ── Serper.dev ──
-def serper_search(query: str, count: int = 10, time_filter: Optional[str] = None) -> List[dict]:
+def serper_search(query: str, count: Optional[int] = None, time_filter: Optional[str] = None) -> List[dict]:
"""Search using Serper.dev API. Requires search_api_key or SERPER_API_KEY env var."""
+ count = count if count is not None else _get_result_count()
api_key = _get_provider_key("serper") or os.environ.get("SERPER_API_KEY", "")
if not api_key:
logger.warning("Serper: no API key configured")
diff --git a/src/chatgpt_subscription.py b/src/chatgpt_subscription.py
index 263c4f529..e65ccbc8d 100644
--- a/src/chatgpt_subscription.py
+++ b/src/chatgpt_subscription.py
@@ -17,8 +17,6 @@ from typing import Any, Dict, Optional
import httpx
from fastapi import HTTPException
-from core.database import ProviderAuthSession, SessionLocal, utcnow_naive
-
DEFAULT_CHATGPT_SUBSCRIPTION_BASE_URL = (
os.getenv("CHATGPT_SUBSCRIPTION_BASE_URL", "").strip().rstrip("/")
or "https://chatgpt.com/backend-api/codex"
@@ -33,6 +31,11 @@ _AUTH_REFRESH_LOCKS: dict[str, threading.Lock] = {}
_AUTH_REFRESH_LOCKS_GUARD = threading.Lock()
+def _database_handles():
+ from core.database import ProviderAuthSession, SessionLocal, utcnow_naive
+ return ProviderAuthSession, SessionLocal, utcnow_naive
+
+
def _refresh_lock_for(auth_id: str) -> threading.Lock:
with _AUTH_REFRESH_LOCKS_GUARD:
lock = _AUTH_REFRESH_LOCKS.get(auth_id)
@@ -249,6 +252,7 @@ def access_token_is_expiring(access_token: str, skew_seconds: int = CHATGPT_ACCE
def resolve_runtime_credentials(auth_id: str, owner: Optional[str] = None, *, force_refresh: bool = False) -> Dict[str, Any]:
+ ProviderAuthSession, SessionLocal, utcnow_naive = _database_handles()
db = SessionLocal()
try:
q = db.query(ProviderAuthSession).filter(
diff --git a/src/tool_implementations.py b/src/tool_implementations.py
index 548f6f0f5..5e62e686c 100644
--- a/src/tool_implementations.py
+++ b/src/tool_implementations.py
@@ -664,6 +664,17 @@ async def do_manage_skills(content: str, owner: Optional[str] = None) -> Dict:
proc = args.get("steps") or []
if not proc and not args.get("body_extra") and not args.get("solution"):
return {"error": "procedure (or solution body) is required", "exit_code": 1}
+ # Same auto-publish gate as the extractor path — when the user
+ # has auto_approve_skills on and the caller didn't pin an explicit
+ # status, publish immediately. Audit later demotes/removes on fail.
+ _status_arg = args.get("status")
+ if not _status_arg:
+ try:
+ from routes.prefs_routes import _load_for_user as _load_prefs
+ _prefs = _load_prefs(owner) or {}
+ _status_arg = "published" if _prefs.get("auto_approve_skills", True) else "draft"
+ except Exception:
+ _status_arg = "draft"
entry = sm.add_skill(
name=args.get("name"),
description=(args.get("description") or args.get("title") or "").strip(),
@@ -677,7 +688,7 @@ async def do_manage_skills(content: str, owner: Optional[str] = None) -> Dict:
procedure=proc,
pitfalls=args.get("pitfalls") or [],
verification=args.get("verification") or [],
- status=args.get("status") or "draft",
+ status=_status_arg,
version=args.get("version") or "1.0.0",
confidence=args.get("confidence", 0.8),
source=args.get("source", "learned"),
@@ -2621,8 +2632,90 @@ async def _cookbook_env_for_host(host: str) -> Dict[str, Any]:
}
-async def _cookbook_register_task(session_id: str, model: str, host: str,
- cmd: str, task_type: str = "serve") -> bool:
+def _infer_serve_port(cmd: str) -> int:
+ """Infer likely listen port from a serve command."""
+ if not cmd:
+ return 8080
+ m = re.search(r"--port\\s+(\\d+)", cmd)
+ if m:
+ try:
+ return int(m.group(1))
+ except Exception:
+ pass
+ m = re.search(r"OLLAMA_HOST=[^\\s]*?:(\\d+)", cmd)
+ if m:
+ try:
+ return int(m.group(1))
+ except Exception:
+ pass
+ if "ollama" in cmd:
+ return 11434
+ return 8080
+
+
+def _infer_serve_host(host: str | None) -> tuple[str, bool]:
+ """Return (host, container_local) for registering a served endpoint."""
+ if not (host or "").strip():
+ return "localhost", True
+ base_host = host.split("@", 1)[-1] if "@" in host else host
+ return base_host, False
+
+
+async def _ensure_served_endpoint(
+ *,
+ model: str,
+ cmd: str,
+ host: str | None,
+) -> Dict[str, Any]:
+ """Register/fetch a model endpoint for a running serve session."""
+ import httpx
+ endpoint_host, container_local = _infer_serve_host(host)
+ port = _infer_serve_port(cmd)
+ base_url = f"http://{endpoint_host}:{port}/v1"
+ short_name = model.split("/")[-1] if "/" in model else model
+ is_image = "diffusion_server.py" in (cmd or "")
+ payload = {
+ "name": short_name if not is_image else f"{short_name} (image)",
+ "base_url": base_url,
+ "skip_probe": "true",
+ "model_type": "image" if is_image else "llm",
+ "container_local": "true" if container_local else "false",
+ }
+ try:
+ async with httpx.AsyncClient(timeout=30) as client:
+ resp = await client.post(
+ f"{_COOKBOOK_BASE}/api/model-endpoints",
+ data=payload,
+ headers=_internal_headers(),
+ )
+ data = resp.json() if resp.headers.get("content-type", "").startswith("application/json") else {}
+ if resp.status_code >= 400:
+ logger.debug(
+ f"ensure endpoint failed for {model!r}: status={resp.status_code} data={data}"
+ )
+ return {"added": False, "endpoint_id": "", "base_url": base_url, "error": data}
+ ep_id = data.get("id") if isinstance(data, dict) else None
+ return {
+ "added": bool(ep_id),
+ "endpoint_id": ep_id or "",
+ "base_url": base_url,
+ "data": data,
+ }
+ except Exception as e:
+ logger.debug(f"ensure endpoint exception for {model!r}: {e}")
+ return {"added": False, "endpoint_id": "", "base_url": base_url, "error": str(e)}
+
+
+async def _cookbook_register_task(
+ session_id: str,
+ model: str,
+ host: str,
+ cmd: str,
+ task_type: str = "serve",
+ *,
+ endpoint_added: bool = False,
+ endpoint_id: str = "",
+) -> bool:
"""Append a task entry to cookbook_state.json after the agent
launches via /api/model/serve or /api/model/download. The route
spawns tmux but leaves state-writing to the UI; the agent needs to
@@ -2672,7 +2765,8 @@ async def _cookbook_register_task(session_id: str, model: str, host: str,
"sshPort": "",
"platform": "linux",
"_serveReady": False,
- "_endpointAdded": False,
+ "_endpointAdded": bool(endpoint_added),
+ "_endpointId": endpoint_id or "",
})
state["tasks"] = tasks
try:
@@ -3008,7 +3102,12 @@ async def do_download_model(content: str, owner: Optional[str] = None) -> Dict:
if _servers.get("default_host"):
host = _servers["default_host"]
_host_defaulted = True
+ backend = (args.get("backend") or "").strip().lower()
+ if not backend and "/" not in repo_id and ":" in repo_id:
+ backend = "ollama"
payload = {"repo_id": repo_id}
+ if backend:
+ payload["backend"] = backend
if host:
payload["remote_host"] = host
if args.get("include"):
@@ -3028,12 +3127,20 @@ async def do_download_model(content: str, owner: Optional[str] = None) -> Dict:
sid = data.get("session_id", "?")
registered = await _cookbook_register_task(
session_id=sid, model=repo_id, host=host,
- cmd=f"hf download {repo_id}", task_type="download",
+ cmd=(f"ollama pull {repo_id}" if backend == "ollama" else f"hf download {repo_id}"),
+ task_type="download",
)
note = "" if registered else " (state-write failed — download may not show in UI)"
where = host or "local"
default_note = " (defaulted to the cookbook's selected server — pass host= or local=true to override)" if _host_defaulted else ""
- return {"output": f"Download started: {repo_id} on {where} (session: {sid}){note}{default_note}", "session_id": sid, "host": host, "exit_code": 0}
+ return {
+ "output": f"Download started: {repo_id} on {where} (session: {sid}){note}{default_note}",
+ "session_id": sid,
+ "host": host,
+ "task_type": "download",
+ "phase": "running",
+ "exit_code": 0,
+ }
return {"error": data.get("error", "Download failed"), "exit_code": 1}
except Exception as e:
return {"error": str(e), "exit_code": 1}
@@ -3102,12 +3209,28 @@ async def do_serve_model(content: str, owner: Optional[str] = None) -> Dict:
data = resp.json()
if data.get("ok"):
sid = data.get("session_id", "?")
+ endpoint_id = data.get("endpoint_id") or ""
+ if endpoint_id:
+ endpoint_added = True
+ else:
+ endpoint_meta = await _ensure_served_endpoint(model=repo_id, cmd=cmd, host=host)
+ endpoint_added = bool(endpoint_meta.get("added"))
+ endpoint_id = endpoint_meta.get("endpoint_id", "") or endpoint_id
registered = await _cookbook_register_task(
session_id=sid, model=repo_id,
host=host, cmd=cmd, task_type="serve",
+ endpoint_added=endpoint_added, endpoint_id=endpoint_id or "",
)
note = "" if registered else " (state-write failed — task may not show in UI)"
- return {"output": f"Serving {repo_id} (session: {sid}){note}", "session_id": sid, "exit_code": 0}
+ return {
+ "output": f"Serving {repo_id} (session: {sid}){note}",
+ "session_id": sid,
+ "task_type": "serve",
+ "phase": "running",
+ "host": host,
+ "endpoint_id": endpoint_id,
+ "exit_code": 0,
+ }
# FastAPI HTTPException puts the message under `detail`, not `error`.
# Surface BOTH so the agent sees "Invalid characters in cmd" (from
# _validate_serve_cmd rejecting `&&`/`source`/`cd`) instead of
@@ -3804,7 +3927,8 @@ async def do_serve_preset(content: str, owner: Optional[str] = None) -> Dict:
if env_cfg.get("gpus"): payload["gpus"] = env_cfg["gpus"]
if env_cfg.get("hf_token"): payload["hf_token"] = env_cfg["hf_token"]
if env_cfg.get("platform"): payload["platform"] = env_cfg["platform"]
- if env_cfg.get("ssh_port"): payload["ssh_port"] = env_cfg["ssh_port"]
+ if env_cfg.get("ssh_port"):
+ payload["ssh_port"] = env_cfg["ssh_port"]
try:
async with httpx.AsyncClient(timeout=30) as client:
@@ -3813,12 +3937,20 @@ async def do_serve_preset(content: str, owner: Optional[str] = None) -> Dict:
data = resp.json()
if data.get("ok"):
sid = data.get("session_id", "?")
+ endpoint_id = data.get("endpoint_id") or ""
+ if endpoint_id:
+ endpoint_added = True
+ else:
+ endpoint_meta = await _ensure_served_endpoint(model=repo_id, cmd=cmd, host=host)
+ endpoint_added = bool(endpoint_meta.get("added"))
+ endpoint_id = endpoint_meta.get("endpoint_id", "") or endpoint_id
registered = await _cookbook_register_task(
session_id=sid, model=repo_id, host=host,
cmd=cmd, task_type="serve",
+ endpoint_added=endpoint_added, endpoint_id=endpoint_id or "",
)
note = "" if registered else " (state-write failed — task may not show in UI)"
- return {"output": f"Launched preset {chosen.get('name')!r}: {repo_id} on {host or 'local'} (session: {sid}){note}", "session_id": sid, "exit_code": 0}
+ return {"output": f"Launched preset {chosen.get('name')!r}: {repo_id} on {host or 'local'} (session: {sid}){note}", "session_id": sid, "host": host, "endpoint_id": endpoint_id, "exit_code": 0}
return {"error": data.get("error", "Serve failed"), "exit_code": 1}
except Exception as e:
return {"error": str(e), "exit_code": 1}
diff --git a/src/tool_index.py b/src/tool_index.py
index 3f8010801..4eb8a51ee 100644
--- a/src/tool_index.py
+++ b/src/tool_index.py
@@ -332,6 +332,10 @@ class ToolIndex:
r"|\bat\s+\d{1,2}(?::\d{2})?\s*(?:a\.?m\.?|p\.?m\.?)\b", # at 7:30 am / at 7am
re.I,
)
+ _WEB_RE = re.compile(
+ r"https?://|www\.|\b(?:visit|open|fetch|check|read)\s+(?:this\s+)?(?:url|link|site|website|page)\b",
+ re.I,
+ )
# Keyword hints: if the query mentions these words, force-include the tools.
_KEYWORD_HINTS = {
@@ -493,6 +497,11 @@ class ToolIndex:
# the agent can actually create the cron job instead of fumbling.
if self._SCHEDULE_RE.search(ql):
base.add("manage_tasks")
+ # URL/site requests need web tools even when embedding retrieval is
+ # stubbed/unavailable. Keep this structural, not always-on, so trivial
+ # prompts do not drag web schemas into the agent context.
+ if self._WEB_RE.search(query):
+ base.update({"web_search", "web_fetch"})
return base
diff --git a/static/app.js b/static/app.js
index 8216d6485..c75070bf2 100644
--- a/static/app.js
+++ b/static/app.js
@@ -4,7 +4,6 @@
// ============================================
import Storage from './js/storage.js';
import uiModule from './js/ui.js';
-import workspaceModule from './js/workspace.js';
import fileHandlerModule from './js/fileHandler.js';
import modelsModule from './js/models.js';
import ragModule from './js/rag.js';
@@ -1555,7 +1554,6 @@ function initializeEventListeners() {
const MODE_TOOLS = [
{ btnId: 'web-toggle-btn', checkboxId: 'web-toggle', stateKey: 'web' },
{ btnId: 'bash-toggle-btn', checkboxId: 'bash-toggle', stateKey: 'bash' },
- { btnId: 'plan-toggle-btn', checkboxId: 'plan-toggle', stateKey: 'plan' },
];
function _modeKey(stateKey, mode) { return `${stateKey}_${mode}`; }
@@ -1564,9 +1562,6 @@ function initializeEventListeners() {
const state = loadToggleState();
const key = _modeKey(stateKey, mode);
if (Object.prototype.hasOwnProperty.call(state, key)) return !!state[key];
- // Plan mode is opt-in: never default it on, otherwise every agent turn
- // would be forced into planning.
- if (stateKey === 'plan') return false;
return mode === 'agent'; // default: ON in agent, OFF in chat
}
@@ -1579,7 +1574,6 @@ function initializeEventListeners() {
const TOOL_TOGGLE_TOAST_LABELS = {
web: 'Web search',
bash: 'Shell',
- plan: 'Plan mode',
};
function showToolToggleToast(stateKey, active) {
@@ -1592,8 +1586,8 @@ function initializeEventListeners() {
MODE_TOOLS.forEach(({ btnId, checkboxId, stateKey }) => {
const btn = el(btnId);
if (!btn) return;
- // Hide bash and plan buttons in chat mode
- if (mode === 'chat' && (stateKey === 'bash' || stateKey === 'plan')) {
+ // Hide bash button in chat mode
+ if (mode === 'chat' && stateKey === 'bash') {
btn.style.display = 'none';
return;
}
@@ -1614,12 +1608,10 @@ function initializeEventListeners() {
const state = loadToggleState();
let currentMode = state.mode || 'chat';
- // Immediately hide bash/plan buttons in chat mode on page load
+ // Immediately hide bash button in chat mode on page load
if (currentMode === 'chat') {
const bashBtn = el('bash-toggle-btn');
- const planBtn = el('plan-toggle-btn');
if (bashBtn) bashBtn.style.display = 'none';
- if (planBtn) planBtn.style.display = 'none';
}
function setMode(mode) {
@@ -1709,82 +1701,6 @@ function initializeEventListeners() {
}
setupToggle('web-toggle-btn', 'web-toggle', 'web');
setupToggle('bash-toggle-btn', 'bash-toggle', 'bash');
- try { workspaceModule.initWorkspace(); } catch (_) {}
- setupToggle('plan-toggle-btn', 'plan-toggle', 'plan');
-
- // Set plan mode on/off directly (checkbox + button state + saved pref) WITHOUT
- // going through the button's click handler — used by the plan menu and by the
- // "Approve & Run" flow. Going through .click() would hit the plan-menu
- // intercept below (a stored plan re-opens the menu instead of toggling), which
- // is exactly the bug that left approved plans stuck in plan mode.
- function _setPlanMode(on) {
- const btn = el('plan-toggle-btn');
- const chk = el('plan-toggle');
- const mode = (loadToggleState().mode) || 'chat';
- if (chk) chk.checked = !!on;
- if (btn) { btn.classList.toggle('active', !!on); btn.setAttribute('aria-pressed', String(!!on)); }
- saveToolPref('plan', mode, !!on);
- }
- window._setPlanMode = _setPlanMode;
-
- // ── Plan-button menu ──
- // When a plan exists for this chat, clicking the plan button opens a small
- // menu (Show plan / Plan mode on-off) instead of plain-toggling — so the plan
- // window can be re-opened and docked at any time while the agent works. With
- // no plan, the button behaves as before (one-click toggle).
- (function initPlanMenu() {
- const planBtn = el('plan-toggle-btn');
- if (!planBtn) return;
- const _hasPlan = () => { try { return !!(window._getStoredPlan && window._getStoredPlan()); } catch (_) { return false; } };
- const _close = () => { const m = document.getElementById('plan-menu'); if (m) m.remove(); };
- function _open() {
- _close();
- const planChk = el('plan-toggle');
- const on = !!(planChk && planChk.checked);
- const menu = document.createElement('div');
- menu.id = 'plan-menu';
- menu.className = 'overflow-menu plan-menu';
- menu.innerHTML =
- ''
- + '';
- document.body.appendChild(menu);
- const r = planBtn.getBoundingClientRect();
- menu.style.position = 'fixed';
- menu.style.left = Math.round(r.left) + 'px';
- menu.style.top = Math.round(r.top - menu.offsetHeight - 6) + 'px';
- menu.querySelector('[data-act="show"]').addEventListener('click', () => {
- _close();
- const txt = window._getStoredPlan ? window._getStoredPlan() : '';
- if (txt && window.planWindowModule) window.planWindowModule.openPlanWindow(txt, null);
- });
- menu.querySelector('[data-act="toggle"]').addEventListener('click', () => {
- _close();
- _setPlanMode(!on); // flip state directly (no click → no menu re-open)
- });
- // Dismiss on any outside click (capture so it beats other handlers) / Escape.
- setTimeout(() => {
- const off = (e) => {
- if (!menu.contains(e.target) && e.target !== planBtn) {
- _close(); document.removeEventListener('click', off, true); document.removeEventListener('keydown', esc, true);
- }
- };
- const esc = (e) => { if (e.key === 'Escape') { _close(); document.removeEventListener('click', off, true); document.removeEventListener('keydown', esc, true); } };
- document.addEventListener('click', off, true);
- document.addEventListener('keydown', esc, true);
- }, 0);
- }
- planBtn.addEventListener('click', (e) => {
- // With a stored plan, the button opens the menu (Show plan / toggle).
- // Without one, it falls through to the normal one-click toggle.
- if (_hasPlan()) { e.preventDefault(); e.stopImmediatePropagation(); _open(); }
- }, true); // capture phase: intercept before setupToggle's bubble handler
- })();
-
- try { workspaceModule.initWorkspace(); } catch (_) {}
// Document editor toggle (special: uses module panel, not a checkbox)
const overflowDocBtn = el('overflow-doc-btn');
diff --git a/static/index.html b/static/index.html
index 522129fe9..4ca33c072 100644
--- a/static/index.html
+++ b/static/index.html
@@ -1040,13 +1040,6 @@
RAG
-
-
-
-