From fa8c93ec0ae8a960ba29190eb34f713c76cbd7de Mon Sep 17 00:00:00 2001 From: pewdiepie-archdaemon Date: Mon, 8 Jun 2026 22:38:49 +0900 Subject: [PATCH] Cookbook UI: Ollama browser, advanced serve fold, API tokens form, diagnosis toolbar, polish MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surface a lot of accumulated cookbook + UI work as a single non-agent commit so the agent rework lands cleanly. Highlights: - Ollama as a first-class backend in the Cookbook: * Download input accepts ollama-style names (name:tag) → backend=ollama * /api/cookbook/ollama/library (cached scrape of ollama.com + curated fallback so classic models like qwen2.5 stay reachable) * "Browse Ollama library" toggle below Download with size chips * Engine=Ollama in hwfit toolbar merges the Ollama library into the main scan list as per-tag rows with the same Fit/Param/Quant/VRAM columns; click → fills Download input - API Tokens form added to Integrations panel (matching wired loadTokens()/initTokenForm() that had no HTML) - Serve panel polish: Advanced fold tightening (-8px nudges on vLLM checks, Extra args, Spec row), n_cpu_moe + Split Mode controls pulled up 8px to align with the row's checkboxes, GGUF File dropdown exposed for Ollama backend, GPU re-render on Edit serve restore, _forceBackend flag so saved serveState wins over backend detection, cookbook:servers-changed CustomEvent so panels don't need refresh - Models page redesign: Add Models row (URL + hidden API key reveal + Type select + Scan/Ollama/Key/Test/Add icon buttons), Probe All + Clear-offline buttons in Added Models toolbar, offline-pill removed (opacity already conveys state), Engine dropdown gains Ollama option - _ping_endpoint probes /v1/models then base, accepts 4xx as reachable (vLLM returns 404 on bare /v1, fully working endpoints were showing offline) - Diagnosis card: × dismiss + Copy bundle buttons restored on the serve error feedback card - Orphan tmux sweep re-enabled behind a 60s rate-limit + background Thread (off the main event loop) so dead serves get discovered - cookbook_routes auto-register watchdog: drops the endpoint if the serve session exits non-zero within the first ~3min - ollama-rocm sidecar awareness in download wrapper (`docker exec ollama-rocm ollama pull` when host ollama isn't installed) - Skill extractor sets initial_status="published" when auto_approve_skills pref is on (audit demotes later) - Skill list / model list / cookbook scan misc polish --- app.py | 4 + mcp_servers/email_server.py | 533 ++++++++++++- routes/api_token_routes.py | 2 + routes/cookbook_helpers.py | 8 +- routes/cookbook_routes.py | 1157 ++++++++++++++++++++-------- routes/hwfit_routes.py | 19 +- routes/model_routes.py | 276 +++---- services/hwfit/data/hf_models.json | 25 +- services/memory/skill_extractor.py | 15 + src/tool_implementations.py | 150 +++- static/index.html | 134 +++- static/js/admin.js | 156 +++- static/js/chatRenderer.js | 22 + static/js/cookbook-diagnosis.js | 43 +- static/js/cookbook-hwfit.js | 167 +++- static/js/cookbook.js | 512 ++++++------ static/js/cookbookDownload.js | 12 +- static/js/cookbookRunning.js | 9 + static/js/cookbookSchedule.js | 36 +- static/js/cookbookServe.js | 301 ++++---- static/js/documentLibrary.js | 7 +- static/js/emailLibrary.js | 2 +- static/js/markdown.js | 151 +++- static/js/modelPicker.js | 11 +- static/js/models.js | 9 +- static/js/settings.js | 10 +- static/js/skills.js | 8 +- static/style.css | 280 ++++++- 28 files changed, 3033 insertions(+), 1026 deletions(-) diff --git a/app.py b/app.py index 97906bd46..0af6b18ea 100644 --- a/app.py +++ b/app.py @@ -650,6 +650,10 @@ app.include_router(calendar_router) from routes.shell_routes import setup_shell_routes app.include_router(setup_shell_routes()) +# Terminal agents (tmux-backed Codex/Claude/shell sessions) +from routes.terminal_agent_routes import setup_terminal_agent_routes +app.include_router(setup_terminal_agent_routes()) + # Cookbook (model download/serve/cache, cookbook state sync) from routes.cookbook_routes import setup_cookbook_routes app.include_router(setup_cookbook_routes()) diff --git a/mcp_servers/email_server.py b/mcp_servers/email_server.py index d1c2ac07e..db731ec0f 100644 --- a/mcp_servers/email_server.py +++ b/mcp_servers/email_server.py @@ -22,6 +22,7 @@ import os import os.path from pathlib import Path from datetime import datetime, timedelta +import uuid from mcp.server import Server from mcp.server.stdio import stdio_server @@ -67,6 +68,59 @@ def _db_path() -> Path: return Path(APP_DB) +def _load_email_writing_style() -> str: + """Return the existing Settings > Email > Writing Style value.""" + try: + settings_path = DATA_DIR / "settings.json" + if not settings_path.exists(): + return "" + settings = json.loads(settings_path.read_text(encoding="utf-8")) + return str(settings.get("email_writing_style") or "").strip() + except Exception: + return "" + + +def _writing_style_guidance() -> str: + style = _load_email_writing_style() + if not style: + return ( + "No saved writing style is configured in Settings > Email > Writing Style. " + "Use a concise, natural tone and do not invent facts." + ) + return ( + "Use this saved writing style from Settings > Email > Writing Style when " + "drafting the body. It overrides generic tone guidance:\n" + f"{style}" + ) + + +def _default_document_owner() -> str | None: + """Best-effort owner for MCP-created documents. + + MCP stdio tools do not receive the browser request's authenticated user, + but the document library is owner-filtered. Stamp drafts to the configured + single/default admin so assistant-created email drafts are visible. + """ + owner = os.environ.get("ODYSSEUS_DOCUMENT_OWNER", "").strip() + if owner: + return owner + try: + auth_path = DATA_DIR / "auth.json" + if not auth_path.exists(): + return None + users = (json.loads(auth_path.read_text(encoding="utf-8")).get("users") or {}) + if not isinstance(users, dict) or not users: + return None + admins = [name for name, data in users.items() if isinstance(data, dict) and data.get("is_admin")] + if len(admins) == 1: + return admins[0] + if len(users) == 1: + return next(iter(users)) + return admins[0] if admins else next(iter(users)) + except Exception: + return None + + def _list_accounts_raw() -> list: """Return list of dicts from the email_accounts table. Empty list if table missing or empty. Never raises.""" @@ -896,6 +950,340 @@ def _send_email(to, subject, body, in_reply_to=None, references=None, cc=None, b } +def _build_email_document_content( + to, + subject, + body, + *, + cc=None, + bcc=None, + in_reply_to=None, + references=None, + source_uid=None, + source_folder=None, +): + header_lines = [f"To: {to or ''}"] + if cc: + header_lines.append(f"Cc: {cc}") + if bcc: + header_lines.append(f"Bcc: {bcc}") + header_lines.append(f"Subject: {subject or ''}") + if in_reply_to: + header_lines.append(f"In-Reply-To: {in_reply_to}") + if references: + header_lines.append(f"References: {references}") + if source_uid: + header_lines.append(f"X-Source-UID: {source_uid}") + if source_folder: + header_lines.append(f"X-Source-Folder: {source_folder}") + return "\n".join(header_lines) + "\n---\n" + (body or "") + + +def _merge_email_reply_body(existing_content: str, reply_body: str) -> str: + """Preserve email headers and quoted chain while replacing the editable reply body.""" + if "\n---\n" not in (existing_content or ""): + return reply_body or "" + head, body = existing_content.split("\n---\n", 1) + quote_markers = ( + "---------- Previous message ----------", + "-----Original Message-----", + "----- Original Message -----", + ) + quote_index = -1 + for marker in quote_markers: + idx = body.find(marker) + if idx != -1 and (quote_index == -1 or idx < quote_index): + quote_index = idx + quote = body[quote_index:].strip() if quote_index != -1 else "" + merged_body = (reply_body or "").strip() + if quote: + merged_body = f"{merged_body}\n\n{quote}" if merged_body else quote + return f"{head}\n---\n{merged_body}" + + +def _create_email_draft_document( + *, + to, + subject, + body, + title=None, + cc=None, + bcc=None, + in_reply_to=None, + references=None, + source_uid=None, + source_folder=None, + account=None, + source_message_id=None, +): + """Create an Odysseus email compose document for user review. Does not send.""" + from core.database import SessionLocal, Document, DocumentVersion + try: + from src.event_bus import fire_event + except Exception: + fire_event = None + + cfg = _load_config(account) if account else _load_config(None) + content = _build_email_document_content( + to, + subject, + body, + cc=cc, + bcc=bcc, + in_reply_to=in_reply_to, + references=references, + source_uid=source_uid, + source_folder=source_folder, + ) + doc_id = str(uuid.uuid4()) + ver_id = str(uuid.uuid4()) + doc_title = (title or subject or "Email draft").strip() or "Email draft" + doc_owner = _default_document_owner() + + db = SessionLocal() + try: + if source_uid and source_folder: + existing = ( + db.query(Document) + .filter(Document.is_active == True) + .filter(Document.language == "email") + .filter(Document.owner == doc_owner) + .filter(Document.source_email_uid == str(source_uid)) + .filter(Document.source_email_folder == source_folder) + .order_by(Document.updated_at.desc()) + .first() + ) + if existing and "\n---\n" in (existing.current_content or ""): + existing.current_content = _merge_email_reply_body(existing.current_content, body or "") + existing.version_count = (existing.version_count or 0) + 1 + ver = DocumentVersion( + id=ver_id, + document_id=existing.id, + version_number=existing.version_count, + content=existing.current_content, + summary="Updated by email MCP draft tool", + source="ai", + ) + db.add(ver) + db.commit() + if fire_event: + try: + fire_event("document_updated", doc_owner) + except Exception: + pass + return { + "draft": True, + "updated": True, + "doc_id": existing.id, + "title": existing.title, + "language": existing.language, + "account": cfg.get("account_name"), + "account_id": cfg.get("account_id"), + "to": to, + "subject": subject, + } + + doc = Document( + id=doc_id, + session_id=None, + title=doc_title, + language="email", + current_content=content, + version_count=1, + is_active=True, + owner=doc_owner, + source_email_uid=source_uid, + source_email_folder=source_folder, + source_email_account_id=cfg.get("account_id"), + source_email_message_id=source_message_id, + ) + ver = DocumentVersion( + id=ver_id, + document_id=doc_id, + version_number=1, + content=content, + summary="Created by email MCP draft tool", + source="ai", + ) + db.add(doc) + db.add(ver) + db.commit() + if fire_event: + try: + fire_event("document_created", doc_owner) + except Exception: + pass + return { + "draft": True, + "doc_id": doc_id, + "title": doc_title, + "language": "email", + "account": cfg.get("account_name"), + "account_id": cfg.get("account_id"), + "to": to, + "subject": subject, + } + finally: + db.close() + + +def _draft_reply_to_email(uid, body, folder="INBOX", reply_all=False, account=None, title=None): + """Create a threaded Odysseus reply draft document. Does not send.""" + conn = _imap_connect(account) + conn.select(folder, readonly=True) + status, msg_data = conn.uid("FETCH", _b(uid), "(RFC822)") + conn.logout() + if status != "OK" or not msg_data or not msg_data[0]: + return {"error": f"Failed to fetch email UID {uid}"} + raw = msg_data[0][1] + orig = email.message_from_bytes(raw) + + orig_subject = _decode_header(orig.get("Subject", "")) + reply_subject = orig_subject if orig_subject.lower().startswith("re:") else f"Re: {orig_subject}" + orig_message_id = orig.get("Message-ID", "") + orig_references = orig.get("References", "") + new_references = (orig_references + " " + orig_message_id).strip() if orig_references else orig_message_id + + sender = _decode_header(orig.get("From", "")) + _, sender_addr = email.utils.parseaddr(sender) + to_addrs = sender_addr + + cc = None + if reply_all: + cc_addrs = [] + cfg = _load_config(account) + own_addrs = { + (cfg.get("imap_user") or "").strip().lower(), + (cfg.get("from_address") or "").strip().lower(), + } + for header_name in ("To", "Cc"): + for _, addr in email.utils.getaddresses([orig.get(header_name, "")]): + addr_l = (addr or "").strip().lower() + if addr and addr != sender_addr and addr_l not in own_addrs: + cc_addrs.append(addr) + if cc_addrs: + cc = ", ".join(dict.fromkeys(cc_addrs)) + + return _create_email_draft_document( + to=to_addrs, + subject=reply_subject, + body=body, + title=title or reply_subject, + cc=cc, + in_reply_to=orig_message_id, + references=new_references, + source_uid=uid, + source_folder=folder, + account=account, + source_message_id=orig_message_id, + ) + + +async def _ai_draft_reply_to_email(uid, folder="INBOX", reply_all=False, account=None, title=None): + """Generate a reply with Odysseus' AI-reply prompt/style, then create a compose doc.""" + read_result = _read_email(uid=uid, folder=folder, account=account) + if "error" in read_result: + return read_result + + to_addr = read_result.get("from_address") or email.utils.parseaddr(read_result.get("from") or "")[1] + subject = read_result.get("subject") or "" + reply_subject = subject if subject.lower().startswith("re:") else f"Re: {subject}" + original_body = read_result.get("body") or "" + message_id = read_result.get("message_id") or "" + + if not original_body.strip(): + return {"error": "No email body available for AI reply"} + + try: + from routes.email_helpers import ( + _EMAIL_REPLY_SYS_PROMPT_BASE, + _apply_email_style_mechanics, + _extract_reply, + _load_settings, + ) + from src.endpoint_resolver import ( + resolve_endpoint, + resolve_utility_fallback_candidates, + resolve_chat_fallback_candidates, + ) + from src.llm_core import llm_call_async_with_fallback + except Exception as exc: + return {"error": f"AI reply helpers unavailable: {exc}"} + + settings = _load_settings() + style = settings.get("email_writing_style", "") + system_prompt = _EMAIL_REPLY_SYS_PROMPT_BASE + if style: + system_prompt += f"\n\nWRITING STYLE TO MATCH:\n{style}" + + user_msg = ( + f"Recipient: {to_addr}\nSubject: {reply_subject}\n\n" + f"Original email and any current draft:\n{original_body[:6000]}\n\n" + "Draft a reply. Return only the reply body text." + ) + + candidates = [] + seen = set() + + def _add(url, model, headers): + key = (url or "", model or "") + if not url or not model or key in seen: + return + seen.add(key) + candidates.append((url, model, headers)) + + try: + _add(*resolve_endpoint("utility", owner=None)) + except Exception: + pass + try: + _add(*resolve_endpoint("default", owner=None)) + except Exception: + pass + try: + utility_fallbacks = resolve_utility_fallback_candidates(owner=None) or [] + except TypeError: + utility_fallbacks = resolve_utility_fallback_candidates() or [] + for cand in utility_fallbacks: + _add(*cand) + try: + chat_fallbacks = resolve_chat_fallback_candidates(owner=None) or [] + except TypeError: + chat_fallbacks = resolve_chat_fallback_candidates() or [] + for cand in chat_fallbacks: + _add(*cand) + + if not candidates: + return {"error": "No LLM endpoint configured for AI reply"} + + try: + raw_reply = await llm_call_async_with_fallback( + candidates, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_msg}, + ], + temperature=0.7, + max_tokens=1024, + timeout=60, + ) + except Exception as exc: + return {"error": f"AI reply generation failed: {exc}"} + + reply = _apply_email_style_mechanics(_extract_reply(raw_reply or "")) + if not reply: + return {"error": "AI reply generation returned an empty response"} + + return _draft_reply_to_email( + uid=uid, + body=reply, + folder=folder, + reply_all=reply_all, + account=account, + title=title or reply_subject, + ) + + def _reply_to_email(uid, body, folder="INBOX", reply_all=False, account=None): """Reply to an existing email by UID. Threads via In-Reply-To/References.""" conn = None @@ -1189,6 +1577,8 @@ async def list_tools() -> list[Tool]: name="send_email", description=( "Send a new email via SMTP. Provide recipient(s), subject, and body. " + "This sends immediately; for normal assistant-written email, prefer " + "draft_email so the user can review and send from Odysseus. " "For replying to an existing thread, use reply_to_email instead. " "Pass `account` to send from a non-default mailbox." ), @@ -1205,10 +1595,35 @@ async def list_tools() -> list[Tool]: "required": ["to", "subject", "body"], }, ), + Tool( + name="draft_email", + description=( + "Create a new Odysseus email compose draft document. This DOES NOT send. " + "Use this as the default way to write an email for the user: it opens " + "a reviewable email document with To/Cc/Bcc/Subject/body, and the user " + "can edit or press Send in Odysseus. " + f"{_writing_style_guidance()}" + ), + inputSchema={ + "type": "object", + "properties": { + "to": {"type": "string", "description": "Recipient email address(es), comma-separated"}, + "subject": {"type": "string", "description": "Email subject line"}, + "body": {"type": "string", "description": "Draft body"}, + "cc": {"type": "string", "description": "CC address(es), comma-separated (optional)"}, + "bcc": {"type": "string", "description": "BCC address(es), comma-separated (optional)"}, + "title": {"type": "string", "description": "Optional Odysseus document title"}, + **ACCOUNT_PROP, + }, + "required": ["to", "subject", "body"], + }, + ), Tool( name="reply_to_email", description=( - "Reply to an existing email by UID. Automatically threads the reply with " + "Reply to an existing email by UID. This sends immediately; for normal " + "assistant-written replies, prefer draft_email_reply so the user can " + "review and send from Odysseus. Automatically threads the reply with " "In-Reply-To and References headers, prefixes 'Re:' on the subject, and " "uses the original sender as the recipient. Set reply_all=true to also CC " "the original To/Cc recipients. For follow-up 'reply ...' requests, use " @@ -1226,6 +1641,49 @@ async def list_tools() -> list[Tool]: "required": ["uid", "body"], }, ), + Tool( + name="draft_email_reply", + description=( + "Create an Odysseus email reply draft document for an existing email UID. " + "This DOES NOT send. It threads the draft with In-Reply-To/References, " + "prefills the recipient and subject, and stores source email metadata so " + "the user can review and send from the normal email composer. " + f"{_writing_style_guidance()}" + ), + inputSchema={ + "type": "object", + "properties": { + "uid": {"type": "string", "description": "Exact Email UID from list_emails/read_email; never invent UID 1"}, + "body": {"type": "string", "description": "Draft reply body text"}, + "folder": {"type": "string", "description": "IMAP folder (default: INBOX)", "default": "INBOX"}, + "reply_all": {"type": "boolean", "description": "Reply to all recipients (default: false)", "default": False}, + "title": {"type": "string", "description": "Optional Odysseus document title"}, + **ACCOUNT_PROP, + }, + "required": ["uid", "body"], + }, + ), + Tool( + name="ai_draft_email_reply", + description=( + "Generate an AI reply using Odysseus' existing AI Reply behavior, " + "including Settings > Email > Writing Style, then create an email " + "compose document for review. This DOES NOT send and does NOT save " + "to the mailbox Drafts folder. Use this when the user asks you to " + "write or draft a reply to an email without dictating the exact body." + ), + inputSchema={ + "type": "object", + "properties": { + "uid": {"type": "string", "description": "Exact Email UID from list_emails/read_email; never invent UID 1"}, + "folder": {"type": "string", "description": "IMAP folder (default: INBOX)", "default": "INBOX"}, + "reply_all": {"type": "boolean", "description": "Reply to all recipients (default: false)", "default": False}, + "title": {"type": "string", "description": "Optional Odysseus document title"}, + **ACCOUNT_PROP, + }, + "required": ["uid"], + }, + ), Tool( name="archive_email", description="Move an email out of the inbox into the Archive folder. Use after handling an email you want to keep but no longer need in the inbox.", @@ -1552,6 +2010,31 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: acct_note = f" (from {result['account']})" if result.get("account") else "" return [TextContent(type="text", text=f"Sent email to {result['to']} with subject '{result['subject']}'{acct_note}.")] + elif name == "draft_email": + to = arguments.get("to") + subject = arguments.get("subject") + body = arguments.get("body") + if not to or not subject or body is None: + return [TextContent(type="text", text="Error: to, subject, and body are required")] + result = _create_email_draft_document( + to=to, + subject=subject, + body=body, + title=arguments.get("title"), + cc=arguments.get("cc"), + bcc=arguments.get("bcc"), + account=acct, + ) + acct_note = f" from {result['account']}" if result.get("account") else "" + return [TextContent( + type="text", + text=( + f"Created Odysseus email draft `{result['title']}` " + f"(document ID: {result['doc_id']}){acct_note}. " + "It has not been sent; open the document in Odysseus to review and send." + ), + )] + elif name == "reply_to_email": uid = arguments.get("uid") body = arguments.get("body") @@ -1573,6 +2056,54 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: pass return [TextContent(type="text", text=f"Replied to UID {uid}: '{result['subject']}' → {result['to']}")] + elif name == "draft_email_reply": + uid = arguments.get("uid") + body = arguments.get("body") + if not uid or body is None: + return [TextContent(type="text", text="Error: uid and body are required")] + result = _draft_reply_to_email( + uid=uid, + body=body, + folder=arguments.get("folder", "INBOX"), + reply_all=bool(arguments.get("reply_all", False)), + account=acct, + title=arguments.get("title"), + ) + if "error" in result: + return [TextContent(type="text", text=f"Error: {result['error']}")] + acct_note = f" from {result['account']}" if result.get("account") else "" + return [TextContent( + type="text", + text=( + f"Created Odysseus reply draft `{result['title']}` for UID {uid} " + f"(document ID: {result['doc_id']}){acct_note}. " + "It has not been sent; open the document in Odysseus to review and send." + ), + )] + + elif name == "ai_draft_email_reply": + uid = arguments.get("uid") + if not uid: + return [TextContent(type="text", text="Error: uid is required")] + result = await _ai_draft_reply_to_email( + uid=uid, + folder=arguments.get("folder", "INBOX"), + reply_all=bool(arguments.get("reply_all", False)), + account=acct, + title=arguments.get("title"), + ) + if "error" in result: + return [TextContent(type="text", text=f"Error: {result['error']}")] + acct_note = f" from {result['account']}" if result.get("account") else "" + return [TextContent( + type="text", + text=( + f"Generated AI reply and created Odysseus compose draft " + f"`{result['title']}` for UID {uid} (document ID: {result['doc_id']}){acct_note}. " + "It has not been sent; open the document in Odysseus to review and send." + ), + )] + elif name == "archive_email": uid = arguments.get("uid") if not uid: diff --git a/routes/api_token_routes.py b/routes/api_token_routes.py index 97c576d15..05806e420 100644 --- a/routes/api_token_routes.py +++ b/routes/api_token_routes.py @@ -25,6 +25,8 @@ ALLOWED_SCOPES = { "calendar:write", "memory:read", "memory:write", + "cookbook:read", + "cookbook:launch", } TOKEN_PROFILES = { "chat": ["chat"], diff --git a/routes/cookbook_helpers.py b/routes/cookbook_helpers.py index 39a18f715..a450278be 100644 --- a/routes/cookbook_helpers.py +++ b/routes/cookbook_helpers.py @@ -30,8 +30,9 @@ _LOCAL_MODEL_ID_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*$") _OLLAMA_MODEL_ID_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._:/-]{0,200}$") # Include pattern is a glob: allow typical safe glyphs only. _INCLUDE_RE = re.compile(r"^[A-Za-z0-9._\-*?/\[\]]+$") -# Remote host: user@host (optionally with :port-free hostname parts). -_REMOTE_HOST_RE = re.compile(r"^[A-Za-z0-9._-]+@[A-Za-z0-9._-]+$") +# Remote host: either `user@host` or plain `host` (alias is allowed), where host +# is a safe DNS-like token or a short SSH config alias. +_REMOTE_HOST_RE = re.compile(r"^(?:[A-Za-z0-9._-]+@)?[A-Za-z0-9._-]+$") # HF tokens and API tokens are url-safe base64-like. _TOKEN_RE = re.compile(r"^[A-Za-z0-9._~+/=-]+$") # Session IDs we mint look like "cookbook-deadbeef" or "serve-deadbeef". @@ -81,7 +82,7 @@ def _validate_remote_host(v: str | None) -> str | None: if v is None or v == "": return None if not _REMOTE_HOST_RE.match(v): - raise HTTPException(400, "Invalid remote_host — must be user@host, no SSH option syntax") + raise HTTPException(400, "Invalid remote_host — must be host or user@host, no SSH option syntax") return v @@ -787,6 +788,7 @@ def _llama_cpp_rebuild_cmd() -> str: class ModelDownloadRequest(BaseModel): repo_id: str + backend: str | None = None # "hf" (default) or "ollama" include: str | None = None # glob pattern e.g. "*Q4_K_M*" hf_token: str | None = None env_prefix: str | None = None # e.g. "source ~/venv/bin/activate" diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py index 7a1ee85c6..ba950f4b7 100644 --- a/routes/cookbook_routes.py +++ b/routes/cookbook_routes.py @@ -15,26 +15,19 @@ from pathlib import Path from fastapi import APIRouter, HTTPException, Request, Depends from src.auth_helpers import require_user -from src.constants import COOKBOOK_STATE_FILE from pydantic import BaseModel from core.middleware import require_admin from core.platform_compat import ( IS_WINDOWS, - SSH_PATH_OVERRIDE, - NVIDIA_PATH_CANDIDATES, detached_popen_kwargs, find_bash, - git_bash_path, kill_process_tree, pid_alive, safe_chmod, which_tool, - translate_path, - get_wsl_windows_user_profile, ) from routes.shell_routes import TMUX_LOG_DIR -from src.constants import COOKBOOK_STATE_FILE logger = logging.getLogger(__name__) @@ -45,10 +38,8 @@ from routes.cookbook_helpers import ( _ps_squote, _bash_squote, _validate_serve_cmd, _parse_serve_phase, _safe_env_prefix, _local_tooling_path_export, _append_serve_preflight_exit_lines, _append_serve_exit_code_lines, _append_llama_cpp_linux_accel_build_lines, _cached_model_scan_script, - _append_vllm_linux_preflight_lines, _ollama_bind_from_cmd, _pip_install_fallback_chain, - _pip_install_no_cache, _user_shell_path_bootstrap, _venv_safe_local_pip_install_cmd, - _append_pip_install_runner_lines, - _diagnose_serve_output, run_ssh_command_async, + _ollama_bind_from_cmd, _pip_install_fallback_chain, _pip_install_no_cache, + _user_shell_path_bootstrap, _venv_safe_local_pip_install_cmd, ModelDownloadRequest, ServeRequest, ) @@ -63,7 +54,7 @@ _HF_TOKEN_STATUS_SNIPPET = ( def setup_cookbook_routes() -> APIRouter: router = APIRouter(tags=["cookbook"]) - _cookbook_state_path = Path(COOKBOOK_STATE_FILE) + _cookbook_state_path = Path(os.environ.get("DATA_DIR", "data")) / "cookbook_state.json" def _mask_secret(value: str) -> str: if not value: @@ -90,6 +81,127 @@ def setup_cookbook_routes() -> APIRouter: task["payload"].pop("hf_token", None) return state + def _diagnose_serve_output(text: str) -> dict | None: + """Server-side mirror of the Cookbook UI's common serve diagnoses. + + The browser uses cookbook-diagnosis.js for clickable fixes. This gives + the agent/tool path the same structured signal so it can retry with an + adjusted command instead of guessing from raw tmux output. + """ + if not text: + return None + tail = text[-6000:] + patterns = [ + ( + r"No available memory for the cache blocks|Available KV cache memory:.*-", + "No GPU memory left for KV cache after loading model.", + [ + {"label": "retry with GPU memory utilization 0.95", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.95"}, + {"label": "retry with context 2048", "op": "replace", "flag": "--max-model-len", "value": "2048"}, + ], + ), + ( + r"CUDA out of memory|torch\.cuda\.OutOfMemoryError|CUDA error: out of memory|warming up sampler|max_num_seqs.*gpu_memory_utilization", + "GPU ran out of memory during startup or warmup.", + [ + {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"}, + {"label": "retry with GPU memory utilization 0.80", "op": "replace", "flag": "--gpu-memory-utilization", "value": "0.80"}, + {"label": "retry with --enforce-eager", "op": "append", "arg": "--enforce-eager"}, + ], + ), + ( + r"not divisib|must be divisible|attention heads.*divisible", + "Tensor parallel size is incompatible with the model.", + [ + {"label": "retry with tensor parallel size 1", "op": "replace", "flag": "--tensor-parallel-size", "value": "1"}, + {"label": "retry with tensor parallel size 2", "op": "replace", "flag": "--tensor-parallel-size", "value": "2"}, + ], + ), + ( + r"KV cache.*too (small|large)|max_model_len.*exceeds|maximum.*context", + "Context length is too large for available GPU memory.", + [ + {"label": "retry with context 8192", "op": "replace", "flag": "--max-model-len", "value": "8192"}, + {"label": "retry with context 4096", "op": "replace", "flag": "--max-model-len", "value": "4096"}, + ], + ), + ( + r"enable-auto-tool-choice requires --tool-call-parser", + "Auto tool choice requires an explicit tool call parser.", + [{"label": "retry with Hermes tool parser", "op": "append", "arg": "--tool-call-parser hermes"}], + ), + ( + r"Please pass.*trust.remote.code=True|contains custom code which must be executed to correctly load|does not recognize this architecture|model type.*but Transformers does not", + "Model requires custom code or newer model support.", + [{"label": "retry with --trust-remote-code", "op": "append", "arg": "--trust-remote-code"}], + ), + ( + r"Either a revision or a version must be specified|transformers\.integrations\.hub_kernels|kernels/layer", + "vLLM/Transformers kernel package mismatch.", + [{"label": "update vLLM, Transformers, and kernels on this server", "op": "dependency", "package": "vllm transformers kernels"}], + ), + ( + r"Address already in use|bind.*address.*in use", + "Port is already in use.", + [{"label": "retry on port 8001", "op": "replace", "flag": "--port", "value": "8001"}], + ), + ( + r"No CUDA GPUs are available|no GPU.*found|CUDA_VISIBLE_DEVICES.*invalid", + "No GPUs are visible to the serve process.", + [{"label": "clear Cookbook GPU selection or choose available GPUs", "op": "settings", "field": "gpus", "value": ""}], + ), + ( + r"Failed to infer device type|NVML Shared Library Not Found|No module named 'amdsmi'|platform is not available", + "vLLM could not find a supported GPU (CUDA or ROCm). " + "This machine may have integrated or unsupported graphics only.", + [ + {"label": "switch to llama.cpp (CPU/Metal, works without a discrete GPU)", "op": "manual"}, + {"label": "switch to Ollama (CPU/Metal, works without a discrete GPU)", "op": "manual"}, + ], + ), + ( + r"vllm.*command not found|No module named vllm|ERROR: vLLM is not installed", + "vLLM is not installed or not in PATH on this server.", + [{"label": "install vLLM in Cookbook Dependencies", "op": "dependency", "package": "vllm"}], + ), + ( + r"sglang.*command not found|No module named sglang|SGLang is not installed", + "SGLang is not installed or not in PATH on this server.", + [{"label": "install SGLang in Cookbook Dependencies", "op": "dependency", "package": "sglang[all]"}], + ), + ( + r"llama-server.*command not found|llama\.cpp.*not found|No module named.*llama_cpp|No module named 'starlette_context'|git: command not found|cmake: command not found", + "llama.cpp / llama-cpp-python dependencies are missing.", + [{"label": "install llama.cpp dependencies or llama-cpp-python[server]", "op": "dependency", "package": "llama-cpp-python[server]"}], + ), + ( + r"No GGUF found on this host|no \.gguf file|No GGUF file found", + "No GGUF file found for this model on this host. The llama.cpp backend needs a .gguf file.", + [{"label": "download a GGUF build of this model (repo name usually ends in -GGUF, file like Q4_K_M.gguf)", "op": "manual"}], + ), + ( + r"No module named 'torch'|No module named torch|No module named 'diffusers'|No module named diffusers", + "Diffusion serving requires PyTorch and diffusers.", + [{"label": "install diffusers[torch] in Cookbook Dependencies", "op": "dependency", "package": "diffusers[torch]"}], + ), + ( + r"403 Forbidden|401 Unauthorized|Access to model.*is restricted|gated repo|not in the authorized list|awaiting a review", + "Model access is gated or unauthorized.", + [{"label": "set HF token and request model access on HuggingFace", "op": "manual"}], + ), + ] + for pattern, message, suggestions in patterns: + if re.search(pattern, tail, re.I): + return {"message": message, "suggestions": suggestions} + if re.search(r"Traceback \(most recent call last\)", tail, re.I) and not re.search( + r"Application startup complete|GET /v1/|Uvicorn running on", tail, re.I + ): + return { + "message": "Python traceback detected during serve startup.", + "suggestions": [{"label": "inspect traceback and retry with adjusted backend/settings", "op": "manual"}], + } + return None + def _state_for_client(state): """Return cookbook state without raw secrets for browser clients.""" _strip_task_secrets(state) @@ -183,7 +295,6 @@ def setup_cookbook_routes() -> APIRouter: safe_chmod(key_path.with_suffix(".pub"), 0o644) return {"ok": True, "public_key": _read_cookbook_public_key()} - def _needs_binary(cmd: str, binary: str) -> bool: return bool(re.search(rf"(^|[\s;&|()]){re.escape(binary)}($|[\s;&|()])", cmd or "")) @@ -244,8 +355,8 @@ def setup_cookbook_routes() -> APIRouter: # POSIX form + shell-quoting so drive paths / spaces survive. inner = TMUX_LOG_DIR / f"{session_id}_run.sh" inner.write_text("\n".join(bash_lines) + "\n", encoding="utf-8") - lp = shlex.quote(git_bash_path(log_path)) - ip = shlex.quote(git_bash_path(inner)) + lp = shlex.quote(log_path.as_posix()) + ip = shlex.quote(inner.as_posix()) script_path = TMUX_LOG_DIR / f"{session_id}.sh" script_path.write_text( f"bash {ip} > {lp} 2>&1\n", @@ -286,24 +397,33 @@ def setup_cookbook_routes() -> APIRouter: require_admin(request) # Defence-in-depth: even though this endpoint is admin-gated, refuse # values that would land in shell contexts with metacharacters. - _validate_repo_id(req.repo_id) - _validate_include(req.include) + backend = (req.backend or "").strip().lower() + is_ollama_download = backend == "ollama" or ("/" not in req.repo_id and ":" in req.repo_id) + if is_ollama_download: + _validate_serve_model_id(req.repo_id) + req.include = None + req.local_dir = None + else: + _validate_repo_id(req.repo_id) + _validate_include(req.include) _validate_remote_host(req.remote_host) req.ssh_port = _validate_ssh_port(req.ssh_port) req.local_dir = _validate_local_dir(req.local_dir) - req.hf_token = req.hf_token or _load_stored_hf_token() + req.hf_token = "" if is_ollama_download else (req.hf_token or _load_stored_hf_token()) _validate_token(req.hf_token) TMUX_LOG_DIR.mkdir(parents=True, exist_ok=True) session_id = f"cookbook-{uuid.uuid4().hex[:8]}" wrapper_script = TMUX_LOG_DIR / f"{session_id}.sh" - # When a download directory is set, target a per-model subfolder under it - # (/) so the flat-directory cache scan lists it as its own - # model. Without it, hf/snapshot_download falls back to the HF cache. - _dl_short = req.repo_id.split("/")[-1] if "/" in req.repo_id else req.repo_id - _dl_base = (req.local_dir.rstrip("/") + "/" + _dl_short) if req.local_dir else None - _dl_shell = _shell_path(_dl_base) if _dl_base else None # for hf CLI / bash - _dl_pyarg = (", local_dir=os.path.expanduser(" + repr(_dl_base) + ")") if _dl_base else "" + # Custom download dir: point the HF cache at /hub via env vars + # (HF_HOME + HUGGINGFACE_HUB_CACHE) instead of --local-dir. local_dir + # produces a flat layout (//) and the local-dir + # bookkeeping files (.cache/huggingface/.gitignore.lock), and it + # also breaks robust resume on flaky transfers — the blob-based hub + # cache survives SSL ReadError mid-stream by reusing .incomplete, + # local_dir does not. See issue #2722. + _dl_hf_home_shell = _shell_path(req.local_dir.rstrip("/")) if req.local_dir else None + _dl_pyarg = "" # snapshot_download honors the env vars too — no kwarg needed # Build the hf download command. Redirection to suppress the interactive # "update available? [Y/n]" prompt is added per-platform further down @@ -311,8 +431,7 @@ def setup_cookbook_routes() -> APIRouter: hf_cmd = f"hf download {req.repo_id}" if req.include: hf_cmd += f" --include '{req.include}'" - if _dl_shell: - hf_cmd += f" --local-dir {_dl_shell}" + ollama_cmd = f"ollama pull {shlex.quote(req.repo_id)}" # Build the shell wrapper — runs hf download directly in tmux (which is a TTY) # No script/tee needed — we'll use tmux capture-pane to read output @@ -320,8 +439,15 @@ def setup_cookbook_routes() -> APIRouter: lines.extend(_user_shell_path_bootstrap()) if req.hf_token: lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'") + if _dl_hf_home_shell and not is_ollama_download: + # Make hf download / snapshot_download honor the chosen dir via the + # standard HF cache (gives us the models--org--name/blobs/... layout + # with resumable .incomplete blobs). + lines.append(f"export HF_HOME={_dl_hf_home_shell}") + lines.append(f"export HUGGINGFACE_HUB_CACHE={_dl_hf_home_shell}/hub") + lines.append(f"export HF_HUB_CACHE={_dl_hf_home_shell}/hub") # Ensure pip-user scripts (e.g. hf CLI installed via --user) are on PATH - lines.append('export PATH="$HOME/.local/bin:$PATH"') + lines.append('export PATH="$HOME/.local/bin:$HOME/bin:/opt/homebrew/bin:/usr/local/bin:$PATH"') # When Odysseus runs from a venv (e.g. native macOS install), put its bin # on PATH so the tmux shell finds the bundled `hf`/`python3` without an # activated venv. Local bash runs only — meaningless over SSH. @@ -332,14 +458,25 @@ def setup_cookbook_routes() -> APIRouter: # throughput. Retries set disable_hf_transfer to fall back to the plain, # slower-but-reliable downloader (resumes cleanly from the .incomplete files). # Use `python3 -m pip` not `pip` — macOS has no bare `pip` command. - lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', upgrade=True)}") - if req.disable_hf_transfer: - lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0") - lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4") + if is_ollama_download: + lines.append('if command -v ollama >/dev/null 2>&1; then') + lines.append(f' ODYSSEUS_OLLAMA_PULL_CMD={shlex.quote(ollama_cmd)}') + lines.append('elif command -v docker >/dev/null 2>&1; then') + lines.append(' ODYSSEUS_OLLAMA_CONTAINER="$(docker ps --format \'{{.Names}}\' 2>/dev/null | grep -E \'^(ollama-rocm|ollama-test)$\' | head -1)"') + lines.append(' if [ -n "$ODYSSEUS_OLLAMA_CONTAINER" ]; then') + lines.append(f' ODYSSEUS_OLLAMA_PULL_CMD={shlex.quote("docker exec ${ODYSSEUS_OLLAMA_CONTAINER} " + ollama_cmd)}') + lines.append(' fi') + lines.append('fi') + lines.append('if [ -z "$ODYSSEUS_OLLAMA_PULL_CMD" ]; then echo "ERROR: Ollama not found on this server. Install Ollama or start an ollama-rocm/ollama-test container."; exit 127; fi') else: - lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer')}") - lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1") - lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8") + lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', upgrade=True)}") + if req.disable_hf_transfer: + lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0") + lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4") + else: + lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer')}") + lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1") + lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8") remote = req.remote_host # None for local is_windows = req.platform == "windows" @@ -361,37 +498,48 @@ def setup_cookbook_routes() -> APIRouter: ps_lines = [] ps_lines.append('$sessionDir = "$env:TEMP\\odysseus-sessions"') ps_lines.append('New-Item -ItemType Directory -Force -Path $sessionDir | Out-Null') - ps_lines.append('$env:PYTHONIOENCODING = "utf-8"') - ps_lines.append('$env:PYTHONUTF8 = "1"') if req.hf_token: ps_lines.append(f"$env:HF_TOKEN = '{_ps_squote(req.hf_token)}'") + if req.local_dir and not is_ollama_download: + # Mirror the bash branch — point the HF cache at the user's dir + # via env vars instead of --local-dir, so resume works on flaky + # transfers (issue #2722). + _dl_ps = _ps_squote(req.local_dir.rstrip("/")) + ps_lines.append(f"$env:HF_HOME = '{_dl_ps}'") + ps_lines.append(f"$env:HUGGINGFACE_HUB_CACHE = '{_dl_ps}/hub'") + ps_lines.append(f"$env:HF_HUB_CACHE = '{_dl_ps}/hub'") if req.env_prefix: ps_lines.append(_safe_env_prefix(req.env_prefix)) - # Try hf CLI, fall back to Python huggingface_hub, then auto-install - ps_lines.append('try {{') - ps_lines.append(' $hfPath = Get-Command hf -ErrorAction SilentlyContinue') - ps_lines.append(' if ($hfPath) {{') - # Pipe $null to stdin to suppress interactive "update available? [Y/n]" prompt - ps_lines.append(f' $null | {hf_cmd}') - ps_lines.append(' }} else {{') - ps_lines.append(' python -c "import huggingface_hub" 2>$null') - ps_lines.append(' if ($LASTEXITCODE -eq 0) {{') - ps_lines.append(' Write-Host "hf CLI not found, using Python huggingface_hub..."') - ps_lines.append(' python -m pip install -q hf_transfer 2>$null') - ps_lines.append(' $env:HF_HUB_ENABLE_HF_TRANSFER = "1"') - ps_lines.append(f" python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers=8)\"") - ps_lines.append(' }} else {{') - ps_lines.append(' Write-Host "Installing huggingface-hub..."') - ps_lines.append(' python -m pip install -q huggingface-hub hf_transfer') - ps_lines.append(' $env:HF_HUB_ENABLE_HF_TRANSFER = "1"') - ps_lines.append(f" python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers=8)\"") - ps_lines.append(' }}') - ps_lines.append(' }}') - ps_lines.append(' if ($LASTEXITCODE -eq 0) {{ Write-Host ""; Write-Host "DOWNLOAD_OK" }}') - ps_lines.append(' else {{ Write-Host ""; Write-Host "DOWNLOAD_FAILED (exit $LASTEXITCODE)" }}') - ps_lines.append('}} catch {{') - ps_lines.append(' Write-Host ""; Write-Host "DOWNLOAD_FAILED ($_)"') - ps_lines.append('}}') + if is_ollama_download: + ps_lines.append('if (-not (Get-Command ollama -ErrorAction SilentlyContinue)) { Write-Host "ERROR: Ollama not found. Install from https://ollama.com/download/windows"; exit 127 }') + ps_lines.append(f"$null | ollama pull '{_ps_squote(req.repo_id)}'") + ps_lines.append('if ($LASTEXITCODE -eq 0) { Write-Host ""; Write-Host "DOWNLOAD_OK" } else { Write-Host ""; Write-Host "DOWNLOAD_FAILED (exit $LASTEXITCODE)" }') + else: + # Try hf CLI, fall back to Python huggingface_hub, then auto-install + ps_lines.append('try {{') + ps_lines.append(' $hfPath = Get-Command hf -ErrorAction SilentlyContinue') + ps_lines.append(' if ($hfPath) {{') + # Pipe $null to stdin to suppress interactive "update available? [Y/n]" prompt + ps_lines.append(f' $null | {hf_cmd}') + ps_lines.append(' }} else {{') + ps_lines.append(' python -c "import huggingface_hub" 2>$null') + ps_lines.append(' if ($LASTEXITCODE -eq 0) {{') + ps_lines.append(' Write-Host "hf CLI not found, using Python huggingface_hub..."') + ps_lines.append(' python -m pip install -q hf_transfer 2>$null') + ps_lines.append(' $env:HF_HUB_ENABLE_HF_TRANSFER = "1"') + ps_lines.append(f" python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers=8)\"") + ps_lines.append(' }} else {{') + ps_lines.append(' Write-Host "Installing huggingface-hub..."') + ps_lines.append(' python -m pip install -q huggingface-hub hf_transfer') + ps_lines.append(' $env:HF_HUB_ENABLE_HF_TRANSFER = "1"') + ps_lines.append(f" python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download('{req.repo_id}'{_dl_pyarg}, max_workers=8)\"") + ps_lines.append(' }}') + ps_lines.append(' }}') + ps_lines.append(' if ($LASTEXITCODE -eq 0) {{ Write-Host ""; Write-Host "DOWNLOAD_OK" }}') + ps_lines.append(' else {{ Write-Host ""; Write-Host "DOWNLOAD_FAILED (exit $LASTEXITCODE)" }}') + ps_lines.append('}} catch {{') + ps_lines.append(' Write-Host ""; Write-Host "DOWNLOAD_FAILED ($_)"') + ps_lines.append('}}') ps_lines.append(f'Remove-Item -Force "$HOME\\{remote_runner}" -ErrorAction SilentlyContinue') runner_path = TMUX_LOG_DIR / f"{session_id}_run.ps1" runner_path.write_text("\r\n".join(ps_lines) + "\r\n", encoding="utf-8") @@ -422,6 +570,10 @@ def setup_cookbook_routes() -> APIRouter: runner_lines.append("deactivate 2>/dev/null; hash -r") if req.hf_token: runner_lines.append(f"export HF_TOKEN='{_bash_squote(req.hf_token)}'") + if _dl_hf_home_shell and not is_ollama_download: + runner_lines.append(f"export HF_HOME={_dl_hf_home_shell}") + runner_lines.append(f"export HUGGINGFACE_HUB_CACHE={_dl_hf_home_shell}/hub") + runner_lines.append(f"export HF_HUB_CACHE={_dl_hf_home_shell}/hub") if req.env_prefix: runner_lines.append(_safe_env_prefix(req.env_prefix)) else: @@ -432,42 +584,67 @@ def setup_cookbook_routes() -> APIRouter: 'done' ) # Ensure pip-user scripts (e.g. hf CLI installed via --user) are on PATH - runner_lines.append('export PATH="$HOME/.local/bin:$PATH"') + runner_lines.append('export PATH="$HOME/.local/bin:$HOME/bin:/opt/homebrew/bin:/usr/local/bin:$PATH"') # Install hf CLI + optional hf_transfer best-effort. Retries disable # hf_transfer because the Rust parallel path is fast but has been # flaky near the end of very large multi-file downloads. - # The helper tries active pip first, then guarded user-site fallbacks. - runner_lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', python_cmd='pip', upgrade=True)}") - if req.disable_hf_transfer: - runner_lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0") - runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4") + # Use --break-system-packages on PEP-668 systems (Arch, newer Debian) so it doesn't bail. + if is_ollama_download: + runner_lines.append('if command -v ollama >/dev/null 2>&1; then') + runner_lines.append(f' ODYSSEUS_OLLAMA_PULL_CMD={shlex.quote(ollama_cmd)}') + runner_lines.append('elif command -v docker >/dev/null 2>&1; then') + runner_lines.append(' ODYSSEUS_OLLAMA_CONTAINER="$(docker ps --format \'{{.Names}}\' 2>/dev/null | grep -E \'^(ollama-rocm|ollama-test)$\' | head -1)"') + runner_lines.append(' if [ -n "$ODYSSEUS_OLLAMA_CONTAINER" ]; then') + runner_lines.append(f' ODYSSEUS_OLLAMA_PULL_CMD={shlex.quote("docker exec ${ODYSSEUS_OLLAMA_CONTAINER} " + ollama_cmd)}') + runner_lines.append(' fi') + runner_lines.append('fi') + runner_lines.append('if [ -z "$ODYSSEUS_OLLAMA_PULL_CMD" ]; then echo "ERROR: Ollama not found on this server. Install Ollama or start an ollama-rocm/ollama-test container."; exit 127; fi') else: - runner_lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer', python_cmd='pip')}") - runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1") - runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8") - # Surface whether the HF token actually reached THIS server, so a gated - # download's "not authorized" failure can be told apart from a missing - # token (the token is masked — we only print applied / not-set). - runner_lines.append(_HF_TOKEN_STATUS_SNIPPET) - # Try hf CLI first, fall back to Python huggingface_hub, then auto-install - runner_lines.append('if command -v hf &>/dev/null; then') - # < /dev/null suppresses interactive "update available? [Y/n]" prompt - runner_lines.append(f' {hf_cmd} < /dev/null') - runner_lines.append('elif python3 -c "import huggingface_hub" 2>/dev/null; then') - runner_lines.append(' echo "hf CLI not found, using Python huggingface_hub..."') - runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={4 if req.disable_hf_transfer else 8})"') - runner_lines.append('else') - runner_lines.append(' echo "Installing huggingface-hub and dependencies..."') - runner_lines.append(' pip install --no-deps -q huggingface-hub 2>/dev/null') - if req.disable_hf_transfer: - runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests 2>/dev/null') - runner_lines.append(' export HF_HUB_ENABLE_HF_TRANSFER=0') + runner_lines.append(f"command -v hf >/dev/null 2>&1 || {_pip_install_fallback_chain('huggingface_hub', python_cmd='pip', upgrade=True)}") + if req.disable_hf_transfer: + runner_lines.append("export HF_HUB_ENABLE_HF_TRANSFER=0") + runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=4") + else: + runner_lines.append(f"python3 -c 'import hf_transfer' 2>/dev/null || {_pip_install_fallback_chain('hf_transfer', python_cmd='pip')}") + runner_lines.append("python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1") + runner_lines.append("export HF_HUB_DOWNLOAD_MAX_WORKERS=8") + # Surface whether the HF token actually reached THIS server, so a gated + # download's "not authorized" failure can be told apart from a missing + # token (the token is masked — we only print applied / not-set). + runner_lines.append(_HF_TOKEN_STATUS_SNIPPET) + # Wrap the download in a retry loop. Large HF/Ollama transfers can + # hit transient network failures; both backends resume cached partials. + mw = 4 if req.disable_hf_transfer else 8 + runner_lines.append('_max_retries=10; _attempt=0; _ec=0') + runner_lines.append('while [ $_attempt -lt $_max_retries ]; do') + runner_lines.append(' _attempt=$((_attempt+1))') + if is_ollama_download: + runner_lines.append(' eval "$ODYSSEUS_OLLAMA_PULL_CMD" < /dev/null') else: - runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests hf_transfer 2>/dev/null') - runner_lines.append(" python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1") - runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={4 if req.disable_hf_transfer else 8})"') - runner_lines.append('fi') - runner_lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi') + runner_lines.append(' if command -v hf &>/dev/null; then') + runner_lines.append(f' {hf_cmd} < /dev/null') + runner_lines.append(' elif python3 -c "import huggingface_hub" 2>/dev/null; then') + runner_lines.append(' [ $_attempt -eq 1 ] && echo "hf CLI not found, using Python huggingface_hub..."') + runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={mw})"') + runner_lines.append(' else') + runner_lines.append(' echo "Installing huggingface-hub and dependencies..."') + runner_lines.append(' pip install --no-deps -q huggingface-hub 2>/dev/null') + if req.disable_hf_transfer: + runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests 2>/dev/null') + runner_lines.append(' export HF_HUB_ENABLE_HF_TRANSFER=0') + else: + runner_lines.append(' pip install -q filelock fsspec packaging pyyaml tqdm typer httpx requests hf_transfer 2>/dev/null') + runner_lines.append(" python3 -c 'import hf_transfer' 2>/dev/null && export HF_HUB_ENABLE_HF_TRANSFER=1") + runner_lines.append(f' python3 -c "import os; from huggingface_hub import snapshot_download; snapshot_download(\'{req.repo_id}\'{_dl_pyarg}, max_workers={mw})"') + runner_lines.append(' fi') + runner_lines.append(' _ec=$?') + runner_lines.append(' if [ $_ec -eq 0 ]; then break; fi') + runner_lines.append(' if [ $_attempt -lt $_max_retries ]; then') + runner_lines.append(' echo ""; echo "Download attempt $_attempt failed (exit $_ec) — retrying in 30s..."') + runner_lines.append(' sleep 30') + runner_lines.append(' fi') + runner_lines.append('done') + runner_lines.append('if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec after $_attempt attempts)"; fi') runner_lines.append(f"rm -f {remote_runner}") runner_lines.append('exec "${SHELL:-/bin/bash}"') runner_path = TMUX_LOG_DIR / f"{session_id}_run.sh" @@ -493,23 +670,30 @@ def setup_cookbook_routes() -> APIRouter: lines.append("deactivate 2>/dev/null; hash -r") # Show whether the HF token reached this run (masked) — tells a gated # "not authorized" failure apart from a missing token. - lines.append(_HF_TOKEN_STATUS_SNIPPET) - if IS_WINDOWS: - # Detached path: no controlling TTY, so skip `< /dev/null` - # (handled by Popen stdin=DEVNULL) and don't keep a shell open. - lines.append(hf_cmd) - lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi') - else: - # < /dev/null suppresses interactive "update available? [Y/n]" prompt - lines.append(f"{hf_cmd} < /dev/null") - lines.append('_ec=$?; if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec)"; fi') + if not is_ollama_download: + lines.append(_HF_TOKEN_STATUS_SNIPPET) + # Retry loop — same rationale as the remote-bash path. Issue #2722. + _hf_invoke = 'eval "$ODYSSEUS_OLLAMA_PULL_CMD" < /dev/null' if is_ollama_download else (hf_cmd if IS_WINDOWS else f"{hf_cmd} < /dev/null") + lines.append('_max_retries=10; _attempt=0; _ec=0') + lines.append('while [ $_attempt -lt $_max_retries ]; do') + lines.append(' _attempt=$((_attempt+1))') + lines.append(f' {_hf_invoke}') + lines.append(' _ec=$?') + lines.append(' if [ $_ec -eq 0 ]; then break; fi') + lines.append(' if [ $_attempt -lt $_max_retries ]; then') + lines.append(' echo ""; echo "Download attempt $_attempt failed (exit $_ec) — retrying in 30s..."') + lines.append(' sleep 30') + lines.append(' fi') + lines.append('done') + lines.append('if [ $_ec -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; else echo ""; echo "DOWNLOAD_FAILED (exit $_ec after $_attempt attempts)"; fi') + if not IS_WINDOWS: lines.append(f"rm -f '{wrapper_script}'") lines.append('exec "${SHELL:-/bin/bash}"') wrapper_script.write_text("\n".join(lines) + "\n", encoding="utf-8") wrapper_script.chmod(0o755) setup_cmd = None if IS_WINDOWS else f"tmux new-session -d -s {session_id} {shlex.quote(str(wrapper_script))}" - logger.info(f"Model download: {req.repo_id} (include={req.include}, session={session_id}, remote={remote})") + logger.info(f"Model download: {req.repo_id} (backend={'ollama' if is_ollama_download else 'hf'}, include={req.include}, session={session_id}, remote={remote})") logger.info(f"Download setup_cmd: {setup_cmd}") if setup_cmd is None: @@ -564,35 +748,24 @@ def setup_cookbook_routes() -> APIRouter: for d in model_dir.split(','): d = d.strip() if d: - translated_d = translate_path(d) if not host else d - model_dirs.append(translated_d) - win_hf_hub = None - if not host: - win_profile = get_wsl_windows_user_profile() - win_hf_hub = os.path.join(win_profile, ".cache", "huggingface", "hub") if win_profile else None - - paths_code = _cached_model_scan_script(model_dirs, win_hf_hub) + model_dirs.append(d) + paths_code = _cached_model_scan_script(model_dirs) scan_py = TMUX_LOG_DIR / "scan_cache.py" scan_py.write_text(paths_code, encoding="utf-8") - scan_payload = scan_py.read_bytes() if host: + _pf = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else "" if platform == "windows": - remote_cmd = "python -" + # Windows: use 'python' and pipe via stdin with double-quote wrapping + cmd = f'ssh {_pf}{host} "python -" < \'{scan_py}\'' else: - # POSIX: use 'python3' if available, fall back to 'python'; throw if neither is found. - remote_cmd = ( - "if command -v python3 >/dev/null 2>&1; then python3 -; " - "elif command -v python >/dev/null 2>&1; then python -; " - "else echo \"python3/python not found\" >&2; exit 127; fi" - ) - rc, stdout_b, stderr_b = await run_ssh_command_async( - host, - ssh_port, - remote_cmd, - timeout=60, - stdin_data=scan_payload, + cmd = f"ssh {_pf}{host} 'python3 -' < '{scan_py}'" + proc = await asyncio.create_subprocess_shell( + cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + cwd=str(Path.home()), ) else: # LOCAL scan: use sys.executable (the venv Python Odysseus is already @@ -612,7 +785,7 @@ def setup_cookbook_routes() -> APIRouter: stderr=asyncio.subprocess.PIPE, cwd=str(Path.home()), ) - stdout_b, stderr_b = await asyncio.wait_for(proc.communicate(), timeout=60) + stdout_b, stderr_b = await asyncio.wait_for(proc.communicate(), timeout=60) models = [] try: @@ -752,6 +925,100 @@ def setup_cookbook_routes() -> APIRouter: return p return None + async def _serve_crash_watchdog( + endpoint_id: str, + session_id: str, + remote: str | None, + ssh_port: str | None, + is_windows: bool, + ) -> None: + """Drop a freshly-registered endpoint when the cookbook serve dies early. + + The runner script always emits ``=== Process exited with code N ===`` + when the launched cmd terminates (success or failure). We poll the + tmux pane periodically; on a non-zero exit detected within the watch + window, the endpoint row is deleted so the picker doesn't keep a + dead model around. A zero exit (rare for a long-running serve, but + possible for fast-failing builds that the runner reports as code 0) + and "missing exit marker" both leave the endpoint alone — that's + the loading-but-not-yet-bound state, which the probe-marks-offline + logic already handles. + + Times are picked to outlast realistic vLLM load times (Qwen3.5-122B + takes ~3 min to load) without burning resources on a stuck-forever + wait. After the last check, the watchdog gives up — the picker's + per-endpoint probe takes over from there. + """ + # Cumulative wait points: 25 s, 60 s, 2 min, 5 min. + _waits = [25, 35, 60, 180] + # Tmux capture-pane equivalent of the polling path used elsewhere in + # this file. Build it once and reuse on each tick. Skip the watchdog + # entirely on native-Windows local runs (no tmux). The Windows + # detached-process path writes its log to a known file and has its + # own lifecycle tracking; punting here keeps the code simple. + local_win = is_windows and not remote + if local_win: + return + if remote: + ssh_args = ["ssh"] + if ssh_port and ssh_port != "22": + ssh_args.extend(["-p", str(ssh_port)]) + capture_cmd = ssh_args + [remote, "tmux", "capture-pane", "-t", session_id, "-p", "-S", "-200"] + else: + capture_cmd = ["tmux", "capture-pane", "-t", session_id, "-p", "-S", "-200"] + + _exit_re = re.compile(r"=== Process exited with code (-?\d+) ===") + for wait_s in _waits: + await asyncio.sleep(wait_s) + try: + proc = await asyncio.create_subprocess_exec( + *capture_cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.DEVNULL, + ) + stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=8) + output = stdout.decode("utf-8", errors="replace") + except Exception as e: + logger.debug(f"crash-watchdog: capture-pane failed (will retry): {e!r}") + continue + # Last occurrence wins — a serve that exits/restarts under the + # runner's "exec bash -i" trail will emit multiple markers; the + # most-recent code is the one that matters. + matches = list(_exit_re.finditer(output)) + if not matches: + continue + try: + exit_code = int(matches[-1].group(1)) + except (ValueError, IndexError): + continue + if exit_code == 0: + # Exit 0 on a long-running serve is unusual (a normal "loaded + # then ready" path keeps the process alive) but it happens for + # commands like "ollama pull" the user might launch through + # the same form. Don't drop the endpoint on a clean exit; + # let the probe layer mark it offline if nothing's listening. + logger.info(f"crash-watchdog: serve {session_id} exited cleanly (0); leaving endpoint {endpoint_id}") + return + # Non-zero exit — drop the endpoint. + try: + from core.database import SessionLocal as _SL, ModelEndpoint as _ME + db = _SL() + try: + ep = db.query(_ME).filter(_ME.id == endpoint_id).first() + if ep: + logger.info( + f"crash-watchdog: dropping endpoint {endpoint_id} " + f"({ep.name} @ {ep.base_url}) — serve exited {exit_code}" + ) + db.delete(ep) + db.commit() + finally: + db.close() + except Exception as e: + logger.warning(f"crash-watchdog: endpoint cleanup failed: {e!r}") + return + logger.debug(f"crash-watchdog: no exit marker for {session_id} within window; leaving endpoint {endpoint_id}") + def _auto_register_llm_endpoint(req: ServeRequest, remote: str | None) -> str | None: """Register a freshly-served LLM as a model endpoint so it appears in the model picker without a manual /setup step — the text-model sibling of @@ -763,6 +1030,10 @@ def setup_cookbook_routes() -> APIRouter: probing /v1/models and dims the endpoint until the server is reachable, so registering immediately (before the server finishes loading) is safe. """ + logger.info( + f"_auto_register_llm_endpoint: ENTRY repo_id={req.repo_id!r} " + f"remote={remote!r} cmd_prefix={req.cmd[:80]!r}" + ) import re from core.database import SessionLocal, ModelEndpoint @@ -787,16 +1058,20 @@ def setup_cookbook_routes() -> APIRouter: else: port = 8080 # llama.cpp's llama-server default — the Apple Silicon path - # Determine host (mirrors the image path: SSH alias for remote serves). - # For local serves while Odysseus runs inside Docker, "localhost" - # resolves to the container itself — useless. Use host.docker.internal - # which compose maps to the actual host, matching what /setup adds - # for Ollama by hand. + # Determine host. The cookbook tmux for `local=true` serves runs INSIDE + # the odysseus container — so the right URL for the in-container + # backend to reach it is `localhost`, NOT `host.docker.internal` + # (the latter points at the docker HOST, which doesn't have a server + # on that port). The previous host.docker.internal fallback only made + # sense for /setup-added external services like systemd Ollama on the + # host — and those go through manual setup, not this auto-register + # code path. For remote serves we still use the SSH host alias. if remote: host = remote.split("@")[-1] if "@" in remote else remote + elif re.search(r"\bdocker\s+exec\s+(?:ollama-rocm|ollama-test)\b", req.cmd or ""): + host = "host.docker.internal" else: - from routes.model_routes import _docker_host_gateway_reachable - host = "host.docker.internal" if _docker_host_gateway_reachable() else "localhost" + host = "localhost" base_url = f"http://{host}:{port}/v1" @@ -805,7 +1080,9 @@ def setup_cookbook_routes() -> APIRouter: # If the serve command opts models into OpenAI tool-calling, record it so # agent_loop trusts emitted tool_calls instead of the name heuristic. + is_ollama_endpoint = "ollama" in (req.cmd or "").lower() supports_tools = True if "--enable-auto-tool-choice" in req.cmd else None + pinned_models = [req.repo_id] if is_ollama_endpoint and req.repo_id else [] db = SessionLocal() try: @@ -815,14 +1092,43 @@ def setup_cookbook_routes() -> APIRouter: existing.is_enabled = True existing.model_type = "llm" existing.name = display_name + if is_ollama_endpoint: + existing.endpoint_kind = "ollama" + if pinned_models: + existing.cached_models = json.dumps(pinned_models) + existing.pinned_models = json.dumps(pinned_models) if supports_tools is not None: existing.supports_tools = supports_tools - # Wipe stale model lists so the picker re-probes and discovers - # the newly-served model instead of showing the old one. - existing.cached_models = None - existing.hidden_models = None db.commit() logger.info(f"Updated existing local model endpoint: {base_url}") + # Re-probe so cached_models matches what the server actually + # serves right now (the URL may have stayed the same but the + # model behind it changed across launches). + try: + from routes.model_routes import _probe_endpoint + import json as _json2 + probed = _probe_endpoint(base_url, existing.api_key, timeout=5) + if probed: + existing.cached_models = _json2.dumps(probed) + db.commit() + except Exception as _pe: + logger.warning(f"Re-probe failed for {base_url}: {_pe!r}") + # Sweep stale dupes: other endpoints with the same display name + # at DIFFERENT URLs (likely failed earlier-attempt ports) get + # deleted so the picker doesn't show an offline ghost next to + # the working one. Only sweeps endpoints whose id starts with + # `local-` so we never touch a user's hand-added DeepSeek/OpenAI/ + # etc. entry with a coincidentally matching name. + stale = (db.query(ModelEndpoint) + .filter(ModelEndpoint.name == display_name) + .filter(ModelEndpoint.base_url != base_url) + .filter(ModelEndpoint.id.like("local-%")) + .all()) + for s in stale: + logger.info(f"Sweeping stale local endpoint {s.id} ({s.base_url})") + db.delete(s) + if stale: + db.commit() return existing.id ep_id = f"local-{uuid.uuid4().hex[:8]}" @@ -833,11 +1139,42 @@ def setup_cookbook_routes() -> APIRouter: api_key=None, is_enabled=True, model_type="llm", + endpoint_kind="ollama" if is_ollama_endpoint else "auto", + cached_models=json.dumps(pinned_models) if pinned_models else None, + pinned_models=json.dumps(pinned_models) if pinned_models else None, supports_tools=supports_tools, ) db.add(ep) db.commit() logger.info(f"Auto-registered local model endpoint: {display_name} @ {base_url}") + # Same sweep on first-register path: drop any pre-existing local-* + # endpoints with this display name pointed elsewhere. + stale = (db.query(ModelEndpoint) + .filter(ModelEndpoint.name == display_name) + .filter(ModelEndpoint.id != ep_id) + .filter(ModelEndpoint.id.like("local-%")) + .all()) + for s in stale: + logger.info(f"Sweeping stale local endpoint {s.id} ({s.base_url})") + db.delete(s) + if stale: + db.commit() + # Probe /v1/models NOW and write cached_models so the chat + # picker actually shows the model on the next /api/models + # call. Without this immediate probe, the endpoint has empty + # cached_models until the next background refresh fires (up + # to a minute later) and the picker shows nothing — even + # though the endpoint is in the DB and the server is up. + try: + from routes.model_routes import _probe_endpoint + import json as _json2 + probed = _probe_endpoint(base_url, None, timeout=5) + if probed: + ep.cached_models = _json2.dumps(probed) + db.commit() + logger.info(f"Auto-register: probed {len(probed)} models @ {base_url}") + except Exception as _pe: + logger.warning(f"Auto-register: probe-after-create failed for {base_url}: {_pe!r}") return ep_id except Exception as e: logger.error(f"Failed to auto-register local model endpoint: {e}") @@ -877,27 +1214,11 @@ def setup_cookbook_routes() -> APIRouter: in_venv=sys.prefix != sys.base_prefix, ) is_pip_install = bool(req.cmd and "pip install" in req.cmd) - remote = req.remote_host - is_windows = req.platform == "windows" - local_windows = IS_WINDOWS and not remote - if is_windows or local_windows: - if req.cmd.startswith("python3 "): - req.cmd = "python " + req.cmd[len("python3 "):] - if is_pip_install and ("llama-cpp-python" in req.cmd or "llama_cpp" in req.cmd) and (is_windows or local_windows): - if "--extra-index-url" not in req.cmd: - req.cmd += " --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu" - if is_pip_install: # Keep big dependency wheel builds (vLLM, …) off the home filesystem's # pip cache so they don't fail mid-build with "No space left" (#1219) # and leave the dep installed-but-unusable (#1459). req.cmd = _pip_install_no_cache(req.cmd) - # Accept common aliases and enforce server extras for llama-cpp so - # `python -m llama_cpp.server` has all runtime dependencies. - req.cmd = re.sub(r"(?=!~,` for version specifiers. # v2 review HIGH-14: tightened from the previous regex which @@ -920,7 +1241,12 @@ def setup_cookbook_routes() -> APIRouter: # Otherwise the runner script picks one at runtime and `_auto_register` # below still registers the stale 11434 default — which on a host with # a systemd ollama lands on the wrong (unreachable-from-docker) service. - if "ollama" in req.cmd and "OLLAMA_HOST=" not in req.cmd: + # Match "ollama serve" as a phrase (with optional flags after), not + # any substring containing "ollama" — otherwise commands like + # `docker exec ollama-test ollama-import …` get wrapped as if they + # were native `ollama serve`, prepending OLLAMA_HOST=… and then + # running the ollama-not-found preflight which exits 127. + if re.search(r"\bollama\s+serve\b", req.cmd) and "OLLAMA_HOST=" not in req.cmd: _ollama_bind_host = "0.0.0.0" if remote else "127.0.0.1" _ollama_chosen_port = _pick_free_port_for_ollama( remote, req.ssh_port, start_port=11434, max_offset=10, @@ -950,8 +1276,6 @@ def setup_cookbook_routes() -> APIRouter: ps_lines = [] ps_lines.append('$sessionDir = "$env:TEMP\\odysseus-sessions"') ps_lines.append('New-Item -ItemType Directory -Force -Path $sessionDir | Out-Null') - ps_lines.append('$env:PYTHONIOENCODING = "utf-8"') - ps_lines.append('$env:PYTHONUTF8 = "1"') if req.hf_token: ps_lines.append(f"$env:HF_TOKEN = '{_ps_squote(req.hf_token)}'") if req.gpus: @@ -970,7 +1294,7 @@ def setup_cookbook_routes() -> APIRouter: ps_lines.append('try { python -c "import llama_cpp" 2>$null } catch {}') ps_lines.append('if ($LASTEXITCODE -ne 0) {') ps_lines.append(' Write-Host "Installing llama-cpp-python..."') - ps_lines.append(' python -m pip install llama-cpp-python[server] --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu') + ps_lines.append(' python -m pip install llama-cpp-python[server]') ps_lines.append('}') elif "vllm" in req.cmd: ps_lines.append('Write-Host "ERROR: vLLM is not supported on Windows. Use Ollama or llama.cpp instead."') @@ -1045,58 +1369,46 @@ def setup_cookbook_routes() -> APIRouter: # ollama is found (otherwise macOS falls back to a slow source build). # /opt/homebrew = Apple Silicon, /usr/local = Intel; harmless on Linux. runner_lines.append('export PATH="$HOME/.local/bin:$HOME/bin:$HOME/llama.cpp/build/bin:/opt/homebrew/bin:/usr/local/bin:$PATH"') - if local_windows: - # LOCAL Windows: no native source compilation (no cmake/compiler on Git Bash). - # Just check python bindings (using native `python` binary) and fall back to pip install. - runner_lines.append('if ! command -v llama-server &>/dev/null && ! python -c "import llama_cpp" 2>/dev/null; then') - runner_lines.append(' echo "llama-server not found — installing Python bindings..."') - runner_lines.append(f" {_pip_install_fallback_chain('llama-cpp-python[server]', python_cmd='python')} || true") - runner_lines.append('fi') - runner_lines.append('if ! command -v llama-server &>/dev/null && ! python -c "import llama_cpp" 2>/dev/null; then') - runner_lines.append(' echo "ERROR: llama.cpp serving is not available after install attempts."') - runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127') - runner_lines.append('fi') - else: - runner_lines.append('if [ -d /data/data/com.termux ]; then') - runner_lines.append(' # Termux: no native build — use the Python bindings (CPU).') - runner_lines.append(' if ! python3 -c "import llama_cpp" 2>/dev/null; then') - runner_lines.append(' pkg install -y cmake 2>/dev/null') - runner_lines.append(' pip install numpy diskcache jinja2 2>/dev/null') - runner_lines.append(' CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_LLAMAFILE=OFF" pip install \'llama-cpp-python[server]\' --no-build-isolation --no-cache-dir 2>&1 || true') - runner_lines.append(' fi') - runner_lines.append('elif ! command -v llama-server &>/dev/null; then') - runner_lines.append(' echo "Native llama-server not found — building from source (one-time, may take a few minutes)..."') - runner_lines.append(' mkdir -p ~/bin') - runner_lines.append(' cd ~ && [ -d llama.cpp ] || git clone --depth 1 https://github.com/ggml-org/llama.cpp') - # Build with the right accelerator: Metal on macOS (llama.cpp - # enables it automatically, no flag), CUDA on Linux when present, - # else a plain CPU build. nproc is Linux-only — fall back to - # `sysctl hw.ncpu` on macOS. (Tip: `brew install llama.cpp` ships - # a prebuilt llama-server and skips this whole source build.) - runner_lines.append(' NPROC="$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)"') - runner_lines.append(' if [ "$(uname -s)" = "Darwin" ]; then') - runner_lines.append(' command -v cmake >/dev/null 2>&1 || echo "WARNING: cmake not found — install it with: brew install cmake (or: brew install llama.cpp for a prebuilt llama-server)."') - # Start from a clean cache: a prior failed configure (e.g. a CUDA - # attempt) poisons build/CMakeCache.txt, so a plain `cmake -B build` - # would reuse the bad settings and fail again. CMAKE_BUILD_TYPE is - # explicit so the binary is optimized (Metal auto-enables on macOS). - runner_lines.append(' cd ~/llama.cpp && rm -rf build && cmake -B build -DCMAKE_BUILD_TYPE=Release \\') - runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\') - runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server') - runner_lines.append(' else') - _append_llama_cpp_linux_accel_build_lines(runner_lines) - runner_lines.append(' fi') - # If the native build failed, fall back to the Python bindings. - runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then') - runner_lines.append(' echo "llama-server build failed — installing Python bindings as fallback..."') - runner_lines.append(f" {_pip_install_fallback_chain('llama-cpp-python[server]', python_cmd='pip')} || true") - runner_lines.append(' fi') - runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then') - runner_lines.append(' echo "ERROR: llama.cpp serving is not available after install/build attempts."') - runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127') - runner_lines.append(' fi') - runner_lines.append('fi') - elif "ollama" in req.cmd: + runner_lines.append('if [ -d /data/data/com.termux ]; then') + runner_lines.append(' # Termux: no native build — use the Python bindings (CPU).') + runner_lines.append(' if ! python3 -c "import llama_cpp" 2>/dev/null; then') + runner_lines.append(' pkg install -y cmake 2>/dev/null') + runner_lines.append(' pip install numpy diskcache jinja2 2>/dev/null') + runner_lines.append(' CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_LLAMAFILE=OFF" pip install \'llama-cpp-python[server]\' --no-build-isolation --no-cache-dir 2>&1 || true') + runner_lines.append(' fi') + runner_lines.append('elif ! command -v llama-server &>/dev/null; then') + runner_lines.append(' echo "Native llama-server not found — building from source (one-time, may take a few minutes)..."') + runner_lines.append(' mkdir -p ~/bin') + runner_lines.append(' cd ~ && [ -d llama.cpp ] || git clone --depth 1 https://github.com/ggml-org/llama.cpp') + # Build with the right accelerator: Metal on macOS (llama.cpp + # enables it automatically, no flag), CUDA on Linux when present, + # else a plain CPU build. nproc is Linux-only — fall back to + # `sysctl hw.ncpu` on macOS. (Tip: `brew install llama.cpp` ships + # a prebuilt llama-server and skips this whole source build.) + runner_lines.append(' NPROC="$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)"') + runner_lines.append(' if [ "$(uname -s)" = "Darwin" ]; then') + runner_lines.append(' command -v cmake >/dev/null 2>&1 || echo "WARNING: cmake not found — install it with: brew install cmake (or: brew install llama.cpp for a prebuilt llama-server)."') + # Start from a clean cache: a prior failed configure (e.g. a CUDA + # attempt) poisons build/CMakeCache.txt, so a plain `cmake -B build` + # would reuse the bad settings and fail again. CMAKE_BUILD_TYPE is + # explicit so the binary is optimized (Metal auto-enables on macOS). + runner_lines.append(' cd ~/llama.cpp && rm -rf build && cmake -B build -DCMAKE_BUILD_TYPE=Release \\') + runner_lines.append(' && cmake --build build -j"$NPROC" --target llama-server \\') + runner_lines.append(' && ln -sf ~/llama.cpp/build/bin/llama-server ~/bin/llama-server') + runner_lines.append(' else') + _append_llama_cpp_linux_accel_build_lines(runner_lines) + runner_lines.append(' fi') + runner_lines.append(' # If the native build failed, fall back to the Python bindings.') + runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then') + runner_lines.append(' echo "llama-server build failed — installing Python bindings as fallback..."') + runner_lines.append(f" {_pip_install_fallback_chain('llama-cpp-python[server]', python_cmd='pip')} || true") + runner_lines.append(' fi') + runner_lines.append(' if ! command -v llama-server &>/dev/null && ! python3 -c "import llama_cpp" 2>/dev/null; then') + runner_lines.append(' echo "ERROR: llama.cpp serving is not available after install/build attempts."') + runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127') + runner_lines.append(' fi') + runner_lines.append('fi') + elif re.search(r"\bollama\s+serve\b", req.cmd): handled_ollama_serve = True _ollama_default_host = "0.0.0.0" if remote else "127.0.0.1" _ollama_host, _ollama_port = _ollama_bind_from_cmd( @@ -1117,23 +1429,13 @@ def setup_cookbook_routes() -> APIRouter: runner_lines.append(' ODYSSEUS_OLLAMA_PORT="$_ody_try_port"') runner_lines.append(' break') runner_lines.append(' fi') - runner_lines.append(' echo "[odysseus] Ollama API ready on port ${ODYSSEUS_OLLAMA_PORT}: ${ODYSSEUS_OLLAMA_URL}"') - runner_lines.append(' echo "[odysseus] This task is monitoring an existing Ollama server; stopping it here will not stop an external Docker/system service."') - if local_windows: - # Windows detached process has no TTY; exec bash -i crashes. - # Keep the monitoring task alive with a sleep loop. - runner_lines.append(' while true; do sleep 60; done') - else: - runner_lines.append(' exec bash -i') - runner_lines.append('fi') + runner_lines.append(' exec 3<&-; exec 3>&-') + runner_lines.append('done') runner_lines.append('if ! command -v ollama &>/dev/null; then') runner_lines.append(' echo "ERROR: Ollama not found on this server. Install it from https://ollama.com/download or `curl -fsSL https://ollama.com/install.sh | sh`."') runner_lines.append(' echo') runner_lines.append(' echo "=== Process exited with code 127 ==="') - if local_windows: - runner_lines.append(' exit 127') - else: - runner_lines.append(' exec bash -i') + runner_lines.append(' exec bash -i') runner_lines.append('fi') runner_lines.append('ODYSSEUS_OLLAMA_URL="http://${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}"') if remote and _ollama_host in ("0.0.0.0", "::"): @@ -1141,20 +1443,24 @@ def setup_cookbook_routes() -> APIRouter: runner_lines.append('echo "[odysseus] Ollama has no built-in authentication; expose this only on a trusted LAN/VPN or provide an explicit OLLAMA_HOST with your own access controls."') runner_lines.append('echo "Starting ollama server on ${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}..."') runner_lines.append('OLLAMA_HOST="${ODYSSEUS_OLLAMA_HOST}:${ODYSSEUS_OLLAMA_PORT}" ollama serve') - if local_windows: - _append_serve_exit_code_lines(runner_lines, keep_shell_open=False) - else: - runner_lines.append('_ody_exit=$?') - runner_lines.append('echo') - runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="') - runner_lines.append('exec bash -i') + runner_lines.append('_ody_exit=$?') + runner_lines.append('echo') + runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="') + runner_lines.append('exec bash -i') elif "vllm serve" in req.cmd: # vLLM is CUDA/ROCm-only and does not run on macOS at all. runner_lines.append('if [ "$(uname -s)" = "Darwin" ]; then') runner_lines.append(' echo "ERROR: vLLM does not run on macOS. Use Ollama or llama.cpp (Metal) instead."') runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=1') runner_lines.append('fi') - _append_vllm_linux_preflight_lines(runner_lines) + # Put ~/.local/bin on PATH first — without a venv, vllm installs + # there via --user and the non-login serve shell otherwise can't + # find the `vllm` CLI ("command not found"). Mirrors llama.cpp above. + runner_lines.append('export PATH="$HOME/.local/bin:$PATH"') + runner_lines.append('if ! command -v vllm &>/dev/null; then') + runner_lines.append(' echo "ERROR: vLLM is not installed."') + runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127') + runner_lines.append('fi') elif "sglang.launch_server" in req.cmd: runner_lines.append('export PATH="$HOME/.local/bin:$PATH"') runner_lines.append('if ! command -v sglang &>/dev/null; then') @@ -1173,15 +1479,30 @@ def setup_cookbook_routes() -> APIRouter: runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127') runner_lines.append('fi') - if not handled_ollama_serve: + handled_ollama_sidecar_probe = False + if (not handled_ollama_serve + and re.search(r"\bdocker\s+exec\s+(?:ollama-rocm|ollama-test)\s+ollama\s+show\b", req.cmd or "")): + handled_ollama_sidecar_probe = True _append_serve_preflight_exit_lines( runner_lines, keep_shell_open=not local_windows, ) - if is_pip_install: - _append_pip_install_runner_lines(runner_lines, req.cmd) - else: - runner_lines.append(req.cmd) + runner_lines.append(req.cmd) + runner_lines.append('_ody_exit=$?') + runner_lines.append('echo') + runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="') + runner_lines.append('if [ "$_ody_exit" -eq 0 ]; then') + runner_lines.append(' echo "[odysseus] Ollama sidecar model is available; keeping Cookbook task attached to the persistent Ollama daemon."') + runner_lines.append(' while true; do sleep 3600; done') + runner_lines.append('fi') + runner_lines.append('exec bash -i') + + if not handled_ollama_serve and not handled_ollama_sidecar_probe: + _append_serve_preflight_exit_lines( + runner_lines, + keep_shell_open=not local_windows, + ) + runner_lines.append(req.cmd) if local_windows: # Detached background process — no interactive shell to keep open. # Print the exit marker the status poller looks for, then stop. @@ -1263,6 +1584,26 @@ def setup_cookbook_routes() -> APIRouter: elif not is_pip_install: endpoint_id = _auto_register_llm_endpoint(req, remote) + # Crash watchdog: the auto-register above writes the endpoint row + # IMMEDIATELY (before the server has even bound its port) so the + # picker shows the model as it warms up. When the serve process + # crashes right at startup (missing module, bad cmd, port collision, + # ModuleNotFoundError on llama_cpp, etc.), the endpoint is left + # dangling — every subsequent chat returns 503 or an empty response. + # Schedule a background task to read the tmux output for the + # "=== Process exited with code N ===" marker the runner emits; + # if N != 0 within the watch window, delete the endpoint we just + # created. Skipped for diffusion (different image-endpoint cleanup + # path) and pip-install tasks (no endpoint to drop). + if endpoint_id and not is_diffusion and not is_pip_install: + asyncio.create_task(_serve_crash_watchdog( + endpoint_id=endpoint_id, + session_id=session_id, + remote=remote, + ssh_port=req.ssh_port, + is_windows=is_windows, + )) + # Log to assistant try: from src.assistant_log import log_to_assistant @@ -1342,8 +1683,8 @@ def setup_cookbook_routes() -> APIRouter: cmd = f"ssh {pf}{host} '{setup_script}'" else: # Linux: auto-install tmux (via whichever package manager is available) - # and huggingface_hub + hf_transfer (falling back to --user, then - # guarded --break-system-packages on PEP-668 locked distros). + # and huggingface_hub + hf_transfer (falling back to --user/--break-system-packages + # on PEP-668 locked distros like Arch / newer Debian). setup_script = ( # Install tmux if missing — try common package managers; skip if no sudo "if ! command -v tmux >/dev/null 2>&1; then " @@ -1355,15 +1696,10 @@ def setup_cookbook_routes() -> APIRouter: " fi; " "fi; " "command -v tmux >/dev/null 2>&1 || echo 'WARNING: tmux missing and auto-install failed (need passwordless sudo). Install manually.'; " - # Install Python bits. Try system install first; fall back to --user, - # then use --break-system-packages only when pip supports it. + # Install Python bits. Try system install first; fall back to --user --break-system-packages on PEP 668 systems. "pip install -q huggingface_hub hf_transfer 2>/dev/null || " - "pip install --user -q huggingface_hub hf_transfer 2>/dev/null || " - "( pip install --help 2>/dev/null | grep -q -- --break-system-packages && " - "pip install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null ) || " - "pip3 install --user -q huggingface_hub hf_transfer 2>/dev/null || " - "( pip3 install --help 2>/dev/null | grep -q -- --break-system-packages && " - "pip3 install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null ); " + "pip install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null || " + "pip3 install --user --break-system-packages -q huggingface_hub hf_transfer 2>/dev/null; " "python3 -c 'from huggingface_hub import snapshot_download; print(\"OK\")'" ) cmd = f"ssh {pf}{host} '{setup_script}'" @@ -1386,38 +1722,11 @@ def setup_cookbook_routes() -> APIRouter: async def _run_nvidia_smi(query: str, host: str | None, ssh_port: str | None, timeout: int = 8): """Run nvidia-smi locally or over SSH. Returns (stdout, error_or_None).""" if host: - candidates = [query] - stripped = query.strip() - if stripped.startswith("nvidia-smi "): - args = stripped[len("nvidia-smi "):] - candidates.append( - "bash -lc " - + shlex.quote( - f"{SSH_PATH_OVERRIDE}" - f"nvidia-smi {args}" - ) - ) - for nvidia_path in NVIDIA_PATH_CANDIDATES: - candidates.append(f"{nvidia_path} {args}") - - last_err = "nvidia-smi failed" - for candidate in candidates: - try: - rc, stdout, stderr = await run_ssh_command_async( - host, - ssh_port, - candidate, - connect_timeout=5, - timeout=timeout, - ) - except asyncio.TimeoutError: - return None, "nvidia-smi timed out" - if rc == 0: - return stdout.decode("utf-8", errors="replace"), None - err = (stderr.decode("utf-8", errors="replace") or "").strip()[:200] - if err: - last_err = err - return None, last_err + pf = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else "" + cmd = f"ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no {pf}{host} '{query}'" + proc = await asyncio.create_subprocess_shell( + cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE + ) else: proc = await asyncio.create_subprocess_exec( *shlex.split(query), @@ -1996,30 +2305,58 @@ def setup_cookbook_routes() -> APIRouter: return {"models": out} - # Rate-limit for the orphan-tmux adoption sweep. The UI polls - # tasks/status every ~3s; we don't want to SSH every host on every - # poll. 20s is fast enough that a model the agent launched in the - # background shows up "almost immediately" in the UI without being - # wasteful. + # Rate-limit for the orphan-tmux adoption sweep. 60s interval so SSH + # work is genuinely sparse even on an actively-polled cookbook page. _last_orphan_sweep_ts = [0.0] - _ORPHAN_SWEEP_MIN_INTERVAL_S = 20.0 + _ORPHAN_SWEEP_MIN_INTERVAL_S = 60.0 + # Concurrency guard so two requests racing don't both spawn a sweep. + _orphan_sweep_inflight = [False] def _maybe_sweep_orphans(tasks: list, state: dict) -> None: """Scan each configured cookbook server for `serve-*` tmux sessions the cookbook doesn't know about and adopt them into state.tasks. - Writes are conditional: if no orphans are found, nothing is touched. - Rate-limited so polling UIs don't trigger SSH on every refresh. + Heavy SSH work runs in a background thread via asyncio.to_thread so + it never blocks the request that triggered it. Was previously + disabled because the sync implementation pegged uvicorn CPU during + active cookbook polling — re-enabled now with the work pushed off + the event loop and a slower (60s) cadence. """ import time as _time - import subprocess - logger.info(f"_maybe_sweep_orphans: entered, last_ts={_last_orphan_sweep_ts[0]}") now = _time.monotonic() + if _orphan_sweep_inflight[0]: + return if now - _last_orphan_sweep_ts[0] < _ORPHAN_SWEEP_MIN_INTERVAL_S: - logger.info(f"_maybe_sweep_orphans: rate-limited, {now - _last_orphan_sweep_ts[0]:.1f}s since last") return _last_orphan_sweep_ts[0] = now + _orphan_sweep_inflight[0] = True + # Snapshot inputs so the worker doesn't race with state mutations. + try: + tasks_snap = list(tasks or []) + except Exception: + tasks_snap = [] + state_snap = state if isinstance(state, dict) else {} + # Caller is _cookbook_tasks_status_sync (sync context, no event + # loop). Use a plain background thread — no asyncio needed. + import threading + def _run_sweep() -> None: + try: + _sync_sweep_orphans(tasks_snap, state_snap) + except Exception as _e: + logger.warning(f"orphan sweep thread failed: {_e!r}") + finally: + _orphan_sweep_inflight[0] = False + try: + threading.Thread(target=_run_sweep, daemon=True, name="orphan-sweep").start() + except Exception as _e: + logger.warning(f"orphan sweep thread spawn failed: {_e!r}") + _orphan_sweep_inflight[0] = False + return + + def _sync_sweep_orphans(tasks: list, state: dict) -> None: + """The actual sync sweep — never call this on the event loop.""" + import subprocess env = state.get("env") if isinstance(state, dict) else {} servers = env.get("servers") if isinstance(env, dict) else [] logger.info(f"orphan sweep starting: {len(servers) if isinstance(servers, list) else 0} server(s), known_sids={len([t for t in tasks if isinstance(t, dict) and t.get('sessionId')])}") @@ -2143,6 +2480,121 @@ def setup_cookbook_routes() -> APIRouter: except Exception as e: logger.warning(f"orphan sweep: state write failed: {e}") + # In-memory cache for the Ollama library scrape. ollama.com is a public + # site, but it doesn't expose a stable JSON listing — we fetch the HTML + # search page and regex out the model cards. Cached for 1 h so a busy + # cookbook view doesn't hammer the site on every render. + _ollama_library_cache: dict = {"models": [], "fetched_at": 0.0, "error": None} + + _OLLAMA_FALLBACK_LIBRARY = [ + {"name": "qwen2.5", "description": "Qwen2.5 series — strong general/coding model from Alibaba.", "sizes": ["0.5b", "1.5b", "3b", "7b", "14b", "32b", "72b"]}, + {"name": "qwen2.5-coder", "description": "Code-specialized Qwen2.5 family.", "sizes": ["0.5b", "1.5b", "3b", "7b", "14b", "32b"]}, + {"name": "qwen3", "description": "Qwen3 — newer Alibaba family with hybrid reasoning.", "sizes": ["0.6b", "1.7b", "4b", "8b", "14b", "32b"]}, + {"name": "llama3.2", "description": "Meta Llama 3.2 instruct (and tiny / vision variants).", "sizes": ["1b", "3b", "11b", "90b"]}, + {"name": "llama3.1", "description": "Meta Llama 3.1 instruct.", "sizes": ["8b", "70b", "405b"]}, + {"name": "llama3.3", "description": "Meta Llama 3.3 70B instruct.", "sizes": ["70b"]}, + {"name": "gemma3", "description": "Google Gemma 3 — multimodal capable open-weights.", "sizes": ["1b", "4b", "12b", "27b"]}, + {"name": "gemma2", "description": "Google Gemma 2 instruct.", "sizes": ["2b", "9b", "27b"]}, + {"name": "mistral", "description": "Mistral 7B instruct — small, fast generalist.", "sizes": ["7b"]}, + {"name": "mistral-nemo", "description": "Mistral NeMo 12B instruct.", "sizes": ["12b"]}, + {"name": "mistral-small", "description": "Mistral Small 22B / 24B instruct.", "sizes": ["22b", "24b"]}, + {"name": "mixtral", "description": "Mistral MoE 8x7B / 8x22B.", "sizes": ["8x7b", "8x22b"]}, + {"name": "phi3", "description": "Microsoft Phi-3 small / medium.", "sizes": ["mini", "medium"]}, + {"name": "phi4", "description": "Microsoft Phi-4 14B.", "sizes": ["14b"]}, + {"name": "deepseek-r1", "description": "DeepSeek R1 reasoning model (distilled variants).", "sizes": ["1.5b", "7b", "8b", "14b", "32b", "70b"]}, + {"name": "deepseek-v3", "description": "DeepSeek V3 MoE 671B (huge — needs serious VRAM).", "sizes": ["671b"]}, + {"name": "codellama", "description": "Meta Code Llama instruct family.", "sizes": ["7b", "13b", "34b", "70b"]}, + {"name": "starcoder2", "description": "BigCode StarCoder2 — code completion.", "sizes": ["3b", "7b", "15b"]}, + {"name": "deepseek-coder-v2", "description": "DeepSeek Coder V2 — code MoE.", "sizes": ["16b", "236b"]}, + {"name": "nomic-embed-text", "description": "Embedding model — text vector encoder.", "sizes": ["latest"]}, + {"name": "mxbai-embed-large", "description": "Embedding model — Mixedbread large.", "sizes": ["latest"]}, + {"name": "llava", "description": "LLaVA multimodal vision-language model.", "sizes": ["7b", "13b", "34b"]}, + {"name": "minicpm-v", "description": "MiniCPM-V multimodal.", "sizes": ["8b"]}, + {"name": "command-r", "description": "Cohere Command R — RAG-oriented.", "sizes": ["35b"]}, + {"name": "command-r-plus", "description": "Cohere Command R+ — larger RAG model.", "sizes": ["104b"]}, + {"name": "qwq", "description": "Qwen QwQ reasoning preview.", "sizes": ["32b"]}, + {"name": "smollm2", "description": "HuggingFaceTB SmolLM2 — tiny capable models.", "sizes": ["135m", "360m", "1.7b"]}, + {"name": "granite3.1-dense", "description": "IBM Granite 3.1 dense instruct.", "sizes": ["2b", "8b"]}, + {"name": "nemotron", "description": "NVIDIA Nemotron 70B.", "sizes": ["70b"]}, + {"name": "olmo2", "description": "AI2 OLMo 2 open-weights.", "sizes": ["7b", "13b"]}, + ] + + @router.get("/api/cookbook/ollama/library") + async def ollama_library(refresh: int = 0, request: Request = None, owner: str = Depends(require_user)): + """List popular Ollama library models for the Browse picker. + + Tries a 1-hour-cached fetch of ollama.com/library, falls back to a + curated hard-coded list so the picker always renders something.""" + import time as _time + import httpx as _httpx + TTL = 3600.0 + now = _time.time() + if refresh or (now - _ollama_library_cache["fetched_at"]) > TTL or not _ollama_library_cache["models"]: + models: list[dict] = [] + err = None + try: + async with _httpx.AsyncClient(timeout=8, follow_redirects=True) as client: + resp = await client.get( + "https://ollama.com/search?sort=popular", + headers={"User-Agent": "odysseus-cookbook/1.0"}, + ) + if resp.status_code == 200: + html = resp.text + # ollama.com renders each model card as a single anchor: + # + # The description + sizes live inside that anchor. Pull + # the whole block then extract pieces individually. + block_re = re.compile( + r']*href="/library/([A-Za-z0-9._-]+)"[^>]*>(.*?)', + re.DOTALL, + ) + desc_re = re.compile(r']*>([^<]{4,400})

', re.DOTALL) + # Size tags on ollama.com cards look like "0.5b", "14b", + # "8x7b", "27b". Pulled from short -wrapped chips. + size_re = re.compile(r'>\s*(\d+(?:\.\d+)?(?:x\d+)?[bBmM])\s*<') + seen: set[str] = set() + for bm in block_re.finditer(html): + name = bm.group(1).strip() + if name in seen: + continue + seen.add(name) + body = bm.group(2) + dm = desc_re.search(body) + desc = (dm.group(1).strip() if dm else "").replace("\n", " ") + sizes_raw = size_re.findall(body) + # Dedup sizes preserving order + sizes: list[str] = [] + for s in sizes_raw: + s_low = s.lower() + if s_low not in sizes: + sizes.append(s_low) + models.append({"name": name, "description": desc, "sizes": sizes}) + if len(models) >= 80: + break + else: + err = f"HTTP {resp.status_code}" + except Exception as e: + err = str(e)[:160] + # Merge curated fallback so classics (qwen2.5, llama3, deepseek-r1, + # …) stay reachable even when ollama.com's front page is dominated + # by brand-new releases the user might not be looking for. + live_names = {m["name"] for m in models} + for fb in _OLLAMA_FALLBACK_LIBRARY: + if fb["name"] not in live_names: + models.append(fb) + if not models: + models = list(_OLLAMA_FALLBACK_LIBRARY) + if err is None: + err = "parsed 0 results — using fallback list" + _ollama_library_cache["models"] = models + _ollama_library_cache["fetched_at"] = now + _ollama_library_cache["error"] = err + return { + "models": _ollama_library_cache["models"], + "fetched_at": _ollama_library_cache["fetched_at"], + "error": _ollama_library_cache["error"], + } + @router.get("/api/cookbook/tasks/status") async def cookbook_tasks_status(request: Request): """Check status of all active cookbook tmux sessions. @@ -2180,13 +2632,39 @@ def setup_cookbook_routes() -> APIRouter: "inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));" "sys.exit(0 if ok and not inc else 1)" ) - if remote_host: - cmd = ["python3", "-c", py, repo_id] - else: - # Local Windows: python3 can hit the Microsoft Store stub. Use the - # real Python Odysseus is running under (guaranteed to exist). - import sys as _sys_local - cmd = [_sys_local.executable, "-c", py, repo_id] + cmd = ["python3", "-c", py, repo_id] + try: + if remote_host: + ssh_base = ["ssh"] + if ssh_port and ssh_port != "22": + ssh_base.extend(["-p", str(ssh_port)]) + shell_cmd = " ".join(shlex.quote(x) for x in cmd) + proc = subprocess.run(ssh_base + [remote_host, shell_cmd], timeout=12, capture_output=True) + else: + proc = subprocess.run(cmd, timeout=12, capture_output=True) + return proc.returncode == 0 + except Exception: + return False + + def _download_cache_incomplete(repo_id: str, remote_host: str = "", ssh_port: str = "") -> bool: + """Best-effort check for resumable HF partial blobs. + + A lost SSH/tmux session can leave a real download still incomplete. + Treat any *.incomplete blob as stronger evidence than stale + "100%" lines in the captured pane output. + """ + if not repo_id or "/" not in repo_id: + return False + py = ( + "import os,sys;" + "repo=sys.argv[1];" + "base=os.environ.get('HUGGINGFACE_HUB_CACHE') or os.path.join(os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface')), 'hub');" + "d=os.path.join(base,'models--'+repo.replace('/','--'));" + "blobs=os.path.join(d,'blobs');" + "inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));" + "sys.exit(0 if inc else 1)" + ) + cmd = ["python3", "-c", py, repo_id] try: if remote_host: ssh_base = ["ssh"] @@ -2333,28 +2811,43 @@ def setup_cookbook_routes() -> APIRouter: except Exception: pass else: - try: - alive = subprocess.run(check_cmd, timeout=10, capture_output=True) - is_alive = alive.returncode == 0 - except Exception: + # Skip the live SSH check entirely for tasks already in a + # terminal state — they won't change, and 10s timeouts + # stacked per task were the dominant cost of this whole + # status endpoint (3+ minute stalls with ~8 accumulated + # stopped tasks). The agent's `list_served_models` call + # was blocking the chat stream every time. + _task_status = (task.get("status") or "").lower() + if _task_status in {"stopped", "done", "completed", + "crashed", "error", "failed", + "ended", "killed"}: is_alive = False - - # Capture last lines for progress. Prefer the "Downloading" line - # (real aggregate bytes) over "Fetching N files" (whole-file count that - # lags with hf_transfer). Falls back to the true last line otherwise. - if is_alive: + # Keep the persisted output_tail for the UI — it's + # what the agent uses to diagnose past failures. + full_snapshot = (task.get("output") or "")[-12000:] + else: try: - cap = subprocess.run(capture_cmd, timeout=10, capture_output=True, text=True) - if cap.returncode == 0: - full_snapshot = cap.stdout.strip() - lines = [l.strip() for l in full_snapshot.split('\n') if l.strip()] - downloading_lines = [l for l in lines if l.startswith("Downloading")] - if downloading_lines: - progress_text = downloading_lines[-1] - elif lines: - progress_text = lines[-1] + alive = subprocess.run(check_cmd, timeout=4, capture_output=True) + is_alive = alive.returncode == 0 except Exception: - pass + is_alive = False + + # Capture last lines for progress. Prefer the "Downloading" line + # (real aggregate bytes) over "Fetching N files" (whole-file count that + # lags with hf_transfer). Falls back to the true last line otherwise. + if is_alive: + try: + cap = subprocess.run(capture_cmd, timeout=4, capture_output=True, text=True) + if cap.returncode == 0: + full_snapshot = cap.stdout.strip() + lines = [l.strip() for l in full_snapshot.split('\n') if l.strip()] + downloading_lines = [l for l in lines if l.startswith("Downloading")] + if downloading_lines: + progress_text = downloading_lines[-1] + elif lines: + progress_text = lines[-1] + except Exception: + pass # Determine status. For the local-Windows detached model the log file # persists after the process exits, so a finished download still has a @@ -2362,6 +2855,16 @@ def setup_cookbook_routes() -> APIRouter: # when the PID is gone instead of blindly reporting "stopped". download_zero_files = False status = "unknown" + download_has_ok = task_type == "download" and "DOWNLOAD_OK" in full_snapshot + download_has_failed = task_type == "download" and "DOWNLOAD_FAILED" in full_snapshot + download_has_incomplete_evidence = ( + task_type == "download" + and ( + ".incomplete" in full_snapshot + or bool(re.search(r'model-\d+-of-\d+\.[A-Za-z0-9_.-]+:\s+(?:[0-9]|[1-8][0-9])%', full_snapshot)) + or _download_cache_incomplete(_payload.get("repo_id") or model, remote, str(_tport or "")) + ) + ) if is_alive or (local_win_task and full_snapshot): lower = full_snapshot.lower() exit_match = re.search(r"=== process exited with code\s+(-?\d+)", full_snapshot, re.I) @@ -2374,20 +2877,24 @@ def setup_cookbook_routes() -> APIRouter: elif has_exit and task_type == "download": # Dependency installs are tracked as download tasks but only # emit the generic runner exit marker, not HF download markers. - status = "completed" if exit_code == 0 else "error" + if download_has_incomplete_evidence and not download_has_ok: + status = "running" if is_alive else "stopped" + else: + status = "completed" if exit_code == 0 else "error" elif has_exit and "unrecognized arguments" in lower: status = "error" elif has_error and not ("application startup complete" in lower): status = "error" - elif task_type == "download" and ("100%" in full_snapshot or "DOWNLOAD_OK" in full_snapshot): - # Only download tasks treat 100% as "completed". - # Serve tasks log 100%|██████| during inference progress - # (diffusion sampling, etc.) — that's "running", not done. + elif task_type == "download" and download_has_ok: if re.search(r"Fetching\s+0\s+files", full_snapshot, re.IGNORECASE): status = "error" download_zero_files = True else: status = "completed" + elif task_type == "download" and download_has_failed: + status = "error" + elif task_type == "download" and download_has_incomplete_evidence: + status = "running" if is_alive else "stopped" elif "application startup complete" in lower: status = "ready" elif not is_alive: @@ -2397,7 +2904,11 @@ def setup_cookbook_routes() -> APIRouter: status = "running" else: # Session is dead — check if it completed or crashed - if task_type == "download" and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or "")): + if ( + task_type == "download" + and not download_has_incomplete_evidence + and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or "")) + ): status = "completed" if not progress_text: progress_text = "Download complete" @@ -2407,12 +2918,12 @@ def setup_cookbook_routes() -> APIRouter: status = "stopped" # Parse structured phase info — single source of truth for the UI - phase_info = _parse_serve_phase(full_snapshot, task_type) if (task_type == "serve" and status == "running" and full_snapshot) else {} + phase_info = _parse_serve_phase(full_snapshot, task_type) if (task_type == "serve" and full_snapshot) else {} if phase_info.get("status") == "ready": status = "ready" serve_phase = phase_info.get("phase", "") diagnosis = _diagnose_serve_output(full_snapshot) if task_type == "serve" and full_snapshot else None - if diagnosis and status in {"running", "unknown", "stopped"}: + if diagnosis and status in {"running", "unknown", "stopped"} and phase_info.get("status") != "ready": status = "error" if download_zero_files: diagnosis = {"message": "No matching files were downloaded. The model repo or filename/quant pattern may be wrong (for example a ':Q4_K_M' tag that does not exist in the repo). Check the repo and the include/quant pattern."} diff --git a/routes/hwfit_routes.py b/routes/hwfit_routes.py index a7af18b04..eb408ac9d 100644 --- a/routes/hwfit_routes.py +++ b/routes/hwfit_routes.py @@ -196,7 +196,24 @@ def setup_hwfit_routes(): if target_context is not None: target_context = max(1024, min(target_context, 1000000)) - results = rank_models(system, use_case=use_case or None, limit=limit, search=search or None, sort=sort, quant=quant or None, target_context=target_context, fit_only=fit_only) + rank_kwargs = { + "use_case": use_case or None, + "limit": limit, + "search": search or None, + "sort": sort, + "quant": quant or None, + "fit_only": fit_only, + } + if target_context is not None: + rank_kwargs["target_context"] = target_context + try: + import inspect + supported = set(inspect.signature(rank_models).parameters) + rank_kwargs = {k: v for k, v in rank_kwargs.items() if k in supported} + except Exception: + rank_kwargs.pop("target_context", None) + rank_kwargs.pop("fit_only", None) + results = rank_models(system, **rank_kwargs) return {"system": system, "models": results} @router.get("/profiles") diff --git a/routes/model_routes.py b/routes/model_routes.py index 995705d75..6b76dc71f 100644 --- a/routes/model_routes.py +++ b/routes/model_routes.py @@ -5,7 +5,6 @@ import re import uuid import json import socket -import hashlib import time as _time import logging import httpx @@ -283,11 +282,8 @@ _HOST_TO_CURATED = ( ("fireworks.ai", "fireworks"), ("googleapis.com", "google"), ("x.ai", "xai"), - ("openrouter.ai", "openrouter"), ("ollama.com", "ollama"), - ("opencode.ai/zen/go", "opencode-go"), - ("opencode.ai/zen", "opencode-zen"), ) @@ -494,8 +490,6 @@ _NON_CHAT_EXACT_PREFIXES = ( def _is_chat_model(model_id: str) -> bool: """Return True if the model ID looks like a chat/completions-capable model.""" mid = model_id.lower() - if mid in {"gpt-5.1-codex"}: - return True for prefix in _NON_CHAT_PREFIXES: if mid.startswith(prefix): return False @@ -508,67 +502,9 @@ def _is_chat_model(model_id: str) -> bool: return True -def _delete_orphaned_provider_auth(db, auth_id: Optional[str], exclude_ep_id: Optional[str] = None) -> bool: - """Delete a ProviderAuthSession once no endpoint still references it. - - Subscription providers (e.g. ChatGPT Subscription) keep their refresh token - in ProviderAuthSession rather than ModelEndpoint.api_key. When the last - endpoint backed by that auth row is removed, the stored credentials should - be cleared instead of lingering. Returns True if a row was deleted. - ``exclude_ep_id`` drops the endpoint currently being deleted from the - reference count so it does not keep its own auth alive. - """ - if not auth_id: - return False - from core.database import ProviderAuthSession - still_referenced = db.query(ModelEndpoint.id).filter( - ModelEndpoint.provider_auth_id == auth_id, - ModelEndpoint.id != exclude_ep_id, - ).first() - if still_referenced is not None: - return False - auth_row = db.query(ProviderAuthSession).filter(ProviderAuthSession.id == auth_id).first() - if auth_row is None: - return False - db.delete(auth_row) - return True - - -def _is_discovery_only_provider(provider: str) -> bool: - """Provider that only supports model discovery, not live probing. - - ChatGPT Subscription speaks the Responses/Codex API and has no - chat-completions or general health endpoint, so completion probes and - reachability pings are skipped — status is derived from cached models. - """ - return provider == "chatgpt-subscription" - - -def _resolve_probe_key(ep) -> Optional[str]: - """API key/bearer to probe an endpoint with. - - Delegates to ``resolve_endpoint_runtime``, which already returns the static - ``ModelEndpoint.api_key`` for keyed endpoints and resolves (and refreshes) - the runtime bearer for session-backed providers (e.g. ChatGPT Subscription). - Returns None if resolution fails (e.g. re-auth required) so probing skips - rather than raising. Reads only already-loaded scalar attributes of ``ep``. - """ - try: - from src.endpoint_resolver import resolve_endpoint_runtime - _base, key = resolve_endpoint_runtime(ep, owner=getattr(ep, "owner", None)) - return key - except Exception as e: - logger.warning("Probe key resolution failed for %s: %s", getattr(ep, "id", "?"), e) - return None - - -def _probe_single_model(base: str, api_key: Optional[str], model_id: str, timeout: int = 10, with_tools: bool = False) -> dict: +def _probe_single_model(base: str, api_key: str, model_id: str, timeout: int = 10, with_tools: bool = False) -> dict: """Send a realistic completion request to a single model. Returns {status, latency_ms, error?}.""" provider = _detect_provider(base) - if _is_discovery_only_provider(provider): - # Responses/Codex API, not chat-completions: a completion probe would - # 400 and the re-probe flow would then hide every model. Discovery-only. - return {"status": "ok", "latency_ms": 0, "skipped": True} messages = [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Say OK"}, @@ -682,11 +618,6 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis For Anthropic, queries their /v1/models API, falling back to hardcoded list.""" from src.endpoint_resolver import resolve_url base = resolve_url(_normalize_base(base_url)) - if _detect_provider(base) == "chatgpt-subscription": - from src.chatgpt_subscription import fetch_available_models - if api_key: - return fetch_available_models(api_key, timeout=timeout) - return [] if _detect_provider(base) == "anthropic": # Try Anthropic's /v1/models endpoint first url = build_models_url(base) @@ -713,10 +644,6 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis logger.warning(f"Anthropic /v1/models failed, using hardcoded list: {e}") return list(ANTHROPIC_MODELS) url = build_models_url(base) - if not url: - curated_key = _match_provider_curated(base, None) - fallback = _PROVIDER_CURATED.get(curated_key) if curated_key else None - return list(fallback or []) headers = build_headers(api_key, base) try: r = httpx.get(url, headers=headers, timeout=timeout, verify=llm_verify()) @@ -770,6 +697,7 @@ def _probe_endpoint(base_url: str, api_key: str = None, timeout: int = 5) -> Lis return list(fallback) return [] + def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) -> Dict[str, Any]: """Reachability probe that does not require installed/listed models.""" from src.endpoint_resolver import resolve_url @@ -785,10 +713,6 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) -> or "ollama" in (parsed_base.hostname or "").lower() ) - # APFEL-specific detection - host = (parsed_base.hostname or "").lower() - looks_like_apfel = "apfel" in host or parsed_base.port == 11435 - def _result_from_response(r) -> Dict[str, Any]: if 300 <= r.status_code < 400: loc = r.headers.get("location", "") @@ -810,23 +734,7 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) -> last_error: Optional[str] = None try: - # APFEL does not behave like Ollama; use its health endpoint. - if looks_like_apfel: - root = base - for suffix in ("/v1", "/api"): - if root.endswith(suffix): - root = root[: -len(suffix)].rstrip("/") - break - try: - r = httpx.get(root + "/health", timeout=timeout, verify=llm_verify()) - result = _result_from_response(r) - if result["reachable"]: - return result - last_error = result.get("error") - except Exception as e: - last_error = str(e)[:120] - - elif looks_like_ollama: + if looks_like_ollama: root = base for suffix in ("/v1", "/api"): if root.endswith(suffix): @@ -844,33 +752,44 @@ def _ping_endpoint(base_url: str, api_key: str = None, timeout: float = 1.5) -> except Exception: pass + # OpenAI-compatible servers (vLLM, llama.cpp, SGLang, lmdeploy, …) expose + # /v1/models but return 404 on the bare /v1 root. The probe used to GET + # the base URL only, so a fully-working vLLM endpoint (chats fine!) read + # as offline because /v1 → 404. Try /models first; fall back to the base + # URL only if /models couldn't be reached (TCP-level failure). + models_url = build_models_url(base) + try: + r = httpx.get(models_url, headers=headers, timeout=timeout, verify=llm_verify()) + result = _result_from_response(r) + if result["reachable"]: + return result + last_error = result.get("error") + except Exception as e: + last_error = str(e)[:120] + try: r = httpx.get(base, headers=headers, timeout=timeout, verify=llm_verify()) result = _result_from_response(r) - # If the bare base URL returns a non-auth 4xx (e.g. 404), try /models - # as a fallback. OpenAI-compatible servers like llama-swap return 404 - # on the base /v1 prefix but 200 on /v1/models. Auth failures (401/403) - # are definitive — probing /models would just repeat the same rejection. - if ( - not result["reachable"] - and result.get("status_code") is not None - and 400 <= result["status_code"] < 500 - and result["status_code"] not in (401, 403) - ): - models_url = build_models_url(base) - try: - r2 = httpx.get(models_url, headers=headers, timeout=timeout, verify=llm_verify()) - result2 = _result_from_response(r2) - if result2["reachable"]: - return result2 - except Exception: - pass - return result + if result["reachable"]: + return result + # 4xx from a reachable HTTP server (404 /v1, 401/403 missing key) is + # still proof the upstream is alive. Only treat connection-level + # failures, 5xx, and redirect-to-/login as truly offline. + sc = result.get("status_code") or 0 + if 400 <= sc < 500 and sc not in (407, 408, 421, 425, 429): + return { + "reachable": True, + "status_code": sc, + "error": None, + } + last_error = result.get("error") or last_error except Exception as e: last_error = str(e)[:120] return {"reachable": False, "status_code": None, "error": last_error} + + def _model_endpoint_error_message(base_url: str, ping: Dict[str, Any] = None) -> str: """Return a provider-aware error message for failed endpoint probes.""" ping = ping or {} @@ -959,14 +878,6 @@ def _visible_models(cached_models, hidden_models, pinned_models=None): return [m for m in merged if m not in hidden] -def _api_key_fingerprint(api_key: Optional[str]) -> str: - """Stable, non-secret label for distinguishing same-URL credentials.""" - key = (api_key or "").strip() - if not key: - return "" - return hashlib.sha256(key.encode("utf-8")).hexdigest()[:8] - - def setup_model_routes(model_discovery): router = APIRouter(prefix="/api") @@ -1068,17 +979,6 @@ def setup_model_routes(model_discovery): ok, info = _should_refresh_endpoint(ep, now, force=force) if not ok: continue - if getattr(ep, "provider_auth_id", None): - try: - from src.endpoint_resolver import resolve_endpoint_runtime - info["base"], info["api_key"] = resolve_endpoint_runtime( - ep, - owner=getattr(ep, "owner", None), - ) - info["key"] = _refresh_key(info["base"], info["api_key"]) - except Exception as e: - logger.warning("Skipping model refresh for %s: could not resolve provider auth: %s", getattr(ep, "name", ep.id), e) - continue groups.setdefault(info["key"], { "base": info["base"], "api_key": info["api_key"], @@ -1232,9 +1132,8 @@ def setup_model_routes(model_discovery): raise HTTPException(401, "Not authenticated") except HTTPException: raise - except Exception as e: - logger.error('Auth gate error in GET /api/models, failing closed: %s', e) - raise HTTPException(status_code=500, detail='Internal error') + except Exception: + pass # Admins see every endpoint (they manage the global pool); regular # users get the owner-scoped view. _is_admin = False @@ -1298,7 +1197,14 @@ def setup_model_routes(model_discovery): t0 = _time.time() try: import asyncio as _asyncio - ping = await _asyncio.to_thread(_ping_endpoint, data["base"], data.get("api_key"), 1.5) + # Bumped 1.5s → 3.5s. The previous 1.5s budget was clipping + # local vLLM endpoints on Tailscale links where the model + # server is still loading (Qwen3.5-122B takes 2–3 min to + # warm); /v1/models can take 500–2500 ms on a busy box, + # which pushed _ping_endpoint's full path-discovery sweep + # past the cap and marked the row offline despite the + # user actively chatting with it. + ping = await _asyncio.to_thread(_ping_endpoint, data["base"], data.get("api_key"), 3.5) lat = round((_time.time() - t0) * 1000) return { "alive": bool(ping.get("reachable")), @@ -1348,20 +1254,12 @@ def setup_model_routes(model_discovery): "endpoint_kind": kind, } try: - if _is_discovery_only_provider(provider): - # No general health endpoint — an unauthenticated GET just - # 401s. Report status from cached models instead of pinging. - entry["latency_ms"] = None - entry["status"] = "online" if cached_count else "offline" - entry["error"] = None - entry["model_count"] = cached_count - else: - t0 = _time.time() - ping = _ping_endpoint(base, ep.api_key, timeout=1.5) - entry["latency_ms"] = round((_time.time() - t0) * 1000) - entry["status"] = "online" if ping.get("reachable") or cached_count else "offline" - entry["error"] = ping.get("error") - entry["model_count"] = cached_count or (len(ANTHROPIC_MODELS) if provider == "anthropic" else 0) + t0 = _time.time() + ping = _ping_endpoint(base, ep.api_key, timeout=1.5) + entry["latency_ms"] = round((_time.time() - t0) * 1000) + entry["status"] = "online" if ping.get("reachable") or cached_count else "offline" + entry["error"] = ping.get("error") + entry["model_count"] = cached_count or (len(ANTHROPIC_MODELS) if provider == "anthropic" else 0) except Exception as e: entry["latency_ms"] = None entry["status"] = "online" if cached_count else "offline" @@ -1394,7 +1292,7 @@ def setup_model_routes(model_discovery): if ep_id and ep_id not in endpoints_cache: ep = db.query(ModelEndpoint).filter(ModelEndpoint.id == ep_id).first() if ep: - endpoints_cache[ep_id] = {"base_url": ep.base_url, "api_key": _resolve_probe_key(ep)} + endpoints_cache[ep_id] = {"base_url": ep.base_url, "api_key": ep.api_key} ep_data = endpoints_cache.get(ep_id) if not ep_data: # Try to find by base_url from the model's endpoint field @@ -1433,7 +1331,7 @@ def setup_model_routes(model_discovery): "id": ep.id, "name": ep.name, "base_url": ep.base_url, - "api_key": _resolve_probe_key(ep), + "api_key": ep.api_key, }) finally: db.close() @@ -1522,21 +1420,43 @@ def setup_model_routes(model_discovery): # Endpoint counts as reachable if it has any model — including # admin-pinned IDs that a probe would never surface. status = "online" if (all_models or pinned) else "offline" - base = _normalize_base(r.base_url) ping = None - # Discovery-only providers have no health endpoint — an - # unauthenticated ping just 401s, so don't bother. - if not all_models and not pinned and r.is_enabled and not _is_discovery_only_provider(_detect_provider(base)): - ping = _ping_endpoint(r.base_url, r.api_key, timeout=1.0) + # When cached_models is empty, do a quick reachability probe. + # Bumped 1.0s → 3.5s because the user reported endpoints they + # were ACTIVELY chatting with showed "offline" — the previous + # 1s timeout was clipping live cloud endpoints (DeepSeek can + # take 1.5–2.5s on /v1/models when their region is under load, + # vLLM on a remote GPU box behind SSH can also push past 1s). + # 3.5s still keeps the picker render snappy in the common + # "everything's already cached" path because this branch only + # runs for endpoints with an empty cached_models. + if not all_models and not pinned and r.is_enabled: + ping = _ping_endpoint(r.base_url, r.api_key, timeout=3.5) if ping.get("reachable"): status = "empty" + # Best-effort: if the probe came back reachable, try + # to populate cached_models in the background so the + # NEXT picker load shows "online" instead of "empty". + # Failure here is silent — we already returned the + # "empty" status, and the existing background refresh + # path will eventually fill it in too. + try: + probed = _probe_endpoint(r.base_url, r.api_key, timeout=5) + if probed: + r.cached_models = json.dumps(probed) + db.commit() + all_models = probed + visible = _visible_models(all_models, r.hidden_models, pinned) + status = "online" + except Exception as _refill_err: + logger.debug(f"opportunistic cached_models refill failed for {r.id}: {_refill_err!r}") + base = _normalize_base(r.base_url) kind = _effective_endpoint_kind(r, base) results.append({ "id": r.id, "name": r.name, "base_url": r.base_url, "has_key": bool(r.api_key), - "api_key_fingerprint": _api_key_fingerprint(r.api_key), "is_enabled": r.is_enabled, "models": visible, "pinned_models": pinned, @@ -1603,34 +1523,21 @@ def setup_model_routes(model_discovery): ) explicit_timeout = _explicit_model_list_timeout(base_url, requested_kind, refresh_timeout) - # Dedupe: if an endpoint with the same base_url and compatible - # credentials already exists and is reachable by the caller (shared or - # owned by them), return it instead of creating a duplicate row. Keep - # same-url/different-key rows distinct so users can group the same - # provider URL under multiple credentials. + # Dedupe: if an endpoint with the same base_url already exists and + # is reachable by the caller (shared or owned by them), return it + # instead of creating a duplicate row. Fixes "Scan for Servers" + # re-adding manually-added endpoints under their host:port name. from src.auth_helpers import get_current_user as _gcu_dedup _caller = _gcu_dedup(request) or None - _incoming_api_key = api_key.strip() _db_dedup = SessionLocal() try: - _same_url_rows = ( + existing = ( _db_dedup.query(ModelEndpoint) .filter(ModelEndpoint.base_url == base_url) .filter((ModelEndpoint.owner.is_(None)) | (ModelEndpoint.owner == _caller)) .order_by(ModelEndpoint.owner.desc()) # prefer owned over shared - .all() + .first() ) - existing = None - _empty_key_existing = None - for _candidate in _same_url_rows: - _candidate_key = (getattr(_candidate, "api_key", None) or "").strip() - if _candidate_key == _incoming_api_key: - existing = _candidate - break - if _incoming_api_key and not _candidate_key and _empty_key_existing is None: - _empty_key_existing = _candidate - if existing is None and _incoming_api_key and _empty_key_existing is not None: - existing = _empty_key_existing if existing: changed = False # Persist any incoming pinned IDs onto the existing row. An @@ -1679,8 +1586,6 @@ def setup_model_routes(model_discovery): "id": existing.id, "name": existing.name, "base_url": existing.base_url, - "has_key": bool(existing.api_key), - "api_key_fingerprint": _api_key_fingerprint(existing.api_key), "models": _visible_models( existing_models, getattr(existing, "hidden_models", None), @@ -1754,8 +1659,6 @@ def setup_model_routes(model_discovery): "id": ep_id, "name": name.strip(), "base_url": base_url, - "has_key": bool(api_key.strip()), - "api_key_fingerprint": _api_key_fingerprint(api_key), "models": _merge_model_ids(model_ids, _pinned), "pinned_models": _pinned, "online": bool(model_ids) or bool(_pinned) or bool(ping.get("reachable")), @@ -1805,7 +1708,7 @@ def setup_model_routes(model_discovery): ep = db.query(ModelEndpoint).filter(ModelEndpoint.id == ep_id).first() if not ep: raise HTTPException(404, "Endpoint not found") - ep_data = {"id": ep.id, "name": ep.name, "base_url": ep.base_url, "api_key": _resolve_probe_key(ep)} + ep_data = {"id": ep.id, "name": ep.name, "base_url": ep.base_url, "api_key": ep.api_key} finally: db.close() @@ -1869,7 +1772,7 @@ def setup_model_routes(model_discovery): category = _classify_endpoint(base, kind) timeout = _manual_refresh_timeout(ep, category, refresh_timeout) try: - probed = _probe_endpoint(base, _resolve_probe_key(ep), timeout=timeout) + probed = _probe_endpoint(base, ep.api_key, timeout=timeout) except Exception as exc: logger.warning("Manual model refresh failed for endpoint %s at %s: %s", ep_id, base, exc) probed = [] @@ -2105,8 +2008,6 @@ def setup_model_routes(model_discovery): "name": ep.name, "model_type": ep.model_type, "base_url": ep.base_url, - "has_key": bool(ep.api_key), - "api_key_fingerprint": _api_key_fingerprint(ep.api_key), "pinned_models": _normalize_model_ids(getattr(ep, "pinned_models", None)), "endpoint_kind": getattr(ep, "endpoint_kind", None) or "auto", "model_refresh_mode": getattr(ep, "model_refresh_mode", None) or "auto", @@ -2208,9 +2109,7 @@ def setup_model_routes(model_discovery): cleared_user_preferences = _clear_user_prefs_for_endpoint(ep_id) cleared_sessions = _clear_sessions_for_endpoint(db, ep.base_url) cleared_loaded_sessions = _clear_loaded_sessions_for_endpoint(ep.base_url) - auth_id = getattr(ep, "provider_auth_id", None) db.delete(ep) - cleared_provider_auth = _delete_orphaned_provider_auth(db, auth_id, exclude_ep_id=ep_id) db.commit() _invalidate_models_cache() _local_probe_cache["data"] = None @@ -2220,7 +2119,6 @@ def setup_model_routes(model_discovery): "cleared_user_preferences": cleared_user_preferences, "cleared_sessions": cleared_sessions, "cleared_loaded_sessions": cleared_loaded_sessions, - "cleared_provider_auth": cleared_provider_auth, } finally: db.close() diff --git a/services/hwfit/data/hf_models.json b/services/hwfit/data/hf_models.json index e73cc26dc..35b55d9a9 100644 --- a/services/hwfit/data/hf_models.json +++ b/services/hwfit/data/hf_models.json @@ -14036,6 +14036,29 @@ "vision" ] }, + { + "name": "google/gemma-4-12B", + "provider": "Google", + "parameter_count": "12.0B", + "parameters_raw": 12000000000, + "min_ram_gb": 24.0, + "recommended_ram_gb": 32.0, + "min_vram_gb": 24.0, + "quantization": "BF16", + "context_length": 131072, + "use_case": "General purpose, multimodal", + "is_moe": false, + "num_experts": null, + "active_experts": null, + "active_parameters": null, + "architecture": "gemma4", + "pipeline_tag": "image-text-to-text", + "release_date": "2026-04-01", + "gguf_sources": [], + "capabilities": [ + "vision" + ] + }, { "name": "google/gemma-4-31B-it", "provider": "Google", @@ -19121,4 +19144,4 @@ ], "_discovered": true } -] \ No newline at end of file +] diff --git a/services/memory/skill_extractor.py b/services/memory/skill_extractor.py index e763bca4c..79e4c67c2 100644 --- a/services/memory/skill_extractor.py +++ b/services/memory/skill_extractor.py @@ -243,6 +243,20 @@ async def maybe_extract_skill( logger.debug("[skill-extract] '%s' already exists — dropped as duplicate", title) return None + # Auto-publish gate: if the user has `auto_approve_skills` on, the + # newly-extracted skill is created `published` immediately rather + # than waiting for the next audit batch. The audit still runs later + # and can demote it back to `draft` (or delete) on failure. Default + # ON matches the UI label "Auto-approve skills". + _initial_status = "draft" + try: + from routes.prefs_routes import _load_for_user as _load_prefs + _prefs = _load_prefs(owner) or {} + if _prefs.get("auto_approve_skills", True): + _initial_status = "published" + except Exception: + pass + entry = skills_manager.add_skill( title=title, problem=data.get("problem", ""), @@ -253,6 +267,7 @@ async def maybe_extract_skill( confidence=data.get("confidence", 0.7), session_id=getattr(session, "session_id", None), owner=owner, + status=_initial_status, ) try: from src.event_bus import fire_event diff --git a/src/tool_implementations.py b/src/tool_implementations.py index 548f6f0f5..5e62e686c 100644 --- a/src/tool_implementations.py +++ b/src/tool_implementations.py @@ -664,6 +664,17 @@ async def do_manage_skills(content: str, owner: Optional[str] = None) -> Dict: proc = args.get("steps") or [] if not proc and not args.get("body_extra") and not args.get("solution"): return {"error": "procedure (or solution body) is required", "exit_code": 1} + # Same auto-publish gate as the extractor path — when the user + # has auto_approve_skills on and the caller didn't pin an explicit + # status, publish immediately. Audit later demotes/removes on fail. + _status_arg = args.get("status") + if not _status_arg: + try: + from routes.prefs_routes import _load_for_user as _load_prefs + _prefs = _load_prefs(owner) or {} + _status_arg = "published" if _prefs.get("auto_approve_skills", True) else "draft" + except Exception: + _status_arg = "draft" entry = sm.add_skill( name=args.get("name"), description=(args.get("description") or args.get("title") or "").strip(), @@ -677,7 +688,7 @@ async def do_manage_skills(content: str, owner: Optional[str] = None) -> Dict: procedure=proc, pitfalls=args.get("pitfalls") or [], verification=args.get("verification") or [], - status=args.get("status") or "draft", + status=_status_arg, version=args.get("version") or "1.0.0", confidence=args.get("confidence", 0.8), source=args.get("source", "learned"), @@ -2621,8 +2632,90 @@ async def _cookbook_env_for_host(host: str) -> Dict[str, Any]: } -async def _cookbook_register_task(session_id: str, model: str, host: str, - cmd: str, task_type: str = "serve") -> bool: +def _infer_serve_port(cmd: str) -> int: + """Infer likely listen port from a serve command.""" + if not cmd: + return 8080 + m = re.search(r"--port\\s+(\\d+)", cmd) + if m: + try: + return int(m.group(1)) + except Exception: + pass + m = re.search(r"OLLAMA_HOST=[^\\s]*?:(\\d+)", cmd) + if m: + try: + return int(m.group(1)) + except Exception: + pass + if "ollama" in cmd: + return 11434 + return 8080 + + +def _infer_serve_host(host: str | None) -> tuple[str, bool]: + """Return (host, container_local) for registering a served endpoint.""" + if not (host or "").strip(): + return "localhost", True + base_host = host.split("@", 1)[-1] if "@" in host else host + return base_host, False + + +async def _ensure_served_endpoint( + *, + model: str, + cmd: str, + host: str | None, +) -> Dict[str, Any]: + """Register/fetch a model endpoint for a running serve session.""" + import httpx + endpoint_host, container_local = _infer_serve_host(host) + port = _infer_serve_port(cmd) + base_url = f"http://{endpoint_host}:{port}/v1" + short_name = model.split("/")[-1] if "/" in model else model + is_image = "diffusion_server.py" in (cmd or "") + payload = { + "name": short_name if not is_image else f"{short_name} (image)", + "base_url": base_url, + "skip_probe": "true", + "model_type": "image" if is_image else "llm", + "container_local": "true" if container_local else "false", + } + try: + async with httpx.AsyncClient(timeout=30) as client: + resp = await client.post( + f"{_COOKBOOK_BASE}/api/model-endpoints", + data=payload, + headers=_internal_headers(), + ) + data = resp.json() if resp.headers.get("content-type", "").startswith("application/json") else {} + if resp.status_code >= 400: + logger.debug( + f"ensure endpoint failed for {model!r}: status={resp.status_code} data={data}" + ) + return {"added": False, "endpoint_id": "", "base_url": base_url, "error": data} + ep_id = data.get("id") if isinstance(data, dict) else None + return { + "added": bool(ep_id), + "endpoint_id": ep_id or "", + "base_url": base_url, + "data": data, + } + except Exception as e: + logger.debug(f"ensure endpoint exception for {model!r}: {e}") + return {"added": False, "endpoint_id": "", "base_url": base_url, "error": str(e)} + + +async def _cookbook_register_task( + session_id: str, + model: str, + host: str, + cmd: str, + task_type: str = "serve", + *, + endpoint_added: bool = False, + endpoint_id: str = "", +) -> bool: """Append a task entry to cookbook_state.json after the agent launches via /api/model/serve or /api/model/download. The route spawns tmux but leaves state-writing to the UI; the agent needs to @@ -2672,7 +2765,8 @@ async def _cookbook_register_task(session_id: str, model: str, host: str, "sshPort": "", "platform": "linux", "_serveReady": False, - "_endpointAdded": False, + "_endpointAdded": bool(endpoint_added), + "_endpointId": endpoint_id or "", }) state["tasks"] = tasks try: @@ -3008,7 +3102,12 @@ async def do_download_model(content: str, owner: Optional[str] = None) -> Dict: if _servers.get("default_host"): host = _servers["default_host"] _host_defaulted = True + backend = (args.get("backend") or "").strip().lower() + if not backend and "/" not in repo_id and ":" in repo_id: + backend = "ollama" payload = {"repo_id": repo_id} + if backend: + payload["backend"] = backend if host: payload["remote_host"] = host if args.get("include"): @@ -3028,12 +3127,20 @@ async def do_download_model(content: str, owner: Optional[str] = None) -> Dict: sid = data.get("session_id", "?") registered = await _cookbook_register_task( session_id=sid, model=repo_id, host=host, - cmd=f"hf download {repo_id}", task_type="download", + cmd=(f"ollama pull {repo_id}" if backend == "ollama" else f"hf download {repo_id}"), + task_type="download", ) note = "" if registered else " (state-write failed — download may not show in UI)" where = host or "local" default_note = " (defaulted to the cookbook's selected server — pass host= or local=true to override)" if _host_defaulted else "" - return {"output": f"Download started: {repo_id} on {where} (session: {sid}){note}{default_note}", "session_id": sid, "host": host, "exit_code": 0} + return { + "output": f"Download started: {repo_id} on {where} (session: {sid}){note}{default_note}", + "session_id": sid, + "host": host, + "task_type": "download", + "phase": "running", + "exit_code": 0, + } return {"error": data.get("error", "Download failed"), "exit_code": 1} except Exception as e: return {"error": str(e), "exit_code": 1} @@ -3102,12 +3209,28 @@ async def do_serve_model(content: str, owner: Optional[str] = None) -> Dict: data = resp.json() if data.get("ok"): sid = data.get("session_id", "?") + endpoint_id = data.get("endpoint_id") or "" + if endpoint_id: + endpoint_added = True + else: + endpoint_meta = await _ensure_served_endpoint(model=repo_id, cmd=cmd, host=host) + endpoint_added = bool(endpoint_meta.get("added")) + endpoint_id = endpoint_meta.get("endpoint_id", "") or endpoint_id registered = await _cookbook_register_task( session_id=sid, model=repo_id, host=host, cmd=cmd, task_type="serve", + endpoint_added=endpoint_added, endpoint_id=endpoint_id or "", ) note = "" if registered else " (state-write failed — task may not show in UI)" - return {"output": f"Serving {repo_id} (session: {sid}){note}", "session_id": sid, "exit_code": 0} + return { + "output": f"Serving {repo_id} (session: {sid}){note}", + "session_id": sid, + "task_type": "serve", + "phase": "running", + "host": host, + "endpoint_id": endpoint_id, + "exit_code": 0, + } # FastAPI HTTPException puts the message under `detail`, not `error`. # Surface BOTH so the agent sees "Invalid characters in cmd" (from # _validate_serve_cmd rejecting `&&`/`source`/`cd`) instead of @@ -3804,7 +3927,8 @@ async def do_serve_preset(content: str, owner: Optional[str] = None) -> Dict: if env_cfg.get("gpus"): payload["gpus"] = env_cfg["gpus"] if env_cfg.get("hf_token"): payload["hf_token"] = env_cfg["hf_token"] if env_cfg.get("platform"): payload["platform"] = env_cfg["platform"] - if env_cfg.get("ssh_port"): payload["ssh_port"] = env_cfg["ssh_port"] + if env_cfg.get("ssh_port"): + payload["ssh_port"] = env_cfg["ssh_port"] try: async with httpx.AsyncClient(timeout=30) as client: @@ -3813,12 +3937,20 @@ async def do_serve_preset(content: str, owner: Optional[str] = None) -> Dict: data = resp.json() if data.get("ok"): sid = data.get("session_id", "?") + endpoint_id = data.get("endpoint_id") or "" + if endpoint_id: + endpoint_added = True + else: + endpoint_meta = await _ensure_served_endpoint(model=repo_id, cmd=cmd, host=host) + endpoint_added = bool(endpoint_meta.get("added")) + endpoint_id = endpoint_meta.get("endpoint_id", "") or endpoint_id registered = await _cookbook_register_task( session_id=sid, model=repo_id, host=host, cmd=cmd, task_type="serve", + endpoint_added=endpoint_added, endpoint_id=endpoint_id or "", ) note = "" if registered else " (state-write failed — task may not show in UI)" - return {"output": f"Launched preset {chosen.get('name')!r}: {repo_id} on {host or 'local'} (session: {sid}){note}", "session_id": sid, "exit_code": 0} + return {"output": f"Launched preset {chosen.get('name')!r}: {repo_id} on {host or 'local'} (session: {sid}){note}", "session_id": sid, "host": host, "endpoint_id": endpoint_id, "exit_code": 0} return {"error": data.get("error", "Serve failed"), "exit_code": 1} except Exception as e: return {"error": str(e), "exit_code": 1} diff --git a/static/index.html b/static/index.html index ec4af199f..ae3092659 100644 --- a/static/index.html +++ b/static/index.html @@ -1492,21 +1492,7 @@
-
-

Agent

-
Controls for the agent tool loop.
-
-
- - -
-
- - -
-
-
-
+ + +
+ + + - - -
-
@@ -2116,19 +2109,33 @@ -
- + + +
- + - - + + + + - +
@@ -2136,7 +2143,15 @@
-

Added Models (Endpoints)

+

Added Models (Endpoints) + + + +

Manage the endpoints you've added.
@@ -2167,10 +2182,45 @@
+
+

API Tokens

+
Bearer tokens for external integrations (scripts, Codex, headless agent runs). Token value shown ONCE on create — copy it then.
+
+
+ + + +
+
+ +