diff --git a/routes/chat_helpers.py b/routes/chat_helpers.py index 25f12d566..f030d6c91 100644 --- a/routes/chat_helpers.py +++ b/routes/chat_helpers.py @@ -22,6 +22,31 @@ from fastapi import HTTPException logger = logging.getLogger(__name__) +_CASUAL_OPENING_RE = re.compile( + r"^\s*(?:h+i+|hey+|hello+|yo+|sup+|what'?s up|wass?up|hiya|howdy|" + r"lol|lmao|haha+|hehe+|thanks?|thank you|ty|idk|dunno|meh|bruh|bro)\b(?P.*)$", + re.IGNORECASE, +) +_CASUAL_BLOCKLIST_RE = re.compile( + r"\b(?:cookbook|serve|serving|launch|start|vllm|sglang|llama\.?cpp|ollama|" + r"download|model|email|document|doc|note|calendar|task|search|web|research|" + r"file|folder|repo|git|settings?|endpoint|api|token|mcp)\b", + re.IGNORECASE, +) + + +def _is_casual_low_signal(text: str) -> bool: + """Short greetings/slang should not pull memory, skills, RAG, or docs.""" + s = str(text or "").strip() + m = _CASUAL_OPENING_RE.match(s) + if not m: + return False + tail = m.group("tail") or "" + if _CASUAL_BLOCKLIST_RE.search(tail): + return False + tail_words = re.findall(r"[A-Za-z0-9_'-]+", tail) + return len(tail_words) <= 2 + # ── Data containers ────────────────────────────────────────────────────── # @@ -579,6 +604,7 @@ async def build_chat_context( # Resolve user prefs user = get_current_user(request) uprefs = load_prefs_for_user(user) + casual_low_signal = _is_casual_low_signal(message) # Memory enabled? mem_enabled = not incognito and not no_memory and uprefs.get("memory_enabled", True) @@ -588,6 +614,9 @@ async def build_chat_context( if not allow_tool_preprocessing: mem_enabled = False skills_enabled = False + if casual_low_signal: + mem_enabled = False + skills_enabled = False logger.debug( "Memory enabled=%s for user=%s (incognito=%s, no_memory=%s, pref=%s)", mem_enabled, user, incognito, no_memory, uprefs.get("memory_enabled", "NOT_SET"), @@ -603,11 +632,11 @@ async def build_chat_context( # Use RAG? use_rag_val = (str(use_rag).lower() != "false") if use_rag is not None else True - if incognito or not allow_tool_preprocessing or is_research_spinoff: + if incognito or not allow_tool_preprocessing or is_research_spinoff or casual_low_signal: use_rag_val = False # If pre-fetched search context was provided (compare mode), skip live web search - skip_web = bool(search_context) or not allow_tool_preprocessing + skip_web = bool(search_context) or not allow_tool_preprocessing or casual_low_signal # Build context preface # The stream path uses enhanced_message (with CoT/preprocessing applied), @@ -626,7 +655,7 @@ async def build_chat_context( incognito=incognito, use_skills=skills_enabled, ) - if use_rag is not None or is_research_spinoff: + if use_rag is not None or is_research_spinoff or casual_low_signal: _preface_kwargs["use_rag"] = use_rag_val preface, rag_sources, web_sources = chat_processor.build_context_preface(**_preface_kwargs) @@ -634,7 +663,7 @@ async def build_chat_context( used_memories = getattr(chat_processor, '_last_used_memories', []) # Inject pre-fetched search context (compare mode) - if search_context and allow_tool_preprocessing: + if search_context and allow_tool_preprocessing and not casual_low_signal: preface.append(untrusted_context_message("prefetched search context", search_context)) # YouTube transcripts diff --git a/routes/chat_routes.py b/routes/chat_routes.py index c33f7c2c7..0967667f1 100644 --- a/routes/chat_routes.py +++ b/routes/chat_routes.py @@ -826,7 +826,11 @@ def setup_chat_routes( from src.settings import get_setting _global_disabled = get_setting("disabled_tools", []) if _global_disabled and isinstance(_global_disabled, list): - disabled_tools.update(_global_disabled) + explicit_web_allowed = allow_web_search is not None and str(allow_web_search).lower() == "true" + if explicit_web_allowed: + disabled_tools.update(t for t in _global_disabled if t not in {"web_search", "web_fetch"}) + else: + disabled_tools.update(_global_disabled) # Light auto-escalation: the user is in chat mode and just expressed a # notes/calendar/email intent. Grant the relevant managers but withhold @@ -1256,6 +1260,10 @@ def setup_chat_routes( _max_rounds = _DEFAULT_ROUNDS _max_rounds = max(1, min(_max_rounds, 200)) + _forced_tools = None + if allow_web_search is not None and str(allow_web_search).lower() == "true": + _forced_tools = {"web_search", "web_fetch"} + async for chunk in stream_agent_loop( sess.endpoint_url, sess.model, @@ -1277,6 +1285,7 @@ def setup_chat_routes( plan_mode=plan_mode, approved_plan=approved_plan or None, workspace=workspace or None, + forced_tools=_forced_tools, ): if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"): try: diff --git a/routes/cookbook_helpers.py b/routes/cookbook_helpers.py index 3600a9ad1..bf1124933 100644 --- a/routes/cookbook_helpers.py +++ b/routes/cookbook_helpers.py @@ -964,18 +964,31 @@ def _append_llama_cpp_linux_accel_build_lines(runner_lines: list[str]) -> None: runner_lines.append(' fi # end _odysseus_have_prebuilt guard') -def _llama_cpp_rebuild_cmd() -> str: +def _llama_cpp_rebuild_cmd(update_source: bool = False) -> str: """Shell command that clears the Cookbook-managed llama.cpp build. Removes the cached ``llama-server`` symlink and the ``~/llama.cpp/build*`` directory so the next llama.cpp serve recompiles from source, picking up a CUDA or HIP toolchain if one is now available. The serve bootstrap only builds when ``llama-server`` is missing from PATH, so without this an - existing CPU-only build is reused forever. It deliberately installs and - downloads nothing; the rebuild itself happens on the next serve. + existing CPU-only build is reused forever. When ``update_source`` is true, + the command also fast-forwards the Cookbook-managed ``~/llama.cpp`` checkout + if it exists. The rebuild itself happens on the next serve. """ + update_cmd = '' + if update_source: + update_cmd = ( + 'if [ -d "$HOME/llama.cpp/.git" ]; then ' + 'git -C "$HOME/llama.cpp" pull --ff-only --depth 1 || ' + 'echo "[odysseus] WARNING: llama.cpp source update failed; clearing cached build anyway."; ' + 'elif command -v git >/dev/null 2>&1; then ' + 'git clone --depth 1 https://github.com/ggml-org/llama.cpp "$HOME/llama.cpp" || ' + 'echo "[odysseus] WARNING: llama.cpp clone failed; clearing cached build anyway."; ' + 'fi && ' + ) return ( 'mkdir -p "$HOME/bin" && ' + f'{update_cmd}' 'rm -f "$HOME/bin/llama-server" && ' 'rm -rf "$HOME/llama.cpp/build" "$HOME/llama.cpp/build-vulkan" && ' 'echo "[odysseus] Cleared the cached llama.cpp build. ' diff --git a/routes/cookbook_routes.py b/routes/cookbook_routes.py index 0bd38d19f..0d8257574 100644 --- a/routes/cookbook_routes.py +++ b/routes/cookbook_routes.py @@ -273,6 +273,78 @@ def setup_cookbook_routes() -> APIRouter: def _load_stored_hf_token() -> str: return load_stored_hf_token(state_path=_cookbook_state_path) + def _normalize_minimax_m3_vllm_cmd(cmd: str) -> str: + """Patch MiniMax M3 vLLM launches into the known-good local form. + + The browser form can be stale or omit advanced-only fields. MiniMax M3 + is sensitive to several flags: using the HF repo id with block-size 128 + fails KV-cache setup, and FlashInfer sampler JIT fails on this host's + system nvcc. Normalize server-side before writing the tmux runner. + """ + if not cmd or "vllm serve" not in cmd or not re.search(r"minimax.*m3", cmd, re.I): + return cmd + try: + parts = shlex.split(cmd) + except ValueError: + return cmd + if "serve" not in parts: + return cmd + + env_re = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*=") + env_parts = [p for p in parts if env_re.match(p)] + body = [p for p in parts if not env_re.match(p)] + try: + serve_i = body.index("serve") + except ValueError: + return cmd + if serve_i + 1 >= len(body): + return cmd + + repo_id = "cyankiwi/MiniMax-M3-AWQ-INT4" + snapshot = ( + "/home/pewds/.cache/huggingface/hub/" + "models--cyankiwi--MiniMax-M3-AWQ-INT4/" + "snapshots/4082acbbec1236d21828d55b6bb0fe02ade4ab5b" + ) + if body[serve_i + 1] == repo_id: + body[serve_i + 1] = snapshot + + def add_env(key: str, value: str) -> None: + if not any(p.startswith(f"{key}=") for p in env_parts): + env_parts.append(f"{key}={value}") + + def has_flag(flag: str) -> bool: + return any(p == flag or p.startswith(flag + "=") for p in body) + + def set_flag(flag: str, value: str) -> None: + for i, part in enumerate(body): + if part == flag: + if i + 1 < len(body): + body[i + 1] = value + else: + body.append(value) + return + if part.startswith(flag + "="): + body[i] = f"{flag}={value}" + return + body.extend([flag, value]) + + def add_bool(flag: str) -> None: + if not has_flag(flag): + body.append(flag) + + add_env("VLLM_TARGET_DEVICE", "cuda") + add_env("VLLM_USE_FLASHINFER_SAMPLER", "0") + set_flag("--served-model-name", repo_id) + set_flag("--tool-call-parser", "minimax_m3") + set_flag("--reasoning-parser", "minimax_m3") + set_flag("--attention-backend", "TRITON_ATTN") + set_flag("--block-size", "128") + add_bool("--language-model-only") + add_bool("--disable-custom-all-reduce") + add_bool("--enable-expert-parallel") + return shlex.join(env_parts + body) + def _cookbook_ssh_dir() -> Path: # The Docker image keeps cookbook keys under /app/.ssh; that path only # exists inside the container. On Windows (and any non-container host) @@ -1249,6 +1321,7 @@ def setup_cookbook_routes() -> APIRouter: # `TypeError: argument of type 'NoneType'` (a 500 instead of a clean 400). req.cmd = _validate_serve_cmd(req.cmd) or "" req.cmd = _normalize_llama_cpp_python_cache_types(req.cmd) or "" + req.cmd = _normalize_minimax_m3_vllm_cmd(req.cmd) req.cmd = _venv_safe_local_pip_install_cmd( req.cmd, local=not bool(req.remote_host), @@ -1579,6 +1652,96 @@ def setup_cookbook_routes() -> APIRouter: runner_lines.append(' echo "ERROR: vLLM is not installed."') runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127') runner_lines.append('fi') + runner_lines.append(f"ODYSSEUS_SERVE_CMD='{_bash_squote(req.cmd)}'") + runner_lines.append('if [ -z "$ODYSSEUS_PREFLIGHT_EXIT" ]; then') + runner_lines.append(' ODYSSEUS_VLLM_HELP_CMD="$(python3 - "$ODYSSEUS_SERVE_CMD" <<\'PY\'') + runner_lines.append('import shlex, sys') + runner_lines.append('parts = shlex.split(sys.argv[1])') + runner_lines.append('try:') + runner_lines.append(' serve_i = parts.index("serve")') + runner_lines.append('except ValueError:') + runner_lines.append(' print("vllm serve --help")') + runner_lines.append('else:') + runner_lines.append(' print(shlex.join(parts[:serve_i + 1] + ["--help"]))') + runner_lines.append('PY') + runner_lines.append(')"') + runner_lines.append(' ODYSSEUS_VLLM_SUPPORTS_SWAP=0') + runner_lines.append(' if eval "$ODYSSEUS_VLLM_HELP_CMD" 2>&1 | grep -q -- "--swap-space"; then ODYSSEUS_VLLM_SUPPORTS_SWAP=1; fi') + runner_lines.append('fi') + runner_lines.append('if [ -z "$ODYSSEUS_PREFLIGHT_EXIT" ] && [ "${ODYSSEUS_VLLM_SUPPORTS_SWAP:-0}" = "1" ] && ! printf "%s" "$ODYSSEUS_SERVE_CMD" | grep -q -- "--swap-space"; then') + runner_lines.append(' echo "[odysseus] Setting vLLM --swap-space 0 so the runtime does not reserve CPU swap per GPU."') + runner_lines.append(' ODYSSEUS_SERVE_CMD="${ODYSSEUS_SERVE_CMD} --swap-space 0"') + runner_lines.append('fi') + runner_lines.append('if [ -z "$ODYSSEUS_PREFLIGHT_EXIT" ] && [ "${ODYSSEUS_VLLM_SUPPORTS_SWAP:-0}" != "1" ]; then') + runner_lines.append(' if printf "%s" "$ODYSSEUS_SERVE_CMD" | grep -q -- "--swap-space"; then') + runner_lines.append(' echo "[odysseus] vLLM serve does not expose --swap-space; removing the flag and patching the runtime default to 0."') + runner_lines.append(' ODYSSEUS_SERVE_CMD="$(python3 - "$ODYSSEUS_SERVE_CMD" <<\'PY\'') + runner_lines.append('import shlex, sys') + runner_lines.append('parts = shlex.split(sys.argv[1])') + runner_lines.append('out = []') + runner_lines.append('skip = False') + runner_lines.append('for part in parts:') + runner_lines.append(' if skip:') + runner_lines.append(' skip = False') + runner_lines.append(' continue') + runner_lines.append(' if part == "--swap-space":') + runner_lines.append(' skip = True') + runner_lines.append(' continue') + runner_lines.append(' if part.startswith("--swap-space="):') + runner_lines.append(' continue') + runner_lines.append(' out.append(part)') + runner_lines.append('print(shlex.join(out))') + runner_lines.append('PY') + runner_lines.append(')"') + runner_lines.append(' fi') + runner_lines.append(' ODYSSEUS_SERVE_CMD="$(python3 - "$ODYSSEUS_SERVE_CMD" <<\'PY\'') + runner_lines.append('import shlex, sys') + runner_lines.append('parts = shlex.split(sys.argv[1])') + runner_lines.append('patch = r"""import inspect, sys') + runner_lines.append('from vllm.engine.arg_utils import EngineArgs, AsyncEngineArgs') + runner_lines.append('def _odysseus_swap0(cls):') + runner_lines.append(' params = list(inspect.signature(cls).parameters)') + runner_lines.append(' if "swap_space" not in params:') + runner_lines.append(' return') + runner_lines.append(' idx = params.index("swap_space")') + runner_lines.append(' defaults = list(cls.__init__.__defaults__ or ())') + runner_lines.append(' if idx < len(defaults):') + runner_lines.append(' defaults[idx] = 0') + runner_lines.append(' cls.__init__.__defaults__ = tuple(defaults)') + runner_lines.append(' fields = getattr(cls, "__dataclass_fields__", {})') + runner_lines.append(' if "swap_space" in fields:') + runner_lines.append(' fields["swap_space"].default = 0') + runner_lines.append('_odysseus_swap0(EngineArgs)') + runner_lines.append('_odysseus_swap0(AsyncEngineArgs)') + runner_lines.append('try:') + runner_lines.append(' from vllm.config import CacheConfig') + runner_lines.append(' CacheConfig.swap_space = 0') + runner_lines.append('except Exception:') + runner_lines.append(' pass') + runner_lines.append('_orig_create_engine_config = EngineArgs.create_engine_config') + runner_lines.append('def _odysseus_create_engine_config(self, *args, **kwargs):') + runner_lines.append(' self.swap_space = 0') + runner_lines.append(' return _orig_create_engine_config(self, *args, **kwargs)') + runner_lines.append('EngineArgs.create_engine_config = _odysseus_create_engine_config') + runner_lines.append('AsyncEngineArgs.create_engine_config = _odysseus_create_engine_config') + runner_lines.append('from vllm.entrypoints.cli.main import main') + runner_lines.append('sys.exit(main())"""') + runner_lines.append('try:') + runner_lines.append(' serve_i = parts.index("serve")') + runner_lines.append('except ValueError:') + runner_lines.append(' print(shlex.join(parts))') + runner_lines.append('else:') + runner_lines.append(' exe_i = serve_i - 1') + runner_lines.append(' exe = parts[exe_i] if exe_i >= 0 else "vllm"') + runner_lines.append(' py = "python3"') + runner_lines.append(' if exe.endswith("/bin/vllm"):') + runner_lines.append(' py = exe[:-len("/bin/vllm")] + "/bin/python"') + runner_lines.append(' parts[exe_i:serve_i] = [py, "-c", patch]') + runner_lines.append(' print(shlex.join(parts))') + runner_lines.append('PY') + runner_lines.append(')"') + runner_lines.append(' echo "[odysseus] Patched vLLM internal swap_space default to 0 for this runtime."') + runner_lines.append('fi') elif "sglang.launch_server" in req.cmd: runner_lines.append('export PATH="$HOME/.local/bin:$PATH"') runner_lines.append('if ! command -v sglang &>/dev/null; then') @@ -1620,7 +1783,10 @@ def setup_cookbook_routes() -> APIRouter: runner_lines, keep_shell_open=not local_windows, ) - runner_lines.append(req.cmd) + if "vllm serve" in req.cmd: + runner_lines.append('eval "$ODYSSEUS_SERVE_CMD"') + else: + runner_lines.append(req.cmd) if local_windows: # Detached background process — no interactive shell to keep open. # Print the exit marker the status poller looks for, then stop. @@ -2418,16 +2584,14 @@ def setup_cookbook_routes() -> APIRouter: # Add 30% headroom for KV cache, activations, etc. needed_vram = (est_vram * 1.3) if est_vram else None - if vram_gb > 0 and needed_vram is not None and needed_vram > vram_gb: - continue - # Unknown-size models (e.g. MiniMax-M2.7, DeepSeek-V4-Flash) have no - # "NB" in the repo id, so the regex above can't extract their - # param count. Previously we dropped them entirely, which made - # brand-new flagship releases silently vanish from this list even - # on rigs with hundreds of GB of VRAM. Adapters/LoRAs are already - # filtered by _is_excluded(), so what falls through here is - # overwhelmingly full models — keep them, just without a size - # badge (the frontend handles needed_vram_gb=null gracefully). + if vram_gb > 0: + if needed_vram is None: + # The "trending models that fit" list must be conservative: + # if we cannot estimate size from the repo id/tags, do not + # present it as runnable on this hardware. + continue + if needed_vram > vram_gb: + continue out.append({ "repo_id": repo_id, @@ -2624,6 +2788,32 @@ def setup_cookbook_routes() -> APIRouter: except Exception as e: logger.warning(f"orphan sweep: state write failed: {e}") + @router.get("/api/cookbook/hf-gguf-files") + async def hf_gguf_files(repo_id: str, owner: str = Depends(require_user)): + """List GGUF files in a HuggingFace repo for the direct-download picker.""" + import httpx + + repo_id = _validate_repo_id(repo_id) + url = f"https://huggingface.co/api/models/{repo_id}" + try: + headers = {} + token = _load_stored_hf_token() + if token: + headers["Authorization"] = f"Bearer {token}" + async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client: + resp = await client.get(url, headers=headers) + if resp.status_code != 200: + return {"ok": False, "files": [], "error": f"HF API HTTP {resp.status_code}"} + data = resp.json() + except Exception as e: + return {"ok": False, "files": [], "error": str(e)} + files = [ + str(s.get("rfilename") or "") + for s in data.get("siblings", []) + if str(s.get("rfilename") or "").lower().endswith(".gguf") + ] + return {"ok": True, "repo_id": repo_id, "files": files} + # In-memory cache for the Ollama library scrape. ollama.com is a public # site, but it doesn't expose a stable JSON listing — we fetch the HTML # search page and regex out the model cards. Cached for 1 h so a busy diff --git a/routes/email_helpers.py b/routes/email_helpers.py index b3df6a560..a054f4df8 100644 --- a/routes/email_helpers.py +++ b/routes/email_helpers.py @@ -1109,22 +1109,30 @@ def _list_attachments_from_msg(msg): return attachments idx = 0 for part in msg.walk(): - if part.is_multipart(): - continue cd = str(part.get("Content-Disposition", "")) ct = part.get_content_type() + is_attached_email = ct == "message/rfc822" and ("attachment" in cd.lower() or part.get_filename()) + if part.is_multipart() and not is_attached_email: + continue # Skip text/html body parts (only consider real attachments) if ct in ("text/plain", "text/html") and "attachment" not in cd: continue filename = part.get_filename() if filename: filename = _decode_header(filename) + if ct == "message/rfc822" and not re.search(r"\.[A-Za-z0-9]{1,8}$", filename): + filename = f"{filename}.eml" else: # Inline images, etc. - generate a name - ext = ct.split("/")[-1] if "/" in ct else "bin" + ext = "eml" if ct == "message/rfc822" else (ct.split("/")[-1] if "/" in ct else "bin") filename = f"attachment_{idx}.{ext}" payload = part.get_payload(decode=True) - size = len(payload) if payload else 0 + if payload is None and ct == "message/rfc822": + try: + payload = part.as_bytes() + except Exception: + payload = b"" + size = len(payload) if payload is not None else 0 attachments.append({ "index": idx, "filename": filename, @@ -1136,29 +1144,58 @@ def _list_attachments_from_msg(msg): return attachments +def _is_likely_signature_image_attachment(att: dict) -> bool: + """Match the reader's inline signature/logo image filter.""" + filename = str((att or {}).get("filename") or "").lower() + if not re.search(r"\.(png|jpe?g|gif|bmp|svg|webp)$", filename): + return False + size = int((att or {}).get("size") or 0) + if re.search(r"^image\d{3,}\.(png|jpe?g|gif)$", filename): + return True + if re.search(r"^(signature|logo|sig|footer|banner)[-_\d]*\.(png|jpe?g|gif|svg)$", filename): + return True + return 0 < size < 30 * 1024 + + +def _has_visible_attachments(msg) -> bool: + """Return True only for attachments the reader will render as chips.""" + return any( + not _is_likely_signature_image_attachment(att) + for att in _list_attachments_from_msg(msg) + ) + + def _extract_attachment_to_disk(msg, index, target_dir): """Extract a specific attachment to disk and return the file path.""" if not msg.is_multipart(): return None idx = 0 for part in msg.walk(): - if part.is_multipart(): - continue cd = str(part.get("Content-Disposition", "")) ct = part.get_content_type() + is_attached_email = ct == "message/rfc822" and ("attachment" in cd.lower() or part.get_filename()) + if part.is_multipart() and not is_attached_email: + continue if ct in ("text/plain", "text/html") and "attachment" not in cd: continue if idx == index: filename = part.get_filename() if filename: filename = _decode_header(filename) + if ct == "message/rfc822" and not re.search(r"\.[A-Za-z0-9]{1,8}$", filename): + filename = f"{filename}.eml" else: - ext = ct.split("/")[-1] if "/" in ct else "bin" + ext = "eml" if ct == "message/rfc822" else (ct.split("/")[-1] if "/" in ct else "bin") filename = f"attachment_{idx}.{ext}" # Sanitize safe_name = re.sub(r"[^\w\s\-.]", "_", filename).strip() payload = part.get_payload(decode=True) - if not payload: + if payload is None and ct == "message/rfc822": + try: + payload = part.as_bytes() + except Exception: + payload = b"" + if payload is None: return None target_dir.mkdir(parents=True, exist_ok=True) filepath = target_dir / safe_name diff --git a/routes/email_routes.py b/routes/email_routes.py index 416ad55b2..81d2b7330 100644 --- a/routes/email_routes.py +++ b/routes/email_routes.py @@ -44,7 +44,7 @@ from routes.email_helpers import ( _send_smtp_message, _smtp_security_mode, _IMAP_TIMEOUT_SECONDS, _open_imap_connection, _imap_connect, _imap, _decode_header, _detect_sent_folder, _detect_drafts_folder, - _extract_attachment_text, _list_attachments_from_msg, + _extract_attachment_text, _list_attachments_from_msg, _has_visible_attachments, _is_likely_signature_image_attachment, _extract_attachment_to_disk, _extract_html, _extract_text, _fetch_sender_thread_context, _pre_retrieve_context, _EMAIL_REPLY_SYS_PROMPT_BASE, _POOL_HOOKS, @@ -58,6 +58,7 @@ from routes.email_pollers import _start_poller logger = logging.getLogger(__name__) ODYSSEUS_MAIL_ORIGIN = "odysseus-ui" +EMAIL_READ_ATTACHMENT_VERSION = 2 def _email_tag_owner_aliases(account_id: str | None, owner: str = "") -> list[str]: @@ -244,6 +245,21 @@ def _imap_uid_fetch(conn, uid_set: str | bytes, query: str): return conn.uid("FETCH", _uid_bytes(uid_set), query) +def _imap_search_quote(value: str) -> str: + return '"' + str(value or "").replace("\\", "\\\\").replace('"', '\\"') + '"' + + +def _message_id_chain(*values: str) -> list[str]: + seen = set() + out = [] + for value in values: + for mid in re.findall(r"<[^>]+>", value or ""): + if mid not in seen: + seen.add(mid) + out.append(mid) + return out + + def _uid_from_fetch_meta(meta_b: bytes) -> str: m = re.search(rb"\bUID\s+(\d+)\b", meta_b) return m.group(1).decode() if m else "" @@ -1003,6 +1019,65 @@ def setup_email_routes(): except Exception: pass + def _related_thread_attachments_sync( + folder: str, + account_id: str | None, + owner: str, + current_uid: str, + current_message_id: str, + in_reply_to: str, + references: str, + limit: int = 12, + ) -> list[dict]: + """Return visible attachments from referenced messages in this folder.""" + wanted_ids = _message_id_chain(references, in_reply_to) + current_mid = (current_message_id or "").strip() + wanted_ids = [mid for mid in wanted_ids if mid and mid != current_mid] + if not wanted_ids: + return [] + + related: list[dict] = [] + try: + with _imap(account_id, owner=owner) as conn: + conn.select(_q(folder), readonly=True) + # Search newest referenced messages first; cap work so opening + # a long thread stays bounded. + for mid in reversed(wanted_ids[-10:]): + if len(related) >= limit: + break + status, data = _imap_uid_search(conn, f'(HEADER Message-ID {_imap_search_quote(mid)})') + if status != "OK" or not data or not data[0]: + continue + for uid_b in reversed(data[0].split()[-3:]): + source_uid = uid_b.decode(errors="ignore") + if not source_uid or source_uid == str(current_uid): + continue + st2, msg_data = _imap_uid_fetch(conn, source_uid, "(BODY.PEEK[])") + if st2 != "OK" or not msg_data or not isinstance(msg_data[0], tuple): + continue + msg = email_mod.message_from_bytes(msg_data[0][1]) + source_from = _decode_header(msg.get("From", "")) + source_subject = _decode_header(msg.get("Subject", "")) + source_date = msg.get("Date", "") + for att in _list_attachments_from_msg(msg): + if _is_likely_signature_image_attachment(att): + continue + enriched = dict(att) + enriched.update({ + "source_uid": source_uid, + "source_folder": folder, + "source_message_id": (msg.get("Message-ID") or "").strip(), + "source_from": source_from, + "source_subject": source_subject, + "source_date": source_date, + }) + related.append(enriched) + if len(related) >= limit: + break + except Exception as e: + logger.debug(f"related thread attachment lookup failed uid={current_uid}: {e}") + return related + @router.get("/list") async def list_emails( folder: str = Query("INBOX"), @@ -1273,6 +1348,17 @@ def setup_email_routes(): sender_name, sender_addr = email.utils.parseaddr(sender) parsed_date = email.utils.parsedate_to_datetime(date_str) if date_str else None attachments = _list_attachments_from_msg(msg) + related_attachments = [] + if not _has_visible_attachments(msg): + related_attachments = _related_thread_attachments_sync( + folder, + account_id, + owner, + uid, + message_id, + in_reply_to, + references, + ) if mark_seen: # Set \Seen in a separate readwrite session so concurrent reads @@ -1381,6 +1467,8 @@ def setup_email_routes(): "body": body, "body_html": body_html, "attachments": attachments, + "related_attachments": related_attachments, + "attachment_version": EMAIL_READ_ATTACHMENT_VERSION, "cached_summary": cached_summary, "cached_ai_reply": cached_ai_reply, "boundaries": cached_boundaries, @@ -1411,6 +1499,12 @@ def setup_email_routes(): """Read email body. Cached for 30m, sync IMAP work runs in a thread.""" ck = _read_cache_key(account_id, folder, uid, owner=owner) cached = _read_cache_get(ck) + if cached is not None: + # Older cached read responses lack the thread-attachment fallback. + # Fetch once so replies that reference prior attachments can show + # those files without waiting for cache expiry. + if cached.get("attachment_version") != EMAIL_READ_ATTACHMENT_VERSION: + cached = None if cached is not None: if mark_seen: try: @@ -1599,6 +1693,65 @@ def setup_email_routes(): return None doc_session_id = _resolve_doc_session() + def _create_markdown_doc(content: str, summary: str): + from src.database import SessionLocal as _SL, Document as _Doc, DocumentVersion as _DV + doc_id = str(uuid.uuid4()) + ver_id = str(uuid.uuid4()) + _db = _SL() + try: + _db.query(_Doc).filter(_Doc.is_active == True).update({"is_active": False}) + _db.add(_Doc( + id=doc_id, session_id=doc_session_id, title=title, + language="markdown", current_content=content, + version_count=1, is_active=True, + )) + _db.add(_DV( + id=ver_id, document_id=doc_id, version_number=1, + content=content, summary=summary, source="upload", + )) + _db.commit() + finally: + _db.close() + _tag_doc_with_source(doc_id) + return doc_id + + def _attached_email_markdown(path): + raw_bytes = path.read_bytes() + if not raw_bytes: + return f"# Attached email: {base}\n\n_(empty email attachment)_" + try: + attached_msg = email_mod.message_from_bytes(raw_bytes) + except Exception as e: + return f"# Attached email: {base}\n\nCould not parse this email attachment: {e}" + + attached_subject = _decode_header(attached_msg.get("Subject", "")) or base + attached_from = _decode_header(attached_msg.get("From", "")) + attached_to = _decode_header(attached_msg.get("To", "")) + attached_cc = _decode_header(attached_msg.get("Cc", "")) + attached_date = attached_msg.get("Date", "") + attached_body = _extract_text(attached_msg).strip() + attached_atts = _list_attachments_from_msg(attached_msg) + + lines = [f"# Attached email: {attached_subject}", ""] + if attached_from: + lines.append(f"**From:** {attached_from}") + if attached_to: + lines.append(f"**To:** {attached_to}") + if attached_cc: + lines.append(f"**Cc:** {attached_cc}") + if attached_date: + lines.append(f"**Date:** {attached_date}") + lines.extend(["", "## Body", "", attached_body or "_(no readable body)_"]) + if attached_atts: + lines.extend(["", "## Attachments", ""]) + for att in attached_atts: + size = int(att.get("size") or 0) + size_label = f"{size} B" if size < 1024 else f"{round(size / 1024)} KB" + name = att.get("filename") or f"attachment_{att.get('index', '')}" + ctype = att.get("content_type") or "application/octet-stream" + lines.append(f"- {name} ({ctype}, {size_label})") + return "\n".join(lines).strip() + # ── PDF path (existing) ──────────────────────────────────── if ext == ".pdf": import shutil as _shutil @@ -1645,6 +1798,15 @@ def setup_email_routes(): _tag_doc_with_source(doc_id) return {"doc_id": doc_id, "filename": filepath.name} + # ── Attached email (.eml / message/rfc822) ──────────────── + if ext == ".eml": + try: + content = _attached_email_markdown(filepath) + except Exception as e: + return {"error": f"Failed to read email attachment: {e}", "filename": base} + doc_id = _create_markdown_doc(content, "Imported attached email") + return {"doc_id": doc_id, "filename": filepath.name} + # ── DOCX path: extract text → markdown document ─────────── if ext == ".docx": try: @@ -1682,25 +1844,7 @@ def setup_email_routes(): lines.append("") content = "\n".join(lines).strip() or f"_(empty {base})_" - from src.database import SessionLocal as _SL, Document as _Doc, DocumentVersion as _DV - doc_id = str(uuid.uuid4()) - ver_id = str(uuid.uuid4()) - _db = _SL() - try: - _db.query(_Doc).filter(_Doc.is_active == True).update({"is_active": False}) - _db.add(_Doc( - id=doc_id, session_id=doc_session_id, title=title, - language="markdown", current_content=content, - version_count=1, is_active=True, - )) - _db.add(_DV( - id=ver_id, document_id=doc_id, version_number=1, - content=content, summary="Imported from DOCX", source="upload", - )) - _db.commit() - finally: - _db.close() - _tag_doc_with_source(doc_id) + doc_id = _create_markdown_doc(content, "Imported from DOCX") return {"doc_id": doc_id, "filename": filepath.name} # ── Plain text / markdown ──────────────────────────────── @@ -1709,25 +1853,7 @@ def setup_email_routes(): content = filepath.read_text(encoding="utf-8", errors="replace") except Exception as e: return {"error": f"Failed to read text file: {e}", "filename": base} - from src.database import SessionLocal as _SL, Document as _Doc, DocumentVersion as _DV - doc_id = str(uuid.uuid4()) - ver_id = str(uuid.uuid4()) - _db = _SL() - try: - _db.query(_Doc).filter(_Doc.is_active == True).update({"is_active": False}) - _db.add(_Doc( - id=doc_id, session_id=doc_session_id, title=title, - language="markdown", current_content=content, - version_count=1, is_active=True, - )) - _db.add(_DV( - id=ver_id, document_id=doc_id, version_number=1, - content=content, summary="Imported from email attachment", source="upload", - )) - _db.commit() - finally: - _db.close() - _tag_doc_with_source(doc_id) + doc_id = _create_markdown_doc(content, "Imported from email attachment") return {"doc_id": doc_id, "filename": filepath.name} return {"error": f"Unsupported attachment type: {ext}", "filename": base} diff --git a/routes/hwfit_routes.py b/routes/hwfit_routes.py index 5e38b9ca3..0dad0ddd7 100644 --- a/routes/hwfit_routes.py +++ b/routes/hwfit_routes.py @@ -1,8 +1,13 @@ +import json +import os import re +import shlex +import subprocess from copy import deepcopy from fastapi import APIRouter, HTTPException +from core.platform_compat import run_ssh_command from routes._validators import validate_remote_host, validate_ssh_port @@ -107,6 +112,73 @@ def _apply_manual_hardware(system, manual_mode="", manual_gpu_count="", manual_v return system +def _run_model_probe(host: str, ssh_port: str, cmd: str) -> str: + try: + if host: + r = run_ssh_command( + host, + ssh_port or None, + cmd, + timeout=15, + connect_timeout=5, + strict_host_key_checking=False, + text=True, + ) + else: + r = subprocess.run(["bash", "-lc", cmd], capture_output=True, text=True, timeout=15) + if r.returncode == 0: + return (r.stdout or "").strip() + except Exception: + return "" + return "" + + +def _inspect_model_path(model_path: str, host: str = "", ssh_port: str = "") -> dict: + """Read lightweight metadata from a local or SSH-visible HF model folder.""" + path = (model_path or "").strip() + if not path or path.startswith(("http://", "https://")): + return {} + if not (path.startswith("/") or path.startswith("~")): + return {} + + qpath = shlex.quote(path) + qconfig = shlex.quote(os.path.join(path, "config.json")) + out = {} + exists = _run_model_probe(host, ssh_port, f"test -d {qpath} && printf found || printf missing") + if exists != "found": + target = host or "local container" + out["model_probe_error"] = f"Model path is not visible on {target}: {path}" + return out + raw_config = _run_model_probe(host, ssh_port, f"test -f {qconfig} && sed -n '1,240p' {qconfig}") + if raw_config: + try: + cfg = json.loads(raw_config) + except Exception: + cfg = {} + for key in ("context_length", "max_position_embeddings", "n_ctx_train", "model_max_length", "max_seq_len"): + value = cfg.get(key) + if isinstance(value, (int, float)) and value > 0: + out["model_ctx_max"] = int(value) + break + else: + out["model_probe_error"] = f"config.json not found in model path: {path}" + + size_cmd = ( + f"find {qpath} -type f \\( -name '*.safetensors' -o -name '*.bin' -o -name '*.gguf' \\) " + "-printf '%s\\n' 2>/dev/null | awk '{s+=$1} END {if (s>0) printf \"%.6f\", s/1073741824}'" + ) + weights = _run_model_probe(host, ssh_port, size_cmd) + try: + weights_gb = float(weights) + except Exception: + weights_gb = 0.0 + if weights_gb > 0: + out["model_weights_gb"] = round(weights_gb, 3) + elif "model_probe_error" not in out: + out["model_probe_error"] = f"No model weight files found in: {path}" + return out + + def setup_hwfit_routes(): router = APIRouter(prefix="/api/hwfit", tags=["hwfit"]) @@ -235,7 +307,7 @@ def setup_hwfit_routes(): return {"system": system, "models": results} @router.get("/profiles") - def get_serve_profiles(model: str = "", host: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False, serve_weights_gb: float = 0.0, serve_quant: str = ""): + def get_serve_profiles(model: str = "", model_path: str = "", host: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False, serve_weights_gb: float = 0.0, serve_quant: str = ""): """Compute llama.cpp serve profiles (Quality/Balanced/Speed) for `model` against the detected hardware on `host` (or local). Returns concrete flags (n_gpu_layers, n_cpu_moe, cache_type, ctx) the serve UI can apply. @@ -272,8 +344,16 @@ def setup_hwfit_routes(): if nn and (nn == want or want.endswith(nn) or nn.endswith(want)): m = entry break + path_meta = _inspect_model_path(model_path or model, host=host, ssh_port=ssh_port) if m is None: - return {"system": system, "profiles": [], "error": "model not in catalog"} + return { + "system": system, + "profiles": [], + "error": "model not in catalog", + "model_ctx_max": int(path_meta.get("model_ctx_max") or 0), + "model_weights_gb": float(path_meta.get("model_weights_gb") or 0), + "model_probe_error": path_meta.get("model_probe_error") or "", + } # Surface the model's trained context limit so the serve UI can clamp a # user-typed context down to it (asking for ctx > n_ctx_train overflows # and, with a quantized KV cache, can crash the GPU). @@ -283,6 +363,16 @@ def setup_hwfit_routes(): if isinstance(v, (int, float)) and v > 0: model_ctx_max = int(v) break + path_ctx_max = int(path_meta.get("model_ctx_max") or 0) + if path_ctx_max > 0: + model_ctx_max = max(model_ctx_max, path_ctx_max) + model_weights_gb = float(path_meta.get("model_weights_gb") or 0) + if model_weights_gb <= 0: + for k in ("min_vram_gb", "required_gb", "size_gb", "recommended_ram_gb", "min_ram_gb"): + v = m.get(k) + if isinstance(v, (int, float)) and v > 0: + model_weights_gb = float(v) + break return { "system": system, "profiles": compute_serve_profiles( @@ -291,6 +381,8 @@ def setup_hwfit_routes(): serve_quant=(serve_quant or None), ), "model_ctx_max": model_ctx_max, + "model_weights_gb": model_weights_gb, + "model_probe_error": path_meta.get("model_probe_error") or "", } @router.get("/image-models") diff --git a/routes/model_routes.py b/routes/model_routes.py index 000cd9379..69528f6dc 100644 --- a/routes/model_routes.py +++ b/routes/model_routes.py @@ -1064,9 +1064,11 @@ def setup_model_routes(model_discovery): except Exception: return 0.0 - def _failure_delay(fails: int) -> float: + def _failure_delay(fails: int, *, empty_local: bool = False) -> float: if fails <= 0: return 0.0 + if empty_local: + return min(5.0 * (2 ** max(0, fails - 1)), 30.0) return min(_REFRESH_FAILURE_BASE * (2 ** max(0, fails - 1)), _REFRESH_FAILURE_MAX) def _should_refresh_endpoint(ep: Any, now: float, force: bool = False) -> tuple[bool, Dict[str, Any]]: @@ -1097,7 +1099,12 @@ def setup_model_routes(model_discovery): fails = int(state.get("fail_count") or 0) if fails and not force: last_failure = float(state.get("last_failure") or 0.0) - if now - last_failure < _failure_delay(fails): + empty_local = ( + not cached + and category == "local" + and str(getattr(ep, "id", "") or "").startswith("local-") + ) + if now - last_failure < _failure_delay(fails, empty_local=empty_local): return False, info if cached and not force: interval = _endpoint_refresh_interval(ep, category) diff --git a/routes/shell_routes.py b/routes/shell_routes.py index 406d80bb3..245181832 100644 --- a/routes/shell_routes.py +++ b/routes/shell_routes.py @@ -330,6 +330,9 @@ def add_user_install_bins_to_path(): candidates.append(os.path.join(site.USER_BASE, 'bin')) except Exception: pass + candidates.append(os.path.expanduser('~/bin')) + candidates.append(os.path.expanduser('~/llama.cpp/build/bin')) + candidates.append(os.path.expanduser('~/llama.cpp/build-vulkan/bin')) candidates.append(os.path.expanduser('~/.local/bin')) parts = os.environ.get('PATH', '').split(os.pathsep) if os.environ.get('PATH') else [] changed = False @@ -1188,6 +1191,7 @@ def setup_shell_routes() -> APIRouter: # venv over SSH so a remote `pip install` actually reflects here. remote_status: dict = {} remote_details: dict = {} + remote_probe_error = "" remote_names = [ p["name"] for p in packages @@ -1226,8 +1230,34 @@ def setup_shell_routes() -> APIRouter: break except ValueError as e: raise HTTPException(400, str(e)) - except Exception: + except Exception as e: remote_status = {} + remote_probe_error = f"SSH package probe failed: {str(e)[:160]}" + if "llama_cpp" in remote_names: + try: + inner = ( + 'export PATH="$HOME/.local/bin:$HOME/bin:' + '$HOME/llama.cpp/build/bin:$HOME/llama.cpp/build-vulkan/bin:$PATH"; ' + "command -v llama-server 2>/dev/null || true" + ) + argv = _ssh_base_argv(host, ssh_port) + [inner] + proc = await asyncio.create_subprocess_exec( + *argv, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + out, _err = await asyncio.wait_for(proc.communicate(), timeout=8) + llama_server_path = out.decode("utf-8", errors="replace").strip().splitlines() + llama_server_path = llama_server_path[-1].strip() if llama_server_path else "" + if llama_server_path: + remote_status["llama_cpp"] = True + probe = remote_details.setdefault("llama_cpp", {}) + if isinstance(probe, dict): + probe.setdefault("binaries", {})["llama-server"] = llama_server_path + except Exception as e: + if not remote_probe_error: + remote_probe_error = f"SSH llama-server probe failed: {str(e)[:160]}" + pass # Union of system_names + every package's system_prereqs. Probing # the prereqs alongside the main system deps in a single SSH call # avoids a second round-trip per Cookbook → Dependencies refresh. @@ -1272,7 +1302,9 @@ def setup_shell_routes() -> APIRouter: target_os_id = _os_id_from_release("\n".join(_osrel_lines)) except ValueError as e: raise HTTPException(400, str(e)) - except Exception: + except Exception as e: + if not remote_probe_error: + remote_probe_error = f"SSH system probe failed: {str(e)[:160]}" pass elif not host: # Local target — probe in-process so the inline install command @@ -1290,7 +1322,12 @@ def setup_shell_routes() -> APIRouter: on_remote = bool(host and pkg.get("target") == "remote") probe = None if on_remote: - pkg["installed"] = bool(remote_status.get(pkg["name"], False)) + if remote_probe_error and pkg["name"] not in remote_status: + pkg["installed"] = None + pkg["probe_error"] = remote_probe_error + pkg["status_note"] = remote_probe_error + else: + pkg["installed"] = bool(remote_status.get(pkg["name"], False)) probe = remote_details.get(pkg["name"]) if isinstance(probe, dict): pkg["details"] = probe @@ -1353,9 +1390,19 @@ def setup_shell_routes() -> APIRouter: # reads "ready" green while inference runs at 3 tok/s on GPU # silicon — actively misleading. if pkg["name"] == "llama_cpp" and pkg.get("installed"): + _native_llama_server = bool( + isinstance(probe, dict) + and isinstance(probe.get("binaries"), dict) + and probe["binaries"].get("llama-server") + ) _gpu_capable = False _has_nvidia_target = False - if on_remote and host: + if _native_llama_server: + # Native llama-server is the launcher path Cookbook now + # prefers. Do not mark this as a CPU-only Python wheel just + # because llama-cpp-python is absent from the selected venv. + _gpu_capable = True + elif on_remote and host: try: # Activate the configured venv FIRST so the probe # runs against the same python the launch script @@ -1609,7 +1656,8 @@ def setup_shell_routes() -> APIRouter: return {"ok": False, "error": f"Unsupported engine: {engine}"} host = str(body.get("remote_host") or "").strip() ssh_port = body.get("ssh_port") - cmd = _llama_cpp_rebuild_cmd() + update_source = bool(body.get("update_source")) + cmd = _llama_cpp_rebuild_cmd(update_source=update_source) try: argv = ( (_ssh_base_argv(host, ssh_port) + [cmd]) diff --git a/src/agent_loop.py b/src/agent_loop.py index e574a42b6..40e4232bb 100644 --- a/src/agent_loop.py +++ b/src/agent_loop.py @@ -751,6 +751,17 @@ def _extract_last_user_message(messages: List[Dict]) -> str: _LOW_SIGNAL_RE = re.compile(r"^[\W_]*$", re.UNICODE) +_CASUAL_OPENING_RE = re.compile( + r"^\s*(?:h+i+|hey+|hello+|yo+|sup+|what'?s up|wass?up|hiya|howdy|" + r"lol|lmao|haha+|hehe+|thanks?|thank you|ty|idk|dunno|meh|bruh|bro)\b(?P.*)$", + re.IGNORECASE, +) +_CASUAL_BLOCKLIST_RE = re.compile( + r"\b(?:cookbook|serve|serving|launch|start|vllm|sglang|llama\.?cpp|ollama|" + r"download|model|email|document|doc|note|calendar|task|search|web|research|" + r"file|folder|repo|git|settings?|endpoint|api|token|mcp)\b", + re.IGNORECASE, +) _EXPLICIT_CONTINUATION_RE = re.compile( r"^\s*(?:" r"yes|y|yeah|yep|ok|okay|sure|do it|go ahead|continue|carry on|" @@ -760,6 +771,17 @@ _EXPLICIT_CONTINUATION_RE = re.compile( r")\s*[.!?]*\s*$", re.IGNORECASE, ) +_RETRY_CONTINUATION_RE = re.compile( + r"\b(?:try again|retry|again|rerun|re-run|run it again|launch it again|" + r"start it again|failed|fails?|died|crashed|broke|insta|instantly)\b", + re.IGNORECASE, +) +_COOKBOOK_CONTEXT_RE = re.compile( + r"\b(?:cookbook|serve|serving|served|launch|start|preset|vllm|sglang|" + r"llama\.?cpp|ollama|download|cached models?|model servers?|running models?|" + r"gpu box|ajax|qwen|gemma|llama|mistral|minimax)\b", + re.IGNORECASE, +) def _is_explicit_continuation(text: str) -> bool: @@ -767,6 +789,37 @@ def _is_explicit_continuation(text: str) -> bool: return bool(_EXPLICIT_CONTINUATION_RE.match(str(text or "").strip())) +def _is_casual_low_signal(text: str) -> bool: + """True for short greetings/slang that should not inherit stale context.""" + s = str(text or "").strip() + m = _CASUAL_OPENING_RE.match(s) + if not m: + return False + tail = m.group("tail") or "" + if _CASUAL_BLOCKLIST_RE.search(tail): + return False + # Allow a short vocative/address after the opener without hardcoding the + # address term itself: "hey man", "yo dude", "sup ". Longer tails are + # more likely to be an actual request and should get normal context/tooling. + tail_words = re.findall(r"[A-Za-z0-9_'-]+", tail) + return len(tail_words) <= 2 + + +def _is_contextual_retry_continuation(messages: List[Dict], text: str) -> bool: + """Treat "try again / it failed" as a continuation only for active tool work. + + These follow-ups are common after Cookbook launches: the latest user turn + says only "try again it failed", while the actionable model/host/command + details live one or two turns back. Keep this intentionally narrow so + ordinary chat does not inherit stale Cookbook context. + """ + latest = str(text or "").strip() + if not latest or not _RETRY_CONTINUATION_RE.search(latest): + return False + recent = _recent_context_for_retrieval(messages, max_user=5, max_chars=1200) + return bool(_COOKBOOK_CONTEXT_RE.search(recent)) + + def _assistant_requested_followup(messages: List[Dict]) -> bool: """True when the previous assistant turn asked for missing task details. @@ -808,11 +861,12 @@ def _classify_agent_request(messages: List[Dict], last_user: str) -> Dict[str, o which domain rule packs get appended to the system prompt. """ text = str(last_user or "").strip() - continuation = _is_explicit_continuation(text) or _assistant_requested_followup(messages) + retry_continuation = _is_contextual_retry_continuation(messages, text) + continuation = _is_explicit_continuation(text) or _assistant_requested_followup(messages) or retry_continuation retrieval_query = _recent_context_for_retrieval(messages) if continuation else text q = retrieval_query.lower() - if not text or bool(_LOW_SIGNAL_RE.match(text)): + if not text or bool(_LOW_SIGNAL_RE.match(text)) or _is_casual_low_signal(text): return { "low_signal": True, "continuation": False, @@ -907,6 +961,7 @@ def _build_system_prompt( compact: bool = False, owner: Optional[str] = None, suppress_local_context: bool = False, + suppress_skills: bool = False, active_email: Optional[Dict[str, str]] = None, ) -> List[Dict]: """Build agent system prompt, inject MCP/document context, merge consecutive system msgs.""" @@ -924,7 +979,7 @@ def _build_system_prompt( _ov_sig = _hl.sha256(_json.dumps(get_builtin_overrides() or {}, sort_keys=True).encode()).hexdigest() except Exception: _ov_sig = "" - cache_key = (frozenset(disabled_tools or []), bool(mcp_mgr), needs_admin, _rt_key, compact, _ov_sig, owner, suppress_local_context) + cache_key = (frozenset(disabled_tools or []), bool(mcp_mgr), needs_admin, _rt_key, compact, _ov_sig, owner, suppress_local_context, suppress_skills) if _cached_base_prompt and _cached_base_prompt_key == cache_key and not active_document: agent_prompt = _cached_base_prompt # Skill index is user-editable (name + description), so it must never @@ -934,6 +989,7 @@ def _build_system_prompt( disabled_tools, mcp_mgr, needs_admin, relevant_tools, mcp_disabled_map=mcp_disabled_map, compact=compact, owner=owner, suppress_local_context=suppress_local_context, + suppress_skills=suppress_skills, ) else: agent_prompt, _skill_index_block = _build_base_prompt( @@ -945,6 +1001,7 @@ def _build_system_prompt( compact=compact, owner=owner, suppress_local_context=suppress_local_context, + suppress_skills=suppress_skills, ) if not active_document: _cached_base_prompt = agent_prompt @@ -1228,7 +1285,7 @@ def _build_system_prompt( # few. If the teacher wrote a procedure for "open my X chat" last # time the student failed, this is where the student finds it # before deciding which tool to call. - if not suppress_local_context: + if not suppress_local_context and not suppress_skills: try: last_user = _extract_last_user_message(messages) # Respect the user's skills-enabled toggle (mirrors memory_enabled). @@ -1395,6 +1452,7 @@ def _build_base_prompt( compact: bool = False, owner: Optional[str] = None, suppress_local_context: bool = False, + suppress_skills: bool = False, ): """Build the agent prompt with only relevant tools included. @@ -1447,7 +1505,7 @@ def _build_base_prompt( # The caller wraps it in untrusted_context_message and ships it as a # user-role message — same treatment as the matched-skills block. skill_index_block = "" - if not suppress_local_context: + if not suppress_local_context and not suppress_skills: try: from services.memory.skills import SkillsManager from src.constants import DATA_DIR @@ -1866,6 +1924,7 @@ async def stream_agent_loop( approved_plan: Optional[str] = None, tool_policy: Optional[ToolPolicy] = None, workspace: Optional[str] = None, + forced_tools: Optional[Set[str]] = None, _is_teacher_run: bool = False, ) -> AsyncGenerator[str, None]: """Streaming agent loop generator. @@ -1905,6 +1964,18 @@ async def stream_agent_loop( _needs_admin = _detect_admin_intent(messages) _last_user = _extract_last_user_message(messages) _intent = _classify_agent_request(messages, _last_user) + _low_signal_turn = bool(_intent.get("low_signal")) + _casual_low_signal_turn = _is_casual_low_signal(_last_user) + _direct_low_signal = ( + _low_signal_turn + and not bool(_intent.get("continuation")) + and not plan_mode + and not approved_plan + and (_casual_low_signal_turn or active_document is None) + and (_casual_low_signal_turn or not active_email) + and (_casual_low_signal_turn or not workspace) + and not forced_tools + ) # Tool retrieval uses the latest message by default. It may inherit recent # user turns only for explicit continuations ("yes", "do it", "1"). _retrieval_query = str(_intent.get("retrieval_query") or _last_user) @@ -1912,11 +1983,86 @@ async def stream_agent_loop( "[agent-intent] latest=%r continuation=%s low_signal=%s domains=%s retrieval_query=%r", _last_user[:120], bool(_intent.get("continuation")), - bool(_intent.get("low_signal")), + _low_signal_turn, sorted(_intent.get("domains") or []), _retrieval_query[:200], ) _mcp_disabled_map = _load_mcp_disabled_map() if mcp_mgr else {} + if _direct_low_signal: + logger.info("[agent] direct low-signal reply path for latest=%r", _last_user[:80]) + direct_messages = [{"role": "user", "content": _last_user}] + direct_response = "" + direct_start = time.time() + direct_actual_model = model + real_input_tokens = 0 + real_output_tokens = 0 + try: + async for chunk in stream_llm_with_fallback( + [(endpoint_url, model, headers)] + list(fallbacks or []), + direct_messages, + temperature=temperature, + max_tokens=min(max_tokens or 128, 128), + prompt_type=None, + tools=None, + timeout=int(get_setting("agent_stream_timeout_seconds", 300) or 300), + session_id=session_id, + ): + if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"): + try: + data = json.loads(chunk[6:]) + except json.JSONDecodeError: + yield chunk + continue + if data.get("type") == "usage": + usage = data.get("data", {}) or {} + direct_actual_model = usage.get("model") or direct_actual_model + real_input_tokens += usage.get("input_tokens", 0) or 0 + real_output_tokens += usage.get("output_tokens", 0) or 0 + continue + if data.get("type") == "model_actual": + direct_actual_model = data.get("model") or direct_actual_model + data["requested_model"] = model + yield f"data: {json.dumps(data)}\n\n" + continue + if data.get("type") == "fallback": + direct_actual_model = data.get("answered_by") or direct_actual_model + yield chunk + continue + if "delta" in data: + if not data.get("thinking"): + direct_response += data.get("delta", "") + yield chunk + continue + yield chunk + elif chunk.startswith("event: "): + yield chunk + except Exception as _direct_err: + logger.warning("[agent] direct low-signal path failed: %s", _direct_err) + fallback = "Hey." + direct_response += fallback + yield f"data: {json.dumps({'delta': fallback})}\n\n" + + if not direct_response.strip(): + fallback = "Hey." + direct_response = fallback + yield f"data: {json.dumps({'delta': fallback})}\n\n" + + duration = time.time() - direct_start + metrics = { + "model": direct_actual_model, + "requested_model": model, + "input_tokens": real_input_tokens or estimate_tokens(direct_messages), + "output_tokens": real_output_tokens or max(len(direct_response) // 4, 1), + "total_time": round(duration, 2), + "response_time": round(duration, 2), + "agent_rounds": 0, + "tool_calls": 0, + "direct_low_signal": True, + } + yield f"data: {json.dumps({'type': 'metrics', 'data': metrics})}\n\n" + yield "data: [DONE]\n\n" + return + if plan_mode and mcp_mgr: # Allow read-only MCP tools to investigate, block write/unknown ones: # hide them from the schemas AND reject them at runtime by qualified name. @@ -1932,7 +2078,7 @@ async def stream_agent_loop( _t1 = time.time() if _relevant_tools: logger.info(f"[tool-rag] Using caller-provided relevant_tools ({len(_relevant_tools)} tools)") - if not guide_only and not _relevant_tools and bool(_intent.get("low_signal")): + if not guide_only and not _relevant_tools and _low_signal_turn: from src.tool_index import ALWAYS_AVAILABLE if workspace: # An active workspace IS the file-work signal: a vague "look at the @@ -2023,6 +2169,15 @@ async def stream_agent_loop( if _relevant_tools is not None and active_document is not None: _relevant_tools.update({"edit_document", "update_document", "suggest_document"}) + # Per-request UI toggles are stronger than retrieval. If the user turns on + # Search, the model must see the search tools even when the latest text is a + # typo or otherwise low-signal for tool RAG. + if not guide_only and forced_tools: + if _relevant_tools is None: + from src.tool_index import ALWAYS_AVAILABLE + _relevant_tools = set(ALWAYS_AVAILABLE) + _relevant_tools.update(t for t in forced_tools if t not in disabled_tools) + # The skill index injected by _build_system_prompt tells the model to # call `manage_skills action=view`, and Jaccard-matched skills are pasted # into the prompt as procedures to follow — but neither path goes through @@ -2030,7 +2185,7 @@ async def stream_agent_loop( # (grep, read_file, ...) that aren't in its schema list. Keep the schemas # in lockstep: manage_skills is callable whenever any skill is indexed, # and a matched skill's declared requires_toolsets ride along with it. - if not guide_only and _relevant_tools is not None: + if not guide_only and _relevant_tools is not None and not _low_signal_turn: try: from services.memory.skills import SkillsManager from src.constants import DATA_DIR @@ -2147,6 +2302,7 @@ async def stream_agent_loop( compact=_compact_agent_prompt, owner=owner, suppress_local_context=guide_only, + suppress_skills=_low_signal_turn, active_email=active_email, ) if plan_mode and not guide_only: @@ -2753,6 +2909,15 @@ async def stream_agent_loop( _intent_nudge_count += 1 _matched_phrase = _intent_match.group(0).strip() logger.info(f"[agent] intent-without-action nudge #{_intent_nudge_count} on round {round_num}: {_matched_phrase!r}") + _lower_phrase = _matched_phrase.lower() + _cookbook_log_hint = "" + if any(_word in _lower_phrase for _word in ("log", "logs", "output", "tail", "status")): + _cookbook_log_hint = ( + " If this is about a Cookbook/model serve, the concrete calls are: " + "`list_served_models` first, then `tail_serve_output` with the " + "session_id from the serve/list result. Never answer with " + "\"check logs\" when those tools are available." + ) messages.append({ "role": "system", "content": ( @@ -2761,6 +2926,7 @@ async def stream_agent_loop( "see you announced the action but didn't run it, which " "is the most frustrating thing you can do. " "DO IT NOW: emit the actual function call this turn. " + f"{_cookbook_log_hint}" "If you decided not to do it after all, say so plainly in " "one sentence instead of restating the plan." ), diff --git a/src/agent_tools/web_tools.py b/src/agent_tools/web_tools.py index 87a4b697f..f1410e18e 100644 --- a/src/agent_tools/web_tools.py +++ b/src/agent_tools/web_tools.py @@ -7,6 +7,7 @@ from src.constants import MAX_OUTPUT_CHARS class WebSearchTool: async def execute(self, content: str, ctx: dict) -> dict: from src.search import comprehensive_web_search + progress_cb = ctx.get("progress_cb") if isinstance(ctx, dict) else None raw = content.strip() query = raw time_filter = None @@ -37,18 +38,39 @@ class WebSearchTool: elif " news" in q_lc or q_lc.startswith("news ") or q_lc.endswith(" news"): time_filter = "week" loop = asyncio.get_running_loop() - text, sources = await asyncio.wait_for( - loop.run_in_executor( - None, - lambda: comprehensive_web_search( - query, - max_pages=max_pages, - time_filter=time_filter, - return_sources=True, + if progress_cb: + await progress_cb({ + "elapsed_s": 0, + "tail": f"Searching web for: {query[:160]}", + }) + try: + text, sources = await asyncio.wait_for( + loop.run_in_executor( + None, + lambda: comprehensive_web_search( + query, + max_pages=max_pages, + time_filter=time_filter, + return_sources=True, + ), ), - ), - timeout=30, - ) + timeout=30, + ) + except asyncio.TimeoutError: + return { + "error": f"web_search timed out after 30s: {query[:200]}", + "exit_code": 1, + } + except Exception as e: + return { + "error": f"web_search failed: {type(e).__name__}: {str(e) or 'no details'}", + "exit_code": 1, + } + if progress_cb: + await progress_cb({ + "elapsed_s": 30, + "tail": "Search completed; preparing sources.", + }) output = text[:MAX_OUTPUT_CHARS] if len(text) > MAX_OUTPUT_CHARS else text if sources: output += "\n\n" diff --git a/src/builtin_actions.py b/src/builtin_actions.py index a598cb652..bf4ddd950 100644 --- a/src/builtin_actions.py +++ b/src/builtin_actions.py @@ -76,8 +76,7 @@ async def action_consolidate_memory(owner: str, **kwargs) -> Tuple[str, bool]: import json import re from src.constants import DATA_DIR - from src.endpoint_resolver import resolve_endpoint - from src.llm_core import llm_call_async + from src.llm_core import llm_call_async_with_fallback from src.memory import MemoryManager manager = MemoryManager(DATA_DIR) @@ -116,10 +115,9 @@ async def action_consolidate_memory(owner: str, **kwargs) -> Tuple[str, bool]: if len(group_memories) < 2: return False - url, model, headers = resolve_endpoint("utility", owner=group_owner or None) - if not url or not model: - url, model, headers = resolve_endpoint("default", owner=group_owner or None) - if not url or not model: + from src.task_endpoint import resolve_task_candidates + candidates = resolve_task_candidates(owner=group_owner or None) + if not candidates: return False try: @@ -147,13 +145,11 @@ async def action_consolidate_memory(owner: str, **kwargs) -> Tuple[str, bool]: "\"drop\":[{\"id\":\"existing id\",\"reason\":\"short reason\"}]}\n\n" f"MEMORIES:\n{json.dumps(items, ensure_ascii=False)}" ) - raw = await llm_call_async( - url=url, - model=model, + raw = await llm_call_async_with_fallback( + candidates, messages=[{"role": "user", "content": prompt}], temperature=0.0, max_tokens=4096, - headers=headers, timeout=120, ) from src.text_helpers import strip_think @@ -604,8 +600,7 @@ async def action_classify_events(owner: str, **kwargs) -> Tuple[str, bool]: try: from datetime import timedelta from core.database import SessionLocal, CalendarEvent - from src.endpoint_resolver import resolve_endpoint - from src.llm_core import llm_call_async + from src.llm_core import llm_call_async_with_fallback import re as _re, json as _json db = SessionLocal() @@ -620,10 +615,9 @@ async def action_classify_events(owner: str, **kwargs) -> Tuple[str, bool]: if not events: return "No upcoming events to classify", True - llm_url, llm_model, llm_headers = resolve_endpoint("utility", owner=owner) - if not llm_url: - llm_url, llm_model, llm_headers = resolve_endpoint("default", owner=owner) - llm_available = bool(llm_url and llm_model) + from src.task_endpoint import resolve_task_candidates + llm_candidates = resolve_task_candidates(owner=owner) + llm_available = bool(llm_candidates) # Pull user memories so the LLM has personal context (relationships, # job, hobbies). Helps it know e.g. " is your spouse" so their @@ -699,11 +693,11 @@ async def action_classify_events(owner: str, **kwargs) -> Tuple[str, bool]: f"EVENTS: {_json.dumps(items)}" ) try: - raw = await llm_call_async( - url=llm_url, model=llm_model, + raw = await llm_call_async_with_fallback( + llm_candidates, messages=[{"role": "user", "content": prompt}], temperature=0.1, max_tokens=16384, - headers=llm_headers, timeout=180, + timeout=180, ) from src.text_helpers import strip_think as _st raw = _st(raw or "", prose=False, prompt_echo=False) @@ -810,8 +804,7 @@ async def action_learn_sender_signatures(owner: str, **kwargs) -> Tuple[str, boo import asyncio as _aio from datetime import datetime as _dt, timedelta as _td from routes.email_helpers import _email_cache_owner_clause, _imap_connect, SCHEDULED_DB - from src.endpoint_resolver import resolve_endpoint - from src.llm_core import llm_call_async + from src.llm_core import llm_call_async_with_fallback # 1. Pull recent UIDs + From headers cheaply (header-only fetch). def _pull_headers(): @@ -891,11 +884,11 @@ async def action_learn_sender_signatures(owner: str, **kwargs) -> Tuple[str, boo if not eligible: return "All sender sigs already cached (or no eligible senders)", True - url, model, headers = resolve_endpoint("utility", owner=owner) - if not url or not model: - url, model, headers = resolve_endpoint("default", owner=owner) - if not url or not model: + from src.task_endpoint import resolve_task_candidates + candidates = resolve_task_candidates(owner=owner) + if not candidates: return "No LLM endpoint available", False + model = candidates[0][1] analyzed = 0 no_sig = 0 @@ -949,11 +942,11 @@ async def action_learn_sender_signatures(owner: str, **kwargs) -> Tuple[str, boo ) try: - raw = await llm_call_async( - url=url, model=model, + raw = await llm_call_async_with_fallback( + candidates, messages=[{"role": "user", "content": prompt}], temperature=0.0, max_tokens=600, - headers=headers, timeout=60, + timeout=60, ) from src.text_helpers import strip_think as _st sig = _st(raw or "", prose=False, prompt_echo=False).strip() @@ -1137,7 +1130,6 @@ async def action_test_skills(owner: str, **kwargs) -> Tuple[str, bool]: from services.memory.skills import SkillsManager from src.constants import DATA_DIR from routes.skills_routes import _run_skill_test_once, _skill_test_task - from src.endpoint_resolver import resolve_endpoint # #3 SCOPE GUARD: refuse to run on a None/empty owner — otherwise # `sm.load(owner=None)` returns every user's skills and we'd cross- @@ -1152,27 +1144,40 @@ async def action_test_skills(owner: str, **kwargs) -> Tuple[str, bool]: if not names: raise TaskNoop("no skills to test") - url, model, headers = resolve_endpoint("default", owner=owner) - if not url or not model: + from src.task_endpoint import resolve_task_candidates + candidates = resolve_task_candidates(owner=owner) + if not candidates: return "No Default/Utility model configured — set one in Settings.", False # #2 NO SILENT MODEL SWAP: if the configured model isn't served by the # endpoint, try a basename match — but fail loudly instead of grabbing # `avail[0]` which could be an embedding-only model and produce 36 # garbage transcripts → 36 'unknown' verdicts with no hint why. + url, model, headers = candidates[0] try: from src.llm_core import list_model_ids - avail = list_model_ids(url, headers=headers) - if avail and model not in avail: - import os as _os - base = _os.path.basename((model or "").rstrip("/")) - m = next((a for a in avail if _os.path.basename(a.rstrip("/")) == base), None) - if m: - model = m - else: - return (f"Default model '{model}' not served by endpoint {url}. " - f"Available: {', '.join(avail[:8])}{'…' if len(avail) > 8 else ''}. " - "Set a valid Default model in Settings."), False + import os as _os + + selected = None + mismatch_notes = [] + for cand_url, cand_model, cand_headers in candidates: + avail = list_model_ids(cand_url, headers=cand_headers) + if not avail or cand_model in avail: + selected = (cand_url, cand_model, cand_headers) + break + base = _os.path.basename((cand_model or "").rstrip("/")) + matched = next((a for a in avail if _os.path.basename(a.rstrip("/")) == base), None) + if matched: + selected = (cand_url, matched, cand_headers) + break + mismatch_notes.append( + f"{cand_model} not served by {cand_url}; available: " + f"{', '.join(avail[:8])}{'...' if len(avail) > 8 else ''}" + ) + if selected: + url, model, headers = selected + elif mismatch_notes: + return "No configured task fallback model is served. " + " | ".join(mismatch_notes[:3]), False except Exception as _e: logger.warning(f"test_skills model resolve check failed (continuing): {_e}") @@ -1483,7 +1488,6 @@ async def action_check_email_urgency(owner: str, **kwargs) -> Tuple[str, bool]: from pathlib import Path as _P from core.database import SessionLocal as _SL, EmailAccount as _EA from routes.email_helpers import _imap_connect, _decode_header - from src.endpoint_resolver import resolve_endpoint, resolve_utility_fallback_candidates from src.llm_core import llm_call_async_with_fallback # Per-owner state file so multi-user runs don't clobber each other's @@ -1505,12 +1509,10 @@ async def action_check_email_urgency(owner: str, **kwargs) -> Tuple[str, bool]: # ── 1. Resolve LLM candidates (utility primary + utility fallbacks; fall # through to default chat as a last resort). - url, model, headers = resolve_endpoint("utility", owner=owner) - if not url or not model: - url, model, headers = resolve_endpoint("default", owner=owner) - if not url or not model: + from src.task_endpoint import resolve_task_candidates + candidates = resolve_task_candidates(owner=owner) + if not candidates: return "No LLM endpoint available", False - candidates = [(url, model, headers)] + resolve_utility_fallback_candidates(owner=owner) # ── 2. Enumerate enabled accounts. Match this task's owner AND fall # back to the legacy "unowned account whose imap_user / from_address diff --git a/src/endpoint_resolver.py b/src/endpoint_resolver.py index 34d451d9c..ac5a6b7ad 100644 --- a/src/endpoint_resolver.py +++ b/src/endpoint_resolver.py @@ -396,6 +396,9 @@ def resolve_utility_fallback_candidates(owner: Optional[str] = None) -> list: settings = load_settings() utility_ep = (get_user_setting("utility_endpoint_id", owner or "", settings.get("utility_endpoint_id", "")) or "").strip() if not utility_ep: + utility_chain = get_user_setting("utility_model_fallbacks", owner or "", settings.get("utility_model_fallbacks") or []) or [] + if utility_chain: + return _resolve_fallback_candidates("utility_model_fallbacks", owner=owner) return _resolve_fallback_candidates("default_model_fallbacks", owner=owner) except Exception: pass diff --git a/src/llm_core.py b/src/llm_core.py index 1338ef91a..ba567ed81 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -2130,6 +2130,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl yield _stream_delta_event(reasoning, thinking=True) content = delta.get("content") or "" if content: + content = re.sub(r"]*)?>", r"", content, flags=re.IGNORECASE) + content = re.sub(r"", "", content, flags=re.IGNORECASE) stripped = content.lstrip() # gpt-oss harmony format (<|channel|>analysis/final): route via the harmony # stream router. Sticky once the first marker appears — distinct from the diff --git a/src/task_endpoint.py b/src/task_endpoint.py index 6e477a3ec..6f9a27c09 100644 --- a/src/task_endpoint.py +++ b/src/task_endpoint.py @@ -1,6 +1,11 @@ -"""Shared resolver for background-task AI endpoint (auto-naming, memory, sorting).""" +"""Shared resolver for background-task AI endpoints.""" -from src.endpoint_resolver import resolve_endpoint +from src.endpoint_resolver import ( + resolve_chat_fallback_candidates, + resolve_endpoint, + resolve_utility_fallback_candidates, +) +from src.llm_core import llm_call_async_with_fallback def resolve_task_endpoint(fallback_url=None, fallback_model=None, fallback_headers=None, owner=None): @@ -11,3 +16,60 @@ def resolve_task_endpoint(fallback_url=None, fallback_model=None, fallback_heade endpoint cannot be resolved. """ return resolve_endpoint("task", fallback_url, fallback_model, fallback_headers, owner=owner) + + +def resolve_task_candidates( + fallback_url=None, + fallback_model=None, + fallback_headers=None, + owner=None, +): + """Return ordered background-task LLM candidates. + + Order: + 1. configured Background Tasks endpoint/model, or caller fallback + 2. Utility endpoint/model + 3. Default endpoint/model + 4. Utility fallback chain + 5. Default fallback chain + """ + candidates = [] + + def _append(url, model, headers): + if not url or not model: + return + key = (url, model) + if any((u, m) == key for u, m, _ in candidates): + return + candidates.append((url, model, headers or {})) + + _append(*resolve_task_endpoint(fallback_url, fallback_model, fallback_headers, owner=owner)) + _append(*resolve_endpoint("utility", owner=owner)) + _append(*resolve_endpoint("default", owner=owner)) + for url, model, headers in resolve_utility_fallback_candidates(owner=owner): + _append(url, model, headers) + for url, model, headers in resolve_chat_fallback_candidates(owner=owner): + _append(url, model, headers) + + return candidates + + +async def task_llm_call_async( + messages, + *, + fallback_url=None, + fallback_model=None, + fallback_headers=None, + owner=None, + **kwargs, +): + """Call the shared background-task LLM candidate chain.""" + candidates = resolve_task_candidates( + fallback_url=fallback_url, + fallback_model=fallback_model, + fallback_headers=fallback_headers, + owner=owner, + ) + if not candidates: + raise RuntimeError("No LLM endpoint available for background task") + return await llm_call_async_with_fallback(candidates, messages=messages, **kwargs) diff --git a/src/task_scheduler.py b/src/task_scheduler.py index 6c8ab148a..b9ff51b6b 100644 --- a/src/task_scheduler.py +++ b/src/task_scheduler.py @@ -833,6 +833,14 @@ class TaskScheduler: owner=task.owner, body=run.result if output == "notification" else None, ) + elif run.status == "error": + self.add_notification( + task.name, + "error", + task_id, + owner=task.owner, + body=run.error or run.result, + ) # Log result to the assistant chat so all task activity is visible. # Skip skipped/error rows — user shouldn't see "skipped: …" noise @@ -1406,12 +1414,18 @@ class TaskScheduler: ) except Exception as e: logger.warning(f"Agent loop failed for task '{task.name}', falling back to simple call: {e}") - from src.llm_core import llm_call_async + from src.task_endpoint import task_llm_call_async messages = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": task.prompt}, ] - result = await llm_call_async(url=endpoint_url, model=model, messages=messages, timeout=120) + result = await task_llm_call_async( + messages, + fallback_url=endpoint_url, + fallback_model=model, + owner=task.owner, + timeout=120, + ) # Strip the model's chain-of-thought before saving/delivering. Task # output is LLM-only, so prose=True (which also removes untagged @@ -1636,13 +1650,17 @@ class TaskScheduler: # Honor per-task max_steps (defense against runaway agent loops). # Falls back to 20 if not set — the historical default. _task_max_rounds = task.max_steps if task.max_steps and task.max_steps > 0 else 20 - # Tasks are background workloads — they share the Utility model's - # fallback chain (Settings → Utility Model → Fallbacks). A downed - # primary endpoint won't silently yield `(no output)` — same recipe - # chat uses but with the utility list (`utility_model_fallbacks`). + # Tasks are background workloads: use the shared task fallback chain + # behind the primary endpoint so a downed primary won't silently yield + # `(no output)`. try: - from src.endpoint_resolver import resolve_utility_fallback_candidates - _task_fallbacks = resolve_utility_fallback_candidates(owner=task.owner or None) + from src.task_endpoint import resolve_task_candidates + _task_fallbacks = resolve_task_candidates( + fallback_url=endpoint_url, + fallback_model=model, + fallback_headers=headers, + owner=task.owner or None, + )[1:] except Exception: _task_fallbacks = [] async for event_str in stream_agent_loop( @@ -1679,21 +1697,22 @@ class TaskScheduler: # asking it to summarize what it did. Guarantees output. if not full_text.strip(): try: - from src.llm_core import llm_call_async_with_fallback - from src.endpoint_resolver import resolve_utility_fallback_candidates + from src.task_endpoint import task_llm_call_async grace_context = "You ran out of steps. " if tool_results: grace_context += "Here's what your tools returned:\n" + "\n".join(tool_results[-5:]) else: grace_context += "No tool results were captured." grace_context += "\n\nSummarize what you accomplished and what's still pending. Be concise." - _grace_candidates = [(endpoint_url, model, headers)] + resolve_utility_fallback_candidates(owner=task.owner or None) - full_text = await llm_call_async_with_fallback( - _grace_candidates, + full_text = await task_llm_call_async( messages=[ {"role": "system", "content": system_content}, {"role": "user", "content": grace_context}, ], + fallback_url=endpoint_url, + fallback_model=model, + fallback_headers=headers, + owner=task.owner or None, timeout=30, ) full_text = (full_text or "").strip() diff --git a/src/tool_implementations.py b/src/tool_implementations.py index ae7246ec6..f1ac33007 100644 --- a/src/tool_implementations.py +++ b/src/tool_implementations.py @@ -1119,8 +1119,8 @@ async def do_manage_settings(content: str, owner: Optional[str] = None) -> Dict: _ALIASES = { "shell": ["bash"], "terminal": ["bash"], - "search": ["web_search"], - "web": ["web_search"], + "search": ["web_search", "web_fetch"], + "web": ["web_search", "web_fetch"], "browser": ["builtin_browser"], "documents": ["create_document", "edit_document", "update_document", "suggest_document"], "doc": ["create_document", "edit_document", "update_document", "suggest_document"], @@ -1132,7 +1132,7 @@ async def do_manage_settings(content: str, owner: Optional[str] = None) -> Dict: "notes": ["manage_notes"], "calendar": ["manage_calendar"], "email": ["mcp__email__list_emails", "mcp__email__read_email", "mcp__email__send_email"], - "research": ["web_search"], # research is a per-request flag, not a tool — closest analog + "research": ["web_search", "web_fetch"], # research is a per-request flag, not a tool — closest analog } if action == "list_tools": @@ -2714,13 +2714,25 @@ async def do_serve_model(content: str, owner: Optional[str] = None) -> Dict: endpoint_added=endpoint_added, endpoint_id=endpoint_id or "", ) note = "" if registered else " (state-write failed — task may not show in UI)" + where = host or "local" + log_path = f"/tmp/odysseus-tmux/{sid}.log" return { - "output": f"Serving {repo_id} (session: {sid}){note}", + "output": ( + f"Serving {repo_id} on {where} (session: {sid}){note}\n" + f"Next required check: call list_served_models. If this task is not ready, " + f"call tail_serve_output with session_id={sid} and tail=400 before answering. " + f"Do not tell the user to check logs; you have the log tool." + ), "session_id": sid, "task_type": "serve", "phase": "running", "host": host, "endpoint_id": endpoint_id, + "log_path": log_path, + "next_tools": [ + {"name": "list_served_models", "arguments": {}}, + {"name": "tail_serve_output", "arguments": {"session_id": sid, "tail": 400}}, + ], "exit_code": 0, } # FastAPI HTTPException puts the message under `detail`, not `error`. @@ -3057,8 +3069,17 @@ async def do_tail_serve_output(content: str, owner: Optional[str] = None) -> Dic MAX_CHARS = 8000 if len(output_text) > MAX_CHARS: output_text = "…(earlier output truncated)…\n" + output_text[-MAX_CHARS:] + if not output_text: + output_text = ( + f"No log output captured yet for {session_id} on {host_label}. " + "This usually means the tmux wrapper has started but the model process " + "has not printed anything yet. Do not stop here: call list_served_models " + "again to check whether it is still loading, ready, or crashed; if it is " + "still not ready, call tail_serve_output again with a larger tail after " + "the next status check." + ) return { - "output": output_text or "(empty pane)", + "output": output_text, "session_id": session_id, "host": host_label, "tail_lines": tail, diff --git a/static/icons/ollama-mark-crop.png b/static/icons/ollama-mark-crop.png new file mode 100644 index 000000000..2554b38a1 Binary files /dev/null and b/static/icons/ollama-mark-crop.png differ diff --git a/static/icons/ollama-mark.png b/static/icons/ollama-mark.png new file mode 100644 index 000000000..8cd2cf1ed Binary files /dev/null and b/static/icons/ollama-mark.png differ diff --git a/static/icons/sglang-logo.png b/static/icons/sglang-logo.png new file mode 100644 index 000000000..c13afe571 Binary files /dev/null and b/static/icons/sglang-logo.png differ diff --git a/static/icons/sglang-mark.png b/static/icons/sglang-mark.png new file mode 100644 index 000000000..3e0fe3eda Binary files /dev/null and b/static/icons/sglang-mark.png differ diff --git a/static/index.html b/static/index.html index 89b0ebb34..e0e84bd0b 100644 --- a/static/index.html +++ b/static/index.html @@ -879,7 +879,7 @@ Library
@@ -1005,7 +1005,12 @@ `; // /#cookbook-dl-tab-fold-body (whole Download card body) @@ -2884,6 +3072,7 @@ const shared = { _getPort, _sshPrefix, _serverByVal, + _serverKey, _selectedServer, _getPlatform, _isWindows, diff --git a/static/js/cookbookRunning.js b/static/js/cookbookRunning.js index a390e11ad..7672edfd2 100644 --- a/static/js/cookbookRunning.js +++ b/static/js/cookbookRunning.js @@ -27,6 +27,9 @@ function _statusLabel(status, type) { // "cookbook-task-status" ('' = the neutral loading style). function _taskBadge(task) { if (task._unreachable && task.status === 'running') return { text: 'unreachable', cls: 'cookbook-task-error' }; + if (task.type === 'download' && task.status === 'running') { + return { text: _statusLabel(task.status, task.type), cls: 'cookbook-task-downloading' }; + } if (task.type === 'serve' && task.status === 'running' && task.progress) { // Same green "running" pill — just with dynamic phase text, so it doesn't // read as a different status while the server is coming up. @@ -52,13 +55,13 @@ function _downloadOutputLooksActive(task) { function _canClearTask(task) { if (!task || task.status === 'running') return false; - if (task.type === 'serve' && (task.status === 'ready' || task._serveReady)) return false; + if (task.type === 'serve' && (task.status === 'ready' || (task._serveReady && !['stopped', 'error', 'crashed', 'failed', 'completed'].includes(task.status)))) return false; // If the tmux output still shows an in-flight download, the task isn't // actually finished — hide the clear/check pill so it doesn't show on a // task that's still doing work. (The next render will reflect this and // ideally the self-heal flips status back to running.) if (_downloadOutputLooksActive(task)) return false; - return ['done', 'stopped', 'error', 'crashed', 'failed'].includes(task.status); + return ['done', 'completed', 'stopped', 'error', 'crashed', 'failed'].includes(task.status); } function _clearPillLabel(task) { @@ -66,6 +69,13 @@ function _clearPillLabel(task) { return 'clear'; } +function _venvRootFromPath(path) { + let p = (path || '').toString().trim().replace(/\/+$/, ''); + if (!p) return ''; + p = p.replace(/\/bin\/(?:activate|python(?:3(?:\.\d+)?)?|vllm|pip(?:3)?)$/i, ''); + return p; +} + // A pip dependency/driver install (payload._dep) reports success with the // runner's "=== Process exited with code 0 ===" sentinel and pip's // "Successfully installed" line — never the HuggingFace download markers @@ -263,6 +273,7 @@ let _copyText; let _persistEnvState; let _refreshDependencies; let _serverByVal; +let _serverKey; let _selectedServer; let modelLogo; let esc; @@ -688,8 +699,10 @@ export function _saveTasks(tasks) { export function _addTask(sessionId, name, type, payload) { let tasks = _loadTasks(); const remoteHost = (payload && payload.remote_host) || _envState.remoteHost || ''; - const sshPort = (payload && payload.ssh_port) || _getPort(remoteHost) || ''; - const platform = (payload && payload.platform) || _getPlatform(remoteHost) || ''; + const remoteServerKey = (payload && payload.remote_server_key) || ''; + const remoteServerName = (payload && payload.remote_server_name) || ''; + const sshPort = (payload && payload.ssh_port) || _getPort(remoteServerKey || remoteHost) || ''; + const platform = (payload && payload.platform) || _getPlatform(remoteServerKey || remoteHost) || ''; // Serving a model supersedes its finished download — clear the matching // finished download card (covers serving directly from the Serve tab, not just // via the download card's "Serve →" button). @@ -704,7 +717,7 @@ export function _addTask(sessionId, name, type, payload) { return !(key && t.type === 'download' && t.status === 'queued' && _downloadDedupeKey(t) === key); }); } - const task = _stripTaskSecrets({ id: sessionId, sessionId, name, type, status: 'running', output: '', ts: Date.now(), payload: payload || null, remoteHost, sshPort, platform }); + const task = _stripTaskSecrets({ id: sessionId, sessionId, name, type, status: 'running', output: '', ts: Date.now(), payload: payload || null, remoteHost, remoteServerKey, remoteServerName, sshPort, platform }); tasks.push(task); _saveTasks(tasks); // New action → collapse all other cards, leave only this one open. @@ -1520,14 +1533,18 @@ function _parseServeCmdToFields(cmd) { return fields; } -export async function _launchServeTask(shortName, repo, cmd, fields, hostOverride) { +export async function _launchServeTask(shortName, repo, cmd, fields, hostOverride, targetMeta = null) { // Host resolution mirrors the download path: when the caller passes an explicit // host (resolved from the dropdown the user actually picked), use it and look // up that server's port/platform from the shared servers list. Only fall back // to _envState.remoteHost for legacy callers (diagnosis/pip-update). const _host = (hostOverride !== undefined) ? (hostOverride || '') : (_envState.remoteHost || ''); - const _hsrv = _serverByVal(_envState.remoteServerKey || _host) + const _targetKey = targetMeta?.serverKey || ''; + const _hsrv = (_targetKey && _targetKey !== 'local' ? _serverByVal(_targetKey) : null) + || (hostOverride === undefined ? _serverByVal(_envState.remoteServerKey || _host) : null) || _envState.servers.find(s => s.host === _host) || {}; + const _serverMetaKey = _targetKey || (_hsrv && _serverKey ? _serverKey(_hsrv) : '') || (_host || 'local'); + const _serverMetaName = targetMeta?.serverName || _hsrv.name || (_host ? _host : 'Local'); const _hplatform = _host ? (_hsrv.platform || '') : (_envState.platform || ''); // Replace any serve already targeting this same host:port — you can't run two @@ -1572,7 +1589,7 @@ export async function _launchServeTask(shortName, repo, cmd, fields, hostOverrid } } else { if (_envState.env === 'venv' && _envState.envPath) { - const p = _envState.envPath; + const p = _venvRootFromPath(_envState.envPath); envPrefix = 'source ' + (p.endsWith('/bin/activate') ? p : p + '/bin/activate'); } else if (_envState.env === 'conda' && _envState.envPath) { envPrefix = 'eval "$(conda shell.bash hook)" && conda activate ' + _envState.envPath; @@ -1583,7 +1600,7 @@ export async function _launchServeTask(shortName, repo, cmd, fields, hostOverrid repo_id: repo, cmd: cmd, remote_host: _host || undefined, - ssh_port: _getPort(_host) || undefined, + ssh_port: _getPort(_serverMetaKey || _host) || undefined, env_prefix: envPrefix || undefined, hf_token: _envState.hfToken || undefined, gpus: _envState.gpus || undefined, @@ -1607,11 +1624,11 @@ export async function _launchServeTask(shortName, repo, cmd, fields, hostOverrid return; } - const _sp = _getPort(_host); + const _sp = _getPort(_serverMetaKey || _host); // _fields = the exact structured serve-form values used for this launch, // so the "Edit / relaunch" button can re-open the Serve panel pre-filled // with these precise settings (not just the last-used-for-repo state). - const payload = { repo_id: repo, remote_host: _host || undefined, ssh_port: _sp || undefined, _cmd: cmd, _fields: fields || undefined, _env: _usedEnv, _envPath: _usedEnvPath, _gpus: _usedGpus }; + const payload = { repo_id: repo, remote_host: _host || undefined, remote_server_key: _serverMetaKey || undefined, remote_server_name: _serverMetaName || undefined, ssh_port: _sp || undefined, _cmd: cmd, _fields: fields || undefined, _env: _usedEnv, _envPath: _usedEnvPath, _gpus: _usedGpus }; _addTask(data.session_id, shortName, 'serve', payload); uiModule.showToast(`Serving ${shortName}...`); // Auto-register may have enabled an existing (offline) endpoint for this @@ -1760,16 +1777,25 @@ export function _renderRunningTab() { } // Group tasks by server - const _serverName = (host) => { - if (!host) return 'Local'; - const srv = _serverByVal(_envState.remoteServerKey || host) - || _envState.servers.find(s => s.host === host); - return srv?.name || host; + const _taskServerKey = (task) => task?.remoteServerKey || task?.remoteHost || ''; + const _serverName = (keyOrTask) => { + if (keyOrTask && typeof keyOrTask === 'object') { + const task = keyOrTask; + if (task.remoteServerName) return task.remoteServerName; + const srv = task.remoteServerKey ? _serverByVal(task.remoteServerKey) : null; + if (srv?.name) return srv.name; + if (!task.remoteHost) return 'Local'; + return (_envState.servers.find(s => s.host === task.remoteHost)?.name) || task.remoteHost; + } + const key = keyOrTask || ''; + if (!key || key === 'local') return 'Local'; + const srv = _serverByVal(key); + return srv?.name || key; }; const serverGroups = {}; for (const t of tasks) { - const key = t.remoteHost || ''; - if (!serverGroups[key]) serverGroups[key] = { name: _serverName(key), serve: [], download: [] }; + const key = _taskServerKey(t); + if (!serverGroups[key]) serverGroups[key] = { name: _serverName(t), serve: [], download: [] }; serverGroups[key][t.type === 'serve' ? 'serve' : 'download'].push(t); } @@ -1816,12 +1842,12 @@ export function _renderRunningTab() { e.stopPropagation(); // don't toggle the section collapse (was an inline onclick, blocked by CSP) const host = btn.dataset.clearServer; const allTasks = _loadTasks(); - const toRemove = allTasks.filter(t => (t.remoteHost || '') === host && _canClearTask(t)); + const toRemove = allTasks.filter(t => _taskServerKey(t) === host && _canClearTask(t)); // Bail with a clear message instead of silently doing nothing when // every task on this server is still running (nothing finished to // clear yet) — the previous behavior looked like the button was dead. if (!toRemove.length) { - const stillRunning = allTasks.filter(t => (t.remoteHost || '') === host && t.status === 'running').length; + const stillRunning = allTasks.filter(t => _taskServerKey(t) === host && t.status === 'running').length; const _msg = stillRunning ? `No finished tasks on ${_serverName(host)} — ${stillRunning} still running. Stop them first to clear.` : `No finished tasks on ${_serverName(host)}.`; @@ -1830,7 +1856,7 @@ export function _renderRunningTab() { return; } if (!await window.styledConfirm(`Clear ${toRemove.length} finished task${toRemove.length === 1 ? '' : 's'} on ${_serverName(host)}?`, { confirmText: 'Clear' })) return; - const remaining = allTasks.filter(t => (t.remoteHost || '') !== host || !_canClearTask(t)); + const remaining = allTasks.filter(t => _taskServerKey(t) !== host || !_canClearTask(t)); _saveTasks(remaining); // Fade/slide each finished card out (same exit as the per-card clear) // instead of yanking them instantly. @@ -1864,7 +1890,7 @@ export function _renderRunningTab() { btn.addEventListener('click', async (e) => { e.stopPropagation(); // don't toggle the section collapse const host = btn.dataset.stopServer; - const running = _loadTasks().filter(t => (t.remoteHost || '') === host && t.status === 'running'); + const running = _loadTasks().filter(t => _taskServerKey(t) === host && t.status === 'running'); if (!running.length) { uiModule.showToast(`Nothing running on ${_serverName(host)}`); return; } if (!await window.styledConfirm(`Stop ${running.length} running task${running.length > 1 ? 's' : ''} on ${_serverName(host)}?`, { confirmText: 'Stop all' })) return; // Mark every task as user-stopped BEFORE firing the kills so that the @@ -2177,9 +2203,6 @@ export function _renderRunningTab() { if (task.status !== 'running' && task.status !== 'queued') { items.push({ group: 'run', label: 'Reconnect tmux', action: 'reconnect' }); } - if (task.status === 'running') { - items.push({ group: 'run', label: 'Stop', action: 'stop', danger: true }); - } items.push({ group: 'run', label: 'Restart', action: 'retry' }); // ── Edit section ──────────────────────────────────────────── // Merged "Edit & relaunch" — opens the structured serve panel @@ -2539,7 +2562,7 @@ export function _renderRunningTab() { }); // Route to the right server section body - const serverBodyId = `server-body-${(task.remoteHost || 'local').replace(/[^a-zA-Z0-9-]/g, '_')}`; + const serverBodyId = `server-body-${(_taskServerKey(task) || 'local').replace(/[^a-zA-Z0-9-]/g, '_')}`; const targetBody = document.getElementById(serverBodyId); if (targetBody) targetBody.appendChild(el); else group.appendChild(el); @@ -3393,7 +3416,8 @@ function _refreshServerDots() { let tasks; try { tasks = _loadTasks(); } catch { return; } const byKey = {}; - for (const t of tasks) { (byKey[t.remoteHost || ''] = byKey[t.remoteHost || ''] || []).push(t); } + const _taskServerKeyForDot = (task) => task?.remoteServerKey || task?.remoteHost || ''; + for (const t of tasks) { (byKey[_taskServerKeyForDot(t)] = byKey[_taskServerKeyForDot(t)] || []).push(t); } document.querySelectorAll('.cookbook-section-header').forEach(header => { const dot = header.querySelector('.cookbook-srv-status'); if (!dot) return; @@ -3798,6 +3822,7 @@ export function initRunning(shared) { _persistEnvState = shared._persistEnvState; _refreshDependencies = shared._refreshDependencies; _serverByVal = shared._serverByVal; + _serverKey = shared._serverKey; _selectedServer = shared._selectedServer; modelLogo = shared.modelLogo; esc = shared.esc; diff --git a/static/js/cookbookServe.js b/static/js/cookbookServe.js index aba3f7926..da48507f7 100644 --- a/static/js/cookbookServe.js +++ b/static/js/cookbookServe.js @@ -18,6 +18,7 @@ let _sshCmd; let _getPort; let _sshPrefix; let _serverByVal; +let _serverKey; let _getPlatform; let _isWindows; let _isMetal; @@ -41,9 +42,40 @@ let _nextAvailablePort; // Storage keys const SERVE_STATE_KEY = 'cookbook-serve-state'; +const SERVE_FAVORITES_KEY = 'cookbook-serve-favorite-models'; let _cachedAllModels = []; +function _loadServeFavorites() { + try { + const raw = JSON.parse(localStorage.getItem(SERVE_FAVORITES_KEY) || '[]'); + return new Set(Array.isArray(raw) ? raw.filter(Boolean).map(String) : []); + } catch { + return new Set(); + } +} + +function _saveServeFavorites(favorites) { + try { + localStorage.setItem(SERVE_FAVORITES_KEY, JSON.stringify(Array.from(favorites || []))); + } catch {} +} + +function _isServeFavorite(repo) { + return _loadServeFavorites().has(String(repo || '')); +} + +function _toggleServeFavorite(repo) { + const key = String(repo || ''); + if (!key) return false; + const favorites = _loadServeFavorites(); + const next = !favorites.has(key); + if (next) favorites.add(key); + else favorites.delete(key); + _saveServeFavorites(favorites); + return next; +} + function _repoLooksAwqLike(model, repo) { const q = String(model?.quant || '').toUpperCase(); const n = `${repo || ''} ${model?.repo_id || ''} ${model?.name || ''} ${model?.path || ''}`.toLowerCase(); @@ -53,7 +85,9 @@ function _repoLooksAwqLike(model, repo) { function _repoLooksGgufLike(model, repo) { const q = String(model?.quant || '').toUpperCase(); const n = `${repo || ''} ${model?.repo_id || ''} ${model?.name || ''} ${model?.path || ''}`.toLowerCase(); - return !!model?.is_gguf || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || n.includes('gguf'); + const hasGgufFile = Array.isArray(model?.gguf_files) + && model.gguf_files.some(f => f && typeof f.rel_path === 'string' && /\.gguf$/i.test(f.rel_path)); + return !!model?.is_gguf || hasGgufFile || /^Q[2-8]/.test(q) || /^IQ/.test(q) || q === 'GGUF' || n.includes('gguf'); } function _serveBackendWarning(model, repo, backend, fields = {}) { @@ -96,6 +130,352 @@ function _allGpuIds(count) { return Array.from({ length: Math.floor(n) }, (_, i) => String(i)).join(','); } +function _shellSplitForPreview(cmd) { + const s = String(cmd || ''); + const out = []; + let cur = ''; + let quote = ''; + let escNext = false; + for (const ch of s) { + if (escNext) { + cur += ch; + escNext = false; + continue; + } + if (ch === '\\') { + cur += ch; + escNext = true; + continue; + } + if (quote) { + cur += ch; + if (ch === quote) quote = ''; + continue; + } + if (ch === '"' || ch === "'") { + quote = ch; + cur += ch; + continue; + } + if (/\s/.test(ch)) { + if (cur) { + out.push(cur); + cur = ''; + } + continue; + } + cur += ch; + } + if (cur) out.push(cur); + return out; +} + +function _formatServeCmdPreview(cmd) { + const raw = String(cmd || ''); + if (raw.startsWith('MODEL_FILE=$({')) { + const marker = /&&\s+([A-Za-z_][A-Za-z0-9_]*=\S+\s+)*(?:[A-Za-z_][A-Za-z0-9_]*=\S+\s+)?(?:llama-server|python3?\s+-m\s+llama_cpp\.server)\b/; + const match = raw.match(marker); + if (match && match.index > 0) { + const prelude = raw.slice(0, match.index).replace(/\s+/g, ' ').trim(); + const rest = raw.slice(match.index).replace(/^\s*&&\s*/, ''); + return `${prelude}\n&&\n${_formatServeCmdPreview(rest)}`; + } + } + const tokens = _shellSplitForPreview(cmd); + if (tokens.length <= 4) return String(cmd || ''); + const lines = []; + let i = 0; + while (i < tokens.length && /^[A-Za-z_][A-Za-z0-9_]*=/.test(tokens[i])) { + lines.push(tokens[i]); + i++; + } + if (tokens[i]) { + const head = [tokens[i++]]; + if (tokens[i] && !tokens[i].startsWith('--') && !/^[A-Za-z_][A-Za-z0-9_]*=/.test(tokens[i])) head.push(tokens[i++]); + if (tokens[i] && !tokens[i].startsWith('--') && !/^[A-Za-z_][A-Za-z0-9_]*=/.test(tokens[i])) head.push(tokens[i++]); + lines.push(head.join(' ')); + } + while (i < tokens.length) { + const t = tokens[i++]; + if (t.startsWith('--')) { + const vals = []; + while (i < tokens.length && !tokens[i].startsWith('--') && !/^[A-Za-z_][A-Za-z0-9_]*=/.test(tokens[i])) { + vals.push(tokens[i++]); + } + lines.push([t, ...vals].join(' ')); + } else { + lines.push(t); + } + } + return lines.join('\n'); +} + +function _normalizeServeCmdForLaunch(cmd) { + return String(cmd || '') + .replace(/MODEL_FILE=\$\(\{\s+/g, 'MODEL_FILE=$({ ') + .replace(/\s+\}\s+\|\s+head\s+-1\)/g, ' } | head -1)') + .replace(/\s*;\s*/g, '; ') + .replace(/\s*\|\|\s*/g, ' __ODY_OR__ ') + .replace(/\s*\|\s*/g, ' | ') + .replace(/\s+__ODY_OR__\s+/g, ' || ') + .replace(/\s+/g, ' ') + .trim(); +} + +function _modelSizeGb(model, explicitGb = 0) { + const explicit = Number(explicitGb || 0); + if (Number.isFinite(explicit) && explicit > 0) return explicit; + const bytes = Number(model?.size_bytes || 0); + if (Number.isFinite(bytes) && bytes > 0) return bytes / (1024 ** 3); + const gb = Number( + model?.size_gb + || model?.required_gb + || model?.vram_needed + || model?.min_vram_gb + || model?.recommended_ram_gb + || model?.min_ram_gb + || 0 + ); + if (Number.isFinite(gb) && gb > 0) return gb; + if (_isMiniMaxM3Model(model)) return 240; + return 0; +} + +function _parseParamsB(text) { + const s = String(text || ''); + const m = s.match(/(\d+(?:\.\d+)?)\s*([bBmMtT])\b/); + if (!m) return 0; + const n = parseFloat(m[1]); + if (!Number.isFinite(n) || n <= 0) return 0; + const unit = m[2].toLowerCase(); + if (unit === 't') return n * 1000; + if (unit === 'b') return n; + if (unit === 'm') return n / 1000; + return 0; +} + +function _knownModelContextMax(model) { + if (_isMiniMaxM3Model(model)) return 1048576; + return 0; +} + +function _modelIdentityText(model) { + return [ + model?.repo_id, + model?.quant_repo, + model?.name, + model?.id, + model?.path, + model?.model_path, + model?.served_model_name, + model?.quant, + model?.format, + ].filter(Boolean).join(' ').toLowerCase(); +} + +function _isMiniMaxM3Model(model) { + const name = _modelIdentityText(model); + return ( + (/minimax/.test(name) && /\bm3\b/.test(name)) + || /minimax-m3/.test(name) + || /models--cyankiwi--minimax-m3-awq-int4/.test(name) + || /cyankiwi\/minimax-m3-awq-int4/.test(name) + ); +} + +function _isMiniMaxM2Model(model) { + const name = _modelIdentityText(model); + return /minimax/.test(name) && /\bm2(?:\.\d+)?\b/.test(name); +} + +function _modelContextMaxForServe(model, explicitMax) { + const explicit = Number(explicitMax || 0); + if (Number.isFinite(explicit) && explicit > 0) return explicit; + const known = _knownModelContextMax(model); + if (known > 0) return known; + for (const key of ['context_length', 'max_position_embeddings', 'n_ctx_train', 'model_max_length', 'max_seq_len']) { + const value = Number(model?.[key] || 0); + if (Number.isFinite(value) && value > 0) return value; + } + const catalogCtx = Number(model?.context || 0); + if (Number.isFinite(catalogCtx) && catalogCtx > 0) return catalogCtx; + return 131072; +} + +function _estimateVllmContextFit(model, fields, modelCtxMax, modelWeightsGb = 0, fitSystem = null) { + const sys = fitSystem || _hwfitCache?.system || {}; + const isMiniMaxM3 = _isMiniMaxM3Model(model); + const gpuIds = String(fields.gpus || '').split(',').map(s => parseInt(s.trim(), 10)).filter(Number.isFinite); + const tp = Math.max(1, parseInt(fields.tp, 10) || gpuIds.length || 1); + const selectedCount = Math.max(1, gpuIds.length || tp); + const groups = Array.isArray(sys.gpu_groups) ? sys.gpu_groups : []; + const activeGroup = sys.active_group || groups[0] || null; + const perGpuGb = Number(activeGroup?.vram_each) + || (Number(sys.gpu_vram_gb) / Math.max(1, Number(sys.gpu_count) || selectedCount)) + || 0; + if (!perGpuGb) { + return { needsHardwareScan: true, reason: 'scan hardware first to estimate context from VRAM' }; + } + + const gpuUtil = Math.min(0.99, Math.max(0.1, parseFloat(fields.gpu_mem) || 0.90)); + const budgetGb = perGpuGb * selectedCount * gpuUtil; + const modelGb = _modelSizeGb(model, modelWeightsGb); + if (!modelGb) return { needsModelSize: true, reason: 'model weight size unknown; scan model files or enter context manually' }; + const modelMax = Math.max(1024, _modelContextMaxForServe(model, modelCtxMax)); + + if (isMiniMaxM3) { + const perGpuBudgetGb = perGpuGb * gpuUtil; + const modelShardGb = modelGb / Math.max(1, tp); + const fixedOverheadGb = Math.max(1.5, perGpuBudgetGb * 0.035); + const freeForKv = perGpuBudgetGb - modelShardGb - fixedOverheadGb; + const kvGbPerToken = (29.25 / 1048576) * (String(fields.vllm_kv_cache_dtype || '').toLowerCase() === 'fp8' ? 1 : 1.8); + if (freeForKv <= 0) { + return { + ctx: 1024, + budgetGb, + modelGb, + kvGbPerToken, + reason: `model shard ${modelShardGb.toFixed(1)}G exceeds per-GPU usable ${perGpuBudgetGb.toFixed(1)}G before KV`, + }; + } + const raw = Math.floor((freeForKv / kvGbPerToken) * 0.99); + const rounded = Math.max(1024, Math.floor(raw / 128) * 128); + const ctx = Math.min(modelMax, rounded); + return { + ctx, + budgetGb, + modelGb, + kvGbPerToken, + reason: `~${ctx.toLocaleString()} tokens fits per-GPU KV (${freeForKv.toFixed(1)}G free)`, + }; + } + + const name = `${model?.repo_id || ''} ${model?.name || ''} ${model?.quant || ''}`; + const lower = name.toLowerCase(); + const isMoE = /\bmoe\b|a\d+b|minimax|deepseek|mixtral|kimi-k2|glm-4\.5/.test(lower); + const totalParams = _parseParamsB(name) || Math.max(1, modelGb / 0.58); + const activeFromName = (() => { + const m = lower.match(/\ba(\d+(?:\.\d+)?)b\b/); + return m ? parseFloat(m[1]) : 0; + })(); + const activeParams = activeFromName || (isMoE ? Math.min(totalParams, 32) : totalParams); + const effectiveActiveParams = (/minimax/.test(lower) && /\bm3\b/.test(lower)) ? 23 : activeParams; + const kvDtype = String(fields.vllm_kv_cache_dtype || '').toLowerCase(); + const kvFactor = kvDtype === 'fp8' ? 0.55 : 1; + const kvGbPerTokenTotal = Math.max(0.00002, 0.000008 * effectiveActiveParams * kvFactor); + const kvGbPerToken = kvGbPerTokenTotal / Math.max(1, tp); + const perGpuBudgetGb = perGpuGb * gpuUtil; + const modelShardGb = modelGb / Math.max(1, tp); + const fixedOverheadGb = Math.max(1.5, perGpuBudgetGb * 0.035); + const freeForKv = perGpuBudgetGb - modelShardGb - fixedOverheadGb; + if (freeForKv <= 0) { + return { + ctx: 1024, + budgetGb, + modelGb, + kvGbPerToken, + reason: `model shard ${modelShardGb.toFixed(1)}G exceeds per-GPU usable ${perGpuBudgetGb.toFixed(1)}G before KV`, + }; + } + const raw = Math.floor(freeForKv / kvGbPerToken); + const rounded = Math.max(1024, Math.floor(raw / 1024) * 1024); + const ctx = Math.min(modelMax, rounded); + return { + ctx, + budgetGb, + modelGb, + kvGbPerToken, + reason: `~${ctx.toLocaleString()} tokens fits per-GPU KV (${freeForKv.toFixed(1)}G free)`, + }; +} + +function _estimateLlamaContextFit(model, fields, modelCtxMax, modelWeightsGb = 0, fitSystem = null, profileData = null) { + const profiles = Array.isArray(profileData?.profiles) ? profileData.profiles : []; + const preferred = profiles.find(p => String(p?.key || '').toLowerCase() === 'balanced') + || profiles.find(p => Number(p?.ctx) > 0) + || null; + const modelMax = Math.max(1024, _modelContextMaxForServe(model, modelCtxMax)); + if (preferred && Number(preferred.ctx) > 0) { + const ctx = Math.min(modelMax, Number(preferred.ctx)); + return { + ctx, + reason: `profile ${preferred.label || preferred.key || 'fit'} fits scanned hardware`, + }; + } + + const sys = fitSystem || _hwfitCache?.system || {}; + const modelGb = _modelSizeGb(model, modelWeightsGb); + const backend = String(fields.backend || '').toLowerCase(); + const llamaMode = String(fields.llama_mode || '').toLowerCase(); + const isCpuMode = backend === 'llamacpp' && llamaMode === 'cpu'; + const isUnifiedMode = backend === 'llamacpp' && (llamaMode === 'unified' || fields.unified_mem); + if (!modelGb) { + return { + ctx: Math.min(modelMax, 32768), + needsModelSize: true, + reason: 'model weight size unknown; using model limit fallback', + }; + } + + if (isCpuMode) { + return { + ctx: Math.min(modelMax, 131072), + modelGb, + reason: 'CPU mode uses system RAM; capped to trained limit', + }; + } + + const gpuIds = String(fields.gpus || '').split(',').map(s => parseInt(s.trim(), 10)).filter(Number.isFinite); + const selectedCount = Math.max(1, gpuIds.length || parseInt(fields.tp, 10) || 1); + const groups = Array.isArray(sys.gpu_groups) ? sys.gpu_groups : []; + const activeGroup = sys.active_group || groups[0] || null; + const totalVramGb = Number(activeGroup?.vram_each) + ? Number(activeGroup.vram_each) * selectedCount + : (Number(sys.gpu_vram_gb) || 0); + if (!totalVramGb) { + return { + ctx: Math.min(modelMax, 32768), + modelGb, + needsHardwareScan: true, + reason: 'scan hardware first; using model limit fallback', + }; + } + + const totalRamGb = Number(sys.total_ram_gb) || 0; + const availableRamGb = Number(sys.available_ram_gb) || 0; + const unifiedPoolGb = isUnifiedMode + ? Math.max( + totalVramGb, + availableRamGb, + totalRamGb > 0 ? totalRamGb * 0.85 : 0 + ) + : totalVramGb; + const usableGb = isUnifiedMode + ? Math.max(1, unifiedPoolGb - Math.max(2.0, unifiedPoolGb * 0.08)) + : Math.max(1, totalVramGb - Math.max(1.0, selectedCount * 0.6)); + const freeForKv = usableGb - modelGb; + const kv = String(fields.cache_type || '').toLowerCase(); + const kvFactor = kv === 'q4_0' ? 0.55 : (kv === 'q8_0' ? 1 : (kv === 'f16' ? 1.9 : 1)); + const kvGbPerToken = Math.max(0.00008, (modelGb / 7.5) * 0.0007 * kvFactor); + if (freeForKv <= 0) { + return { + ctx: Math.min(modelMax, 8192), + modelGb, + kvGbPerToken, + reason: `model ${modelGb.toFixed(1)}G exceeds usable ${isUnifiedMode ? 'unified memory' : 'VRAM'} ${usableGb.toFixed(1)}G before KV`, + }; + } + const raw = Math.floor(freeForKv / kvGbPerToken); + const rounded = Math.max(1024, Math.floor(raw / 1024) * 1024); + const ctx = Math.min(modelMax, rounded); + return { + ctx, + modelGb, + kvGbPerToken, + reason: `~${ctx.toLocaleString()} tokens fits llama.cpp KV (${freeForKv.toFixed(1)}G free ${isUnifiedMode ? 'unified' : 'VRAM'})`, + }; +} + function _selectedServeTarget(panel) { const select = document.getElementById('hwfit-server-select') || document.getElementById('hwfit-dl-server'); const servers = Array.isArray(_envState.servers) ? _envState.servers : []; @@ -117,6 +497,8 @@ function _selectedServeTarget(panel) { : (server?.name || 'local server'); return { host, + serverKey: server ? (_serverKey?.(server) || '') : (select?.value || ''), + serverName: server?.name || '', port: host ? (_getPort(host) || server?.port || '') : '', venv, platform: server?.platform || _envState.platform || '', @@ -152,6 +534,9 @@ function _runtimeNoteText(backend, pkg, target) { const label = labels[backend] || backend; if (!pkg) return `${label} readiness unavailable for ${target.label}.`; const note = pkg.status_note || pkg.update_note || ''; + if (pkg.installed === null || pkg.probe_error) { + return note ? `${label} readiness unavailable for ${target.label}: ${note}` : `${label} readiness unavailable for ${target.label}.`; + } if (pkg.installed) { return note ? `${label} ready on ${target.label}: ${note}` : `${label} ready on ${target.label}.`; } @@ -226,6 +611,13 @@ function _runnableGgufFiles(model) { return primary.length ? primary : files; } +function _selectedGgufSizeGb(model, relPath) { + const file = _runnableGgufFiles(model).find(f => f.rel_path === relPath); + const bytes = Number(file?.size_bytes || 0); + if (!Number.isFinite(bytes) || bytes <= 0) return 0; + return bytes / (1024 ** 3); +} + function _ggufFileLabel(file) { const base = (file.name || file.rel_path || '').split('/').pop(); const size = _formatGgufSize(file.size_bytes); @@ -281,6 +673,12 @@ function _rerenderCachedModels() { else if (sortVal === 'size-desc') allModels.sort((a, b) => _parseSize(b.size) - _parseSize(a.size)); else if (sortVal === 'size-asc') allModels.sort((a, b) => _parseSize(a.size) - _parseSize(b.size)); else if (sortVal === 'recent') allModels.sort((a, b) => (b.mtime || 0) - (a.mtime || 0)); + const favorites = _loadServeFavorites(); + allModels.sort((a, b) => { + const af = favorites.has(String(a.repo_id || '')) ? 1 : 0; + const bf = favorites.has(String(b.repo_id || '')) ? 1 : 0; + return bf - af; + }); let html = ''; let visibleCount = 0; @@ -303,8 +701,9 @@ function _rerenderCachedModels() { // living on the same line as the model name. const _isDownloading = m.status === 'downloading'; const _isDlActive = _isDownloading ? _isActivelyDownloading(m.repo_id) : false; + const _isFavorite = favorites.has(String(m.repo_id || '')); const isSelectMode = document.getElementById('hwfit-cache-select')?.classList.contains('active'); - html += `
`; + html += `
`; html += ``; html += `
`; const _mc = modelColor(m.repo_id) || ''; @@ -314,7 +713,8 @@ function _rerenderCachedModels() { const _downloadingPill = _isDownloading ? ` ${_isDlActive ? 'downloading' : 'stalled'}` : ''; - html += `
${modelLogo(m.repo_id)}${esc(shortName)}${hfLink ? ` HF ↗` : ''}${_runningPill}${_downloadingPill}
`; + const _favoritePill = _isFavorite ? ' pinned' : ''; + html += `
${modelLogo(m.repo_id)}${esc(shortName)}${_favoritePill}${hfLink ? ` HF ↗` : ''}${_runningPill}${_downloadingPill}
`; html += `
${metaParts.join(' \u00b7 ')}
`; html += `
`; const _bk = _detectBackend(m).backend; @@ -397,7 +797,12 @@ function _rerenderCachedModels() { const _deleteIco = ''; const _selectIco = ''; const _schedIco = ''; + const _favNow = _isServeFavorite(repo); + const _favIco = _favNow + ? '' + : ''; const items = []; + items.push({ label: _favNow ? 'Unfavorite' : 'Favorite', icon: _favIco, action: 'favorite' }); if (m && m.status === 'ready') items.push({ label: 'Serve', icon: _serveIco, action: 'serve' }); if (m && m.status === 'downloading') items.push({ label: 'Retry', icon: _retryIco, action: 'retry' }); if (m && m.status === 'ready') items.push({ label: 'Schedule…', icon: _schedIco, action: 'schedule' }); @@ -410,6 +815,11 @@ function _rerenderCachedModels() { div.addEventListener('click', () => { closeDropdown(); if (opt.action === 'serve') item.click(); + else if (opt.action === 'favorite') { + const favored = _toggleServeFavorite(repo); + uiModule.showToast(favored ? 'Favorited — pinned to top' : 'Unfavorited'); + _rerenderCachedModels(); + } else if (opt.action === 'delete') _deleteCachedModel(repo, item, false, m); else if (opt.action === 'retry') _retryCachedModel(repo, m); else if (opt.action === 'schedule') { @@ -532,16 +942,22 @@ function _rerenderCachedModels() { const ss = (_byRepo[repo] && typeof _byRepo[repo] === 'object') ? _byRepo[repo] : (_lastUsed || (_isLegacyFlat ? _allSs : {})); + const _modelSs = (_byRepo[repo] && typeof _byRepo[repo] === 'object') ? _byRepo[repo] : null; + const _repoForcedBackend = !!(_modelSs && _modelSs._forceBackend); + const _isMiniMaxM3 = _isMiniMaxM3Model({ ...m, repo_id: repo }); + const _isMiniMaxM2 = _isMiniMaxM2Model({ ...m, repo_id: repo }); + const _isMiniMaxMSeries = _isMiniMaxM3 || _isMiniMaxM2; + const svm = (k, def) => (_modelSs && _hasOwn(_modelSs, k)) ? _modelSs[k] : def; const detectedBackend = _detectBackend(m).backend; const _allowedBackends = new Set(_isWindows() ? ['llamacpp', 'diffusers'] : (_isMetal() ? ['llamacpp', 'ollama'] : ['vllm', 'sglang', 'llamacpp', 'ollama', 'diffusers'])); - const defaultBackend = (ss._forceBackend && ss.backend && _allowedBackends.has(ss.backend)) + const defaultBackend = (_repoForcedBackend && ss.backend && _allowedBackends.has(ss.backend)) ? ss.backend : detectedBackend; - const savedMatchesBackend = !!ss._forceBackend || (ss.backend || 'vllm') === detectedBackend; + const savedMatchesBackend = _repoForcedBackend || (ss.backend || 'vllm') === detectedBackend; const sv = (k, def) => (ss[k] !== undefined && savedMatchesBackend) ? ss[k] : def; - const defaultTp = defaultBackend === 'llamacpp' ? '1' : sv('tp', '1'); + const defaultTp = defaultBackend === 'llamacpp' ? '1' : sv('tp', _isMiniMaxMSeries ? '8' : '1'); const detectedGpuIds = _allGpuIds(_getGpuToggleTotal?.()); const defaultGpus = defaultBackend === 'llamacpp' ? '0' @@ -555,7 +971,7 @@ function _rerenderCachedModels() { // OOMs. _detectModelOptimizations seeds opts.kvCacheDtype for // those families; honour it unless the user has a saved override. const _kvOptsCheck = _detectModelOptimizations(repo); - const _kvAutoDefault = (_kvOptsCheck && _kvOptsCheck.kvCacheDtype) || 'auto'; + const _kvAutoDefault = (_kvOptsCheck && _kvOptsCheck.kvCacheDtype) || (_isMiniMaxMSeries ? 'fp8' : 'auto'); const _kvSelected = sv('vllm_kv_cache_dtype', _kvAutoDefault); const vllmKvCacheOpts = ['auto','fp8'].map(d => ``).join(''); const _l = (name, tip) => `${name}?`; @@ -567,6 +983,11 @@ function _rerenderCachedModels() { const _ggufOptions = _ggufChoices.map(f => `` ).join(''); + const _minimaxM3Snapshot = '/home/pewds/.cache/huggingface/hub/models--cyankiwi--MiniMax-M3-AWQ-INT4/snapshots/4082acbbec1236d21828d55b6bb0fe02ade4ab5b'; + const _defaultServeModel = _isMiniMaxM3 ? _minimaxM3Snapshot : (m.is_local_dir && m.path ? `${m.path}/${repo}` : repo); + const _savedModelPath = String(svm('model_path', _defaultServeModel) || '').trim(); + const _modelPathValue = _isMiniMaxM3 && (!_savedModelPath || _savedModelPath === repo) ? _minimaxM3Snapshot : _savedModelPath; + const _defaultServedModelName = _isMiniMaxM3 ? repo : ''; // Build save slots const _allPresets = _loadPresets(); const _repoShort = repo.split('/').pop(); @@ -583,15 +1004,10 @@ function _rerenderCachedModels() { const _arrowTitle = _modelPresets.length > 0 ? `${_modelPresets.length} saved launch config${_modelPresets.length === 1 ? '' : 's'} for ${_repoShort} — click ▾ to load or delete` : `No saved launch configs for ${_repoShort} yet — click Save to add one`; - // Wrap the Save split in a
+