Merge branch 'main' into dev

Bring main's maintainer-curated work (cookbook scheduler, calendar rendering/sync, settings polish, agent debug loop) into dev so dev is a superset of main (resolves the dev/main drift, #2543).
2026-06-16 01:35:36 -04:00 · 2026-06-05 10:50:51 +02:00
parent b19e5693af e0e250d023
commit 367858a587
33 changed files with 3291 additions and 245 deletions
@@ -19,6 +19,8 @@ from src.auth_helpers import require_user
 from src.tool_implementations import do_manage_notes


+COOKBOOK_READ_SCOPES = {"cookbook:read", "cookbook:launch"}
+COOKBOOK_LAUNCH_SCOPES = {"cookbook:launch"}
 TODO_READ_SCOPES = {"todos:read", "todos:write"}
 TODO_WRITE_SCOPES = {"todos:write"}
 EMAIL_READ_SCOPES = {"email:read", "email:draft", "email:send"}
@@ -130,6 +132,11 @@ def setup_codex_routes(
                    "actions": ["library", "read", "create", "delete"],
                    "available": documents_library_endpoint is not None,
                },
+                "cookbook": {
+                    "read": scoped(COOKBOOK_READ_SCOPES),
+                    "launch": scoped(COOKBOOK_LAUNCH_SCOPES),
+                    "actions": ["tasks", "servers", "output", "serve", "stop"],
+                },
            },
            "safety": {
                "email_send_requires_confirmation": True,
@@ -373,6 +380,374 @@ def setup_codex_routes(
            raise HTTPException(400, f"Invalid document payload: {exc}")
        return await _as_owner(request, owner, documents_create_endpoint, request, req)

+    # ── Cookbook surface ──
+    # Lets the agent run the same launch / monitor / kill loop the user
+    # would do by hand in the Cookbook UI: read the current task list +
+    # tmux output, launch a serve task, stop one.  Two scopes:
+    #   cookbook:read   — list tasks + tail output + list servers
+    #   cookbook:launch — also start/stop serves (host shell exec)
+    # `cookbook:launch` is genuinely powerful: /api/model/serve runs SSH'd
+    # commands on the user's hosts. The existing _validate_serve_cmd
+    # allowlist (vllm/python3/sglang/llama-server/etc., no shell metachars)
+    # keeps the agent inside the same sandbox the UI uses.
+
+    async def _run_shell(cmd: str, timeout: float = 15.0) -> dict:
+        """Run a shell command, return {exit_code, stdout, stderr}."""
+        import asyncio as _asyncio
+        try:
+            proc = await _asyncio.create_subprocess_shell(
+                cmd,
+                stdout=_asyncio.subprocess.PIPE,
+                stderr=_asyncio.subprocess.PIPE,
+            )
+            try:
+                stdout_b, stderr_b = await _asyncio.wait_for(proc.communicate(), timeout=timeout)
+            except _asyncio.TimeoutError:
+                proc.kill()
+                return {"exit_code": -1, "stdout": "", "stderr": "timed out"}
+            return {
+                "exit_code": proc.returncode,
+                "stdout": stdout_b.decode(errors="replace"),
+                "stderr": stderr_b.decode(errors="replace"),
+            }
+        except Exception as exc:
+            return {"exit_code": -1, "stdout": "", "stderr": str(exc)}
+
+    def _read_cookbook_state() -> dict:
+        from pathlib import Path as _Path
+        import os as _os, json as _json
+        p = _Path(_os.environ.get("DATA_DIR", "data")) / "cookbook_state.json"
+        if not p.exists():
+            return {}
+        try:
+            return _json.loads(p.read_text(encoding="utf-8"))
+        except Exception:
+            return {}
+
+    def _redact_task(t: dict) -> dict:
+        """Strip secrets before returning to the agent."""
+        clean = {k: v for k, v in t.items() if k not in ("hf_token", "_secrets")}
+        if isinstance(clean.get("payload"), dict):
+            pl = clean["payload"]
+            clean["payload"] = {k: v for k, v in pl.items()
+                                if k not in ("hf_token", "_secrets")}
+        return clean
+
+    @router.get("/cookbook/tasks")
+    async def codex_cookbook_tasks(request: Request):
+        _scope_owner(request, COOKBOOK_READ_SCOPES)
+        state = _read_cookbook_state()
+        tasks = state.get("tasks") or []
+        return {"tasks": [_redact_task(t) for t in tasks]}
+
+    @router.get("/cookbook/servers")
+    async def codex_cookbook_servers(request: Request):
+        _scope_owner(request, COOKBOOK_READ_SCOPES)
+        state = _read_cookbook_state()
+        servers = state.get("env", {}).get("servers") or []
+        # Strip ssh creds / passwords; keep only what's needed to pick a host.
+        cleaned = []
+        for s in servers:
+            cleaned.append({
+                "name": s.get("name"),
+                "host": s.get("host"),
+                "port": s.get("port"),
+                "env": s.get("env"),
+                "envPath": s.get("envPath"),
+                "platform": s.get("platform"),
+                "modelDirs": s.get("modelDirs"),
+            })
+        return {"servers": cleaned}
+
+    @router.get("/cookbook/output/{session_id}")
+    async def codex_cookbook_output(request: Request, session_id: str, tail: int = 400):
+        _scope_owner(request, COOKBOOK_READ_SCOPES)
+        # Defensive: session_id must be the tmux-style id we issue
+        # (`serve-XXXX` / `cookbook-XXXX` / `queue-XXXX`); anything else
+        # would let the agent run arbitrary `tmux capture-pane` targets.
+        import re as _re
+        if not _re.fullmatch(r"[a-zA-Z0-9_-]+", session_id):
+            raise HTTPException(400, "Invalid session id")
+        tail = max(20, min(int(tail or 400), 4000))
+        # Resolve the task's host (if any) from cookbook state so we can
+        # ssh to the right box, exactly as the UI does in _reconnectTask.
+        state = _read_cookbook_state()
+        tasks = state.get("tasks") or []
+        task = next((t for t in tasks if t.get("sessionId") == session_id), None)
+        if task is None:
+            raise HTTPException(404, "task not found")
+        host = (task.get("remoteHost") or "").strip()
+        ssh_port = (task.get("sshPort") or "").strip()
+        # Prefer the persisted log file over the tmux pane. The pane gets
+        # overwritten by the post-crash neofetch banner + bash prompt the
+        # moment vllm exits; the log file is the raw stdout/stderr and
+        # survives unchanged. Falls back to pane for older tasks predating
+        # the tee-to-log runner change.
+        log_path = f"/tmp/odysseus-tmux/{session_id}.log"
+        inner = (
+            f"if [ -s {log_path} ]; then tail -n {tail} {log_path}; "
+            f"else tmux capture-pane -t {session_id} -p -S -{tail}; fi"
+        )
+        if host:
+            port_flag = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else ""
+            import shlex
+            cmd = f"ssh {port_flag}{host} {shlex.quote(inner)}"
+        else:
+            cmd = inner
+        result = await _run_shell(cmd, timeout=15)
+        return {
+            "session_id": session_id,
+            "host": host or "local",
+            "exit_code": result.get("exit_code"),
+            "output": result.get("stdout", ""),
+            "task": _redact_task(task),
+        }
+
+    @router.post("/cookbook/serve")
+    async def codex_cookbook_serve(request: Request, body: dict[str, Any] = Body(default_factory=dict)):
+        _scope_owner(request, COOKBOOK_LAUNCH_SCOPES)
+        # Wraps /api/model/serve with the SAME validation the UI uses.
+        # _validate_serve_cmd (called inside model_serve) rejects shell
+        # metachars and requires the leading binary to be in the
+        # cookbook allowlist (vllm / python3 / sglang / llama-server / ...).
+        from routes.cookbook_helpers import ServeRequest
+        # Accept friendly aliases agents naturally reach for. Without these,
+        # passing `host` silently maps to nothing and the serve runs LOCAL
+        # instead of on the intended remote — exactly the bug an agent
+        # would never debug on its own.
+        norm = dict(body or {})
+        if "host" in norm and "remote_host" not in norm:
+            norm["remote_host"] = norm.pop("host")
+        if "model" in norm and "repo_id" not in norm:
+            norm["repo_id"] = norm.pop("model")
+        if "ssh_port" not in norm and "port" in norm and (str(norm.get("port") or "").isdigit() and int(norm["port"]) >= 1000):
+            # Heuristic: if `port` looks like an SSH port (≥1000) and there's
+            # no explicit ssh_port, treat it as such. UI ports (8000, 8001,
+            # 30000) belong inside the cmd string, not here.
+            pass  # leave as-is — user's `port` here is ambiguous; skip remap.
+        try:
+            req = ServeRequest(**norm)
+        except Exception as exc:
+            raise HTTPException(400, f"Invalid serve payload: {exc}")
+        serve_endpoint = _find_endpoint(None, "POST", "/api/model/serve")
+        # Fall back to importing from the cookbook router registered on app.
+        if serve_endpoint is None:
+            from fastapi import FastAPI
+            app: FastAPI = request.app
+            for route in app.routes:
+                if getattr(route, "path", None) == "/api/model/serve" and "POST" in getattr(route, "methods", set()):
+                    serve_endpoint = route.endpoint
+                    break
+        if serve_endpoint is None:
+            raise HTTPException(503, "model serve endpoint unavailable")
+        return await serve_endpoint(request, req)
+
+    @router.post("/cookbook/stop/{session_id}")
+    async def codex_cookbook_stop(request: Request, session_id: str):
+        _scope_owner(request, COOKBOOK_LAUNCH_SCOPES)
+        import re as _re
+        if not _re.fullmatch(r"[a-zA-Z0-9_-]+", session_id):
+            raise HTTPException(400, "Invalid session id")
+        state = _read_cookbook_state()
+        tasks = state.get("tasks") or []
+        task = next((t for t in tasks if t.get("sessionId") == session_id), None)
+        host = ((task or {}).get("remoteHost") or "").strip()
+        ssh_port = ((task or {}).get("sshPort") or "").strip()
+        if host:
+            port_flag = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else ""
+            cmd = f"ssh {port_flag}{host} \"tmux kill-session -t {session_id}\""
+        else:
+            cmd = f"tmux kill-session -t {session_id}"
+        result = await _run_shell(cmd, timeout=10)
+        return {"session_id": session_id, "exit_code": result.get("exit_code"), "host": host or "local"}
+
+    @router.get("/cookbook/cached")
+    async def codex_cookbook_cached(request: Request, host: str | None = None):
+        """List cached models on a configured server (or local if host is omitted).
+        Mirrors `list_cached_models` from the chat agent so external agents have
+        the same inventory view before deciding what to serve/download."""
+        _scope_owner(request, COOKBOOK_READ_SCOPES)
+        # Hit /api/model/cached internally, with the same modelDirs the chat
+        # agent's list_cached_models would resolve from cookbook state.
+        state = _read_cookbook_state()
+        env = state.get("env") if isinstance(state, dict) else {}
+        servers = (env.get("servers") if isinstance(env, dict) else None) or []
+        HF_DEFAULTS = {"~/.cache/huggingface/hub", "~/.cache/huggingface"}
+        def _dirs_for(srv: dict) -> str:
+            mds = srv.get("modelDirs") if isinstance(srv, dict) else None
+            if isinstance(mds, list):
+                extras = [d for d in mds if isinstance(d, str) and d.strip() and d.strip() not in HF_DEFAULTS]
+                return ",".join(extras)
+            if isinstance(mds, str) and mds.strip() not in HF_DEFAULTS:
+                return mds
+            return ""
+        # Resolve friendly host name → real host (matches list_cached_models flow).
+        resolved_host = host or ""
+        srv: dict[str, Any] = {}
+        if host:
+            srv = next(
+                (s for s in servers if isinstance(s, dict)
+                 and (s.get("name") == host or s.get("host") == host)),
+                {},
+            )
+            if srv and srv.get("host"):
+                resolved_host = srv["host"]
+        else:
+            srv = next((s for s in servers if isinstance(s, dict) and not (s.get("host") or "").strip()), {})
+        params: dict[str, str] = {}
+        if resolved_host:
+            params["host"] = resolved_host
+        md = _dirs_for(srv)
+        if md:
+            params["model_dir"] = md
+        if srv.get("port"):
+            params["ssh_port"] = str(srv["port"])
+        if srv.get("platform"):
+            params["platform"] = srv["platform"]
+        cached_endpoint = _find_endpoint(None, "GET", "/api/model/cached")
+        if cached_endpoint is None:
+            from fastapi import FastAPI
+            app: FastAPI = request.app
+            for route in app.routes:
+                if getattr(route, "path", None) == "/api/model/cached" and "GET" in getattr(route, "methods", set()):
+                    cached_endpoint = route.endpoint
+                    break
+        if cached_endpoint is None:
+            raise HTTPException(503, "model cached endpoint unavailable")
+        # The endpoint reads host/model_dir/ssh_port/platform as kwargs.
+        return await cached_endpoint(
+            request,
+            host=params.get("host") or None,
+            model_dir=params.get("model_dir") or None,
+            ssh_port=params.get("ssh_port") or None,
+            platform=params.get("platform") or None,
+        )
+
+    @router.get("/cookbook/presets")
+    async def codex_cookbook_presets(request: Request):
+        """List saved serve presets (model + host + port + launch cmd).
+        Counterpart to `list_serve_presets`. Use BEFORE composing a `serve`
+        body — the user's saved preset usually has the working cmd already."""
+        _scope_owner(request, COOKBOOK_READ_SCOPES)
+        state = _read_cookbook_state()
+        presets = state.get("presets") or []
+        out = []
+        for p in presets:
+            if not isinstance(p, dict):
+                continue
+            out.append({
+                "name": p.get("name"),
+                "model": p.get("model") or p.get("modelId"),
+                "host": p.get("host") or p.get("remoteHost"),
+                "port": p.get("port"),
+                "cmd": p.get("cmd"),
+            })
+        return {"presets": out, "default_host": (state.get("env") or {}).get("defaultServer", "")}
+
+    @router.post("/cookbook/preset/{name}")
+    async def codex_cookbook_serve_preset(request: Request, name: str):
+        """Launch a saved preset by name. Reuses the working cmd + host the
+        user already saved, avoiding the cmd-allowlist trial-and-error loop."""
+        _scope_owner(request, COOKBOOK_LAUNCH_SCOPES)
+        import re as _re
+        if not _re.fullmatch(r"[A-Za-z0-9 _.:@\-]+", name):
+            raise HTTPException(400, "Invalid preset name")
+        state = _read_cookbook_state()
+        presets = state.get("presets") or []
+        lname = name.lower().strip()
+        chosen = next(
+            (p for p in presets if isinstance(p, dict) and (p.get("name") or "").lower() == lname),
+            None,
+        )
+        if chosen is None:
+            chosen = next(
+                (p for p in presets if isinstance(p, dict) and lname in (p.get("name") or "").lower()),
+                None,
+            )
+        if chosen is None:
+            raise HTTPException(404, f"No preset matching {name!r}")
+        repo_id = chosen.get("model") or chosen.get("modelId") or ""
+        cmd = (chosen.get("cmd") or "").strip()
+        host = chosen.get("host") or chosen.get("remoteHost") or ""
+        if not repo_id or not cmd or cmd.startswith("(adopted"):
+            raise HTTPException(400, f"Preset {chosen.get('name')!r} has no launchable cmd "
+                                     "(adopted from external launch). Use POST /cookbook/serve "
+                                     "with the actual cmd instead.")
+        # Reuse the serve handler we already validated.
+        from routes.cookbook_helpers import ServeRequest
+        body = {"repo_id": repo_id, "cmd": cmd}
+        if host:
+            body["remote_host"] = host
+        try:
+            req = ServeRequest(**body)
+        except Exception as exc:
+            raise HTTPException(400, f"Preset payload invalid: {exc}")
+        serve_endpoint = _find_endpoint(None, "POST", "/api/model/serve")
+        if serve_endpoint is None:
+            from fastapi import FastAPI
+            app: FastAPI = request.app
+            for route in app.routes:
+                if getattr(route, "path", None) == "/api/model/serve" and "POST" in getattr(route, "methods", set()):
+                    serve_endpoint = route.endpoint
+                    break
+        if serve_endpoint is None:
+            raise HTTPException(503, "model serve endpoint unavailable")
+        return await serve_endpoint(request, req)
+
+    @router.post("/cookbook/adopt")
+    async def codex_cookbook_adopt(request: Request, body: dict[str, Any] = Body(default_factory=dict)):
+        """Adopt an existing tmux session (one started via raw ssh+tmux) into
+        cookbook tracking. Needed when serve_model rejects a cmd and the
+        agent falls back to direct ssh — without adoption the session is
+        invisible to the UI. Body: {tmux_session, model, host?, port?}."""
+        _scope_owner(request, COOKBOOK_LAUNCH_SCOPES)
+        norm = dict(body or {})
+        sess = (norm.get("tmux_session") or norm.get("session_id") or "").strip()
+        model = (norm.get("model") or norm.get("repo_id") or "").strip()
+        host = (norm.get("host") or norm.get("remote_host") or "").strip()
+        port = norm.get("port") or 8000
+        import re as _re
+        if not sess or not _re.fullmatch(r"[a-zA-Z0-9_-]+", sess):
+            raise HTTPException(400, "tmux_session required, [a-zA-Z0-9_-]+ only")
+        if not model:
+            raise HTTPException(400, "model required")
+        # Verify the tmux session exists on the target host before adopting.
+        import shlex
+        if host:
+            check = f"ssh {shlex.quote(host)} 'tmux has-session -t {shlex.quote(sess)}'"
+        else:
+            check = f"tmux has-session -t {shlex.quote(sess)}"
+        chk = await _run_shell(check, timeout=8)
+        if chk.get("exit_code") not in (0, None):
+            raise HTTPException(404, f"tmux session {sess!r} not found on {host or 'local'}")
+        # Write into cookbook_state.json.
+        import time as _t, json as _json
+        from core.atomic_io import atomic_write_json
+        from pathlib import Path as _Path
+        cookbook_state_path = _Path("/app/data/cookbook_state.json")
+        try:
+            state = _json.loads(cookbook_state_path.read_text(encoding="utf-8"))
+        except Exception:
+            state = {}
+        tasks = state.setdefault("tasks", [])
+        if any(isinstance(t, dict) and t.get("sessionId") == sess for t in tasks):
+            return {"ok": True, "already_tracked": True, "session_id": sess}
+        tasks.append({
+            "id": sess, "sessionId": sess,
+            "name": model.split("/")[-1] if "/" in model else model,
+            "type": "serve", "status": "running",
+            "output": f"Adopted externally-launched session {sess!r} on {host or 'local'}.",
+            "ts": int(_t.time() * 1000),
+            "payload": {"repo_id": model, "remote_host": host, "_cmd": "(adopted — launched outside cookbook)", "port": int(port)},
+            "remoteHost": host, "sshPort": "", "platform": "linux",
+            "_serveReady": False, "_endpointAdded": False, "_adoptedExternally": True,
+        })
+        try:
+            atomic_write_json(cookbook_state_path, state)
+        except Exception as exc:
+            raise HTTPException(500, f"state write failed: {exc}")
+        return {"ok": True, "session_id": sess, "host": host or "local"}
+
    return router


@@ -546,6 +546,13 @@ def _append_serve_preflight_exit_lines(runner_lines: list[str], *, keep_shell_op
    runner_lines.append('if [ -n "$ODYSSEUS_PREFLIGHT_EXIT" ]; then')
    runner_lines.append('  echo ""; echo "=== Process exited with code $ODYSSEUS_PREFLIGHT_EXIT ==="')
    if keep_shell_open:
+        # Decouple the post-crash interactive shell from the persistent log
+        # file. fds 3/4 were saved BEFORE the tee redirect at the top of
+        # the runner; restoring them here means the neofetch banner the
+        # user's .zshrc prints lands on the tmux pane only, not in the
+        # log file the agent's tail_serve_output reads.
+        runner_lines.append('  exec 1>&3 2>&4 3>&- 4>&- 2>/dev/null || true')
+        runner_lines.append('  sleep 0.2  # let tee child flush + exit')
        runner_lines.append('  exec "${SHELL:-/bin/bash}"')
    else:
        runner_lines.append('  exit "$ODYSSEUS_PREFLIGHT_EXIT"')
@@ -563,7 +570,11 @@ def _append_serve_exit_code_lines(
    if is_pip_install:
        runner_lines.append('if [ $ODYSSEUS_CMD_EXIT -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; fi')
    if keep_shell_open:
-        runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="; exec "${SHELL:-/bin/bash}"')
+        runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="')
+        # See preflight branch above for the rationale on restoring fds 3/4.
+        runner_lines.append('exec 1>&3 2>&4 3>&- 4>&- 2>/dev/null || true')
+        runner_lines.append('sleep 0.2  # let tee child flush + exit')
+        runner_lines.append('exec "${SHELL:-/bin/bash}"')
    else:
        runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="')
        runner_lines.append('exit "$ODYSSEUS_CMD_EXIT"')
@@ -801,6 +801,55 @@ def setup_cookbook_routes() -> APIRouter:
        finally:
            db.close()

+    def _pick_free_port_for_ollama(
+        remote: str | None, ssh_port: str | None, start_port: int, max_offset: int
+    ) -> int | None:
+        """Return the first free port in [start_port, start_port+max_offset] on
+        the target host. Used to pick a real bind for `ollama serve` so we
+        don't reattach to an external systemd ollama (or other listener) the
+        Cookbook Stop button can't kill."""
+        import socket
+        if remote:
+            # Probe over SSH. Bash's /dev/tcp gives a portable "is anything
+            # listening" check without requiring ss/netstat/nmap.
+            ssh_base = ["ssh", "-o", "ConnectTimeout=4", "-o", "StrictHostKeyChecking=no"]
+            if ssh_port and str(ssh_port) != "22":
+                if not _SSH_PORT_RE.match(str(ssh_port)):
+                    return None
+                ssh_base.extend(["-p", str(ssh_port)])
+            host_arg = remote
+            if not _REMOTE_HOST_RE.match(host_arg):
+                return None
+            probe_ports = " ".join(str(start_port + i) for i in range(max_offset + 1))
+            script = (
+                f"for p in {probe_ports}; do "
+                "if ! (exec 3<>/dev/tcp/127.0.0.1/$p) 2>/dev/null; then "
+                "echo $p; exit 0; fi; exec 3<&-; exec 3>&-; done; exit 1"
+            )
+            try:
+                import subprocess
+                r = subprocess.run(
+                    ssh_base + [host_arg, script],
+                    capture_output=True, text=True, timeout=8,
+                )
+                if r.returncode == 0:
+                    out = (r.stdout or "").strip().splitlines()
+                    if out and out[0].isdigit():
+                        return int(out[0])
+            except Exception:
+                return None
+            return None
+        # Local: just try to connect.
+        for off in range(max_offset + 1):
+            p = start_port + off
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                s.settimeout(0.25)
+                try:
+                    s.connect(("127.0.0.1", p))
+                except (ConnectionRefusedError, socket.timeout, OSError):
+                    return p
+        return None
+
    def _auto_register_llm_endpoint(req: ServeRequest, remote: str | None) -> str | None:
        """Register a freshly-served LLM as a model endpoint so it appears in the
        model picker without a manual /setup step — the text-model sibling of
@@ -815,21 +864,37 @@ def setup_cookbook_routes() -> APIRouter:
        import re
        from core.database import SessionLocal, ModelEndpoint

-        # Port: an explicit --port wins. Otherwise fall back by backend — Ollama
-        # is the only server in our generated commands that omits --port.
+        # Port: ordered fallbacks so we match whatever the user actually
+        # asked for, not a hardcoded default:
+        #   1. explicit `--port N`  (vllm / sglang / llama-server)
+        #   2. `OLLAMA_HOST=host:port`  (the way Ollama specifies its bind)
+        #   3. fallback by backend (11434 ollama / 8080 llama.cpp)
+        # Previously the OLLAMA_HOST form was silently ignored and we
+        # registered every Ollama endpoint at 11434 — even if the user
+        # set OLLAMA_HOST=0.0.0.0:11435 to avoid colliding with an
+        # existing systemd Ollama, the registered endpoint pointed at
+        # the OLD port and showed as offline.
        port_match = re.search(r'--port\s+(\d+)', req.cmd)
+        ollama_host_match = re.search(r'OLLAMA_HOST=[^\s]*?:(\d+)', req.cmd)
        if port_match:
            port = int(port_match.group(1))
+        elif ollama_host_match:
+            port = int(ollama_host_match.group(1))
        elif "ollama" in req.cmd:
            port = 11434
        else:
            port = 8080  # llama.cpp's llama-server default — the Apple Silicon path

        # Determine host (mirrors the image path: SSH alias for remote serves).
+        # For local serves while Odysseus runs inside Docker, "localhost"
+        # resolves to the container itself — useless. Use host.docker.internal
+        # which compose maps to the actual host, matching what /setup adds
+        # for Ollama by hand.
        if remote:
            host = remote.split("@")[-1] if "@" in remote else remote
        else:
-            host = "localhost"
+            from routes.model_routes import _docker_host_gateway_reachable
+            host = "host.docker.internal" if _docker_host_gateway_reachable() else "localhost"

        base_url = f"http://{host}:{port}/v1"

@@ -927,6 +992,19 @@ def setup_cookbook_routes() -> APIRouter:
        session_id = f"serve-{uuid.uuid4().hex[:8]}"
        remote = req.remote_host
        is_windows = req.platform == "windows"
+
+        # Ollama: if the user didn't pin a port, resolve the actual port we'll
+        # bind to here (before runner construction) by probing the target host.
+        # Otherwise the runner script picks one at runtime and `_auto_register`
+        # below still registers the stale 11434 default — which on a host with
+        # a systemd ollama lands on the wrong (unreachable-from-docker) service.
+        if "ollama" in req.cmd and "OLLAMA_HOST=" not in req.cmd:
+            _ollama_bind_host = "0.0.0.0" if remote else "127.0.0.1"
+            _ollama_chosen_port = _pick_free_port_for_ollama(
+                remote, req.ssh_port, start_port=11434, max_offset=10,
+            )
+            if _ollama_chosen_port:
+                req.cmd = f"OLLAMA_HOST={_ollama_bind_host}:{_ollama_chosen_port} {req.cmd}"
        # LOCAL execution on a native-Windows host never uses tmux (detached
        # process path below), regardless of the UI-supplied platform.
        local_windows = IS_WINDOWS and not remote
@@ -998,6 +1076,21 @@ def setup_cookbook_routes() -> APIRouter:
        else:
            # ── Linux/Termux: bash + tmux (existing flow) ──
            runner_lines = ["#!/bin/bash"]
+            # Mirror every line of stdout+stderr into a persistent log file
+            # on the host running the serve. This is the file tail_serve_output
+            # reads when the tmux pane has been overwritten by the post-crash
+            # bash prompt — without it, the agent's diagnostic tool sees the
+            # neofetch banner instead of the actual Python traceback.
+            # We save the original fds to 3/4 so we can RESTORE them before
+            # `exec ${SHELL}` at the end of the script. Without that restore,
+            # the post-crash interactive shell's neofetch banner ALSO gets
+            # teed into the log file and `tail -N` returns ONLY the banner —
+            # the actual traceback ends up earlier than the tail window.
+            runner_lines.append("mkdir -p /tmp/odysseus-tmux 2>/dev/null || true")
+            runner_lines.append("exec 3>&1 4>&2")
+            runner_lines.append(
+                f"exec > >(tee -a /tmp/odysseus-tmux/{session_id}.log) 2>&1"
+            )
            runner_lines.extend(_user_shell_path_bootstrap())
            runner_lines.append('ODYSSEUS_PREFLIGHT_EXIT=""')
            # Put Odysseus's own venv bin on PATH (local runs only) so the serve
@@ -1074,38 +1167,24 @@ def setup_cookbook_routes() -> APIRouter:
                    req.cmd,
                    default_host=_ollama_default_host,
                )
-                # Ollama can be a host binary, a system service, or a Docker
-                # container. If the HTTP API is already reachable, the model is
-                # already served and we should not require a host `ollama` CLI.
+                # Always launch a fresh ollama under tmux so Stop reliably
+                # kills it. If the requested port is busy (e.g. a systemd
+                # ollama on 11434), scan upward for a free one rather than
+                # silently reattaching to an external service that Stop
+                # can't reach.
                runner_lines.append(f'ODYSSEUS_OLLAMA_HOST={_bash_squote(_ollama_host)}')
                runner_lines.append(f'ODYSSEUS_OLLAMA_PORT="{_ollama_port}"')
-                runner_lines.append('ODYSSEUS_OLLAMA_URL=""')
-                runner_lines.append('for _ody_ollama_try in $(seq 1 20); do')
-                runner_lines.append('  for _ody_ollama_port in "$ODYSSEUS_OLLAMA_PORT" 11434; do')
-                runner_lines.append('    [ -z "$_ody_ollama_port" ] && continue')
-                runner_lines.append('    for _ody_ollama_host in 127.0.0.1 localhost host.docker.internal; do')
-                runner_lines.append('      _ody_ollama_url="http://${_ody_ollama_host}:${_ody_ollama_port}"')
-                runner_lines.append('      if curl -sf "$_ody_ollama_url/api/tags" >/dev/null 2>&1; then')
-                runner_lines.append('        ODYSSEUS_OLLAMA_URL="$_ody_ollama_url"')
-                runner_lines.append('        ODYSSEUS_OLLAMA_PORT="$_ody_ollama_port"')
-                runner_lines.append('        break 3')
-                runner_lines.append('      fi')
-                runner_lines.append('    done')
-                runner_lines.append('  done')
-                runner_lines.append('  [ "$_ody_ollama_try" -eq 1 ] && echo "[odysseus] Waiting for an existing Ollama API on ports ${ODYSSEUS_OLLAMA_PORT}/11434..."')
-                runner_lines.append('  sleep 1')
-                runner_lines.append('done')
-                runner_lines.append('if [ -n "$ODYSSEUS_OLLAMA_URL" ]; then')
-                runner_lines.append('  if [ "$ODYSSEUS_OLLAMA_PORT" != "' + _ollama_port + '" ]; then')
-                runner_lines.append('    echo "[odysseus] Selected Ollama port ' + _ollama_port + ' was not reachable; using running Ollama on port ${ODYSSEUS_OLLAMA_PORT}."')
+                runner_lines.append('for _ody_off in 0 1 2 3 4 5 6 7 8 9; do')
+                runner_lines.append('  _ody_try_port=$((ODYSSEUS_OLLAMA_PORT + _ody_off))')
+                runner_lines.append('  if ! (exec 3<>/dev/tcp/127.0.0.1/$_ody_try_port) 2>/dev/null; then')
+                runner_lines.append('    exec 3<&-; exec 3>&-')
+                runner_lines.append('    ODYSSEUS_OLLAMA_PORT="$_ody_try_port"')
+                runner_lines.append('    break')
                runner_lines.append('  fi')
-                runner_lines.append('  echo "[odysseus] Ollama API ready on port ${ODYSSEUS_OLLAMA_PORT}: ${ODYSSEUS_OLLAMA_URL}"')
-                runner_lines.append('  echo "[odysseus] This task is monitoring an existing Ollama server; stopping it here will not stop an external Docker/system service."')
-                runner_lines.append('  exec bash -i')
-                runner_lines.append('fi')
+                runner_lines.append('  exec 3<&-; exec 3>&-')
+                runner_lines.append('done')
                runner_lines.append('if ! command -v ollama &>/dev/null; then')
-                runner_lines.append('  echo "ERROR: Ollama not found and no Ollama API is reachable on 127.0.0.1, localhost, or host.docker.internal (ports ${ODYSSEUS_OLLAMA_PORT}/11434)."')
-                runner_lines.append('  echo "Install Ollama, start an Ollama service/container on this server, or pick the port where it is already listening."')
+                runner_lines.append('  echo "ERROR: Ollama not found on this server. Install it from https://ollama.com/download or `curl -fsSL https://ollama.com/install.sh | sh`."')
                runner_lines.append('  echo')
                runner_lines.append('  echo "=== Process exited with code 127 ==="')
                runner_lines.append('  exec bash -i')
@@ -1940,6 +2019,153 @@ def setup_cookbook_routes() -> APIRouter:

        return {"models": out}

+    # Rate-limit for the orphan-tmux adoption sweep. The UI polls
+    # tasks/status every ~3s; we don't want to SSH every host on every
+    # poll. 20s is fast enough that a model the agent launched in the
+    # background shows up "almost immediately" in the UI without being
+    # wasteful.
+    _last_orphan_sweep_ts = [0.0]
+    _ORPHAN_SWEEP_MIN_INTERVAL_S = 20.0
+
+    def _maybe_sweep_orphans(tasks: list, state: dict) -> None:
+        """Scan each configured cookbook server for `serve-*` tmux sessions
+        the cookbook doesn't know about and adopt them into state.tasks.
+
+        Writes are conditional: if no orphans are found, nothing is touched.
+        Rate-limited so polling UIs don't trigger SSH on every refresh.
+        """
+        import time as _time
+        import subprocess
+        logger.info(f"_maybe_sweep_orphans: entered, last_ts={_last_orphan_sweep_ts[0]}")
+        now = _time.monotonic()
+        if now - _last_orphan_sweep_ts[0] < _ORPHAN_SWEEP_MIN_INTERVAL_S:
+            logger.info(f"_maybe_sweep_orphans: rate-limited, {now - _last_orphan_sweep_ts[0]:.1f}s since last")
+            return
+        _last_orphan_sweep_ts[0] = now
+
+        env = state.get("env") if isinstance(state, dict) else {}
+        servers = env.get("servers") if isinstance(env, dict) else []
+        logger.info(f"orphan sweep starting: {len(servers) if isinstance(servers, list) else 0} server(s), known_sids={len([t for t in tasks if isinstance(t, dict) and t.get('sessionId')])}")
+        if not isinstance(servers, list):
+            return
+
+        known_sids = {
+            t.get("sessionId") for t in tasks
+            if isinstance(t, dict) and t.get("sessionId")
+        }
+
+        adopted_any = False
+        for srv in servers:
+            if not isinstance(srv, dict):
+                continue
+            host = (srv.get("host") or "").strip()
+            if not host:
+                continue  # local-only entry; the /proc scan handles it
+            if not _REMOTE_HOST_RE.match(host):
+                continue
+            sport = str(srv.get("port") or "").strip()
+            ssh_base = ["ssh", "-o", "ConnectTimeout=4", "-o", "StrictHostKeyChecking=no"]
+            if sport and sport != "22":
+                if not _SSH_PORT_RE.match(sport):
+                    continue
+                ssh_base.extend(["-p", sport])
+
+            try:
+                ls = subprocess.run(
+                    ssh_base + [host, "tmux ls 2>/dev/null"],
+                    timeout=6, capture_output=True, text=True,
+                )
+            except Exception:
+                continue
+            for line in (ls.stdout or "").splitlines():
+                sid = line.split(":", 1)[0].strip()
+                if not sid or not _SESSION_ID_RE.match(sid):
+                    continue
+                if sid in known_sids:
+                    continue
+                # Adopt any session whose pane is currently running a
+                # known model-server process (checked below). The earlier
+                # prefix gate (serve-/cookbook-) dropped legitimate
+                # serves whenever tmux fell back to numeric IDs, leaving
+                # them invisible in the Cookbook UI — so the user could
+                # neither see nor stop them.
+                # Skip zombie / idle-shell sessions. A tmux session left
+                # over from a crashed vllm just shows a bash prompt —
+                # adopting it would pollute the UI with "running" tasks
+                # that aren't actually serving anything. pane_current_command
+                # is the foreground process in the pane right now; only
+                # real model serves leave a python/vllm/etc. process there.
+                try:
+                    pc = subprocess.run(
+                        ssh_base + [host, "tmux", "list-panes", "-t", sid,
+                                    "-F", "#{pane_current_command}"],
+                        timeout=4, capture_output=True, text=True,
+                    )
+                    cur = (pc.stdout or "").strip().splitlines()
+                except Exception:
+                    cur = []
+                LIVE_PROCS = {"python", "python3", "vllm", "llama-server",
+                              "llama_cpp_main", "sglang", "lmdeploy",
+                              "ollama", "node", "uvicorn"}
+                if not any(c in LIVE_PROCS for c in cur):
+                    continue
+                # Try to recover a plausible repo_id + port from the
+                # pane buffer. Cheap heuristic — if we can't, register
+                # with placeholder fields; the UI still shows it.
+                try:
+                    cap = subprocess.run(
+                        ssh_base + [host, "tmux", "capture-pane", "-t", sid, "-p", "-S", "-300"],
+                        timeout=6, capture_output=True, text=True,
+                    )
+                    pane = cap.stdout or ""
+                except Exception:
+                    pane = ""
+                import re as _re_orphan
+                # vLLM banner: "model   /path/...". Falls back to the
+                # raw vllm-serve command if the banner already scrolled.
+                m_model = _re_orphan.search(r"model\s+(\S+)", pane)
+                model = m_model.group(1) if m_model else ""
+                if not model:
+                    m_serve = _re_orphan.search(r"vllm\s+serve\s+(\S+)", pane)
+                    model = m_serve.group(1) if m_serve else f"adopted:{sid}"
+                m_port = _re_orphan.search(r"--port\s+(\d+)", pane)
+                port = int(m_port.group(1)) if m_port else 0
+
+                import time as _t2
+                tasks.append({
+                    "id": sid,
+                    "sessionId": sid,
+                    "name": model.split("/")[-1] if "/" in model else model,
+                    "type": "serve",
+                    "status": "running",
+                    "output": f"Auto-adopted from orphan tmux session on {host}. "
+                              "Open the task to see live output.",
+                    "ts": int(_t2.time() * 1000),
+                    "payload": {
+                        "repo_id": model,
+                        "remote_host": host,
+                        "_cmd": "(orphan tmux session — original launch cmd unknown)",
+                        "port": port,
+                    },
+                    "remoteHost": host,
+                    "sshPort": sport,
+                    "platform": "linux",
+                    "_serveReady": False,
+                    "_endpointAdded": False,
+                    "_adoptedExternally": True,
+                })
+                known_sids.add(sid)
+                adopted_any = True
+                logger.info(f"auto-adopted orphan tmux session {sid!r} on {host}")
+
+        if adopted_any:
+            try:
+                from core.atomic_io import atomic_write_json
+                state["tasks"] = tasks
+                atomic_write_json(_cookbook_state_path, state)
+            except Exception as e:
+                logger.warning(f"orphan sweep: state write failed: {e}")
+
    @router.get("/api/cookbook/tasks/status")
    async def cookbook_tasks_status(request: Request):
        """Check status of all active cookbook tmux sessions.
@@ -1993,6 +2219,7 @@ def setup_cookbook_routes() -> APIRouter:

        # Load saved tasks from cookbook state
        tasks = []
+        state = {}
        if _cookbook_state_path.exists():
            try:
                state = json.loads(_cookbook_state_path.read_text(encoding="utf-8"))
@@ -2004,6 +2231,21 @@ def setup_cookbook_routes() -> APIRouter:
            except Exception:
                pass

+        # Orphan-tmux auto-adoption sweep. When the agent (or anyone)
+        # SSH-launches a `serve-*` tmux session — usually because
+        # serve_model rejected `source ... && vllm ...` or because of a
+        # manual relaunch via tmux send-keys — that session is invisible
+        # to the cookbook UI even though it's a live model server. The
+        # sweep finds those orphans on each configured remote host and
+        # writes them into state.tasks with _adoptedExternally=True, so
+        # they show up in the UI on the next poll without anyone having
+        # to remember to call adopt_served_model. Rate-limited via the
+        # module-level _last_orphan_sweep so we don't SSH every 3s.
+        try:
+            _maybe_sweep_orphans(tasks, state)
+        except Exception as _sweep_e:
+            logger.warning(f"orphan sweep failed (non-fatal): {_sweep_e!r}")
+
        results = []
        for task in tasks:
            session_id = task.get("sessionId", "")
@@ -2063,7 +2305,12 @@ def setup_cookbook_routes() -> APIRouter:
                if _tport and _tport != "22":
                    ssh_base.extend(["-p", str(_tport)])
                check_cmd = ssh_base + [remote, "tmux", "has-session", "-t", session_id]
-                capture_cmd = ssh_base + [remote, "tmux", "capture-pane", "-t", session_id, "-p", "-S", "-50"]
+                # Capture 500 lines (was 50) so a Python traceback survives
+                # the post-crash neofetch banner + bash prompt that otherwise
+                # fills the visible tail. Without this, output_tail ends up
+                # as just "Locale: C / Ubuntu_Odysseus ❯" and the agent
+                # can't diagnose the actual error.
+                capture_cmd = ssh_base + [remote, "tmux", "capture-pane", "-t", session_id, "-p", "-S", "-500"]
            elif IS_WINDOWS:
                # LOCAL Windows task: launched as a detached process (no tmux).
                # Liveness comes from the <session>.pid file, output from the
@@ -2072,7 +2319,7 @@ def setup_cookbook_routes() -> APIRouter:
                capture_cmd = None
            else:
                check_cmd = ["tmux", "has-session", "-t", session_id]
-                capture_cmd = ["tmux", "capture-pane", "-t", session_id, "-p", "-S", "-50"]
+                capture_cmd = ["tmux", "capture-pane", "-t", session_id, "-p", "-S", "-500"]

            local_win_task = (not remote) and IS_WINDOWS

@@ -18,6 +18,119 @@ from routes.prefs_routes import _load_for_user, _save_for_user
 logger = logging.getLogger(__name__)


+def _maybe_cascade_calendar_event(task) -> None:
+    """Delete the linked calendar event when a cookbook_serve task is
+    removed. Two lookup strategies:
+
+      1. PRIMARY — `cookbook_event_uid` marker stashed in task.prompt
+         by cookbookSchedule.js right after creating the event. Direct
+         UID match, no ambiguity.
+
+      2. FALLBACK — for tasks created before the marker was wired up
+         (or when the PATCH to add the marker failed silently), scan
+         the Cookbook calendar for events whose summary equals the
+         task name and delete the matches.
+
+    Best-effort throughout: errors are logged but never block the task
+    deletion itself."""
+    if not task or task.task_type != "action" or task.action != "cookbook_serve":
+        return
+
+    import httpx
+    from core.middleware import INTERNAL_TOOL_HEADER, INTERNAL_TOOL_TOKEN
+    headers = {INTERNAL_TOOL_HEADER: INTERNAL_TOOL_TOKEN}
+    if task.owner:
+        headers["X-Odysseus-Owner"] = task.owner
+
+    # Strategy 1: explicit UID marker in prompt.
+    event_uid = ""
+    if task.prompt:
+        try:
+            cfg = json.loads(task.prompt)
+            if isinstance(cfg, dict):
+                event_uid = (cfg.get("cookbook_event_uid") or "").strip()
+        except Exception:
+            pass
+
+    def _try_delete(uid: str) -> bool:
+        try:
+            with httpx.Client(timeout=10) as client:
+                r = client.delete(
+                    f"http://localhost:7000/api/calendar/events/{uid}",
+                    headers=headers,
+                )
+                if r.status_code >= 400:
+                    logger.info(
+                        f"task delete: cascade calendar event {uid} returned "
+                        f"HTTP {r.status_code}"
+                    )
+                    return False
+                return True
+        except Exception as e:
+            logger.warning(f"task delete: cascade calendar event {uid} failed: {e}")
+            return False
+
+    if event_uid:
+        _try_delete(event_uid)
+        return
+
+    # Strategy 2: scan the Cookbook calendar for matching summaries.
+    # Only runs for tasks missing the marker (old tasks or PATCH failures).
+    if not task.name:
+        return
+    try:
+        with httpx.Client(timeout=10) as client:
+            # Find the Cookbook calendar.
+            cal_r = client.get("http://localhost:7000/api/calendar/calendars", headers=headers)
+            if cal_r.status_code >= 400:
+                return
+            cals = (cal_r.json() or {}).get("calendars", [])
+            cookbook_cal = next(
+                (c for c in cals if (c.get("name") or "").lower() == "cookbook"),
+                None,
+            )
+            if not cookbook_cal:
+                return
+            cal_href = cookbook_cal.get("href") or cookbook_cal.get("id") or ""
+            # List events in a wide window to catch recurring + upcoming.
+            from datetime import datetime as _dt, timedelta as _td, timezone as _tz
+            now = _dt.now(_tz.utc)
+            start = (now - _td(days=30)).isoformat()
+            end = (now + _td(days=365)).isoformat()
+            ev_r = client.get(
+                "http://localhost:7000/api/calendar/events",
+                params={"start": start, "end": end, "calendar": cal_href},
+                headers=headers,
+            )
+            if ev_r.status_code >= 400:
+                return
+            events = (ev_r.json() or {}).get("events", [])
+            # Match by exact summary. Tasks named "Serve: <model>" are
+            # created from the schedule modal; the event's summary mirrors
+            # the task name 1:1 by design.
+            target = (task.name or "").strip()
+            uids_to_delete = set()
+            for ev in events:
+                if (ev.get("summary") or "").strip() != target:
+                    continue
+                uid = ev.get("uid") or ev.get("id") or ""
+                # Strip the "::occurrence" suffix on recurring expansions —
+                # we want to delete the MASTER once, not each instance.
+                if "::" in uid:
+                    uid = uid.split("::", 1)[0]
+                if uid:
+                    uids_to_delete.add(uid)
+            for uid in uids_to_delete:
+                _try_delete(uid)
+            if uids_to_delete:
+                logger.info(
+                    f"task delete: cascade matched {len(uids_to_delete)} calendar event(s) "
+                    f"by summary fallback for task {task.id} ({target!r})"
+                )
+    except Exception as e:
+        logger.warning(f"task delete: cascade fallback scan failed: {e}")
+
+
 class TaskCreate(BaseModel):
    name: Optional[str] = None
    prompt: Optional[str] = None
@@ -616,6 +729,12 @@ def setup_task_routes(task_scheduler) -> APIRouter:
                raise HTTPException(404, "Task not found")
            if user and task.owner != user:
                raise HTTPException(403, "Access denied")
+            # Cascade: cookbook_serve tasks may have a linked calendar
+            # event (created via the "Create event in calendar" toggle
+            # in the schedule modal). If so, delete the calendar event
+            # too so the calendar doesn't end up holding a phantom event
+            # for a task that no longer exists.
+            _maybe_cascade_calendar_event(task)
            db.delete(task)
            db.commit()
            return {"ok": True}