Merge branch 'main' into dev

Bring main's maintainer-curated work (cookbook scheduler, calendar rendering/sync, settings polish, agent debug loop) into dev so dev is a superset of main (resolves the dev/main drift, #2543).
This commit is contained in:
Kenny Van de Maele
2026-06-05 10:50:51 +02:00
33 changed files with 3291 additions and 245 deletions
+375
View File
@@ -19,6 +19,8 @@ from src.auth_helpers import require_user
from src.tool_implementations import do_manage_notes
COOKBOOK_READ_SCOPES = {"cookbook:read", "cookbook:launch"}
COOKBOOK_LAUNCH_SCOPES = {"cookbook:launch"}
TODO_READ_SCOPES = {"todos:read", "todos:write"}
TODO_WRITE_SCOPES = {"todos:write"}
EMAIL_READ_SCOPES = {"email:read", "email:draft", "email:send"}
@@ -130,6 +132,11 @@ def setup_codex_routes(
"actions": ["library", "read", "create", "delete"],
"available": documents_library_endpoint is not None,
},
"cookbook": {
"read": scoped(COOKBOOK_READ_SCOPES),
"launch": scoped(COOKBOOK_LAUNCH_SCOPES),
"actions": ["tasks", "servers", "output", "serve", "stop"],
},
},
"safety": {
"email_send_requires_confirmation": True,
@@ -373,6 +380,374 @@ def setup_codex_routes(
raise HTTPException(400, f"Invalid document payload: {exc}")
return await _as_owner(request, owner, documents_create_endpoint, request, req)
# ── Cookbook surface ──
# Lets the agent run the same launch / monitor / kill loop the user
# would do by hand in the Cookbook UI: read the current task list +
# tmux output, launch a serve task, stop one. Two scopes:
# cookbook:read — list tasks + tail output + list servers
# cookbook:launch — also start/stop serves (host shell exec)
# `cookbook:launch` is genuinely powerful: /api/model/serve runs SSH'd
# commands on the user's hosts. The existing _validate_serve_cmd
# allowlist (vllm/python3/sglang/llama-server/etc., no shell metachars)
# keeps the agent inside the same sandbox the UI uses.
async def _run_shell(cmd: str, timeout: float = 15.0) -> dict:
    """Run ``cmd`` through the system shell and capture its result.

    Failures are reported through the returned dict rather than raised.

    Args:
        cmd: Command line handed to the shell verbatim; callers must quote
            any value they interpolate into it.
        timeout: Seconds to wait for the command to finish.

    Returns:
        ``{"exit_code", "stdout", "stderr"}``. ``exit_code`` is ``-1`` when
        the command timed out (``stderr`` is ``"timed out"``) or could not be
        run at all (``stderr`` carries the exception text).
    """
    import asyncio as _asyncio
    try:
        proc = await _asyncio.create_subprocess_shell(
            cmd,
            stdout=_asyncio.subprocess.PIPE,
            stderr=_asyncio.subprocess.PIPE,
        )
        try:
            stdout_b, stderr_b = await _asyncio.wait_for(proc.communicate(), timeout=timeout)
        except _asyncio.TimeoutError:
            try:
                proc.kill()
            except ProcessLookupError:
                # The child exited on its own between the timeout firing and
                # the kill; from the caller's view it is still a timeout.
                pass
            # Wait (briefly) for the killed child so its transport and pipes
            # are released before we return instead of lingering on the loop.
            # Bounded, because a grandchild may still hold the pipes open.
            try:
                await _asyncio.wait_for(proc.wait(), timeout=1.0)
            except _asyncio.TimeoutError:
                pass
            return {"exit_code": -1, "stdout": "", "stderr": "timed out"}
        return {
            "exit_code": proc.returncode,
            "stdout": stdout_b.decode(errors="replace"),
            "stderr": stderr_b.decode(errors="replace"),
        }
    except Exception as exc:
        # Deliberate best-effort boundary: callers only inspect the dict.
        return {"exit_code": -1, "stdout": "", "stderr": str(exc)}
def _read_cookbook_state() -> dict:
    """Load ``cookbook_state.json`` from the data directory.

    The directory comes from the ``DATA_DIR`` environment variable and
    defaults to ``data`` (relative to the working directory).

    Returns:
        The parsed state, or ``{}`` when the file is missing, unreadable,
        not valid JSON, or does not contain a JSON object. Callers use
        ``state.get(...)``, so anything other than a dict would crash them.
    """
    from pathlib import Path as _Path
    import json as _json
    import os as _os

    state_file = _Path(_os.environ.get("DATA_DIR", "data")) / "cookbook_state.json"
    try:
        loaded = _json.loads(state_file.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: bad JSON or encoding.
        return {}
    return loaded if isinstance(loaded, dict) else {}
def _redact_task(t: dict) -> dict:
    """Return a copy of task ``t`` that is safe to hand to the agent.

    The ``hf_token`` and ``_secrets`` keys are dropped from the task itself
    and, when ``payload`` is a dict, from that payload too. The input is not
    modified.
    """
    secret_keys = ("hf_token", "_secrets")
    redacted = {}
    for key, value in t.items():
        if key in secret_keys:
            continue
        if key == "payload" and isinstance(value, dict):
            value = {
                inner_key: inner_value
                for inner_key, inner_value in value.items()
                if inner_key not in secret_keys
            }
        redacted[key] = value
    return redacted
@router.get("/cookbook/tasks")
async def codex_cookbook_tasks(request: Request):
    """List the tasks stored in cookbook state, with secrets stripped.

    Access is gated by ``_scope_owner`` with ``COOKBOOK_READ_SCOPES``.
    """
    _scope_owner(request, COOKBOOK_READ_SCOPES)
    saved_tasks = _read_cookbook_state().get("tasks") or []
    # Each task goes through _redact_task before it is returned.
    return {"tasks": list(map(_redact_task, saved_tasks))}
@router.get("/cookbook/servers")
async def codex_cookbook_servers(request: Request):
    """List the configured cookbook servers, restricted to a field allowlist.

    Only the fields needed to pick a host are copied out; anything else on a
    server entry (ssh credentials, passwords, ...) is never returned.
    Malformed state is tolerated: a missing or null ``env``, a non-list
    ``servers`` value, or non-dict entries yield fewer results, not a 500.
    """
    _scope_owner(request, COOKBOOK_READ_SCOPES)
    state = _read_cookbook_state()
    env = state.get("env") if isinstance(state, dict) else None
    servers = (env.get("servers") if isinstance(env, dict) else None) or []
    public_fields = ("name", "host", "port", "env", "envPath", "platform", "modelDirs")
    cleaned = [
        {field: entry.get(field) for field in public_fields}
        for entry in servers
        if isinstance(entry, dict)
    ]
    return {"servers": cleaned}
@router.get("/cookbook/output/{session_id}")
async def codex_cookbook_output(request: Request, session_id: str, tail: int = 400):
    """Return the last ``tail`` lines of output for a tracked cookbook task.

    Args:
        session_id: tmux-style id of a task present in cookbook state.
        tail: Requested line count, clamped to the range 20..4000.

    Returns:
        ``{"session_id", "host", "exit_code", "output", "task"}`` where
        ``task`` is the redacted task entry and ``host`` is ``"local"`` for
        tasks without a remote host.

    Raises:
        HTTPException: 400 for a malformed session id or an unusable stored
            host, 404 when no task has that session id.
    """
    _scope_owner(request, COOKBOOK_READ_SCOPES)
    import re as _re
    import shlex
    # Defensive: session_id must be the tmux-style id we issue
    # (`serve-XXXX` / `cookbook-XXXX` / `queue-XXXX`); anything else
    # would let the agent run arbitrary `tmux capture-pane` targets.
    if not _re.fullmatch(r"[a-zA-Z0-9_-]+", session_id):
        raise HTTPException(400, "Invalid session id")
    tail = max(20, min(int(tail or 400), 4000))
    # Resolve the task's host (if any) from cookbook state so we can
    # ssh to the right box.
    state = _read_cookbook_state()
    tasks = state.get("tasks") or []
    task = next(
        (t for t in tasks if isinstance(t, dict) and t.get("sessionId") == session_id),
        None,
    )
    if task is None:
        raise HTTPException(404, "task not found")
    # str(): the state file may hold the port as a JSON number.
    host = str(task.get("remoteHost") or "").strip()
    ssh_port = str(task.get("sshPort") or "").strip()
    if host.startswith("-"):
        # ssh would parse this as an option (e.g. -oProxyCommand=...).
        raise HTTPException(400, "Invalid remote host")
    # Prefer the persisted log file over the tmux pane. The pane gets
    # overwritten by the post-crash banner + shell prompt the moment the
    # server exits; the log file is the raw stdout/stderr. Falls back to
    # the pane when the log file is missing or empty.
    log_path = f"/tmp/odysseus-tmux/{session_id}.log"
    inner = (
        f"if [ -s {log_path} ]; then tail -n {tail} {log_path}; "
        f"else tmux capture-pane -t {session_id} -p -S -{tail}; fi"
    )
    if host:
        ssh_argv = ["ssh"]
        if ssh_port and ssh_port != "22":
            ssh_argv += ["-p", ssh_port]
        ssh_argv += [host, inner]
        # Quote every word: host and port come from the state file and the
        # command is executed through a shell.
        cmd = shlex.join(ssh_argv)
    else:
        cmd = inner
    result = await _run_shell(cmd, timeout=15)
    return {
        "session_id": session_id,
        "host": host or "local",
        "exit_code": result.get("exit_code"),
        "output": result.get("stdout", ""),
        "task": _redact_task(task),
    }
@router.post("/cookbook/serve")
async def codex_cookbook_serve(request: Request, body: dict[str, Any] = Body(default_factory=dict)):
    """Launch a serve task by delegating to the ``POST /api/model/serve`` handler.

    The body is normalised, validated as a ``ServeRequest`` and passed to the
    same endpoint the UI uses, so command validation is whatever that
    endpoint applies.

    Raises:
        HTTPException: 400 when the body is not a valid ``ServeRequest``,
            503 when the serve endpoint cannot be located.
    """
    _scope_owner(request, COOKBOOK_LAUNCH_SCOPES)
    from routes.cookbook_helpers import ServeRequest
    # Accept friendly aliases agents naturally reach for. Without these,
    # passing `host` silently maps to nothing and the serve runs LOCAL
    # instead of on the intended remote.
    norm = dict(body or {})
    if "host" in norm and "remote_host" not in norm:
        norm["remote_host"] = norm.pop("host")
    if "model" in norm and "repo_id" not in norm:
        norm["repo_id"] = norm.pop("model")
    # `port` is deliberately NOT remapped to `ssh_port`: it is ambiguous
    # (serving ports such as 8000/8001/30000 belong inside the cmd string),
    # so it is passed through untouched for ServeRequest to judge.
    try:
        req = ServeRequest(**norm)
    except Exception as exc:
        raise HTTPException(400, f"Invalid serve payload: {exc}") from exc
    serve_endpoint = _find_endpoint(None, "POST", "/api/model/serve")
    # Fall back to scanning the routes registered on the app.
    if serve_endpoint is None:
        for route in request.app.routes:
            if getattr(route, "path", None) != "/api/model/serve":
                continue
            # `methods` may be None on some route types.
            if "POST" in (getattr(route, "methods", None) or ()):
                serve_endpoint = route.endpoint
                break
    if serve_endpoint is None:
        raise HTTPException(503, "model serve endpoint unavailable")
    return await serve_endpoint(request, req)
@router.post("/cookbook/stop/{session_id}")
async def codex_cookbook_stop(request: Request, session_id: str):
    """Kill the tmux session ``session_id``.

    When cookbook state has a task with that session id and a remote host,
    the kill runs over ssh on that host; otherwise it runs locally.

    Returns:
        ``{"session_id", "exit_code", "host"}`` where ``exit_code`` is the
        result of ``tmux kill-session`` and ``host`` is ``"local"`` for
        local kills.

    Raises:
        HTTPException: 400 for a malformed session id or an unusable stored
            host.
    """
    _scope_owner(request, COOKBOOK_LAUNCH_SCOPES)
    import re as _re
    import shlex
    if not _re.fullmatch(r"[a-zA-Z0-9_-]+", session_id):
        raise HTTPException(400, "Invalid session id")
    state = _read_cookbook_state()
    tasks = state.get("tasks") or []
    task = next(
        (t for t in tasks if isinstance(t, dict) and t.get("sessionId") == session_id),
        None,
    )
    # str(): the state file may hold the port as a JSON number.
    host = str((task or {}).get("remoteHost") or "").strip()
    ssh_port = str((task or {}).get("sshPort") or "").strip()
    if host.startswith("-"):
        # ssh would parse this as an option (e.g. -oProxyCommand=...).
        raise HTTPException(400, "Invalid remote host")
    kill_cmd = f"tmux kill-session -t {session_id}"
    if host:
        ssh_argv = ["ssh"]
        if ssh_port and ssh_port != "22":
            ssh_argv += ["-p", ssh_port]
        ssh_argv += [host, kill_cmd]
        # Quote every word: host and port come from the state file and the
        # command is executed through a shell.
        cmd = shlex.join(ssh_argv)
    else:
        cmd = kill_cmd
    result = await _run_shell(cmd, timeout=10)
    return {"session_id": session_id, "exit_code": result.get("exit_code"), "host": host or "local"}
@router.get("/cookbook/cached")
async def codex_cookbook_cached(request: Request, host: str | None = None):
    """List cached models on a configured server (or local if host is omitted).

    ``host`` may be a server's friendly name or its real host; it is resolved
    against the servers in cookbook state and the request is forwarded to the
    ``GET /api/model/cached`` handler together with that server's extra model
    directories, ssh port and platform.

    Raises:
        HTTPException: 503 when the cached-models endpoint cannot be located.
    """
    _scope_owner(request, COOKBOOK_READ_SCOPES)
    state = _read_cookbook_state()
    env = state.get("env") if isinstance(state, dict) else {}
    servers = (env.get("servers") if isinstance(env, dict) else None) or []
    stock_hf_dirs = {"~/.cache/huggingface/hub", "~/.cache/huggingface"}

    def _extra_model_dirs(entry: dict) -> str:
        # Comma-joined model dirs, leaving out the stock Hugging Face cache
        # locations listed in stock_hf_dirs.
        dirs = entry.get("modelDirs") if isinstance(entry, dict) else None
        if isinstance(dirs, str):
            return "" if dirs.strip() in stock_hf_dirs else dirs
        if not isinstance(dirs, list):
            return ""
        return ",".join(
            d for d in dirs
            if isinstance(d, str) and d.strip() and d.strip() not in stock_hf_dirs
        )

    # Resolve friendly host name -> real host; with no host given, use the
    # first server entry whose host is blank (if there is one).
    server: dict[str, Any] = {}
    target_host = host or ""
    if not host:
        server = next(
            (s for s in servers if isinstance(s, dict) and not (s.get("host") or "").strip()),
            {},
        )
    else:
        server = next(
            (s for s in servers if isinstance(s, dict)
             and (s.get("name") == host or s.get("host") == host)),
            {},
        )
        if server and server.get("host"):
            target_host = server["host"]

    model_dirs = _extra_model_dirs(server)
    ssh_port = str(server["port"]) if server.get("port") else ""
    platform = server["platform"] if server.get("platform") else ""

    cached_endpoint = _find_endpoint(None, "GET", "/api/model/cached")
    if cached_endpoint is None:
        from fastapi import FastAPI
        app: FastAPI = request.app
        for candidate in app.routes:
            if getattr(candidate, "path", None) == "/api/model/cached" and "GET" in getattr(candidate, "methods", set()):
                cached_endpoint = candidate.endpoint
                break
    if cached_endpoint is None:
        raise HTTPException(503, "model cached endpoint unavailable")
    # The endpoint reads host/model_dir/ssh_port/platform as kwargs; empty
    # values are passed as None.
    return await cached_endpoint(
        request,
        host=target_host or None,
        model_dir=model_dirs or None,
        ssh_port=ssh_port or None,
        platform=platform or None,
    )
@router.get("/cookbook/presets")
async def codex_cookbook_presets(request: Request):
    """List saved serve presets (model + host + port + launch cmd).

    Returns ``{"presets": [...], "default_host": ...}``; non-dict preset
    entries are skipped and ``default_host`` falls back to ``""``.
    """
    _scope_owner(request, COOKBOOK_READ_SCOPES)
    state = _read_cookbook_state()

    def _summarise(preset: dict) -> dict:
        # Presets may use either naming for model/host; prefer the short key.
        return {
            "name": preset.get("name"),
            "model": preset.get("model") or preset.get("modelId"),
            "host": preset.get("host") or preset.get("remoteHost"),
            "port": preset.get("port"),
            "cmd": preset.get("cmd"),
        }

    saved = state.get("presets") or []
    summaries = [_summarise(entry) for entry in saved if isinstance(entry, dict)]
    env = state.get("env") or {}
    return {"presets": summaries, "default_host": env.get("defaultServer", "")}
@router.post("/cookbook/preset/{name}")
async def codex_cookbook_serve_preset(request: Request, name: str):
    """Launch a saved preset by name via the ``POST /api/model/serve`` handler.

    Matching is case-insensitive: an exact name match wins, otherwise the
    first preset whose name contains ``name`` is used.

    Raises:
        HTTPException: 400 for an invalid or blank name, a preset without a
            launchable cmd, or a payload ``ServeRequest`` rejects; 404 when
            nothing matches; 503 when the serve endpoint cannot be located.
    """
    _scope_owner(request, COOKBOOK_LAUNCH_SCOPES)
    import re as _re
    if not _re.fullmatch(r"[A-Za-z0-9 _.:@\-]+", name):
        raise HTTPException(400, "Invalid preset name")
    lname = name.lower().strip()
    if not lname:
        # A blank name (e.g. " ") would otherwise substring-match every
        # preset and launch whichever one happens to be first.
        raise HTTPException(400, "Invalid preset name")
    state = _read_cookbook_state()
    presets = [p for p in (state.get("presets") or []) if isinstance(p, dict)]

    def _preset_name(preset: dict) -> str:
        # str(): tolerate a non-string name in the state file.
        return str(preset.get("name") or "").lower()

    chosen = next((p for p in presets if _preset_name(p) == lname), None)
    if chosen is None:
        chosen = next((p for p in presets if lname in _preset_name(p)), None)
    if chosen is None:
        raise HTTPException(404, f"No preset matching {name!r}")
    repo_id = chosen.get("model") or chosen.get("modelId") or ""
    cmd = (chosen.get("cmd") or "").strip()
    host = chosen.get("host") or chosen.get("remoteHost") or ""
    if not repo_id or not cmd or cmd.startswith("(adopted"):
        raise HTTPException(400, f"Preset {chosen.get('name')!r} has no launchable cmd "
                                 "(adopted from external launch). Use POST /cookbook/serve "
                                 "with the actual cmd instead.")
    from routes.cookbook_helpers import ServeRequest
    body = {"repo_id": repo_id, "cmd": cmd}
    if host:
        body["remote_host"] = host
    try:
        req = ServeRequest(**body)
    except Exception as exc:
        raise HTTPException(400, f"Preset payload invalid: {exc}") from exc
    serve_endpoint = _find_endpoint(None, "POST", "/api/model/serve")
    # Fall back to scanning the routes registered on the app.
    if serve_endpoint is None:
        for route in request.app.routes:
            if getattr(route, "path", None) != "/api/model/serve":
                continue
            # `methods` may be None on some route types.
            if "POST" in (getattr(route, "methods", None) or ()):
                serve_endpoint = route.endpoint
                break
    if serve_endpoint is None:
        raise HTTPException(503, "model serve endpoint unavailable")
    return await serve_endpoint(request, req)
@router.post("/cookbook/adopt")
async def codex_cookbook_adopt(request: Request, body: dict[str, Any] = Body(default_factory=dict)):
    """Adopt an existing tmux session into cookbook tracking.

    Body: ``{tmux_session, model, host?, port?}`` (``session_id``,
    ``repo_id`` and ``remote_host`` are accepted as aliases). The session
    must exist on the target host; a task entry is then appended to the same
    ``cookbook_state.json`` the read routes use.

    Returns:
        ``{"ok", "session_id", "host"}``, or
        ``{"ok", "already_tracked", "session_id"}`` when a task with that
        session id is already present.

    Raises:
        HTTPException: 400 for an invalid session, model, host or port; 404
            when the tmux session is not found; 500 when the state file
            cannot be written.
    """
    _scope_owner(request, COOKBOOK_LAUNCH_SCOPES)
    import json as _json
    import os as _os
    import re as _re
    import shlex
    import time as _t
    from pathlib import Path as _Path

    norm = dict(body or {})
    # str(): JSON callers may send numbers where strings are expected.
    sess = str(norm.get("tmux_session") or norm.get("session_id") or "").strip()
    model = str(norm.get("model") or norm.get("repo_id") or "").strip()
    host = str(norm.get("host") or norm.get("remote_host") or "").strip()
    if not sess or not _re.fullmatch(r"[a-zA-Z0-9_-]+", sess):
        raise HTTPException(400, "tmux_session required, [a-zA-Z0-9_-]+ only")
    if not model:
        raise HTTPException(400, "model required")
    # The host is handed to ssh now and persisted for later shell use, so it
    # must not look like an ssh option (leading "-") or carry shell syntax.
    if host and not _re.fullmatch(r"[A-Za-z0-9_][A-Za-z0-9_.@:-]*", host):
        raise HTTPException(400, "Invalid host")
    try:
        port = int(norm.get("port") or 8000)
    except (TypeError, ValueError):
        raise HTTPException(400, "port must be an integer") from None
    if not 1 <= port <= 65535:
        raise HTTPException(400, "port must be between 1 and 65535")

    # Verify the tmux session exists on the target host before adopting.
    probe = f"tmux has-session -t {shlex.quote(sess)}"
    check = f"ssh {shlex.quote(host)} {shlex.quote(probe)}" if host else probe
    chk = await _run_shell(check, timeout=8)
    if chk.get("exit_code") not in (0, None):
        raise HTTPException(404, f"tmux session {sess!r} not found on {host or 'local'}")

    # Write into cookbook_state.json, at the DATA_DIR-derived location the
    # other cookbook routes read from, so the adopted task is visible to them.
    from core.atomic_io import atomic_write_json
    cookbook_state_path = _Path(_os.environ.get("DATA_DIR", "data")) / "cookbook_state.json"
    try:
        state = _json.loads(cookbook_state_path.read_text(encoding="utf-8"))
    except Exception:
        state = {}
    if not isinstance(state, dict):
        state = {}
    tasks = state.get("tasks")
    if not isinstance(tasks, list):
        tasks = []
        state["tasks"] = tasks
    if any(isinstance(t, dict) and t.get("sessionId") == sess for t in tasks):
        return {"ok": True, "already_tracked": True, "session_id": sess}
    tasks.append({
        "id": sess, "sessionId": sess,
        "name": model.split("/")[-1] if "/" in model else model,
        "type": "serve", "status": "running",
        "output": f"Adopted externally-launched session {sess!r} on {host or 'local'}.",
        "ts": int(_t.time() * 1000),
        "payload": {"repo_id": model, "remote_host": host, "_cmd": "(adopted — launched outside cookbook)", "port": port},
        "remoteHost": host, "sshPort": "", "platform": "linux",
        "_serveReady": False, "_endpointAdded": False, "_adoptedExternally": True,
    })
    try:
        atomic_write_json(cookbook_state_path, state)
    except Exception as exc:
        raise HTTPException(500, f"state write failed: {exc}") from exc
    return {"ok": True, "session_id": sess, "host": host or "local"}
return router
+12 -1
View File
@@ -546,6 +546,13 @@ def _append_serve_preflight_exit_lines(runner_lines: list[str], *, keep_shell_op
runner_lines.append('if [ -n "$ODYSSEUS_PREFLIGHT_EXIT" ]; then')
runner_lines.append(' echo ""; echo "=== Process exited with code $ODYSSEUS_PREFLIGHT_EXIT ==="')
if keep_shell_open:
# Decouple the post-crash interactive shell from the persistent log
# file. fds 3/4 were saved BEFORE the tee redirect at the top of
# the runner; restoring them here means the neofetch banner the
# user's .zshrc prints lands on the tmux pane only, not in the
# log file the agent's tail_serve_output reads.
runner_lines.append(' exec 1>&3 2>&4 3>&- 4>&- 2>/dev/null || true')
runner_lines.append(' sleep 0.2 # let tee child flush + exit')
runner_lines.append(' exec "${SHELL:-/bin/bash}"')
else:
runner_lines.append(' exit "$ODYSSEUS_PREFLIGHT_EXIT"')
@@ -563,7 +570,11 @@ def _append_serve_exit_code_lines(
if is_pip_install:
runner_lines.append('if [ $ODYSSEUS_CMD_EXIT -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; fi')
if keep_shell_open:
runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="; exec "${SHELL:-/bin/bash}"')
runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="')
# See preflight branch above for the rationale on restoring fds 3/4.
runner_lines.append('exec 1>&3 2>&4 3>&- 4>&- 2>/dev/null || true')
runner_lines.append('sleep 0.2 # let tee child flush + exit')
runner_lines.append('exec "${SHELL:-/bin/bash}"')
else:
runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="')
runner_lines.append('exit "$ODYSSEUS_CMD_EXIT"')
+280 -33
View File
@@ -801,6 +801,55 @@ def setup_cookbook_routes() -> APIRouter:
finally:
db.close()
def _pick_free_port_for_ollama(
remote: str | None, ssh_port: str | None, start_port: int, max_offset: int
) -> int | None:
"""Return the first free port in [start_port, start_port+max_offset] on
the target host. Used to pick a real bind for `ollama serve` so we
don't reattach to an external systemd ollama (or other listener) the
Cookbook Stop button can't kill."""
import socket
if remote:
# Probe over SSH. Bash's /dev/tcp gives a portable "is anything
# listening" check without requiring ss/netstat/nmap.
ssh_base = ["ssh", "-o", "ConnectTimeout=4", "-o", "StrictHostKeyChecking=no"]
if ssh_port and str(ssh_port) != "22":
if not _SSH_PORT_RE.match(str(ssh_port)):
return None
ssh_base.extend(["-p", str(ssh_port)])
host_arg = remote
if not _REMOTE_HOST_RE.match(host_arg):
return None
probe_ports = " ".join(str(start_port + i) for i in range(max_offset + 1))
script = (
f"for p in {probe_ports}; do "
"if ! (exec 3<>/dev/tcp/127.0.0.1/$p) 2>/dev/null; then "
"echo $p; exit 0; fi; exec 3<&-; exec 3>&-; done; exit 1"
)
try:
import subprocess
r = subprocess.run(
ssh_base + [host_arg, script],
capture_output=True, text=True, timeout=8,
)
if r.returncode == 0:
out = (r.stdout or "").strip().splitlines()
if out and out[0].isdigit():
return int(out[0])
except Exception:
return None
return None
# Local: just try to connect.
for off in range(max_offset + 1):
p = start_port + off
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.settimeout(0.25)
try:
s.connect(("127.0.0.1", p))
except (ConnectionRefusedError, socket.timeout, OSError):
return p
return None
def _auto_register_llm_endpoint(req: ServeRequest, remote: str | None) -> str | None:
"""Register a freshly-served LLM as a model endpoint so it appears in the
model picker without a manual /setup step — the text-model sibling of
@@ -815,21 +864,37 @@ def setup_cookbook_routes() -> APIRouter:
import re
from core.database import SessionLocal, ModelEndpoint
# Port: an explicit --port wins. Otherwise fall back by backend — Ollama
# is the only server in our generated commands that omits --port.
# Port: ordered fallbacks so we match whatever the user actually
# asked for, not a hardcoded default:
# 1. explicit `--port N` (vllm / sglang / llama-server)
# 2. `OLLAMA_HOST=host:port` (the way Ollama specifies its bind)
# 3. fallback by backend (11434 ollama / 8080 llama.cpp)
# Previously the OLLAMA_HOST form was silently ignored and we
# registered every Ollama endpoint at 11434 — even if the user
# set OLLAMA_HOST=0.0.0.0:11435 to avoid colliding with an
# existing systemd Ollama, the registered endpoint pointed at
# the OLD port and showed as offline.
port_match = re.search(r'--port\s+(\d+)', req.cmd)
ollama_host_match = re.search(r'OLLAMA_HOST=[^\s]*?:(\d+)', req.cmd)
if port_match:
port = int(port_match.group(1))
elif ollama_host_match:
port = int(ollama_host_match.group(1))
elif "ollama" in req.cmd:
port = 11434
else:
port = 8080 # llama.cpp's llama-server default — the Apple Silicon path
# Determine host (mirrors the image path: SSH alias for remote serves).
# For local serves while Odysseus runs inside Docker, "localhost"
# resolves to the container itself — useless. Use host.docker.internal
# which compose maps to the actual host, matching what /setup adds
# for Ollama by hand.
if remote:
host = remote.split("@")[-1] if "@" in remote else remote
else:
host = "localhost"
from routes.model_routes import _docker_host_gateway_reachable
host = "host.docker.internal" if _docker_host_gateway_reachable() else "localhost"
base_url = f"http://{host}:{port}/v1"
@@ -927,6 +992,19 @@ def setup_cookbook_routes() -> APIRouter:
session_id = f"serve-{uuid.uuid4().hex[:8]}"
remote = req.remote_host
is_windows = req.platform == "windows"
# Ollama: if the user didn't pin a port, resolve the actual port we'll
# bind to here (before runner construction) by probing the target host.
# Otherwise the runner script picks one at runtime and `_auto_register`
# below still registers the stale 11434 default — which on a host with
# a systemd ollama lands on the wrong (unreachable-from-docker) service.
if "ollama" in req.cmd and "OLLAMA_HOST=" not in req.cmd:
_ollama_bind_host = "0.0.0.0" if remote else "127.0.0.1"
_ollama_chosen_port = _pick_free_port_for_ollama(
remote, req.ssh_port, start_port=11434, max_offset=10,
)
if _ollama_chosen_port:
req.cmd = f"OLLAMA_HOST={_ollama_bind_host}:{_ollama_chosen_port} {req.cmd}"
# LOCAL execution on a native-Windows host never uses tmux (detached
# process path below), regardless of the UI-supplied platform.
local_windows = IS_WINDOWS and not remote
@@ -998,6 +1076,21 @@ def setup_cookbook_routes() -> APIRouter:
else:
# ── Linux/Termux: bash + tmux (existing flow) ──
runner_lines = ["#!/bin/bash"]
# Mirror every line of stdout+stderr into a persistent log file
# on the host running the serve. This is the file tail_serve_output
# reads when the tmux pane has been overwritten by the post-crash
# bash prompt — without it, the agent's diagnostic tool sees the
# neofetch banner instead of the actual Python traceback.
# We save the original fds to 3/4 so we can RESTORE them before
# `exec ${SHELL}` at the end of the script. Without that restore,
# the post-crash interactive shell's neofetch banner ALSO gets
# teed into the log file and `tail -N` returns ONLY the banner —
# the actual traceback ends up earlier than the tail window.
runner_lines.append("mkdir -p /tmp/odysseus-tmux 2>/dev/null || true")
runner_lines.append("exec 3>&1 4>&2")
runner_lines.append(
f"exec > >(tee -a /tmp/odysseus-tmux/{session_id}.log) 2>&1"
)
runner_lines.extend(_user_shell_path_bootstrap())
runner_lines.append('ODYSSEUS_PREFLIGHT_EXIT=""')
# Put Odysseus's own venv bin on PATH (local runs only) so the serve
@@ -1074,38 +1167,24 @@ def setup_cookbook_routes() -> APIRouter:
req.cmd,
default_host=_ollama_default_host,
)
# Ollama can be a host binary, a system service, or a Docker
# container. If the HTTP API is already reachable, the model is
# already served and we should not require a host `ollama` CLI.
# Always launch a fresh ollama under tmux so Stop reliably
# kills it. If the requested port is busy (e.g. a systemd
# ollama on 11434), scan upward for a free one rather than
# silently reattaching to an external service that Stop
# can't reach.
runner_lines.append(f'ODYSSEUS_OLLAMA_HOST={_bash_squote(_ollama_host)}')
runner_lines.append(f'ODYSSEUS_OLLAMA_PORT="{_ollama_port}"')
runner_lines.append('ODYSSEUS_OLLAMA_URL=""')
runner_lines.append('for _ody_ollama_try in $(seq 1 20); do')
runner_lines.append(' for _ody_ollama_port in "$ODYSSEUS_OLLAMA_PORT" 11434; do')
runner_lines.append(' [ -z "$_ody_ollama_port" ] && continue')
runner_lines.append(' for _ody_ollama_host in 127.0.0.1 localhost host.docker.internal; do')
runner_lines.append(' _ody_ollama_url="http://${_ody_ollama_host}:${_ody_ollama_port}"')
runner_lines.append(' if curl -sf "$_ody_ollama_url/api/tags" >/dev/null 2>&1; then')
runner_lines.append(' ODYSSEUS_OLLAMA_URL="$_ody_ollama_url"')
runner_lines.append(' ODYSSEUS_OLLAMA_PORT="$_ody_ollama_port"')
runner_lines.append(' break 3')
runner_lines.append(' fi')
runner_lines.append(' done')
runner_lines.append(' done')
runner_lines.append(' [ "$_ody_ollama_try" -eq 1 ] && echo "[odysseus] Waiting for an existing Ollama API on ports ${ODYSSEUS_OLLAMA_PORT}/11434..."')
runner_lines.append(' sleep 1')
runner_lines.append('done')
runner_lines.append('if [ -n "$ODYSSEUS_OLLAMA_URL" ]; then')
runner_lines.append(' if [ "$ODYSSEUS_OLLAMA_PORT" != "' + _ollama_port + '" ]; then')
runner_lines.append(' echo "[odysseus] Selected Ollama port ' + _ollama_port + ' was not reachable; using running Ollama on port ${ODYSSEUS_OLLAMA_PORT}."')
runner_lines.append('for _ody_off in 0 1 2 3 4 5 6 7 8 9; do')
runner_lines.append(' _ody_try_port=$((ODYSSEUS_OLLAMA_PORT + _ody_off))')
runner_lines.append(' if ! (exec 3<>/dev/tcp/127.0.0.1/$_ody_try_port) 2>/dev/null; then')
runner_lines.append(' exec 3<&-; exec 3>&-')
runner_lines.append(' ODYSSEUS_OLLAMA_PORT="$_ody_try_port"')
runner_lines.append(' break')
runner_lines.append(' fi')
runner_lines.append(' echo "[odysseus] Ollama API ready on port ${ODYSSEUS_OLLAMA_PORT}: ${ODYSSEUS_OLLAMA_URL}"')
runner_lines.append(' echo "[odysseus] This task is monitoring an existing Ollama server; stopping it here will not stop an external Docker/system service."')
runner_lines.append(' exec bash -i')
runner_lines.append('fi')
runner_lines.append(' exec 3<&-; exec 3>&-')
runner_lines.append('done')
runner_lines.append('if ! command -v ollama &>/dev/null; then')
runner_lines.append(' echo "ERROR: Ollama not found and no Ollama API is reachable on 127.0.0.1, localhost, or host.docker.internal (ports ${ODYSSEUS_OLLAMA_PORT}/11434)."')
runner_lines.append(' echo "Install Ollama, start an Ollama service/container on this server, or pick the port where it is already listening."')
runner_lines.append(' echo "ERROR: Ollama not found on this server. Install it from https://ollama.com/download or `curl -fsSL https://ollama.com/install.sh | sh`."')
runner_lines.append(' echo')
runner_lines.append(' echo "=== Process exited with code 127 ==="')
runner_lines.append(' exec bash -i')
@@ -1940,6 +2019,153 @@ def setup_cookbook_routes() -> APIRouter:
return {"models": out}
# Rate-limit for the orphan-tmux adoption sweep. The UI polls
# tasks/status every ~3s; we don't want to SSH every host on every
# poll. 20s is fast enough that a model the agent launched in the
# background shows up "almost immediately" in the UI without being
# wasteful.
# Timestamp of the last sweep, as a time.monotonic() reading. Kept in a
# one-element list so _maybe_sweep_orphans can update it in place (it
# assigns to index 0) without a `global`/`nonlocal` declaration.
_last_orphan_sweep_ts = [0.0]
# Minimum number of seconds between two sweeps.
_ORPHAN_SWEEP_MIN_INTERVAL_S = 20.0
def _maybe_sweep_orphans(tasks: list, state: dict) -> None:
    """Adopt untracked tmux sessions that are running a model server.

    Each configured remote server is queried over ssh for its tmux sessions.
    A session is adopted when its id is not already in ``tasks`` and the
    foreground command of one of its panes is a known model-server process.
    Adopted sessions are appended to ``tasks`` (mutated in place) and the
    state file is rewritten; when nothing is adopted nothing is written.

    Rate-limited through ``_last_orphan_sweep_ts`` and
    ``_ORPHAN_SWEEP_MIN_INTERVAL_S`` so polling UIs don't trigger ssh on
    every refresh.

    NOTE(review): the ssh calls are blocking ``subprocess.run`` invocations;
    callers on an event loop are stalled for their duration.

    Args:
        tasks: Task list from cookbook state; adopted entries are appended.
        state: Full cookbook state; its ``env.servers`` list is scanned.
    """
    import re as _re_orphan
    import shlex
    import subprocess
    import time as _time

    # Runs on every status poll, so keep the bookkeeping at debug level.
    logger.debug(f"_maybe_sweep_orphans: entered, last_ts={_last_orphan_sweep_ts[0]}")
    now = _time.monotonic()
    if now - _last_orphan_sweep_ts[0] < _ORPHAN_SWEEP_MIN_INTERVAL_S:
        logger.debug(f"_maybe_sweep_orphans: rate-limited, {now - _last_orphan_sweep_ts[0]:.1f}s since last")
        return
    _last_orphan_sweep_ts[0] = now

    env = state.get("env") if isinstance(state, dict) else {}
    servers = env.get("servers") if isinstance(env, dict) else []
    logger.info(f"orphan sweep starting: {len(servers) if isinstance(servers, list) else 0} server(s), known_sids={len([t for t in tasks if isinstance(t, dict) and t.get('sessionId')])}")
    if not isinstance(servers, list):
        return

    known_sids = {
        t.get("sessionId") for t in tasks
        if isinstance(t, dict) and t.get("sessionId")
    }
    # Foreground commands that indicate a live model server in the pane.
    live_procs = frozenset({
        "python", "python3", "vllm", "llama-server",
        "llama_cpp_main", "sglang", "lmdeploy",
        "ollama", "node", "uvicorn",
    })
    # ssh joins its arguments with spaces and hands the result to the remote
    # shell, where a bare `#{...}` word starts a comment and tmux would get
    # `-F` with no value. Quote the format so it survives that shell.
    pane_cmd_format = shlex.quote("#{pane_current_command}")

    adopted_any = False
    for srv in servers:
        if not isinstance(srv, dict):
            continue
        host = (srv.get("host") or "").strip()
        if not host:
            continue  # local-only entry; not handled by this sweep
        if not _REMOTE_HOST_RE.match(host):
            continue
        sport = str(srv.get("port") or "").strip()
        ssh_base = ["ssh", "-o", "ConnectTimeout=4", "-o", "StrictHostKeyChecking=no"]
        if sport and sport != "22":
            if not _SSH_PORT_RE.match(sport):
                continue
            ssh_base.extend(["-p", sport])
        try:
            ls = subprocess.run(
                ssh_base + [host, "tmux ls 2>/dev/null"],
                timeout=6, capture_output=True, text=True,
            )
        except Exception:
            continue
        for line in (ls.stdout or "").splitlines():
            # `tmux ls` lines look like "<session>: ..."; keep the id part.
            sid = line.split(":", 1)[0].strip()
            if not sid or not _SESSION_ID_RE.match(sid):
                continue
            if sid in known_sids:
                continue
            # Skip zombie / idle-shell sessions: a session left over from a
            # crashed server just shows a shell prompt, and adopting it would
            # list a "running" task that isn't serving anything.
            try:
                pc = subprocess.run(
                    ssh_base + [host, f"tmux list-panes -t {shlex.quote(sid)} -F {pane_cmd_format}"],
                    timeout=4, capture_output=True, text=True,
                )
                cur = (pc.stdout or "").strip().splitlines()
            except Exception:
                cur = []
            if not any(c in live_procs for c in cur):
                continue
            # Try to recover a plausible repo_id + port from the pane
            # buffer. Cheap heuristic — if we can't, register with
            # placeholder fields; the UI still shows it.
            try:
                cap = subprocess.run(
                    ssh_base + [host, "tmux", "capture-pane", "-t", sid, "-p", "-S", "-300"],
                    timeout=6, capture_output=True, text=True,
                )
                pane = cap.stdout or ""
            except Exception:
                pane = ""
            # First try a "model <value>" occurrence, then fall back to a
            # raw `vllm serve <value>` command line.
            m_model = _re_orphan.search(r"model\s+(\S+)", pane)
            model = m_model.group(1) if m_model else ""
            if not model:
                m_serve = _re_orphan.search(r"vllm\s+serve\s+(\S+)", pane)
                model = m_serve.group(1) if m_serve else f"adopted:{sid}"
            m_port = _re_orphan.search(r"--port\s+(\d+)", pane)
            port = int(m_port.group(1)) if m_port else 0
            tasks.append({
                "id": sid,
                "sessionId": sid,
                "name": model.split("/")[-1] if "/" in model else model,
                "type": "serve",
                "status": "running",
                "output": f"Auto-adopted from orphan tmux session on {host}. "
                          "Open the task to see live output.",
                "ts": int(_time.time() * 1000),
                "payload": {
                    "repo_id": model,
                    "remote_host": host,
                    "_cmd": "(orphan tmux session — original launch cmd unknown)",
                    "port": port,
                },
                "remoteHost": host,
                "sshPort": sport,
                "platform": "linux",
                "_serveReady": False,
                "_endpointAdded": False,
                "_adoptedExternally": True,
            })
            known_sids.add(sid)
            adopted_any = True
            logger.info(f"auto-adopted orphan tmux session {sid!r} on {host}")

    if adopted_any:
        try:
            from core.atomic_io import atomic_write_json
            state["tasks"] = tasks
            atomic_write_json(_cookbook_state_path, state)
        except Exception as e:
            # Best-effort: the adoption still lives in the in-memory list.
            logger.warning(f"orphan sweep: state write failed: {e}")
@router.get("/api/cookbook/tasks/status")
async def cookbook_tasks_status(request: Request):
"""Check status of all active cookbook tmux sessions.
@@ -1993,6 +2219,7 @@ def setup_cookbook_routes() -> APIRouter:
# Load saved tasks from cookbook state
tasks = []
state = {}
if _cookbook_state_path.exists():
try:
state = json.loads(_cookbook_state_path.read_text(encoding="utf-8"))
@@ -2004,6 +2231,21 @@ def setup_cookbook_routes() -> APIRouter:
except Exception:
pass
# Orphan-tmux auto-adoption sweep. When the agent (or anyone)
# SSH-launches a `serve-*` tmux session — usually because
# serve_model rejected `source ... && vllm ...` or because of a
# manual relaunch via tmux send-keys — that session is invisible
# to the cookbook UI even though it's a live model server. The
# sweep finds those orphans on each configured remote host and
# writes them into state.tasks with _adoptedExternally=True, so
# they show up in the UI on the next poll without anyone having
# to remember to call adopt_served_model. Rate-limited via the
# module-level _last_orphan_sweep so we don't SSH every 3s.
try:
_maybe_sweep_orphans(tasks, state)
except Exception as _sweep_e:
logger.warning(f"orphan sweep failed (non-fatal): {_sweep_e!r}")
results = []
for task in tasks:
session_id = task.get("sessionId", "")
@@ -2063,7 +2305,12 @@ def setup_cookbook_routes() -> APIRouter:
if _tport and _tport != "22":
ssh_base.extend(["-p", str(_tport)])
check_cmd = ssh_base + [remote, "tmux", "has-session", "-t", session_id]
capture_cmd = ssh_base + [remote, "tmux", "capture-pane", "-t", session_id, "-p", "-S", "-50"]
# Capture 500 lines (was 50) so a Python traceback survives
# the post-crash neofetch banner + bash prompt that otherwise
# fills the visible tail. Without this, output_tail ends up
# as just "Locale: C / Ubuntu_Odysseus " and the agent
# can't diagnose the actual error.
capture_cmd = ssh_base + [remote, "tmux", "capture-pane", "-t", session_id, "-p", "-S", "-500"]
elif IS_WINDOWS:
# LOCAL Windows task: launched as a detached process (no tmux).
# Liveness comes from the <session>.pid file, output from the
@@ -2072,7 +2319,7 @@ def setup_cookbook_routes() -> APIRouter:
capture_cmd = None
else:
check_cmd = ["tmux", "has-session", "-t", session_id]
capture_cmd = ["tmux", "capture-pane", "-t", session_id, "-p", "-S", "-50"]
capture_cmd = ["tmux", "capture-pane", "-t", session_id, "-p", "-S", "-500"]
local_win_task = (not remote) and IS_WINDOWS
+119
View File
@@ -18,6 +18,119 @@ from routes.prefs_routes import _load_for_user, _save_for_user
logger = logging.getLogger(__name__)
def _maybe_cascade_calendar_event(task) -> None:
"""Delete the linked calendar event when a cookbook_serve task is
removed. Two lookup strategies:
1. PRIMARY `cookbook_event_uid` marker stashed in task.prompt
by cookbookSchedule.js right after creating the event. Direct
UID match, no ambiguity.
2. FALLBACK for tasks created before the marker was wired up
(or when the PATCH to add the marker failed silently), scan
the Cookbook calendar for events whose summary equals the
task name and delete the matches.
Best-effort throughout: errors are logged but never block the task
deletion itself."""
if not task or task.task_type != "action" or task.action != "cookbook_serve":
return
import httpx
from core.middleware import INTERNAL_TOOL_HEADER, INTERNAL_TOOL_TOKEN
headers = {INTERNAL_TOOL_HEADER: INTERNAL_TOOL_TOKEN}
if task.owner:
headers["X-Odysseus-Owner"] = task.owner
# Strategy 1: explicit UID marker in prompt.
event_uid = ""
if task.prompt:
try:
cfg = json.loads(task.prompt)
if isinstance(cfg, dict):
event_uid = (cfg.get("cookbook_event_uid") or "").strip()
except Exception:
pass
def _try_delete(uid: str) -> bool:
try:
with httpx.Client(timeout=10) as client:
r = client.delete(
f"http://localhost:7000/api/calendar/events/{uid}",
headers=headers,
)
if r.status_code >= 400:
logger.info(
f"task delete: cascade calendar event {uid} returned "
f"HTTP {r.status_code}"
)
return False
return True
except Exception as e:
logger.warning(f"task delete: cascade calendar event {uid} failed: {e}")
return False
if event_uid:
_try_delete(event_uid)
return
# Strategy 2: scan the Cookbook calendar for matching summaries.
# Only runs for tasks missing the marker (old tasks or PATCH failures).
if not task.name:
return
try:
with httpx.Client(timeout=10) as client:
# Find the Cookbook calendar.
cal_r = client.get("http://localhost:7000/api/calendar/calendars", headers=headers)
if cal_r.status_code >= 400:
return
cals = (cal_r.json() or {}).get("calendars", [])
cookbook_cal = next(
(c for c in cals if (c.get("name") or "").lower() == "cookbook"),
None,
)
if not cookbook_cal:
return
cal_href = cookbook_cal.get("href") or cookbook_cal.get("id") or ""
# List events in a wide window to catch recurring + upcoming.
from datetime import datetime as _dt, timedelta as _td, timezone as _tz
now = _dt.now(_tz.utc)
start = (now - _td(days=30)).isoformat()
end = (now + _td(days=365)).isoformat()
ev_r = client.get(
"http://localhost:7000/api/calendar/events",
params={"start": start, "end": end, "calendar": cal_href},
headers=headers,
)
if ev_r.status_code >= 400:
return
events = (ev_r.json() or {}).get("events", [])
# Match by exact summary. Tasks named "Serve: <model>" are
# created from the schedule modal; the event's summary mirrors
# the task name 1:1 by design.
target = (task.name or "").strip()
uids_to_delete = set()
for ev in events:
if (ev.get("summary") or "").strip() != target:
continue
uid = ev.get("uid") or ev.get("id") or ""
# Strip the "::occurrence" suffix on recurring expansions —
# we want to delete the MASTER once, not each instance.
if "::" in uid:
uid = uid.split("::", 1)[0]
if uid:
uids_to_delete.add(uid)
for uid in uids_to_delete:
_try_delete(uid)
if uids_to_delete:
logger.info(
f"task delete: cascade matched {len(uids_to_delete)} calendar event(s) "
f"by summary fallback for task {task.id} ({target!r})"
)
except Exception as e:
logger.warning(f"task delete: cascade fallback scan failed: {e}")
class TaskCreate(BaseModel):
name: Optional[str] = None
prompt: Optional[str] = None
@@ -616,6 +729,12 @@ def setup_task_routes(task_scheduler) -> APIRouter:
raise HTTPException(404, "Task not found")
if user and task.owner != user:
raise HTTPException(403, "Access denied")
# Cascade: cookbook_serve tasks may have a linked calendar
# event (created via the "Create event in calendar" toggle
# in the schedule modal). If so, delete the calendar event
# too so the calendar doesn't end up holding a phantom event
# for a task that no longer exists.
_maybe_cascade_calendar_event(task)
db.delete(task)
db.commit()
return {"ok": True}