mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-30 00:22:10 -04:00
cookbook agent debug loop: persistent log files, auto-adopt orphan tmux, Codex/Claude skill parity
Three converging fixes so the chat agent + external Codex/Claude skills can actually debug a crashed serve instead of staring at a post-crash neofetch banner:
* Serves now `tee` to /tmp/odysseus-tmux/SESSION.log on the host running them. The runner saves the original stdout/stderr into fds 3/4 before the tee redirect and restores them right before `exec ${SHELL}`, so the post-crash interactive zsh banner does NOT pollute the log file.
* `tail_serve_output` (chat agent) and `/api/codex/cookbook/output/{sid}` (Codex+Claude skills) both prefer the persistent log file over the tmux pane. Pane is fallback for sessions predating the tee runner. Default tail bumped 150 -> 400.
* `list_served_models` "recent log" snippet seeks to the Traceback line instead of showing the last 6 lines (which was always the bash prompt).
Cookbook auto-adoption sweep on `/api/cookbook/tasks/status`: every 20s (rate-limited) the cookbook SSHes each configured server, finds `serve-*` / `cookbook-*` tmux sessions running an actual model process (vllm/python/llama-server/etc., filtered via `pane_current_command`), and writes them into state.tasks. So when the agent falls back to raw ssh+tmux, the session appears in the Cookbook UI on the next poll.
`serve_model` error path now reads `data["detail"]` in addition to `data["error"]` so the FastAPI HTTPException message ("Invalid characters in cmd") actually reaches the agent instead of being swallowed as a generic "Serve failed". Tool description updated to warn against `cd …`/`source …`/`&&` prefixes.
Intent-without-action supervisor in agent_loop: when the model writes "Let me tail the output" / "I'll check the logs" / "Let me investigate" and ends the turn without emitting a tool call, the loop injects a sharp system nudge ("You said you would X — DO IT NOW") and continues. Capped at 2 nudges per chat so a model that genuinely cannot use the tool does not pin the loop.
Codex/Claude skill parity: adds `/cookbook/cached`, `/cookbook/presets`, `/cookbook/preset/{name}`, `/cookbook/adopt` so external agents have the same surface as the chat agent. SKILL.md docs + odysseus_api.py wrapper updated for both bundles.
`adopt_served_model` promoted to the always-on tool set so the agent has a documented fallback when serve_model rejects a cmd.
Also various cookbook UI tweaks accumulated alongside the above (cookbook.js, cookbookRunning.js, cookbookServe.js, cookbook-diagnosis.js, settings.js, style.css).
This commit is contained in:
@@ -19,6 +19,8 @@ from src.auth_helpers import require_user
|
||||
from src.tool_implementations import do_manage_notes
|
||||
|
||||
|
||||
COOKBOOK_READ_SCOPES = {"cookbook:read", "cookbook:launch"}
|
||||
COOKBOOK_LAUNCH_SCOPES = {"cookbook:launch"}
|
||||
TODO_READ_SCOPES = {"todos:read", "todos:write"}
|
||||
TODO_WRITE_SCOPES = {"todos:write"}
|
||||
EMAIL_READ_SCOPES = {"email:read", "email:draft", "email:send"}
|
||||
@@ -130,6 +132,11 @@ def setup_codex_routes(
|
||||
"actions": ["library", "read", "create", "delete"],
|
||||
"available": documents_library_endpoint is not None,
|
||||
},
|
||||
"cookbook": {
|
||||
"read": scoped(COOKBOOK_READ_SCOPES),
|
||||
"launch": scoped(COOKBOOK_LAUNCH_SCOPES),
|
||||
"actions": ["tasks", "servers", "output", "serve", "stop"],
|
||||
},
|
||||
},
|
||||
"safety": {
|
||||
"email_send_requires_confirmation": True,
|
||||
@@ -373,6 +380,374 @@ def setup_codex_routes(
|
||||
raise HTTPException(400, f"Invalid document payload: {exc}")
|
||||
return await _as_owner(request, owner, documents_create_endpoint, request, req)
|
||||
|
||||
# ── Cookbook surface ──
|
||||
# Lets the agent run the same launch / monitor / kill loop the user
|
||||
# would do by hand in the Cookbook UI: read the current task list +
|
||||
# tmux output, launch a serve task, stop one. Two scopes:
|
||||
# cookbook:read — list tasks + tail output + list servers
|
||||
# cookbook:launch — also start/stop serves (host shell exec)
|
||||
# `cookbook:launch` is genuinely powerful: /api/model/serve runs SSH'd
|
||||
# commands on the user's hosts. The existing _validate_serve_cmd
|
||||
# allowlist (vllm/python3/sglang/llama-server/etc., no shell metachars)
|
||||
# keeps the agent inside the same sandbox the UI uses.
|
||||
|
||||
async def _run_shell(cmd: str, timeout: float = 15.0) -> dict:
    """Run a shell command and return ``{exit_code, stdout, stderr}``.

    Never raises. Any failure to spawn or read the process is reported as
    ``exit_code == -1`` with the exception text in ``stderr``; a timeout is
    reported as ``exit_code == -1`` with ``stderr == "timed out"``.

    Args:
        cmd: Command line handed to the system shell.
        timeout: Seconds to wait for the command to finish.
    """
    import asyncio as _asyncio
    import contextlib as _contextlib

    try:
        proc = await _asyncio.create_subprocess_shell(
            cmd,
            stdout=_asyncio.subprocess.PIPE,
            stderr=_asyncio.subprocess.PIPE,
        )
        try:
            stdout_b, stderr_b = await _asyncio.wait_for(proc.communicate(), timeout=timeout)
        except _asyncio.TimeoutError:
            # The process may have exited between the timeout firing and the
            # kill; that race must still be reported as a timeout.
            with _contextlib.suppress(ProcessLookupError):
                proc.kill()
            # Reap the child so a killed process does not linger as a zombie.
            await proc.wait()
            return {"exit_code": -1, "stdout": "", "stderr": "timed out"}
        return {
            "exit_code": proc.returncode,
            "stdout": stdout_b.decode(errors="replace"),
            "stderr": stderr_b.decode(errors="replace"),
        }
    except Exception as exc:
        # Deliberate catch-all: callers rely on a result dict, not an exception.
        return {"exit_code": -1, "stdout": "", "stderr": str(exc)}
|
||||
|
||||
def _read_cookbook_state() -> dict:
    """Load ``cookbook_state.json`` from ``$DATA_DIR`` (default ``data``).

    Returns an empty dict when the file is missing, unreadable, not valid
    JSON, or when its top-level value is not a JSON object, so callers can
    always use ``state.get(...)``.
    """
    import json as _json
    import os as _os
    from pathlib import Path as _Path

    p = _Path(_os.environ.get("DATA_DIR", "data")) / "cookbook_state.json"
    try:
        loaded = _json.loads(p.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: bad JSON or bad UTF-8.
        return {}
    # A list or scalar at the top level would break every `.get` caller.
    return loaded if isinstance(loaded, dict) else {}
|
||||
|
||||
def _redact_task(t: dict) -> dict:
    """Return a copy of task ``t`` without secret-bearing keys.

    ``hf_token`` and ``_secrets`` are dropped from the task itself and, when
    ``payload`` is a dict, from the payload too. The input is not mutated.
    """
    secret_keys = ("hf_token", "_secrets")

    redacted = {}
    for key, value in t.items():
        if key in secret_keys:
            continue
        redacted[key] = value

    payload = redacted.get("payload")
    if isinstance(payload, dict):
        redacted["payload"] = {
            key: value for key, value in payload.items() if key not in secret_keys
        }
    return redacted
|
||||
|
||||
@router.get("/cookbook/tasks")
async def codex_cookbook_tasks(request: Request):
    """List the tasks recorded in cookbook state, with secrets redacted.

    Requires one of ``COOKBOOK_READ_SCOPES``. Entries that are not JSON
    objects are skipped instead of failing the whole request.
    """
    _scope_owner(request, COOKBOOK_READ_SCOPES)
    state = _read_cookbook_state()
    tasks = (state.get("tasks") if isinstance(state, dict) else None) or []
    return {"tasks": [_redact_task(t) for t in tasks if isinstance(t, dict)]}
|
||||
|
||||
@router.get("/cookbook/servers")
async def codex_cookbook_servers(request: Request):
    """List configured cookbook servers, exposing only non-secret fields.

    Requires one of ``COOKBOOK_READ_SCOPES``. Tolerates a missing or null
    ``env`` section and skips server entries that are not JSON objects.
    """
    _scope_owner(request, COOKBOOK_READ_SCOPES)
    state = _read_cookbook_state()
    env = state.get("env") if isinstance(state, dict) else None
    servers = (env.get("servers") if isinstance(env, dict) else None) or []
    # Strip ssh creds / passwords; keep only what's needed to pick a host.
    public_fields = ("name", "host", "port", "env", "envPath", "platform", "modelDirs")
    cleaned = [
        {field: s.get(field) for field in public_fields}
        for s in servers
        if isinstance(s, dict)
    ]
    return {"servers": cleaned}
|
||||
|
||||
@router.get("/cookbook/output/{session_id}")
async def codex_cookbook_output(request: Request, session_id: str, tail: int = 400):
    """Return the last ``tail`` lines of output for a tracked cookbook task.

    Requires one of ``COOKBOOK_READ_SCOPES``. ``tail`` is clamped to
    20..4000. The task's host and ssh port come from cookbook state.

    Raises:
        HTTPException 400: malformed session id, or the stored host / ssh
            port is not safe to put on an ssh command line.
        HTTPException 404: no tracked task has this session id.
    """
    _scope_owner(request, COOKBOOK_READ_SCOPES)
    import re as _re
    import shlex

    # Defensive: session_id must be the tmux-style id we issue
    # (`serve-XXXX` / `cookbook-XXXX` / `queue-XXXX`); anything else
    # would let the agent run arbitrary `tmux capture-pane` targets.
    if not _re.fullmatch(r"[a-zA-Z0-9_-]+", session_id):
        raise HTTPException(400, "Invalid session id")
    tail = max(20, min(int(tail or 400), 4000))

    # Resolve the task's host (if any) from cookbook state so we can
    # ssh to the right box.
    state = _read_cookbook_state()
    tasks = (state.get("tasks") if isinstance(state, dict) else None) or []
    task = next(
        (t for t in tasks if isinstance(t, dict) and t.get("sessionId") == session_id),
        None,
    )
    if task is None:
        raise HTTPException(404, "task not found")

    # str() first: state may hold the port as a JSON number.
    host = str(task.get("remoteHost") or "").strip()
    ssh_port = str(task.get("sshPort") or "").strip()
    if host.startswith("-"):
        # A leading dash would be parsed by ssh as an option, not a host.
        raise HTTPException(400, "Invalid remote host recorded for task")
    if ssh_port and not _re.fullmatch(r"[0-9]{1,5}", ssh_port):
        raise HTTPException(400, "Invalid ssh port recorded for task")

    # Prefer the persisted log file over the tmux pane. The pane gets
    # overwritten by the post-crash neofetch banner + bash prompt the
    # moment vllm exits; the log file is the raw stdout/stderr and
    # survives unchanged. Falls back to pane for older tasks predating
    # the tee-to-log runner change.
    log_path = f"/tmp/odysseus-tmux/{session_id}.log"
    inner = (
        f"if [ -s {log_path} ]; then tail -n {tail} {log_path}; "
        f"else tmux capture-pane -t {session_id} -p -S -{tail}; fi"
    )
    if host:
        port_flag = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else ""
        # Quote the host as well as the remote command: the host comes from
        # the state file and is passed through the local shell.
        cmd = f"ssh {port_flag}{shlex.quote(host)} {shlex.quote(inner)}"
    else:
        cmd = inner

    result = await _run_shell(cmd, timeout=15)
    return {
        "session_id": session_id,
        "host": host or "local",
        "exit_code": result.get("exit_code"),
        "output": result.get("stdout", ""),
        "task": _redact_task(task),
    }
|
||||
|
||||
@router.post("/cookbook/serve")
async def codex_cookbook_serve(request: Request, body: dict[str, Any] = Body(default_factory=dict)):
    """Launch a serve task by delegating to the ``POST /api/model/serve`` handler.

    Requires one of ``COOKBOOK_LAUNCH_SCOPES``. The body is normalised
    (``host`` -> ``remote_host``, ``model`` -> ``repo_id``) and validated as
    a ``ServeRequest`` before delegation.

    Raises:
        HTTPException 400: the body does not form a valid ``ServeRequest``.
        HTTPException 503: the serve endpoint could not be located.
    """
    _scope_owner(request, COOKBOOK_LAUNCH_SCOPES)
    # Wraps /api/model/serve with the SAME validation the UI uses.
    # _validate_serve_cmd (called inside model_serve) rejects shell
    # metachars and requires the leading binary to be in the
    # cookbook allowlist (vllm / python3 / sglang / llama-server / ...).
    from routes.cookbook_helpers import ServeRequest

    # Accept friendly aliases agents naturally reach for. Without these,
    # passing `host` silently maps to nothing and the serve runs LOCAL
    # instead of on the intended remote — exactly the bug an agent
    # would never debug on its own.
    norm = dict(body or {})
    if "host" in norm and "remote_host" not in norm:
        norm["remote_host"] = norm.pop("host")
    if "model" in norm and "repo_id" not in norm:
        norm["repo_id"] = norm.pop("model")
    # `port` is deliberately NOT remapped to `ssh_port`: it is ambiguous
    # (model ports such as 8000 / 30000 belong inside the cmd string), so it
    # is passed through unchanged for ServeRequest to accept or reject.

    try:
        req = ServeRequest(**norm)
    except Exception as exc:
        raise HTTPException(400, f"Invalid serve payload: {exc}") from exc

    serve_endpoint = _find_endpoint(None, "POST", "/api/model/serve")
    # Fall back to scanning the routes registered on the running app.
    if serve_endpoint is None:
        for route in request.app.routes:
            if getattr(route, "path", None) == "/api/model/serve" and "POST" in getattr(route, "methods", set()):
                serve_endpoint = route.endpoint
                break
    if serve_endpoint is None:
        raise HTTPException(503, "model serve endpoint unavailable")
    return await serve_endpoint(request, req)
|
||||
|
||||
@router.post("/cookbook/stop/{session_id}")
async def codex_cookbook_stop(request: Request, session_id: str):
    """Kill the tmux session behind a cookbook task.

    Requires one of ``COOKBOOK_LAUNCH_SCOPES``. If the session id is tracked
    in cookbook state, its recorded host / ssh port decide where
    ``tmux kill-session`` runs; otherwise it runs locally.

    Raises:
        HTTPException 400: malformed session id, or the stored host / ssh
            port is not safe to put on an ssh command line.
    """
    _scope_owner(request, COOKBOOK_LAUNCH_SCOPES)
    import re as _re
    import shlex

    if not _re.fullmatch(r"[a-zA-Z0-9_-]+", session_id):
        raise HTTPException(400, "Invalid session id")

    state = _read_cookbook_state()
    tasks = (state.get("tasks") if isinstance(state, dict) else None) or []
    task = next(
        (t for t in tasks if isinstance(t, dict) and t.get("sessionId") == session_id),
        None,
    ) or {}

    # str() first: state may hold the port as a JSON number.
    host = str(task.get("remoteHost") or "").strip()
    ssh_port = str(task.get("sshPort") or "").strip()
    if host.startswith("-"):
        # A leading dash would be parsed by ssh as an option, not a host.
        raise HTTPException(400, "Invalid remote host recorded for task")
    if ssh_port and not _re.fullmatch(r"[0-9]{1,5}", ssh_port):
        raise HTTPException(400, "Invalid ssh port recorded for task")

    kill = f"tmux kill-session -t {session_id}"
    if host:
        port_flag = f"-p {ssh_port} " if ssh_port and ssh_port != "22" else ""
        # The host comes from the state file and passes through the local
        # shell, so it is quoted along with the remote command.
        cmd = f"ssh {port_flag}{shlex.quote(host)} {shlex.quote(kill)}"
    else:
        cmd = kill

    result = await _run_shell(cmd, timeout=10)
    return {"session_id": session_id, "exit_code": result.get("exit_code"), "host": host or "local"}
|
||||
|
||||
@router.get("/cookbook/cached")
async def codex_cookbook_cached(request: Request, host: str | None = None):
    """Inventory of cached models on one configured server.

    ``host`` may be a server's friendly name or its real host; when omitted,
    the server entry with an empty host is used. Delegates to the
    ``GET /api/model/cached`` handler, passing the model dirs, ssh port and
    platform recorded for that server in cookbook state.
    """
    _scope_owner(request, COOKBOOK_READ_SCOPES)

    state = _read_cookbook_state()
    env = state.get("env") if isinstance(state, dict) else {}
    servers = (env.get("servers") if isinstance(env, dict) else None) or []
    HF_DEFAULTS = {"~/.cache/huggingface/hub", "~/.cache/huggingface"}

    def _dirs_for(srv: dict) -> str:
        # Comma-joined extra model dirs, leaving out the HF default caches.
        mds = srv.get("modelDirs") if isinstance(srv, dict) else None
        if isinstance(mds, list):
            return ",".join(
                d for d in mds
                if isinstance(d, str) and d.strip() and d.strip() not in HF_DEFAULTS
            )
        if isinstance(mds, str) and mds.strip() not in HF_DEFAULTS:
            return mds
        return ""

    # Pick the matching server entry: by name/host when `host` is given,
    # otherwise the entry without a host.
    entries = [s for s in servers if isinstance(s, dict)]
    if host:
        srv: dict[str, Any] = next(
            (s for s in entries if s.get("name") == host or s.get("host") == host),
            {},
        )
        resolved_host = srv["host"] if srv and srv.get("host") else host
    else:
        srv = next((s for s in entries if not (s.get("host") or "").strip()), {})
        resolved_host = ""

    model_dir = _dirs_for(srv)
    ssh_port = str(srv["port"]) if srv.get("port") else None
    platform = srv["platform"] if srv.get("platform") else None

    cached_endpoint = _find_endpoint(None, "GET", "/api/model/cached")
    if cached_endpoint is None:
        from fastapi import FastAPI
        app: FastAPI = request.app
        cached_endpoint = next(
            (
                route.endpoint
                for route in app.routes
                if getattr(route, "path", None) == "/api/model/cached"
                and "GET" in getattr(route, "methods", set())
            ),
            None,
        )
    if cached_endpoint is None:
        raise HTTPException(503, "model cached endpoint unavailable")

    # The endpoint reads host/model_dir/ssh_port/platform as kwargs.
    return await cached_endpoint(
        request,
        host=resolved_host or None,
        model_dir=model_dir or None,
        ssh_port=ssh_port or None,
        platform=platform or None,
    )
|
||||
|
||||
@router.get("/cookbook/presets")
async def codex_cookbook_presets(request: Request):
    """Return the saved serve presets plus the default server name.

    Each preset is reduced to name / model / host / port / cmd; entries that
    are not dicts are ignored. Check these before composing a `serve` body,
    since a saved preset carries the cmd the user stored.
    """
    _scope_owner(request, COOKBOOK_READ_SCOPES)
    state = _read_cookbook_state()

    summaries = [
        {
            "name": preset.get("name"),
            "model": preset.get("model") or preset.get("modelId"),
            "host": preset.get("host") or preset.get("remoteHost"),
            "port": preset.get("port"),
            "cmd": preset.get("cmd"),
        }
        for preset in (state.get("presets") or [])
        if isinstance(preset, dict)
    ]
    env = state.get("env") or {}
    return {"presets": summaries, "default_host": env.get("defaultServer", "")}
|
||||
|
||||
@router.post("/cookbook/preset/{name}")
async def codex_cookbook_serve_preset(request: Request, name: str):
    """Launch a saved preset by name, reusing its stored cmd and host.

    Requires one of ``COOKBOOK_LAUNCH_SCOPES``. Matching is
    case-insensitive: an exact name match wins, otherwise the first preset
    whose name contains ``name`` is used.

    Raises:
        HTTPException 400: invalid or blank name, preset without a
            launchable cmd, or a payload ``ServeRequest`` rejects.
        HTTPException 404: no preset matches.
        HTTPException 503: the serve endpoint could not be located.
    """
    _scope_owner(request, COOKBOOK_LAUNCH_SCOPES)
    import re as _re

    if not _re.fullmatch(r"[A-Za-z0-9 _.:@\-]+", name):
        raise HTTPException(400, "Invalid preset name")
    lname = name.lower().strip()
    if not lname:
        # A whitespace-only name passes the regex but strips to "". The
        # substring fallback below would then match (and launch) the first
        # preset, because "" is contained in every string.
        raise HTTPException(400, "Invalid preset name")

    state = _read_cookbook_state()
    presets = [p for p in (state.get("presets") or []) if isinstance(p, dict)]
    chosen = next((p for p in presets if (p.get("name") or "").lower() == lname), None)
    if chosen is None:
        chosen = next((p for p in presets if lname in (p.get("name") or "").lower()), None)
    if chosen is None:
        raise HTTPException(404, f"No preset matching {name!r}")

    repo_id = chosen.get("model") or chosen.get("modelId") or ""
    cmd = (chosen.get("cmd") or "").strip()
    host = chosen.get("host") or chosen.get("remoteHost") or ""
    # Adopted sessions store a "(adopted …" placeholder instead of a real cmd.
    if not repo_id or not cmd or cmd.startswith("(adopted"):
        raise HTTPException(400, f"Preset {chosen.get('name')!r} has no launchable cmd "
                                 "(adopted from external launch). Use POST /cookbook/serve "
                                 "with the actual cmd instead.")

    # Build the same request object the /cookbook/serve handler builds.
    from routes.cookbook_helpers import ServeRequest
    body = {"repo_id": repo_id, "cmd": cmd}
    if host:
        body["remote_host"] = host
    try:
        req = ServeRequest(**body)
    except Exception as exc:
        raise HTTPException(400, f"Preset payload invalid: {exc}") from exc

    serve_endpoint = _find_endpoint(None, "POST", "/api/model/serve")
    if serve_endpoint is None:
        # Fall back to scanning the routes registered on the running app.
        for route in request.app.routes:
            if getattr(route, "path", None) == "/api/model/serve" and "POST" in getattr(route, "methods", set()):
                serve_endpoint = route.endpoint
                break
    if serve_endpoint is None:
        raise HTTPException(503, "model serve endpoint unavailable")
    return await serve_endpoint(request, req)
|
||||
|
||||
@router.post("/cookbook/adopt")
async def codex_cookbook_adopt(request: Request, body: dict[str, Any] = Body(default_factory=dict)):
    """Adopt an existing tmux session (one started via raw ssh+tmux) into
    cookbook tracking. Needed when serve_model rejects a cmd and the
    agent falls back to direct ssh — without adoption the session is
    invisible to the UI. Body: {tmux_session, model, host?, port?}.

    Requires one of ``COOKBOOK_LAUNCH_SCOPES``. Also accepts the aliases
    ``session_id``, ``repo_id`` and ``remote_host``.

    Raises:
        HTTPException 400: missing/invalid session, model, host or port.
        HTTPException 404: the tmux session does not exist on the target.
        HTTPException 500: cookbook state could not be read or written.
    """
    _scope_owner(request, COOKBOOK_LAUNCH_SCOPES)
    import json as _json
    import os as _os
    import re as _re
    import shlex
    import time as _t
    from pathlib import Path as _Path

    norm = dict(body or {})
    # str() first: JSON bodies may carry numbers where strings are expected;
    # those should fail validation below, not raise AttributeError.
    sess = str(norm.get("tmux_session") or norm.get("session_id") or "").strip()
    model = str(norm.get("model") or norm.get("repo_id") or "").strip()
    host = str(norm.get("host") or norm.get("remote_host") or "").strip()
    if not sess or not _re.fullmatch(r"[a-zA-Z0-9_-]+", sess):
        raise HTTPException(400, "tmux_session required, [a-zA-Z0-9_-]+ only")
    if not model:
        raise HTTPException(400, "model required")
    if host.startswith("-"):
        # shlex.quote does not stop ssh from parsing a leading-dash "host"
        # as an option (e.g. -oProxyCommand=...).
        raise HTTPException(400, "Invalid host")
    try:
        port = int(norm.get("port") or 8000)
    except (TypeError, ValueError):
        raise HTTPException(400, "port must be an integer") from None

    # Verify the tmux session exists on the target host before adopting.
    probe = f"tmux has-session -t {shlex.quote(sess)}"
    check = f"ssh {shlex.quote(host)} {shlex.quote(probe)}" if host else probe
    chk = await _run_shell(check, timeout=8)
    if chk.get("exit_code") not in (0, None):
        raise HTTPException(404, f"tmux session {sess!r} not found on {host or 'local'}")

    # Write into cookbook_state.json, using the same DATA_DIR-based location
    # that _read_cookbook_state reads in this file, so the adopted task is
    # visible to the read endpoints.
    # NOTE(review): this was hard-coded to /app/data/cookbook_state.json —
    # confirm DATA_DIR resolves to the same file in deployment.
    from core.atomic_io import atomic_write_json
    cookbook_state_path = _Path(_os.environ.get("DATA_DIR", "data")) / "cookbook_state.json"
    try:
        state = _json.loads(cookbook_state_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        state = {}
    except (OSError, ValueError) as exc:
        # Do NOT fall back to {} here: writing it back would replace the
        # existing env/presets/tasks with just the adopted task.
        raise HTTPException(500, f"state read failed: {exc}") from exc
    if not isinstance(state, dict):
        raise HTTPException(500, "state read failed: cookbook state is not a JSON object")

    tasks = state.get("tasks")
    if tasks is None:
        tasks = state["tasks"] = []
    elif not isinstance(tasks, list):
        raise HTTPException(500, "state read failed: 'tasks' is not a list")

    if any(isinstance(t, dict) and t.get("sessionId") == sess for t in tasks):
        return {"ok": True, "already_tracked": True, "session_id": sess}

    tasks.append({
        "id": sess, "sessionId": sess,
        "name": model.split("/")[-1] if "/" in model else model,
        "type": "serve", "status": "running",
        "output": f"Adopted externally-launched session {sess!r} on {host or 'local'}.",
        "ts": int(_t.time() * 1000),
        "payload": {"repo_id": model, "remote_host": host, "_cmd": "(adopted — launched outside cookbook)", "port": port},
        "remoteHost": host, "sshPort": "", "platform": "linux",
        "_serveReady": False, "_endpointAdded": False, "_adoptedExternally": True,
    })
    try:
        atomic_write_json(cookbook_state_path, state)
    except Exception as exc:
        raise HTTPException(500, f"state write failed: {exc}") from exc
    return {"ok": True, "session_id": sess, "host": host or "local"}
|
||||
|
||||
return router
|
||||
|
||||
|
||||
|
||||
@@ -546,6 +546,13 @@ def _append_serve_preflight_exit_lines(runner_lines: list[str], *, keep_shell_op
|
||||
runner_lines.append('if [ -n "$ODYSSEUS_PREFLIGHT_EXIT" ]; then')
|
||||
runner_lines.append(' echo ""; echo "=== Process exited with code $ODYSSEUS_PREFLIGHT_EXIT ==="')
|
||||
if keep_shell_open:
|
||||
# Decouple the post-crash interactive shell from the persistent log
|
||||
# file. fds 3/4 were saved BEFORE the tee redirect at the top of
|
||||
# the runner; restoring them here means the neofetch banner the
|
||||
# user's .zshrc prints lands on the tmux pane only, not in the
|
||||
# log file the agent's tail_serve_output reads.
|
||||
runner_lines.append(' exec 1>&3 2>&4 3>&- 4>&- 2>/dev/null || true')
|
||||
runner_lines.append(' sleep 0.2 # let tee child flush + exit')
|
||||
runner_lines.append(' exec "${SHELL:-/bin/bash}"')
|
||||
else:
|
||||
runner_lines.append(' exit "$ODYSSEUS_PREFLIGHT_EXIT"')
|
||||
@@ -563,7 +570,11 @@ def _append_serve_exit_code_lines(
|
||||
if is_pip_install:
|
||||
runner_lines.append('if [ $ODYSSEUS_CMD_EXIT -eq 0 ]; then echo ""; echo "DOWNLOAD_OK"; fi')
|
||||
if keep_shell_open:
|
||||
runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="; exec "${SHELL:-/bin/bash}"')
|
||||
runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="')
|
||||
# See preflight branch above for the rationale on restoring fds 3/4.
|
||||
runner_lines.append('exec 1>&3 2>&4 3>&- 4>&- 2>/dev/null || true')
|
||||
runner_lines.append('sleep 0.2 # let tee child flush + exit')
|
||||
runner_lines.append('exec "${SHELL:-/bin/bash}"')
|
||||
else:
|
||||
runner_lines.append('echo ""; echo "=== Process exited with code $ODYSSEUS_CMD_EXIT ==="')
|
||||
runner_lines.append('exit "$ODYSSEUS_CMD_EXIT"')
|
||||
|
||||
+183
-2
@@ -998,6 +998,21 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
else:
|
||||
# ── Linux/Termux: bash + tmux (existing flow) ──
|
||||
runner_lines = ["#!/bin/bash"]
|
||||
# Mirror every line of stdout+stderr into a persistent log file
|
||||
# on the host running the serve. This is the file tail_serve_output
|
||||
# reads when the tmux pane has been overwritten by the post-crash
|
||||
# bash prompt — without it, the agent's diagnostic tool sees the
|
||||
# neofetch banner instead of the actual Python traceback.
|
||||
# We save the original fds to 3/4 so we can RESTORE them before
|
||||
# `exec ${SHELL}` at the end of the script. Without that restore,
|
||||
# the post-crash interactive shell's neofetch banner ALSO gets
|
||||
# teed into the log file and `tail -N` returns ONLY the banner —
|
||||
# the actual traceback ends up earlier than the tail window.
|
||||
runner_lines.append("mkdir -p /tmp/odysseus-tmux 2>/dev/null || true")
|
||||
runner_lines.append("exec 3>&1 4>&2")
|
||||
runner_lines.append(
|
||||
f"exec > >(tee -a /tmp/odysseus-tmux/{session_id}.log) 2>&1"
|
||||
)
|
||||
runner_lines.extend(_user_shell_path_bootstrap())
|
||||
runner_lines.append('ODYSSEUS_PREFLIGHT_EXIT=""')
|
||||
# Put Odysseus's own venv bin on PATH (local runs only) so the serve
|
||||
@@ -1940,6 +1955,151 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
|
||||
return {"models": out}
|
||||
|
||||
# Rate-limit for the orphan-tmux adoption sweep. The UI polls
|
||||
# tasks/status every ~3s; we don't want to SSH every host on every
|
||||
# poll. 20s is fast enough that a model the agent launched in the
|
||||
# background shows up "almost immediately" in the UI without being
|
||||
# wasteful.
|
||||
_last_orphan_sweep_ts = [0.0]
|
||||
_ORPHAN_SWEEP_MIN_INTERVAL_S = 20.0
|
||||
|
||||
def _maybe_sweep_orphans(tasks: list, state: dict) -> None:
    """Adopt untracked `serve-*` / `cookbook-*` tmux sessions into ``tasks``.

    For every configured server with a remote host, lists its tmux sessions
    over ssh and appends a task entry for each session that is not already
    tracked and whose pane is currently running a model-server process.

    ``tasks`` is mutated in place (the caller iterates it afterwards).
    State is written to disk only if at least one session was adopted.
    Rate-limited to one sweep per ``_ORPHAN_SWEEP_MIN_INTERVAL_S`` seconds so
    polling UIs don't trigger SSH on every refresh.

    NOTE(review): the ssh calls use blocking ``subprocess.run`` and this
    function is invoked from an async request handler; a slow host can stall
    it for the duration of the timeouts below.
    """
    import re as _re_orphan
    import shlex
    import subprocess
    import time as _time

    now = _time.monotonic()
    since_last = now - _last_orphan_sweep_ts[0]
    if since_last < _ORPHAN_SWEEP_MIN_INTERVAL_S:
        # debug, not info: this path is hit on every UI poll.
        logger.debug("_maybe_sweep_orphans: rate-limited, %.1fs since last", since_last)
        return
    _last_orphan_sweep_ts[0] = now

    env = state.get("env") if isinstance(state, dict) else {}
    servers = env.get("servers") if isinstance(env, dict) else []
    if not isinstance(servers, list):
        return

    known_sids = {
        t.get("sessionId") for t in tasks
        if isinstance(t, dict) and t.get("sessionId")
    }
    logger.debug("orphan sweep starting: %d server(s), known_sids=%d",
                 len(servers), len(known_sids))

    # Loop invariants, hoisted out of the per-session loop.
    LIVE_PROCS = {"python", "python3", "vllm", "llama-server",
                  "llama_cpp_main", "sglang", "lmdeploy",
                  "ollama", "node", "uvicorn"}
    model_re = _re_orphan.compile(r"model\s+(\S+)")
    serve_re = _re_orphan.compile(r"vllm\s+serve\s+(\S+)")
    port_re = _re_orphan.compile(r"--port\s+(\d+)")

    def _ssh_stdout(ssh_argv: list, remote_cmd: str, timeout: float) -> str:
        # Run one remote command; any failure (timeout, missing ssh, ...)
        # yields "" so the caller simply finds nothing to adopt.
        try:
            done = subprocess.run(
                ssh_argv + [remote_cmd],
                timeout=timeout, capture_output=True, text=True,
            )
        except (OSError, subprocess.SubprocessError):
            return ""
        return done.stdout or ""

    adopted_any = False
    for srv in servers:
        if not isinstance(srv, dict):
            continue
        host = (srv.get("host") or "").strip()
        if not host:
            continue  # local-only entry; the /proc scan handles it
        if not _REMOTE_HOST_RE.match(host):
            continue
        sport = str(srv.get("port") or "").strip()
        # NOTE(review): StrictHostKeyChecking=no disables host-key
        # verification for these probes.
        ssh_base = ["ssh", "-o", "ConnectTimeout=4", "-o", "StrictHostKeyChecking=no"]
        if sport and sport != "22":
            if not _SSH_PORT_RE.match(sport):
                continue
            ssh_base.extend(["-p", sport])
        ssh_argv = ssh_base + [host]

        listing = _ssh_stdout(ssh_argv, "tmux ls 2>/dev/null", 6)
        for line in listing.splitlines():
            sid = line.split(":", 1)[0].strip()
            if not sid or not _SESSION_ID_RE.match(sid):
                continue
            # Only adopt sessions that LOOK like model serves; ignore
            # bare numeric tmux sessions and unrelated work.
            if not sid.startswith(("serve-", "cookbook-")):
                continue
            if sid in known_sids:
                continue

            # Skip zombie / idle-shell sessions. A tmux session left
            # over from a crashed vllm just shows a bash prompt —
            # adopting it would pollute the UI with "running" tasks
            # that aren't actually serving anything. pane_current_command
            # is the foreground process in the pane right now.
            #
            # ssh joins its command arguments with spaces and hands the
            # result to the remote shell, so the format string MUST be
            # shell-quoted: an unquoted `#{...}` starts a shell comment,
            # `-F` loses its argument and tmux prints nothing, which made
            # every session look idle.
            panes_cmd = (
                f"tmux list-panes -t {shlex.quote(sid)} "
                f"-F {shlex.quote('#{pane_current_command}')}"
            )
            cur = _ssh_stdout(ssh_argv, panes_cmd, 4).strip().splitlines()
            if not any(c in LIVE_PROCS for c in cur):
                continue

            # Try to recover a plausible repo_id + port from the
            # pane buffer. Cheap heuristic — if we can't, register
            # with placeholder fields; the UI still shows it.
            pane = _ssh_stdout(
                ssh_argv, f"tmux capture-pane -t {shlex.quote(sid)} -p -S -300", 6
            )
            m_model = model_re.search(pane)
            model = m_model.group(1) if m_model else ""
            if not model:
                # Fall back to the raw `vllm serve <model>` command line.
                m_serve = serve_re.search(pane)
                model = m_serve.group(1) if m_serve else f"adopted:{sid}"
            m_port = port_re.search(pane)
            port = int(m_port.group(1)) if m_port else 0

            tasks.append({
                "id": sid,
                "sessionId": sid,
                "name": model.split("/")[-1] if "/" in model else model,
                "type": "serve",
                "status": "running",
                "output": f"Auto-adopted from orphan tmux session on {host}. "
                          "Open the task to see live output.",
                "ts": int(_time.time() * 1000),
                "payload": {
                    "repo_id": model,
                    "remote_host": host,
                    "_cmd": "(orphan tmux session — original launch cmd unknown)",
                    "port": port,
                },
                "remoteHost": host,
                "sshPort": sport,
                "platform": "linux",
                "_serveReady": False,
                "_endpointAdded": False,
                "_adoptedExternally": True,
            })
            known_sids.add(sid)
            adopted_any = True
            logger.info("auto-adopted orphan tmux session %r on %s", sid, host)

    if adopted_any:
        try:
            from core.atomic_io import atomic_write_json
            state["tasks"] = tasks
            atomic_write_json(_cookbook_state_path, state)
        except Exception as e:
            # Best-effort: the in-memory task list is still returned to the UI.
            logger.warning("orphan sweep: state write failed: %s", e)
|
||||
|
||||
@router.get("/api/cookbook/tasks/status")
|
||||
async def cookbook_tasks_status(request: Request):
|
||||
"""Check status of all active cookbook tmux sessions.
|
||||
@@ -1993,6 +2153,7 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
|
||||
# Load saved tasks from cookbook state
|
||||
tasks = []
|
||||
state = {}
|
||||
if _cookbook_state_path.exists():
|
||||
try:
|
||||
state = json.loads(_cookbook_state_path.read_text(encoding="utf-8"))
|
||||
@@ -2004,6 +2165,21 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Orphan-tmux auto-adoption sweep. When the agent (or anyone)
|
||||
# SSH-launches a `serve-*` tmux session — usually because
|
||||
# serve_model rejected `source ... && vllm ...` or because of a
|
||||
# manual relaunch via tmux send-keys — that session is invisible
|
||||
# to the cookbook UI even though it's a live model server. The
|
||||
# sweep finds those orphans on each configured remote host and
|
||||
# writes them into state.tasks with _adoptedExternally=True, so
|
||||
# they show up in the UI on the next poll without anyone having
|
||||
# to remember to call adopt_served_model. Rate-limited via the
|
||||
# _last_orphan_sweep_ts timestamp so we don't SSH every 3s.
|
||||
try:
|
||||
_maybe_sweep_orphans(tasks, state)
|
||||
except Exception as _sweep_e:
|
||||
logger.warning(f"orphan sweep failed (non-fatal): {_sweep_e!r}")
|
||||
|
||||
results = []
|
||||
for task in tasks:
|
||||
session_id = task.get("sessionId", "")
|
||||
@@ -2063,7 +2239,12 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
if _tport and _tport != "22":
|
||||
ssh_base.extend(["-p", str(_tport)])
|
||||
check_cmd = ssh_base + [remote, "tmux", "has-session", "-t", session_id]
|
||||
capture_cmd = ssh_base + [remote, "tmux", "capture-pane", "-t", session_id, "-p", "-S", "-50"]
|
||||
# Capture 500 lines (was 50) so a Python traceback survives
|
||||
# the post-crash neofetch banner + bash prompt that otherwise
|
||||
# fills the visible tail. Without this, output_tail ends up
|
||||
# as just "Locale: C / Ubuntu_Odysseus ❯" and the agent
|
||||
# can't diagnose the actual error.
|
||||
capture_cmd = ssh_base + [remote, "tmux", "capture-pane", "-t", session_id, "-p", "-S", "-500"]
|
||||
elif IS_WINDOWS:
|
||||
# LOCAL Windows task: launched as a detached process (no tmux).
|
||||
# Liveness comes from the <session>.pid file, output from the
|
||||
@@ -2072,7 +2253,7 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
capture_cmd = None
|
||||
else:
|
||||
check_cmd = ["tmux", "has-session", "-t", session_id]
|
||||
capture_cmd = ["tmux", "capture-pane", "-t", session_id, "-p", "-S", "-50"]
|
||||
capture_cmd = ["tmux", "capture-pane", "-t", session_id, "-p", "-S", "-500"]
|
||||
|
||||
local_win_task = (not remote) and IS_WINDOWS
|
||||
|
||||
|
||||
Reference in New Issue
Block a user