mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 02:05:22 -04:00
Odysseus v1.0
This commit is contained in:
+249
@@ -0,0 +1,249 @@
|
||||
"""Background job execution for the agent's `bash` tool.
|
||||
|
||||
Long commands (installs, ffmpeg, model downloads) should NOT block the chat
|
||||
stream — a multi-minute held SSE connection is fragile (model-stops-early,
|
||||
timeouts, tab suspend). Instead we launch them **detached** and let an
|
||||
always-on monitor re-invoke the agent when they finish ("auto-continue").
|
||||
|
||||
Design goals:
|
||||
* Restart-safe: status is derived from an on-disk exit-code file, not a live
|
||||
PID, so a uvicorn restart never loses a job or its result.
|
||||
* Idempotent follow-up: a job stays {done, followed_up: False} until the
|
||||
agent has actually been re-invoked, so completion can never silently
|
||||
"do nothing" — the monitor retries on the next tick.
|
||||
* Bounded: a hard max-runtime marks a runaway job failed and STILL triggers
|
||||
a follow-up ("timed out"), so you always hear back.
|
||||
|
||||
This module only owns launch + state. The monitor / agent re-invocation lives
|
||||
in the caller (so this stays import-light and unit-testable).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from core.atomic_io import atomic_write_json
|
||||
|
||||
_DATA_DIR = Path(os.environ.get("DATA_DIR", "data"))
|
||||
_JOBS_DIR = _DATA_DIR / "bg_jobs"
|
||||
_STORE = _DATA_DIR / "bg_jobs.json"
|
||||
|
||||
# A job that runs longer than this is presumed stuck and reaped (the agent
|
||||
# still gets a "timed out" follow-up so nothing hangs forever).
|
||||
DEFAULT_MAX_RUNTIME_S = 3600 # 1 hour
|
||||
# Cap how much captured output we keep / feed back to the model.
|
||||
_MAX_OUTPUT_CHARS = 16000
|
||||
# How long a finished-and-followed-up job (record + its .sh/.cmd.sh/.log/.exit
|
||||
# files) is kept before pruning, so neither the store nor data/bg_jobs/ grows
|
||||
# without bound. The agent has already consumed the result by then.
|
||||
_RETENTION_S = 3600 # 1 hour after follow-up
|
||||
|
||||
|
||||
def _load() -> Dict[str, Dict[str, Any]]:
|
||||
try:
|
||||
if _STORE.exists():
|
||||
return json.loads(_STORE.read_text()) or {}
|
||||
except Exception:
|
||||
pass
|
||||
return {}
|
||||
|
||||
|
||||
def _save(jobs: Dict[str, Dict[str, Any]]) -> None:
|
||||
atomic_write_json(str(_STORE), jobs, indent=2)
|
||||
|
||||
|
||||
def _pid_alive(pid: Optional[int]) -> bool:
|
||||
if not pid:
|
||||
return False
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
return True
|
||||
except (OSError, ProcessLookupError):
|
||||
return False
|
||||
|
||||
|
||||
def launch(command: str, session_id: str, cwd: Optional[str] = None,
|
||||
max_runtime_s: int = DEFAULT_MAX_RUNTIME_S) -> Dict[str, Any]:
|
||||
"""Launch `command` detached. Returns the job record (status='running').
|
||||
|
||||
Output + the final exit code are written to files so status survives a
|
||||
server restart. The process is put in its own session (setsid) so it
|
||||
outlives the request/stream that started it.
|
||||
"""
|
||||
_JOBS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
job_id = uuid.uuid4().hex[:12]
|
||||
log_path = _JOBS_DIR / f"{job_id}.log"
|
||||
exit_path = _JOBS_DIR / f"{job_id}.exit"
|
||||
|
||||
# The user command goes in its OWN script file, run as a child `bash`. This
|
||||
# is what isolates it: an `exit` inside it only ends that child (so the
|
||||
# wrapper still records the exit code), and — unlike textually wrapping the
|
||||
# command in `( … )` — the wrapper can't be broken by an unbalanced paren or
|
||||
# a trailing line-continuation in the command. `$?` is the child's real
|
||||
# exit status.
|
||||
cmd_path = _JOBS_DIR / f"{job_id}.cmd.sh"
|
||||
cmd_path.write_text(command + "\n")
|
||||
wrapper = (
|
||||
f"bash {cmd_path} > {log_path} 2>&1\n"
|
||||
f"echo $? > {exit_path}\n"
|
||||
)
|
||||
script_path = _JOBS_DIR / f"{job_id}.sh"
|
||||
script_path.write_text(wrapper)
|
||||
|
||||
proc = subprocess.Popen(
|
||||
["bash", str(script_path)],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
stdin=subprocess.DEVNULL,
|
||||
cwd=cwd or None,
|
||||
start_new_session=True, # setsid — detach from the request lifecycle
|
||||
)
|
||||
|
||||
rec = {
|
||||
"id": job_id,
|
||||
"session_id": session_id,
|
||||
"command": command,
|
||||
"status": "running", # running | done | failed
|
||||
"pid": proc.pid,
|
||||
"started_at": time.time(),
|
||||
"ended_at": None,
|
||||
"exit_code": None,
|
||||
"max_runtime_s": max_runtime_s,
|
||||
"followed_up": False, # has the agent been re-invoked with the result?
|
||||
"log_path": str(log_path),
|
||||
"exit_path": str(exit_path),
|
||||
}
|
||||
jobs = _load()
|
||||
jobs[job_id] = rec
|
||||
_save(jobs)
|
||||
return rec
|
||||
|
||||
|
||||
def _read_output(rec: Dict[str, Any]) -> str:
|
||||
try:
|
||||
txt = Path(rec["log_path"]).read_text(errors="replace")
|
||||
except Exception:
|
||||
return ""
|
||||
if len(txt) > _MAX_OUTPUT_CHARS:
|
||||
# Keep head + tail — the interesting bits are usually at both ends.
|
||||
head = txt[: _MAX_OUTPUT_CHARS // 2]
|
||||
tail = txt[-_MAX_OUTPUT_CHARS // 2:]
|
||||
txt = head + "\n…[truncated]…\n" + tail
|
||||
return txt
|
||||
|
||||
|
||||
def _prune(jobs: Dict[str, Dict[str, Any]], now: float) -> bool:
|
||||
"""Drop records (and their on-disk files) for jobs that finished, were
|
||||
followed up, and are older than the retention window. Mutates `jobs`."""
|
||||
stale = [jid for jid, rec in jobs.items()
|
||||
if rec.get("followed_up") and rec.get("ended_at")
|
||||
and (now - rec["ended_at"]) > _RETENTION_S]
|
||||
for jid in stale:
|
||||
jobs.pop(jid, None)
|
||||
for p in _JOBS_DIR.glob(f"{jid}.*"): # .sh .cmd.sh .log .exit
|
||||
try:
|
||||
p.unlink()
|
||||
except Exception:
|
||||
pass
|
||||
return bool(stale)
|
||||
|
||||
|
||||
def refresh() -> Dict[str, Dict[str, Any]]:
|
||||
"""Reconcile every running job against disk. Marks done/failed (incl.
|
||||
timeout). Idempotent — safe to call from a poll loop. Returns the store."""
|
||||
jobs = _load()
|
||||
changed = False
|
||||
now = time.time()
|
||||
for rec in jobs.values():
|
||||
if rec.get("status") != "running":
|
||||
continue
|
||||
exit_path = Path(rec.get("exit_path", ""))
|
||||
if exit_path.exists():
|
||||
try:
|
||||
code = int(exit_path.read_text().strip() or "1")
|
||||
except Exception:
|
||||
code = 1
|
||||
rec["exit_code"] = code
|
||||
rec["status"] = "done" if code == 0 else "failed"
|
||||
rec["ended_at"] = now
|
||||
changed = True
|
||||
elif (now - rec.get("started_at", now)) > rec.get("max_runtime_s", DEFAULT_MAX_RUNTIME_S):
|
||||
# Runaway / stuck — reap it but STILL surface a follow-up.
|
||||
_kill(rec.get("pid"))
|
||||
rec["status"] = "failed"
|
||||
rec["exit_code"] = -1
|
||||
rec["ended_at"] = now
|
||||
rec["timed_out"] = True
|
||||
changed = True
|
||||
elif not _pid_alive(rec.get("pid")) and not exit_path.exists():
|
||||
# Process vanished without writing an exit code (killed, OOM,
|
||||
# crash). Don't leave it "running" forever.
|
||||
rec["status"] = "failed"
|
||||
rec["exit_code"] = -1
|
||||
rec["ended_at"] = now
|
||||
rec["died"] = True
|
||||
changed = True
|
||||
if _prune(jobs, now):
|
||||
changed = True
|
||||
if changed:
|
||||
_save(jobs)
|
||||
return jobs
|
||||
|
||||
|
||||
def _kill(pid: Optional[int]) -> None:
|
||||
if not pid:
|
||||
return
|
||||
try:
|
||||
os.killpg(os.getpgid(pid), signal.SIGTERM)
|
||||
except Exception:
|
||||
try:
|
||||
os.kill(pid, signal.SIGTERM)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def pending_followups() -> List[Dict[str, Any]]:
|
||||
"""Finished jobs the agent hasn't been re-invoked for yet. The monitor
|
||||
drains these; mark_followed_up() flips the flag only on success."""
|
||||
jobs = refresh()
|
||||
return [r for r in jobs.values()
|
||||
if r.get("status") in ("done", "failed") and not r.get("followed_up")]
|
||||
|
||||
|
||||
def mark_followed_up(job_id: str) -> None:
|
||||
jobs = _load()
|
||||
if job_id in jobs:
|
||||
jobs[job_id]["followed_up"] = True
|
||||
_save(jobs)
|
||||
|
||||
|
||||
def get(job_id: str) -> Optional[Dict[str, Any]]:
|
||||
refresh() # reconcile against disk so status/exit_code are current
|
||||
rec = _load().get(job_id)
|
||||
if rec:
|
||||
rec = dict(rec)
|
||||
rec["output"] = _read_output(rec)
|
||||
return rec
|
||||
|
||||
|
||||
def list_for_session(session_id: str) -> List[Dict[str, Any]]:
|
||||
return [r for r in refresh().values() if r.get("session_id") == session_id]
|
||||
|
||||
|
||||
def result_text(rec: Dict[str, Any]) -> str:
|
||||
"""Human/agent-readable summary of a finished job, for the follow-up."""
|
||||
out = _read_output(rec)
|
||||
if rec.get("timed_out"):
|
||||
head = f"Background job timed out after {rec.get('max_runtime_s')}s."
|
||||
elif rec.get("died"):
|
||||
head = "Background job process died unexpectedly (no exit code)."
|
||||
else:
|
||||
head = f"Background job finished with exit code {rec.get('exit_code')}."
|
||||
return f"{head}\nCommand: {rec.get('command')}\n\nOutput:\n{out or '(no output)'}"
|
||||
Reference in New Issue
Block a user