cookbook agent debug loop: persistent log files, auto-adopt orphan tmux, Codex/Claude skill parity

Three converging fixes so the chat agent + external Codex/Claude skills can actually debug a crashed serve instead of staring at a post-crash neofetch banner: * Serves now `tee` to /tmp/odysseus-tmux/SESSION.log on the host running them. Runner saves fds 3/4 before the tee and restores them right before `exec ${SHELL}`, so the post-crash interactive zsh banner does NOT pollute the log file. * `tail_serve_output` (chat agent) and `/api/codex/cookbook/output/{sid}` (Codex+Claude skills) both prefer the persistent log file over the tmux pane. Pane is fallback for sessions predating the tee runner. Default tail bumped 150 -> 400. * `list_served_models` "recent log" snippet seeks to the Traceback line instead of showing the last 6 lines (which was always the bash prompt). Cookbook auto-adoption sweep on `/api/cookbook/tasks/status`: every 20s (rate-limited) the cookbook SSHes each configured server, finds `serve-*` / `cookbook-*` tmux sessions running an actual model process (vllm/python/llama-server/etc., filtered via `pane_current_command`), and writes them into state.tasks. So when the agent falls back to raw ssh+tmux, the session appears in the Cookbook UI on the next poll. `serve_model` error path now reads `data["detail"]` in addition to `data["error"]` so the FastAPI HTTPException message ("Invalid characters in cmd") actually reaches the agent instead of being swallowed as a generic "Serve failed". Tool description updated to warn against `cd …`/`source …`/`&&` prefixes. Intent-without-action supervisor in agent_loop: when the model writes "Let me tail the output" / "I'll check the logs" / "Let me investigate" and ends the turn without emitting a tool call, the loop injects a sharp system nudge ("You said you would X — DO IT NOW") and continues. Capped at 2 nudges per chat so a model that genuinely cannot use the tool does not pin the loop. Codex/Claude skill parity: adds `/cookbook/cached`, `/cookbook/presets`, `/cookbook/preset/{name}`, `/cookbook/adopt` so external agents have the same surface as the chat agent. SKILL.md docs + odysseus_api.py wrapper updated for both bundles. `adopt_served_model` promoted to the always-on tool set so the agent has a documented fallback when serve_model rejects a cmd. Also various cookbook UI tweaks accumulated alongside the above (cookbook.js, cookbookRunning.js, cookbookServe.js, cookbook-diagnosis.js, settings.js, style.css).
2026-06-17 02:05:22 -04:00 · 2026-06-04 23:27:18 +09:00
parent 041c03bf11
commit 9112861d8e
19 changed files with 1529 additions and 151 deletions
@@ -124,6 +124,14 @@ async function _openDownloadForGgufTask(task) {
 function _terminalServeDiagnosis(task, outputText) {
  const out = String(outputText || task?.output || '');
  if (!task || task.type !== 'serve' || !['stopped', 'error', 'crashed', 'failed'].includes(task.status) || !out.trim()) return null;
+  // Pip tasks (Reinstall vLLM, Upgrade torch, etc.) ride on the serve task
+  // type so they get a tmux session + show up in Running tab — but they are
+  // NOT serve invocations. Their output is pip's own; the generic
+  // "Serve stopped before the model became reachable" message + Edit-serve
+  // fix make no sense. Bail so the panel just shows pip's output.
+  const _isPipTask = ((task.payload?.repo_id || '').startsWith('pip-'))
+    || /python3? -m pip\b/.test(task.payload?._cmd || '');
+  if (_isPipTask) return null;
  if (_serveTaskLooksAwqOnLocalBackend(task, out)) {
    return {
      message: 'AWQ/GPTQ/FP8 cannot be served through llama.cpp/Ollama unified-memory mode.',
@@ -249,7 +257,7 @@ const SERVE_STATE_KEY = 'cookbook-serve-state';

 // Polling / timeout intervals
 const TASK_POLL_INTERVAL_MS = 3000;       // delay between reconnect-loop iterations
-const BG_MONITOR_INTERVAL_MS = 10000;     // background task status poll
+const BG_MONITOR_INTERVAL_MS = 5000;      // background task status poll
 const STALE_PROGRESS_MS = 5 * 60 * 1000;  // download with no progress this long = stale
 const STARTUP_STALE_PROGRESS_MS = 45 * 1000; // 0%-forever startup stall: retry much sooner

@@ -523,6 +531,26 @@ function _serveOutputLooksReady(task) {

 function _normalizeTaskForDisplay(task) {
  if (!task || typeof task !== 'object') return task;
+  // Pip tasks (Reinstall vLLM / Upgrade torch / etc.) ride on the serve task
+  // type so they get tmux + the Running tab. They are NOT serves — their
+  // "ready" markers are pip's `Successfully installed` / `Requirement already
+  // satisfied`, not "Application startup complete".
+  const _isPipTask = ((task.payload?.repo_id || '').startsWith('pip-'))
+    || /python3? -m pip\b/.test(task.payload?._cmd || '');
+  if (_isPipTask) {
+    // Override stale status: any pip task whose output carries pip's own
+    // success markers gets displayed as `done` regardless of what's in
+    // localStorage. Old pre-fix runs landed in error/stopped state and
+    // stuck there even after we taught the rest of the flow about pip
+    // tasks — this is the catch-all that flips them to Finished on render.
+    const out = String(task.output || '');
+    const ranOk = /Successfully installed|Requirement already (?:satisfied|up-to-date)/i.test(out)
+      && !/error:|ERROR:/.test(out.slice(-1024));
+    if (ranOk && task.status !== 'done' && task.status !== 'running') {
+      return { ...task, status: 'done' };
+    }
+    return task;
+  }
  if (task.type === 'serve' && task.status === 'done' && !_serveOutputLooksReady(task)) {
    return { ...task, status: 'error' };
  }
@@ -2409,7 +2437,7 @@ async function _reconnectTask(el, task) {
      if (data.exit_code !== 0) {
        failCount++;
        if (failCount < 5) {
-          await new Promise(r => setTimeout(r, 5000));
+          await new Promise(r => setTimeout(r, 3000));
          continue;
        }
        try {
@@ -2430,7 +2458,15 @@ async function _reconnectTask(el, task) {
        }

        const lastOutput = output.textContent || '';
-        const diag = _diagnose(lastOutput);
+        // Pip tasks (Reinstall vLLM / Upgrade torch / etc.) must skip the
+        // generic serve `_diagnose` step. Their output is pip's own and the
+        // error patterns there (torch ABI traceback, "No module named torch",
+        // etc.) are routinely matched against the previous tmux scrollback,
+        // tagging a clean pip success as a crashed serve. Detection is the
+        // same shape as the looksSuccessful branch below.
+        const _isPipTaskDiag = ((task.payload?.repo_id || '').startsWith('pip-'))
+          || /python3? -m pip\b/.test(task.payload?._cmd || '');
+        const diag = _isPipTaskDiag ? null : _diagnose(lastOutput);
        if (diag) {
          let diagEl = el.querySelector('.cookbook-diagnosis');
          if (!diagEl) {
@@ -2447,14 +2483,40 @@ async function _reconnectTask(el, task) {
        } else {
          const downloadLooksSuccessful = !lastOutput.includes('DOWNLOAD_FAILED')
            && (lastOutput.includes('DONE') || lastOutput.includes('100%') || lastOutput.includes('/snapshots/') || lastOutput.includes('Download complete') || lastOutput.includes('DOWNLOAD_OK'));
+          // Pip install / reinstall tasks are launched via _launchServeTask (so
+          // they show up in the Running tab + use tmux) but they aren't real
+          // serves — the cmd is `python3 -m pip ...` and the success markers
+          // are pip's own. Without this branch, a successful reinstall ends
+          // with no "Uvicorn running on" line and gets mis-flagged as a crashed
+          // serve.
+          const _isPipTask = ((task.payload?.repo_id || '').startsWith('pip-'))
+            || /python3? -m pip\b/.test(task.payload?._cmd || '');
+          const pipLooksSuccessful = _isPipTask
+            && /Successfully installed|Requirement already (?:satisfied|up-to-date)/i.test(lastOutput)
+            && !/error:|ERROR:/.test(lastOutput.slice(-1024));
          const serveLooksReady = task.type === 'serve' && _serveOutputLooksReady({ ...task, output: lastOutput });
-          const looksSuccessful = task.type === 'download' ? downloadLooksSuccessful : serveLooksReady;
+          const looksSuccessful = task.type === 'download'
+            ? downloadLooksSuccessful
+            : (_isPipTask ? pipLooksSuccessful : serveLooksReady);
          if (!lastOutput.trim() || !looksSuccessful) {
            _updateTask(task.sessionId, { status: 'crashed' });
            el.dataset.status = 'crashed';
            const badge = el.querySelector('.cookbook-task-status');
            if (badge) { badge.textContent = _statusLabel('crashed', task.type); badge.className = 'cookbook-task-status cookbook-task-crashed'; }
-            if (task.type === 'serve') {
+            if (_isPipTask) {
+              // Pip tasks: don't run the serve diagnosis (which would yell
+              // "Serve stopped before the model became reachable"). Show a
+              // pip-tailored message; the user can read pip's own error output
+              // directly above.
+              const _ranOk = /Successfully installed|Requirement already (?:satisfied|up-to-date)/i.test(lastOutput);
+              if (!_ranOk) {
+                _showDiagnosis(el, {
+                  message: 'Pip install did not finish with a success marker. Check the output for the underlying error.',
+                  suggestion: 'Suggested action: copy the troubleshooting bundle. Common causes: missing build deps, network blip, mismatched torch ABI.',
+                  fixes: [],
+                }, lastOutput);
+              }
+            } else if (task.type === 'serve') {
              const diag = _diagnose(lastOutput) || {
                message: _serveTaskLooksAwqOnLocalBackend(task, lastOutput)
                  ? 'AWQ/GPTQ/FP8 cannot be served through llama.cpp/Ollama unified-memory mode.'
@@ -2533,6 +2595,28 @@ async function _reconnectTask(el, task) {
            }
            _showCookbookNotif(true);
          } else {
+            // Strong completion markers — `DOWNLOAD_OK` is emitted by our
+            // downloader wrapper AFTER the model snapshot is on disk, and
+            // `/snapshots/` only appears once HF has resolved the cached
+            // tree. Either is conclusive. Finalize as done immediately, skip
+            // the 30s debounce — the debounce only exists to guard against
+            // ambiguous markers (bare "100%" / "Download complete") which can
+            // appear mid-stream during multi-file downloads.
+            const _strongDone = task.type === 'download'
+              && (lastOutput.includes('DOWNLOAD_OK') || lastOutput.includes('/snapshots/'));
+            if (_strongDone) {
+              _updateTask(task.sessionId, { status: 'done', _doneConfirmAt: null, _lastStatusFlipAt: Date.now() });
+              el.dataset.status = 'done';
+              const badge = el.querySelector('.cookbook-task-status');
+              if (badge) { badge.textContent = _statusLabel('done', task.type); badge.className = 'cookbook-task-status cookbook-task-done'; }
+              const _chk = el.querySelector('.cookbook-task-check'); if (_chk) _chk.style.display = '';
+              const _sb = el.querySelector('.cookbook-task-serve-btn'); if (_sb) _sb.style.display = '';
+              _showCookbookNotif();
+              _refreshDepsAfterInstall(task);
+              _renderRunningTab();
+              _processQueue();
+              break;
+            }
            // Debounce the done flip. Tmux capture-pane can fail transiently
            // (network blip, ssh reconnect), and the verify has-session right
            // above can briefly report dead even when the session is in the
@@ -2559,7 +2643,7 @@ async function _reconnectTask(el, task) {
                    stillAlive = pData.exit_code === 0;
                  } catch { /* network blip — treat as inconclusive, prefer running */ stillAlive = true; }
                  if (stillAlive) {
-                    _updateTask(task.sessionId, { status: 'running', _doneConfirmAt: null });
+                    _updateTask(task.sessionId, { status: 'running', _doneConfirmAt: null, _lastStatusFlipAt: Date.now() });
                    const _el = document.querySelector(`.cookbook-task[data-task-id="${task.sessionId}"]`);
                    if (_el) {
                      _el.dataset.status = 'running';
@@ -2571,7 +2655,7 @@ async function _reconnectTask(el, task) {
                    }
                    return;
                  }
-                  _updateTask(task.sessionId, { status: 'done', _doneConfirmAt: null });
+                  _updateTask(task.sessionId, { status: 'done', _doneConfirmAt: null, _lastStatusFlipAt: Date.now() });
                  const _el = document.querySelector(`.cookbook-task[data-task-id="${task.sessionId}"]`);
                  if (_el) {
                    _el.dataset.status = 'done';
@@ -2596,8 +2680,14 @@ async function _reconnectTask(el, task) {

      const snapshot = (data.stdout || '').trim();
      if (snapshot) {
+        // Only auto-scroll to bottom if the user was already there. When
+        // they've scrolled up to read earlier output, leave their position
+        // alone so a fresh snapshot doesn't yank them back to the tail.
+        // 40px tolerance covers sub-pixel rounding + the moment between
+        // releasing the scrollbar and the next poll arriving.
+        const _atBottom = (output.scrollHeight - output.scrollTop - output.clientHeight) < 40;
        output.textContent = snapshot;
-        output.scrollTop = output.scrollHeight;
+        if (_atBottom) output.scrollTop = output.scrollHeight;

        // Live status parsing for download tasks
        if (task.type === 'download') {
@@ -3153,16 +3243,27 @@ export async function _selfHealStaleTasks(opts = {}) {
    // itself fires every 10s, so this almost always fires too, but the
    // guard keeps a fast manual call from doubling up).
    const now = Date.now();
-    if (now - _selfHealLastTs < 8000) return;
+    if (now - _selfHealLastTs < 4000) return;
    _selfHealLastTs = now;
  }
  const tasks = _loadTasks();
-  const candidates = tasks.filter(t =>
-    t.type === 'download'
-    && ['done', 'error', 'crashed', 'stopped'].includes(t.status)
-    && t.sessionId
-    && !String(t.sessionId).startsWith('queue-')
-  );
+  const candidates = tasks.filter(t => {
+    if (t.type !== 'download') return false;
+    if (!['done', 'error', 'crashed', 'stopped'].includes(t.status)) return false;
+    if (!t.sessionId || String(t.sessionId).startsWith('queue-')) return false;
+    // Finished downloads with strong completion markers (DOWNLOAD_OK or HF
+    // /snapshots/ resolution) are demonstrably done — do not flip them back
+    // to running just because the tmux session is still alive (e.g., a
+    // long-lived shell that hosted the download or a flapping SSH that
+    // reports the session as up). This was the main source of finished↔
+    // downloading oscillation on a flaky connection.
+    if (t.status === 'done' && /DOWNLOAD_OK|\/snapshots\//.test(t.output || '')) return false;
+    // Cooldown: never flip the same task more than once every 45s. A flapping
+    // SSH connection used to drive the badge back-and-forth on every probe
+    // cycle; this enforces a stable view between flaps.
+    if (t._lastStatusFlipAt && (Date.now() - t._lastStatusFlipAt < 45000)) return false;
+    return true;
+  });
  if (!candidates.length) return;
  let flipped = 0;
  for (const t of candidates) {
@@ -3180,6 +3281,7 @@ export async function _selfHealStaleTasks(opts = {}) {
        if (ft && ft.status !== 'running') {
          ft.status = 'running';
          ft._selfHealed = true;
+          ft._lastStatusFlipAt = Date.now();
          _saveTasks(fresh);
          flipped++;
          const _el = document.querySelector(`.cookbook-task[data-task-id="${t.sessionId}"]`);