cookbook agent debug loop: persistent log files, auto-adopt orphan tmux, Codex/Claude skill parity

Three converging fixes so the chat agent + external Codex/Claude skills can actually debug a crashed serve instead of staring at a post-crash neofetch banner:

* Serves now `tee` to /tmp/odysseus-tmux/SESSION.log on the host running them. The runner saves fds 3/4 before the tee and restores them right before `exec ${SHELL}`, so the post-crash interactive zsh banner does NOT pollute the log file.
* `tail_serve_output` (chat agent) and `/api/codex/cookbook/output/{sid}` (Codex+Claude skills) both prefer the persistent log file over the tmux pane. The pane is the fallback for sessions predating the tee runner. Default tail bumped 150 -> 400.
* `list_served_models` "recent log" snippet seeks to the Traceback line instead of showing the last 6 lines (which was always the bash prompt).

Cookbook auto-adoption sweep on `/api/cookbook/tasks/status`: every 20s (rate-limited) the cookbook SSHes into each configured server, finds `serve-*` / `cookbook-*` tmux sessions running an actual model process (vllm/python/llama-server/etc., filtered via `pane_current_command`), and writes them into state.tasks. So when the agent falls back to raw ssh+tmux, the session appears in the Cookbook UI on the next poll.

`serve_model` error path now reads `data["detail"]` in addition to `data["error"]` so the FastAPI HTTPException message ("Invalid characters in cmd") actually reaches the agent instead of being swallowed as a generic "Serve failed". Tool description updated to warn against `cd …`/`source …`/`&&` prefixes.

Intent-without-action supervisor in agent_loop: when the model writes "Let me tail the output" / "I'll check the logs" / "Let me investigate" and ends the turn without emitting a tool call, the loop injects a sharp system nudge ("You said you would X — DO IT NOW") and continues. Capped at 2 nudges per chat so a model that genuinely cannot use the tool does not pin the loop.
Codex/Claude skill parity: adds `/cookbook/cached`, `/cookbook/presets`, `/cookbook/preset/{name}`, `/cookbook/adopt` so external agents have the same surface as the chat agent. SKILL.md docs + odysseus_api.py wrapper updated for both bundles.

`adopt_served_model` promoted to the always-on tool set so the agent has a documented fallback when serve_model rejects a cmd.

Also includes various cookbook UI tweaks accumulated alongside the above (cookbook.js, cookbookRunning.js, cookbookServe.js, cookbook-diagnosis.js, settings.js, style.css).
2026-06-17 18:25:26 -04:00 · 2026-06-04 23:27:18 +09:00
parent 041c03bf11
commit 9112861d8e
19 changed files with 1529 additions and 151 deletions
@@ -378,16 +378,12 @@ export const ERROR_PATTERNS = [
    message: 'Model architecture too new for installed vLLM/transformers.',
    fixes: [
      { label: 'Try --trust-remote-code', action: (panel) => _serveAutoRetry(panel, '--trust-remote-code'), autofix: true },
-      { label: 'Update vLLM on server', action: (panel) => {
-        const taskEl = panel.closest('.cookbook-task');
-        const task = taskEl ? _loadTasks().find(t => t.sessionId === taskEl.dataset.taskId) : null;
-        const host = task?.remoteHost || '';
-        const prefix = _buildEnvPrefix();
-        const pipCmd = prefix ? prefix + ' pip install -U vllm transformers' : 'pip install -U vllm transformers';
-        const cmd = host ? _sshCmd(host, pipCmd) : pipCmd;
-        // Run in tmux so it doesn't timeout
-        const name = 'update-vllm';
-        _launchServeTask(name, 'pip-update', cmd);
+      { label: 'Update vLLM on server', action: () => {
+        // Use the venv's python3 by absolute path when configured (SSH non-
+        // interactive sessions often pick user-site Python over the venv).
+        const _vp = (_envState.env === 'venv' && _envState.envPath)
+          ? `${_envState.envPath.replace(/\/+$/, '')}/bin/python3` : 'python3';
+        _launchServeTask('update-vllm', 'pip-update', `${_vp} -m pip install -U vllm transformers`);
      }},
    ],
  },
@@ -395,16 +391,10 @@ export const ERROR_PATTERNS = [
    pattern: /Either a revision or a version must be specified|transformers\.integrations\.hub_kernels|kernels\/layer/i,
    message: 'Transformers/kernels package mismatch.',
    fixes: [
-      { label: 'Repair kernel package', action: (panel) => {
-        const taskEl = panel.closest('.cookbook-task');
-        const task = taskEl ? _loadTasks().find(t => t.sessionId === taskEl.dataset.taskId) : null;
-        const host = task?.remoteHost || '';
-        const prefix = _buildEnvPrefix();
-        const pipCmd = prefix
-          ? prefix + ' python3 -m pip install --user --break-system-packages "kernels<0.15"'
-          : 'python3 -m pip install --user --break-system-packages "kernels<0.15"';
-        const cmd = host ? _sshCmd(host, pipCmd) : pipCmd;
-        _launchServeTask('repair-kernels', 'pip-update', cmd);
+      { label: 'Repair kernel package', action: () => { // spec stays quoted: an unquoted `<0.15` is parsed by the shell as an input redirect
+        const _vp = (_envState.env === 'venv' && _envState.envPath)
+          ? `${_envState.envPath.replace(/\/+$/, '')}/bin/python3` : 'python3';
+        _launchServeTask('repair-kernels', 'pip-update', `${_vp} -m pip install --user --break-system-packages "kernels<0.15"`);
       }},
      { label: 'Open Dependencies', action: () => _openCookbookDependencies('sglang') },
    ],
@@ -445,14 +435,10 @@ export const ERROR_PATTERNS = [
    pattern: /Triton kernels.*Failed to import|cannot import name '\w+' from 'triton_kernels/i,
    message: 'Triton kernels version mismatch. Non-fatal warning — model will still run, just without optimized MoE kernels.',
    fixes: [
-      { label: 'Update triton on server', action: (panel) => {
-        const taskEl = panel.closest('.cookbook-task');
-        const task = taskEl ? _loadTasks().find(t => t.sessionId === taskEl.dataset.taskId) : null;
-        const host = task?.remoteHost || '';
-        const prefix = _buildEnvPrefix();
-        const pipCmd = prefix ? prefix + ' pip install -U triton triton-kernels' : 'pip install -U triton triton-kernels';
-        const cmd = host ? _sshCmd(host, pipCmd) : pipCmd;
-        _launchServeTask('update-triton', 'pip-update', cmd);
+      { label: 'Update triton on server', action: () => {
+        const _vp = (_envState.env === 'venv' && _envState.envPath)
+          ? `${_envState.envPath.replace(/\/+$/, '')}/bin/python3` : 'python3';
+        _launchServeTask('update-triton', 'pip-update', `${_vp} -m pip install -U triton triton-kernels`);
      }},
    ],
  },
@@ -474,14 +460,56 @@ export const ERROR_PATTERNS = [
    pattern: /attention_sink|sliding.window.*not supported|sliding_window.*incompatible/i,
    message: 'Model uses attention features unsupported in this vLLM version.',
    fixes: [
-      { label: 'Update vLLM on server', action: (panel) => {
-        const taskEl = panel.closest('.cookbook-task');
-        const task = taskEl ? _loadTasks().find(t => t.sessionId === taskEl.dataset.taskId) : null;
-        const host = task?.remoteHost || '';
-        const prefix = _buildEnvPrefix();
-        const pipCmd = prefix ? prefix + ' pip install -U vllm' : 'pip install -U vllm';
-        const cmd = host ? _sshCmd(host, pipCmd) : pipCmd;
-        _launchServeTask('update-vllm', 'pip-update', cmd);
+      { label: 'Update vLLM on server', action: () => {
+        const _vp = (_envState.env === 'venv' && _envState.envPath)
+          ? `${_envState.envPath.replace(/\/+$/, '')}/bin/python3` : 'python3';
+        _launchServeTask('update-vllm', 'pip-update', `${_vp} -m pip install -U vllm`);
+      }},
+    ],
+  },
+  {
+    // FlashInfer JIT-compiles kernels for the host GPU on first use. An nvcc
+    // older than CUDA 11.8 can't target sm_89/sm_90 (Ada/Hopper), so the
+    // engine workers die before reporting a useful traceback. Switching the
+    // attention backend alone does not help (the sampler is JIT-compiled
+    // too); the fixes below disable the FlashInfer sampler or remove the
+    // package. Manual alternative: set CUDACXX to a newer nvcc (e.g. the venv's).
+    pattern: /nvcc fatal\s+:\s+Unsupported gpu architecture 'compute_\d+'/i,
+    message: 'FlashInfer is JIT-compiling sampling kernels with an nvcc too old for this GPU (no sm_89 / sm_90 support — pre-CUDA 11.8). Changing the attention backend does not help — flashinfer JITs the SAMPLER too. The clean fix is to set VLLM_USE_FLASHINFER_SAMPLER=0 so vLLM uses its native sampler instead.',
+    suggestion: 'Suggested action: relaunch with VLLM_USE_FLASHINFER_SAMPLER=0 prepended. (Confirmed on the QuantTrio/Qwen3.5 model card as the canonical workaround.)',
+    fixes: [
+      { label: 'Retry with VLLM_USE_FLASHINFER_SAMPLER=0', action: (panel) => _serveAutoRetryReplace(panel, '', 'VLLM_USE_FLASHINFER_SAMPLER=0 ', { prepend: true }) },
+      { label: 'Uninstall flashinfer-python', action: () => {
+        // Hard fallback: vLLM 0.22 reaches into flashinfer for sampling kernels
+        // even with VLLM_USE_FLASHINFER_SAMPLER=0 in some configs. Removing
+        // the package forces it onto the native sampler.
+        const _vp = (_envState.env === 'venv' && _envState.envPath)
+          ? `${_envState.envPath.replace(/\/+$/, '')}/bin/python3` : 'python3';
+        _launchServeTask('uninstall-flashinfer', 'pip-update', `${_vp} -m pip uninstall flashinfer-python -y`);
+      }},
+      { label: 'Edit serve', action: (panel) => _openServeEditFromDiagnosis(panel) },
+    ],
+  },
+  {
+    // vLLM <-> torch ABI mismatch: vLLM imports torch.library helpers
+    // (`infer_schema`, `register_fake`, etc.) that only exist on newer torch
+    // versions. When the installed torch is older, the import fails before
+    // any server code runs. Fix is to reinstall vllm (which pulls a matching
+    // torch) or upgrade torch directly.
+    pattern: /ImportError: cannot import name '[^']+' from 'torch(\.\w+)+'/i,
+    message: 'vLLM was built against a newer torch than what is installed. Reinstall vLLM so pip pulls a compatible torch (or upgrade torch directly).',
+    fixes: [
+      { label: 'Reinstall vLLM (pulls matching torch)', action: () => {
+        // Absolute path to the venv's python3 — bare `python3` lands in the
+        // wrong site-packages over SSH when ~/.local/bin precedes the venv.
+        const _vp = (_envState.env === 'venv' && _envState.envPath)
+          ? `${_envState.envPath.replace(/\/+$/, '')}/bin/python3` : 'python3';
+        _launchServeTask('reinstall-vllm', 'pip-reinstall', `${_vp} -m pip install --force-reinstall vllm`);
+      }},
+      { label: 'Upgrade torch only', action: () => {
+        const _vp = (_envState.env === 'venv' && _envState.envPath)
+          ? `${_envState.envPath.replace(/\/+$/, '')}/bin/python3` : 'python3';
+        _launchServeTask('upgrade-torch', 'pip-update', `${_vp} -m pip install -U torch`);
      }},
    ],
  },
@@ -607,59 +635,24 @@ export function _showDiagnosis(panel, diagnosis, sourceText) {
  };

  if (fixes.length) {
+    // Always render fixes as inline buttons. The old "Actions ▾" dropdown
+    // (for >3 fixes) was broken — the menu wouldn't open in some panels and
+    // hid useful actions behind a non-working affordance. Inline buttons wrap
+    // naturally in `.cookbook-diag-fixes` (flex-wrap) so a long list reflows
+    // onto multiple rows instead of getting collapsed.
    const row = document.createElement('div');
    row.className = 'cookbook-diag-fixes';
-
-    if (fixes.length <= 3) {
-      for (const fix of fixes) {
-        const btn = document.createElement('button');
-        btn.className = 'cookbook-btn cookbook-diag-btn';
-        btn.type = 'button';
-        btn.innerHTML = _diagFixIcon(fix.label) + '<span class="cookbook-diag-btn-label">' + _diagEsc(fix.label) + '</span>';
-        btn.addEventListener('click', (e) => {
-          e.stopPropagation();
-          runFix(fix, btn);
-        });
-        row.appendChild(btn);
-      }
-      body.appendChild(row);
-      return;
-    }
-
-    const wrap = document.createElement('div');
-    wrap.className = 'cookbook-diag-actions';
-
-    const trigger = document.createElement('button');
-    trigger.className = 'cookbook-btn cookbook-diag-action-trigger';
-    trigger.type = 'button';
-    trigger.textContent = 'Actions';
-    trigger.appendChild(document.createTextNode(' ▾'));
-    wrap.appendChild(trigger);
-
-    const menu = document.createElement('div');
-    menu.className = 'dropdown cookbook-diag-menu hidden';
    for (const fix of fixes) {
-      const item = document.createElement('button');
-      item.type = 'button';
-      item.innerHTML = _diagFixIcon(fix.label) + '<span class="cookbook-diag-btn-label">' + _diagEsc(fix.label) + '</span>';
-      item.addEventListener('click', async (e) => {
+      const btn = document.createElement('button');
+      btn.className = 'cookbook-btn cookbook-diag-btn';
+      btn.type = 'button';
+      btn.innerHTML = _diagFixIcon(fix.label) + '<span class="cookbook-diag-btn-label">' + _diagEsc(fix.label) + '</span>';
+      btn.addEventListener('click', (e) => {
        e.stopPropagation();
-        if (item.dataset.busy || trigger.dataset.busy) return;
-        item.dataset.busy = '1';
-        await runFix(fix, trigger, fix.label, () => menu.classList.add('hidden'), () => delete item.dataset.busy);
+        runFix(fix, btn);
      });
-      menu.appendChild(item);
+      row.appendChild(btn);
    }
-    wrap.appendChild(menu);
-    trigger.addEventListener('click', (e) => {
-      e.stopPropagation();
-      if (trigger.dataset.busy) return;
-      document.querySelectorAll('.cookbook-diag-menu').forEach(m => {
-        if (m !== menu) m.classList.add('hidden');
-      });
-      menu.classList.toggle('hidden');
-    });
-    row.appendChild(wrap);
    body.appendChild(row);
  }
 }