cookbook agent debug loop: persistent log files, auto-adopt orphan tmux, Codex/Claude skill parity

Three converging fixes so the chat agent + external Codex/Claude skills can actually debug a crashed serve instead of staring at a post-crash neofetch banner:

* Serves now `tee` to /tmp/odysseus-tmux/SESSION.log on the host running them. The runner saves the original stdout/stderr to fds 3/4 before starting the tee and restores them right before `exec ${SHELL}`, so the post-crash interactive zsh banner does NOT pollute the log file.
* `tail_serve_output` (chat agent) and `/api/codex/cookbook/output/{sid}` (Codex+Claude skills) both prefer the persistent log file over the tmux pane. Pane is fallback for sessions predating the tee runner. Default tail bumped 150 -> 400.
* `list_served_models` "recent log" snippet seeks to the Traceback line instead of showing the last 6 lines (which was always the bash prompt).

Cookbook auto-adoption sweep on `/api/cookbook/tasks/status`: every 20s (rate-limited) the cookbook SSHes into each configured server, finds `serve-*` / `cookbook-*` tmux sessions running an actual model process (vllm/python/llama-server/etc., filtered via `pane_current_command`), and writes them into state.tasks. So when the agent falls back to raw ssh+tmux, the session appears in the Cookbook UI on the next poll.

`serve_model` error path now reads `data["detail"]` in addition to `data["error"]` so the FastAPI HTTPException message ("Invalid characters in cmd") actually reaches the agent instead of being swallowed as a generic "Serve failed". Tool description updated to warn against `cd …`/`source …`/`&&` prefixes.

Intent-without-action supervisor in agent_loop: when the model writes "Let me tail the output" / "I'll check the logs" / "Let me investigate" and ends the turn without emitting a tool call, the loop injects a sharp system nudge ("You said you would X — DO IT NOW") and continues. Capped at 2 nudges per chat so a model that genuinely cannot use the tool does not pin the loop.

Codex/Claude skill parity: adds `/cookbook/cached`, `/cookbook/presets`, `/cookbook/preset/{name}`, `/cookbook/adopt` so external agents have the same surface as the chat agent. SKILL.md docs + odysseus_api.py wrapper updated for both bundles.

`adopt_served_model` promoted to the always-on tool set so the agent has a documented fallback when serve_model rejects a cmd.

Also various cookbook UI tweaks accumulated alongside the above (cookbook.js, cookbookRunning.js, cookbookServe.js, cookbook-diagnosis.js, settings.js, style.css).
This commit is contained in:
pewdiepie-archdaemon
2026-06-04 23:27:18 +09:00
parent 041c03bf11
commit 9112861d8e
19 changed files with 1529 additions and 151 deletions
+76 -83
View File
@@ -378,16 +378,12 @@ export const ERROR_PATTERNS = [
message: 'Model architecture too new for installed vLLM/transformers.',
fixes: [
{ label: 'Try --trust-remote-code', action: (panel) => _serveAutoRetry(panel, '--trust-remote-code'), autofix: true },
{ label: 'Update vLLM on server', action: (panel) => {
const taskEl = panel.closest('.cookbook-task');
const task = taskEl ? _loadTasks().find(t => t.sessionId === taskEl.dataset.taskId) : null;
const host = task?.remoteHost || '';
const prefix = _buildEnvPrefix();
const pipCmd = prefix ? prefix + ' pip install -U vllm transformers' : 'pip install -U vllm transformers';
const cmd = host ? _sshCmd(host, pipCmd) : pipCmd;
// Run in tmux so it doesn't timeout
const name = 'update-vllm';
_launchServeTask(name, 'pip-update', cmd);
{ label: 'Update vLLM on server', action: () => {
// Use the venv's python3 by absolute path when configured (SSH non-
// interactive sessions often pick user-site Python over the venv).
const _vp = (_envState.env === 'venv' && _envState.envPath)
? `${_envState.envPath.replace(/\/+$/, '')}/bin/python3` : 'python3';
_launchServeTask('update-vllm', 'pip-update', `${_vp} -m pip install -U vllm transformers`);
}},
],
},
@@ -395,16 +391,10 @@ export const ERROR_PATTERNS = [
pattern: /Either a revision or a version must be specified|transformers\.integrations\.hub_kernels|kernels\/layer/i,
message: 'Transformers/kernels package mismatch.',
fixes: [
{ label: 'Repair kernel package', action: (panel) => {
const taskEl = panel.closest('.cookbook-task');
const task = taskEl ? _loadTasks().find(t => t.sessionId === taskEl.dataset.taskId) : null;
const host = task?.remoteHost || '';
const prefix = _buildEnvPrefix();
const pipCmd = prefix
? prefix + ' python3 -m pip install --user --break-system-packages "kernels<0.15"'
: 'python3 -m pip install --user --break-system-packages "kernels<0.15"';
const cmd = host ? _sshCmd(host, pipCmd) : pipCmd;
_launchServeTask('repair-kernels', 'pip-update', cmd);
{ label: 'Repair kernel package', action: () => {
const _vp = (_envState.env === 'venv' && _envState.envPath)
? `${_envState.envPath.replace(/\/+$/, '')}/bin/python3` : 'python3';
_launchServeTask('repair-kernels', 'pip-update', `${_vp} -m pip install --user --break-system-packages kernels<0.15`);
}},
{ label: 'Open Dependencies', action: () => _openCookbookDependencies('sglang') },
],
@@ -445,14 +435,10 @@ export const ERROR_PATTERNS = [
pattern: /Triton kernels.*Failed to import|cannot import name '\w+' from 'triton_kernels/i,
message: 'Triton kernels version mismatch. Non-fatal warning — model will still run, just without optimized MoE kernels.',
fixes: [
{ label: 'Update triton on server', action: (panel) => {
const taskEl = panel.closest('.cookbook-task');
const task = taskEl ? _loadTasks().find(t => t.sessionId === taskEl.dataset.taskId) : null;
const host = task?.remoteHost || '';
const prefix = _buildEnvPrefix();
const pipCmd = prefix ? prefix + ' pip install -U triton triton-kernels' : 'pip install -U triton triton-kernels';
const cmd = host ? _sshCmd(host, pipCmd) : pipCmd;
_launchServeTask('update-triton', 'pip-update', cmd);
{ label: 'Update triton on server', action: () => {
const _vp = (_envState.env === 'venv' && _envState.envPath)
? `${_envState.envPath.replace(/\/+$/, '')}/bin/python3` : 'python3';
_launchServeTask('update-triton', 'pip-update', `${_vp} -m pip install -U triton triton-kernels`);
}},
],
},
@@ -474,14 +460,56 @@ export const ERROR_PATTERNS = [
pattern: /attention_sink|sliding.window.*not supported|sliding_window.*incompatible/i,
message: 'Model uses attention features unsupported in this vLLM version.',
fixes: [
{ label: 'Update vLLM on server', action: (panel) => {
const taskEl = panel.closest('.cookbook-task');
const task = taskEl ? _loadTasks().find(t => t.sessionId === taskEl.dataset.taskId) : null;
const host = task?.remoteHost || '';
const prefix = _buildEnvPrefix();
const pipCmd = prefix ? prefix + ' pip install -U vllm' : 'pip install -U vllm';
const cmd = host ? _sshCmd(host, pipCmd) : pipCmd;
_launchServeTask('update-vllm', 'pip-update', cmd);
{ label: 'Update vLLM on server', action: () => {
const _vp = (_envState.env === 'venv' && _envState.envPath)
? `${_envState.envPath.replace(/\/+$/, '')}/bin/python3` : 'python3';
_launchServeTask('update-vllm', 'pip-update', `${_vp} -m pip install -U vllm`);
}},
],
},
{
// FlashInfer JIT-compiles attention kernels for the host GPU on first
// use. If the system /usr/bin/nvcc is older than CUDA 11.8 it can't
// target sm_89/sm_90 (Ada/Hopper), and the engine workers die before
// they can report a useful traceback. Two quick paths out: pick a
// non-flashinfer attention backend, or set CUDACXX to a newer nvcc
// (vLLM installs nvidia-cuda-nvcc into the venv — point at that).
pattern: /nvcc fatal\s+:\s+Unsupported gpu architecture 'compute_\d+'/i,
message: 'FlashInfer is JIT-compiling sampling kernels with an nvcc too old for this GPU (no sm_89 / sm_90 support — pre-CUDA 11.8). Changing the attention backend does not help — flashinfer JITs the SAMPLER too. The clean fix is to set VLLM_USE_FLASHINFER_SAMPLER=0 so vLLM uses its native sampler instead.',
suggestion: 'Suggested action: relaunch with VLLM_USE_FLASHINFER_SAMPLER=0 prepended. (Confirmed on the QuantTrio/Qwen3.5 model card as the canonical workaround.)',
fixes: [
{ label: 'Retry with VLLM_USE_FLASHINFER_SAMPLER=0', action: (panel) => _serveAutoRetryReplace(panel, '', 'VLLM_USE_FLASHINFER_SAMPLER=0 ', { prepend: true }) },
{ label: 'Uninstall flashinfer-python', action: () => {
// Hard fallback: vLLM 0.22 reaches into flashinfer for sampling kernels
// even with VLLM_USE_FLASHINFER_SAMPLER=0 in some configs. Removing
// the package forces it onto the native sampler.
const _vp = (_envState.env === 'venv' && _envState.envPath)
? `${_envState.envPath.replace(/\/+$/, '')}/bin/python3` : 'python3';
_launchServeTask('uninstall-flashinfer', 'pip-update', `${_vp} -m pip uninstall flashinfer-python -y`);
}},
{ label: 'Edit serve', action: (panel) => _openServeEditFromDiagnosis(panel) },
],
},
{
// vLLM <-> torch ABI mismatch: vLLM imports torch.library helpers
// (`infer_schema`, `register_fake`, etc.) that only exist on newer torch
// versions. When the installed torch is older, the import fails before
// any server code runs. Fix is to reinstall vllm (which pulls a matching
// torch) or upgrade torch directly.
pattern: /ImportError: cannot import name '[^']+' from 'torch(\.\w+)+'/i,
message: 'vLLM was built against a newer torch than what is installed. Reinstall vLLM so pip pulls a compatible torch (or upgrade torch directly).',
fixes: [
{ label: 'Reinstall vLLM (pulls matching torch)', action: () => {
// Absolute path to the venv's python3 — bare `python3` lands in the
// wrong site-packages over SSH when ~/.local/bin precedes the venv.
const _vp = (_envState.env === 'venv' && _envState.envPath)
? `${_envState.envPath.replace(/\/+$/, '')}/bin/python3` : 'python3';
_launchServeTask('reinstall-vllm', 'pip-reinstall', `${_vp} -m pip install --force-reinstall vllm`);
}},
{ label: 'Upgrade torch only', action: () => {
const _vp = (_envState.env === 'venv' && _envState.envPath)
? `${_envState.envPath.replace(/\/+$/, '')}/bin/python3` : 'python3';
_launchServeTask('upgrade-torch', 'pip-update', `${_vp} -m pip install -U torch`);
}},
],
},
@@ -607,59 +635,24 @@ export function _showDiagnosis(panel, diagnosis, sourceText) {
};
if (fixes.length) {
// Always render fixes as inline buttons. The old "Actions ▾" dropdown
// (for >3 fixes) was broken — the menu wouldn't open in some panels and
// hid useful actions behind a non-working affordance. Inline buttons wrap
// naturally in `.cookbook-diag-fixes` (flex-wrap) so a long list reflows
// onto multiple rows instead of getting collapsed.
const row = document.createElement('div');
row.className = 'cookbook-diag-fixes';
if (fixes.length <= 3) {
for (const fix of fixes) {
const btn = document.createElement('button');
btn.className = 'cookbook-btn cookbook-diag-btn';
btn.type = 'button';
btn.innerHTML = _diagFixIcon(fix.label) + '<span class="cookbook-diag-btn-label">' + _diagEsc(fix.label) + '</span>';
btn.addEventListener('click', (e) => {
e.stopPropagation();
runFix(fix, btn);
});
row.appendChild(btn);
}
body.appendChild(row);
return;
}
const wrap = document.createElement('div');
wrap.className = 'cookbook-diag-actions';
const trigger = document.createElement('button');
trigger.className = 'cookbook-btn cookbook-diag-action-trigger';
trigger.type = 'button';
trigger.textContent = 'Actions';
trigger.appendChild(document.createTextNode(' ▾'));
wrap.appendChild(trigger);
const menu = document.createElement('div');
menu.className = 'dropdown cookbook-diag-menu hidden';
for (const fix of fixes) {
const item = document.createElement('button');
item.type = 'button';
item.innerHTML = _diagFixIcon(fix.label) + '<span class="cookbook-diag-btn-label">' + _diagEsc(fix.label) + '</span>';
item.addEventListener('click', async (e) => {
const btn = document.createElement('button');
btn.className = 'cookbook-btn cookbook-diag-btn';
btn.type = 'button';
btn.innerHTML = _diagFixIcon(fix.label) + '<span class="cookbook-diag-btn-label">' + _diagEsc(fix.label) + '</span>';
btn.addEventListener('click', (e) => {
e.stopPropagation();
if (item.dataset.busy || trigger.dataset.busy) return;
item.dataset.busy = '1';
await runFix(fix, trigger, fix.label, () => menu.classList.add('hidden'), () => delete item.dataset.busy);
runFix(fix, btn);
});
menu.appendChild(item);
row.appendChild(btn);
}
wrap.appendChild(menu);
trigger.addEventListener('click', (e) => {
e.stopPropagation();
if (trigger.dataset.busy) return;
document.querySelectorAll('.cookbook-diag-menu').forEach(m => {
if (m !== menu) m.classList.add('hidden');
});
menu.classList.toggle('hidden');
});
row.appendChild(wrap);
body.appendChild(row);
}
}