mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 17:55:26 -04:00
Odysseus v1.0
This commit is contained in:
@@ -0,0 +1,644 @@
|
||||
"""Teacher-escalation loop for self-hosted models in agent mode.
|
||||
|
||||
When the student (self-hosted) model finishes a turn, evaluate whether
|
||||
it succeeded. If it didn't, escalate to a SOTA teacher endpoint, which
|
||||
both produces a corrective reply AND writes a SKILL.md procedure so
|
||||
the student can do it itself next time.
|
||||
|
||||
Trigger conditions (ALL must hold):
|
||||
1. Agent mode (not chat mode).
|
||||
2. The student's endpoint is self-hosted (not a known SOTA cloud API).
|
||||
3. `teacher_model` setting is configured.
|
||||
|
||||
Detection tiers:
|
||||
Tier 1: regex on tool outputs + agent reply. Catches the "Unknown
|
||||
action 'switch'" / "I don't have a tool" / "Could you tell
|
||||
me which one?" type failures. Free, instant.
|
||||
Tier 2 (TODO): LLM self-eval for ambiguous cases. Not in first cut.
|
||||
|
||||
If Tier 1 fires FAILURE, call the teacher with the full failed
|
||||
context. Skill is only saved if the teacher's response itself passes
|
||||
the same regex eval — no point persisting a procedure the teacher
|
||||
itself wasn't confident about.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from urllib.parse import urlparse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Hosts considered SOTA / paid APIs — if the student's endpoint URL
|
||||
# hits one of these, the loop is OFF (the user is already paying for
|
||||
# a top-tier model; no need to escalate).
|
||||
_SOTA_HOSTS = frozenset({
|
||||
"api.openai.com", "api.anthropic.com",
|
||||
"api.deepseek.com", "deepseek.com",
|
||||
"api.mistral.ai", "api.cohere.com",
|
||||
"api.together.xyz", "api.fireworks.ai",
|
||||
"api.perplexity.ai", "api.x.ai",
|
||||
"generativelanguage.googleapis.com", "api.groq.com",
|
||||
})
|
||||
|
||||
|
||||
def is_self_hosted(endpoint_url: str) -> bool:
|
||||
"""True if the endpoint is NOT a known SOTA cloud API.
|
||||
|
||||
Conservative — anything we don't positively recognise as SOTA is
|
||||
treated as self-hosted. Better to over-escalate than to silently
|
||||
add latency to a paid-API user's chat.
|
||||
"""
|
||||
if not endpoint_url:
|
||||
return True
|
||||
try:
|
||||
host = (urlparse(endpoint_url).hostname or "").lower()
|
||||
except Exception:
|
||||
return True
|
||||
if not host:
|
||||
return True
|
||||
return host not in _SOTA_HOSTS
|
||||
|
||||
|
||||
# ── Tier 1: regex-based failure detection ──────────────────────────
|
||||
|
||||
# Patterns that show up in tool RESULTS when the call failed.
|
||||
_TOOL_ERROR_PATTERNS = [
|
||||
re.compile(r"^Unknown action\b", re.IGNORECASE),
|
||||
re.compile(r"^Failed to\b", re.IGNORECASE),
|
||||
re.compile(r"\bnot found\b", re.IGNORECASE),
|
||||
re.compile(r"^Invalid\b", re.IGNORECASE),
|
||||
re.compile(r"\berror:\s", re.IGNORECASE),
|
||||
]
|
||||
|
||||
# Patterns that show up in the AGENT'S REPLY when it gave up or
|
||||
# couldn't pick a path. Different list — these aren't tool errors,
|
||||
# they're the model verbally admitting it doesn't know.
|
||||
_REPLY_GIVE_UP_PATTERNS = [
|
||||
re.compile(r"\bI don't have (?:a )?tool\b", re.IGNORECASE),
|
||||
re.compile(r"\bI can(?:'t|not) (?:do|find|figure)\b", re.IGNORECASE),
|
||||
re.compile(r"\bI'?m not sure (?:which|how|what)\b", re.IGNORECASE),
|
||||
re.compile(r"\b[Cc]ould you (?:tell me|specify|clarify)\b"),
|
||||
re.compile(r"\bunable to (?:open|find|switch|complete)\b", re.IGNORECASE),
|
||||
re.compile(r"\bdoesn'?t (?:exist|appear to be|seem to)\b", re.IGNORECASE),
|
||||
]
|
||||
|
||||
|
||||
def evaluate_turn_regex(
|
||||
tool_results: List[Dict[str, Any]],
|
||||
agent_reply: str,
|
||||
) -> Tuple[str, Optional[str]]:
|
||||
"""Cheap regex check on a finished turn.
|
||||
|
||||
Returns ("failure", reason) on a detected problem, ("ok", None)
|
||||
otherwise. The caller decides whether to short-circuit or fall
|
||||
back to an LLM self-eval.
|
||||
"""
|
||||
# Any tool returned an explicit error field?
|
||||
for r in tool_results or []:
|
||||
if not isinstance(r, dict):
|
||||
continue
|
||||
if r.get("error"):
|
||||
return ("failure", f"tool returned error: {r.get('error')!r}")
|
||||
text = r.get("results") or r.get("output") or r.get("response") or ""
|
||||
if isinstance(text, str):
|
||||
for pat in _TOOL_ERROR_PATTERNS:
|
||||
if pat.search(text):
|
||||
snippet = text[:120].strip()
|
||||
return ("failure", f"tool result matched error pattern {pat.pattern!r}: {snippet!r}")
|
||||
|
||||
# Agent verbally gave up?
|
||||
if agent_reply:
|
||||
for pat in _REPLY_GIVE_UP_PATTERNS:
|
||||
m = pat.search(agent_reply)
|
||||
if m:
|
||||
return ("failure", f"agent reply matched give-up pattern {pat.pattern!r}")
|
||||
|
||||
return ("ok", None)
|
||||
|
||||
|
||||
# ── Teacher escalation ────────────────────────────────────────────
|
||||
|
||||
# Prompt template the teacher gets. The teacher is expected to (a)
|
||||
# describe how it would solve the task, and (b) emit a JSON skill
|
||||
# blob the caller can pass straight to manage_skills(add).
|
||||
_TEACHER_ESCALATION_PROMPT = """\
|
||||
You are the senior teacher model for an AI agent that runs on a smaller, \
|
||||
self-hosted student model. The student just failed at a task. Your job \
|
||||
is to write a permanent SKILL.md procedure so the student succeeds next \
|
||||
time.
|
||||
|
||||
The student's tools include (non-exhaustive): bash, python, web_search, \
|
||||
read_file, write_file, create_document, edit_document, manage_session \
|
||||
(list/switch/rename/archive/delete/important/truncate/fork), \
|
||||
list_sessions, manage_memory, manage_notes, manage_calendar, \
|
||||
send_email, list_emails, manage_settings, manage_skills, \
|
||||
manage_tasks, ui_control. The student also understands the markdown \
|
||||
anchor convention [Name](#session-<id>) / [Title](#document-<id>) for \
|
||||
clickable jump links.
|
||||
|
||||
THE TASK
|
||||
{user_request}
|
||||
|
||||
WHY THE STUDENT FAILED
|
||||
{failure_reason}
|
||||
|
||||
WHAT THE STUDENT TRIED (tool calls + replies in order)
|
||||
{trace}
|
||||
|
||||
YOUR JOB
|
||||
Respond with TWO sections, in this exact order:
|
||||
|
||||
1. A short paragraph explaining the correct procedure in plain English.
|
||||
|
||||
2. A fenced JSON code block matching this schema for manage_skills(add):
|
||||
|
||||
```json
|
||||
{{
|
||||
"action": "add",
|
||||
"name": "<short-kebab-case-slug>",
|
||||
"description": "<one-line summary of what this skill teaches>",
|
||||
"when_to_use": "<the trigger pattern: e.g. 'When the user says \\"open my X chat\\"'>",
|
||||
"procedure": [
|
||||
"Step 1: ...",
|
||||
"Step 2: ...",
|
||||
"Step 3: ..."
|
||||
],
|
||||
"pitfalls": ["..."],
|
||||
"verification": ["..."],
|
||||
"category": "<single category word>",
|
||||
"status": "draft",
|
||||
"confidence": 0.8,
|
||||
"source": "teacher-escalation"
|
||||
}}
|
||||
```
|
||||
|
||||
The procedure steps should reference SPECIFIC tool names and argument \
|
||||
shapes the student can copy. Be concrete — not "use the right tool", \
|
||||
but "call list_sessions, find the row whose name contains <X>, then \
|
||||
respond with `[Name](#session-<id>)`".
|
||||
|
||||
**PORTABILITY — CRITICAL.** Skills are shared across users. Do NOT \
|
||||
hardcode anything user-specific into the procedure:
|
||||
- NO hostnames or IPs (e.g. `gpu-box`, `user@192.0.2.10`) — \
|
||||
use placeholders like `<gpu_host>` or call `list_serve_presets` / \
|
||||
`list_cached_models` to discover them at runtime.
|
||||
- NO absolute filesystem paths tied to one machine (e.g. \
|
||||
`/home/<user>/vllm-env/bin/vllm`) — say "use the user's vLLM \
|
||||
install" or call the wrapped tool that picks the right binary.
|
||||
- NO model repo IDs the user happened to pick this time unless the \
|
||||
skill is specifically about THAT model — generalise to "the model \
|
||||
the user named, looked up via list_cached_models / search_hf_models".
|
||||
- NO tmux session names invented in the failed trace — these are \
|
||||
one-shot artefacts. The named tool (`serve_model`, `stop_served_model`) \
|
||||
owns session naming.
|
||||
- NO direct `ssh <host> 'tmux ...'` shell incantations even if that's \
|
||||
what the failed trace did — those bypass the cookbook's state \
|
||||
tracker. The skill must use `serve_model` / `stop_served_model` \
|
||||
/ `serve_preset`, not bash.
|
||||
|
||||
If you do NOT believe the task is solvable with the available tools, \
|
||||
output the explanation paragraph but OMIT the JSON block entirely. \
|
||||
A bad procedure is worse than no procedure — only emit the JSON if \
|
||||
you are confident the steps will actually work AND the steps are \
|
||||
portable across users / hosts.
|
||||
"""
|
||||
|
||||
|
||||
async def _call_teacher(teacher_model_spec: str, prompt: str) -> Optional[str]:
|
||||
"""Call the configured teacher endpoint with the escalation prompt."""
|
||||
from src.llm_core import llm_call_async
|
||||
from src.ai_interaction import _resolve_model, _TEACHER_SYSTEM_PROMPT
|
||||
try:
|
||||
url, model, headers = _resolve_model(teacher_model_spec)
|
||||
except Exception as e:
|
||||
logger.warning(f"teacher endpoint not resolvable ({teacher_model_spec!r}): {e}")
|
||||
return None
|
||||
try:
|
||||
return await llm_call_async(
|
||||
url, model,
|
||||
[
|
||||
{"role": "system", "content": _TEACHER_SYSTEM_PROMPT},
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
headers=headers,
|
||||
timeout=120,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"teacher call failed: {e}")
|
||||
return None
|
||||
|
||||
|
||||
# Prompt used AFTER the teacher itself ran and succeeded — distill the
|
||||
# successful trace into a reusable SKILL.md. Different framing from the
|
||||
# original "you have to plan it" prompt because here the teacher has
|
||||
# already proven the steps work.
|
||||
_TEACHER_SKILL_FROM_TRACE_PROMPT = """\
|
||||
You are distilling a successful tool-use trace into a permanent \
|
||||
SKILL.md procedure so a smaller student model can reproduce it.
|
||||
|
||||
ORIGINAL USER REQUEST
|
||||
{user_request}
|
||||
|
||||
WHY THE STUDENT FAILED (you, the teacher, just succeeded where it didn't)
|
||||
{failure_reason}
|
||||
|
||||
YOUR SUCCESSFUL TRACE (tool calls + your final reply, in order)
|
||||
{trace}
|
||||
|
||||
Output ONE fenced JSON code block matching this schema and nothing else:
|
||||
|
||||
```json
|
||||
{{
|
||||
"action": "add",
|
||||
"name": "<short-kebab-case-slug>",
|
||||
"description": "<one-line summary of what this skill teaches>",
|
||||
"when_to_use": "<the trigger pattern: 'When the user says X'>",
|
||||
"procedure": [
|
||||
"Step 1: <specific tool name and arg shape>",
|
||||
"Step 2: ...",
|
||||
"Step 3: ..."
|
||||
],
|
||||
"pitfalls": ["..."],
|
||||
"verification": ["..."],
|
||||
"category": "<single category word>",
|
||||
"status": "draft",
|
||||
"confidence": 0.8,
|
||||
"source": "teacher-escalation"
|
||||
}}
|
||||
```
|
||||
|
||||
The procedure must be the steps that ACTUALLY worked in the trace, \
|
||||
generalised away from this specific request. Each step references a \
|
||||
SPECIFIC tool name and argument shape the student can copy.
|
||||
|
||||
**PORTABILITY — CRITICAL.** Skills are shared across users. Strip every \
|
||||
user-specific token from your trace before writing the procedure:
|
||||
- Replace hostnames/IPs with placeholders (`<gpu_host>` etc.) or \
|
||||
instruct the student to discover them via `list_serve_presets` / \
|
||||
`list_cached_models` at runtime.
|
||||
- Replace user-specific paths (`/home/<user>/...`) with the wrapped \
|
||||
tool that picks the right binary on whatever machine runs the skill.
|
||||
- Don't bake in the specific model repo_id you happened to use unless \
|
||||
the skill is about that exact model.
|
||||
- Reference the high-level tools (`serve_model`, `stop_served_model`, \
|
||||
`serve_preset`, `list_cached_models`, `search_hf_models`, etc.) \
|
||||
rather than `ssh <host> 'tmux new-session ... vllm serve ...'` \
|
||||
shell incantations — even if THAT'S what worked in the trace. Raw \
|
||||
shell launches bypass the cookbook tracker and don't reproduce on \
|
||||
another user's box.
|
||||
|
||||
If the trace did NOT genuinely solve the user's problem (e.g. you also \
|
||||
gave up, or the underlying issue was external infrastructure that no \
|
||||
procedure can fix), output the single token NO_SKILL and nothing else.
|
||||
"""
|
||||
|
||||
|
||||
def _extract_skill_json(teacher_response: str) -> Optional[Dict[str, Any]]:
|
||||
"""Find the first ```json {...}``` block and parse it.
|
||||
|
||||
Returns None if no block found or JSON is malformed — both
|
||||
treated as "teacher declined to write a skill", per the prompt
|
||||
contract.
|
||||
"""
|
||||
if not teacher_response:
|
||||
return None
|
||||
import json
|
||||
m = re.search(r"```(?:json)?\s*\n(\{[\s\S]*?\})\s*\n```", teacher_response)
|
||||
if not m:
|
||||
return None
|
||||
try:
|
||||
data = json.loads(m.group(1))
|
||||
if not isinstance(data, dict):
|
||||
return None
|
||||
return data
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _format_trace(tool_results: List[Dict[str, Any]], agent_reply: str) -> str:
|
||||
"""Render the turn's tool calls + final reply for the teacher prompt."""
|
||||
lines = []
|
||||
for i, r in enumerate(tool_results or []):
|
||||
if not isinstance(r, dict):
|
||||
continue
|
||||
tool = r.get("tool") or r.get("action") or "(unknown tool)"
|
||||
if r.get("error"):
|
||||
lines.append(f"- {tool}: ERROR {r['error']!r}")
|
||||
continue
|
||||
out = r.get("results") or r.get("output") or r.get("response") or ""
|
||||
if isinstance(out, str) and len(out) > 400:
|
||||
out = out[:400] + "..."
|
||||
lines.append(f"- {tool}: {out!r}")
|
||||
trace = "\n".join(lines) if lines else "(no tools called)"
|
||||
if agent_reply:
|
||||
snippet = agent_reply if len(agent_reply) < 800 else agent_reply[:800] + "..."
|
||||
trace += f"\n\nFinal reply: {snippet!r}"
|
||||
return trace
|
||||
|
||||
|
||||
async def escalate_and_learn(
|
||||
user_request: str,
|
||||
tool_results: List[Dict[str, Any]],
|
||||
agent_reply: str,
|
||||
failure_reason: str,
|
||||
owner: Optional[str] = None,
|
||||
) -> Optional[str]:
|
||||
"""Call the teacher, evaluate ITS attempt, save a skill on success.
|
||||
|
||||
Returns the saved skill name (or None if the teacher couldn't
|
||||
write one). Logs but doesn't raise — escalation is best-effort.
|
||||
"""
|
||||
from src.settings import get_setting
|
||||
teacher_spec = (get_setting("teacher_model", "") or "").strip()
|
||||
if not teacher_spec:
|
||||
return None
|
||||
|
||||
prompt = _TEACHER_ESCALATION_PROMPT.format(
|
||||
user_request=user_request or "(no user request captured)",
|
||||
failure_reason=failure_reason or "(failure reason not captured)",
|
||||
trace=_format_trace(tool_results, agent_reply),
|
||||
)
|
||||
response = await _call_teacher(teacher_spec, prompt)
|
||||
if not response:
|
||||
return None
|
||||
|
||||
skill = _extract_skill_json(response)
|
||||
if not skill:
|
||||
# Teacher chose not to write a skill — see prompt contract.
|
||||
logger.info("teacher declined to write a skill for this failure")
|
||||
return None
|
||||
|
||||
# Same regex eval applied to the teacher's response — if the
|
||||
# teacher itself sounded uncertain ("I don't have a tool"), drop
|
||||
# the skill rather than persist a sketchy one.
|
||||
status, reason = evaluate_turn_regex([], response)
|
||||
if status == "failure":
|
||||
logger.info(f"teacher response failed eval, skipping skill save: {reason}")
|
||||
return None
|
||||
|
||||
# Tag the skill with the escalation source for auditability.
|
||||
skill.setdefault("source", "teacher-escalation")
|
||||
skill.setdefault("teacher_model", teacher_spec)
|
||||
# Force action=add regardless of what the teacher wrote.
|
||||
skill["action"] = "add"
|
||||
|
||||
import json
|
||||
from src.tool_implementations import do_manage_skills
|
||||
try:
|
||||
result = await do_manage_skills(json.dumps(skill), owner=owner)
|
||||
if isinstance(result, dict) and not result.get("error"):
|
||||
logger.info(f"teacher wrote skill: {skill.get('name')}")
|
||||
return skill.get("name")
|
||||
logger.warning(f"skill save failed: {result}")
|
||||
except Exception as e:
|
||||
logger.warning(f"skill save raised: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def maybe_escalate(
|
||||
*,
|
||||
student_endpoint_url: str,
|
||||
mode: str,
|
||||
user_request: str,
|
||||
tool_results: List[Dict[str, Any]],
|
||||
agent_reply: str,
|
||||
owner: Optional[str] = None,
|
||||
) -> Optional[asyncio.Task]:
|
||||
"""Fire-and-forget entrypoint called by the agent loop end-of-turn.
|
||||
|
||||
Returns the created asyncio.Task (so tests can await it) or None
|
||||
if escalation didn't fire. Safe to call unconditionally — does
|
||||
its own gating.
|
||||
"""
|
||||
# Gate 1: only in agent mode.
|
||||
if mode != "agent":
|
||||
return None
|
||||
|
||||
# Gate 2: feature is enabled AND a teacher endpoint is configured.
|
||||
# (No self-hosted-only gate — users run cheap cloud students like
|
||||
# deepseek-v4-flash with a SOTA teacher; the toggle is the control.)
|
||||
try:
|
||||
from src.settings import get_setting
|
||||
if not get_setting("teacher_enabled", False):
|
||||
return None
|
||||
if not (get_setting("teacher_model", "") or "").strip():
|
||||
return None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
# Gate 3: regex eval — only escalate on detected failure.
|
||||
status, reason = evaluate_turn_regex(tool_results, agent_reply)
|
||||
if status != "failure":
|
||||
return None
|
||||
|
||||
# Fire async — don't block the user's chat.
|
||||
return asyncio.create_task(
|
||||
escalate_and_learn(user_request, tool_results, agent_reply, reason or "", owner),
|
||||
name="teacher_escalation",
|
||||
)
|
||||
|
||||
|
||||
# ── Inline teacher takeover (visible in chat stream) ───────────────
|
||||
|
||||
async def run_teacher_inline(
|
||||
*,
|
||||
student_endpoint_url: str,
|
||||
student_messages: List[Dict[str, Any]],
|
||||
student_tool_events: List[Dict[str, Any]],
|
||||
student_reply: str,
|
||||
owner: Optional[str] = None,
|
||||
):
|
||||
"""Async generator. Yields SSE event strings.
|
||||
|
||||
If escalation gates pass, runs the teacher inside the same chat
|
||||
stream — the user sees the teacher's tool calls and reply live.
|
||||
Saves a skill only if the teacher actually succeeded.
|
||||
|
||||
Gates (all must hold): agent mode (caller guarantees), teacher
|
||||
toggle on, teacher_model configured, Tier 1 regex flags failure.
|
||||
"""
|
||||
import json
|
||||
from src.settings import get_setting
|
||||
|
||||
# Gates
|
||||
try:
|
||||
if not get_setting("teacher_enabled", False):
|
||||
return
|
||||
teacher_spec = (get_setting("teacher_model", "") or "").strip()
|
||||
if not teacher_spec:
|
||||
return
|
||||
except Exception:
|
||||
return
|
||||
|
||||
status, reason = evaluate_turn_regex(student_tool_events, student_reply)
|
||||
if status != "failure":
|
||||
return
|
||||
|
||||
# Extract original user request — last user-role message
|
||||
user_request = ""
|
||||
for m in reversed(student_messages):
|
||||
if m.get("role") != "user":
|
||||
continue
|
||||
c = m.get("content")
|
||||
if isinstance(c, str):
|
||||
user_request = c
|
||||
elif isinstance(c, list):
|
||||
user_request = next(
|
||||
(p.get("text", "") for p in c
|
||||
if isinstance(p, dict) and p.get("type") == "text"),
|
||||
"",
|
||||
)
|
||||
break
|
||||
|
||||
# Resolve teacher endpoint
|
||||
try:
|
||||
from src.ai_interaction import _resolve_model
|
||||
teacher_url, teacher_model, teacher_headers = _resolve_model(teacher_spec)
|
||||
except Exception as e:
|
||||
logger.warning(f"teacher endpoint not resolvable ({teacher_spec!r}): {e}")
|
||||
yield (
|
||||
'data: ' + json.dumps({
|
||||
"type": "escalation_failed",
|
||||
"reason": f"teacher endpoint not resolvable: {e}",
|
||||
}) + '\n\n'
|
||||
)
|
||||
return
|
||||
|
||||
# Announce takeover so the frontend can render a banner
|
||||
yield (
|
||||
'data: ' + json.dumps({
|
||||
"type": "teacher_takeover",
|
||||
"teacher_model": teacher_spec,
|
||||
"student_failure": reason,
|
||||
}) + '\n\n'
|
||||
)
|
||||
|
||||
# Build teacher messages. Strip the student's leading system
|
||||
# prompts (the teacher's run will build its own fresh) but keep the
|
||||
# user/assistant/tool history so the teacher sees what the student
|
||||
# tried. The appended note leads with the user request text so RAG
|
||||
# tool selection picks the right tools for the teacher's turn.
|
||||
history = [m for m in student_messages if m.get("role") != "system"]
|
||||
note_content = (
|
||||
f"{user_request or '(no user request captured)'}\n\n"
|
||||
"[teacher-takeover] The previous attempt by the student model "
|
||||
f"failed.\nFailure signal: {reason}\n"
|
||||
"Please solve the request above using your own tools. The user "
|
||||
"is watching your tool calls live."
|
||||
)
|
||||
teacher_messages = history + [{"role": "user", "content": note_content}]
|
||||
|
||||
# Recursively invoke the agent loop with the teacher's params.
|
||||
# The _is_teacher_run flag prevents infinite recursion (the teacher
|
||||
# run will skip its own escalation hook).
|
||||
from src.agent_loop import stream_agent_loop
|
||||
captured_tool_events: List[Dict[str, Any]] = []
|
||||
captured_text_parts: List[str] = []
|
||||
|
||||
async for evt_str in stream_agent_loop(
|
||||
endpoint_url=teacher_url,
|
||||
model=teacher_model,
|
||||
messages=teacher_messages,
|
||||
headers=teacher_headers,
|
||||
owner=owner,
|
||||
_is_teacher_run=True,
|
||||
):
|
||||
# Swallow teacher's own [DONE] — outer loop emits the real one
|
||||
if "[DONE]" in evt_str:
|
||||
continue
|
||||
if evt_str.startswith("data: "):
|
||||
try:
|
||||
payload = json.loads(evt_str[6:].strip())
|
||||
except Exception:
|
||||
yield evt_str
|
||||
continue
|
||||
if isinstance(payload, dict):
|
||||
payload["teacher"] = True
|
||||
typ = payload.get("type")
|
||||
if typ == "tool_output":
|
||||
captured_tool_events.append({
|
||||
"tool": payload.get("tool"),
|
||||
"command": payload.get("command"),
|
||||
"output": payload.get("output"),
|
||||
"exit_code": payload.get("exit_code"),
|
||||
})
|
||||
if "delta" in payload and isinstance(payload["delta"], str):
|
||||
captured_text_parts.append(payload["delta"])
|
||||
yield 'data: ' + json.dumps(payload) + '\n\n'
|
||||
continue
|
||||
yield evt_str
|
||||
|
||||
teacher_text = "".join(captured_text_parts).strip()
|
||||
t_status, t_reason = evaluate_turn_regex(captured_tool_events, teacher_text)
|
||||
if t_status == "failure":
|
||||
logger.info(f"teacher also failed: {t_reason}")
|
||||
yield (
|
||||
'data: ' + json.dumps({
|
||||
"type": "escalation_failed",
|
||||
"reason": t_reason,
|
||||
}) + '\n\n'
|
||||
)
|
||||
return
|
||||
|
||||
# Teacher succeeded — distill its successful trace into a skill
|
||||
prompt = _TEACHER_SKILL_FROM_TRACE_PROMPT.format(
|
||||
user_request=user_request or "(no user request captured)",
|
||||
failure_reason=reason or "",
|
||||
trace=_format_trace(captured_tool_events, teacher_text),
|
||||
)
|
||||
skill_response = await _call_teacher(teacher_spec, prompt)
|
||||
if skill_response and "NO_SKILL" in skill_response and not _extract_skill_json(skill_response):
|
||||
logger.info("teacher declined to write a skill (NO_SKILL)")
|
||||
yield (
|
||||
'data: ' + json.dumps({
|
||||
"type": "skill_save_failed",
|
||||
"reason": "teacher said NO_SKILL (problem not reproducible)",
|
||||
}) + '\n\n'
|
||||
)
|
||||
return
|
||||
skill = _extract_skill_json(skill_response) if skill_response else None
|
||||
if not skill:
|
||||
yield (
|
||||
'data: ' + json.dumps({
|
||||
"type": "skill_save_failed",
|
||||
"reason": "teacher did not emit valid skill JSON",
|
||||
}) + '\n\n'
|
||||
)
|
||||
return
|
||||
|
||||
skill["action"] = "add"
|
||||
skill.setdefault("source", "teacher-escalation")
|
||||
skill.setdefault("teacher_model", teacher_spec)
|
||||
|
||||
import json as _json
|
||||
from src.tool_implementations import do_manage_skills
|
||||
try:
|
||||
result = await do_manage_skills(_json.dumps(skill), owner=owner)
|
||||
if isinstance(result, dict) and not result.get("error"):
|
||||
logger.info(f"teacher succeeded; saved skill: {skill.get('name')}")
|
||||
yield (
|
||||
'data: ' + json.dumps({
|
||||
"type": "skill_saved",
|
||||
"name": skill.get("name"),
|
||||
"category": skill.get("category", "general"),
|
||||
}) + '\n\n'
|
||||
)
|
||||
else:
|
||||
yield (
|
||||
'data: ' + json.dumps({
|
||||
"type": "skill_save_failed",
|
||||
"reason": str(result),
|
||||
}) + '\n\n'
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"skill save raised: {e}")
|
||||
yield (
|
||||
'data: ' + json.dumps({
|
||||
"type": "skill_save_failed",
|
||||
"reason": str(e),
|
||||
}) + '\n\n'
|
||||
)
|
||||
Reference in New Issue
Block a user