mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 09:45:24 -04:00
73315e6ddc
* fix(skill-extractor): walk all brace candidates so stray braces in prose do not swallow valid JSON The extractor sliced from the FIRST brace to the LAST brace to recover JSON embedded in surrounding commentary. When the model emits stray braces before the JSON object, the slice produces invalid JSON, json.loads raises, and the exception is swallowed -- the skill is silently lost. Fix: walk each brace candidate left-to-right and attempt json.loads on each slice. The first candidate that parses successfully wins. If none parse, json.loads on the original text raises and the existing JSONDecodeError handler logs and returns None as before. Tested locally -- 8/8 tests passed: tests/test_extract_skill_json_nonstring.py (2 passed) tests/test_skill_extractor_rows.py (1 passed) tests/test_search_content_extraction_parity.py (2 passed) tests/test_deep_research_search_error.py (3 passed) Closes #2199 * test(skill-extractor): add focused repro for stray-brace JSON recovery * test(skill-extractor): add regression test for leading invalid-brace fragment Addresses the remaining edge case from review: a response that *starts* with a brace but the leading fragment isn't valid JSON (e.g. '{not json}\n{"title": "Valid later", ...}') still needs to recover the valid skill object that follows. _extract_json_object (already on dev) handles this correctly — it tries the whole de-fenced string first, then walks each '{' candidate left-to- right regardless of whether the response begins with '{', so the leading invalid fragment no longer short-circuits recovery of the real object. Updates the comment at the call site to call this out explicitly and adds a regression test covering exactly the scenario described in review.
275 lines
10 KiB
Python
275 lines
10 KiB
Python
"""
|
|
skill_extractor.py
|
|
|
|
Background auto-extraction of skills from complex agent runs.
|
|
When the agent takes >= 2 rounds or >= 2 tool calls to complete a task,
|
|
we ask the LLM to distill the approach into a reusable skill.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from typing import Optional
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
SKILL_EXTRACT_PROMPT = (
|
|
"You are analyzing an AI agent's work session. The agent took {rounds} rounds "
|
|
"and {tool_count} tool calls to complete the task.\n\n"
|
|
"Extract a reusable 'skill' ONLY IF the session contains a concrete, "
|
|
"repeatable procedure the agent could follow to solve a similar problem "
|
|
"ON THE COMPUTER next time (e.g. a sequence of shell commands, code, file "
|
|
"edits, API calls, or tool usage).\n\n"
|
|
"Return null (the bare word, no JSON) when the session is NOT a reusable "
|
|
"computer procedure, including:\n"
|
|
"- The real work happened OUTSIDE the computer (the user did something "
|
|
"physically, in person, on another device, or by hand) and the agent only "
|
|
"discussed or advised it.\n"
|
|
"- A one-off, personal, or context-specific task that won't recur "
|
|
"(personal errands, a specific person/place/date, casual conversation).\n"
|
|
"- A pure question/answer or explanation with no transferable method.\n"
|
|
"- The agent failed, gave up, or the approach is not worth repeating.\n\n"
|
|
"When (and only when) a genuine reusable procedure exists, return a JSON "
|
|
"object with:\n"
|
|
'- "title": short name (under 10 words)\n'
|
|
'- "problem": what was the challenge (1-2 sentences)\n'
|
|
'- "solution": what worked (1-2 sentences)\n'
|
|
'- "steps": array of step-by-step instructions (3-7 short steps)\n'
|
|
'- "tags": array of relevant keywords (3-5 tags)\n'
|
|
'- "confidence": 0.0-1.0 how reliable AND reusable this procedure is\n\n'
|
|
"Be conservative: if in doubt, return null.\n"
|
|
"Return ONLY valid JSON (or the bare word null), no markdown fences."
|
|
)
|
|
|
|
# Skills the model is unsure about (or that read as one-offs) add clutter —
|
|
# drop anything below this confidence.
|
|
MIN_CONFIDENCE = 0.6
|
|
|
|
# How many recent messages to include
|
|
CONTEXT_WINDOW = 12
|
|
|
|
|
|
def _skill_dicts(skills):
|
|
for skill in skills or []:
|
|
if isinstance(skill, dict):
|
|
yield skill
|
|
|
|
|
|
def _has_duplicate_title(skills, title: str) -> bool:
|
|
wanted = title.lower()
|
|
for skill in _skill_dicts(skills):
|
|
existing = skill.get("title", "")
|
|
if isinstance(existing, str) and existing.lower() == wanted:
|
|
return True
|
|
return False
|
|
|
|
|
|
def _extract_json_object(text: str) -> Optional[dict]:
|
|
"""Best-effort extraction of a JSON object from an LLM response.
|
|
|
|
The response may be wrapped in code fences or surrounded by prose, and some
|
|
models emit a stray brace in the prose before the real object
|
|
(e.g. "uses {placeholder} then {...}"). Slicing first-'{' .. last-'}' then
|
|
grabs an unparseable span and the skill is silently lost. Try the whole
|
|
string first, then each '{' start position in turn, returning the first
|
|
candidate that parses to a JSON object (dict). Returns None if none do.
|
|
"""
|
|
if not text:
|
|
return None
|
|
s = text.strip()
|
|
if s.startswith("```"):
|
|
s = s.split("\n", 1)[-1].rsplit("```", 1)[0].strip()
|
|
end = s.rfind("}")
|
|
if end == -1:
|
|
return None
|
|
|
|
def _as_dict(candidate):
|
|
try:
|
|
obj = json.loads(candidate)
|
|
except (json.JSONDecodeError, ValueError):
|
|
return None
|
|
return obj if isinstance(obj, dict) else None
|
|
|
|
# The clean, common case: the whole (de-fenced) string is the object.
|
|
obj = _as_dict(s)
|
|
if obj is not None:
|
|
return obj
|
|
# Otherwise scan each '{' candidate up to the last '}'.
|
|
start = s.find("{")
|
|
while 0 <= start < end:
|
|
obj = _as_dict(s[start : end + 1])
|
|
if obj is not None:
|
|
return obj
|
|
start = s.find("{", start + 1)
|
|
return None
|
|
|
|
|
|
async def maybe_extract_skill(
|
|
session,
|
|
skills_manager,
|
|
endpoint_url: str,
|
|
model: str,
|
|
headers: dict,
|
|
round_count: int,
|
|
tool_count: int,
|
|
owner: Optional[str] = None,
|
|
):
|
|
"""Extract a skill if the agent run was complex enough."""
|
|
if not model:
|
|
logger.debug("[skill-extract] No model provided, skipping")
|
|
return None
|
|
|
|
# Quiet by default; flip to DEBUG when chasing extractor issues.
|
|
logger.debug(
|
|
"[skill-extract] start: rounds=%d tools=%d model=%s owner=%s",
|
|
round_count, tool_count, model, owner,
|
|
)
|
|
if round_count < 2 and tool_count < 2:
|
|
logger.debug("[skill-extract] BELOW threshold (need rounds>=2 or tools>=2)")
|
|
return None
|
|
|
|
try:
|
|
from src.llm_core import llm_call_async
|
|
|
|
# Get recent messages
|
|
history = session.get_context_messages()
|
|
recent = history[-CONTEXT_WINDOW:] if len(history) > CONTEXT_WINDOW else history
|
|
if not recent:
|
|
logger.debug("[skill-extract] no recent messages, skipping")
|
|
return None
|
|
|
|
# Strip media (images/audio) from messages
|
|
stripped_recent = []
|
|
for msg in recent:
|
|
content = msg.get("content", "")
|
|
if isinstance(content, list):
|
|
text_only = [b for b in content if isinstance(b, dict) and b.get("type") == "text"]
|
|
if not text_only and content:
|
|
continue
|
|
content = text_only
|
|
stripped_recent.append({"role": msg.get("role"), "content": content})
|
|
|
|
if not stripped_recent:
|
|
return None
|
|
|
|
# Build conversation summary for extraction
|
|
conv_lines = []
|
|
for msg in stripped_recent:
|
|
role = msg.get("role", "?")
|
|
content = msg.get("content", "")
|
|
if isinstance(content, list):
|
|
content = " ".join(
|
|
b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text"
|
|
)
|
|
# Truncate long messages
|
|
if len(content) > 500:
|
|
content = content[:500] + "..."
|
|
conv_lines.append(f"[{role}] {content}")
|
|
|
|
conversation = "\n".join(conv_lines)
|
|
|
|
prompt = SKILL_EXTRACT_PROMPT.format(rounds=round_count, tool_count=tool_count)
|
|
|
|
import time as _time
|
|
_t0 = _time.monotonic()
|
|
logger.debug(
|
|
"[skill-extract] calling LLM (endpoint=%s, ctx=%d msgs, timeout=30s)",
|
|
endpoint_url, len(recent),
|
|
)
|
|
response = await llm_call_async(
|
|
endpoint_url,
|
|
model,
|
|
[
|
|
{"role": "system", "content": prompt},
|
|
{"role": "user", "content": f"Conversation:\n{conversation}"},
|
|
],
|
|
headers=headers,
|
|
timeout=30,
|
|
)
|
|
logger.debug(
|
|
"[skill-extract] LLM returned in %.1fs (len=%d, head=%r)",
|
|
_time.monotonic() - _t0, len(response or ""), (response or "")[:80],
|
|
)
|
|
|
|
if not response or response.strip().lower() == "null":
|
|
logger.debug(
|
|
"[skill-extract] LLM declined (returned null/empty) — "
|
|
"session deemed not a reusable procedure"
|
|
)
|
|
return None
|
|
|
|
# Some models (MiniMax, Qwen-Thinker, DeepSeek-R1) emit their
|
|
# chain-of-thought BEFORE the JSON output even when asked for
|
|
# raw JSON. `strip_think(prose=True, prompt_echo=True)` removes
|
|
# <think>…</think> tags AND prose-style "Let me analyze this…"
|
|
# preambles. Without it, json.loads bombed on character 0 every
|
|
# time and the silent-bail looked like "extractor doesn't work".
|
|
try:
|
|
from src.text_helpers import strip_think as _strip_think
|
|
response = _strip_think(response, prose=True, prompt_echo=True)
|
|
except Exception:
|
|
pass
|
|
|
|
# Parse JSON. The object may be wrapped in code fences or surrounded by
|
|
# commentary (and may contain a stray/invalid brace fragment before
|
|
# the real object — including one that makes the response itself look
|
|
# like it starts with '{'), so use a tolerant extractor that tries the
|
|
# whole string first and then each '{' candidate left-to-right.
|
|
data = _extract_json_object(response)
|
|
if not data:
|
|
logger.debug("[skill-extract] no JSON object found in response, dropping")
|
|
return None
|
|
|
|
title = data.get("title", "").strip()
|
|
if not title:
|
|
logger.debug("[skill-extract] LLM returned object with no title, dropping")
|
|
return None
|
|
|
|
# Honour the model's own reliability/reusability estimate — low-
|
|
# confidence extractions are usually one-offs or shaky procedures.
|
|
try:
|
|
_conf = float(data.get("confidence", 0.7))
|
|
except (TypeError, ValueError):
|
|
_conf = 0.7
|
|
if _conf < MIN_CONFIDENCE:
|
|
logger.debug(
|
|
"[skill-extract] '%s' below confidence floor (%.2f < %.2f) — dropped",
|
|
title, _conf, MIN_CONFIDENCE,
|
|
)
|
|
return None
|
|
|
|
# Check for duplicate skills
|
|
existing = skills_manager.load(owner=owner)
|
|
if _has_duplicate_title(existing, title):
|
|
logger.debug("[skill-extract] '%s' already exists — dropped as duplicate", title)
|
|
return None
|
|
|
|
entry = skills_manager.add_skill(
|
|
title=title,
|
|
problem=data.get("problem", ""),
|
|
solution=data.get("solution", ""),
|
|
steps=data.get("steps", []),
|
|
tags=data.get("tags", []),
|
|
source="learned",
|
|
confidence=data.get("confidence", 0.7),
|
|
session_id=getattr(session, "session_id", None),
|
|
owner=owner,
|
|
)
|
|
try:
|
|
from src.event_bus import fire_event
|
|
fire_event("skill_added", owner)
|
|
except Exception:
|
|
logger.debug("skill_added event dispatch failed", exc_info=True)
|
|
logger.info("Auto-extracted skill: %s (id=%s)", title, entry["id"])
|
|
return entry
|
|
|
|
except json.JSONDecodeError as e:
|
|
logger.debug("[skill-extract] non-JSON LLM response, dropping: %s", e)
|
|
return None
|
|
except Exception as e:
|
|
# Real exceptions stay INFO+warning so they don't get lost when
|
|
# users only have default log level. `exc_info=True` ships the
|
|
# full traceback so timeouts vs auth vs import errors are
|
|
# distinguishable from outside.
|
|
logger.warning("[skill-extract] FAILED: %s", e, exc_info=True)
|
|
return None
|