mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
fix(skills): tolerate a stray brace before the JSON in skill extraction (#2200)
maybe_extract_skill() sliced the LLM response from the first '{' to the
last '}'. When a model emits a stray brace in prose before the real
object (e.g. "uses {placeholder} then {...}"), the slice starts at the
prose brace, json.loads fails, and a valid skill is silently dropped.
Factor parsing into _extract_json_object(), which tries the whole
(de-fenced) string first and then each '{' start position, returning the
first candidate that parses to a JSON object.
Adds tests/test_skill_extractor_json.py.
This commit is contained in:
committed by
GitHub
parent
c5ac89f01f
commit
92ef01d4fa
@@ -63,6 +63,46 @@ def _has_duplicate_title(skills, title: str) -> bool:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_json_object(text: str) -> Optional[dict]:
|
||||||
|
"""Best-effort extraction of a JSON object from an LLM response.
|
||||||
|
|
||||||
|
The response may be wrapped in code fences or surrounded by prose, and some
|
||||||
|
models emit a stray brace in the prose before the real object
|
||||||
|
(e.g. "uses {placeholder} then {...}"). Slicing first-'{' .. last-'}' then
|
||||||
|
grabs an unparseable span and the skill is silently lost. Try the whole
|
||||||
|
string first, then each '{' start position in turn, returning the first
|
||||||
|
candidate that parses to a JSON object (dict). Returns None if none do.
|
||||||
|
"""
|
||||||
|
if not text:
|
||||||
|
return None
|
||||||
|
s = text.strip()
|
||||||
|
if s.startswith("```"):
|
||||||
|
s = s.split("\n", 1)[-1].rsplit("```", 1)[0].strip()
|
||||||
|
end = s.rfind("}")
|
||||||
|
if end == -1:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _as_dict(candidate):
|
||||||
|
try:
|
||||||
|
obj = json.loads(candidate)
|
||||||
|
except (json.JSONDecodeError, ValueError):
|
||||||
|
return None
|
||||||
|
return obj if isinstance(obj, dict) else None
|
||||||
|
|
||||||
|
# The clean, common case: the whole (de-fenced) string is the object.
|
||||||
|
obj = _as_dict(s)
|
||||||
|
if obj is not None:
|
||||||
|
return obj
|
||||||
|
# Otherwise scan each '{' candidate up to the last '}'.
|
||||||
|
start = s.find("{")
|
||||||
|
while 0 <= start < end:
|
||||||
|
obj = _as_dict(s[start : end + 1])
|
||||||
|
if obj is not None:
|
||||||
|
return obj
|
||||||
|
start = s.find("{", start + 1)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
async def maybe_extract_skill(
|
async def maybe_extract_skill(
|
||||||
session,
|
session,
|
||||||
skills_manager,
|
skills_manager,
|
||||||
@@ -169,21 +209,12 @@ async def maybe_extract_skill(
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Parse JSON
|
# Parse JSON. The object may be wrapped in code fences or surrounded by
|
||||||
text = response.strip()
|
# commentary (and may contain a stray brace before the real object), so
|
||||||
if text.startswith("```"):
|
# use a tolerant extractor that tries each '{' candidate.
|
||||||
text = text.split("\n", 1)[-1].rsplit("```", 1)[0].strip()
|
data = _extract_json_object(response)
|
||||||
# After strip_think, the JSON may still be embedded inside surrounding
|
if not data:
|
||||||
# commentary — slice from the first '{' to the matching last '}'.
|
logger.debug("[skill-extract] no JSON object found in response, dropping")
|
||||||
if text and text[0] != "{":
|
|
||||||
_start = text.find("{")
|
|
||||||
_end = text.rfind("}")
|
|
||||||
if 0 <= _start < _end:
|
|
||||||
text = text[_start : _end + 1]
|
|
||||||
|
|
||||||
data = json.loads(text)
|
|
||||||
if not data or not isinstance(data, dict):
|
|
||||||
logger.debug("[skill-extract] parsed JSON not a dict, dropping")
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
title = data.get("title", "").strip()
|
title = data.get("title", "").strip()
|
||||||
|
|||||||
@@ -0,0 +1,43 @@
|
|||||||
|
"""Regression: skill-extraction JSON parsing must tolerate a stray brace in prose.
|
||||||
|
|
||||||
|
maybe_extract_skill() sliced the LLM response from the first '{' to the last
|
||||||
|
'}'. When a model emits a stray brace in prose before the real object
|
||||||
|
(e.g. "uses {placeholder} then {...}"), that slice starts at the prose brace and
|
||||||
|
json.loads fails, so a perfectly good skill is silently dropped. Extraction now
|
||||||
|
tries each '{' start position and returns the first candidate that parses to a
|
||||||
|
JSON object.
|
||||||
|
"""
|
||||||
|
from services.memory import skill_extractor
|
||||||
|
|
||||||
|
|
||||||
|
def test_stray_brace_before_real_json_is_recovered():
    """A placeholder brace in the leading prose must not hide the real object."""
    prose = 'The user mentioned {placeholder} before the actual JSON '
    payload = '{"title": "Restart the service", "steps": ["a", "b"]}'
    parsed = skill_extractor._extract_json_object(prose + payload)
    assert isinstance(parsed, dict)
    assert parsed["title"] == "Restart the service"
|
||||||
|
|
||||||
|
|
||||||
|
def test_clean_json_object():
    """A response that is exactly one JSON object parses directly."""
    raw = '{"title": "Y", "steps": []}'
    parsed = skill_extractor._extract_json_object(raw)
    assert parsed["title"] == "Y"
|
||||||
|
|
||||||
|
|
||||||
|
def test_code_fenced_json():
    """A fenced ```json block is unwrapped before parsing."""
    fenced = '```json\n{"title": "Z"}\n```'
    parsed = skill_extractor._extract_json_object(fenced)
    assert parsed["title"] == "Z"
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_json_object_returns_none():
    """Prose without any braces yields None."""
    outcome = skill_extractor._extract_json_object("just prose, no object here")
    assert outcome is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_non_object_json_returns_none():
    """Valid JSON that is not an object (here a bare array) is rejected."""
    outcome = skill_extractor._extract_json_object("[1, 2, 3]")
    assert outcome is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_input_returns_none():
    """An empty response yields None."""
    outcome = skill_extractor._extract_json_object("")
    assert outcome is None
|
||||||
Reference in New Issue
Block a user