diff --git a/services/memory/skill_extractor.py b/services/memory/skill_extractor.py index c11133921..520ea5748 100644 --- a/services/memory/skill_extractor.py +++ b/services/memory/skill_extractor.py @@ -63,6 +63,46 @@ def _has_duplicate_title(skills, title: str) -> bool: return False +def _extract_json_object(text: str) -> Optional[dict]: + """Best-effort extraction of a JSON object from an LLM response. + + The response may be wrapped in code fences or surrounded by prose, and some + models emit a stray brace in the prose before the real object + (e.g. "uses {placeholder} then {...}"). Slicing first-'{' .. last-'}' then + grabs an unparseable span and the skill is silently lost. Try the whole + string first, then each '{' start position in turn, returning the first + candidate that parses to a JSON object (dict). Returns None if none do. + """ + if not text: + return None + s = text.strip() + if s.startswith("```"): + s = s.split("\n", 1)[-1].rsplit("```", 1)[0].strip() + end = s.rfind("}") + if end == -1: + return None + + def _as_dict(candidate): + try: + obj = json.loads(candidate) + except (json.JSONDecodeError, ValueError): + return None + return obj if isinstance(obj, dict) else None + + # The clean, common case: the whole (de-fenced) string is the object. + obj = _as_dict(s) + if obj is not None: + return obj + # Otherwise scan each '{' candidate up to the last '}'. + start = s.find("{") + while 0 <= start < end: + obj = _as_dict(s[start : end + 1]) + if obj is not None: + return obj + start = s.find("{", start + 1) + return None + + async def maybe_extract_skill( session, skills_manager, @@ -169,21 +209,12 @@ async def maybe_extract_skill( except Exception: pass - # Parse JSON - text = response.strip() - if text.startswith("```"): - text = text.split("\n", 1)[-1].rsplit("```", 1)[0].strip() - # After strip_think, the JSON may still be embedded inside surrounding - # commentary — slice from the first '{' to the matching last '}'. - if text and text[0] != "{": - _start = text.find("{") - _end = text.rfind("}") - if 0 <= _start < _end: - text = text[_start : _end + 1] - - data = json.loads(text) - if not data or not isinstance(data, dict): - logger.debug("[skill-extract] parsed JSON not a dict, dropping") + # Parse JSON. The object may be wrapped in code fences or surrounded by + # commentary (and may contain a stray brace before the real object), so + # use a tolerant extractor that tries each '{' candidate. + data = _extract_json_object(response) + if not data: + logger.debug("[skill-extract] no JSON object found in response, dropping") return None title = data.get("title", "").strip() diff --git a/tests/test_skill_extractor_json.py b/tests/test_skill_extractor_json.py new file mode 100644 index 000000000..54460103e --- /dev/null +++ b/tests/test_skill_extractor_json.py @@ -0,0 +1,43 @@ +"""Regression: skill-extraction JSON parsing must tolerate a stray brace in prose. + +maybe_extract_skill() sliced the LLM response from the first '{' to the last +'}'. When a model emits a stray brace in prose before the real object +(e.g. "uses {placeholder} then {...}"), that slice starts at the prose brace and +json.loads fails, so a perfectly good skill is silently dropped. Extraction now +tries each '{' start position and returns the first candidate that parses to a +JSON object. +""" +from services.memory import skill_extractor + + +def test_stray_brace_before_real_json_is_recovered(): + resp = ( + 'The user mentioned {placeholder} before the actual JSON ' + '{"title": "Restart the service", "steps": ["a", "b"]}' + ) + data = skill_extractor._extract_json_object(resp) + assert isinstance(data, dict) + assert data["title"] == "Restart the service" + + +def test_clean_json_object(): + data = skill_extractor._extract_json_object('{"title": "Y", "steps": []}') + assert data["title"] == "Y" + + +def test_code_fenced_json(): + data = skill_extractor._extract_json_object('```json\n{"title": "Z"}\n```') + assert data["title"] == "Z" + + +def test_no_json_object_returns_none(): + assert skill_extractor._extract_json_object("just prose, no object here") is None + + +def test_non_object_json_returns_none(): + # A bare array is valid JSON but not a skill object. + assert skill_extractor._extract_json_object("[1, 2, 3]") is None + + +def test_empty_input_returns_none(): + assert skill_extractor._extract_json_object("") is None