fix(skills): tolerate a stray brace before the JSON in skill extraction (#2200)

maybe_extract_skill() sliced the LLM response from the first '{' to the last '}'. When a model emits a stray brace in prose before the real object (e.g. "uses {placeholder} then {...}"), the slice starts at the prose brace, json.loads fails, and a valid skill is silently dropped. Factor parsing into _extract_json_object(), which tries the whole (de-fenced) string first and then each '{' start position, returning the first candidate that parses to a JSON object. Adds tests/test_skill_extractor_json.py.
2026-06-15 17:25:26 -04:00 · 2026-06-07 17:54:36 +03:00
parent c5ac89f01f
commit 92ef01d4fa
2 changed files with 89 additions and 15 deletions
@@ -63,6 +63,46 @@ def _has_duplicate_title(skills, title: str) -> bool:
    return False


+def _extract_json_object(text: str) -> Optional[dict]:
+    """Best-effort extraction of a JSON object from an LLM response.
+
+    The response may be wrapped in code fences or surrounded by prose, and some
+    models emit a stray brace in the prose before the real object
+    (e.g. "uses {placeholder} then {...}"). Slicing first-'{' .. last-'}' then
+    grabs an unparseable span and the skill is silently lost. Try the whole
+    string first, then each '{' start position in turn, returning the first
+    candidate that parses to a JSON object (dict). Returns None if none do.
+    """
+    if not text:
+        return None
+    s = text.strip()
+    if s.startswith("```"):
+        s = s.split("\n", 1)[-1].rsplit("```", 1)[0].strip()
+    end = s.rfind("}")
+    if end == -1:
+        return None
+
+    def _as_dict(candidate):
+        try:
+            obj = json.loads(candidate)
+        except (json.JSONDecodeError, ValueError):
+            return None
+        return obj if isinstance(obj, dict) else None
+
+    # The clean, common case: the whole (de-fenced) string is the object.
+    obj = _as_dict(s)
+    if obj is not None:
+        return obj
+    # Otherwise scan each '{' candidate up to the last '}'.
+    start = s.find("{")
+    while 0 <= start < end:
+        obj = _as_dict(s[start : end + 1])
+        if obj is not None:
+            return obj
+        start = s.find("{", start + 1)
+    return None
+
+
 async def maybe_extract_skill(
    session,
    skills_manager,
@@ -169,21 +209,12 @@ async def maybe_extract_skill(
        except Exception:
            pass

-        # Parse JSON
-        text = response.strip()
-        if text.startswith("```"):
-            text = text.split("\n", 1)[-1].rsplit("```", 1)[0].strip()
-        # After strip_think, the JSON may still be embedded inside surrounding
-        # commentary — slice from the first '{' to the matching last '}'.
-        if text and text[0] != "{":
-            _start = text.find("{")
-            _end = text.rfind("}")
-            if 0 <= _start < _end:
-                text = text[_start : _end + 1]
-
-        data = json.loads(text)
-        if not data or not isinstance(data, dict):
-            logger.debug("[skill-extract] parsed JSON not a dict, dropping")
+        # Parse JSON. The object may be wrapped in code fences or surrounded by
+        # commentary (and may contain a stray brace before the real object), so
+        # use a tolerant extractor that tries each '{' candidate.
+        data = _extract_json_object(response)
+        if not data:
+            logger.debug("[skill-extract] no JSON object found in response, dropping")
            return None

        title = data.get("title", "").strip()
@@ -0,0 +1,43 @@
+"""Regression: skill-extraction JSON parsing must tolerate a stray brace in prose.
+
+maybe_extract_skill() sliced the LLM response from the first '{' to the last
+'}'. When a model emits a stray brace in prose before the real object
+(e.g. "uses {placeholder} then {...}"), that slice starts at the prose brace and
+json.loads fails, so a perfectly good skill is silently dropped. Extraction now
+tries each '{' start position and returns the first candidate that parses to a
+JSON object.
+"""
+from services.memory import skill_extractor
+
+
+def test_stray_brace_before_real_json_is_recovered():
+    resp = (
+        'The user mentioned {placeholder} before the actual JSON '
+        '{"title": "Restart the service", "steps": ["a", "b"]}'
+    )
+    data = skill_extractor._extract_json_object(resp)
+    assert isinstance(data, dict)
+    assert data["title"] == "Restart the service"
+
+
+def test_clean_json_object():
+    data = skill_extractor._extract_json_object('{"title": "Y", "steps": []}')
+    assert data["title"] == "Y"
+
+
+def test_code_fenced_json():
+    data = skill_extractor._extract_json_object('```json\n{"title": "Z"}\n```')
+    assert data["title"] == "Z"
+
+
+def test_no_json_object_returns_none():
+    assert skill_extractor._extract_json_object("just prose, no object here") is None
+
+
+def test_non_object_json_returns_none():
+    # A bare array is valid JSON but not a skill object.
+    assert skill_extractor._extract_json_object("[1, 2, 3]") is None
+
+
+def test_empty_input_returns_none():
+    assert skill_extractor._extract_json_object("") is None