fix(skill-extractor): walk all brace candidates so stray braces in prose do not swallow valid JSON (#2205)

* fix(skill-extractor): walk all brace candidates so stray braces in prose do not swallow valid JSON

The extractor sliced from the FIRST brace to the LAST brace to recover
JSON embedded in surrounding commentary. When the model emits stray
braces before the JSON object, the slice produces invalid JSON,
json.loads raises, and the exception is swallowed -- the skill is
silently lost.

Fix: walk each brace candidate left-to-right and attempt json.loads on
each slice. The first candidate that parses successfully wins. If none
parse, json.loads on the original text raises and the existing
JSONDecodeError handler logs and returns None as before.

Tested locally -- 8/8 tests passed:
  tests/test_extract_skill_json_nonstring.py (2 passed)
  tests/test_skill_extractor_rows.py (1 passed)
  tests/test_search_content_extraction_parity.py (2 passed)
  tests/test_deep_research_search_error.py (3 passed)

Closes #2199

* test(skill-extractor): add focused repro for stray-brace JSON recovery

* test(skill-extractor): add regression test for leading invalid-brace fragment

Addresses the remaining edge case from review: a response that *starts*
with a brace but the leading fragment isn't valid JSON (e.g.
'{not json}\n{"title": "Valid later", ...}') still needs to recover
the valid skill object that follows.

_extract_json_object (already on dev) handles this correctly — it tries
the whole de-fenced string first, then walks each '{' candidate left-to-
right regardless of whether the response begins with '{', so the leading
invalid fragment no longer short-circuits recovery of the real object.
Updates the comment at the call site to call this out explicitly and adds
a regression test covering exactly the scenario described in review.
This commit is contained in:
Lucas Daniel
2026-06-07 19:31:12 -03:00
committed by GitHub
parent 7b68413433
commit 73315e6ddc
2 changed files with 121 additions and 2 deletions
+4 -2
View File
@@ -210,8 +210,10 @@ async def maybe_extract_skill(
pass
# Parse JSON. The object may be wrapped in code fences or surrounded by
# commentary (and may contain a stray brace before the real object), so
# use a tolerant extractor that tries each '{' candidate.
# commentary (and may contain a stray/invalid brace fragment before
# the real object — including one that makes the response itself look
# like it starts with '{'), so use a tolerant extractor that tries the
# whole string first and then each '{' candidate left-to-right.
data = _extract_json_object(response)
if not data:
logger.debug("[skill-extract] no JSON object found in response, dropping")
+117
View File
@@ -0,0 +1,117 @@
import pytest
from services.memory import skill_extractor
class _FakeSession:
session_id = "s1"
def get_context_messages(self):
return [
{"role": "user", "content": "Walk me through deploying the service"},
{"role": "assistant", "content": "Sure, here's the runbook..."},
]
class _FakeSkillsManager:
def __init__(self):
self.added = []
def load(self, owner=None):
return []
def add_skill(self, **kwargs):
self.added.append(kwargs)
return {"id": "skill-1", **kwargs}
# Stray '{' in prose ("uses {a} then ...") before the real JSON object —
# the bug this fix addresses: slicing from the FIRST '{' to the LAST '}'
# produced invalid JSON and the whole extraction was silently dropped.
_STRAY_BRACE_RESPONSE = (
'Sure thing — note this uses {a} as a placeholder, then the actual skill is:\n'
'{"title": "Deploy runbook", "problem": "manual deploys are error-prone", '
'"solution": "use the deploy script", "steps": ["build", "push", "restart"], '
'"tags": ["deploy"], "confidence": 0.9}'
)
@pytest.mark.parametrize("response", [_STRAY_BRACE_RESPONSE])
async def test_maybe_extract_skill_recovers_json_past_stray_braces(monkeypatch, response):
async def fake_llm_call_async(*args, **kwargs):
return response
monkeypatch.setattr("src.llm_core.llm_call_async", fake_llm_call_async)
skills_manager = _FakeSkillsManager()
entry = await skill_extractor.maybe_extract_skill(
_FakeSession(),
skills_manager,
endpoint_url="http://endpoint",
model="test-model",
headers={},
round_count=3,
tool_count=3,
owner="alice",
)
assert entry is not None
assert entry["title"] == "Deploy runbook"
assert skills_manager.added and skills_manager.added[0]["title"] == "Deploy runbook"
# Response *starts* with a brace, but it's an invalid fragment — the valid
# skill JSON only appears on a later line. `json.loads(text)` fails on the
# first attempt even though `text[0] == "{"`, so the candidate walk must run
# regardless of whether the response starts with '{'.
_LEADING_INVALID_BRACE_RESPONSE = (
'{not json}\n'
'{"title": "Valid later", "problem": "p", "solution": "s", '
'"steps": ["one", "two", "three"], "tags": ["test"], "confidence": 0.9}'
)
@pytest.mark.parametrize("response", [_LEADING_INVALID_BRACE_RESPONSE])
async def test_maybe_extract_skill_recovers_json_after_leading_invalid_brace(monkeypatch, response):
async def fake_llm_call_async(*args, **kwargs):
return response
monkeypatch.setattr("src.llm_core.llm_call_async", fake_llm_call_async)
skills_manager = _FakeSkillsManager()
entry = await skill_extractor.maybe_extract_skill(
_FakeSession(),
skills_manager,
endpoint_url="http://endpoint",
model="test-model",
headers={},
round_count=3,
tool_count=3,
owner="alice",
)
assert entry is not None
assert entry["title"] == "Valid later"
assert skills_manager.added and skills_manager.added[0]["title"] == "Valid later"
async def test_maybe_extract_skill_drops_when_no_candidate_parses(monkeypatch):
async def fake_llm_call_async(*args, **kwargs):
return 'Some commentary with {unbalanced and { nested } braces } but no real JSON object'
monkeypatch.setattr("src.llm_core.llm_call_async", fake_llm_call_async)
skills_manager = _FakeSkillsManager()
entry = await skill_extractor.maybe_extract_skill(
_FakeSession(),
skills_manager,
endpoint_url="http://endpoint",
model="test-model",
headers={},
round_count=3,
tool_count=3,
owner="alice",
)
assert entry is None
assert not skills_manager.added