diff --git a/services/memory/memory_extractor.py b/services/memory/memory_extractor.py index 4ea0d6489..e5f609250 100644 --- a/services/memory/memory_extractor.py +++ b/services/memory/memory_extractor.py @@ -236,6 +236,43 @@ def _is_text_duplicate(new_text: str, existing: list, threshold: float = 0.6) -> return False +def _parse_extraction_json(raw: str) -> list: + """Parse the extraction LLM's reply into a list of facts, tolerating + reasoning-model noise. + + The model emits (and sometimes a prose preamble or a + ```json fence) AROUND the JSON array; without stripping it, json.loads + bombs and the run silently yields "0 candidates". Pure str -> list (no + LLM/network); returns [] on any parse failure instead of raising. + """ + text = (raw or "").strip() + try: + from src.text_helpers import strip_think as _strip_think + text = _strip_think(text, prose=True, prompt_echo=True).strip() + except Exception: + pass + if text.startswith("```"): + text = text.split("\n", 1)[-1].rsplit("```", 1)[0].strip() + # JSON may still be embedded in surrounding commentary (leading prose or + # trailing remarks like "[...] Done!") — slice from the first '[' to the + # last ']' whenever both exist. Slice unconditionally: a reply that starts + # with '[' can still carry trailing commentary that breaks json.loads. + _start = text.find("[") + _end = text.rfind("]") + if 0 <= _start < _end: + text = text[_start : _end + 1] + + try: + facts = json.loads(text) + except json.JSONDecodeError: + logger.debug("Memory extraction returned non-JSON: %r", (raw or "")[:120]) + return [] + except Exception: + logger.debug("Memory extraction returned non-JSON: %r", (raw or "")[:120]) + return [] + return facts if isinstance(facts, list) else [] + + async def extract_and_store( session, memory_manager, @@ -284,9 +321,34 @@ async def extract_and_store( fallback_facts = _fallback_memory_candidates(stripped_recent) + # Flatten the window into a SINGLE user message instead of appending the + # raw alternating role messages. Passed as raw chat messages, the model + # treats the window as a conversation to CONTINUE rather than a transcript + # to ANALYZE, so it reliably extracts nothing — typically returning `[]` + # (and, depending on the input, sometimes an empty or -only + # completion when the window ends on an assistant turn). This was the real + # cause of auto-memory logging "0 candidates" on every run. Reframing it as + # one "analyze this transcript, return the JSON array" user message makes + # the model actually extract. Controlled repro on this model: 0/6 trials + # with the old structure vs 6/6 with this one. The skill extractor flattens + # for the same reason. + def _flatten_msg(m): + c = m.get("content", "") + if isinstance(c, list): + c = " ".join( + b.get("text", "") for b in c + if isinstance(b, dict) and b.get("type") == "text" + ) + return f"{m.get('role', '?')}: {c}" + + transcript = "\n\n".join(_flatten_msg(m) for m in stripped_recent) extraction_messages = [ {"role": "system", "content": EXTRACT_SYSTEM_PROMPT}, - ] + stripped_recent + {"role": "user", "content": ( + "Conversation to analyze:\n\n" + transcript + + "\n\nReturn the JSON array of durable facts now (or [] if none)." + )}, + ] facts = [] try: @@ -295,19 +357,20 @@ async def extract_and_store( model, extraction_messages, temperature=0.1, - max_tokens=500, + # A reasoning model spends most of its budget on tokens + # BEFORE emitting the JSON, so the old 500 truncated the response + # before any JSON appeared → every run logged "0 candidates". The + # audit path hit the same wall and raised to 16384; extraction's + # output (a short facts list) is small, so an ample ceiling is + # enough once thinking has room. + max_tokens=4096, headers=headers, ) - # Parse JSON from response (handle markdown fences if model wraps them) - text = raw.strip() - if text.startswith("```"): - text = text.split("\n", 1)[-1].rsplit("```", 1)[0].strip() - - try: - facts = json.loads(text) - except json.JSONDecodeError: - logger.debug("Memory extraction returned non-JSON") + # Parse JSON, tolerating reasoning-model noise ( blocks, a + # ```json fence, and leading/trailing commentary). See + # _parse_extraction_json — returns [] rather than raising. + facts = _parse_extraction_json(raw) except Exception as e: logger.warning(f"LLM memory extraction failed; using fallback candidates if available: {e}") diff --git a/tests/test_memory_extraction_parse.py b/tests/test_memory_extraction_parse.py new file mode 100644 index 000000000..20d383cc6 --- /dev/null +++ b/tests/test_memory_extraction_parse.py @@ -0,0 +1,36 @@ +"""_parse_extraction_json must survive reasoning-model noise. + +The extraction model wraps its JSON array in blocks, ```json fences, +or leading/trailing prose. The helper strips that noise and slices the array +unconditionally — a reply that starts with '[' can still carry trailing +commentary like "[...] Done!" that would otherwise break json.loads. +""" + +from services.memory.memory_extractor import _parse_extraction_json + + +def test_think_prefixed_array_parses_to_one_fact(): + raw = 'reasoning...\n[{"text": "x", "category": "fact"}]' + assert _parse_extraction_json(raw) == [{"text": "x", "category": "fact"}] + + +def test_fenced_json_block_parses(): + raw = '```json\n[{"text": "x", "category": "fact"}]\n```' + assert _parse_extraction_json(raw) == [{"text": "x", "category": "fact"}] + + +def test_leading_prose_before_array_parses(): + raw = 'Here are the durable facts:\n[{"text": "x", "category": "fact"}]' + assert _parse_extraction_json(raw) == [{"text": "x", "category": "fact"}] + + +def test_trailing_commentary_after_array_parses(): + # Exercises the unconditional slice: text starts with '[' but has trailing + # commentary that the old `text[0] != "["` guard skipped, breaking json.loads. + raw = '[{"text": "x", "category": "fact"}] Done!' + assert _parse_extraction_json(raw) == [{"text": "x", "category": "fact"}] + + +def test_malformed_no_array_returns_empty(): + assert _parse_extraction_json("no array here, sorry") == [] + assert _parse_extraction_json("") == []