diff --git a/services/memory/memory_extractor.py b/services/memory/memory_extractor.py
index 4ea0d6489..e5f609250 100644
--- a/services/memory/memory_extractor.py
+++ b/services/memory/memory_extractor.py
@@ -236,6 +236,43 @@ def _is_text_duplicate(new_text: str, existing: list, threshold: float = 0.6) ->
return False
+def _parse_extraction_json(raw: str) -> list:
+ """Parse the extraction LLM's reply into a list of facts, tolerating
+ reasoning-model noise.
+
+ The model emits … (and sometimes a prose preamble or a
+ ```json fence) AROUND the JSON array; without stripping it, json.loads
+ bombs and the run silently yields "0 candidates". Pure str -> list (no
+ LLM/network); returns [] on any parse failure instead of raising.
+ """
+ text = (raw or "").strip()
+ try:
+ from src.text_helpers import strip_think as _strip_think
+ text = _strip_think(text, prose=True, prompt_echo=True).strip()
+ except Exception:
+ pass
+ if text.startswith("```"):
+ text = text.split("\n", 1)[-1].rsplit("```", 1)[0].strip()
+ # JSON may still be embedded in surrounding commentary (leading prose or
+ # trailing remarks like "[...] Done!") — slice from the first '[' to the
+ # last ']' whenever both exist. Slice unconditionally: a reply that starts
+ # with '[' can still carry trailing commentary that breaks json.loads.
+ _start = text.find("[")
+ _end = text.rfind("]")
+ if 0 <= _start < _end:
+ text = text[_start : _end + 1]
+
+ try:
+ facts = json.loads(text)
+ except json.JSONDecodeError:
+ logger.debug("Memory extraction returned non-JSON: %r", (raw or "")[:120])
+ return []
+ except Exception:
+ logger.debug("Memory extraction returned non-JSON: %r", (raw or "")[:120])
+ return []
+ return facts if isinstance(facts, list) else []
+
+
async def extract_and_store(
session,
memory_manager,
@@ -284,9 +321,34 @@ async def extract_and_store(
fallback_facts = _fallback_memory_candidates(stripped_recent)
+ # Flatten the window into a SINGLE user message instead of appending the
+ # raw alternating role messages. Passed as raw chat messages, the model
+ # treats the window as a conversation to CONTINUE rather than a transcript
+ # to ANALYZE, so it reliably extracts nothing — typically returning `[]`
+ # (and, depending on the input, sometimes an empty or -only
+ # completion when the window ends on an assistant turn). This was the real
+ # cause of auto-memory logging "0 candidates" on every run. Reframing it as
+ # one "analyze this transcript, return the JSON array" user message makes
+ # the model actually extract. Controlled repro on this model: 0/6 trials
+ # with the old structure vs 6/6 with this one. The skill extractor flattens
+ # for the same reason.
+ def _flatten_msg(m):
+ c = m.get("content", "")
+ if isinstance(c, list):
+ c = " ".join(
+ b.get("text", "") for b in c
+ if isinstance(b, dict) and b.get("type") == "text"
+ )
+ return f"{m.get('role', '?')}: {c}"
+
+ transcript = "\n\n".join(_flatten_msg(m) for m in stripped_recent)
extraction_messages = [
{"role": "system", "content": EXTRACT_SYSTEM_PROMPT},
- ] + stripped_recent
+ {"role": "user", "content": (
+ "Conversation to analyze:\n\n" + transcript
+ + "\n\nReturn the JSON array of durable facts now (or [] if none)."
+ )},
+ ]
facts = []
try:
@@ -295,19 +357,20 @@ async def extract_and_store(
model,
extraction_messages,
temperature=0.1,
- max_tokens=500,
+ # A reasoning model spends most of its budget on tokens
+ # BEFORE emitting the JSON, so the old 500 truncated the response
+ # before any JSON appeared → every run logged "0 candidates". The
+ # audit path hit the same wall and raised to 16384; extraction's
+ # output (a short facts list) is small, so an ample ceiling is
+ # enough once thinking has room.
+ max_tokens=4096,
headers=headers,
)
- # Parse JSON from response (handle markdown fences if model wraps them)
- text = raw.strip()
- if text.startswith("```"):
- text = text.split("\n", 1)[-1].rsplit("```", 1)[0].strip()
-
- try:
- facts = json.loads(text)
- except json.JSONDecodeError:
- logger.debug("Memory extraction returned non-JSON")
+ # Parse JSON, tolerating reasoning-model noise ( blocks, a
+ # ```json fence, and leading/trailing commentary). See
+ # _parse_extraction_json — returns [] rather than raising.
+ facts = _parse_extraction_json(raw)
except Exception as e:
logger.warning(f"LLM memory extraction failed; using fallback candidates if available: {e}")
diff --git a/tests/test_memory_extraction_parse.py b/tests/test_memory_extraction_parse.py
new file mode 100644
index 000000000..20d383cc6
--- /dev/null
+++ b/tests/test_memory_extraction_parse.py
@@ -0,0 +1,36 @@
+"""_parse_extraction_json must survive reasoning-model noise.
+
+The extraction model wraps its JSON array in blocks, ```json fences,
+or leading/trailing prose. The helper strips that noise and slices the array
+unconditionally — a reply that starts with '[' can still carry trailing
+commentary like "[...] Done!" that would otherwise break json.loads.
+"""
+
+from services.memory.memory_extractor import _parse_extraction_json
+
+
+def test_think_prefixed_array_parses_to_one_fact():
+ raw = 'reasoning...\n[{"text": "x", "category": "fact"}]'
+ assert _parse_extraction_json(raw) == [{"text": "x", "category": "fact"}]
+
+
+def test_fenced_json_block_parses():
+ raw = '```json\n[{"text": "x", "category": "fact"}]\n```'
+ assert _parse_extraction_json(raw) == [{"text": "x", "category": "fact"}]
+
+
+def test_leading_prose_before_array_parses():
+ raw = 'Here are the durable facts:\n[{"text": "x", "category": "fact"}]'
+ assert _parse_extraction_json(raw) == [{"text": "x", "category": "fact"}]
+
+
+def test_trailing_commentary_after_array_parses():
+ # Exercises the unconditional slice: text starts with '[' but has trailing
+ # commentary that the old `text[0] != "["` guard skipped, breaking json.loads.
+ raw = '[{"text": "x", "category": "fact"}] Done!'
+ assert _parse_extraction_json(raw) == [{"text": "x", "category": "fact"}]
+
+
+def test_malformed_no_array_returns_empty():
+ assert _parse_extraction_json("no array here, sorry") == []
+ assert _parse_extraction_json("") == []