fix(memory): record dislikes as dislikes, not preferences (#2435)

_fallback_memory_candidates matched both positive (prefer/like/love) and negative (hate / do not like / don't like) sentiment verbs in one regex alternation, then formatted every hit as "User prefers {X}.". So "I hate cilantro" was stored as "User prefers cilantro." -- the inverse of what the user said. These fallback facts are persisted to memory and later re-injected into the model's context, so the inverted preference actively misleads the assistant.

Capture the matched verb and branch on it: negatives become "User dislikes {X}.", positives stay "User prefers {X}." (still filed under the existing "preference" category).

Supported by Claude Opus 4.8
Co-authored-by: SurprisedDuck <288741682+SurprisedDuck@users.noreply.github.com>
2026-06-17 18:25:26 -04:00 · 2026-06-07 16:36:07 +02:00
parent 3c924b8dee
commit c75d3e1975
2 changed files with 42 additions and 3 deletions
@@ -192,11 +192,19 @@ def _fallback_memory_candidates(messages) -> list[dict]:
            if place:
                add(f"User lives in {place}.", "identity")

-        m = re.search(r"\bi (?:prefer|like|love|hate|do not like|don't like)\s+([^.!?\n]{4,100})", text, re.I)
+        m = re.search(r"\bi (prefer|like|love|hate|do not like|don't like)\s+([^.!?\n]{4,100})", text, re.I)
        if m:
-            preference = _clean_memory_value(m.group(1), 100)
+            preference = _clean_memory_value(m.group(2), 100)
            if preference:
-                add(f"User prefers {preference}.", "preference")
+                # The same pattern catches likes and dislikes; keep the stored
+                # sentiment faithful instead of recording every match as a
+                # preference ("I hate cilantro" must not become "User prefers
+                # cilantro").
+                verb = m.group(1).lower()
+                if verb in ("hate", "do not like", "don't like"):
+                    add(f"User dislikes {preference}.", "preference")
+                else:
+                    add(f"User prefers {preference}.", "preference")

        m = re.search(
            r"\bi (?:(?:want|would like|plan|hope) to|wanna) "
@@ -0,0 +1,31 @@
+"""The fallback memory extractor must not invert dislikes into preferences.
+
+_fallback_memory_candidates matched both positive (prefer/like/love) and
+negative (hate/do not like/don't like) sentiment verbs in one alternation but
+formatted every hit as "User prefers X.", so "I hate cilantro" was stored as
+"User prefers cilantro" -- the opposite of what the user said, then persisted
+to memory and re-injected into context. These tests pin the stored sentiment.
+"""
+from services.memory.memory_extractor import _fallback_memory_candidates
+
+
+def _texts(content):
+    cands = _fallback_memory_candidates([{"role": "user", "content": content}])
+    return [c["text"].lower() for c in cands]
+
+
+def test_dislike_is_not_stored_as_preference():
+    texts = _texts("I hate cilantro in my food")
+    assert not any("prefers cilantro" in t for t in texts)
+    assert any("dislikes cilantro" in t for t in texts)
+
+
+def test_negated_like_is_not_stored_as_preference():
+    texts = _texts("I don't like crowded trains")
+    assert not any("prefers crowded" in t for t in texts)
+    assert any("dislikes crowded" in t for t in texts)
+
+
+def test_genuine_preference_still_stored():
+    texts = _texts("I love spicy ramen noodles")
+    assert any("prefers spicy ramen" in t for t in texts)