fix(security): prevent exponential ReDoS in email→calendar extract regex (#4708)

The fallback regex in email_pollers.py that recovers a [{"action": ...}, ...] JSON array from raw model output used lazy [^[\]]*? runs inside a (?:,\s*\{...\}\s*)* repetition, which backtracks exponentially (CodeQL py/redos) on inputs like [{"action"},{ + }},{{ * N. It runs on the LLM reply to an email→calendar prompt that embeds the untrusted email body, so a crafted email can stall the background poller. Extract the pattern to a module-level _CAL_ACTION_ARRAY_RE and rewrite the object-content class from the lazy [^[\]]*? to a greedy, brace-delimited [^{}], which removes the ambiguity about where each object ends. The exponential blow-up is gone (a 500KB adversarial input of that shape now resolves in <1ms); the remaining backtracking is polynomial, since the two [^{}]* runs around "action" can still go quadratic on a long brace-free run of repeated "action" tokens. The match is equivalent on well-formed arrays of flat objects. Values containing '[' or ']' now extract (the old class bailed on those and extracted nothing); conversely, objects with a nested object or a literal '{' / '}' in a value, which the old class tolerated, no longer match. Resolves CodeQL py/redos #198.
2026-06-22 20:55:29 -04:00 · 2026-06-22 08:18:34 -07:00
parent 91b4171b3f
commit ca4973c41f
2 changed files with 61 additions and 1 deletions
@@ -44,6 +44,17 @@ from routes.email_helpers import (

 logger = logging.getLogger(__name__)

+# Recovers a `[{"action": ...}, ...]` JSON array from raw LLM output when the
+# fenced-block strip leaves nothing usable. Runs on model output influenced by
+# untrusted email bodies, so it must not backtrack exponentially: the object
+# content class is `[^{}]` (greedy; flat objects only, no nested or literal
+# braces) rather than the old lazy `[^[\]]*?` runs, which blew up on inputs
+# like `[{"action"},{` + `}},{{` * N (CodeQL py/redos #198).
+_CAL_ACTION_ARRAY_RE = re.compile(
+    r'\[\s*\{[^{}]*"action"[^{}]*\}\s*(?:,\s*\{[^{}]*\}\s*)*\]',
+    re.DOTALL,
+)
+

 def _owner_for_email_account(account_id: str | None) -> str:
    if not account_id:
@@ -558,7 +569,7 @@ async def _auto_summarize_pass_single(days_back: int = 1, account_id: str | None
                        cal_extract = _strip_think(_raw_original)
                        cal_extract = re.sub(r"^```(?:json)?\s*|\s*```$", "", cal_extract, flags=re.MULTILINE).strip()
                        if not cal_extract and _raw_original:
-                            matches = list(re.finditer(r'\[\s*\{[^[\]]*?"action"[^[\]]*?\}\s*(?:,\s*\{[^[\]]*?\}\s*)*\]', _raw_original, re.DOTALL))
+                            matches = list(_CAL_ACTION_ARRAY_RE.finditer(_raw_original))
                            if matches:
                                cal_extract = matches[-1].group()
                        logger.info(f"[cal-extract] uid={uid.decode() if isinstance(uid, bytes) else uid} folder={_folder} subj={subject[:50]!r} raw_len={len(cal_extract)} orig_len={len(_raw_original)} raw={cal_extract[:800]!r}")
@@ -0,0 +1,49 @@
+r"""Regression test for ReDoS in the calendar-extract fallback regex.
+
+CodeQL `py/redos` (#198) flagged the inline array-matcher in
+`email_pollers.py` that recovers a `[{"action": ...}, ...]` JSON array from
+raw LLM output (influenced by attacker-supplied email bodies). The original
+pattern used `[^[\]]*?` lazy runs inside a `(...)*` repetition, which
+backtracks *exponentially* on inputs like `[{"action"},{` + `}},{{` * N.
+
+The regex is now a module-level constant so it can be pinned here. These tests
+assert it (a) still extracts well-formed action arrays and (b) returns
+promptly on the adversarial input that hung the old pattern.
+"""
+
+import time
+
+from routes.email_pollers import _CAL_ACTION_ARRAY_RE
+
+
+def _matches(s):
+    return [m.group() for m in _CAL_ACTION_ARRAY_RE.finditer(s)]
+
+
+def test_extracts_action_array_from_prose():
+    s = 'Here you go:\n[{"action":"add","title":"Standup","start":"2026-07-01T09:00"}]\nThanks!'
+    assert _matches(s) == ['[{"action":"add","title":"Standup","start":"2026-07-01T09:00"}]']
+
+
+def test_extracts_multi_object_array():
+    s = 'prose [{"action":"add","title":"A"},{"action":"cancel","uid":"x"}] tail'
+    assert _matches(s) == ['[{"action":"add","title":"A"},{"action":"cancel","uid":"x"}]']
+
+
+def test_no_array_returns_no_match():
+    assert _matches("no array here at all") == []
+
+
+def test_bracket_in_string_value_still_extracts():
+    # The old `[^[\]]` class bailed on a '[' inside a value and matched nothing;
+    # the brace-delimited `[^{}]` form correctly recovers the array.
+    s = '[{"action":"add","title":"Meeting [urgent]","start":"x"}]'
+    assert _matches(s) == [s]
+
+
+def test_adversarial_input_is_fast():
+    evil = '[{"action"},{' + '}},{{' * 100_000  # exploded the old exponential pattern
+    start = time.perf_counter()
+    _CAL_ACTION_ARRAY_RE.search(evil)
+    dt = time.perf_counter() - start
+    assert dt < 1.0, f"_CAL_ACTION_ARRAY_RE took {dt:.2f}s on adversarial input"