diff --git a/routes/email_pollers.py b/routes/email_pollers.py
index 146db0ed7..7fca02c72 100644
--- a/routes/email_pollers.py
+++ b/routes/email_pollers.py
@@ -44,6 +44,21 @@ from routes.email_helpers import (
 
 logger = logging.getLogger(__name__)
 
+# Recovers a `[{"action": ...}, ...]` JSON array from raw LLM output when the
+# fenced-block strip leaves nothing usable. Runs on model output influenced by
+# untrusted email bodies, so matching must stay linear-time:
+#   * object bodies use the brace-delimited class `[^{}]` rather than the old
+#     `[^[\]]*?` lazy runs, which exploded exponentially on inputs like
+#     `[{"action"},{` + `}},{{` * N (CodeQL py/redos #198);
+#   * the `"action"` requirement is a lookahead, so each object body is one
+#     `[^{}]*` run. Matching the key inline (`[^{}]*"action"[^{}]*`) lets the
+#     engine retry every `"action"` occurrence and goes quadratic on
+#     `[{` + `"action"` * N + `{`.
+_CAL_ACTION_ARRAY_RE = re.compile(
+    r'\[\s*\{(?=[^{}]*"action")[^{}]*\}\s*(?:,\s*\{[^{}]*\}\s*)*\]',
+    re.DOTALL,
+)
+
 
 def _owner_for_email_account(account_id: str | None) -> str:
     if not account_id:
@@ -558,7 +573,7 @@ async def _auto_summarize_pass_single(days_back: int = 1, account_id: str | None
                 cal_extract = _strip_think(_raw_original)
                 cal_extract = re.sub(r"^```(?:json)?\s*|\s*```$", "", cal_extract, flags=re.MULTILINE).strip()
                 if not cal_extract and _raw_original:
-                    matches = list(re.finditer(r'\[\s*\{[^[\]]*?"action"[^[\]]*?\}\s*(?:,\s*\{[^[\]]*?\}\s*)*\]', _raw_original, re.DOTALL))
+                    matches = list(_CAL_ACTION_ARRAY_RE.finditer(_raw_original))
                     if matches:
                         cal_extract = matches[-1].group()
                 logger.info(f"[cal-extract] uid={uid.decode() if isinstance(uid, bytes) else uid} folder={_folder} subj={subject[:50]!r} raw_len={len(cal_extract)} orig_len={len(_raw_original)} raw={cal_extract[:800]!r}")
diff --git a/tests/test_redos_cal_extract.py b/tests/test_redos_cal_extract.py
new file mode 100644
index 000000000..d3ea8b988
--- /dev/null
+++ b/tests/test_redos_cal_extract.py
@@ -0,0 +1,75 @@
+r"""Regression test for ReDoS in the calendar-extract fallback regex.
+
+CodeQL `py/redos` (#198) flagged the inline array-matcher in
+`email_pollers.py` that recovers a `[{"action": ...}, ...]` JSON array from
+raw LLM output (influenced by attacker-supplied email bodies). The original
+pattern used `[^[\]]*?` lazy runs inside a `(...)*` repetition, which
+backtracks *exponentially* on inputs like `[{"action"},{` + `}},{{` * N.
+
+The regex is now a module-level constant so it can be pinned here. These tests
+assert it (a) still extracts well-formed action arrays, (b) returns promptly
+on the adversarial input that hung the old pattern, and (c) stays linear when
+`"action"` repeats inside one object, which goes quadratic if the key is
+matched inline between two `[^{}]*` runs instead of by lookahead.
+"""
+
+import time
+
+from routes.email_pollers import _CAL_ACTION_ARRAY_RE
+
+
+def _matches(s):
+    return [m.group() for m in _CAL_ACTION_ARRAY_RE.finditer(s)]
+
+
+def _search_seconds(s):
+    start = time.perf_counter()
+    _CAL_ACTION_ARRAY_RE.search(s)
+    return time.perf_counter() - start
+
+
+def test_extracts_action_array_from_prose():
+    s = 'Here you go:\n[{"action":"add","title":"Standup","start":"2026-07-01T09:00"}]\nThanks!'
+    assert _matches(s) == ['[{"action":"add","title":"Standup","start":"2026-07-01T09:00"}]']
+
+
+def test_extracts_multi_object_array():
+    s = 'prose [{"action":"add","title":"A"},{"action":"cancel","uid":"x"}] tail'
+    assert _matches(s) == ['[{"action":"add","title":"A"},{"action":"cancel","uid":"x"}]']
+
+
+def test_no_array_returns_no_match():
+    assert _matches("no array here at all") == []
+
+
+def test_first_object_without_action_key_is_not_matched():
+    assert _matches('[{"title":"x"},{"action":"add"}]') == []
+
+
+def test_bracket_in_string_value_still_extracts():
+    # The old `[^[\]]` class bailed on a '[' inside a value and matched nothing;
+    # the linear `[^{}]` form correctly recovers the array.
+    s = '[{"action":"add","title":"Meeting [urgent]","start":"x"}]'
+    assert _matches(s) == [s]
+
+
+def test_adversarial_input_is_fast():
+    evil = '[{"action"},{' + '}},{{' * 100_000  # exploded the old exponential pattern
+    dt = _search_seconds(evil)
+    assert dt < 1.0, f"_CAL_ACTION_ARRAY_RE took {dt:.2f}s on adversarial input"
+
+
+def test_repeated_action_key_is_fast():
+    # Unterminated object stuffed with `"action"`: an inline
+    # `[^{}]*"action"[^{}]*` retries every occurrence, i.e. O(N^2).
+    evil = '[{' + '"action"' * 50_000 + '{'
+    dt = _search_seconds(evil)
+    assert dt < 1.0, f"_CAL_ACTION_ARRAY_RE took {dt:.2f}s on repeated-key input"
+
+
+def test_unclosed_tail_after_repeated_action_key_is_fast():
+    # Never-closed array: the object tail must not be rescanned once per
+    # `"action"` occurrence in the first object.
+    evil = '[{' + '"action"' * 5_000 + '}' + ',{}' * 50_000
+    dt = _search_seconds(evil)
+    assert dt < 1.0, f"_CAL_ACTION_ARRAY_RE took {dt:.2f}s on unclosed-tail input"