fix(security): prevent exponential ReDoS in email→calendar extract regex (#4708)

The fallback regex in email_pollers.py that recovers a
[{"action": ...}, ...] JSON array from raw model output used lazy
[^[\]]*? runs inside a (?:,\s*\{...\}\s*)* repetition, which backtracks
exponentially (CodeQL py/redos) on inputs like [{"action"},{ + }},{{ * N.
It runs on the LLM reply to an email→calendar prompt embedding the
untrusted email body, so a crafted email can stall the background poller.

Extract the pattern to a module-level _CAL_ACTION_ARRAY_RE and rewrite the
object-content class from the lazy [^[\]]*? to a greedy brace-delimited
[^{}], which removes the quantifier ambiguity. The match is linear (a 500KB
adversarial input now resolves in <1ms) and equivalent on well-formed
arrays; it is also strictly more robust for values containing '[' or ']'
(the old class bailed on those and extracted nothing).

Resolves CodeQL py/redos #198.
This commit is contained in:
nopoz
2026-06-22 08:18:34 -07:00
committed by GitHub
parent 91b4171b3f
commit ca4973c41f
2 changed files with 61 additions and 1 deletions
+12 -1
View File
@@ -44,6 +44,17 @@ from routes.email_helpers import (
logger = logging.getLogger(__name__)
# Recovers a `[{"action": ...}, ...]` JSON array from raw LLM output when the
# fenced-block strip leaves nothing usable. Runs on model output influenced by
# untrusted email bodies, so it must not backtrack: the object content class is
# `[^{}]` (brace-delimited, greedy) rather than the old `[^[\]]*?` lazy runs,
# which exploded exponentially on inputs like `[{"action"},{` + `}},{{` * N
# (CodeQL py/redos #198).
_CAL_ACTION_ARRAY_RE = re.compile(
r'\[\s*\{[^{}]*"action"[^{}]*\}\s*(?:,\s*\{[^{}]*\}\s*)*\]',
re.DOTALL,
)
def _owner_for_email_account(account_id: str | None) -> str:
if not account_id:
@@ -558,7 +569,7 @@ async def _auto_summarize_pass_single(days_back: int = 1, account_id: str | None
cal_extract = _strip_think(_raw_original)
cal_extract = re.sub(r"^```(?:json)?\s*|\s*```$", "", cal_extract, flags=re.MULTILINE).strip()
if not cal_extract and _raw_original:
matches = list(re.finditer(r'\[\s*\{[^[\]]*?"action"[^[\]]*?\}\s*(?:,\s*\{[^[\]]*?\}\s*)*\]', _raw_original, re.DOTALL))
matches = list(_CAL_ACTION_ARRAY_RE.finditer(_raw_original))
if matches:
cal_extract = matches[-1].group()
logger.info(f"[cal-extract] uid={uid.decode() if isinstance(uid, bytes) else uid} folder={_folder} subj={subject[:50]!r} raw_len={len(cal_extract)} orig_len={len(_raw_original)} raw={cal_extract[:800]!r}")
+49
View File
@@ -0,0 +1,49 @@
r"""Regression test for ReDoS in the calendar-extract fallback regex.
CodeQL `py/redos` (#198) flagged the inline array-matcher in
`email_pollers.py` that recovers a `[{"action": ...}, ...]` JSON array from
raw LLM output (influenced by attacker-supplied email bodies). The original
pattern used `[^[\]]*?` lazy runs inside a `(...)*` repetition, which
backtracks *exponentially* on inputs like `[{"action"},{` + `}},{{` * N.
The regex is now a module-level constant so it can be pinned here. These tests
assert it (a) still extracts well-formed action arrays and (b) returns
promptly on the adversarial input that hung the old pattern.
"""
import time
from routes.email_pollers import _CAL_ACTION_ARRAY_RE
def _matches(s):
return [m.group() for m in _CAL_ACTION_ARRAY_RE.finditer(s)]
def test_extracts_action_array_from_prose():
s = 'Here you go:\n[{"action":"add","title":"Standup","start":"2026-07-01T09:00"}]\nThanks!'
assert _matches(s) == ['[{"action":"add","title":"Standup","start":"2026-07-01T09:00"}]']
def test_extracts_multi_object_array():
s = 'prose [{"action":"add","title":"A"},{"action":"cancel","uid":"x"}] tail'
assert _matches(s) == ['[{"action":"add","title":"A"},{"action":"cancel","uid":"x"}]']
def test_no_array_returns_no_match():
assert _matches("no array here at all") == []
def test_bracket_in_string_value_still_extracts():
# The old `[^[\]]` class bailed on a '[' inside a value and matched nothing;
# the linear `[^{}]` form correctly recovers the array.
s = '[{"action":"add","title":"Meeting [urgent]","start":"x"}]'
assert _matches(s) == [s]
def test_adversarial_input_is_fast():
evil = '[{"action"},{' + '}},{{' * 100_000 # exploded the old exponential pattern
start = time.perf_counter()
_CAL_ACTION_ARRAY_RE.search(evil)
dt = time.perf_counter() - start
assert dt < 1.0, f"_CAL_ACTION_ARRAY_RE took {dt:.2f}s on adversarial input"