odysseus/tests/test_redos_cal_extract.py

r"""Regression test for ReDoS in the calendar-extract fallback regex.

CodeQL `py/redos` (#198) flagged the inline array-matcher in
`email_pollers.py` that recovers a `[{"action": ...}, ...]` JSON array from
raw LLM output (influenced by attacker-supplied email bodies). The original
pattern used `[^[\]]*?` lazy runs inside a `(...)*` repetition, which
backtracks *exponentially* on inputs like `[{"action"},{` + `}},{{` * N.

The regex is now a module-level constant so it can be pinned here. These tests
assert it (a) still extracts well-formed action arrays and (b) returns
promptly on the adversarial input that hung the old pattern.
"""

import time

from routes.email_pollers import _CAL_ACTION_ARRAY_RE


def _matches(s):
    return [m.group() for m in _CAL_ACTION_ARRAY_RE.finditer(s)]


def test_extracts_action_array_from_prose():
    s = 'Here you go:\n[{"action":"add","title":"Standup","start":"2026-07-01T09:00"}]\nThanks!'
    assert _matches(s) == ['[{"action":"add","title":"Standup","start":"2026-07-01T09:00"}]']


def test_extracts_multi_object_array():
    s = 'prose [{"action":"add","title":"A"},{"action":"cancel","uid":"x"}] tail'
    assert _matches(s) == ['[{"action":"add","title":"A"},{"action":"cancel","uid":"x"}]']


def test_no_array_returns_no_match():
    assert _matches("no array here at all") == []


def test_bracket_in_string_value_still_extracts():
    # The old `[^[\]]` class bailed on a '[' inside a value and matched nothing;
    # the linear `[^{}]` form correctly recovers the array.
    s = '[{"action":"add","title":"Meeting [urgent]","start":"x"}]'
    assert _matches(s) == [s]


def test_adversarial_input_is_fast():
    evil = '[{"action"},{' + '}},{{' * 100_000  # exploded the old exponential pattern
    start = time.perf_counter()
    _CAL_ACTION_ARRAY_RE.search(evil)
    dt = time.perf_counter() - start
    assert dt < 1.0, f"_CAL_ACTION_ARRAY_RE took {dt:.2f}s on adversarial input"