mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-22 20:55:29 -04:00
ca4973c41f
The fallback regex in email_pollers.py that recovers a
[{"action": ...}, ...] JSON array from raw model output used lazy
[^[\]]*? runs inside a (?:,\s*\{...\}\s*)* repetition, which backtracks
exponentially (CodeQL py/redos) on inputs like `[{"action"},{` + `}},{{` * N.
It runs on the LLM reply to an email→calendar prompt embedding the
untrusted email body, so a crafted email can stall the background poller.
Extract the pattern to a module-level _CAL_ACTION_ARRAY_RE and rewrite the
object-content class from the lazy [^[\]]*? to a greedy brace-delimited
[^{}], which removes the quantifier ambiguity. The match is linear (a 500KB
adversarial input now resolves in <1ms) and equivalent on well-formed
arrays; it is also strictly more robust for values containing '[' or ']'
(the old class bailed on those and extracted nothing).
Resolves CodeQL py/redos #198.
50 lines
1.9 KiB
Python
50 lines
1.9 KiB
Python
r"""Regression test for ReDoS in the calendar-extract fallback regex.

CodeQL `py/redos` (#198) flagged the inline array-matcher in
`email_pollers.py` that recovers a `[{"action": ...}, ...]` JSON array from
raw LLM output (influenced by attacker-supplied email bodies). The original
pattern used `[^[\]]*?` lazy runs inside a `(...)*` repetition, which
backtracks *exponentially* on inputs like `[{"action"},{` + `}},{{` * N.

The regex is now a module-level constant so it can be pinned here. These tests
assert it (a) still extracts well-formed action arrays and (b) returns
promptly on the adversarial input that hung the old pattern.
"""
|
|
|
|
import time
|
|
|
|
from routes.email_pollers import _CAL_ACTION_ARRAY_RE
|
|
|
|
|
|
def _matches(s):
    """Return the matched text of every `_CAL_ACTION_ARRAY_RE` hit in *s*, in order."""
    found = []
    for hit in _CAL_ACTION_ARRAY_RE.finditer(s):
        found.append(hit.group())
    return found
|
|
|
|
|
|
def test_extracts_action_array_from_prose():
    """A single-object action array embedded in prose is the only match."""
    reply = 'Here you go:\n[{"action":"add","title":"Standup","start":"2026-07-01T09:00"}]\nThanks!'
    expected = ['[{"action":"add","title":"Standup","start":"2026-07-01T09:00"}]']
    found = _matches(reply)
    assert found == expected
|
|
|
|
|
|
def test_extracts_multi_object_array():
    """An array holding two action objects is returned as one whole match."""
    reply = 'prose [{"action":"add","title":"A"},{"action":"cancel","uid":"x"}] tail'
    expected = ['[{"action":"add","title":"A"},{"action":"cancel","uid":"x"}]']
    found = _matches(reply)
    assert found == expected
|
|
|
|
|
|
def test_no_array_returns_no_match():
    """Text containing no action array yields an empty match list."""
    found = _matches("no array here at all")
    assert found == []
|
|
|
|
|
|
def test_bracket_in_string_value_still_extracts():
    """Square brackets inside a string value do not stop the array from matching."""
    # The previous `[^[\]]` character class gave up at a '[' inside a value and
    # produced no match; the linear `[^{}]` form recovers the full array.
    reply = '[{"action":"add","title":"Meeting [urgent]","start":"x"}]'
    found = _matches(reply)
    # The whole input is itself the array, so it must be the single match.
    assert found == [reply]
|
|
|
|
|
|
def test_adversarial_input_is_fast():
    """Searching the crafted input must finish in under one second."""
    # exploded the old exponential pattern
    payload = '[{"action"},{' + '}},{{' * 100_000
    began = time.perf_counter()
    _CAL_ACTION_ARRAY_RE.search(payload)
    finished = time.perf_counter()
    dt = finished - began
    assert dt < 1.0, f"_CAL_ACTION_ARRAY_RE took {dt:.2f}s on adversarial input"
|