mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 09:45:24 -04:00
fix(webhooks): redact IPv6 addresses in sanitized error messages (#3038)
* fix(webhooks): redact IPv6 addresses in sanitized error messages
sanitize_error() only stripped IPv4 literals, so a failed webhook
delivery to an internal IPv6 host (::1, fe80::/fc00:: ...) leaked the
address into Webhook.last_error, which is surfaced in the UI. The module
already treats internal IPv6 as sensitive (see _PRIVATE_NETWORKS and
src/url_safety.py); the scrubber just didn't keep up.
Add an IPv6 redaction pass covering bracketed, full 8-group, and
::-compressed forms. The pattern is scoped to leave clock times
("12:34:56"), MAC addresses, and C++ "::" tokens untouched, and the
::-branch uses a lookahead over a flat character class so there is no
nested quantifier to backtrack on (no ReDoS on long colon/hex runs).
Adds tests/test_webhook_sanitize_error_ipv6.py.
* webhook: validate IPv6 candidates with ipaddress, not a regex grammar
Per review on #3038: instead of hand-rolling the IPv6 grammar in a regex
(brittle, and easy to over-match colon-heavy text), use a loose regex to
find candidate tokens and let ipaddress.ip_address() decide. Only tokens
it parses as IPv6 are redacted, so the false-positive guards (clock times,
MACs, "std::vector") now come from the stdlib instead of a custom pattern.
This also covers cases the old pattern missed -- zone ids (fe80::1%eth0)
and IPv4-mapped addresses -- and no longer partially mangles invalid
colon strings (a 9-group token is preserved whole rather than losing its
first 8 groups). The bracketed branch is a single greedy class with no
X*:X* backtracking; verified ~1ms on 40k-char adversarial input.
Extends the test file with zone-id, IPv4-mapped, and invalid-token cases.
* webhook: redact bracketed/scoped/IPv4-mapped IPv6 as one unit
Review on #3038 found a few IP forms left partially redacted or malformed
by sanitize_error():
[fe80::1%eth0]:8080 -> [[redacted]]:8080
[::ffff:192.168.0.1]:8080 -> [[redacted][redacted]]:8080
::ffff:192.168.0.1 -> [redacted][redacted]
Two causes: the bracketed branch's character class dropped zone ids, so
scoped addresses fell through to the bare branch and left the brackets and
port behind; and the IPv4 pass ran first, stripping the embedded v4 of an
IPv4-mapped address so the v6 pass then redacted the "::ffff:" remnant
separately.
Fix:
- run the IP-candidate pass before the IPv4 pass, so IPv4-mapped forms are
matched and redacted whole
- match the full bracketed authority ([...] + optional %zone + :port) as a
single token, and redact a v4-or-v6 literal inside [ ] as one [redacted]
- extend the bare branch with a bounded (exactly-3) dotted-quad tail for
IPv4-mapped forms; exactly-3 so it can't swallow a partial suffix and
accidentally preserve an otherwise-valid address
Each form now collapses to a single [redacted]; the candidate finder stays
linear (~1.3ms on 40k-char adversarial input). Adds regression tests for
the three reported forms and keeps the timestamp/MAC/std::vector coverage.
This commit is contained in:
committed by
GitHub
parent
a3cb15d0a1
commit
3940297655
+54
-3
@@ -136,11 +136,62 @@ def validate_events(events_str: str) -> str:
|
||||
return ",".join(events)
|
||||
|
||||
|
||||
# Broad candidate matcher for the IP-redaction pass. Deliberately loose: a
|
||||
# bracketed host authority ([fe80::1%eth0]:8080 and friends) with an optional
|
||||
# :port, or a bare IPv6 run — hex groups joined by colons, an optional trailing
|
||||
# dotted-quad for IPv4-mapped forms (::ffff:192.168.0.1), and an optional %zone.
|
||||
# It does NOT encode the IPv6 grammar; ipaddress.ip_address() is the real
|
||||
# validator (see _redact_ip_candidate), so any colon-bearing string it rejects
|
||||
# (clock times, MACs, "std::vector") is left alone. Every branch is a single
|
||||
# greedy class or a repetition over a mandatory ':'/'.' delimiter, so there is no
|
||||
# nested-quantifier backtracking (ReDoS-safe).
|
||||
_IP_CANDIDATE = re.compile(
|
||||
r'\[[^\[\]\s]*\](?::\d+)?'
|
||||
r'|(?<![\w.:%])[0-9A-Fa-f]{0,4}(?::[0-9A-Fa-f]{0,4}){2,}'
|
||||
r'(?:(?:\.[0-9]{1,3}){3})?(?:%[0-9A-Za-z._-]+)?'
|
||||
)
|
||||
|
||||
|
||||
def _redact_ip_candidate(match: re.Match) -> str:
|
||||
"""Redact a candidate token that the stdlib confirms is an IP address.
|
||||
|
||||
A bare token is redacted only when it parses as IPv6 — bare IPv4 is left to
|
||||
the dedicated IPv4 pass. A bracketed token is a host authority, so a v4 or v6
|
||||
literal inside [ ] is redacted as a whole. This keeps output consistent (one
|
||||
[redacted], never nested or partial) for scoped/mapped/ported forms.
|
||||
"""
|
||||
token = match.group(0)
|
||||
bracketed = token.startswith('[')
|
||||
candidate = token
|
||||
if bracketed:
|
||||
# Keep only what's inside [...]; the trailing :port is dropped.
|
||||
candidate = candidate[1:candidate.index(']')]
|
||||
# A zone id (fe80::1%eth0) is not part of the address ipaddress parses.
|
||||
candidate = candidate.split('%', 1)[0]
|
||||
# The loose bare pattern can trail one stray ':' (e.g. "::1:" in "host ::1:
|
||||
# down"); drop it unless it's the "::" compression marker.
|
||||
if candidate.endswith(':') and not candidate.endswith('::'):
|
||||
candidate = candidate[:-1]
|
||||
try:
|
||||
addr = ipaddress.ip_address(candidate)
|
||||
except ValueError:
|
||||
return token
|
||||
if bracketed or isinstance(addr, ipaddress.IPv6Address):
|
||||
return '[redacted]'
|
||||
return token
|
||||
|
||||
|
||||
def sanitize_error(error: str, max_len: int = 200) -> str:
|
||||
"""Strip potentially sensitive details from error messages."""
|
||||
# Remove IP addresses and ports
|
||||
cleaned = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(:\d+)?', '[redacted]', error)
|
||||
# Remove hostnames in URLs
|
||||
# Redact IPv6 (and bracketed-authority) addresses first, so an IPv4-mapped
|
||||
# form like ::ffff:192.168.0.1 is scrubbed as one unit instead of having its
|
||||
# embedded IPv4 removed first and leaving a stray "::ffff:" behind. Broad
|
||||
# candidates are validated by ipaddress.ip_address(), so the false-positive
|
||||
# guards (clock times, MACs, C++ "::") come from the stdlib, not a regex.
|
||||
cleaned = _IP_CANDIDATE.sub(_redact_ip_candidate, error)
|
||||
# Remove remaining bare IPv4 addresses and ports.
|
||||
cleaned = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(:\d+)?', '[redacted]', cleaned)
|
||||
# Remove hostnames in URLs.
|
||||
cleaned = re.sub(r'https?://[^\s/]+', '[redacted-url]', cleaned)
|
||||
return cleaned[:max_len]
|
||||
|
||||
|
||||
@@ -0,0 +1,98 @@
|
||||
"""sanitize_error must scrub IPv6 addresses, not just IPv4.
|
||||
|
||||
Webhook delivery errors are stored in Webhook.last_error and surfaced in the
|
||||
UI. The scrubber removed IPv4 literals but let IPv6 addresses through, so a
|
||||
failed delivery to an internal v6 host (::1, fe80::/fc00:: ...) leaked the
|
||||
address. This pins the v6 redaction while keeping the false-positive guards
|
||||
(clock times, MACs, C++ "::") that make the pattern safe on arbitrary text.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from unittest.mock import patch
|
||||
|
||||
from tests.helpers.import_state import clear_module, preserve_import_state
|
||||
|
||||
# Same import dance as test_webhook_ssrf_resilience.py: webhook_manager pulls in
|
||||
# core.database (init_db -> create_all), which needs a DB path at import time.
|
||||
# Pin DATABASE_URL to in-memory SQLite and restore module state afterwards.
|
||||
# sanitize_error itself is pure (stdlib re only).
|
||||
with patch.dict(os.environ, {"DATABASE_URL": "sqlite:///:memory:"}), \
|
||||
preserve_import_state("src.database", "core.database"):
|
||||
clear_module("src.database")
|
||||
_core_database = sys.modules.get("core.database")
|
||||
if _core_database is not None and not getattr(_core_database, "__file__", None):
|
||||
del sys.modules["core.database"]
|
||||
from src.webhook_manager import sanitize_error
|
||||
|
||||
|
||||
def test_ipv6_addresses_are_redacted():
|
||||
leaky = [
|
||||
"connect to [fd00::1234:5678]:8080 failed", # bracketed + port
|
||||
"ConnectError to fe80::1 refused", # link-local
|
||||
"no route to ::1", # loopback
|
||||
"host fc00::abcd unreachable", # unique-local
|
||||
"connect to [::1]:443 refused", # bracketed + port
|
||||
"POST https://[2001:db8::1]:443/hook failed", # inside a URL
|
||||
"addr 2001:0db8:0000:0000:0000:ff00:0042:8329", # full 8-group
|
||||
]
|
||||
for msg in leaky:
|
||||
out = sanitize_error(msg)
|
||||
# Scrubbed via the v6 rule ([redacted]) or, inside a URL, the URL rule
|
||||
# ([redacted-url]) — either way the address must not survive.
|
||||
assert "[redacted" in out, out
|
||||
assert "::" not in out and "[fd00" not in out, out
|
||||
|
||||
|
||||
def test_non_addresses_are_preserved():
|
||||
# Colon-bearing strings that are NOT IPv6 must pass through untouched, so
|
||||
# error messages stay readable.
|
||||
safe = [
|
||||
"failed at 12:34:56 today", # clock time
|
||||
"2026-06-05T22:36:55 connection reset", # ISO timestamp
|
||||
"std::vector<int> overflow", # C++ scope resolution
|
||||
"device ab:cd:ef:01:23:45 offline", # MAC address
|
||||
"unsupported ratio 16:9",
|
||||
"HTTP 500 from upstream",
|
||||
"request [deadbeef] failed", # bracketed hex id, no colon
|
||||
]
|
||||
for msg in safe:
|
||||
assert sanitize_error(msg) == msg, msg
|
||||
|
||||
|
||||
def test_ipv4_still_redacted_and_length_capped():
|
||||
assert sanitize_error("dial 192.168.1.5:9000 refused") == "dial [redacted] refused"
|
||||
assert len(sanitize_error("x" * 500)) == 200
|
||||
|
||||
|
||||
def test_ipv6_zone_id_is_redacted():
|
||||
# Link-local addresses often carry a %zone (fe80::1%eth0). The whole token,
|
||||
# zone included, must go — ipaddress validates the address part.
|
||||
out = sanitize_error("bind fe80::1%eth0 unreachable")
|
||||
assert "[redacted]" in out
|
||||
assert "::" not in out and "%eth0" not in out and "fe80" not in out
|
||||
|
||||
|
||||
def test_ipv4_mapped_ipv6_is_scrubbed():
|
||||
# ::ffff:192.168.0.1 must be redacted as a single unit (one [redacted]), not
|
||||
# split into "[redacted][redacted]" by the v6 and v4 passes.
|
||||
assert sanitize_error("to ::ffff:192.168.0.1 closed") == "to [redacted] closed"
|
||||
|
||||
|
||||
def test_bracketed_scoped_ipv6_with_port_is_one_redaction():
|
||||
# [fe80::1%eth0]:8080 — the whole bracketed authority (zone + port) goes,
|
||||
# with no leftover brackets/port and no nested [redacted].
|
||||
assert sanitize_error("dial [fe80::1%eth0]:8080 timeout") == "dial [redacted] timeout"
|
||||
|
||||
|
||||
def test_bracketed_ipv4_mapped_with_port_is_one_redaction():
|
||||
# [::ffff:192.168.0.1]:8080 — same, for an IPv4-mapped literal in brackets.
|
||||
assert sanitize_error("dial [::ffff:192.168.0.1]:8080 timeout") == "dial [redacted] timeout"
|
||||
|
||||
|
||||
def test_invalid_ipv6_is_not_partially_mangled():
|
||||
# Nine groups is not a valid address. Backing the scrub with ipaddress means
|
||||
# the whole token is preserved, instead of a hand-rolled 8-group regex
|
||||
# chewing off "1:2:3:4:5:6:7:8" and leaving a dangling ":9".
|
||||
msg = "weird id 1:2:3:4:5:6:7:8:9 here"
|
||||
assert sanitize_error(msg) == msg
|
||||
Reference in New Issue
Block a user