From b8463e3ac2eaffc7e980b02099e3f673236ffa46 Mon Sep 17 00:00:00 2001 From: SurprisedDuck Date: Sun, 7 Jun 2026 16:56:20 +0200 Subject: [PATCH] fix(email): decode headers without injected spaces (#2433) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit routes.email_helpers._decode_header joined the runs from email.header.decode_header() with " ". Those runs carry their own surrounding whitespace (e.g. (b"Re: ", None)), and RFC 2047 §6.2 requires the whitespace between two adjacent encoded-words to be dropped, so the join produced a double space after an ASCII prefix ("Re:  Jóse"), a spurious space in "Name <addr>" senders, and a stray space between two adjacent encoded-words ("Café 日本"). _decode_header backs the inbox list, message read, search, and the background pollers, so the corruption hit essentially every non-ASCII subject/sender. Use email.header.make_header(...) for RFC-correct concatenation, keeping the existing lossy per-part fallback for malformed/unknown MIME charsets (make_header raises LookupError there) so the unknown-charset contract in tests/test_email_decode_header.py still holds. The sibling mcp_servers.email_server._decode_header was already fixed the same way (commit 46999de); this brings the routes.email_helpers copy in line, with regression coverage. 
Supported by Claude Opus 4.8 Co-authored-by: SurprisedDuck <288741682+SurprisedDuck@users.noreply.github.com> --- routes/email_helpers.py | 36 +++++++++------- ...test_email_helpers_decode_header_spaces.py | 42 +++++++++++++++++++ 2 files changed, 64 insertions(+), 14 deletions(-) create mode 100644 tests/test_email_helpers_decode_header_spaces.py diff --git a/routes/email_helpers.py b/routes/email_helpers.py index 454fc9dc0..e973a6b73 100644 --- a/routes/email_helpers.py +++ b/routes/email_helpers.py @@ -802,20 +802,28 @@ def _imap(account_id: str | None = None, owner: str = ""): def _decode_header(raw): if not raw: return "" - parts = email.header.decode_header(raw) - decoded = [] - for data, charset in parts: - if isinstance(data, bytes): - try: - decoded.append(data.decode(charset or "utf-8", errors="replace")) - except (LookupError, ValueError): - # Unknown/invalid MIME charset (e.g. a malformed or spam header - # like =?x-unknown-charset?B?...?=). errors="replace" only covers - # byte-decode errors, not codec lookup, so fall back to utf-8. - decoded.append(data.decode("utf-8", errors="replace")) - else: - decoded.append(data) - return " ".join(decoded) + try: + # make_header concatenates per RFC 2047: no spurious space between an + # encoded-word and adjacent plain text (plain runs keep their own + # whitespace), and the whitespace between two adjacent encoded-words is + # dropped. The old " ".join produced "Re: Jose"-style double spaces on + # every non-ASCII subject or sender. + return str(email.header.make_header(email.header.decode_header(raw))) + except Exception: + # Malformed header or unknown/invalid MIME charset (e.g. a spam header + # like =?x-unknown-charset?B?...?=) makes make_header raise LookupError; + # fall back to a lossy per-part decode. errors="replace" only covers + # byte-decode errors, not codec lookup, hence the explicit utf-8 retry. 
+ decoded = [] + for data, charset in email.header.decode_header(raw): + if isinstance(data, bytes): + try: + decoded.append(data.decode(charset or "utf-8", errors="replace")) + except (LookupError, ValueError): + decoded.append(data.decode("utf-8", errors="replace")) + else: + decoded.append(data) + return "".join(decoded) def _detect_sent_folder(conn): diff --git a/tests/test_email_helpers_decode_header_spaces.py b/tests/test_email_helpers_decode_header_spaces.py new file mode 100644 index 000000000..c6e626589 --- /dev/null +++ b/tests/test_email_helpers_decode_header_spaces.py @@ -0,0 +1,42 @@ +"""routes.email_helpers._decode_header must not inject spaces between parts. + +email.header.decode_header returns plain-text runs WITH their surrounding +whitespace (e.g. (b"Re: ", None)), so joining the parts with " " produced a +double space after "Re:" on every non-ASCII subject, a spurious space in +"Name " senders, and violated RFC 2047 6.2, which requires the +whitespace between two adjacent encoded-words to be dropped. The corruption +surfaced on the inbox list, message read, search, and the background pollers. + +The sibling mcp_servers.email_server._decode_header was already fixed for this +(see tests/test_mcp_email_decode_header_spaces.py); these pin the same contract +for the routes.email_helpers copy. +""" +import os +import tempfile +from pathlib import Path + +_tmp_data = Path(tempfile.mkdtemp(prefix="odysseus_decode_hdr_spaces_")) +os.environ.setdefault("DATA_DIR", str(_tmp_data)) +os.environ.setdefault("DATABASE_URL", f"sqlite:///{_tmp_data / 'app.db'}") + +from routes.email_helpers import _decode_header + + +def test_prefix_then_encoded_word_single_space(): + # "Re: " (plain text, trailing space) followed by an encoded word must + # keep exactly one space -- the old " ".join produced "Re: Jose". 
+ assert _decode_header("Re: =?utf-8?b?SsOzc2U=?=") == "Re: Jóse" + + +def test_encoded_word_then_plain_text_single_space(): + assert _decode_header("=?utf-8?b?SsOzc2U=?= Smith") == "Jóse Smith" + + +def test_adjacent_encoded_words_join_without_space(): + # RFC 2047 6.2: whitespace between two adjacent encoded-words is dropped. + out = _decode_header("=?iso-8859-1?q?Caf=E9?= =?utf-8?b?5pel5pys?=") + assert out == "Café日本" + + +def test_plain_ascii_header_unchanged(): + assert _decode_header("Weekly report") == "Weekly report"