mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-16 17:55:26 -04:00
fix(email): decode headers without injected spaces (#2433)
routes.email_helpers._decode_header joined the runs from
email.header.decode_header() with " ". Those runs carry their own
surrounding whitespace (e.g. (b"Re: ", None)), and RFC 2047 §6.2 requires
the whitespace between two adjacent encoded-words to be dropped, so the
join produced a double space after an ASCII prefix ("Re:  Jóse"), a
spurious space in "Name <addr>" senders, and a stray space between two
adjacent encoded-words ("Café 日本"). _decode_header backs the inbox list,
message read, search, and the background pollers, so the corruption hit
essentially every non-ASCII subject/sender.
Use email.header.make_header(...) for RFC-correct concatenation, keeping
the existing lossy per-part fallback for malformed/unknown MIME charsets
(make_header raises LookupError there) so the unknown-charset contract in
tests/test_email_decode_header.py still holds.
The sibling mcp_servers.email_server._decode_header was already fixed the
same way (commit 46999de); this brings the routes.email_helpers copy in
line, with regression coverage.
Supported by Claude Opus 4.8
Co-authored-by: SurprisedDuck <288741682+SurprisedDuck@users.noreply.github.com>
This commit is contained in:
+22
-14
@@ -802,20 +802,28 @@ def _imap(account_id: str | None = None, owner: str = ""):
|
|||||||
def _decode_header(raw):
|
def _decode_header(raw):
|
||||||
if not raw:
|
if not raw:
|
||||||
return ""
|
return ""
|
||||||
parts = email.header.decode_header(raw)
|
try:
|
||||||
decoded = []
|
# make_header concatenates per RFC 2047: no spurious space between an
|
||||||
for data, charset in parts:
|
# encoded-word and adjacent plain text (plain runs keep their own
|
||||||
if isinstance(data, bytes):
|
# whitespace), and the whitespace between two adjacent encoded-words is
|
||||||
try:
|
# dropped. The old " ".join produced "Re:  Jose"-style double spaces on
|
||||||
decoded.append(data.decode(charset or "utf-8", errors="replace"))
|
# every non-ASCII subject or sender.
|
||||||
except (LookupError, ValueError):
|
return str(email.header.make_header(email.header.decode_header(raw)))
|
||||||
# Unknown/invalid MIME charset (e.g. a malformed or spam header
|
except Exception:
|
||||||
# like =?x-unknown-charset?B?...?=). errors="replace" only covers
|
# Malformed header or unknown/invalid MIME charset (e.g. a spam header
|
||||||
# byte-decode errors, not codec lookup, so fall back to utf-8.
|
# like =?x-unknown-charset?B?...?=) makes make_header raise LookupError;
|
||||||
decoded.append(data.decode("utf-8", errors="replace"))
|
# fall back to a lossy per-part decode. errors="replace" only covers
|
||||||
else:
|
# byte-decode errors, not codec lookup, hence the explicit utf-8 retry.
|
||||||
decoded.append(data)
|
decoded = []
|
||||||
return " ".join(decoded)
|
for data, charset in email.header.decode_header(raw):
|
||||||
|
if isinstance(data, bytes):
|
||||||
|
try:
|
||||||
|
decoded.append(data.decode(charset or "utf-8", errors="replace"))
|
||||||
|
except (LookupError, ValueError):
|
||||||
|
decoded.append(data.decode("utf-8", errors="replace"))
|
||||||
|
else:
|
||||||
|
decoded.append(data)
|
||||||
|
return "".join(decoded)
|
||||||
|
|
||||||
|
|
||||||
def _detect_sent_folder(conn):
|
def _detect_sent_folder(conn):
|
||||||
|
|||||||
@@ -0,0 +1,42 @@
|
|||||||
|
"""routes.email_helpers._decode_header must not inject spaces between parts.
|
||||||
|
|
||||||
|
email.header.decode_header returns plain-text runs WITH their surrounding
|
||||||
|
whitespace (e.g. (b"Re: ", None)), so joining the parts with " " produced a
|
||||||
|
double space after "Re:" on every non-ASCII subject, a spurious space in
|
||||||
|
"Name <addr>" senders, and violated RFC 2047 6.2, which requires the
|
||||||
|
whitespace between two adjacent encoded-words to be dropped. The corruption
|
||||||
|
surfaced on the inbox list, message read, search, and the background pollers.
|
||||||
|
|
||||||
|
The sibling mcp_servers.email_server._decode_header was already fixed for this
|
||||||
|
(see tests/test_mcp_email_decode_header_spaces.py); these pin the same contract
|
||||||
|
for the routes.email_helpers copy.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
_tmp_data = Path(tempfile.mkdtemp(prefix="odysseus_decode_hdr_spaces_"))
|
||||||
|
os.environ.setdefault("DATA_DIR", str(_tmp_data))
|
||||||
|
os.environ.setdefault("DATABASE_URL", f"sqlite:///{_tmp_data / 'app.db'}")
|
||||||
|
|
||||||
|
from routes.email_helpers import _decode_header
|
||||||
|
|
||||||
|
|
||||||
|
def test_prefix_then_encoded_word_single_space():
|
||||||
|
# "Re: " (plain text, trailing space) followed by an encoded word must
|
||||||
|
# keep exactly one space -- the old " ".join produced "Re:  Jose".
|
||||||
|
assert _decode_header("Re: =?utf-8?b?SsOzc2U=?=") == "Re: Jóse"
|
||||||
|
|
||||||
|
|
||||||
|
def test_encoded_word_then_plain_text_single_space():
|
||||||
|
assert _decode_header("=?utf-8?b?SsOzc2U=?= Smith") == "Jóse Smith"
|
||||||
|
|
||||||
|
|
||||||
|
def test_adjacent_encoded_words_join_without_space():
|
||||||
|
# RFC 2047 6.2: whitespace between two adjacent encoded-words is dropped.
|
||||||
|
out = _decode_header("=?iso-8859-1?q?Caf=E9?= =?utf-8?b?5pel5pys?=")
|
||||||
|
assert out == "Café日本"
|
||||||
|
|
||||||
|
|
||||||
|
def test_plain_ascii_header_unchanged():
|
||||||
|
assert _decode_header("Weekly report") == "Weekly report"
|
||||||
Reference in New Issue
Block a user