From b8463e3ac2eaffc7e980b02099e3f673236ffa46 Mon Sep 17 00:00:00 2001
From: SurprisedDuck <jannik.theiss@googlemail.com>
Date: Sun, 7 Jun 2026 16:56:20 +0200
Subject: [PATCH] fix(email): decode headers without injected spaces (#2433)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

routes.email_helpers._decode_header joined the runs from
email.header.decode_header() with " ". Those runs carry their own
surrounding whitespace (e.g. (b"Re: ", None)), and RFC 2047 §6.2 requires
the whitespace between two adjacent encoded-words to be dropped, so the
join produced a double space after an ASCII prefix ("Re:  Jóse"), a
spurious space in "Name <addr>" senders, and a stray space between two
adjacent encoded-words ("Café 日本"). _decode_header backs the inbox list,
message read, search, and the background pollers, so the corruption hit
essentially every non-ASCII subject/sender.

Use email.header.make_header(...) for RFC-correct concatenation, keeping
the existing lossy per-part fallback for malformed/unknown MIME charsets
(make_header raises LookupError there) so the unknown-charset contract in
tests/test_email_decode_header.py still holds. The fallback now joins its
parts with "" instead of " ", for the same reason: the runs already carry
their own whitespace.

The sibling mcp_servers.email_server._decode_header was already fixed the
same way (commit 46999de); this brings the routes.email_helpers copy in
line, with regression coverage.

Supported by Claude Opus 4.8

Co-authored-by: SurprisedDuck <288741682+SurprisedDuck@users.noreply.github.com>
---
 routes/email_helpers.py                       | 36 +++++++++-------
 ...test_email_helpers_decode_header_spaces.py | 42 +++++++++++++++++++
 2 files changed, 64 insertions(+), 14 deletions(-)
 create mode 100644 tests/test_email_helpers_decode_header_spaces.py
diff --git a/routes/email_helpers.py b/routes/email_helpers.py
index 454fc9dc0..e973a6b73 100644
--- a/routes/email_helpers.py
+++ b/routes/email_helpers.py
@@ -802,20 +802,28 @@ def _imap(account_id: str | None = None, owner: str = ""):
 def _decode_header(raw):
     if not raw:
         return ""
-    parts = email.header.decode_header(raw)
-    decoded = []
-    for data, charset in parts:
-        if isinstance(data, bytes):
-            try:
-                decoded.append(data.decode(charset or "utf-8", errors="replace"))
-            except (LookupError, ValueError):
-                # Unknown/invalid MIME charset (e.g. a malformed or spam header
-                # like =?x-unknown-charset?B?...?=). errors="replace" only covers
-                # byte-decode errors, not codec lookup, so fall back to utf-8.
-                decoded.append(data.decode("utf-8", errors="replace"))
-        else:
-            decoded.append(data)
-    return " ".join(decoded)
+    try:
+        # make_header concatenates per RFC 2047: no spurious space between an
+        # encoded-word and adjacent plain text (plain runs keep their own
+        # whitespace), and the whitespace between two adjacent encoded-words is
+        # dropped. The old " ".join produced "Re:  Jose"-style double spaces on
+        # every non-ASCII subject or sender.
+        return str(email.header.make_header(email.header.decode_header(raw)))
+    except Exception:
+        # Malformed header or unknown/invalid MIME charset (e.g. a spam header
+        # like =?x-unknown-charset?B?...?=) makes make_header raise LookupError;
+        # fall back to a lossy per-part decode. errors="replace" only covers
+        # byte-decode errors, not codec lookup, hence the explicit utf-8 retry.
+        decoded = []
+        for data, charset in email.header.decode_header(raw):
+            if isinstance(data, bytes):
+                try:
+                    decoded.append(data.decode(charset or "utf-8", errors="replace"))
+                except (LookupError, ValueError):
+                    decoded.append(data.decode("utf-8", errors="replace"))
+            else:
+                decoded.append(data)
+        return "".join(decoded)
 
 
 def _detect_sent_folder(conn):
diff --git a/tests/test_email_helpers_decode_header_spaces.py b/tests/test_email_helpers_decode_header_spaces.py
new file mode 100644
index 000000000..c6e626589
--- /dev/null
+++ b/tests/test_email_helpers_decode_header_spaces.py
@@ -0,0 +1,42 @@
+"""routes.email_helpers._decode_header must not inject spaces between parts.
+
+email.header.decode_header returns plain-text runs WITH their surrounding
+whitespace (e.g. (b"Re: ", None)), so joining the parts with " " produced a
+double space after "Re:" on every non-ASCII subject and a spurious space in
+"Name <addr>" senders. It also violated RFC 2047 6.2, which requires the
+whitespace between two adjacent encoded-words to be dropped. The corruption
+surfaced on the inbox list, message read, search, and the background pollers.
+
+The sibling mcp_servers.email_server._decode_header was already fixed for this
+(see tests/test_mcp_email_decode_header_spaces.py); these pin the same contract
+for the routes.email_helpers copy.
+"""
+import os
+import tempfile
+from pathlib import Path
+
+_tmp_data = Path(tempfile.mkdtemp(prefix="odysseus_decode_hdr_spaces_"))
+os.environ.setdefault("DATA_DIR", str(_tmp_data))
+os.environ.setdefault("DATABASE_URL", f"sqlite:///{_tmp_data / 'app.db'}")
+
+from routes.email_helpers import _decode_header
+
+
+def test_prefix_then_encoded_word_single_space():
+    # "Re: " (plain text, trailing space) followed by an encoded word must
+    # keep exactly one space -- the old " ".join produced "Re:  Jose".
+    assert _decode_header("Re: =?utf-8?b?SsOzc2U=?=") == "Re: Jóse"
+
+
+def test_encoded_word_then_plain_text_single_space():
+    assert _decode_header("=?utf-8?b?SsOzc2U=?= Smith") == "Jóse Smith"
+
+
+def test_adjacent_encoded_words_join_without_space():
+    # RFC 2047 6.2: whitespace between two adjacent encoded-words is dropped.
+    out = _decode_header("=?iso-8859-1?q?Caf=E9?= =?utf-8?b?5pel5pys?=")
+    assert out == "Café日本"
+
+
+def test_plain_ascii_header_unchanged():
+    assert _decode_header("Weekly report") == "Weekly report"