From ee40a0c8d946d3c3fd553734004f580fb315feed Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 10 Dec 2025 09:44:30 -0500 Subject: [PATCH 1/2] Correctly fold unknown-8bit originating from encoded words. The unknown-8bit trick was designed to deal with unknown bytes in an ASCII message, and it works fine for that. However, I also tried to extend it to handle bytes that can't be decoded using the charset specified in an encoded word, and there it fails because there can be other non-ASCII characters that were *successfully* decoded. The fix is simple: do the unknown-8bit encoding using the utf-8 codec. This is especially appropriate since anyone trying to do recovery on an unknown byte string will probably attempt utf-8 first. --- Lib/email/_encoded_words.py | 2 +- Lib/test/test_email/test__header_value_parser.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/Lib/email/_encoded_words.py b/Lib/email/_encoded_words.py index 6795a606de037e..05a34a4c105233 100644 --- a/Lib/email/_encoded_words.py +++ b/Lib/email/_encoded_words.py @@ -219,7 +219,7 @@ def encode(string, charset='utf-8', encoding=None, lang=''): """ if charset == 'unknown-8bit': - bstring = string.encode('ascii', 'surrogateescape') + bstring = string.encode('utf-8', 'surrogateescape') else: bstring = string.encode(charset) if encoding is None: diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index f7f9f9c4e2fbb5..863f7bd6f66e9a 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -3265,5 +3265,13 @@ def test_fold_unfoldable_element_stealing_whitespace(self): token = parser.get_address_list(text)[0] self._test(token, expected, policy=policy) + def test_encoded_word_with_undecodable_bytes(self): + self._test(parser.get_address_list( + ' =?utf-8?Q?=E5=AE=A2=E6=88=B6=E6=AD=A3=E8=A6=8F=E4=BA=A4=E7?=' + )[0], + ' 
=?unknown-8bit?b?5a6i5oi25q2j6KaP5Lqk5w==?=\n', + ) + + if __name__ == '__main__': unittest.main() From 1bba1343395bfc2b205c68a35e7fddd146999281 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 10 Dec 2025 10:00:42 -0500 Subject: [PATCH 2/2] News entry. --- .../Library/2025-12-10-10-00-06.gh-issue-142517.fG4hbe.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2025-12-10-10-00-06.gh-issue-142517.fG4hbe.rst diff --git a/Misc/NEWS.d/next/Library/2025-12-10-10-00-06.gh-issue-142517.fG4hbe.rst b/Misc/NEWS.d/next/Library/2025-12-10-10-00-06.gh-issue-142517.fG4hbe.rst new file mode 100644 index 00000000000000..388fff0e2acb96 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-12-10-10-00-06.gh-issue-142517.fG4hbe.rst @@ -0,0 +1,4 @@ +The non-``compat32`` :mod:`email` policies now correctly handle refolding +encoded words that contain bytes that can not be decoded in their specified +character set. Previously this resulted in an encoding exception during +folding.