diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index eb315583346..32fc06e35db 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -69,6 +69,7 @@ XXX: provide complete list of token types. import re import urllib # For urllib.parse.unquote +from string import hexdigits from collections import namedtuple, OrderedDict from email import _encoded_words as _ew from email import errors @@ -391,10 +392,6 @@ class UnstructuredTokenList(TokenList): token_type = 'unstructured' def _fold(self, folded): - if any(x.token_type=='encoded-word' for x in self): - return self._fold_encoded(folded) - # Here we can have either a pure ASCII string that may or may not - # have surrogateescape encoded bytes, or a unicode string. last_ew = None for part in self.parts: tstr = str(part) @@ -1386,35 +1383,6 @@ def _get_ptext_to_endchars(value, endchars): pos = pos + 1 return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp -def _decode_ew_run(value): - """ Decode a run of RFC2047 encoded words. - - _decode_ew_run(value) -> (text, value, defects) - - Scans the supplied value for a run of tokens that look like they are RFC - 2047 encoded words, decodes those words into text according to RFC 2047 - rules (whitespace between encoded words is discarded), and returns the text - and the remaining value (including any leading whitespace on the remaining - value), as well as a list of any defects encountered while decoding. The - input value may not have any leading whitespace. - - """ - res = [] - defects = [] - last_ws = '' - while value: - try: - tok, ws, value = _wsp_splitter(value, 1) - except ValueError: - tok, ws, value = value, '', '' - if not (tok.startswith('=?') and tok.endswith('?=')): - return ''.join(res), last_ws + tok + ws + value, defects - text, charset, lang, new_defects = _ew.decode(tok) - res.append(text) - defects.extend(new_defects) - last_ws = ws - return ''.join(res), last_ws, defects - def get_fws(value): """FWS = 1*WSP @@ -1440,7 +1408,8 @@ def get_encoded_word(value): raise errors.HeaderParseError( "expected encoded word but found {}".format(value)) remstr = ''.join(remainder) - if remstr[:2].isdigit(): + if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits: + # The ? after the CTE was followed by an encoded word escape (=XX). rest, *remainder = remstr.split('?=', 1) tok = tok + '?=' + rest if len(tok.split()) > 1: @@ -1488,8 +1457,8 @@ def get_unstructured(value): """ # XXX: but what about bare CR and LF? They might signal the start or - # end of an encoded word. YAGNI for now, since out current parsers - # will never send us strings with bard CR or LF. + # end of an encoded word. YAGNI for now, since our current parsers + # will never send us strings with bare CR or LF. unstructured = UnstructuredTokenList() while value: @@ -1501,6 +1470,8 @@ def get_unstructured(value): try: token, value = get_encoded_word(value) except errors.HeaderParseError: + # XXX: Need to figure out how to register defects when + # appropriate here. pass else: have_ws = True diff --git a/Lib/test/test_email/test__encoded_words.py b/Lib/test/test_email/test__encoded_words.py index 14395fed40d..f8e380dc554 100644 --- a/Lib/test/test_email/test__encoded_words.py +++ b/Lib/test/test_email/test__encoded_words.py @@ -122,6 +122,11 @@ class TestDecode(TestEmailBase): # XXX Should this be a new Defect instead? defects = [errors.CharsetError]) + def test_q_nonascii(self): + self._test('=?utf-8?q?=C3=89ric?=', + 'Éric', + charset='utf-8') + class TestEncodeQ(TestEmailBase): diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 6101e191c01..8917447217c 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -170,6 +170,15 @@ class TestParser(TestParserMixin, TestEmailBase): [], '') + def test_get_encoded_word_quopri_utf_escape_follows_cte(self): + # Issue 18044 + self._test_get_x(parser.get_encoded_word, + '=?utf-8?q?=C3=89ric?=', + 'Éric', + 'Éric', + [], + '') + # get_unstructured def _get_unst(self, value): diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index c0c81c1caa0..80f1c0238e4 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -123,12 +123,45 @@ class TestBaseHeaderFeatures(TestHeaderBase): # self.assertEqual(h, value) # self.assertDefectsEqual(h.defects, [errors.ObsoleteHeaderDefect]) - def test_RFC2047_value_decoded(self): - value = '=?utf-8?q?this_is_a_test?=' - h = self.make_header('subject', value) - self.assertEqual(h, 'this is a test') + +@parameterize +class TestUnstructuredHeader(TestHeaderBase): + + def string_as_value(self, + source, + decoded, + *args): + l = len(args) + defects = args[0] if l>0 else [] + header = 'Subject:' + (' ' if source else '') + folded = header + (args[1] if l>1 else source) + '\n' + h = self.make_header('Subject', source) + self.assertEqual(h, decoded) + self.assertDefectsEqual(h.defects, defects) + self.assertEqual(h.fold(policy=policy.default), folded) + + string_params = { + + 'rfc2047_simple_quopri': ( + '=?utf-8?q?this_is_a_test?=', + 'this is a test', + [], + 'this is a test'), + + 'rfc2047_gb2312_base64': ( + '=?gb2312?b?1eLKx9bQzsSy4srUo6E=?=', + '\u8fd9\u662f\u4e2d\u6587\u6d4b\u8bd5\uff01', + [], + '=?utf-8?b?6L+Z5piv5Lit5paH5rWL6K+V77yB?='), + + 'rfc2047_simple_nonascii_quopri': ( + '=?utf-8?q?=C3=89ric?=', + 'Éric'), + + } +@parameterize class TestDateHeader(TestHeaderBase): datestring = 'Sun, 23 Sep 2001 20:10:55 -0700' diff --git a/Misc/NEWS b/Misc/NEWS index 4cfe4f8fb18..751433fd398 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -151,6 +151,10 @@ Core and Builtins Library ------- +- Issue #18044: The new email header parser was mis-parsing encoded words where + an encoded character immediately followed the '?' that follows the CTE + character, resulting in a decoding failure. They are now decoded correctly. + - Issue #18101: Tcl.split() now process strings nested in a tuple as it do with byte strings.