diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 32fc06e35db..039237936c6 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -1624,6 +1624,7 @@ def get_quoted_string(value):
 def get_atom(value):
     """atom = [CFWS] 1*atext [CFWS]
 
+    An atom could be an rfc2047 encoded word.
     """
     atom = Atom()
     if value and value[0] in CFWS_LEADER:
@@ -1632,7 +1633,15 @@ def get_atom(value):
     if value and value[0] in ATOM_ENDS:
         raise errors.HeaderParseError(
             "expected atom but found '{}'".format(value))
-    token, value = get_atext(value)
+    if value.startswith('=?'):
+        try:
+            token, value = get_encoded_word(value)
+        except errors.HeaderParseError:
+            # XXX: need to figure out how to register defects when
+            # appropriate here.
+            token, value = get_atext(value)
+    else:
+        token, value = get_atext(value)
     atom.append(token)
     if value and value[0] in CFWS_LEADER:
         token, value = get_cfws(value)
@@ -1661,12 +1670,22 @@ def get_dot_atom_text(value):
 def get_dot_atom(value):
     """ dot-atom = [CFWS] dot-atom-text [CFWS]
 
+    Any place we can have a dot atom, we could instead have an rfc2047 encoded
+    word.
     """
     dot_atom = DotAtom()
     if value[0] in CFWS_LEADER:
         token, value = get_cfws(value)
         dot_atom.append(token)
-    token, value = get_dot_atom_text(value)
+    if value.startswith('=?'):
+        try:
+            token, value = get_encoded_word(value)
+        except errors.HeaderParseError:
+            # XXX: need to figure out how to register defects when
+            # appropriate here.
+            token, value = get_dot_atom_text(value)
+    else:
+        token, value = get_dot_atom_text(value)
     dot_atom.append(token)
     if value and value[0] in CFWS_LEADER:
         token, value = get_cfws(value)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index 8917447217c..646082b4a40 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -808,9 +808,13 @@ class TestParser(TestParserMixin, TestEmailBase):
         self.assertEqual(atom[2].comments, ['bar'])
 
     def test_get_atom_atom_ends_at_noncfws(self):
-        atom = self._test_get_x(parser.get_atom,
+        self._test_get_x(parser.get_atom,
             'bob fred', 'bob ', 'bob ', [], 'fred')
 
+    def test_get_atom_rfc2047_atom(self):
+        self._test_get_x(parser.get_atom,
+            '=?utf-8?q?=20bob?=', ' bob', ' bob', [], '')
+
     # get_dot_atom_text
 
     def test_get_dot_atom_text(self):
@@ -885,6 +889,10 @@ class TestParser(TestParserMixin, TestEmailBase):
         with self.assertRaises(errors.HeaderParseError):
             parser.get_dot_atom(' (foo) bar.bang. foo')
 
+    def test_get_dot_atom_rfc2047_atom(self):
+        self._test_get_x(parser.get_dot_atom,
+            '=?utf-8?q?=20bob?=', ' bob', ' bob', [], '')
+
     # get_word (if this were black box we'd repeat all the qs/atom tests)
 
     def test_get_word_atom_yields_atom(self):
@@ -2156,6 +2164,22 @@ class TestParser(TestParserMixin, TestEmailBase):
         self.assertEqual(address[0].token_type,
                          'mailbox')
 
+    def test_get_address_rfc2047_display_name(self):
+        address = self._test_get_x(parser.get_address,
+            '=?utf-8?q?=C3=89ric?= <foo@example.com>',
+            'Éric <foo@example.com>',
+            'Éric <foo@example.com>',
+            [],
+            '')
+        self.assertEqual(address.token_type, 'address')
+        self.assertEqual(len(address.mailboxes), 1)
+        self.assertEqual(address.mailboxes,
+                         address.all_mailboxes)
+        self.assertEqual(address.mailboxes[0].display_name,
+                         'Éric')
+        self.assertEqual(address[0].token_type,
+                         'mailbox')
+
     def test_get_address_empty_group(self):
         address = self._test_get_x(parser.get_address,
             'Monty Python:;',
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py
index 80f1c0238e4..f754a324316 100644
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -158,6 +158,10 @@ class TestUnstructuredHeader(TestHeaderBase):
             '=?utf-8?q?=C3=89ric?=',
             'Éric'),
 
+        'rfc2047_quopri_with_regular_text': (
+            'The =?utf-8?q?=C3=89ric=2C?= Himself',
+            'The Éric, Himself'),
+
         }
 
 
@@ -1119,6 +1123,26 @@ class TestAddressHeader(TestHeaderBase):
              'example.com',
              None),
 
+        'rfc2047_atom_is_decoded':
+            ('=?utf-8?q?=C3=89ric?= <foo@example.com>',
+             [],
+             'Éric <foo@example.com>',
+             'Éric',
+             'foo@example.com',
+             'foo',
+             'example.com',
+             None),
+
+        'rfc2047_atom_in_phrase_is_decoded':
+            ('The =?utf-8?q?=C3=89ric=2C?= Himself <foo@example.com>',
+             [],
+             '"The Éric, Himself" <foo@example.com>',
+             'The Éric, Himself',
+             'foo@example.com',
+             'foo',
+             'example.com',
+             None),
+
         }
 
     # XXX: Need many more examples, and in particular some with names in
diff --git a/Misc/NEWS b/Misc/NEWS
index bcdfbf3b6d5..da97dbc079b 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -154,6 +154,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #18431: The new email header parser now decodes RFC2047 encoded words
+  in structured headers.
+
 - Issue #18044: The new email header parser was mis-parsing encoded words where
   an encoded character immediately followed the '?' that follows the CTE
   character, resulting in a decoding failure. They are now decoded correctly.