mirror of https://github.com/python/cpython
bpo-43323: Fix UnicodeEncodeError in the email module (GH-32137)
UnicodeEncodeError was raised when the charset name itself contained characters not encodable in UTF-8 (in particular, \udcxx surrogate characters representing non-decodable bytes in the source).
This commit is contained in:
parent
3483299a24
commit
e91dee87ed
|
@ -179,15 +179,15 @@ def decode(ew):
|
||||||
# Turn the CTE decoded bytes into unicode.
|
# Turn the CTE decoded bytes into unicode.
|
||||||
try:
|
try:
|
||||||
string = bstring.decode(charset)
|
string = bstring.decode(charset)
|
||||||
except UnicodeError:
|
except UnicodeDecodeError:
|
||||||
defects.append(errors.UndecodableBytesDefect("Encoded word "
|
defects.append(errors.UndecodableBytesDefect("Encoded word "
|
||||||
"contains bytes not decodable using {} charset".format(charset)))
|
f"contains bytes not decodable using {charset!r} charset"))
|
||||||
string = bstring.decode(charset, 'surrogateescape')
|
string = bstring.decode(charset, 'surrogateescape')
|
||||||
except LookupError:
|
except (LookupError, UnicodeEncodeError):
|
||||||
string = bstring.decode('ascii', 'surrogateescape')
|
string = bstring.decode('ascii', 'surrogateescape')
|
||||||
if charset.lower() != 'unknown-8bit':
|
if charset.lower() != 'unknown-8bit':
|
||||||
defects.append(errors.CharsetError("Unknown charset {} "
|
defects.append(errors.CharsetError(f"Unknown charset {charset!r} "
|
||||||
"in encoded word; decoded as unknown bytes".format(charset)))
|
f"in encoded word; decoded as unknown bytes"))
|
||||||
return string, charset, lang, defects
|
return string, charset, lang, defects
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -781,7 +781,7 @@ class MimeParameters(TokenList):
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
value = value.decode(charset, 'surrogateescape')
|
value = value.decode(charset, 'surrogateescape')
|
||||||
except LookupError:
|
except (LookupError, UnicodeEncodeError):
|
||||||
# XXX: there should really be a custom defect for
|
# XXX: there should really be a custom defect for
|
||||||
# unknown character set to make it easy to find,
|
# unknown character set to make it easy to find,
|
||||||
# because otherwise unknown charset is a silent
|
# because otherwise unknown charset is a silent
|
||||||
|
|
|
@ -130,6 +130,13 @@ class TestDecode(TestEmailBase):
|
||||||
# XXX Should this be a new Defect instead?
|
# XXX Should this be a new Defect instead?
|
||||||
defects = [errors.CharsetError])
|
defects = [errors.CharsetError])
|
||||||
|
|
||||||
|
def test_invalid_character_in_charset(self):
|
||||||
|
self._test('=?utf-8\udce2\udc80\udc9d?q?foo=ACbar?=',
|
||||||
|
b'foo\xacbar'.decode('ascii', 'surrogateescape'),
|
||||||
|
charset = 'utf-8\udce2\udc80\udc9d',
|
||||||
|
# XXX Should this be a new Defect instead?
|
||||||
|
defects = [errors.CharsetError])
|
||||||
|
|
||||||
def test_q_nonascii(self):
|
def test_q_nonascii(self):
|
||||||
self._test('=?utf-8?q?=C3=89ric?=',
|
self._test('=?utf-8?q?=C3=89ric?=',
|
||||||
'Éric',
|
'Éric',
|
||||||
|
|
|
@ -5356,6 +5356,15 @@ Content-Disposition: inline;
|
||||||
Content-Transfer-Encoding: 8bit
|
Content-Transfer-Encoding: 8bit
|
||||||
Content-Disposition: inline; filename*=X-UNKNOWN''myfile.txt
|
Content-Disposition: inline; filename*=X-UNKNOWN''myfile.txt
|
||||||
|
|
||||||
|
"""
|
||||||
|
msg = email.message_from_string(m)
|
||||||
|
self.assertEqual(msg.get_filename(), 'myfile.txt')
|
||||||
|
|
||||||
|
def test_rfc2231_bad_character_in_encoding(self):
|
||||||
|
m = """\
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
Content-Disposition: inline; filename*=utf-8\udce2\udc80\udc9d''myfile.txt
|
||||||
|
|
||||||
"""
|
"""
|
||||||
msg = email.message_from_string(m)
|
msg = email.message_from_string(m)
|
||||||
self.assertEqual(msg.get_filename(), 'myfile.txt')
|
self.assertEqual(msg.get_filename(), 'myfile.txt')
|
||||||
|
|
|
@ -714,6 +714,18 @@ class TestContentTypeHeader(TestHeaderBase):
|
||||||
" charset*=unknown-8bit''utf-8%E2%80%9D\n",
|
" charset*=unknown-8bit''utf-8%E2%80%9D\n",
|
||||||
),
|
),
|
||||||
|
|
||||||
|
'rfc2231_nonascii_in_charset_of_charset_parameter_value': (
|
||||||
|
"text/plain; charset*=utf-8”''utf-8%E2%80%9D",
|
||||||
|
'text/plain',
|
||||||
|
'text',
|
||||||
|
'plain',
|
||||||
|
{'charset': 'utf-8”'},
|
||||||
|
[],
|
||||||
|
'text/plain; charset="utf-8”"',
|
||||||
|
"Content-Type: text/plain;"
|
||||||
|
" charset*=utf-8''utf-8%E2%80%9D\n",
|
||||||
|
),
|
||||||
|
|
||||||
'rfc2231_encoded_then_unencoded_segments': (
|
'rfc2231_encoded_then_unencoded_segments': (
|
||||||
('application/x-foo;'
|
('application/x-foo;'
|
||||||
'\tname*0*="us-ascii\'en-us\'My";'
|
'\tname*0*="us-ascii\'en-us\'My";'
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
Fix errors in the :mod:`email` module if the charset itself contains
|
||||||
|
undecodable/unencodable characters.
|
Loading…
Reference in New Issue