mirror of https://github.com/python/cpython
gh-94606: Fix error when a message contains a Unicode surrogate that is not a surrogateescaped string (GH-94641)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
parent
3251ba8f1a
commit
27a5fd8cb8
|
@ -289,25 +289,26 @@ class Message:
|
||||||
# cte might be a Header, so for now stringify it.
|
# cte might be a Header, so for now stringify it.
|
||||||
cte = str(self.get('content-transfer-encoding', '')).lower()
|
cte = str(self.get('content-transfer-encoding', '')).lower()
|
||||||
# payload may be bytes here.
|
# payload may be bytes here.
|
||||||
if isinstance(payload, str):
|
if not decode:
|
||||||
if utils._has_surrogates(payload):
|
if isinstance(payload, str) and utils._has_surrogates(payload):
|
||||||
bpayload = payload.encode('ascii', 'surrogateescape')
|
try:
|
||||||
if not decode:
|
bpayload = payload.encode('ascii', 'surrogateescape')
|
||||||
try:
|
try:
|
||||||
payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace')
|
payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace')
|
||||||
except LookupError:
|
except LookupError:
|
||||||
payload = bpayload.decode('ascii', 'replace')
|
payload = bpayload.decode('ascii', 'replace')
|
||||||
elif decode:
|
except UnicodeEncodeError:
|
||||||
try:
|
pass
|
||||||
bpayload = payload.encode('ascii')
|
|
||||||
except UnicodeError:
|
|
||||||
# This won't happen for RFC compliant messages (messages
|
|
||||||
# containing only ASCII code points in the unicode input).
|
|
||||||
# If it does happen, turn the string into bytes in a way
|
|
||||||
# guaranteed not to fail.
|
|
||||||
bpayload = payload.encode('raw-unicode-escape')
|
|
||||||
if not decode:
|
|
||||||
return payload
|
return payload
|
||||||
|
if isinstance(payload, str):
|
||||||
|
try:
|
||||||
|
bpayload = payload.encode('ascii', 'surrogateescape')
|
||||||
|
except UnicodeEncodeError:
|
||||||
|
# This won't happen for RFC compliant messages (messages
|
||||||
|
# containing only ASCII code points in the unicode input).
|
||||||
|
# If it does happen, turn the string into bytes in a way
|
||||||
|
# guaranteed not to fail.
|
||||||
|
bpayload = payload.encode('raw-unicode-escape')
|
||||||
if cte == 'quoted-printable':
|
if cte == 'quoted-printable':
|
||||||
return quopri.decodestring(bpayload)
|
return quopri.decodestring(bpayload)
|
||||||
elif cte == 'base64':
|
elif cte == 'base64':
|
||||||
|
|
|
@ -44,10 +44,10 @@ specialsre = re.compile(r'[][\\()<>@,:;".]')
|
||||||
escapesre = re.compile(r'[\\"]')
|
escapesre = re.compile(r'[\\"]')
|
||||||
|
|
||||||
def _has_surrogates(s):
|
def _has_surrogates(s):
|
||||||
"""Return True if s contains surrogate-escaped binary data."""
|
"""Return True if s may contain surrogate-escaped binary data."""
|
||||||
# This check is based on the fact that unless there are surrogates, utf8
|
# This check is based on the fact that unless there are surrogates, utf8
|
||||||
# (Python's default encoding) can encode any string. This is the fastest
|
# (Python's default encoding) can encode any string. This is the fastest
|
||||||
# way to check for surrogates, see issue 11454 for timings.
|
# way to check for surrogates, see bpo-11454 (moved to gh-55663) for timings.
|
||||||
try:
|
try:
|
||||||
s.encode()
|
s.encode()
|
||||||
return False
|
return False
|
||||||
|
|
|
@ -748,6 +748,35 @@ class TestEmailMessageBase:
|
||||||
self.assertEqual(len(list(m.iter_attachments())), 2)
|
self.assertEqual(len(list(m.iter_attachments())), 2)
|
||||||
self.assertEqual(m.get_payload(), orig)
|
self.assertEqual(m.get_payload(), orig)
|
||||||
|
|
||||||
|
get_payload_surrogate_params = {
|
||||||
|
|
||||||
|
'good_surrogateescape': (
|
||||||
|
"String that can be encod\udcc3\udcabd with surrogateescape",
|
||||||
|
b'String that can be encod\xc3\xabd with surrogateescape'
|
||||||
|
),
|
||||||
|
|
||||||
|
'string_with_utf8': (
|
||||||
|
"String with utf-8 charactër",
|
||||||
|
b'String with utf-8 charact\xebr'
|
||||||
|
),
|
||||||
|
|
||||||
|
'surrogate_and_utf8': (
|
||||||
|
"String that cannot be ëncod\udcc3\udcabd with surrogateescape",
|
||||||
|
b'String that cannot be \xebncod\\udcc3\\udcabd with surrogateescape'
|
||||||
|
),
|
||||||
|
|
||||||
|
'out_of_range_surrogate': (
|
||||||
|
"String with \udfff cannot be encoded with surrogateescape",
|
||||||
|
b'String with \\udfff cannot be encoded with surrogateescape'
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
def get_payload_surrogate_as_gh_94606(self, msg, expected):
|
||||||
|
"""test for GH issue 94606"""
|
||||||
|
m = self._str_msg(msg)
|
||||||
|
payload = m.get_payload(decode=True)
|
||||||
|
self.assertEqual(expected, payload)
|
||||||
|
|
||||||
|
|
||||||
class TestEmailMessage(TestEmailMessageBase, TestEmailBase):
|
class TestEmailMessage(TestEmailMessageBase, TestEmailBase):
|
||||||
message = EmailMessage
|
message = EmailMessage
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
Fix UnicodeEncodeError when :meth:`email.message.Message.get_payload` reads a message
|
||||||
|
with a Unicode surrogate character and the message content is not well-formed for
|
||||||
|
surrogateescape encoding. Patch by Sidney Markowitz.
|
Loading…
Reference in New Issue