#1079: Fix parsing of encoded words.
This is a behavior change: previously, leading and trailing spaces were stripped from ASCII parts; now they are preserved. Without this fix we did not parse the examples in RFC 2047 correctly, so I think breaking backward compatibility here is justified. Patch by Ralf Schlatterbeck.
This commit is contained in:
parent
e11eb0f21b
commit
07ea53cb21
|
@ -40,7 +40,6 @@ ecre = re.compile(r'''
|
||||||
\? # literal ?
|
\? # literal ?
|
||||||
(?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
|
(?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
|
||||||
\?= # literal ?=
|
\?= # literal ?=
|
||||||
(?=[ \t]|$) # whitespace or the end of the string
|
|
||||||
''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)
|
''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)
|
||||||
|
|
||||||
# Field name regexp, including trailing colon, but not separating whitespace,
|
# Field name regexp, including trailing colon, but not separating whitespace,
|
||||||
|
@ -86,8 +85,12 @@ def decode_header(header):
|
||||||
words = []
|
words = []
|
||||||
for line in header.splitlines():
|
for line in header.splitlines():
|
||||||
parts = ecre.split(line)
|
parts = ecre.split(line)
|
||||||
|
first = True
|
||||||
while parts:
|
while parts:
|
||||||
unencoded = parts.pop(0).strip()
|
unencoded = parts.pop(0)
|
||||||
|
if first:
|
||||||
|
unencoded = unencoded.lstrip()
|
||||||
|
first = False
|
||||||
if unencoded:
|
if unencoded:
|
||||||
words.append((unencoded, None, None))
|
words.append((unencoded, None, None))
|
||||||
if parts:
|
if parts:
|
||||||
|
@ -95,6 +98,16 @@ def decode_header(header):
|
||||||
encoding = parts.pop(0).lower()
|
encoding = parts.pop(0).lower()
|
||||||
encoded = parts.pop(0)
|
encoded = parts.pop(0)
|
||||||
words.append((encoded, encoding, charset))
|
words.append((encoded, encoding, charset))
|
||||||
|
# Now loop over words and remove words that consist of whitespace
|
||||||
|
# between two encoded strings.
|
||||||
|
import sys
|
||||||
|
droplist = []
|
||||||
|
for n, w in enumerate(words):
|
||||||
|
if n>1 and w[1] and words[n-2][1] and words[n-1][0].isspace():
|
||||||
|
droplist.append(n-1)
|
||||||
|
for d in reversed(droplist):
|
||||||
|
del words[d]
|
||||||
|
|
||||||
# The next step is to decode each encoded word by applying the reverse
|
# The next step is to decode each encoded word by applying the reverse
|
||||||
# base64 or quopri transformation. decoded_words is now a list of the
|
# base64 or quopri transformation. decoded_words is now a list of the
|
||||||
# form (decoded_word, charset).
|
# form (decoded_word, charset).
|
||||||
|
@ -217,22 +230,27 @@ class Header:
|
||||||
self._normalize()
|
self._normalize()
|
||||||
uchunks = []
|
uchunks = []
|
||||||
lastcs = None
|
lastcs = None
|
||||||
|
lastspace = None
|
||||||
for string, charset in self._chunks:
|
for string, charset in self._chunks:
|
||||||
# We must preserve spaces between encoded and non-encoded word
|
# We must preserve spaces between encoded and non-encoded word
|
||||||
# boundaries, which means for us we need to add a space when we go
|
# boundaries, which means for us we need to add a space when we go
|
||||||
# from a charset to None/us-ascii, or from None/us-ascii to a
|
# from a charset to None/us-ascii, or from None/us-ascii to a
|
||||||
# charset. Only do this for the second and subsequent chunks.
|
# charset. Only do this for the second and subsequent chunks.
|
||||||
|
# Don't add a space if the None/us-ascii string already has
|
||||||
|
# a space (trailing or leading depending on transition)
|
||||||
nextcs = charset
|
nextcs = charset
|
||||||
if nextcs == _charset.UNKNOWN8BIT:
|
if nextcs == _charset.UNKNOWN8BIT:
|
||||||
original_bytes = string.encode('ascii', 'surrogateescape')
|
original_bytes = string.encode('ascii', 'surrogateescape')
|
||||||
string = original_bytes.decode('ascii', 'replace')
|
string = original_bytes.decode('ascii', 'replace')
|
||||||
if uchunks:
|
if uchunks:
|
||||||
|
hasspace = string and self._nonctext(string[0])
|
||||||
if lastcs not in (None, 'us-ascii'):
|
if lastcs not in (None, 'us-ascii'):
|
||||||
if nextcs in (None, 'us-ascii'):
|
if nextcs in (None, 'us-ascii') and not hasspace:
|
||||||
uchunks.append(SPACE)
|
uchunks.append(SPACE)
|
||||||
nextcs = None
|
nextcs = None
|
||||||
elif nextcs not in (None, 'us-ascii'):
|
elif nextcs not in (None, 'us-ascii') and not lastspace:
|
||||||
uchunks.append(SPACE)
|
uchunks.append(SPACE)
|
||||||
|
lastspace = string and self._nonctext(string[-1])
|
||||||
lastcs = nextcs
|
lastcs = nextcs
|
||||||
uchunks.append(string)
|
uchunks.append(string)
|
||||||
return EMPTYSTRING.join(uchunks)
|
return EMPTYSTRING.join(uchunks)
|
||||||
|
@ -291,6 +309,11 @@ class Header:
|
||||||
charset = UTF8
|
charset = UTF8
|
||||||
self._chunks.append((s, charset))
|
self._chunks.append((s, charset))
|
||||||
|
|
||||||
|
def _nonctext(self, s):
|
||||||
|
"""True if string s is not a ctext character of RFC822.
|
||||||
|
"""
|
||||||
|
return s.isspace() or s in ('(', ')', '\\')
|
||||||
|
|
||||||
def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
|
def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
|
||||||
r"""Encode a message header into an RFC-compliant format.
|
r"""Encode a message header into an RFC-compliant format.
|
||||||
|
|
||||||
|
@ -334,7 +357,20 @@ class Header:
|
||||||
maxlinelen = 1000000
|
maxlinelen = 1000000
|
||||||
formatter = _ValueFormatter(self._headerlen, maxlinelen,
|
formatter = _ValueFormatter(self._headerlen, maxlinelen,
|
||||||
self._continuation_ws, splitchars)
|
self._continuation_ws, splitchars)
|
||||||
|
lastcs = None
|
||||||
|
hasspace = lastspace = None
|
||||||
for string, charset in self._chunks:
|
for string, charset in self._chunks:
|
||||||
|
if hasspace is not None:
|
||||||
|
hasspace = string and self._nonctext(string[0])
|
||||||
|
import sys
|
||||||
|
if lastcs not in (None, 'us-ascii'):
|
||||||
|
if not hasspace or charset not in (None, 'us-ascii'):
|
||||||
|
formatter.add_transition()
|
||||||
|
elif charset not in (None, 'us-ascii') and not lastspace:
|
||||||
|
formatter.add_transition()
|
||||||
|
lastspace = string and self._nonctext(string[-1])
|
||||||
|
lastcs = charset
|
||||||
|
hasspace = False
|
||||||
lines = string.splitlines()
|
lines = string.splitlines()
|
||||||
if lines:
|
if lines:
|
||||||
formatter.feed('', lines[0], charset)
|
formatter.feed('', lines[0], charset)
|
||||||
|
@ -351,6 +387,7 @@ class Header:
|
||||||
formatter.feed(fws, sline, charset)
|
formatter.feed(fws, sline, charset)
|
||||||
if len(lines) > 1:
|
if len(lines) > 1:
|
||||||
formatter.newline()
|
formatter.newline()
|
||||||
|
if self._chunks:
|
||||||
formatter.add_transition()
|
formatter.add_transition()
|
||||||
value = formatter._str(linesep)
|
value = formatter._str(linesep)
|
||||||
if _embeded_header.search(value):
|
if _embeded_header.search(value):
|
||||||
|
|
|
@ -2005,7 +2005,7 @@ class TestRFC2047(TestEmailBase):
|
||||||
Re: =?mac-iceland?q?r=8Aksm=9Arg=8Cs?= baz foo bar =?mac-iceland?q?r=8Aksm?=
|
Re: =?mac-iceland?q?r=8Aksm=9Arg=8Cs?= baz foo bar =?mac-iceland?q?r=8Aksm?=
|
||||||
=?mac-iceland?q?=9Arg=8Cs?=""")
|
=?mac-iceland?q?=9Arg=8Cs?=""")
|
||||||
|
|
||||||
def test_whitespace_eater_unicode(self):
|
def test_whitespace_keeper_unicode(self):
|
||||||
eq = self.assertEqual
|
eq = self.assertEqual
|
||||||
s = '=?ISO-8859-1?Q?Andr=E9?= Pirard <pirard@dom.ain>'
|
s = '=?ISO-8859-1?Q?Andr=E9?= Pirard <pirard@dom.ain>'
|
||||||
dh = decode_header(s)
|
dh = decode_header(s)
|
||||||
|
@ -2014,7 +2014,7 @@ Re: =?mac-iceland?q?r=8Aksm=9Arg=8Cs?= baz foo bar =?mac-iceland?q?r=8Aksm?=
|
||||||
header = str(make_header(dh))
|
header = str(make_header(dh))
|
||||||
eq(header, 'Andr\xe9 Pirard <pirard@dom.ain>')
|
eq(header, 'Andr\xe9 Pirard <pirard@dom.ain>')
|
||||||
|
|
||||||
def test_whitespace_eater_unicode_2(self):
|
def test_whitespace_keeper_unicode_2(self):
|
||||||
eq = self.assertEqual
|
eq = self.assertEqual
|
||||||
s = 'The =?iso-8859-1?b?cXVpY2sgYnJvd24gZm94?= jumped over the =?iso-8859-1?b?bGF6eSBkb2c=?='
|
s = 'The =?iso-8859-1?b?cXVpY2sgYnJvd24gZm94?= jumped over the =?iso-8859-1?b?bGF6eSBkb2c=?='
|
||||||
dh = decode_header(s)
|
dh = decode_header(s)
|
||||||
|
@ -2026,7 +2026,9 @@ Re: =?mac-iceland?q?r=8Aksm=9Arg=8Cs?= baz foo bar =?mac-iceland?q?r=8Aksm?=
|
||||||
def test_rfc2047_missing_whitespace(self):
|
def test_rfc2047_missing_whitespace(self):
|
||||||
s = 'Sm=?ISO-8859-1?B?9g==?=rg=?ISO-8859-1?B?5Q==?=sbord'
|
s = 'Sm=?ISO-8859-1?B?9g==?=rg=?ISO-8859-1?B?5Q==?=sbord'
|
||||||
dh = decode_header(s)
|
dh = decode_header(s)
|
||||||
self.assertEqual(dh, [(s, None)])
|
self.assertEqual(dh, [(b'Sm', None), (b'\xf6', 'iso-8859-1'),
|
||||||
|
(b'rg', None), (b'\xe5', 'iso-8859-1'),
|
||||||
|
(b'sbord', None)])
|
||||||
|
|
||||||
def test_rfc2047_with_whitespace(self):
|
def test_rfc2047_with_whitespace(self):
|
||||||
s = 'Sm =?ISO-8859-1?B?9g==?= rg =?ISO-8859-1?B?5Q==?= sbord'
|
s = 'Sm =?ISO-8859-1?B?9g==?= rg =?ISO-8859-1?B?5Q==?= sbord'
|
||||||
|
@ -2051,6 +2053,57 @@ Re: =?mac-iceland?q?r=8Aksm=9Arg=8Cs?= baz foo bar =?mac-iceland?q?r=8Aksm?=
|
||||||
self.assertEqual(decode_header(s),
|
self.assertEqual(decode_header(s),
|
||||||
[(b'andr\xe9=zz', 'iso-8659-1')])
|
[(b'andr\xe9=zz', 'iso-8659-1')])
|
||||||
|
|
||||||
|
def test_rfc2047_rfc2047_1(self):
|
||||||
|
# 1st testcase at end of rfc2047
|
||||||
|
s = '(=?ISO-8859-1?Q?a?=)'
|
||||||
|
self.assertEqual(decode_header(s),
|
||||||
|
[(b'(', None), (b'a', 'iso-8859-1'), (b')', None)])
|
||||||
|
|
||||||
|
def test_rfc2047_rfc2047_2(self):
|
||||||
|
# 2nd testcase at end of rfc2047
|
||||||
|
s = '(=?ISO-8859-1?Q?a?= b)'
|
||||||
|
self.assertEqual(decode_header(s),
|
||||||
|
[(b'(', None), (b'a', 'iso-8859-1'), (b' b)', None)])
|
||||||
|
|
||||||
|
def test_rfc2047_rfc2047_3(self):
|
||||||
|
# 3rd testcase at end of rfc2047
|
||||||
|
s = '(=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)'
|
||||||
|
self.assertEqual(decode_header(s),
|
||||||
|
[(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
|
||||||
|
|
||||||
|
def test_rfc2047_rfc2047_4(self):
|
||||||
|
# 4th testcase at end of rfc2047
|
||||||
|
s = '(=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)'
|
||||||
|
self.assertEqual(decode_header(s),
|
||||||
|
[(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
|
||||||
|
|
||||||
|
def test_rfc2047_rfc2047_5a(self):
|
||||||
|
# 5th testcase at end of rfc2047 newline is \r\n
|
||||||
|
s = '(=?ISO-8859-1?Q?a?=\r\n =?ISO-8859-1?Q?b?=)'
|
||||||
|
self.assertEqual(decode_header(s),
|
||||||
|
[(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
|
||||||
|
|
||||||
|
def test_rfc2047_rfc2047_5b(self):
|
||||||
|
# 5th testcase at end of rfc2047 newline is \n
|
||||||
|
s = '(=?ISO-8859-1?Q?a?=\n =?ISO-8859-1?Q?b?=)'
|
||||||
|
self.assertEqual(decode_header(s),
|
||||||
|
[(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
|
||||||
|
|
||||||
|
def test_rfc2047_rfc2047_6(self):
|
||||||
|
# 6th testcase at end of rfc2047
|
||||||
|
s = '(=?ISO-8859-1?Q?a_b?=)'
|
||||||
|
self.assertEqual(decode_header(s),
|
||||||
|
[(b'(', None), (b'a b', 'iso-8859-1'), (b')', None)])
|
||||||
|
|
||||||
|
def test_rfc2047_rfc2047_7(self):
|
||||||
|
# 7th testcase at end of rfc2047
|
||||||
|
s = '(=?ISO-8859-1?Q?a?= =?ISO-8859-2?Q?_b?=)'
|
||||||
|
self.assertEqual(decode_header(s),
|
||||||
|
[(b'(', None), (b'a', 'iso-8859-1'), (b' b', 'iso-8859-2'),
|
||||||
|
(b')', None)])
|
||||||
|
self.assertEqual(make_header(decode_header(s)).encode(), s.lower())
|
||||||
|
self.assertEqual(str(make_header(decode_header(s))), '(a b)')
|
||||||
|
|
||||||
|
|
||||||
# Test the MIMEMessage class
|
# Test the MIMEMessage class
|
||||||
class TestMIMEMessage(TestEmailBase):
|
class TestMIMEMessage(TestEmailBase):
|
||||||
|
@ -4388,7 +4441,7 @@ A very long line that must get split to something other than at the
|
||||||
h = make_header(decode_header(s))
|
h = make_header(decode_header(s))
|
||||||
eq(h.encode(), s)
|
eq(h.encode(), s)
|
||||||
|
|
||||||
def test_whitespace_eater(self):
|
def test_whitespace_keeper(self):
|
||||||
eq = self.assertEqual
|
eq = self.assertEqual
|
||||||
s = 'Subject: =?koi8-r?b?8NLP18XSy8EgzsEgxsnOwczYztk=?= =?koi8-r?q?=CA?= zz.'
|
s = 'Subject: =?koi8-r?b?8NLP18XSy8EgzsEgxsnOwczYztk=?= =?koi8-r?q?=CA?= zz.'
|
||||||
parts = decode_header(s)
|
parts = decode_header(s)
|
||||||
|
|
|
@ -10,6 +10,10 @@ What's New in Python 3.3.0 Beta 1?
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #1079: email.header.decode_header now correctly parses all the examples
|
||||||
|
in RFC2047. There is a necessary visible behavior change: the leading and/or
|
||||||
|
trailing whitespace on ASCII parts is now preserved.
|
||||||
|
|
||||||
- Issue #14969: Better handling of exception chaining in contextlib.ExitStack
|
- Issue #14969: Better handling of exception chaining in contextlib.ExitStack
|
||||||
|
|
||||||
- Issue #14962: Update text coloring in IDLE shell window after changing
|
- Issue #14962: Update text coloring in IDLE shell window after changing
|
||||||
|
|
Loading…
Reference in New Issue