#1079: Fix parsing of encoded words.

This is a behavior change: previously, leading and trailing spaces were
stripped from ASCII parts; now they are preserved.  Without this fix we did
not parse the examples in RFC 2047 correctly, so I think breaking backward
compatibility here is justified.

Patch by Ralf Schlatterbeck.
This commit is contained in:
R David Murray 2012-06-02 17:56:49 -04:00
parent e11eb0f21b
commit 07ea53cb21
5 changed files with 113 additions and 19 deletions

View File

@ -40,7 +40,6 @@ ecre = re.compile(r'''
\? # literal ? \? # literal ?
(?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string (?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
\?= # literal ?= \?= # literal ?=
(?=[ \t]|$) # whitespace or the end of the string
''', re.VERBOSE | re.IGNORECASE | re.MULTILINE) ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)
# Field name regexp, including trailing colon, but not separating whitespace, # Field name regexp, including trailing colon, but not separating whitespace,
@ -86,8 +85,12 @@ def decode_header(header):
words = [] words = []
for line in header.splitlines(): for line in header.splitlines():
parts = ecre.split(line) parts = ecre.split(line)
first = True
while parts: while parts:
unencoded = parts.pop(0).strip() unencoded = parts.pop(0)
if first:
unencoded = unencoded.lstrip()
first = False
if unencoded: if unencoded:
words.append((unencoded, None, None)) words.append((unencoded, None, None))
if parts: if parts:
@ -95,6 +98,16 @@ def decode_header(header):
encoding = parts.pop(0).lower() encoding = parts.pop(0).lower()
encoded = parts.pop(0) encoded = parts.pop(0)
words.append((encoded, encoding, charset)) words.append((encoded, encoding, charset))
# Now loop over words and remove words that consist of whitespace
# between two encoded strings.
import sys
droplist = []
for n, w in enumerate(words):
if n>1 and w[1] and words[n-2][1] and words[n-1][0].isspace():
droplist.append(n-1)
for d in reversed(droplist):
del words[d]
# The next step is to decode each encoded word by applying the reverse # The next step is to decode each encoded word by applying the reverse
# base64 or quopri transformation. decoded_words is now a list of the # base64 or quopri transformation. decoded_words is now a list of the
# form (decoded_word, charset). # form (decoded_word, charset).
@ -217,22 +230,27 @@ class Header:
self._normalize() self._normalize()
uchunks = [] uchunks = []
lastcs = None lastcs = None
lastspace = None
for string, charset in self._chunks: for string, charset in self._chunks:
# We must preserve spaces between encoded and non-encoded word # We must preserve spaces between encoded and non-encoded word
# boundaries, which means for us we need to add a space when we go # boundaries, which means for us we need to add a space when we go
# from a charset to None/us-ascii, or from None/us-ascii to a # from a charset to None/us-ascii, or from None/us-ascii to a
# charset. Only do this for the second and subsequent chunks. # charset. Only do this for the second and subsequent chunks.
# Don't add a space if the None/us-ascii string already has
# a space (trailing or leading depending on transition)
nextcs = charset nextcs = charset
if nextcs == _charset.UNKNOWN8BIT: if nextcs == _charset.UNKNOWN8BIT:
original_bytes = string.encode('ascii', 'surrogateescape') original_bytes = string.encode('ascii', 'surrogateescape')
string = original_bytes.decode('ascii', 'replace') string = original_bytes.decode('ascii', 'replace')
if uchunks: if uchunks:
hasspace = string and self._nonctext(string[0])
if lastcs not in (None, 'us-ascii'): if lastcs not in (None, 'us-ascii'):
if nextcs in (None, 'us-ascii'): if nextcs in (None, 'us-ascii') and not hasspace:
uchunks.append(SPACE) uchunks.append(SPACE)
nextcs = None nextcs = None
elif nextcs not in (None, 'us-ascii'): elif nextcs not in (None, 'us-ascii') and not lastspace:
uchunks.append(SPACE) uchunks.append(SPACE)
lastspace = string and self._nonctext(string[-1])
lastcs = nextcs lastcs = nextcs
uchunks.append(string) uchunks.append(string)
return EMPTYSTRING.join(uchunks) return EMPTYSTRING.join(uchunks)
@ -291,6 +309,11 @@ class Header:
charset = UTF8 charset = UTF8
self._chunks.append((s, charset)) self._chunks.append((s, charset))
def _nonctext(self, s):
    """Return True if s is whitespace or an RFC 822 comment special.

    The result is True when *s* consists only of whitespace, or when it
    is one of '(', ')' or '\\' -- the characters RFC 822 excludes from
    ctext.  Note that whitespace itself is valid ctext; it is included
    here because the callers use this check to decide whether a
    separating space is already present next to an encoded chunk.

    An empty string yields False.
    """
    return s.isspace() or s in ('(', ')', '\\')
def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'): def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
r"""Encode a message header into an RFC-compliant format. r"""Encode a message header into an RFC-compliant format.
@ -334,7 +357,20 @@ class Header:
maxlinelen = 1000000 maxlinelen = 1000000
formatter = _ValueFormatter(self._headerlen, maxlinelen, formatter = _ValueFormatter(self._headerlen, maxlinelen,
self._continuation_ws, splitchars) self._continuation_ws, splitchars)
lastcs = None
hasspace = lastspace = None
for string, charset in self._chunks: for string, charset in self._chunks:
if hasspace is not None:
hasspace = string and self._nonctext(string[0])
import sys
if lastcs not in (None, 'us-ascii'):
if not hasspace or charset not in (None, 'us-ascii'):
formatter.add_transition()
elif charset not in (None, 'us-ascii') and not lastspace:
formatter.add_transition()
lastspace = string and self._nonctext(string[-1])
lastcs = charset
hasspace = False
lines = string.splitlines() lines = string.splitlines()
if lines: if lines:
formatter.feed('', lines[0], charset) formatter.feed('', lines[0], charset)
@ -351,6 +387,7 @@ class Header:
formatter.feed(fws, sline, charset) formatter.feed(fws, sline, charset)
if len(lines) > 1: if len(lines) > 1:
formatter.newline() formatter.newline()
if self._chunks:
formatter.add_transition() formatter.add_transition()
value = formatter._str(linesep) value = formatter._str(linesep)
if _embeded_header.search(value): if _embeded_header.search(value):

View File

@ -166,7 +166,7 @@ def decode_header(header_str):
parts.append(v.decode(enc or 'ascii')) parts.append(v.decode(enc or 'ascii'))
else: else:
parts.append(v) parts.append(v)
return ' '.join(parts) return ''.join(parts)
def _parse_overview_fmt(lines): def _parse_overview_fmt(lines):
"""Parse a list of string representing the response to LIST OVERVIEW.FMT """Parse a list of string representing the response to LIST OVERVIEW.FMT

View File

@ -41,7 +41,7 @@ class TestEmailAsianCodecs(TestEmailBase):
Hello World! =?iso-2022-jp?b?GyRCJU8lbSE8JW8hPCVrJUkhKhsoQg==?= Hello World! =?iso-2022-jp?b?GyRCJU8lbSE8JW8hPCVrJUkhKhsoQg==?=
=?iso-8859-1?q?Gr=FC=DF_Gott!?=""") =?iso-8859-1?q?Gr=FC=DF_Gott!?=""")
eq(decode_header(h.encode()), eq(decode_header(h.encode()),
[(b'Hello World!', None), [(b'Hello World! ', None),
(b'\x1b$B%O%m!<%o!<%k%I!*\x1b(B', 'iso-2022-jp'), (b'\x1b$B%O%m!<%o!<%k%I!*\x1b(B', 'iso-2022-jp'),
(b'Gr\xfc\xdf Gott!', gcode)]) (b'Gr\xfc\xdf Gott!', gcode)])
subject_bytes = (b'test-ja \xa4\xd8\xc5\xea\xb9\xc6\xa4\xb5' subject_bytes = (b'test-ja \xa4\xd8\xc5\xea\xb9\xc6\xa4\xb5'

View File

@ -1994,9 +1994,9 @@ class TestRFC2047(TestEmailBase):
foo bar =?mac-iceland?q?r=8Aksm=9Arg=8Cs?=""" foo bar =?mac-iceland?q?r=8Aksm=9Arg=8Cs?="""
dh = decode_header(s) dh = decode_header(s)
eq(dh, [ eq(dh, [
(b'Re:', None), (b'Re: ', None),
(b'r\x8aksm\x9arg\x8cs', 'mac-iceland'), (b'r\x8aksm\x9arg\x8cs', 'mac-iceland'),
(b'baz foo bar', None), (b' baz foo bar ', None),
(b'r\x8aksm\x9arg\x8cs', 'mac-iceland')]) (b'r\x8aksm\x9arg\x8cs', 'mac-iceland')])
header = make_header(dh) header = make_header(dh)
eq(str(header), eq(str(header),
@ -2005,35 +2005,37 @@ class TestRFC2047(TestEmailBase):
Re: =?mac-iceland?q?r=8Aksm=9Arg=8Cs?= baz foo bar =?mac-iceland?q?r=8Aksm?= Re: =?mac-iceland?q?r=8Aksm=9Arg=8Cs?= baz foo bar =?mac-iceland?q?r=8Aksm?=
=?mac-iceland?q?=9Arg=8Cs?=""") =?mac-iceland?q?=9Arg=8Cs?=""")
def test_whitespace_eater_unicode(self): def test_whitespace_keeper_unicode(self):
eq = self.assertEqual eq = self.assertEqual
s = '=?ISO-8859-1?Q?Andr=E9?= Pirard <pirard@dom.ain>' s = '=?ISO-8859-1?Q?Andr=E9?= Pirard <pirard@dom.ain>'
dh = decode_header(s) dh = decode_header(s)
eq(dh, [(b'Andr\xe9', 'iso-8859-1'), eq(dh, [(b'Andr\xe9', 'iso-8859-1'),
(b'Pirard <pirard@dom.ain>', None)]) (b' Pirard <pirard@dom.ain>', None)])
header = str(make_header(dh)) header = str(make_header(dh))
eq(header, 'Andr\xe9 Pirard <pirard@dom.ain>') eq(header, 'Andr\xe9 Pirard <pirard@dom.ain>')
def test_whitespace_eater_unicode_2(self): def test_whitespace_keeper_unicode_2(self):
eq = self.assertEqual eq = self.assertEqual
s = 'The =?iso-8859-1?b?cXVpY2sgYnJvd24gZm94?= jumped over the =?iso-8859-1?b?bGF6eSBkb2c=?=' s = 'The =?iso-8859-1?b?cXVpY2sgYnJvd24gZm94?= jumped over the =?iso-8859-1?b?bGF6eSBkb2c=?='
dh = decode_header(s) dh = decode_header(s)
eq(dh, [(b'The', None), (b'quick brown fox', 'iso-8859-1'), eq(dh, [(b'The ', None), (b'quick brown fox', 'iso-8859-1'),
(b'jumped over the', None), (b'lazy dog', 'iso-8859-1')]) (b' jumped over the ', None), (b'lazy dog', 'iso-8859-1')])
hu = str(make_header(dh)) hu = str(make_header(dh))
eq(hu, 'The quick brown fox jumped over the lazy dog') eq(hu, 'The quick brown fox jumped over the lazy dog')
def test_rfc2047_missing_whitespace(self): def test_rfc2047_missing_whitespace(self):
s = 'Sm=?ISO-8859-1?B?9g==?=rg=?ISO-8859-1?B?5Q==?=sbord' s = 'Sm=?ISO-8859-1?B?9g==?=rg=?ISO-8859-1?B?5Q==?=sbord'
dh = decode_header(s) dh = decode_header(s)
self.assertEqual(dh, [(s, None)]) self.assertEqual(dh, [(b'Sm', None), (b'\xf6', 'iso-8859-1'),
(b'rg', None), (b'\xe5', 'iso-8859-1'),
(b'sbord', None)])
def test_rfc2047_with_whitespace(self): def test_rfc2047_with_whitespace(self):
s = 'Sm =?ISO-8859-1?B?9g==?= rg =?ISO-8859-1?B?5Q==?= sbord' s = 'Sm =?ISO-8859-1?B?9g==?= rg =?ISO-8859-1?B?5Q==?= sbord'
dh = decode_header(s) dh = decode_header(s)
self.assertEqual(dh, [(b'Sm', None), (b'\xf6', 'iso-8859-1'), self.assertEqual(dh, [(b'Sm ', None), (b'\xf6', 'iso-8859-1'),
(b'rg', None), (b'\xe5', 'iso-8859-1'), (b' rg ', None), (b'\xe5', 'iso-8859-1'),
(b'sbord', None)]) (b' sbord', None)])
def test_rfc2047_B_bad_padding(self): def test_rfc2047_B_bad_padding(self):
s = '=?iso-8859-1?B?%s?=' s = '=?iso-8859-1?B?%s?='
@ -2051,6 +2053,57 @@ Re: =?mac-iceland?q?r=8Aksm=9Arg=8Cs?= baz foo bar =?mac-iceland?q?r=8Aksm?=
self.assertEqual(decode_header(s), self.assertEqual(decode_header(s),
[(b'andr\xe9=zz', 'iso-8659-1')]) [(b'andr\xe9=zz', 'iso-8659-1')])
def test_rfc2047_rfc2047_1(self):
    """RFC 2047 example 1: one encoded word directly inside parentheses."""
    header = '(=?ISO-8859-1?Q?a?=)'
    expected = [(b'(', None), (b'a', 'iso-8859-1'), (b')', None)]
    self.assertEqual(decode_header(header), expected)
def test_rfc2047_rfc2047_2(self):
    """RFC 2047 example 2: encoded word followed by plain text.

    The space before the unencoded 'b' must be kept in the ASCII part.
    """
    header = '(=?ISO-8859-1?Q?a?= b)'
    expected = [(b'(', None), (b'a', 'iso-8859-1'), (b' b)', None)]
    self.assertEqual(decode_header(header), expected)
def test_rfc2047_rfc2047_3(self):
    """RFC 2047 example 3: two encoded words separated by one space.

    The separating space is dropped and the two words are joined.
    """
    header = '(=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)'
    expected = [(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)]
    self.assertEqual(decode_header(header), expected)
def test_rfc2047_rfc2047_4(self):
    """RFC 2047 example 4: two encoded words separated by two spaces.

    This differs from example 3 only in the amount of whitespace between
    the encoded words; any run of whitespace between two encoded words
    must be ignored, so the decoded result is the same.
    """
    # 4th testcase at end of rfc2047 (note the *two* spaces)
    s = '(=?ISO-8859-1?Q?a?=  =?ISO-8859-1?Q?b?=)'
    self.assertEqual(decode_header(s),
        [(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
def test_rfc2047_rfc2047_5a(self):
    """RFC 2047 example 5 with a CRLF fold between the encoded words."""
    header = '(=?ISO-8859-1?Q?a?=\r\n =?ISO-8859-1?Q?b?=)'
    expected = [(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)]
    self.assertEqual(decode_header(header), expected)
def test_rfc2047_rfc2047_5b(self):
    """RFC 2047 example 5 with a bare LF fold between the encoded words."""
    header = '(=?ISO-8859-1?Q?a?=\n =?ISO-8859-1?Q?b?=)'
    expected = [(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)]
    self.assertEqual(decode_header(header), expected)
def test_rfc2047_rfc2047_6(self):
    """RFC 2047 example 6: '_' inside a Q-encoded word decodes to a space."""
    header = '(=?ISO-8859-1?Q?a_b?=)'
    expected = [(b'(', None), (b'a b', 'iso-8859-1'), (b')', None)]
    self.assertEqual(decode_header(header), expected)
def test_rfc2047_rfc2047_7(self):
    """RFC 2047 example 7: adjacent encoded words with different charsets.

    The two words stay separate chunks, and the leading '_' of the
    second word decodes to a space.
    """
    s = '(=?ISO-8859-1?Q?a?= =?ISO-8859-2?Q?_b?=)'
    expected_chunks = [
        (b'(', None),
        (b'a', 'iso-8859-1'),
        (b' b', 'iso-8859-2'),
        (b')', None),
    ]
    self.assertEqual(decode_header(s), expected_chunks)
    # Re-encoding is expected to reproduce the input in lower case.
    self.assertEqual(make_header(decode_header(s)).encode(), s.lower())
    # The unicode rendering is the fully decoded text.
    self.assertEqual(str(make_header(decode_header(s))), '(a b)')
# Test the MIMEMessage class # Test the MIMEMessage class
class TestMIMEMessage(TestEmailBase): class TestMIMEMessage(TestEmailBase):
@ -4388,11 +4441,11 @@ A very long line that must get split to something other than at the
h = make_header(decode_header(s)) h = make_header(decode_header(s))
eq(h.encode(), s) eq(h.encode(), s)
def test_whitespace_eater(self): def test_whitespace_keeper(self):
eq = self.assertEqual eq = self.assertEqual
s = 'Subject: =?koi8-r?b?8NLP18XSy8EgzsEgxsnOwczYztk=?= =?koi8-r?q?=CA?= zz.' s = 'Subject: =?koi8-r?b?8NLP18XSy8EgzsEgxsnOwczYztk=?= =?koi8-r?q?=CA?= zz.'
parts = decode_header(s) parts = decode_header(s)
eq(parts, [(b'Subject:', None), (b'\xf0\xd2\xcf\xd7\xc5\xd2\xcb\xc1 \xce\xc1 \xc6\xc9\xce\xc1\xcc\xd8\xce\xd9\xca', 'koi8-r'), (b'zz.', None)]) eq(parts, [(b'Subject: ', None), (b'\xf0\xd2\xcf\xd7\xc5\xd2\xcb\xc1 \xce\xc1 \xc6\xc9\xce\xc1\xcc\xd8\xce\xd9\xca', 'koi8-r'), (b' zz.', None)])
hdr = make_header(parts) hdr = make_header(parts)
eq(hdr.encode(), eq(hdr.encode(),
'Subject: =?koi8-r?b?8NLP18XSy8EgzsEgxsnOwczYztnK?= zz.') 'Subject: =?koi8-r?b?8NLP18XSy8EgzsEgxsnOwczYztnK?= zz.')

View File

@ -10,6 +10,10 @@ What's New in Python 3.3.0 Beta 1?
Library Library
------- -------
- Issue #1079: email.header.decode_header now correctly parses all the examples
in RFC 2047. This required a visible behavior change: leading and/or
trailing whitespace on ASCII parts is now preserved.
- Issue #14969: Better handling of exception chaining in contextlib.ExitStack - Issue #14969: Better handling of exception chaining in contextlib.ExitStack
- Issue #14962: Update text coloring in IDLE shell window after changing - Issue #14962: Update text coloring in IDLE shell window after changing