From f9844c8292c34fe4361d441164d83642fc76ce68 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 5 Jan 2011 01:47:38 +0000 Subject: [PATCH] Merged revisions 87750 via svnmerge from svn+ssh://pythondev@svn.python.org/python/branches/py3k ........ r87750 | r.david.murray | 2011-01-04 20:39:32 -0500 (Tue, 04 Jan 2011) | 5 lines #10790: make append work when output codec is different from input codec There's still a bug here (the encode call shouldn't use the 'errors' parameter), but I'll fix that later. ........ --- Doc/library/email.header.rst | 14 +++++++------- Lib/email/header.py | 26 ++++++++++---------------- Lib/email/test/test_email.py | 4 ++++ Misc/NEWS | 3 +++ 4 files changed, 24 insertions(+), 23 deletions(-) diff --git a/Doc/library/email.header.rst b/Doc/library/email.header.rst index 1d530b2602b..796ac972623 100644 --- a/Doc/library/email.header.rst +++ b/Doc/library/email.header.rst @@ -94,15 +94,15 @@ Here is the :class:`Header` class description: decoded with that character set. If *s* is an instance of :class:`str`, then *charset* is a hint specifying - the character set of the characters in the string. In this case, when - producing an :rfc:`2822`\ -compliant header using :rfc:`2047` rules, the - Unicode string will be encoded using the following charsets in order: - ``us-ascii``, the *charset* hint, ``utf-8``. The first character set to - not provoke a :exc:`UnicodeError` is used. + the character set of the characters in the string. - Optional *errors* is passed through to any :func:`encode` or - :func:`ustr.encode` call, and defaults to "strict". + In either case, when producing an :rfc:`2822`\ -compliant header using + :rfc:`2047` rules, the string will be encoded using the output codec of + the charset. If the string cannot be encoded using the output codec, a + UnicodeError will be raised. + Optional *errors* is passed as the errors argument to the decode call + if *s* is a byte string. .. 
method:: encode(splitchars=';, \\t', maxlinelen=None) diff --git a/Lib/email/header.py b/Lib/email/header.py index 8194fa39772..aaca18aff47 100644 --- a/Lib/email/header.py +++ b/Lib/email/header.py @@ -245,32 +245,26 @@ class Header: that byte string, and a UnicodeError will be raised if the string cannot be decoded with that charset. If s is a Unicode string, then charset is a hint specifying the character set of the characters in - the string. In this case, when producing an RFC 2822 compliant header - using RFC 2047 rules, the Unicode string will be encoded using the - following charsets in order: us-ascii, the charset hint, utf-8. The - first character set not to provoke a UnicodeError is used. + the string. In either case, when producing an RFC 2822 compliant + header using RFC 2047 rules, the string will be encoded using the + output codec of the charset. If the string cannot be encoded to the + output codec, a UnicodeError will be raised. - Optional `errors' is passed as the third argument to any unicode() or - ustr.encode() call. + Optional `errors' is passed as the errors argument to the decode + call if s is a byte string. """ if charset is None: charset = self._charset elif not isinstance(charset, Charset): charset = Charset(charset) - if isinstance(s, str): - # Convert the string from the input character set to the output - # character set and store the resulting bytes and the charset for - # composition later. + if not isinstance(s, str): input_charset = charset.input_codec or 'us-ascii' - input_bytes = s.encode(input_charset, errors) - else: - # We already have the bytes we will store internally. - input_bytes = s + s = s.decode(input_charset, errors) # Ensure that the bytes we're storing can be decoded to the output # character set, otherwise an early error is thrown. 
output_charset = charset.output_codec or 'us-ascii' - output_string = input_bytes.decode(output_charset, errors) - self._chunks.append((output_string, charset)) + s.encode(output_charset, errors) + self._chunks.append((s, charset)) def encode(self, splitchars=';, \t', maxlinelen=None): """Encode a message header into an RFC-compliant format. diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py index c9903ecac84..05eb6a7bee1 100644 --- a/Lib/email/test/test_email.py +++ b/Lib/email/test/test_email.py @@ -3255,6 +3255,10 @@ A very long line that must get split to something other than at the s = 'Subject: =?EUC-KR?B?CSixpLDtKSC/7Liuvsax4iC6uLmwMcijIKHaILzSwd/H0SC8+LCjwLsgv7W/+Mj3I ?=' raises(errors.HeaderParseError, decode_header, s) + def test_shift_jis_charset(self): + h = Header('文', charset='shift_jis') + self.assertEqual(h.encode(), '=?iso-2022-jp?b?GyRCSjgbKEI=?=') + # Test RFC 2231 header parameters (en/de)coding diff --git a/Misc/NEWS b/Misc/NEWS index 106400513c3..1470351f21b 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -27,6 +27,9 @@ Core and Builtins Library ------- +- Issue #10790: email.header.Header.append's charset logic now works correctly + for charsets whose output codec is different from its input codec. + - Issue #6643: Reinitialize locks held within the threading module after fork to avoid a potential rare deadlock or crash on some platforms.