diff --git a/Lib/email/Header.py b/Lib/email/Header.py index 70e0bac8629..0f2eb32ea8b 100644 --- a/Lib/email/Header.py +++ b/Lib/email/Header.py @@ -1,9 +1,11 @@ # Copyright (C) 2002 Python Software Foundation -# Author: che@debian.org (Ben Gertzfield) +# Author: che@debian.org (Ben Gertzfield), barry@zope.com (Barry Warsaw) """Header encoding and decoding functionality.""" import re +from types import StringType, UnicodeType + import email.quopriMIME import email.base64MIME from email.Charset import Charset @@ -14,6 +16,12 @@ except SyntaxError: # Python 2.1 spells integer division differently from email._compat21 import _floordiv +try: + True, False +except NameError: + True = 1 + False = 0 + CRLFSPACE = '\r\n ' CRLF = '\r\n' NL = '\n' @@ -25,6 +33,9 @@ MAXLINELEN = 76 ENCODE = 1 DECODE = 2 +USASCII = Charset('us-ascii') +UTF8 = Charset('utf-8') + # Match encoded-word strings in the form =?charset?q?Hello_World?= ecre = re.compile(r''' =\? # literal =? @@ -117,21 +128,19 @@ def make_header(decoded_seq, maxlinelen=None, header_name=None, class Header: def __init__(self, s=None, charset=None, maxlinelen=None, header_name=None, continuation_ws=' '): - """Create a MIME-compliant header that can contain many languages. + """Create a MIME-compliant header that can contain many character sets. - Specify the initial header value in s. If None, the initial header - value is not set. + Optional s is the initial header value. If None, the initial header + value is not set. You can later append to the header with .append() + method calls. s may be a byte string or a Unicode string, but see the + .append() documentation for semantics. - Specify both s's character set, and the default character set by - setting the charset argument to a Charset object (not a character set - name string!). If None, a us-ascii Charset is used as both s's - initial charset and as the default character set for subsequent - .append() calls. - - You can later append to the header with append(s, charset) below; - charset does not have to be the same as the one initially specified - here. In fact, it's optional, and if not given, defaults to the - charset specified in the constructor. + Optional charset serves two purposes: it has the same meaning as the + charset argument to the .append() method. It also sets the default + character set for all subsequent .append() calls that omit the charset + argument. If charset is not provided in the constructor, the us-ascii + charset is used both as s's initial charset and as the default for + subsequent .append() calls. The maximum line length can be specified explicit via maxlinelen. For splitting the first line to a shorter value (to account for the field @@ -143,7 +152,7 @@ class Header: lines. """ if charset is None: - charset = Charset() + charset = USASCII self._charset = charset self._continuation_ws = continuation_ws cws_expanded_len = len(continuation_ws.replace('\t', SPACE8)) @@ -186,20 +195,43 @@ class Header: return not self == other def append(self, s, charset=None): - """Append string s with Charset charset to the MIME header. + """Append a string to the MIME header. - If charset is given, it should be a Charset instance, or the name of a - character set (which will be converted to a Charset instance). A - value of None (the default) means charset is the one given in the - class constructor. + Optional charset, if given, should be a Charset instance or the name + of a character set (which will be converted to a Charset instance). A + value of None (the default) means that the charset given in the + constructor is used. + + s may be a byte string or a Unicode string. If it is a byte string + (i.e. isinstance(s, StringType) is true), then charset is the encoding + of that byte string, and a UnicodeError will be raised if the string + cannot be decoded with that charset. If `s' is a Unicode string, then + charset is a hint specifying the character set of the characters in + the string. In this case, when producing an RFC 2822 compliant header + using RFC 2047 rules, the Unicode string will be encoded using the + following charsets in order: us-ascii, the charset hint, utf-8. """ if charset is None: charset = self._charset elif not isinstance(charset, Charset): charset = Charset(charset) + # Normalize and check the string + if isinstance(s, StringType): + # Possibly raise UnicodeError if it can't e encoded + unicode(s, charset.get_output_charset()) + elif isinstance(s, UnicodeType): + # Convert Unicode to byte string for later concatenation + for charset in USASCII, charset, UTF8: + try: + s = s.encode(charset.get_output_charset()) + break + except UnicodeError: + pass + else: + assert False, 'Could not encode to utf-8' self._chunks.append((s, charset)) - def _split(self, s, charset, firstline=0): + def _split(self, s, charset, firstline=False): # Split up a header safely for use with encode_chunks. BAW: this # appears to be a private convenience method. splittable = charset.to_splittable(s) @@ -227,13 +259,13 @@ class Header: # We can split on _maxlinelen boundaries because we know that the # encoding won't change the size of the string splitpnt = self._maxlinelen - first = charset.from_splittable(splittable[:splitpnt], 0) - last = charset.from_splittable(splittable[splitpnt:], 0) + first = charset.from_splittable(splittable[:splitpnt], False) + last = charset.from_splittable(splittable[splitpnt:], False) else: # Divide and conquer. halfway = _floordiv(len(splittable), 2) - first = charset.from_splittable(splittable[:halfway], 0) - last = charset.from_splittable(splittable[halfway:], 0) + first = charset.from_splittable(splittable[:halfway], False) + last = charset.from_splittable(splittable[halfway:], False) # Do the split return self._split(first, charset, firstline) + \ self._split(last, charset) @@ -248,7 +280,7 @@ class Header: line = lines.pop(0) if firstline: maxlinelen = self._firstlinelen - firstline = 0 + firstline = False else: #line = line.lstrip() maxlinelen = self._maxlinelen @@ -338,7 +370,7 @@ class Header: # There's no encoding for this chunk's charsets _max_append(chunks, header, self._maxlinelen) else: - _max_append(chunks, charset.header_encode(header, 0), + _max_append(chunks, charset.header_encode(header), self._maxlinelen, ' ') joiner = NL + self._continuation_ws return joiner.join(chunks) @@ -363,6 +395,6 @@ class Header: """ newchunks = [] for s, charset in self._chunks: - newchunks += self._split(s, charset, 1) + newchunks += self._split(s, charset, True) self._chunks = newchunks return self._encode_chunks()