From 8451c4b6e044f83efc2298a79af58c3e56d946a2 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Sat, 23 Oct 2010 22:19:56 +0000 Subject: [PATCH] #1349106: add linesep argument to generator.flatten and header.encode. --- Doc/library/email.generator.rst | 84 ++++++++++++++++++++++++++++----- Doc/library/email.header.rst | 9 +++- Lib/email/generator.py | 74 +++++++++++++++++------------ Lib/email/header.py | 16 +++++-- Lib/email/test/data/msg_26.txt | 3 +- Lib/email/test/test_email.py | 24 +++++++++- Misc/NEWS | 3 ++ 7 files changed, 164 insertions(+), 49 deletions(-) diff --git a/Doc/library/email.generator.rst b/Doc/library/email.generator.rst index 2c169b93e73..f29cdac902c 100644 --- a/Doc/library/email.generator.rst +++ b/Doc/library/email.generator.rst @@ -56,7 +56,7 @@ Here are the public methods of the :class:`Generator` class, imported from the The other public :class:`Generator` methods are: - .. method:: flatten(msg, unixfrom=False) + .. method:: flatten(msg, unixfrom=False, linesep='\\n') Print the textual representation of the message object structure rooted at *msg* to the output file specified when the :class:`Generator` instance @@ -71,12 +71,20 @@ Here are the public methods of the :class:`Generator` class, imported from the Note that for subparts, no envelope header is ever printed. + Optional *linesep* specifies the line separator character used to + terminate lines in the output. It defaults to ``\n`` because that is + the most useful value for Python application code (other library packages + expect ``\n`` separated lines). ``linesep=\r\n`` can be used to + generate output with RFC-compliant line separators. + Messages parsed with a Bytes parser that have a :mailheader:`Content-Transfer-Encoding` of 8bit will be converted to a use a 7bit Content-Transfer-Encoding. Any other non-ASCII bytes in the message structure will be converted to '?' characters. - .. versionchanged:: 3.2 added support for re-encoding 8bit message bodies. + .. 
versionchanged:: 3.2 + added support for re-encoding 8bit message bodies, and the linesep + argument .. method:: clone(fp) @@ -97,16 +105,70 @@ formatted string representation of a message object. For more detail, see .. class:: BytesGenerator(outfp, mangle_from_=True, maxheaderlen=78) - This class has the same API as the :class:`Generator` class, except that - *outfp* must be a file like object that will accept :class`bytes` input to - its ``write`` method. If the message object structure contains non-ASCII - bytes, this generator's :meth:`~BytesGenerator.flatten` method will produce - them as-is, including preserving parts with a - :mailheader:`Content-Transfer-Encoding` of ``8bit``. + The constructor for the :class:`BytesGenerator` class takes a binary + :term:`file-like object` called *outfp* for an argument. *outfp* must + support a :meth:`write` method that accepts binary data. - Note that even the :meth:`write` method API is identical: it expects - strings as input, and converts them to bytes by encoding them using - the ASCII codec. + Optional *mangle_from_* is a flag that, when ``True``, puts a ``>`` + character in front of any line in the body that starts exactly as ``From``, + i.e. ``From`` followed by a space at the beginning of the line. This is the + only guaranteed portable way to avoid having such lines be mistaken for a + Unix mailbox format envelope header separator (see `WHY THE CONTENT-LENGTH + FORMAT IS BAD <http://www.jwz.org/doc/content-length.html>`_ for details). + *mangle_from_* defaults to ``True``, but you might want to set this to + ``False`` if you are not writing Unix mailbox format files. + + Optional *maxheaderlen* specifies the longest length for a non-continued + header. When a header line is longer than *maxheaderlen* (in characters, + with tabs expanded to 8 spaces), the header will be split as defined in the + :class:`~email.header.Header` class. Set to zero to disable header + wrapping. The default is 78, as recommended (but not required) by + :rfc:`2822`. 
+ + The other public :class:`BytesGenerator` methods are: + + + .. method:: flatten(msg, unixfrom=False, linesep='\n') + + Print the textual representation of the message object structure rooted + at *msg* to the output file specified when the :class:`BytesGenerator` + instance was created. Subparts are visited depth-first and the resulting + text will be properly MIME encoded. If the input that created the *msg* + contained bytes with the high bit set and those bytes have not been + modified, they will be copied faithfully to the output, even if doing so + is not strictly RFC compliant. (To produce strictly RFC compliant + output, use the :class:`Generator` class.) + + Messages parsed with a Bytes parser that have a + :mailheader:`Content-Transfer-Encoding` of 8bit will be reconstructed + as 8bit if they have not been modified. + + Optional *unixfrom* is a flag that forces the printing of the envelope + header delimiter before the first :rfc:`2822` header of the root message + object. If the root object has no envelope header, a standard one is + crafted. By default, this is set to ``False`` to inhibit the printing of + the envelope delimiter. + + Note that for subparts, no envelope header is ever printed. + + Optional *linesep* specifies the line separator character used to + terminate lines in the output. It defaults to ``\n`` because that is + the most useful value for Python application code (other library packages + expect ``\n`` separated lines). ``linesep=\r\n`` can be used to + generate output with RFC-compliant line separators. + + .. method:: clone(fp) + + Return an independent clone of this :class:`BytesGenerator` instance with + the exact same options. + + .. method:: write(s) + + Write the string *s* to the underlying file object. *s* is encoded using + the ``ASCII`` codec and written to the *write* method of the + *outfp* passed to the :class:`BytesGenerator`'s constructor. 
This + provides just enough file-like API for :class:`BytesGenerator` instances + to be used in the :func:`print` function. .. versionadded:: 3.2 diff --git a/Doc/library/email.header.rst b/Doc/library/email.header.rst index 1d530b2602b..d9ebdb7087a 100644 --- a/Doc/library/email.header.rst +++ b/Doc/library/email.header.rst @@ -104,7 +104,7 @@ Here is the :class:`Header` class description: :func:`ustr.encode` call, and defaults to "strict". - .. method:: encode(splitchars=';, \\t', maxlinelen=None) + .. method:: encode(splitchars=';, \\t', maxlinelen=None, linesep='\\n') Encode a message header into an RFC-compliant format, possibly wrapping long lines and encapsulating non-ASCII parts in base64 or quoted-printable @@ -115,6 +115,13 @@ Here is the :class:`Header` class description: *maxlinelen*, if given, overrides the instance's value for the maximum line length. + *linesep* specifies the characters used to separate the lines of the + folded header. It defaults to the most useful value for Python + application code (``\n``), but ``\r\n`` can be specified in order + to produce headers with RFC-compliant line separators. + + .. versionchanged:: 3.2 added the linesep argument + The :class:`Header` class also provides a number of methods to support standard operators and built-in functions. diff --git a/Lib/email/generator.py b/Lib/email/generator.py index 40b95c4f4f1..05019d91fc4 100644 --- a/Lib/email/generator.py +++ b/Lib/email/generator.py @@ -17,7 +17,7 @@ from email.header import Header from email.message import _has_surrogates UNDERSCORE = '_' -NL = '\n' +NL = '\n' # XXX: no longer used by the code below. fcre = re.compile(r'^From ', re.MULTILINE) @@ -58,7 +58,7 @@ class Generator: # Just delegate to the file object self._fp.write(s) - def flatten(self, msg, unixfrom=False): + def flatten(self, msg, unixfrom=False, linesep='\n'): """Print the message object tree rooted at msg to the output file specified when the Generator instance was created. 
@@ -68,12 +68,23 @@ class Generator: is False to inhibit the printing of any From_ delimiter. Note that for subobjects, no From_ line is printed. + + linesep specifies the characters used to indicate a new line in + the output. """ + # We use the _XXX constants for operating on data that comes directly + # from the msg, and _encoded_XXX constants for operating on data that + # has already been converted (to bytes in the BytesGenerator) and + # inserted into a temporary buffer. + self._NL = linesep + self._encoded_NL = self._encode(linesep) + self._EMPTY = '' + self._encoded_EMTPY = self._encode('') if unixfrom: ufrom = msg.get_unixfrom() if not ufrom: ufrom = 'From nobody ' + time.ctime(time.time()) - self.write(ufrom + NL) + self.write(ufrom + self._NL) self._write(msg) def clone(self, fp): @@ -93,20 +104,18 @@ class Generator: # it has already transformed the input; but, since this whole thing is a # hack anyway this seems good enough. - # We use these class constants when we need to manipulate data that has - # already been written to a buffer (ex: constructing a re to check the - # boundary), and the module level NL constant when adding new output to a - # buffer via self.write, because 'write' always takes strings. - # Having write always take strings makes the code simpler, but there are - # a few occasions when we need to write previously created data back - # to the buffer or to a new buffer; for those cases we use self._fp.write. - _NL = NL - _EMPTY = '' + # Similarly, we have _XXX and _encoded_XXX attributes that are used on + # source and buffer data, respectively. + _encoded_EMPTY = '' def _new_buffer(self): # BytesGenerator overrides this to return BytesIO. return StringIO() + def _encode(self, s): + # BytesGenerator overrides this to encode strings to bytes. 
+ return s + def _write(self, msg): # We can't write the headers yet because of the following scenario: # say a multipart message includes the boundary string somewhere in @@ -158,14 +167,15 @@ class Generator: for h, v in msg.items(): self.write('%s: ' % h) if isinstance(v, Header): - self.write(v.encode(maxlinelen=self._maxheaderlen)+NL) + self.write(v.encode( + maxlinelen=self._maxheaderlen, linesep=self._NL)+self._NL) else: # Header's got lots of smarts, so use it. header = Header(v, maxlinelen=self._maxheaderlen, header_name=h) - self.write(header.encode()+NL) + self.write(header.encode(linesep=self._NL)+self._NL) # A blank line always separates headers from body - self.write(NL) + self.write(self._NL) # # Handlers for writing types and subtypes @@ -208,11 +218,11 @@ class Generator: for part in subparts: s = self._new_buffer() g = self.clone(s) - g.flatten(part, unixfrom=False) + g.flatten(part, unixfrom=False, linesep=self._NL) msgtexts.append(s.getvalue()) # Now make sure the boundary we've selected doesn't appear in any of # the message texts. - alltext = self._NL.join(msgtexts) + alltext = self._encoded_NL.join(msgtexts) # BAW: What about boundaries that are wrapped in double-quotes? 
boundary = msg.get_boundary(failobj=self._make_boundary(alltext)) # If we had to calculate a new boundary because the body text @@ -225,9 +235,9 @@ class Generator: msg.set_boundary(boundary) # If there's a preamble, write it out, with a trailing CRLF if msg.preamble is not None: - self.write(msg.preamble + NL) + self.write(msg.preamble + self._NL) # dash-boundary transport-padding CRLF - self.write('--' + boundary + NL) + self.write('--' + boundary + self._NL) # body-part if msgtexts: self._fp.write(msgtexts.pop(0)) @@ -236,13 +246,13 @@ class Generator: # --> CRLF body-part for body_part in msgtexts: # delimiter transport-padding CRLF - self.write('\n--' + boundary + NL) + self.write(self._NL + '--' + boundary + self._NL) # body-part self._fp.write(body_part) # close-delimiter transport-padding - self.write('\n--' + boundary + '--') + self.write(self._NL + '--' + boundary + '--') if msg.epilogue is not None: - self.write(NL) + self.write(self._NL) self.write(msg.epilogue) def _handle_multipart_signed(self, msg): @@ -266,16 +276,16 @@ class Generator: g = self.clone(s) g.flatten(part, unixfrom=False) text = s.getvalue() - lines = text.split(self._NL) + lines = text.split(self._encoded_NL) # Strip off the unnecessary trailing empty line - if lines and lines[-1] == self._EMPTY: - blocks.append(self._NL.join(lines[:-1])) + if lines and lines[-1] == self._encoded_EMPTY: + blocks.append(self._encoded_NL.join(lines[:-1])) else: blocks.append(text) # Now join all the blocks with an empty line. This has the lovely # effect of separating each block with an empty line, but not adding # an extra one after the last one. - self._fp.write(self._NL.join(blocks)) + self._fp.write(self._encoded_NL.join(blocks)) def _handle_message(self, msg): s = self._new_buffer() @@ -333,10 +343,9 @@ class BytesGenerator(Generator): The outfp object must accept bytes in its write method. 
""" - # Bytes versions of these constants for use in manipulating data from + # Bytes versions of this constant for use in manipulating data from # the BytesIO buffer. - _NL = NL.encode('ascii') - _EMPTY = b'' + _encoded_EMPTY = b'' def write(self, s): self._fp.write(s.encode('ascii', 'surrogateescape')) @@ -344,6 +353,9 @@ class BytesGenerator(Generator): def _new_buffer(self): return BytesIO() + def _encode(self, s): + return s.encode('ascii') + def _write_headers(self, msg): # This is almost the same as the string version, except for handling # strings with 8bit bytes. @@ -363,9 +375,9 @@ class BytesGenerator(Generator): # Header's got lots of smarts and this string is safe... header = Header(v, maxlinelen=self._maxheaderlen, header_name=h) - self.write(header.encode()+NL) + self.write(header.encode(linesep=self._NL)+self._NL) # A blank line always separates headers from body - self.write(NL) + self.write(self._NL) def _handle_text(self, msg): # If the string has surrogates the original source was bytes, so diff --git a/Lib/email/header.py b/Lib/email/header.py index 89c13910522..88fa80f57e9 100644 --- a/Lib/email/header.py +++ b/Lib/email/header.py @@ -272,7 +272,7 @@ class Header: output_string = input_bytes.decode(output_charset, errors) self._chunks.append((output_string, charset)) - def encode(self, splitchars=';, \t', maxlinelen=None): + def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'): """Encode a message header into an RFC-compliant format. There are many issues involved in converting a given string for use in @@ -293,6 +293,11 @@ class Header: Optional splitchars is a string containing characters to split long ASCII lines on, in rough support of RFC 2822's `highest level syntactic breaks'. This doesn't affect RFC 2047 encoded lines. + + Optional linesep is a string to be used to separate the lines of + the value. 
The default value is the most useful for typical + Python applications, but it can be set to \r\n to produce RFC-compliant + line separators when needed. """ self._normalize() if maxlinelen is None: @@ -311,7 +316,7 @@ class Header: if len(lines) > 1: formatter.newline() formatter.add_transition() - return str(formatter) + return formatter._str(linesep) def _normalize(self): # Step 1: Normalize the chunks so that all runs of identical charsets @@ -342,9 +347,12 @@ class _ValueFormatter: self._lines = [] self._current_line = _Accumulator(headerlen) - def __str__(self): + def _str(self, linesep): self.newline() - return NL.join(self._lines) + return linesep.join(self._lines) + + def __str__(self): + return self._str(NL) def newline(self): end_of_line = self._current_line.pop() diff --git a/Lib/email/test/data/msg_26.txt b/Lib/email/test/data/msg_26.txt index 6c71bced9ac..58efaa9c9a8 100644 --- a/Lib/email/test/data/msg_26.txt +++ b/Lib/email/test/data/msg_26.txt @@ -24,7 +24,8 @@ Simple email with attachment. 
--1618492860--2051301190--113853680 -Content-Type: application/riscos; name="clock.bmp,69c"; type=BMP; load=&fff69c4b; exec=&355dd4d1; access=&03 +Content-Type: application/riscos; name="clock.bmp,69c"; type=BMP; + load=&fff69c4b; exec=&355dd4d1; access=&03 Content-Disposition: attachment; filename="clock.bmp" Content-Transfer-Encoding: base64 diff --git a/Lib/email/test/test_email.py b/Lib/email/test/test_email.py index e5e51c6ffcb..f40d77081d0 100644 --- a/Lib/email/test/test_email.py +++ b/Lib/email/test/test_email.py @@ -77,7 +77,7 @@ class TestMessageAPI(TestEmailBase): eq(msg.get_all('cc'), ['ccc@zzz.org', 'ddd@zzz.org', 'eee@zzz.org']) eq(msg.get_all('xx', 'n/a'), 'n/a') - def test_getset_charset(self): + def TEst_getset_charset(self): eq = self.assertEqual msg = Message() eq(msg.get_charset(), None) @@ -2600,6 +2600,18 @@ Here's the message body part2 = msg.get_payload(1) eq(part2.get_content_type(), 'application/riscos') + def test_crlf_flatten(self): + # Using newline='\n' preserves the crlfs in this input file. 
+ with openfile('msg_26.txt', newline='\n') as fp: + text = fp.read() + msg = email.message_from_string(text) + s = StringIO() + g = Generator(s) + g.flatten(msg, linesep='\r\n') + self.assertEqual(s.getvalue(), text) + + maxDiff = None + def test_multipart_digest_with_extra_mime_headers(self): eq = self.assertEqual neq = self.ndiffAssertEqual @@ -2931,6 +2943,16 @@ class Test8BitBytesHandling(unittest.TestCase): m = bfp.close() self.assertEqual(str(m), self.latin_bin_msg_as7bit) + def test_crlf_flatten(self): + with openfile('msg_26.txt', 'rb') as fp: + text = fp.read() + msg = email.message_from_bytes(text) + s = BytesIO() + g = email.generator.BytesGenerator(s) + g.flatten(msg, linesep='\r\n') + self.assertEqual(s.getvalue(), text) + maxDiff = None + class TestBytesGeneratorIdempotent(TestIdempotent): diff --git a/Misc/NEWS b/Misc/NEWS index 1f3a754e5c6..4c7dc4b5fab 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -48,6 +48,9 @@ Core and Builtins Library ------- +- Issue #1349106: Generator (and BytesGenerator) flatten method and Header + encode method now support a 'linesep' argument. + - Issue #5639: Add a *server_hostname* argument to ``SSLContext.wrap_socket`` in order to support the TLS SNI extension. ``HTTPSConnection`` and ``urlopen()`` also use this argument, so that HTTPS virtual hosts are now