cpython/Lib/email/Parser.py

# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)

"""A parser of RFC 2822 and MIME email messages.
"""

import re
from cStringIO import StringIO
from types import ListType

from email import Errors
from email import Message

EMPTYSTRING = ''
NL = '\n'


class Parser:
    def __init__(self, _class=Message.Message, strict=0):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceeded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.

        Optional strict tells the parser to be strictly RFC compliant or to be
        more forgiving in parsing of ill-formatted MIME documents.  When
        non-strict mode is used, the parser will try to make up for missing or
        erroneous boundaries and other peculiarities seen in the wild.
        Default is non-strict parsing.
        """
        self._class = _class
        self._strict = strict

    def parse(self, fp, headersonly=0):
        root = self._class()
        self._parseheaders(root, fp)
        if not headersonly:
            self._parsebody(root, fp)
        return root

    def parsestr(self, text, headersonly=0):
        return self.parse(StringIO(text), headersonly=headersonly)

    def _parseheaders(self, container, fp):
        # Parse the headers, returning a list of header/value pairs.  None as
        # the header means the Unix-From header.
        lastheader = ''
        lastvalue = []
        lineno = 0
        while 1:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                break
            line = line.splitlines()[0]
            if not line:
                break
            # Ignore the trailing newline
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                elif self._strict:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
                else:
                    # ignore the wierdly placed From_ line
                    # XXX: maybe set unixfrom anyway? or only if not already?
                    continue
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                if self._strict:
                    raise Errors.HeaderParseError(
                        "Not a header, not a continuation: ``%s''"%line)
                elif lineno == 1 and line.startswith('--'):
                    # allow through duplicate boundary tags.
                    continue
                else:
                    raise Errors.HeaderParseError(
                        "Not a header, not a continuation: ``%s''"%line)
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)

    def _parsebody(self, container, fp):
        # Parse the body, but first split the payload on the content-type
        # boundary if present.
        boundary = container.get_boundary()
        isdigest = (container.get_type() == 'multipart/digest')
        # If there's a boundary, split the payload text into its constituent
        # parts and parse each separately.  Otherwise, just parse the rest of
        # the body as a single message.  Note: any exceptions raised in the
        # recursive parse need to have their line numbers coerced.
        if boundary:
            preamble = epilogue = None
            # Split into subparts.  The first boundary we're looking for won't
            # always have a leading newline since we're at the start of the
            # body text, and there's not always a preamble before the first
            # boundary.
            separator = '--' + boundary
            payload = fp.read()
            # We use an RE here because boundaries can have trailing
            # whitespace.
            mo = re.search(
                r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
                payload)
            if not mo:
                if self._strict:
                    raise Errors.BoundaryError(
                        "Couldn't find starting boundary: %s" % boundary)
                container.set_payload(payload)
                return
            start = mo.start()
            if start > 0:
                # there's some pre-MIME boundary preamble
                preamble = payload[0:start]
            # Find out what kind of line endings we're using
            start += len(mo.group('sep')) + len(mo.group('ws'))
            cre = re.compile('\r\n|\r|\n')
            mo = cre.search(payload, start)
            if mo:
                start += len(mo.group(0))
            # We create a compiled regexp first because we need to be able to
            # specify the start position, and the module function doesn't
            # support this signature. :(
            cre = re.compile('(?P<sep>\r\n|\r|\n)' +
                             re.escape(separator) + '--')
            mo = cre.search(payload, start)
            if mo:
                terminator = mo.start()
                linesep = mo.group('sep')
                if mo.end() < len(payload):
                    # There's some post-MIME boundary epilogue
                    epilogue = payload[mo.end():]
            elif self._strict:
                raise Errors.BoundaryError(
                        "Couldn't find terminating boundary: %s" % boundary)
            else:
                # Handle the case of no trailing boundary.  Check that it ends
                # in a blank line.  Some cases (spamspamspam) don't even have
                # that!
                mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
                if not mo:
                    mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
                    if not mo:
                        raise Errors.BoundaryError(
                          'No terminating boundary and no trailing empty line')
                linesep = mo.group('sep')
                terminator = len(payload)
            # We split the textual payload on the boundary separator, which
            # includes the trailing newline. If the container is a
            # multipart/digest then the subparts are by default message/rfc822
            # instead of text/plain.  In that case, they'll have a optional
            # block of MIME headers, then an empty line followed by the
            # message headers.
            parts = re.split(
                linesep + re.escape(separator) + r'[ \t]*' + linesep,
                payload[start:terminator])
            for part in parts:
                if isdigest:
                    if part[0] == linesep:
                        # There's no header block so create an empty message
                        # object as the container, and lop off the newline so
                        # we can parse the sub-subobject
                        msgobj = self._class()
                        part = part[1:]
                    else:
                        parthdrs, part = part.split(linesep+linesep, 1)
                        # msgobj in this case is the "message/rfc822" container
                        msgobj = self.parsestr(parthdrs, headersonly=1)
                    # while submsgobj is the message itself
                    submsgobj = self.parsestr(part)
                    msgobj.attach(submsgobj)
                    msgobj.set_default_type('message/rfc822')
                else:
                    msgobj = self.parsestr(part)
                container.preamble = preamble
                container.epilogue = epilogue
                container.attach(msgobj)
        elif container.get_main_type() == 'multipart':
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        elif container.get_type() == 'message/delivery-status':
            # This special kind of type contains blocks of headers separated
            # by a blank line.  We'll represent each header block as a
            # separate Message object
            blocks = []
            while 1:
                blockmsg = self._class()
                self._parseheaders(blockmsg, fp)
                if not len(blockmsg):
                    # No more header blocks left
                    break
                blocks.append(blockmsg)
            container.set_payload(blocks)
        elif container.get_main_type() == 'message':
            # Create a container for the payload, but watch out for there not
            # being any headers left
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                msg = self._class()
                self._parsebody(msg, fp)
            container.attach(msg)
        else:
            container.set_payload(fp.read())


class HeaderParser(Parser):
    """Parser variant for callers that only care about message headers.

    Headers are parsed exactly as in the base Parser.  The body is still
    read from the input, but no attempt is made to interpret it: whatever
    follows the header block is stored on the message as one string payload.

    Because the body is never parsed, this can be considerably faster than
    the base Parser when only the headers are of interest.
    """
    def _parsebody(self, container, fp):
        # Read everything that is left and store it verbatim; no MIME
        # structure is examined here.
        remainder = fp.read()
        container.set_payload(remainder)
-												_parsebody(): When adding subparts to a multipart container, make sure
that the first subpart added makes the payload a list object.
Otherwise, a multipart/* with only one subpart will not have the
proper structure.

											
										
										
											2002-01-27 02:48:02 -04:00
+								# Copyright (C) 2001,2002 Python Software Foundation
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								# Author: barry@zope.com (Barry Warsaw)
 								"""A parser of RFC 2822 and MIME email messages.
 								"""
-												I've thought about it some more, and I believe it is proper for the
email package's Parser to handle the three common line endings.
Certain protocols such as IMAP define CRLF line endings and it doesn't
make sense for the client app to have to normalize the line endings
before handing the message off to the Parser.

_parsebody(): Be more flexible in the matching of line endings for
finding the MIME separators.  Accept any of \r, \n and \r\n.  Note
that we do /not/ change the line endings in the payloads, we just
accept any of those three around MIME boundaries.

											
										
										
											2002-05-19 20:51:50 -03:00
+								import re
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								from cStringIO import StringIO
-												_parsebody(): When adding subparts to a multipart container, make sure
that the first subpart added makes the payload a list object.
Otherwise, a multipart/* with only one subpart will not have the
proper structure.

											
										
										
											2002-01-27 02:48:02 -04:00
+								from types import ListType
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
-												_parsebody(): Fix for the new message/rfc822 tree structure (the
parent is now a multipart with one element, the sub-message object).

											
										
										
											2002-06-02 16:12:03 -03:00
+								from email import Errors
 								from email import Message
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
 								EMPTYSTRING = ''
 								NL = '\n'
-												Give me back my page breaks.

											
										
										
											2001-10-04 14:05:11 -03:00
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								class Parser:
-												Parser.__init__(): The consensus on the mimelib-devel list is that
non-strict parsing should be the default.  Make it so.

											
										
										
											2002-07-19 19:25:34 -03:00
+								    def __init__(self, _class=Message.Message, strict=0):
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								        """Parser of RFC 2822 and MIME email messages.
 								        Creates an in-memory object tree representing the email message, which
 								        can then be manipulated and turned over to a Generator to return the
 								        textual representation of the message.
 								        The string must be formatted as a block of RFC 2822 headers and header
 								        continuation lines, optionally preceeded by a `Unix-from' header.  The
 								        header block is terminated either by the end of the string or by a
 								        blank line.
 								        _class is the class to instantiate for new message objects when they
 								        must be created.  This class must have a constructor that can take
 								        zero arguments.  Default is Message.Message.
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
 								        Optional strict tells the parser to be strictly RFC compliant or to be
 								        more forgiving in parsing of ill-formatted MIME documents.  When
 								        non-strict mode is used, the parser will try to make up for missing or
 								        erroneous boundaries and other peculiarities seen in the wild.
-												Parser.__init__(): The consensus on the mimelib-devel list is that
non-strict parsing should be the default.  Make it so.

											
										
										
											2002-07-19 19:25:34 -03:00
+								        Default is non-strict parsing.
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								        """
 								        self._class = _class
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								        self._strict = strict
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								    def parse(self, fp, headersonly=0):
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								        root = self._class()
 								        self._parseheaders(root, fp)
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								        if not headersonly:
 								            self._parsebody(root, fp)
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								        return root
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								    def parsestr(self, text, headersonly=0):
 								        return self.parse(StringIO(text), headersonly=headersonly)
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
 								    def _parseheaders(self, container, fp):
 								        # Parse the headers, returning a list of header/value pairs.  None as
 								        # the header means the Unix-From header.
 								        lastheader = ''
 								        lastvalue = []
 								        lineno = 0
 								        while 1:
-												Sync'ing with standalone email package 2.0.1.  This adds support for
non-us-ascii character sets in headers and bodies.  Some API changes
(with DeprecationWarnings for the old APIs).  Better RFC-compliant
implementations of base64 and quoted-printable.

Updated test cases.  Documentation updates to follow (after I finish
writing them ;).

											
										
										
											2002-04-10 18:01:31 -03:00
+								            # Don't strip the line before we test for the end condition,
 								            # because whitespace-only header lines are RFC compliant
 								            # continuation lines.
 								            line = fp.readline()
 								            if not line:
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								                break
-												Sync'ing with standalone email package 2.0.1.  This adds support for
non-us-ascii character sets in headers and bodies.  Some API changes
(with DeprecationWarnings for the old APIs).  Better RFC-compliant
implementations of base64 and quoted-printable.

Updated test cases.  Documentation updates to follow (after I finish
writing them ;).

											
										
										
											2002-04-10 18:01:31 -03:00
+								            line = line.splitlines()[0]
 								            if not line:
 								                break
 								            # Ignore the trailing newline
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								            lineno += 1
 								            # Check for initial Unix From_ line
 								            if line.startswith('From '):
 								                if lineno == 1:
 								                    container.set_unixfrom(line)
 								                    continue
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								                elif self._strict:
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								                    raise Errors.HeaderParseError(
 								                        'Unix-from in headers after first rfc822 header')
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								                else:
 								                    # ignore the wierdly placed From_ line
 								                    # XXX: maybe set unixfrom anyway? or only if not already?
 								                    continue
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								            # Header continuation line
 								            if line[0] in ' \t':
 								                if not lastheader:
 								                    raise Errors.HeaderParseError(
 								                        'Continuation line seen before first header')
 								                lastvalue.append(line)
 								                continue
 								            # Normal, non-continuation header.  BAW: this should check to make
 								            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
 								            # should expose the header matching algorithm in the API, and
 								            # allow for a non-strict parsing mode (that ignores the line
 								            # instead of raising the exception).
 								            i = line.find(':')
 								            if i < 0:
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								                if self._strict:
 								                    raise Errors.HeaderParseError(
 								                        "Not a header, not a continuation: ``%s''"%line)
 								                elif lineno == 1 and line.startswith('--'):
 								                    # allow through duplicate boundary tags.
 								                    continue
 								                else:
 								                    raise Errors.HeaderParseError(
 								                        "Not a header, not a continuation: ``%s''"%line)
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								            if lastheader:
 								                container[lastheader] = NL.join(lastvalue)
 								            lastheader = line[:i]
 								            lastvalue = [line[i+1:].lstrip()]
 								        # Make sure we retain the last header
 								        if lastheader:
 								            container[lastheader] = NL.join(lastvalue)
 								    def _parsebody(self, container, fp):
 								        # Parse the body, but first split the payload on the content-type
 								        # boundary if present.
-												_parsebody(): Use get_boundary() and get_type().

    Also, add a clause to the big-if to handle message/delivery-status
    content types.  These create a message with subparts that are
    Message instances, which best represent the header blocks of this
    content type.

											
										
										
											2001-09-26 02:44:09 -03:00
+								        boundary = container.get_boundary()
 								        isdigest = (container.get_type() == 'multipart/digest')
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								        # If there's a boundary, split the payload text into its constituent
 								        # parts and parse each separately.  Otherwise, just parse the rest of
 								        # the body as a single message.  Note: any exceptions raised in the
 								        # recursive parse need to have their line numbers coerced.
 								        if boundary:
 								            preamble = epilogue = None
 								            # Split into subparts.  The first boundary we're looking for won't
-												Anthony Baxter's cleanup patch.  Python project SF patch # 583190,
quoting:

  in non-strict mode, messages don't require a blank line at the end
  with a missing end-terminator. A single newline is sufficient now.

  Handle trailing whitespace at the end of a boundary. Had to switch
  from using string.split() to re.split()

  Handle whitespace on the end of a parameter list for Content-type.

  Handle whitespace on the end of a plain content-type header.

Specifically,

get_type(): Strip the content type string.

_get_params_preserve(): Strip the parameter names and values on both
sides.

_parsebody(): Lots of changes as described above, with some stylistic
changes by Barry (who hopefully didn't screw things up ;).

											
										
										
											2002-07-18 20:09:09 -03:00
+								            # always have a leading newline since we're at the start of the
 								            # body text, and there's not always a preamble before the first
 								            # boundary.
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								            separator = '--' + boundary
 								            payload = fp.read()
-												Whitespace normalization.

											
										
										
											2002-08-23 15:19:30 -03:00
+								            # We use an RE here because boundaries can have trailing
-												Anthony Baxter's cleanup patch.  Python project SF patch # 583190,
quoting:

  in non-strict mode, messages don't require a blank line at the end
  with a missing end-terminator. A single newline is sufficient now.

  Handle trailing whitespace at the end of a boundary. Had to switch
  from using string.split() to re.split()

  Handle whitespace on the end of a parameter list for Content-type.

  Handle whitespace on the end of a plain content-type header.

Specifically,

get_type(): Strip the content type string.

_get_params_preserve(): Strip the parameter names and values on both
sides.

_parsebody(): Lots of changes as described above, with some stylistic
changes by Barry (who hopefully didn't screw things up ;).

											
										
										
											2002-07-18 20:09:09 -03:00
+								            # whitespace.
 								            mo = re.search(
 								                r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
 								                payload)
 								            if not mo:
-												_parsebody(): Instead of raising a BoundaryError when no start
boundary could be found -- in a lax parser -- the entire body is
assigned to the message payload.

											
										
										
											2002-09-10 13:14:56 -03:00
+								                if self._strict:
 								                    raise Errors.BoundaryError(
 								                        "Couldn't find starting boundary: %s" % boundary)
 								                container.set_payload(payload)
 								                return
-												Anthony Baxter's cleanup patch.  Python project SF patch # 583190,
quoting:

  in non-strict mode, messages don't require a blank line at the end
  with a missing end-terminator. A single newline is sufficient now.

  Handle trailing whitespace at the end of a boundary. Had to switch
  from using string.split() to re.split()

  Handle whitespace on the end of a parameter list for Content-type.

  Handle whitespace on the end of a plain content-type header.

Specifically,

get_type(): Strip the content type string.

_get_params_preserve(): Strip the parameter names and values on both
sides.

_parsebody(): Lots of changes as described above, with some stylistic
changes by Barry (who hopefully didn't screw things up ;).

											
										
										
											2002-07-18 20:09:09 -03:00
+								            start = mo.start()
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								            if start > 0:
 								                # there's some pre-MIME boundary preamble
 								                preamble = payload[0:start]
-												I've thought about it some more, and I believe it is proper for the
email package's Parser to handle the three common line endings.
Certain protocols such as IMAP define CRLF line endings and it doesn't
make sense for the client app to have to normalize the line endings
before handing it message off to the Parser.

_parsebody(): Be more flexible in the matching of line endings for
finding the MIME separators.  Accept any of \r, \n and \r\n.  Note
that we do /not/ change the line endings in the payloads, we just
accept any of those three around MIME boundaries.

											
										
										
											2002-05-19 20:51:50 -03:00
+								            # Find out what kind of line endings we're using
-												Anthony Baxter's cleanup patch.  Python project SF patch # 583190,
quoting:

  in non-strict mode, messages don't require a blank line at the end
  with a missing end-terminator. A single newline is sufficient now.

  Handle trailing whitespace at the end of a boundary. Had to switch
  from using string.split() to re.split()

  Handle whitespace on the end of a parameter list for Content-type.

  Handle whitespace on the end of a plain content-type header.

Specifically,

get_type(): Strip the content type string.

_get_params_preserve(): Strip the parameter names and values on both
sides.

_parsebody(): Lots of changes as described above, with some stylistic
changes by Barry (who hopefully didn't screw things up ;).

											
										
										
											2002-07-18 20:09:09 -03:00
+								            start += len(mo.group('sep')) + len(mo.group('ws'))
-												I've thought about it some more, and I believe it is proper for the
email package's Parser to handle the three common line endings.
Certain protocols such as IMAP define CRLF line endings and it doesn't
make sense for the client app to have to normalize the line endings
before handing it message off to the Parser.

_parsebody(): Be more flexible in the matching of line endings for
finding the MIME separators.  Accept any of \r, \n and \r\n.  Note
that we do /not/ change the line endings in the payloads, we just
accept any of those three around MIME boundaries.

											
										
										
											2002-05-19 20:51:50 -03:00
+								            cre = re.compile('\r\n|\r|\n')
 								            mo = cre.search(payload, start)
 								            if mo:
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								                start += len(mo.group(0))
-												I've thought about it some more, and I believe it is proper for the
email package's Parser to handle the three common line endings.
Certain protocols such as IMAP define CRLF line endings and it doesn't
make sense for the client app to have to normalize the line endings
before handing it message off to the Parser.

_parsebody(): Be more flexible in the matching of line endings for
finding the MIME separators.  Accept any of \r, \n and \r\n.  Note
that we do /not/ change the line endings in the payloads, we just
accept any of those three around MIME boundaries.

											
										
										
											2002-05-19 20:51:50 -03:00
+								            # We create a compiled regexp first because we need to be able to
 								            # specify the start position, and the module function doesn't
 								            # support this signature. :(
 								            cre = re.compile('(?P<sep>\r\n|\r|\n)' +
 								                             re.escape(separator) + '--')
 								            mo = cre.search(payload, start)
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								            if mo:
 								                terminator = mo.start()
 								                linesep = mo.group('sep')
 								                if mo.end() < len(payload):
-												Anthony Baxter's cleanup patch.  Python project SF patch # 583190,
quoting:

  in non-strict mode, messages don't require a blank line at the end
  with a missing end-terminator. A single newline is sufficient now.

  Handle trailing whitespace at the end of a boundary. Had to switch
  from using string.split() to re.split()

  Handle whitespace on the end of a parameter list for Content-type.

  Handle whitespace on the end of a plain content-type header.

Specifically,

get_type(): Strip the content type string.

_get_params_preserve(): Strip the parameter names and values on both
sides.

_parsebody(): Lots of changes as described above, with some stylistic
changes by Barry (who hopefully didn't screw things up ;).

											
										
										
											2002-07-18 20:09:09 -03:00
+								                    # There's some post-MIME boundary epilogue
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								                    epilogue = payload[mo.end():]
 								            elif self._strict:
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								                raise Errors.BoundaryError(
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								                        "Couldn't find terminating boundary: %s" % boundary)
 								            else:
-												Anthony Baxter's cleanup patch.  Python project SF patch # 583190,
quoting:

  in non-strict mode, messages don't require a blank line at the end
  with a missing end-terminator. A single newline is sufficient now.

  Handle trailing whitespace at the end of a boundary. Had to switch
  from using string.split() to re.split()

  Handle whitespace on the end of a parameter list for Content-type.

  Handle whitespace on the end of a plain content-type header.

Specifically,

get_type(): Strip the content type string.

_get_params_preserve(): Strip the parameter names and values on both
sides.

_parsebody(): Lots of changes as described above, with some stylistic
changes by Barry (who hopefully didn't screw things up ;).

											
										
										
											2002-07-18 20:09:09 -03:00
+								                # Handle the case of no trailing boundary.  Check that it ends
 								                # in a blank line.  Some cases (spamspamspam) don't even have
 								                # that!
 								                mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								                if not mo:
-												Anthony Baxter's cleanup patch.  Python project SF patch # 583190,
quoting:

  in non-strict mode, messages don't require a blank line at the end
  with a missing end-terminator. A single newline is sufficient now.

  Handle trailing whitespace at the end of a boundary. Had to switch
  from using string.split() to re.split()

  Handle whitespace on the end of a parameter list for Content-type.

  Handle whitespace on the end of a plain content-type header.

Specifically,

get_type(): Strip the content type string.

_get_params_preserve(): Strip the parameter names and values on both
sides.

_parsebody(): Lots of changes as described above, with some stylistic
changes by Barry (who hopefully didn't screw things up ;).

											
										
										
											2002-07-18 20:09:09 -03:00
+								                    mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
 								                    if not mo:
 								                        raise Errors.BoundaryError(
 								                          'No terminating boundary and no trailing empty line')
 								                linesep = mo.group('sep')
 								                terminator = len(payload)
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								            # We split the textual payload on the boundary separator, which
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								            # includes the trailing newline. If the container is a
-												Whitespace normalization.

											
										
										
											2002-08-23 15:19:30 -03:00
+								            # multipart/digest then the subparts are by default message/rfc822
 								            # instead of text/plain.  In that case, they'll have a optional
 								            # block of MIME headers, then an empty line followed by the
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								            # message headers.
-												Anthony Baxter's cleanup patch.  Python project SF patch # 583190,
quoting:

  in non-strict mode, messages don't require a blank line at the end
  with a missing end-terminator. A single newline is sufficient now.

  Handle trailing whitespace at the end of a boundary. Had to switch
  from using string.split() to re.split()

  Handle whitespace on the end of a parameter list for Content-type.

  Handle whitespace on the end of a plain content-type header.

Specifically,

get_type(): Strip the content type string.

_get_params_preserve(): Strip the parameter names and values on both
sides.

_parsebody(): Lots of changes as described above, with some stylistic
changes by Barry (who hopefully didn't screw things up ;).

											
										
										
											2002-07-18 20:09:09 -03:00
+								            parts = re.split(
 								                linesep + re.escape(separator) + r'[ \t]*' + linesep,
 								                payload[start:terminator])
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								            for part in parts:
-												Whitespace normalization.

											
										
										
											2002-08-23 15:19:30 -03:00
+								                if isdigest:
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								                    if part[0] == linesep:
 								                        # There's no header block so create an empty message
 								                        # object as the container, and lop off the newline so
 								                        # we can parse the sub-subobject
 								                        msgobj = self._class()
 								                        part = part[1:]
 								                    else:
 								                        parthdrs, part = part.split(linesep+linesep, 1)
 								                        # msgobj in this case is the "message/rfc822" container
 								                        msgobj = self.parsestr(parthdrs, headersonly=1)
 								                    # while submsgobj is the message itself
 								                    submsgobj = self.parsestr(part)
 								                    msgobj.attach(submsgobj)
 								                    msgobj.set_default_type('message/rfc822')
 								                else:
 								                    msgobj = self.parsestr(part)
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								                container.preamble = preamble
 								                container.epilogue = epilogue
-												Sync'ing with standalone email package 2.0.1.  This adds support for
non-us-ascii character sets in headers and bodies.  Some API changes
(with DeprecationWarnings for the old APIs).  Better RFC-compliant
implementations of base64 and quoted-printable.

Updated test cases.  Documentation updates to follow (after I finish
writing them ;).

											
										
										
											2002-04-10 18:01:31 -03:00
+								                container.attach(msgobj)
 								        elif container.get_main_type() == 'multipart':
 								            # Very bad.  A message is a multipart with no boundary!
 								            raise Errors.BoundaryError(
 								                'multipart message with no defined boundary')
-												_parsebody(): Use get_boundary() and get_type().

    Also, add a clause to the big-if to handle message/delivery-status
    content types.  These create a message with subparts that are
    Message instances, which best represent the header blocks of this
    content type.

											
										
										
											2001-09-26 02:44:09 -03:00
+								        elif container.get_type() == 'message/delivery-status':
 								            # This special kind of type contains blocks of headers separated
 								            # by a blank line.  We'll represent each header block as a
 								            # separate Message object
 								            blocks = []
 								            while 1:
 								                blockmsg = self._class()
 								                self._parseheaders(blockmsg, fp)
 								                if not len(blockmsg):
 								                    # No more header blocks left
 								                    break
 								                blocks.append(blockmsg)
 								            container.set_payload(blocks)
 								        elif container.get_main_type() == 'message':
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								            # Create a container for the payload, but watch out for there not
 								            # being any headers left
 								            try:
 								                msg = self.parse(fp)
 								            except Errors.HeaderParseError:
 								                msg = self._class()
 								                self._parsebody(msg, fp)
-												_parsebody(): Fix for the new message/rfc822 tree structure (the
parent is now a multipart with one element, the sub-message object).

											
										
										
											2002-06-02 16:12:03 -03:00
+								            container.attach(msg)
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								        else:
-												Sync'ing with standalone email package 2.0.1.  This adds support for
non-us-ascii character sets in headers and bodies.  Some API changes
(with DeprecationWarnings for the old APIs).  Better RFC-compliant
implementations of base64 and quoted-printable.

Updated test cases.  Documentation updates to follow (after I finish
writing them ;).

											
										
										
											2002-04-10 18:01:31 -03:00
+								            container.set_payload(fp.read())
-												HeaderParser: A new subclass of Parser which only parses the message
headers.  It does not parse the body of the message, instead simply
assigning it as a string to the container's payload.  This can be much
faster when you're only interested in a message's header.

											
										
										
											2001-10-11 12:43:00 -03:00
 								class HeaderParser(Parser):
 								    """Parser variant for callers that only care about message headers.
 								    The body is still read from the input, but no attempt is made to
 								    interpret it: whatever follows the headers is stored on the message
 								    as a single string payload.
 								    Skipping body parsing is intended to make this subclass considerably
 								    faster than Parser when only the headers are of interest.
 								    """
 								    def _parsebody(self, container, fp):
 								        # Read everything that is left and store it verbatim; no MIME
 								        # structure is examined here.
 								        raw_body = fp.read()
 								        container.set_payload(raw_body)