cpython/Lib/email/Parser.py

# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)

"""A parser of RFC 2822 and MIME email messages.
"""

import re
from cStringIO import StringIO
from types import ListType

from email import Errors
from email import Message

# Shared string constants: the empty string is used as a join separator when
# reassembling text, and NL joins the lines of a folded header value.
EMPTYSTRING = ''
NL = '\n'

# Provide integer stand-ins for True/False when the running interpreter does
# not define those names itself.
try:
    True, False
except NameError:
    True = 1
    False = 0

# Matches one line ending in any of the three common forms: CRLF, CR or LF.
NLCRE = re.compile('\r\n|\r|\n')

class TextUtil:
    """Wrap a file-like object, adding line push-back and pattern search.

    The wrapped object only needs readline() and read().  Lines handed to
    unreadline() are kept on a stack and are returned, most recently pushed
    first, before any further data is taken from the wrapped object.
    """

    def __init__(self, fp):
        self.fp = fp
        # Stack of pushed-back lines; the last element is the next line
        # that readline() will return.
        self.unread = []

    def readline(self):
        """Return the next line of data.

        If lines have been pushed back with unreadline(), the most recently
        pushed one is returned; otherwise the next line is read from the
        wrapped file object.
        """
        if self.unread:
            return self.unread.pop()
        return self.fp.readline()

    def unreadline(self, line):
        """Push a line back so that the next readline() returns it."""
        self.unread.append(line)

    def peekline(self):
        """Return the next line without consuming it."""
        line = self.readline()
        self.unreadline(line)
        return line

    def read(self):
        """Return all remaining data, pushed-back lines first.

        Pushed-back lines are emitted in the order readline() would have
        returned them and exactly as they were pushed (no separator is
        inserted between them), followed by whatever is left in the wrapped
        file object.  The push-back stack is emptied.
        """
        rest = self.fp.read()
        if self.unread:
            pending = self.unread
            self.unread = []
            # readline() pops from the end of the list, so the stack has to
            # be reversed to obtain reading order.
            pending.reverse()
            rest = EMPTYSTRING.join(pending) + rest
        return rest

    def readuntil(self, re, afterblank=0, includematch=0):
        """Read a line at a time until a line matches the given RE.

        `re' is a compiled regular expression; it is applied with match(),
        i.e. anchored at the start of each line.

        Returns a 2-tuple of the text read before the matching line (plus
        the matching line itself if includematch is true) and the RE match
        object.  If afterblank is true, a match only counts when the line
        immediately before it was blank.  The read position is left on the
        line following the matched line.  If end-of-file is reached first,
        everything read so far is returned with None as the match object.
        """
        prematch = []
        blankseen = 0
        while 1:
            line = self.readline()
            if not line:
                # end of file
                return EMPTYSTRING.join(prematch), None
            m = re.match(line)
            # blankseen still describes the *previous* line at this point,
            # which is what the afterblank requirement is about.
            if m and (blankseen or not afterblank):
                if includematch:
                    prematch.append(line)
                return EMPTYSTRING.join(prematch), m
            if afterblank:
                # A line that starts with a line ending is a blank line.
                blankseen = NLCRE.match(line) is not None
            # Blank lines are part of the text too, so they are kept.
            prematch.append(line)


class Parser:
    def __init__(self, _class=Message.Message, strict=False):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.

        Optional strict tells the parser to be strictly RFC compliant or to be
        more forgiving in parsing of ill-formatted MIME documents.  When
        non-strict mode is used, the parser will try to make up for missing or
        erroneous boundaries and other peculiarities seen in the wild.
        Default is non-strict parsing.
        """
        self._class = _class
        self._strict = strict

    def parse(self, fp, headersonly=False):
        """Create a message structure from the data in a file.

        Reads all the data from the file and returns the root of the message
        structure.  Optional headersonly is a flag specifying whether to stop
        parsing after reading the headers or not.  The default is False,
        meaning it parses the entire contents of the file.
        """
        root = self._class()
        fp = TextUtil(fp)
        self._parseheaders(root, fp)
        if not headersonly:
            obj = self._parsemessage(root, fp)
            trailer = fp.read()
            # _parsemessage() returns None when nothing is left to attach
            # (see HeaderParser).  Test for None explicitly rather than for
            # truth: email Message objects report their header count as
            # their length, so a message without headers is false and its
            # body would otherwise be thrown away.
            if obj is not None and trailer:
                self._attach_trailer(obj, trailer)
        return root

    def parsestr(self, text, headersonly=False):
        """Create a message structure from a string.

        Returns the root of the message structure.  Optional headersonly is a
        flag specifying whether to stop parsing after reading the headers or
        not.  The default is False, meaning it parses the entire contents of
        the file.
        """
        return self.parse(StringIO(text), headersonly=headersonly)

    def _parseheaders(self, container, fp):
        # Read a header block from fp and store each header on container
        # (container[name] = value); a leading Unix From_ line is stored with
        # container.set_unixfrom().  Reading stops at end of input, at the
        # blank line ending the block (which is consumed), or -- in
        # non-strict mode -- at the first line that is neither a header nor
        # a continuation, which is pushed back for the body parser.
        # Raises Errors.HeaderParseError for a continuation line before any
        # header and, in strict mode, for misplaced From_ lines and
        # non-header lines.
        lastheader = ''
        lastvalue = []
        lineno = 0
        while True:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            rawline = fp.readline()
            if not rawline:
                break
            # Ignore the trailing newline
            line = rawline.splitlines()[0]
            if not line:
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                elif self._strict:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
                else:
                    # ignore the weirdly placed From_ line
                    # XXX: maybe set unixfrom anyway? or only if not already?
                    continue
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                if self._strict:
                    raise Errors.HeaderParseError(
                        "Not a header, not a continuation: ``%s''" % line)
                elif lineno == 1 and line.startswith('--'):
                    # allow through duplicate boundary tags.
                    continue
                else:
                    # There was no separating blank line as mandated by RFC
                    # 2822, but we're in non-strict mode.  So just offer up
                    # this current line as the first body line.  Push back
                    # the line as it was read, line ending included, so it
                    # does not run into the line that follows it.
                    fp.unreadline(rawline)
                    break
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)
        return

    def _parsemessage(self, container, fp):
        # Parse the body. We walk through the body from top to bottom,
        # keeping track of the current multipart nesting as we go.
        # We return the object that gets the data at the end of this
        # block.
        # Raises Errors.BoundaryError for a multipart without a boundary
        # parameter, and for a part that ends with neither a boundary nor
        # a line ending.
        boundary = container.get_boundary()
        isdigest = (container.get_content_type() == 'multipart/digest')
        if boundary:
            separator = '--' + boundary
            # A boundary line: the separator, an optional closing `--',
            # optional trailing blanks, then the line ending.
            boundaryRE = re.compile(
                    r'(?P<sep>' + re.escape(separator) +
                    r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
            preamble, matchobj = fp.readuntil(boundaryRE)
            if not matchobj:
                # Broken - we hit the end of file. Just set the body
                # to the text.
                container.set_payload(preamble)
                return container
            if preamble:
                container.preamble = preamble
            else:
                # The module docs specify an empty preamble is None, not ''
                container.preamble = None
            while 1:
                subobj = self._class()
                if isdigest:
                    subobj.set_default_type('message/rfc822')
                    firstline = fp.peekline()
                    if firstline.strip():
                        # we have MIME headers. all good.
                        self._parseheaders(subobj, fp)
                    else:
                        # no MIME headers. this is allowed for multipart/digest
                        # Consume the extra blank line
                        fp.readline()
                else:
                    self._parseheaders(subobj, fp)
                container.attach(subobj)
                maintype = subobj.get_content_maintype()
                if maintype in ("message", "multipart"):
                    # Descend; what comes back is the object that should
                    # receive the text up to the next boundary.
                    subobj = self._parsemessage(subobj, fp)

                trailer, matchobj = fp.readuntil(boundaryRE)
                if matchobj is None or trailer:
                    # The line ending just before a boundary belongs to the
                    # boundary, not to the part, so strip one from the text.
                    mo = re.search('(?P<sep>\r\n|\r|\n){2}$', trailer)
                    if not mo:
                        mo = re.search('(?P<sep>\r\n|\r|\n)$', trailer)
                        if not mo:
                            raise Errors.BoundaryError(
                          'No terminating boundary and no trailing empty line')
                    linesep = mo.group('sep')
                    trailer = trailer[:-len(linesep)]
                if trailer:
                    self._attach_trailer(subobj, trailer)
                if matchobj is None or matchobj.group('end'):
                    # That was the last piece of data. Let our caller attach
                    # the epilogue to us. But before we do that, push the
                    # line ending of the match group back into the readline
                    # buffer, as it's part of the epilogue.
                    if matchobj:
                        fp.unreadline(matchobj.group('linesep'))
                    return container

        elif container.get_content_maintype() == "multipart":
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                    'multipart message with no defined boundary')
        elif container.get_content_maintype() == "message":
            ct = container.get_content_type()
            if ct == "message/rfc822":
                submessage = self._class()
                self._parseheaders(submessage, fp)
                self._parsemessage(submessage, fp)
                container.attach(submessage)
                return submessage
            elif ct == "message/delivery-status":
                # This special kind of type contains blocks of headers
                # separated by a blank line.  We'll represent each header
                # block as a separate Message object
                while 1:
                    nextblock = self._class()
                    self._parseheaders(nextblock, fp)
                    container.attach(nextblock)
                    # next peek ahead to see whether we've hit the end or not.
                    # End of input counts as the end too; without that check
                    # the loop would keep attaching empty blocks forever.
                    # NOTE(review): in non-strict mode a non-header line that
                    # _parseheaders() pushes back is not consumed here either
                    # and looks like it would still stall this loop -- confirm.
                    nextline = fp.peekline()
                    if not nextline or nextline[:2] == "--":
                        break
                return container
            else:
                # Other sort of message object (e.g. external-body)
                msg = self._class()
                self._parsemessage(msg, fp)
                container.attach(msg)
                return msg
        else:
            # single body section. We let our caller set the payload.
            return container

    def _attach_trailer(self, obj, trailer):
        # Store text that follows a parsed object: as the epilogue of a
        # message/* or multipart/* object, as the payload of anything else.
        if obj.get_content_maintype() in ("message", "multipart"):
            obj.epilogue = trailer
        else:
            obj.set_payload(trailer)


class HeaderParser(Parser):
    """Parser variant that only gives meaning to the message headers.

    Use this class when the headers are all you need.  The body is still
    read from the input, but it is not analysed: it is stored verbatim as
    the string payload of the message.

    Skipping the body parse can make this considerably faster than the full
    Parser when only the headers are of interest.
    """
    def _parsemessage(self, container, fp):
        # Take everything after the headers and store it, unparsed, as the
        # payload of the container.
        container.set_payload(fp.read())
        # Nothing is left over for the caller to attach.
        return None
-												_parsebody(): When adding subparts to a multipart container, make sure
that the first subpart added makes the payload a list object.
Otherwise, a multipart/* with only one subpart will not have the
proper structure.

											
										
										
											2002-01-27 02:48:02 -04:00
+								# Copyright (C) 2001,2002 Python Software Foundation
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								# Author: barry@zope.com (Barry Warsaw)
 								"""A parser of RFC 2822 and MIME email messages.
 								"""
-												I've thought about it some more, and I believe it is proper for the
email package's Parser to handle the three common line endings.
Certain protocols such as IMAP define CRLF line endings and it doesn't
make sense for the client app to have to normalize the line endings
before handing the message off to the Parser.

_parsebody(): Be more flexible in the matching of line endings for
finding the MIME separators.  Accept any of \r, \n and \r\n.  Note
that we do /not/ change the line endings in the payloads, we just
accept any of those three around MIME boundaries.

											
										
										
											2002-05-19 20:51:50 -03:00
+								import re
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								from cStringIO import StringIO
-												_parsebody(): When adding subparts to a multipart container, make sure
that the first subpart added makes the payload a list object.
Otherwise, a multipart/* with only one subpart will not have the
proper structure.

											
										
										
											2002-01-27 02:48:02 -04:00
+								from types import ListType
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
-												_parsebody(): Fix for the new message/rfc822 tree structure (the
parent is now a multipart with one element, the sub-message object).

											
										
										
											2002-06-02 16:12:03 -03:00
+								from email import Errors
 								from email import Message
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
 								EMPTYSTRING = ''
 								NL = '\n'
-												Use True/False everywhere.

											
										
										
											2002-09-28 17:44:58 -03:00
+								try:
 								    True, False
 								except NameError:
 								    True = 1
 								    False = 0
-												Merge of the folding-reimpl-branch.  Specific changes,

Rename a constant.

											
										
										
											2003-03-06 01:25:35 -04:00
+								NLCRE = re.compile('\r\n|\r|\n')
-												_parsebody(): Use get_content_type() instead of the deprecated
get_type().  Also, one of the regular expressions is constant so might
as well make it a module global.  And, when splitting up digests,
handle lineseps that are longer than 1 character in length
(e.g. \r\n).

											
										
										
											2002-10-07 14:27:35 -03:00
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								class TextUtil:
 								    """ A utility class for wrapping a file object and providing a
 								        couple of additional useful functions.
 								    """
 								    def __init__(self, fp):
 								        self.fp = fp
 								        self.unread = []
 								    def readline(self):
 								        """ Return a line of data.
 								        If data has been pushed back with unreadline(), the most recently
 								        returned unreadline()d data will be returned.
 								        """
 								        if self.unread:
 								            return self.unread.pop()
 								        else:
 								            return self.fp.readline()
 								    def unreadline(self, line):
 								        """Push a line back into the object.
 								        """
 								        self.unread.append(line)
 								    def peekline(self):
 								        """Non-destructively look at the next line"""
 								        line = self.readline()
 								        self.unreadline(line)
 								        return line
 								    def read(self):
 								        """Return the remaining data
 								        """
 								        r = self.fp.read()
 								        if self.unread:
 								            r = "\n".join(self.unread) + r
 								            self.unread = []
 								        return r
 								    def readuntil(self, re, afterblank=0, includematch=0):
 								        """Read a line at a time until we get the specified RE.
 								        Returns the text up to (and including, if includematch is true) the
 								        matched text, and the RE match object. If afterblank is true,
 								        there must be a blank line before the matched text. Moves current
 								        filepointer to the line following the matched line. If we reach
 								        end-of-file, return what we've got so far, and return None as the
 								        RE match object.
 								        """
 								        prematch = []
 								        blankseen = 0
 								        while 1:
 								            line = self.readline()
 								            if not line:
 								                # end of file
 								                return EMPTYSTRING.join(prematch), None
 								            if afterblank:
 								                if NLCRE.match(line):
 								                    blankseen = 1
 								                    continue
 								                else:
 								                    blankseen = 0
 								            m = re.match(line)
 								            if (m and not afterblank) or (m and afterblank and blankseen):
 								                if includematch:
 								                    prematch.append(line)
 								                return EMPTYSTRING.join(prematch), m
 								            prematch.append(line)
-												Use True/False everywhere.

											
										
										
											2002-09-28 17:44:58 -03:00
-												Give me back my page breaks.

											
										
										
											2001-10-04 14:05:11 -03:00
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								class Parser:
-												Use True/False everywhere.

											
										
										
											2002-09-28 17:44:58 -03:00
+								    def __init__(self, _class=Message.Message, strict=False):
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								        """Parser of RFC 2822 and MIME email messages.
 								        Creates an in-memory object tree representing the email message, which
 								        can then be manipulated and turned over to a Generator to return the
 								        textual representation of the message.
 								        The string must be formatted as a block of RFC 2822 headers and header
 								        continuation lines, optionally preceeded by a `Unix-from' header.  The
 								        header block is terminated either by the end of the string or by a
 								        blank line.
 								        _class is the class to instantiate for new message objects when they
 								        must be created.  This class must have a constructor that can take
 								        zero arguments.  Default is Message.Message.
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
 								        Optional strict tells the parser to be strictly RFC compliant or to be
 								        more forgiving in parsing of ill-formatted MIME documents.  When
 								        non-strict mode is used, the parser will try to make up for missing or
 								        erroneous boundaries and other peculiarities seen in the wild.
-												Parser.__init__(): The consensus on the mimelib-devel list is that
non-strict parsing should be the default.  Make it so.

											
										
										
											2002-07-19 19:25:34 -03:00
+								        Default is non-strict parsing.
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								        """
 								        self._class = _class
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								        self._strict = strict
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
-												Use True/False everywhere.

											
										
										
											2002-09-28 17:44:58 -03:00
+								    def parse(self, fp, headersonly=False):
-												Docstring consistency with the updated .tex files.

											
										
										
											2002-09-30 17:07:22 -03:00
+								        """Create a message structure from the data in a file.
 								        Reads all the data from the file and returns the root of the message
 								        structure.  Optional headersonly is a flag specifying whether to stop
 								        parsing after reading the headers or not.  The default is False,
 								        meaning it parses the entire contents of the file.
 								        """
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								        root = self._class()
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								        fp = TextUtil(fp)
 								        self._parseheaders(root, fp)
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								        if not headersonly:
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								            obj = self._parsemessage(root, fp)
 								            trailer = fp.read()
 								            if obj and trailer:
 								                self._attach_trailer(obj, trailer)
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								        return root
-												Use True/False everywhere.

											
										
										
											2002-09-28 17:44:58 -03:00
+								    def parsestr(self, text, headersonly=False):
-												Docstring consistency with the updated .tex files.

											
										
										
											2002-09-30 17:07:22 -03:00
+								        """Create a message structure from a string.
 								        Returns the root of the message structure.  Optional headersonly is a
 								        flag specifying whether to stop parsing after reading the headers or
 								        not.  The default is False, meaning it parses the entire contents of
 								        the file.
 								        """
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								        return self.parse(StringIO(text), headersonly=headersonly)
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
 								    def _parseheaders(self, container, fp):
 								        # Parse the headers, returning a list of header/value pairs.  None as
 								        # the header means the Unix-From header.
 								        lastheader = ''
 								        lastvalue = []
 								        lineno = 0
-												Use True/False everywhere.

											
										
										
											2002-09-28 17:44:58 -03:00
+								        while True:
-												Sync'ing with standalone email package 2.0.1.  This adds support for
non-us-ascii character sets in headers and bodies.  Some API changes
(with DeprecationWarnings for the old APIs).  Better RFC-compliant
implementations of base64 and quoted-printable.

Updated test cases.  Documentation updates to follow (after I finish
writing them ;).

											
										
										
											2002-04-10 18:01:31 -03:00
+								            # Don't strip the line before we test for the end condition,
 								            # because whitespace-only header lines are RFC compliant
 								            # continuation lines.
 								            line = fp.readline()
 								            if not line:
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								                break
-												Sync'ing with standalone email package 2.0.1.  This adds support for
non-us-ascii character sets in headers and bodies.  Some API changes
(with DeprecationWarnings for the old APIs).  Better RFC-compliant
implementations of base64 and quoted-printable.

Updated test cases.  Documentation updates to follow (after I finish
writing them ;).

											
										
										
											2002-04-10 18:01:31 -03:00
+								            line = line.splitlines()[0]
 								            if not line:
 								                break
 								            # Ignore the trailing newline
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								            lineno += 1
 								            # Check for initial Unix From_ line
 								            if line.startswith('From '):
 								                if lineno == 1:
 								                    container.set_unixfrom(line)
 								                    continue
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								                elif self._strict:
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								                    raise Errors.HeaderParseError(
 								                        'Unix-from in headers after first rfc822 header')
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								                else:
 								                    # ignore the weirdly placed From_ line
 								                    # XXX: maybe set unixfrom anyway? or only if not already?
 								                    continue
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								            # Header continuation line
 								            if line[0] in ' \t':
 								                if not lastheader:
 								                    raise Errors.HeaderParseError(
 								                        'Continuation line seen before first header')
 								                lastvalue.append(line)
 								                continue
 								            # Normal, non-continuation header.  BAW: this should check to make
 								            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
 								            # should expose the header matching algorithm in the API, and
 								            # allow for a non-strict parsing mode (that ignores the line
 								            # instead of raising the exception).
 								            i = line.find(':')
 								            if i < 0:
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								                if self._strict:
 								                    raise Errors.HeaderParseError(
-												parse(), _parseheaders(), _parsebody(): A fix for SF bug #633527,
where in lax parsing, the first non-header line after a header block
(e.g. the first line not containing a colon, and not a continuation),
can be treated as the first body line, even without the RFC mandated
blank line separator.

rfc822 had this behavior, and I vaguely remember problems with this,
but can't remember details.  In any event, all the tests still pass,
so I guess we'll find out. ;/

This patch works by returning the non-header, non-continuation line
from _parseheaders() and using that as the first body line prepended
to fp.read() if given.  It's usually None.

We use this approach instead of trying to seek/tell the file-like
object.

											
										
										
											2002-11-05 17:44:06 -04:00
+								                        "Not a header, not a continuation: ``%s''" % line)
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								                elif lineno == 1 and line.startswith('--'):
 								                    # allow through duplicate boundary tags.
 								                    continue
 								                else:
-												parse(), _parseheaders(), _parsebody(): A fix for SF bug #633527,
where in lax parsing, the first non-header line after a header block
(e.g. the first line not containing a colon, and not a continuation),
can be treated as the first body line, even without the RFC mandated
blank line separator.

rfc822 had this behavior, and I vaguely remember problems with this,
but can't remember details.  In any event, all the tests still pass,
so I guess we'll find out. ;/

This patch works by returning the non-header, non-continuation line
from _parseheaders() and using that as the first body line prepended
to fp.read() if given.  It's usually None.

We use this approach instead of trying to seek/tell the file-like
object.

											
										
										
											2002-11-05 17:44:06 -04:00
+								                    # There was no separating blank line as mandated by RFC
 								                    # 2822, but we're in non-strict mode.  So just offer up
 								                    # this current line as the first body line.
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								                    fp.unreadline(line)
-												parse(), _parseheaders(), _parsebody(): A fix for SF bug #633527,
where in lax parsing, the first non-header line after a header block
(e.g. the first line not containing a colon, and not a continuation),
can be treated as the first body line, even without the RFC mandated
blank line separator.

rfc822 had this behavior, and I vaguely remember problems with this,
but can't remember details.  In any event, all the tests still pass,
so I guess we'll find out. ;/

This patch works by returning the non-header, non-continuation line
from _parseheaders() and using that as the first body line prepended
to fp.read() if given.  It's usually None.

We use this approach instead of trying to seek/tell the file-like
object.

											
										
										
											2002-11-05 17:44:06 -04:00
+								                    break
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								            if lastheader:
 								                container[lastheader] = NL.join(lastvalue)
 								            lastheader = line[:i]
 								            lastvalue = [line[i+1:].lstrip()]
 								        # Make sure we retain the last header
 								        if lastheader:
 								            container[lastheader] = NL.join(lastvalue)
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								        return
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								    def _parsemessage(self, container, fp):
 								        # Parse the body. We walk through the body from top to bottom,
 								        # keeping track of the current multipart nesting as we go.
 								        # We return the object that gets the data at the end of this
 								        # block.
-												_parsebody(): Use get_boundary() and get_type().

    Also, add a clause to the big-if to handle message/delivery-status
    content types.  These create a message with subparts that are
    Message instances, which best represent the header blocks of this
    content type.

											
										
										
											2001-09-26 02:44:09 -03:00
+								        boundary = container.get_boundary()
-												_parsebody(): Use get_content_type() instead of the deprecated
get_type().  Also, one of the regular expressions is constant so might
as well make it a module global.  And, when splitting up digests,
handle lineseps that are longer than 1 character in length
(e.g. \r\n).

											
										
										
											2002-10-07 14:27:35 -03:00
+								        isdigest = (container.get_content_type() == 'multipart/digest')
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								        if boundary:
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								            separator = '--' + boundary
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								            boundaryRE = re.compile(
 								                    r'(?P<sep>' + re.escape(separator) +
 								                    r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
 								            preamble, matchobj = fp.readuntil(boundaryRE)
 								            if not matchobj:
 								                # Broken - we hit the end of file. Just set the body
 								                # to the text.
 								                container.set_payload(preamble)
 								                return container
 								            if preamble:
 								                container.preamble = preamble
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								            else:
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								                # The module docs specify an empty preamble is None, not ''
 								                container.preamble = None
 								            while 1:
 								                subobj = self._class()
-												Whitespace normalization.

											
										
										
											2002-08-23 15:19:30 -03:00
+								                if isdigest:
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								                    subobj.set_default_type('message/rfc822')
 								                    firstline = fp.peekline()
 								                    if firstline.strip():
 								                        # we have MIME headers. all good.
 								                        self._parseheaders(subobj, fp)
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								                    else:
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								                        # no MIME headers. this is allowed for multipart/digest
 								                        # Consume the extra blank line
 								                        fp.readline()
 								                        pass
-												Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different than Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

											
										
										
											2002-07-08 23:50:02 -03:00
+								                else:
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								                    self._parseheaders(subobj, fp)
 								                container.attach(subobj)
 								                maintype = subobj.get_content_maintype()
 								                hassubparts = (subobj.get_content_maintype() in
 								                                                ( "message", "multipart" ))
 								                if hassubparts:
 								                    subobj = self._parsemessage(subobj, fp)
 								                trailer, matchobj = fp.readuntil(boundaryRE)
 								                if matchobj is None or trailer:
 								                    mo = re.search('(?P<sep>\r\n|\r|\n){2}$', trailer)
 								                    if not mo:
 								                        mo = re.search('(?P<sep>\r\n|\r|\n)$', trailer)
 								                        if not mo:
 								                            raise Errors.BoundaryError(
 								                          'No terminating boundary and no trailing empty line')
 								                    linesep = mo.group('sep')
 								                    trailer = trailer[:-len(linesep)]
 								                if trailer:
 								                    self._attach_trailer(subobj, trailer)
 								                if matchobj is None or matchobj.group('end'):
 								                    # That was the last piece of data. Let our caller attach
 								                    # the epilogue to us. But before we do that, push the
 								                    # line ending of the match group back into the readline
 								                    # buffer, as it's part of the epilogue.
 								                    if matchobj:
 								                        fp.unreadline(matchobj.group('linesep'))
 								                    return container
 								        elif container.get_content_maintype() == "multipart":
-												Sync'ing with standalone email package 2.0.1.  This adds support for
non-us-ascii character sets in headers and bodies.  Some API changes
(with DeprecationWarnings for the old APIs).  Better RFC-compliant
implementations of base64 and quoted-printable.

Updated test cases.  Documentation updates to follow (after I finish
writing them ;).

											
										
										
											2002-04-10 18:01:31 -03:00
+								            # Very bad.  A message is a multipart with no boundary!
 								            raise Errors.BoundaryError(
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								                    'multipart message with no defined boundary')
 								        elif container.get_content_maintype() == "message":
 								            ct = container.get_content_type()
 								            if ct == "message/rfc822":
 								                submessage = self._class()
 								                self._parseheaders(submessage, fp)
 								                self._parsemessage(submessage, fp)
 								                container.attach(submessage)
 								                return submessage
 								            elif ct == "message/delivery-status":
 								                # This special kind of type contains blocks of headers
 								                # separated by a blank line.  We'll represent each header
 								                # block as a separate Message object
 								                while 1:
 								                    nextblock = self._class()
 								                    self._parseheaders(nextblock, fp)
 								                    container.attach(nextblock)
 								                    # next peek ahead to see whether we've hit the end or not
 								                    nextline = fp.peekline()
 								                    if nextline[:2] == "--":
 								                        break
 								                return container
 								            else:
 								                # Other sort of message object (e.g. external-body)
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								                msg = self._class()
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								                self._parsemessage(msg, fp)
 								                container.attach(msg)
 								                return msg
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								        else:
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								            # single body section. We let our caller set the payload.
 								            return container
-												HeaderParser: A new subclass of Parser which only parses the message
headers.  It does not parse the body of the message, instead simply
assigning it as a string to the container's payload.  This can be much
faster when you're only interested in a message's header.

											
										
										
											2001-10-11 12:43:00 -03:00
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								    def _attach_trailer(self, obj, trailer):
 								        """Attach trailing text to obj.

 								        If obj's main content type is "message" or "multipart", the
 								        text is stored as obj.epilogue; for any other main type it is
 								        set as obj's payload via set_payload().
 								        """
 								        if obj.get_content_maintype() in ("message", "multipart"):
 								            obj.epilogue = trailer
 								        else:
 								            obj.set_payload(trailer)
-												HeaderParser: A new subclass of Parser which only parses the message
headers.  It does not parse the body of the message, instead simply
assigning it as a string to the container's payload.  This can be much
faster when you're only interested in a message's header.

											
										
										
											2001-10-11 12:43:00 -03:00
 								class HeaderParser(Parser):
 								    """A subclass of Parser, this one only meaningfully parses message headers.
 								    This class can be used if all you're interested in is the headers of a
 								    message.  While it consumes the message body, it does not parse it, but
 								    simply makes it available as a string payload.
 								    Parsing with this subclass can be considerably faster if all you're
 								    interested in is the message headers.
 								    """
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								    def _parsemessage(self, container, fp):
 								        """Store the rest of fp, unparsed, as container's payload.

 								        Everything returned by fp.read() is passed to
 								        container.set_payload() as-is; no body parsing is done.
 								        Always returns None.
 								        NOTE(review): presumably overrides Parser._parsemessage, whose
 								        definition is not visible here -- confirm.
 								        """
-												HeaderParser: A new subclass of Parser which only parses the message
headers.  It does not parse the body of the message, instead simply
assigning it as a string to the container's payload.  This can be much
faster when you're only interested in a message's header.

											
										
										
											2001-10-11 12:43:00 -03:00
+								        # Consume, but do not parse, the body
-												parse(), _parseheaders(), _parsebody(): A fix for SF bug #633527,
where in lax parsing, the first non-header line after a header block
(e.g. the first line not containing a colon, and not a continuation),
can be treated as the first body line, even without the RFC mandated
blank line separator.

rfc822 had this behavior, and I vaguely remember problems with this,
but can't remember details.  In any event, all the tests still pass,
so I guess we'll find out. ;/

This patch works by returning the non-header, non-continuation line
from _parseheader() and using that as the first header line prepended
to fp.read() if given.  It's usually None.

We use this approach instead of trying to seek/tell the file-like
object.

											
										
										
											2002-11-05 17:44:06 -04:00
+								        text = fp.read()
 								        container.set_payload(text)
-												Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

											
										
										
											2004-03-20 13:31:29 -04:00
+								        return None