cpython/Lib/email/Parser.py

# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)

"""A parser of RFC 2822 and MIME email messages.
"""

from cStringIO import StringIO
from types import ListType

# Intrapackage imports
import Errors
import Message

EMPTYSTRING = ''
NL = '\n'


class Parser:
    """Build a message object tree from RFC 2822 / MIME formatted text.

    parse() and parsestr() are the public entry points; _parseheaders() and
    _parsebody() do the work and may be overridden by subclasses.
    """
    def __init__(self, _class=Message.Message):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.
        """
        self._class = _class

    def parse(self, fp):
        """Parse a message from the file-like object fp and return it.

        fp must provide readline() and read().  A new _class instance is
        created, its headers are read from fp, and then the remainder of fp
        is consumed as the body.

        Raises Errors.HeaderParseError for a malformed header block and
        Errors.BoundaryError for a malformed multipart body.
        """
        root = self._class()
        self._parseheaders(root, fp)
        self._parsebody(root, fp)
        return root

    def parsestr(self, text):
        """Parse a message held in the string text and return it.

        Equivalent to calling parse() on a StringIO wrapped around text.
        """
        return self.parse(StringIO(text))

    def _parseheaders(self, container, fp):
        """Read one header block from fp and store its headers on container.

        Reading stops at end of file or at the first empty line, which is
        consumed.  A leading `From ' line is handed to
        container.set_unixfrom(); every other header is stored with
        container[name] = value.  Continuation lines are kept verbatim
        (leading whitespace included) and joined to the header's first line
        with NL.  Nothing is returned.

        Raises Errors.HeaderParseError when a `From ' line appears after the
        first line, when a continuation line appears before any header, or
        when a line is neither a header nor a continuation.
        """
        lastheader = ''
        lastvalue = []
        # Counts the non-empty lines seen so far; only used to decide whether
        # a `From ' line is the very first line of the block.
        lineno = 0
        while 1:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                # End of file
                break
            # Ignore the trailing newline
            line = line.splitlines()[0]
            if not line:
                # The empty line terminating the header block
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                else:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                raise Errors.HeaderParseError(
                    'Not a header, not a continuation')
            # A new header starts here, so the previous one is now complete
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)

    def _parsebody(self, container, fp):
        """Consume the rest of fp and store it as container's payload.

        How the body is interpreted depends on the headers already stored on
        container:

        - If container.get_boundary() returns a true value, the body is split
          on that boundary; each subpart is parsed recursively and attached
          with container.attach().  Text before the first boundary and after
          the closing boundary is stored as container.preamble and
          container.epilogue (None when absent).
        - A multipart main type without a boundary is an error.
        - message/delivery-status bodies become a list of message objects,
          one per header block.
        - Any other message/* body is parsed as a nested message.
        - Everything else is stored as a string.

        Raises Errors.BoundaryError when a multipart has no boundary
        parameter, or when its starting or terminating boundary line cannot
        be found in the body.
        """
        # Parse the body, but first split the payload on the content-type
        # boundary if present.
        boundary = container.get_boundary()
        isdigest = (container.get_type() == 'multipart/digest')
        # If there's a boundary, split the payload text into its constituent
        # parts and parse each separately.  Otherwise, just parse the rest of
        # the body as a single message.  Note: any exceptions raised in the
        # recursive parse need to have their line numbers coerced.
        if boundary:
            preamble = epilogue = None
            # Split into subparts.  The first boundary we're looking for won't
            # have the leading newline since we're at the start of the body
            # text.
            separator = '--' + boundary
            payload = fp.read()
            start = payload.find(separator)
            if start < 0:
                raise Errors.BoundaryError(
                    "Couldn't find starting boundary: %s" % boundary)
            if start > 0:
                # there's some pre-MIME boundary preamble
                preamble = payload[0:start]
            # Step over the boundary line itself: the separator, one
            # character for its line ending and, for a digest, one more for
            # the blank line that follows (isdigest is used as 0 or 1).
            # NOTE(review): this arithmetic, like the '\n' literals below,
            # only accounts for single-character line endings.
            start += len(separator) + 1 + isdigest
            terminator = payload.find('\n' + separator + '--', start)
            if terminator < 0:
                raise Errors.BoundaryError(
                    "Couldn't find terminating boundary: %s" % boundary)
            # terminator indexes the newline in front of the closing boundary;
            # the 3 skips that newline plus the trailing '--'.
            epilogue_start = terminator + len(separator) + 3
            if epilogue_start < len(payload):
                # there's some post-MIME boundary epilogue
                epilogue = payload[epilogue_start:]
            # We split the textual payload on the boundary separator, which
            # includes the trailing newline.  If the container is a
            # multipart/digest then the subparts are by default message/rfc822
            # instead of text/plain.  In that case, they'll have an extra
            # newline before the headers to distinguish the message's headers
            # from the subpart headers.
            if isdigest:
                separator += '\n\n'
            else:
                separator += '\n'
            parts = payload[start:terminator].split('\n' + separator)
            # These are the same for every subpart, so set them once rather
            # than on each pass through the loop.
            container.preamble = preamble
            container.epilogue = epilogue
            for part in parts:
                container.attach(self.parsestr(part))
        elif container.get_main_type() == 'multipart':
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        elif container.get_type() == 'message/delivery-status':
            # This special kind of type contains blocks of headers separated
            # by a blank line.  We'll represent each header block as a
            # separate Message object
            blocks = []
            while 1:
                blockmsg = self._class()
                self._parseheaders(blockmsg, fp)
                if not len(blockmsg):
                    # No more header blocks left
                    break
                blocks.append(blockmsg)
            container.set_payload(blocks)
        elif container.get_main_type() == 'message':
            # Create a container for the payload, but watch out for there not
            # being any headers left
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                # Fall back to a header-less message holding whatever is
                # still unread in fp.
                msg = self._class()
                self._parsebody(msg, fp)
            container.set_payload(msg)
        else:
            container.set_payload(fp.read())


class HeaderParser(Parser):
    """Parser variant for callers that only care about message headers.

    Headers are parsed exactly as in Parser.  The body is still read from the
    input, but it is never interpreted: whatever follows the header block is
    stored unchanged as a single string payload.

    Skipping body parsing can make this class considerably faster than Parser
    when only the headers are of interest.
    """
    def _parsebody(self, container, fp):
        # Drain the remainder of fp and hand it over as-is; no MIME structure
        # is examined here.
        body = fp.read()
        container.set_payload(body)
-												_parsebody(): When adding subparts to a multipart container, make sure
that the first subpart added makes the payload a list object.
Otherwise, a multipart/* with only one subpart will not have the
proper structure.

											
										
										
											2002-01-27 02:48:02 -04:00
+								# Copyright (C) 2001,2002 Python Software Foundation
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								# Author: barry@zope.com (Barry Warsaw)
 								"""A parser of RFC 2822 and MIME email messages.
 								"""
 								from cStringIO import StringIO
-												_parsebody(): When adding subparts to a multipart container, make sure
that the first subpart added makes the payload a list object.
Otherwise, a multipart/* with only one subpart will not have the
proper structure.

											
										
										
											2002-01-27 02:48:02 -04:00
+								from types import ListType
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
 								# Intrapackage imports
 								import Errors
 								import Message
 								EMPTYSTRING = ''
 								NL = '\n'
-												Give me back my page breaks.

											
										
										
											2001-10-04 14:05:11 -03:00
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								class Parser:
 								    def __init__(self, _class=Message.Message):
 								        """Parser of RFC 2822 and MIME email messages.
 								        Creates an in-memory object tree representing the email message, which
 								        can then be manipulated and turned over to a Generator to return the
 								        textual representation of the message.
 								        The string must be formatted as a block of RFC 2822 headers and header
 								        continuation lines, optionally preceeded by a `Unix-from' header.  The
 								        header block is terminated either by the end of the string or by a
 								        blank line.
 								        _class is the class to instantiate for new message objects when they
 								        must be created.  This class must have a constructor that can take
 								        zero arguments.  Default is Message.Message.
 								        """
 								        self._class = _class
 								    def parse(self, fp):
 								        root = self._class()
 								        self._parseheaders(root, fp)
 								        self._parsebody(root, fp)
 								        return root
 								    def parsestr(self, text):
 								        return self.parse(StringIO(text))
 								    def _parseheaders(self, container, fp):
 								        # Parse the headers, returning a list of header/value pairs.  None as
 								        # the header means the Unix-From header.
 								        lastheader = ''
 								        lastvalue = []
 								        lineno = 0
 								        while 1:
-												Sync'ing with standalone email package 2.0.1.  This adds support for
non-us-ascii character sets in headers and bodies.  Some API changes
(with DeprecationWarnings for the old APIs).  Better RFC-compliant
implementations of base64 and quoted-printable.

Updated test cases.  Documentation updates to follow (after I finish
writing them ;).

											
										
										
											2002-04-10 18:01:31 -03:00
+								            # Don't strip the line before we test for the end condition,
 								            # because whitespace-only header lines are RFC compliant
 								            # continuation lines.
 								            line = fp.readline()
 								            if not line:
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								                break
-												Sync'ing with standalone email package 2.0.1.  This adds support for
non-us-ascii character sets in headers and bodies.  Some API changes
(with DeprecationWarnings for the old APIs).  Better RFC-compliant
implementations of base64 and quoted-printable.

Updated test cases.  Documentation updates to follow (after I finish
writing them ;).

											
										
										
											2002-04-10 18:01:31 -03:00
+								            line = line.splitlines()[0]
 								            if not line:
 								                break
 								            # Ignore the trailing newline
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								            lineno += 1
 								            # Check for initial Unix From_ line
 								            if line.startswith('From '):
 								                if lineno == 1:
 								                    container.set_unixfrom(line)
 								                    continue
 								                else:
 								                    raise Errors.HeaderParseError(
 								                        'Unix-from in headers after first rfc822 header')
 								            # Header continuation line
 								            if line[0] in ' \t':
 								                if not lastheader:
 								                    raise Errors.HeaderParseError(
 								                        'Continuation line seen before first header')
 								                lastvalue.append(line)
 								                continue
 								            # Normal, non-continuation header.  BAW: this should check to make
 								            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
 								            # should expose the header matching algorithm in the API, and
 								            # allow for a non-strict parsing mode (that ignores the line
 								            # instead of raising the exception).
 								            i = line.find(':')
 								            if i < 0:
 								                raise Errors.HeaderParseError(
 								                    'Not a header, not a continuation')
 								            if lastheader:
 								                container[lastheader] = NL.join(lastvalue)
 								            lastheader = line[:i]
 								            lastvalue = [line[i+1:].lstrip()]
 								        # Make sure we retain the last header
 								        if lastheader:
 								            container[lastheader] = NL.join(lastvalue)
 								    def _parsebody(self, container, fp):
 								        # Parse the body, but first split the payload on the content-type
 								        # boundary if present.
-												_parsebody(): Use get_boundary() and get_type().

    Also, add a clause to the big-if to handle message/delivery-status
    content types.  These create a message with subparts that are
    Message instances, which best represent the header blocks of this
    content type.

											
										
										
											2001-09-26 02:44:09 -03:00
+								        boundary = container.get_boundary()
 								        isdigest = (container.get_type() == 'multipart/digest')
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								        # If there's a boundary, split the payload text into its constituent
 								        # parts and parse each separately.  Otherwise, just parse the rest of
 								        # the body as a single message.  Note: any exceptions raised in the
 								        # recursive parse need to have their line numbers coerced.
 								        if boundary:
 								            preamble = epilogue = None
 								            # Split into subparts.  The first boundary we're looking for won't
 								            # have the leading newline since we're at the start of the body
 								            # text.
 								            separator = '--' + boundary
 								            payload = fp.read()
 								            start = payload.find(separator)
 								            if start < 0:
 								                raise Errors.BoundaryError(
 								                    "Couldn't find starting boundary: %s" % boundary)
 								            if start > 0:
 								                # there's some pre-MIME boundary preamble
 								                preamble = payload[0:start]
 								            start += len(separator) + 1 + isdigest
 								            terminator = payload.find('\n' + separator + '--', start)
 								            if terminator < 0:
 								                raise Errors.BoundaryError(
 								                    "Couldn't find terminating boundary: %s" % boundary)
 								            if terminator+len(separator)+3 < len(payload):
 								                # there's some post-MIME boundary epilogue
 								                epilogue = payload[terminator+len(separator)+3:]
 								            # We split the textual payload on the boundary separator, which
 								            # includes the trailing newline.  If the container is a
 								            # multipart/digest then the subparts are by default message/rfc822
 								            # instead of text/plain.  In that case, they'll have an extra
 								            # newline before the headers to distinguish the message's headers
 								            # from the subpart headers.
 								            if isdigest:
 								                separator += '\n\n'
 								            else:
 								                separator += '\n'
 								            parts = payload[start:terminator].split('\n' + separator)
 								            for part in parts:
 								                msgobj = self.parsestr(part)
 								                container.preamble = preamble
 								                container.epilogue = epilogue
-												Sync'ing with standalone email package 2.0.1.  This adds support for
non-us-ascii character sets in headers and bodies.  Some API changes
(with DeprecationWarnings for the old APIs).  Better RFC-compliant
implementations of base64 and quoted-printable.

Updated test cases.  Documentation updates to follow (after I finish
writing them ;).

											
										
										
											2002-04-10 18:01:31 -03:00
+								                container.attach(msgobj)
 								        elif container.get_main_type() == 'multipart':
 								            # Very bad.  A message is a multipart with no boundary!
 								            raise Errors.BoundaryError(
 								                'multipart message with no defined boundary')
-												_parsebody(): Use get_boundary() and get_type().

    Also, add a clause to the big-if to handle message/delivery-status
    content types.  These create a message with subparts that are
    Message instances, which best represent the header blocks of this
    content type.

											
										
										
											2001-09-26 02:44:09 -03:00
+								        elif container.get_type() == 'message/delivery-status':
 								            # This special kind of type contains blocks of headers separated
 								            # by a blank line.  We'll represent each header block as a
 								            # separate Message object
 								            blocks = []
 								            while 1:
 								                blockmsg = self._class()
 								                self._parseheaders(blockmsg, fp)
 								                if not len(blockmsg):
 								                    # No more header blocks left
 								                    break
 								                blocks.append(blockmsg)
 								            container.set_payload(blocks)
 								        elif container.get_main_type() == 'message':
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								            # Create a container for the payload, but watch out for there not
 								            # being any headers left
 								            try:
 								                msg = self.parse(fp)
 								            except Errors.HeaderParseError:
 								                msg = self._class()
 								                self._parsebody(msg, fp)
-												Sync'ing with standalone email package 2.0.1.  This adds support for
non-us-ascii character sets in headers and bodies.  Some API changes
(with DeprecationWarnings for the old APIs).  Better RFC-compliant
implementations of base64 and quoted-printable.

Updated test cases.  Documentation updates to follow (after I finish
writing them ;).

											
										
										
											2002-04-10 18:01:31 -03:00
+								            container.set_payload(msg)
-												The email package version 1.0, prototyped as mimelib
<http://sf.net/projects/mimelib>.  There /are/ API differences between
mimelib and email, but most of the implementations are shared (except
where cool Py2.2 stuff like generators are used).

											
										
										
											2001-09-23 00:17:28 -03:00
+								        else:
-												Sync'ing with standalone email package 2.0.1.  This adds support for
non-us-ascii character sets in headers and bodies.  Some API changes
(with DeprecationWarnings for the old APIs).  Better RFC-compliant
implementations of base64 and quoted-printable.

Updated test cases.  Documentation updates to follow (after I finish
writing them ;).

											
										
										
											2002-04-10 18:01:31 -03:00
+								            container.set_payload(fp.read())
-												HeaderParser: A new subclass of Parser which only parses the message
headers.  It does not parse the body of the message, instead simply
assigning it as a string to the container's payload.  This can be much
faster when you're only interested in a message's header.

											
										
										
											2001-10-11 12:43:00 -03:00
 								class HeaderParser(Parser):
 								    """A subclass of Parser, this one only meaningfully parses message headers.
 								    This class can be used if all you're interested in is the headers of a
 								    message.  While it consumes the message body, it does not parse it, but
 								    simply makes it available as a string payload.
 								    Parsing with this subclass can be considerably faster if all you're
 								    interested in is the message headers.
 								    """
 								    def _parsebody(self, container, fp):
 								        # Consume but do not parse, the body
 								        container.set_payload(fp.read())