2002-01-27 02:48:02 -04:00
|
|
|
|
# Copyright (C) 2001,2002 Python Software Foundation
|
2001-09-23 00:17:28 -03:00
|
|
|
|
# Author: barry@zope.com (Barry Warsaw)
|
|
|
|
|
|
|
|
|
|
"""A parser of RFC 2822 and MIME email messages.
|
|
|
|
|
"""
|
|
|
|
|
|
2002-05-19 20:51:50 -03:00
|
|
|
|
import re
|
2001-09-23 00:17:28 -03:00
|
|
|
|
from cStringIO import StringIO
|
2002-01-27 02:48:02 -04:00
|
|
|
|
from types import ListType
|
2001-09-23 00:17:28 -03:00
|
|
|
|
|
2002-06-02 16:12:03 -03:00
|
|
|
|
from email import Errors
|
|
|
|
|
from email import Message
|
2001-09-23 00:17:28 -03:00
|
|
|
|
|
|
|
|
|
# Module-level string constants.  NL is the separator used to re-join the
# physical lines of a folded (continued) header value.
EMPTYSTRING = ''
NL = '\n'
|
|
|
|
|
|
2001-10-04 14:05:11 -03:00
|
|
|
|
|
2001-09-23 00:17:28 -03:00
|
|
|
|
class Parser:
|
2002-07-19 19:25:34 -03:00
|
|
|
|
def __init__(self, _class=Message.Message, strict=0):
|
2001-09-23 00:17:28 -03:00
|
|
|
|
"""Parser of RFC 2822 and MIME email messages.
|
|
|
|
|
|
|
|
|
|
Creates an in-memory object tree representing the email message, which
|
|
|
|
|
can then be manipulated and turned over to a Generator to return the
|
|
|
|
|
textual representation of the message.
|
|
|
|
|
|
|
|
|
|
The string must be formatted as a block of RFC 2822 headers and header
|
|
|
|
|
continuation lines, optionally preceeded by a `Unix-from' header. The
|
|
|
|
|
header block is terminated either by the end of the string or by a
|
|
|
|
|
blank line.
|
|
|
|
|
|
|
|
|
|
_class is the class to instantiate for new message objects when they
|
|
|
|
|
must be created. This class must have a constructor that can take
|
|
|
|
|
zero arguments. Default is Message.Message.
|
2002-07-08 23:50:02 -03:00
|
|
|
|
|
|
|
|
|
Optional strict tells the parser to be strictly RFC compliant or to be
|
|
|
|
|
more forgiving in parsing of ill-formatted MIME documents. When
|
|
|
|
|
non-strict mode is used, the parser will try to make up for missing or
|
|
|
|
|
erroneous boundaries and other peculiarities seen in the wild.
|
2002-07-19 19:25:34 -03:00
|
|
|
|
Default is non-strict parsing.
|
2001-09-23 00:17:28 -03:00
|
|
|
|
"""
|
|
|
|
|
self._class = _class
|
2002-07-08 23:50:02 -03:00
|
|
|
|
self._strict = strict
|
2001-09-23 00:17:28 -03:00
|
|
|
|
|
2002-07-08 23:50:02 -03:00
|
|
|
|
def parse(self, fp, headersonly=0):
|
2001-09-23 00:17:28 -03:00
|
|
|
|
root = self._class()
|
|
|
|
|
self._parseheaders(root, fp)
|
2002-07-08 23:50:02 -03:00
|
|
|
|
if not headersonly:
|
|
|
|
|
self._parsebody(root, fp)
|
2001-09-23 00:17:28 -03:00
|
|
|
|
return root
|
|
|
|
|
|
2002-07-08 23:50:02 -03:00
|
|
|
|
def parsestr(self, text, headersonly=0):
|
|
|
|
|
return self.parse(StringIO(text), headersonly=headersonly)
|
2001-09-23 00:17:28 -03:00
|
|
|
|
|
|
|
|
|
def _parseheaders(self, container, fp):
|
|
|
|
|
# Parse the headers, returning a list of header/value pairs. None as
|
|
|
|
|
# the header means the Unix-From header.
|
|
|
|
|
lastheader = ''
|
|
|
|
|
lastvalue = []
|
|
|
|
|
lineno = 0
|
|
|
|
|
while 1:
|
2002-04-10 18:01:31 -03:00
|
|
|
|
# Don't strip the line before we test for the end condition,
|
|
|
|
|
# because whitespace-only header lines are RFC compliant
|
|
|
|
|
# continuation lines.
|
|
|
|
|
line = fp.readline()
|
|
|
|
|
if not line:
|
2001-09-23 00:17:28 -03:00
|
|
|
|
break
|
2002-04-10 18:01:31 -03:00
|
|
|
|
line = line.splitlines()[0]
|
|
|
|
|
if not line:
|
|
|
|
|
break
|
|
|
|
|
# Ignore the trailing newline
|
2001-09-23 00:17:28 -03:00
|
|
|
|
lineno += 1
|
|
|
|
|
# Check for initial Unix From_ line
|
|
|
|
|
if line.startswith('From '):
|
|
|
|
|
if lineno == 1:
|
|
|
|
|
container.set_unixfrom(line)
|
|
|
|
|
continue
|
2002-07-08 23:50:02 -03:00
|
|
|
|
elif self._strict:
|
2001-09-23 00:17:28 -03:00
|
|
|
|
raise Errors.HeaderParseError(
|
|
|
|
|
'Unix-from in headers after first rfc822 header')
|
2002-07-08 23:50:02 -03:00
|
|
|
|
else:
|
|
|
|
|
# ignore the wierdly placed From_ line
|
|
|
|
|
# XXX: maybe set unixfrom anyway? or only if not already?
|
|
|
|
|
continue
|
2001-09-23 00:17:28 -03:00
|
|
|
|
# Header continuation line
|
|
|
|
|
if line[0] in ' \t':
|
|
|
|
|
if not lastheader:
|
|
|
|
|
raise Errors.HeaderParseError(
|
|
|
|
|
'Continuation line seen before first header')
|
|
|
|
|
lastvalue.append(line)
|
|
|
|
|
continue
|
|
|
|
|
# Normal, non-continuation header. BAW: this should check to make
|
|
|
|
|
# sure it's a legal header, e.g. doesn't contain spaces. Also, we
|
|
|
|
|
# should expose the header matching algorithm in the API, and
|
|
|
|
|
# allow for a non-strict parsing mode (that ignores the line
|
|
|
|
|
# instead of raising the exception).
|
|
|
|
|
i = line.find(':')
|
|
|
|
|
if i < 0:
|
2002-07-08 23:50:02 -03:00
|
|
|
|
if self._strict:
|
|
|
|
|
raise Errors.HeaderParseError(
|
|
|
|
|
"Not a header, not a continuation: ``%s''"%line)
|
|
|
|
|
elif lineno == 1 and line.startswith('--'):
|
|
|
|
|
# allow through duplicate boundary tags.
|
|
|
|
|
continue
|
|
|
|
|
else:
|
|
|
|
|
raise Errors.HeaderParseError(
|
|
|
|
|
"Not a header, not a continuation: ``%s''"%line)
|
2001-09-23 00:17:28 -03:00
|
|
|
|
if lastheader:
|
|
|
|
|
container[lastheader] = NL.join(lastvalue)
|
|
|
|
|
lastheader = line[:i]
|
|
|
|
|
lastvalue = [line[i+1:].lstrip()]
|
|
|
|
|
# Make sure we retain the last header
|
|
|
|
|
if lastheader:
|
|
|
|
|
container[lastheader] = NL.join(lastvalue)
|
|
|
|
|
|
|
|
|
|
def _parsebody(self, container, fp):
|
|
|
|
|
# Parse the body, but first split the payload on the content-type
|
|
|
|
|
# boundary if present.
|
2001-09-26 02:44:09 -03:00
|
|
|
|
boundary = container.get_boundary()
|
|
|
|
|
isdigest = (container.get_type() == 'multipart/digest')
|
2001-09-23 00:17:28 -03:00
|
|
|
|
# If there's a boundary, split the payload text into its constituent
|
|
|
|
|
# parts and parse each separately. Otherwise, just parse the rest of
|
|
|
|
|
# the body as a single message. Note: any exceptions raised in the
|
|
|
|
|
# recursive parse need to have their line numbers coerced.
|
|
|
|
|
if boundary:
|
|
|
|
|
preamble = epilogue = None
|
|
|
|
|
# Split into subparts. The first boundary we're looking for won't
|
2002-07-18 20:09:09 -03:00
|
|
|
|
# always have a leading newline since we're at the start of the
|
|
|
|
|
# body text, and there's not always a preamble before the first
|
|
|
|
|
# boundary.
|
2001-09-23 00:17:28 -03:00
|
|
|
|
separator = '--' + boundary
|
|
|
|
|
payload = fp.read()
|
2002-08-23 15:19:30 -03:00
|
|
|
|
# We use an RE here because boundaries can have trailing
|
2002-07-18 20:09:09 -03:00
|
|
|
|
# whitespace.
|
|
|
|
|
mo = re.search(
|
|
|
|
|
r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
|
|
|
|
|
payload)
|
|
|
|
|
if not mo:
|
2002-09-10 13:14:56 -03:00
|
|
|
|
if self._strict:
|
|
|
|
|
raise Errors.BoundaryError(
|
|
|
|
|
"Couldn't find starting boundary: %s" % boundary)
|
|
|
|
|
container.set_payload(payload)
|
|
|
|
|
return
|
2002-07-18 20:09:09 -03:00
|
|
|
|
start = mo.start()
|
2001-09-23 00:17:28 -03:00
|
|
|
|
if start > 0:
|
|
|
|
|
# there's some pre-MIME boundary preamble
|
|
|
|
|
preamble = payload[0:start]
|
2002-05-19 20:51:50 -03:00
|
|
|
|
# Find out what kind of line endings we're using
|
2002-07-18 20:09:09 -03:00
|
|
|
|
start += len(mo.group('sep')) + len(mo.group('ws'))
|
2002-05-19 20:51:50 -03:00
|
|
|
|
cre = re.compile('\r\n|\r|\n')
|
|
|
|
|
mo = cre.search(payload, start)
|
|
|
|
|
if mo:
|
2002-07-08 23:50:02 -03:00
|
|
|
|
start += len(mo.group(0))
|
2002-05-19 20:51:50 -03:00
|
|
|
|
# We create a compiled regexp first because we need to be able to
|
|
|
|
|
# specify the start position, and the module function doesn't
|
|
|
|
|
# support this signature. :(
|
|
|
|
|
cre = re.compile('(?P<sep>\r\n|\r|\n)' +
|
|
|
|
|
re.escape(separator) + '--')
|
|
|
|
|
mo = cre.search(payload, start)
|
2002-07-08 23:50:02 -03:00
|
|
|
|
if mo:
|
|
|
|
|
terminator = mo.start()
|
|
|
|
|
linesep = mo.group('sep')
|
|
|
|
|
if mo.end() < len(payload):
|
2002-07-18 20:09:09 -03:00
|
|
|
|
# There's some post-MIME boundary epilogue
|
2002-07-08 23:50:02 -03:00
|
|
|
|
epilogue = payload[mo.end():]
|
|
|
|
|
elif self._strict:
|
2001-09-23 00:17:28 -03:00
|
|
|
|
raise Errors.BoundaryError(
|
2002-07-08 23:50:02 -03:00
|
|
|
|
"Couldn't find terminating boundary: %s" % boundary)
|
|
|
|
|
else:
|
2002-07-18 20:09:09 -03:00
|
|
|
|
# Handle the case of no trailing boundary. Check that it ends
|
|
|
|
|
# in a blank line. Some cases (spamspamspam) don't even have
|
|
|
|
|
# that!
|
|
|
|
|
mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
|
2002-07-08 23:50:02 -03:00
|
|
|
|
if not mo:
|
2002-07-18 20:09:09 -03:00
|
|
|
|
mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
|
|
|
|
|
if not mo:
|
|
|
|
|
raise Errors.BoundaryError(
|
|
|
|
|
'No terminating boundary and no trailing empty line')
|
|
|
|
|
linesep = mo.group('sep')
|
|
|
|
|
terminator = len(payload)
|
2001-09-23 00:17:28 -03:00
|
|
|
|
# We split the textual payload on the boundary separator, which
|
2002-07-08 23:50:02 -03:00
|
|
|
|
# includes the trailing newline. If the container is a
|
2002-08-23 15:19:30 -03:00
|
|
|
|
# multipart/digest then the subparts are by default message/rfc822
|
|
|
|
|
# instead of text/plain. In that case, they'll have a optional
|
|
|
|
|
# block of MIME headers, then an empty line followed by the
|
2002-07-08 23:50:02 -03:00
|
|
|
|
# message headers.
|
2002-07-18 20:09:09 -03:00
|
|
|
|
parts = re.split(
|
|
|
|
|
linesep + re.escape(separator) + r'[ \t]*' + linesep,
|
|
|
|
|
payload[start:terminator])
|
2001-09-23 00:17:28 -03:00
|
|
|
|
for part in parts:
|
2002-08-23 15:19:30 -03:00
|
|
|
|
if isdigest:
|
2002-07-08 23:50:02 -03:00
|
|
|
|
if part[0] == linesep:
|
|
|
|
|
# There's no header block so create an empty message
|
|
|
|
|
# object as the container, and lop off the newline so
|
|
|
|
|
# we can parse the sub-subobject
|
|
|
|
|
msgobj = self._class()
|
|
|
|
|
part = part[1:]
|
|
|
|
|
else:
|
|
|
|
|
parthdrs, part = part.split(linesep+linesep, 1)
|
|
|
|
|
# msgobj in this case is the "message/rfc822" container
|
|
|
|
|
msgobj = self.parsestr(parthdrs, headersonly=1)
|
|
|
|
|
# while submsgobj is the message itself
|
|
|
|
|
submsgobj = self.parsestr(part)
|
|
|
|
|
msgobj.attach(submsgobj)
|
|
|
|
|
msgobj.set_default_type('message/rfc822')
|
|
|
|
|
else:
|
|
|
|
|
msgobj = self.parsestr(part)
|
2001-09-23 00:17:28 -03:00
|
|
|
|
container.preamble = preamble
|
|
|
|
|
container.epilogue = epilogue
|
2002-04-10 18:01:31 -03:00
|
|
|
|
container.attach(msgobj)
|
|
|
|
|
elif container.get_main_type() == 'multipart':
|
|
|
|
|
# Very bad. A message is a multipart with no boundary!
|
|
|
|
|
raise Errors.BoundaryError(
|
|
|
|
|
'multipart message with no defined boundary')
|
2001-09-26 02:44:09 -03:00
|
|
|
|
elif container.get_type() == 'message/delivery-status':
|
|
|
|
|
# This special kind of type contains blocks of headers separated
|
|
|
|
|
# by a blank line. We'll represent each header block as a
|
|
|
|
|
# separate Message object
|
|
|
|
|
blocks = []
|
|
|
|
|
while 1:
|
|
|
|
|
blockmsg = self._class()
|
|
|
|
|
self._parseheaders(blockmsg, fp)
|
|
|
|
|
if not len(blockmsg):
|
|
|
|
|
# No more header blocks left
|
|
|
|
|
break
|
|
|
|
|
blocks.append(blockmsg)
|
|
|
|
|
container.set_payload(blocks)
|
|
|
|
|
elif container.get_main_type() == 'message':
|
2001-09-23 00:17:28 -03:00
|
|
|
|
# Create a container for the payload, but watch out for there not
|
|
|
|
|
# being any headers left
|
|
|
|
|
try:
|
|
|
|
|
msg = self.parse(fp)
|
|
|
|
|
except Errors.HeaderParseError:
|
|
|
|
|
msg = self._class()
|
|
|
|
|
self._parsebody(msg, fp)
|
2002-06-02 16:12:03 -03:00
|
|
|
|
container.attach(msg)
|
2001-09-23 00:17:28 -03:00
|
|
|
|
else:
|
2002-04-10 18:01:31 -03:00
|
|
|
|
container.set_payload(fp.read())
|
2001-10-11 12:43:00 -03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class HeaderParser(Parser):
    """A subclass of Parser, this one only meaningfully parses message headers.

    This class can be used if all you're interested in is the headers of a
    message.  While it consumes the message body, it does not parse it, but
    simply makes it available as a string payload.

    Parsing with this subclass can be considerably faster if all you're
    interested in is the message headers.
    """

    def _parsebody(self, container, fp):
        """Store everything left in fp on container as an unparsed payload."""
        # Consume but do not parse, the body
        container.set_payload(fp.read())
|