247 lines
7.3 KiB
Python
247 lines
7.3 KiB
Python
"""Generic MIME parser.

Classes:

    MimeParser - Generic MIME parser.

Exceptions:

    MimeError - Exception raised by MimeParser class.

XXX To do:

- Content-transfer-encoding issues
- Use Content-length header in rawbody()?
- Cache parts instead of reparsing each time
- The message strings in exceptions could use some work

"""
||
|
|
||
|
from types import *  # Python types, not MIME types :-)
import mimetools
import re
import regex
import string

import SubFile
|
||
|
|
||
|
|
||
|
class MimeError(Exception):
    """Exception raised by the MimeParser class."""


class MimeParser:

    """Generic MIME parser.

    This requires a seekable file.

    """

    # Matches a Content-length header line and captures its decimal value.
    _re_content_length = re.compile(r'content-length:[ \t]*([0-9]+)',
                                    re.IGNORECASE)

    def __init__(self, fp):
        """Constructor: store the file pointer and parse the headers.

        The position of fp at the time of the call is taken as the
        start of the message; fp must support tell().
        """
        self._fp = fp
        self._start = fp.tell()
        self._headers = h = mimetools.Message(fp)
        # The headers have been read at this point, so the current
        # position is recorded as the start of the body.
        self._bodystart = fp.tell()
        self._multipart = h.getmaintype() == 'multipart'

    def multipart(self):
        """Return whether this is a multipart message."""
        return self._multipart

    def headers(self):
        """Return the headers of the MIME message, as a Message object."""
        return self._headers

    def rawbody(self):
        """Return the raw body of the MIME message, as a file-like object.

        This is a fairly low-level interface -- for a multipart
        message, you'd have to parse the body yourself, and it doesn't
        translate the Content-transfer-encoding.

        """
        # XXX Use Content-length to set end if it exists?
        return SubFile.SubFile(self._fp, self._bodystart)

    def body(self):
        """Return the body of a 1-part MIME message, as a file-like object.

        This should interpret the Content-transfer-encoding, if any
        (XXX currently it doesn't).

        Raises MimeError when called on a multipart message.

        """
        if self._multipart:
            raise MimeError("body() only works for 1-part messages")
        return self.rawbody()

    def _partbounds(self, separator):
        """Scan the body and return the (start, end) offsets of its parts.

        'separator' is the full boundary line text, i.e. "--" followed
        by the boundary parameter.  Item 0 of the result describes the
        phantom part before the first separator.  Scanning stops at the
        terminator line (separator + "--"); a final part that is not
        followed by a separator or terminator is silently dropped.

        """
        terminator = separator + "--"
        nsep = len(separator)
        content_length = self._re_content_length
        f = self._fp
        # Always scan from the start of the body.  The file position may
        # have moved since the constructor ran (an earlier scan, or a
        # part being read through the shared file).
        # NOTE(review): this relies on nested parts (SubFile objects)
        # supporting seek() -- confirm against SubFile.
        if f.tell() != self._bodystart:
            f.seek(self._bodystart)
        bounds = []
        start = f.tell()
        clength = -1        # Content-length of the current part, if seen
        bodystart = -1      # where the current part's body starts, if known
        inheaders = 0       # the phantom first part has no headers
        eol = 0             # size of the line ending just before 'end'
        while 1:
            end = f.tell()
            line = f.readline()
            if not line:
                break
            if line[-2:] == "\r\n":
                thiseol = 2
            elif line[-1:] == "\n":
                thiseol = 1
            else:
                thiseol = 0
            if line[:nsep] != separator:
                if inheaders:
                    m = content_length.match(line)
                    if m:
                        clength = int(m.group(1))
                    if not line.strip():
                        # A blank line ends the part's headers
                        inheaders = 0
                        bodystart = f.tell()
                        if clength > 0:
                            # Skip binary data, which may contain
                            # lines that look like a separator
                            f.read(clength)
                eol = thiseol
                continue
            stripped = line.strip()
            if stripped != separator and stripped != terminator:
                # Merely starts with the separator text: ordinary content
                eol = thiseol
                continue
            if clength >= 0 and bodystart >= 0:
                # The Content-length header determines the subfile size
                end = bodystart + clength
            else:
                # The line ending before the boundary is not part of
                # the content; for an empty part there is none to drop.
                end = end - eol
            bounds.append((start, end))
            start = f.tell()
            clength = -1
            bodystart = -1
            inheaders = 1
            eol = 0
            if stripped == terminator:
                break
        return bounds

    def rawparts(self):
        """Return the raw body parts of a multipart MIME message.

        This returns a list of SubFile() objects corresponding to the
        parts.  Note that the phantom part before the first separator
        is returned too, as list item 0.  If the final part is not
        followed by a terminator, it is ignored, and this error is not
        reported.  (XXX: the error should be raised).

        Raises MimeError when the message is not multipart or when the
        Content-type header carries no boundary parameter.

        """
        if not self._multipart:
            raise MimeError("[raw]parts() only works for multipart messages")
        boundary = self._headers.getparam('boundary')
        if not boundary:
            raise MimeError("multipart boundary not specified")
        f = self._fp
        return [SubFile.SubFile(f, start, end)
                for (start, end) in self._partbounds("--" + boundary)]

    def parts(self):
        """Return the parsed body parts of a multipart MIME message.

        This returns a list of MimeParser() instances corresponding to
        the parts.  The phantom part before the first separator is not
        included.

        """
        return [MimeParser(raw) for raw in self.rawparts()[1:]]

    def getsubpartbyposition(self, indices):
        """Return the nested part reached by following 'indices'.

        Each index selects one item of parts() at successive nesting
        levels; an empty sequence returns this parser itself.

        """
        part = self
        for i in indices:
            part = part.parts()[i]
        return part

    def getsubpartbyid(self, id):
        """Return the (sub)part whose Content-id header equals 'id'.

        This message is checked first, then its parts are searched
        recursively in order.  Returns None when nothing matches.

        """
        cid = self._headers.getheader('content-id')
        if cid and cid == id:
            return self
        if self._multipart:
            # parts() already yields MimeParser instances, so recurse
            # on them directly instead of wrapping them again.
            for part in self.parts():
                hit = part.getsubpartbyid(id)
                if hit is not None:
                    return hit
        return None

    def index(self):
        """Return an index of the MIME file.

        This parses the entire file and returns index information
        about it, in the form of a tuple

          (ctype, headers, body)

        where 'ctype' is the content type string of the message
        (e.g. `text/plain' or `multipart/mixed') and 'headers' is a
        Message instance containing the message headers (which should
        be treated as read-only).

        The 'body' item depends on the content type:

        - If it is an atomic message (anything except for content type
          multipart/*), it is the file-like object returned by
          self.body().

        - For a content type of multipart/*, it is the list of
          MimeParser() objects returned by self.parts().

        """
        if self._multipart:
            body = self.parts()
        else:
            body = self.body()
        return self._headers.gettype(), self._headers, body
|
||
|
|
||
|
|
||
|
def _show(parser, level=0):
    """Helper for _test(): print an outline of a parsed MIME message.

    'parser' must provide index() returning (ctype, headers, body).
    For a multipart message (body is a list) the parts are numbered
    and shown recursively at level+1; otherwise the header and body
    lines are printed, prefixed according to 'level'.

    """
    import sys
    write = sys.stdout.write
    ctype, headers, body = parser.index()
    if isinstance(body, list):
        nparts = len(body)
        if nparts == 1:
            plural = ""
        else:
            plural = "s"
        write("%s (%d part%s):\n" % (ctype, nparts, plural))
        n = 0
        for part in body:
            n = n + 1
            # No newline: the part's own summary continues this line
            write("%*d. " % (4*level + 2, n))
            _show(part, level + 1)
    else:
        bodylines = body.readlines()
        write("%s (%d header lines, %d body lines)\n" % (
            ctype, len(headers.headers), len(bodylines)))
        # NOTE(review): one space per level here, while the part numbers
        # above use four columns per level -- confirm the intended width.
        indent = " " * level
        for line in headers.headers + ['\n'] + bodylines:
            if line[-1:] == '\n':
                line = line[:-1]
            write(indent + line + "\n")
|
||
|
|
||
|
def _test(args = None):
    """Test program invoked when run as a script.

    When a filename argument is specified, it reads from that file
    ('-' means stdin).  When no arguments are present, it defaults to
    'testkp.txt' if it exists, else it defaults to stdin.

    """
    # Imported unconditionally: a conditional import would leave 'sys'
    # an unbound local when the caller passes args such as ['-'].
    import os
    import sys
    if not args:
        args = sys.argv[1:]
    if args:
        fn = args[0]
    else:
        fn = 'testkp.txt'
        if not os.path.exists(fn):
            fn = '-'
    if fn == '-':
        # NOTE(review): MimeParser documents that it needs a seekable
        # file; stdin may not be one (e.g. when it is a pipe).
        _show(MimeParser(sys.stdin))
    else:
        fp = open(fn)
        try:
            _show(MimeParser(fp))
        finally:
            # _show() has read everything it prints by the time it returns
            fp.close()
|
||
|
|
||
|
# Script entry point: run the self-test on the command line arguments.
if __name__ == "__main__":
    import sys
    _test()
|