# A new Feed-style Parser
|
|
|
|
from email import Errors, Message
|
|
import re
|
|
|
|
# Matches one line ending in any of the three conventions: CRLF, bare CR or
# bare LF.  Used with .match() to recognise a line that *starts* with a line
# ending, i.e. a blank line.
NLCRE = re.compile('\r\n|\r|\n')

# Join helpers: EMPTYSTRING.join(list_of_lines) glues lines back together.
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel handed out by FeedableLumpOfText (readline(), peekline(),
# readuntil()) when no buffered line is available but the input has not been
# ended yet.  Always compare with "is", never with "==".
NeedMoreData = object()
|
|
|
|
class FeedableLumpOfText:
    """A file-like object that can have new data loaded into it.

    Text is supplied in arbitrary chunks with push() (or as ready-made lines
    with pushlines()) and handed back one line at a time, line ending
    included, by readline(), peekline(), readuntil() or plain iteration.
    While no complete line is buffered and end() has not been called, the
    read methods return the module-level NeedMoreData sentinel instead of a
    string.  After end(), an exhausted buffer reads as ''.
    """

    def __init__(self):
        # Text received after the last complete line; not yet a full line.
        self._partial = ''
        # Set by end(): no further data will arrive.
        self._done = False
        # _pending is a list of lines, in reverse order (the next line to be
        # read is the last element).
        self._pending = []

    def readline(self):
        """ Return a line of data.

        If data has been pushed back with unreadline(), the most recently
        returned unreadline()d data will be returned.

        Returns NeedMoreData when nothing is buffered and end() has not been
        called, and '' when nothing is buffered and end() has been called.
        """
        if not self._pending:
            if self._done:
                return ''
            return NeedMoreData
        return self._pending.pop()

    def unreadline(self, line):
        """ Push a line back into the object.

        The line becomes the next one returned by readline()/peekline().
        """
        self._pending.append(line)

    def peekline(self):
        """ Non-destructively look at the next line.

        Returns the same values readline() would, without consuming the line.
        """
        if not self._pending:
            if self._done:
                return ''
            return NeedMoreData
        return self._pending[-1]

    # Typical use of readuntil() from a generator-based parser:
    #
    # for r in self._input.readuntil(regexp):
    #     if r is NeedMoreData:
    #         yield NeedMoreData
    #     preamble, matchobj = r
    def readuntil(self, matchre, afterblank=False, includematch=False):
        """ Read a line at a time until we get the specified RE.

        This is a generator.  It yields NeedMoreData every time it runs out
        of buffered lines before end() has been called, and finally yields a
        single (text, matchobj) tuple and stops.

        Returns the text up to (and including, if includematch is true) the
        matched text, and the RE match object.  If afterblank is true,
        there must be a blank line before the matched text; blank lines are
        then never tested against matchre themselves, but they are kept in
        the returned text.  Moves current filepointer to the line following
        the matched line.  If we reach end-of-file, return what we've got so
        far, and return None as the RE match object.

        matchre must provide a match(line) method (e.g. a compiled regular
        expression); it is applied to each line with its line ending.
        """
        prematch = []
        # True only while the line read immediately before was blank.
        blankseen = False
        while True:
            if not self._pending:
                if self._done:
                    # end of file
                    yield EMPTYSTRING.join(prematch), None
                    return
                yield NeedMoreData
                continue
            line = self._pending.pop()
            if afterblank and NLCRE.match(line):
                # A blank line arms the match for the line that follows it.
                # Keep it in the collected text instead of dropping it.
                blankseen = True
                prematch.append(line)
                continue
            m = matchre.match(line)
            if m and (blankseen or not afterblank):
                if includematch:
                    prematch.append(line)
                yield EMPTYSTRING.join(prematch), m
                # The final result has been delivered; do not keep consuming
                # input if the caller resumes the generator.
                return
            # Only an immediately preceding blank line counts.
            blankseen = False
            prematch.append(line)

    # Kept as a public class attribute; not used inside this class.
    NLatend = re.compile('(\r\n|\r|\n)$').match
    # The capturing group makes split() return the line endings as well.
    NLCRE_crack = re.compile('(\r\n|\r|\n)')

    def push(self, data):
        """ Push some new data into this object.

        data is a string of any length; it need not end on a line boundary.
        Complete lines become available to the read methods, the unfinished
        remainder is kept until more data arrives or end() is called.
        """
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but leave the newlines on the end of each
        parts = self.NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied
        # groups means that the last element is the data after the
        # final RE.  In the case of a NL/CR terminated string, this is
        # the empty string.
        self._partial = parts.pop()
        # parts now alternates text, line ending, text, line ending, ...
        if not self._partial and parts and parts[-1] == '\r':
            # The data ends in a bare CR.  It may be the first half of a
            # CRLF whose LF arrives with the next push(), so hold the whole
            # line back rather than emit "...\r" followed by a bogus "\n"
            # line.  end() flushes it if nothing else arrives.
            ending = parts.pop()
            self._partial = parts.pop() + ending
        lines = [parts[i] + parts[i + 1] for i in range(0, len(parts), 2)]
        self.pushlines(lines)

    def pushlines(self, lines):
        """ Push a list of new lines into the object.

        The lines are queued behind everything that is already pending.
        """
        # Reverse and insert at the front of _pending
        self._pending[:0] = lines[::-1]

    def end(self):
        """ There is no more data.

        Any text still waiting for its line ending is queued as a final
        (unterminated) line so that it is not lost.
        """
        if self._partial:
            self.pushlines([self._partial])
            self._partial = ''
        self._done = True

    def is_done(self):
        """ Return True once end() has been called. """
        return self._done

    def __iter__(self):
        return self

    def next(self):
        """ Iterator step: return readline()'s result.

        Iteration stops when readline() returns '' (input ended and buffer
        empty).  NeedMoreData is passed through to the caller as an item.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

    # Same method under the name used by the newer iterator protocol.
    __next__ = next
|
|
|
|
class FeedParser:
    """A feed-style (incremental) parser of email.

    Text is handed to the parser in arbitrary pieces with feed(); end()
    signals that no more text will come and returns the root message object.
    Parsing is driven by a generator (_parsegen) that is resumed on every
    feed()/end() call and suspends itself whenever it needs more input.
    """

    def __init__(self, _class=Message.Message):
        """Create a parser.

        _class is called with no arguments each time a new message object
        (the root or a sub-part) is needed.
        """
        self._class = _class
        self._input = FeedableLumpOfText()
        # Root message object; created on the first run of the generator.
        self._root = None
        # Stack of the message objects currently being built; the last
        # element is the innermost one.
        self._objectstack = []
        # Bound method that resumes the parsing generator by one step.
        self._parse = self._parsegen().next

    def end(self):
        """Signal the end of input, let the parser finish, and return the
        root message object."""
        self._input.end()
        self._call_parse()
        return self._root

    def feed(self, data):
        """Push more text into the parser and parse as far as possible."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Resume the parsing generator until its next yield.  A finished
        # generator raises StopIteration, which simply means there is
        # nothing left to do.
        try:
            self._parse()
        except StopIteration:
            pass

    # A line belongs to the header block if it is a Unix-From line, starts
    # with a field name followed by a colon, or is a continuation line
    # (leading space or tab).
    headerRE = re.compile(r'^(From |[-\w]{2,}:|[\t ])')

    def _parse_headers(self, headerlist):
        """Store the given raw header lines on the current object.

        headerlist is a list of strings that are the headers for the
        current object (self._cur).  Continuation lines are joined onto the
        header they follow.

        Raises Errors.HeaderParseError if the first line is a continuation
        or if a line is neither a header nor a continuation.
        """
        lastheader = ''
        lastvalue = []

        for lineno, line in enumerate(headerlist):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'First line must not be a continuation')
                lastvalue.append(line)
                continue

            if lastheader:
                # XXX reconsider the joining of folded lines
                self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()
                lastheader, lastvalue = '', []

            # Check for Unix-From
            if line.startswith('From '):
                if lineno == 0:
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(headerlist) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body
                    # NOTE(review): the caller may afterwards push back the
                    # non-header line it had already read, which would then
                    # be read *before* this one - confirm the intended order.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Ignore it.
                    continue

            i = line.find(':')
            if i < 0:
                # The older parser had various special-cases here.  We've
                # already handled them
                raise Errors.HeaderParseError(
                    "Not a header, not a continuation: ``%s''" % line)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]

        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()

    def _parsegen(self):
        """Generator that does the actual parsing.

        Each resumption parses any currently available text; the generator
        yields (None or NeedMoreData - the value is not used by
        _call_parse) whenever it has to wait for more input.

        Local state:
          completing - true while collecting the text that follows an
                       already created object (its body or trailer) rather
                       than reading a fresh header block.
          last       - the object most recently popped off the stack; it
                       receives that text.
        """
        # Parse any currently available text
        self._new_sub_object()
        self._root = self._cur
        completing = False
        last = None

        for line in self._input:
            if line is NeedMoreData:
                yield None  # Need More Data
                continue
            # The line was only read to make sure input is available.
            self._input.unreadline(line)
            if not completing:
                headers = []
                # Now collect all headers.
                for line in self._input:
                    if line is NeedMoreData:
                        yield None  # Need More Data
                        continue
                    if not self.headerRE.match(line):
                        self._parse_headers(headers)
                        # A message/rfc822 has no body and no internal
                        # boundary.
                        if self._cur.get_content_maintype() == "message":
                            self._new_sub_object()
                            completing = False
                            headers = []
                            continue
                        if line.strip():
                            # No blank line between headers and body.
                            # Push this line back, it's the first line of
                            # the body.
                            self._input.unreadline(line)
                        break
                    else:
                        headers.append(line)
                else:
                    # We're done with the data and are still inside the headers
                    self._parse_headers(headers)

            # Now we're dealing with the body
            boundary = self._cur.get_boundary()
            isdigest = (self._cur.get_content_type() == 'multipart/digest')
            if boundary and not self._cur._finishing:
                separator = '--' + boundary
                self._cur._boundaryRE = re.compile(
                    r'(?P<sep>' + re.escape(separator) +
                    r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
                for r in self._input.readuntil(self._cur._boundaryRE):
                    if r is NeedMoreData:
                        yield NeedMoreData
                    else:
                        preamble, matchobj = r
                        break
                if not matchobj:
                    # Broken - we hit the end of file.  Just set the body
                    # to the text.
                    if completing:
                        self._attach_trailer(last, preamble)
                    else:
                        self._attach_preamble(self._cur, preamble)
                    # XXX move back to the parent container.
                    self._pop_container()
                    completing = True
                    continue
                if preamble:
                    if completing:
                        # Drop as many trailing characters as the boundary
                        # line's own line ending is long.
                        preamble = preamble[:-len(matchobj.group('linesep'))]
                        self._attach_trailer(last, preamble)
                    else:
                        self._attach_preamble(self._cur, preamble)
                elif not completing:
                    # The module docs specify an empty preamble is None, not ''
                    self._cur.preamble = None
                    # If we _are_ completing, the last object gets no payload

                if matchobj.group('end'):
                    # That was the end boundary tag.  Bounce back to the
                    # parent container
                    last = self._pop_container()
                    self._input.unreadline(matchobj.group('linesep'))
                    completing = True
                    continue

                # A number of MTAs produced by a nameless large company
                # we shall call "SicroMoft" produce repeated boundary
                # lines.
                while True:
                    line = self._input.peekline()
                    if line is NeedMoreData:
                        yield None
                        continue
                    if self._cur._boundaryRE.match(line):
                        self._input.readline()
                    else:
                        break

                self._new_sub_object()

                completing = False
                if isdigest:
                    self._cur.set_default_type('message/rfc822')
                continue
            else:
                # non-multipart or after end-boundary
                if last is not self._root:
                    last = self._pop_container()
                if self._cur.get_content_maintype() == "message":
                    # We double-pop to leave the RFC822 object
                    self._pop_container()
                    completing = True
                elif self._cur._boundaryRE and last != self._root:
                    completing = True
                else:
                    # Non-multipart top level, or in the trailer of the
                    # top level multipart
                    while not self._input.is_done():
                        yield None
                    data = list(self._input)
                    body = EMPTYSTRING.join(data)
                    self._attach_trailer(last, body)

    def _attach_trailer(self, obj, trailer):
        # Text found after obj: the epilogue of a multipart/message object,
        # the payload of anything else.
        #import pdb ; pdb.set_trace()
        if obj.get_content_maintype() in ("multipart", "message"):
            obj.epilogue = trailer
        else:
            obj.set_payload(trailer)

    def _attach_preamble(self, obj, trailer):
        # Text found before the first boundary of obj (passed in the
        # parameter named "trailer"): the preamble of a multipart/message
        # object, the payload of anything else.
        if obj.get_content_maintype() in ("multipart", "message"):
            obj.preamble = trailer
        else:
            obj.set_payload(trailer)

    def _new_sub_object(self):
        # Create a new message object, attach it to the object on top of
        # the stack (if any), push it and make it the current object.
        new = self._class()
        #print "pushing", self._objectstack, repr(new)
        if self._objectstack:
            self._objectstack[-1].attach(new)
        self._objectstack.append(new)
        # Parser bookkeeping kept on the message object itself:
        # compiled boundary pattern (set once a boundary is known) ...
        new._boundaryRE = None
        # ... and a flag set when the object stack has been emptied.
        new._finishing = False
        self._cur = new

    def _pop_container(self):
        # Move the pointer to the container of the current object.
        # Returns the (old) current object
        #import pdb ; pdb.set_trace()
        #print "popping", self._objectstack
        last = self._objectstack.pop()
        if self._objectstack:
            self._cur = self._objectstack[-1]
        else:
            # Nothing left to return to: keep the current object and mark
            # it so that its boundary is not searched for again.
            self._cur._finishing = True
        return last
|
|
|
|
|