New parser. Next up, making the current parser use this parser
This commit is contained in:
parent
e62c5c88f1
commit
39a0f04421
|
@ -0,0 +1,362 @@
|
|||
# A new Feed-style Parser
|
||||
|
||||
from email import Errors, Message
|
||||
import re
|
||||
|
||||
# Matches one line terminator (CRLF, bare CR or bare LF) at the current
# position.  Because buffered lines keep their terminator, NLCRE.match(line)
# is true exactly for lines that start with a terminator, i.e. blank lines.
NLCRE = re.compile('\r\n|\r|\n')

EMPTYSTRING = ''
NL = '\n'

# Sentinel handed back by the buffer when no complete line is available yet
# but the input has not been closed; callers must test it with ``is``.
NeedMoreData = object()


class FeedableLumpOfText:
    """A file-like object that can have new data loaded into it.

    Text arrives in arbitrary chunks through push().  The buffer cracks it
    into lines (each keeping its original terminator) and serves them through
    readline(), peekline(), readuntil() and iteration.  While the buffer is
    empty but still open, the read methods hand back the NeedMoreData
    sentinel instead of blocking; once end() has been called they report
    end-of-file with '' (or, for readuntil(), a None match object).
    """

    # Matcher for a line terminator anchored at the end of the string.  Not
    # used inside this class; kept because it is a public class attribute.
    NLatend = re.compile('(\r\n|\r|\n)$').match
    # Same terminators as NLCRE, but captured so that re.split() keeps them.
    NLCRE_crack = re.compile('(\r\n|\r|\n)')

    def __init__(self):
        # Text received after the last line terminator seen so far.
        self._partial = ''
        # Set by end(): no further data will be pushed.
        self._done = False
        # _pending is a list of lines, in reverse order (next line is last),
        # so reading and un-reading are cheap pop()/append() calls.
        self._pending = []

    def readline(self):
        """Return and consume the next line.

        A line pushed back with unreadline() is returned before any other
        buffered data, most recently pushed back first.  Returns NeedMoreData
        when the buffer is empty but still open, and '' once it is empty and
        end() has been called.
        """
        if not self._pending:
            if self._done:
                return ''
            return NeedMoreData
        return self._pending.pop()

    def unreadline(self, line):
        """Push a line back so that it is the next one read."""
        self._pending.append(line)

    def peekline(self):
        """Return the next line without consuming it.

        Returns NeedMoreData or '' under the same conditions as readline().
        """
        if not self._pending:
            if self._done:
                return ''
            return NeedMoreData
        return self._pending[-1]

    # Intended calling pattern:
    #
    #     for r in self._input.readuntil(regexp):
    #         if r is NeedMoreData:
    #             yield NeedMoreData
    #         else:
    #             preamble, matchobj = r
    #             break
    def readuntil(self, matchre, afterblank=False, includematch=False):
        """Generator: read a line at a time until one matches matchre.

        Yields NeedMoreData every time the buffer runs dry before the input
        has been closed.  The last item produced is a 2-tuple
        (text, matchobj):

        * text is every line consumed before the matching line, joined
          together, plus the matching line itself if includematch is true;
        * matchobj is the result of matchre.match() on the matching line, or
          None if end-of-file was reached without a match (text is then
          everything that was left).

        If afterblank is true, a line only counts as a match when the line
        immediately before it (within this call) was blank.  The matching
        line is always consumed.  The generator finishes after producing the
        tuple.
        """
        prematch = []
        # True when the previously consumed line was blank.  It is updated
        # only *after* the match test so that it still describes the line
        # before the candidate, and blank lines stay part of the text.
        prev_blank = False
        while True:
            if not self._pending:
                if self._done:
                    # end of file
                    yield EMPTYSTRING.join(prematch), None
                    return
                yield NeedMoreData
                continue
            line = self._pending.pop()
            m = matchre.match(line)
            if m and (prev_blank or not afterblank):
                if includematch:
                    prematch.append(line)
                yield EMPTYSTRING.join(prematch), m
                return
            prematch.append(line)
            prev_blank = NLCRE.match(line) is not None

    def push(self, data):
        """Push some new data into this object.

        Complete lines become available to the read methods at once.  Text
        after the last terminator is held back until a later push() completes
        it or end() flushes it.
        """
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but leave the newlines on the end of each.
        # Splitting on a capturing group gives
        # [text, terminator, text, terminator, ..., tail] where tail is the
        # data after the final terminator ('' if data ended with one).
        parts = self.NLCRE_crack.split(data)
        self._partial = parts.pop()
        if not self._partial and parts and parts[-1] == '\r':
            # The chunk ended on a bare CR.  It may be the first half of a
            # CRLF whose LF arrives with the next chunk, so hold the whole
            # line back rather than reporting two line endings.
            terminator = parts.pop()
            self._partial = parts.pop() + terminator
        # parts now holds (text, terminator) pairs; glue each pair together.
        lines = [parts[i] + parts[i + 1] for i in range(0, len(parts), 2)]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Push a list of new lines into the object.

        The lines are queued behind everything already buffered.
        """
        # Reverse and insert at the front of _pending
        self._pending[:0] = lines[::-1]

    def end(self):
        """Signal that there is no more data.

        Any held-back text (a final line without a terminator) is released as
        the last line so that it is not lost.
        """
        if self._partial:
            self.pushlines([self._partial])
            self._partial = ''
        self._done = True

    def is_done(self):
        """Return True once end() has been called."""
        return self._done

    def __iter__(self):
        return self

    def next(self):
        """Iterator step: like readline(), but '' ends the iteration.

        NeedMoreData is passed through as an ordinary item, so consumers must
        check for it.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

    # Alias so the object also satisfies the __next__ iterator protocol.
    __next__ = next
|
||||
|
||||
class FeedParser:
    """A feed-style (incremental) parser of email messages.

    Text is handed over in arbitrary chunks with feed(); end() signals that
    no more text is coming and returns the root message object.  The parsing
    itself lives in a generator (_parsegen) which is resumed by every feed()
    and end() call and suspends itself whenever the input buffer runs dry.
    """

    # A line that may belong to a header block: a Unix-From line, the start
    # of a "Name:" field, or a continuation line starting with whitespace.
    headerRE = re.compile(r'^(From |[-\w]{2,}:|[\t ])')

    def __init__(self, _class=Message.Message):
        """Create a parser.

        _class is called with no arguments every time a new message object
        (the root or a sub-part) is needed.
        """
        self._class = _class
        self._input = FeedableLumpOfText()
        self._root = None
        # Chain of containers from the root down to the current object.
        self._objectstack = []
        # Bound "advance" method of the parsing generator.
        self._parse = self._parsegen().next

    def end(self):
        """Signal end of input, finish parsing and return the root object."""
        self._input.end()
        self._call_parse()
        return self._root

    def feed(self, data):
        """Push more text into the parser and parse as far as possible."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator until it next suspends.

        StopIteration only means the generator has already run to
        completion, so it is deliberately ignored.
        """
        try:
            self._parse()
        except StopIteration:
            pass

    def _parse_headers(self, headerlist):
        """Store the header lines in headerlist on the current object.

        headerlist is a list of raw lines (terminators included) that all
        matched headerRE.  Continuation lines are joined onto the header they
        follow.  A Unix-From line is recorded with set_unixfrom() if it is
        the first line, pushed back onto the input if it is the last line
        (it is probably the first line of the body; parsing of the list then
        stops), and ignored anywhere else.

        Raises Errors.HeaderParseError if the first line is a continuation or
        if a line is neither a header, a continuation nor a Unix-From line.
        """
        lastheader = ''
        lastvalue = []

        for lineno, line in enumerate(headerlist):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'First line must not be a continuation')
                lastvalue.append(line)
                continue

            # A new field (or From line) starts here: flush the previous one.
            if lastheader:
                # XXX reconsider the joining of folded lines
                self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()
                lastheader, lastvalue = '', []

            # Check for Unix-From
            if line.startswith('From '):
                if lineno == 0:
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(headerlist) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Ignore it.
                    continue

            i = line.find(':')
            if i < 0:
                # The older parser had various special-cases here.  We've
                # already handled them
                raise Errors.HeaderParseError(
                    "Not a header, not a continuation: ``%s''" % line)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]

        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()

    def _parsegen(self):
        """Generator that does the actual parsing.

        It yields (the value is ignored by _call_parse) whenever it needs
        more input and finishes once the whole message has been consumed.
        The resulting object tree hangs off self._root.
        """
        # Parse any currently available text
        self._new_sub_object()
        self._root = self._cur
        # True while we are past the end of a part and are collecting what
        # follows it, rather than starting a new part with its headers.
        completing = False
        # The object most recently popped off the stack, if any.
        last = None

        for line in self._input:
            if line is NeedMoreData:
                yield None  # Need More Data
                continue
            # That read was only a check that input is left; put it back.
            self._input.unreadline(line)
            if not completing:
                headers = []
                # Now collect all headers.
                for line in self._input:
                    if line is NeedMoreData:
                        yield None  # Need More Data
                        continue
                    if not self.headerRE.match(line):
                        self._parse_headers(headers)
                        # A message/rfc822 has no body and no internal
                        # boundary.
                        if self._cur.get_content_maintype() == "message":
                            self._new_sub_object()
                            completing = False
                            headers = []
                            continue
                        if line.strip():
                            # No blank line between headers and body.
                            # Push this line back, it's the first line of
                            # the body.
                            self._input.unreadline(line)
                        break
                    else:
                        headers.append(line)
                else:
                    # We're done with the data and are still inside the
                    # headers
                    self._parse_headers(headers)

            # Now we're dealing with the body
            boundary = self._cur.get_boundary()
            isdigest = (self._cur.get_content_type() == 'multipart/digest')
            if boundary and not self._cur._finishing:
                separator = '--' + boundary
                self._cur._boundaryRE = re.compile(
                    r'(?P<sep>' + re.escape(separator) +
                    r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
                for r in self._input.readuntil(self._cur._boundaryRE):
                    if r is NeedMoreData:
                        yield None  # Need More Data
                    else:
                        preamble, matchobj = r
                        break
                if not matchobj:
                    # Broken - we hit the end of file.  Just set the body
                    # to the text.
                    if completing:
                        self._attach_trailer(last, preamble)
                    else:
                        self._attach_preamble(self._cur, preamble)
                    # XXX move back to the parent container.
                    self._pop_container()
                    completing = True
                    continue
                if preamble:
                    if completing:
                        # The line terminator just before the boundary is
                        # not part of the preceding text.
                        preamble = preamble[:-len(matchobj.group('linesep'))]
                        self._attach_trailer(last, preamble)
                    else:
                        self._attach_preamble(self._cur, preamble)
                elif not completing:
                    # The module docs specify an empty preamble is None,
                    # not ''
                    self._cur.preamble = None
                    # If we _are_ completing, the last object gets no payload

                if matchobj.group('end'):
                    # That was the end boundary tag.  Bounce back to the
                    # parent container
                    last = self._pop_container()
                    self._input.unreadline(matchobj.group('linesep'))
                    completing = True
                    continue

                # A number of MTAs produced by a nameless large company
                # we shall call "SicroMoft" produce repeated boundary
                # lines.
                while True:
                    line = self._input.peekline()
                    if line is NeedMoreData:
                        yield None
                        continue
                    if self._cur._boundaryRE.match(line):
                        self._input.readline()
                    else:
                        break

                self._new_sub_object()

                completing = False
                if isdigest:
                    self._cur.set_default_type('message/rfc822')
                continue
            else:
                # non-multipart or after end-boundary
                if last is not self._root:
                    last = self._pop_container()
                if self._cur.get_content_maintype() == "message":
                    # We double-pop to leave the RFC822 object
                    self._pop_container()
                    completing = True
                elif self._cur._boundaryRE and last is not self._root:
                    completing = True
                else:
                    # Non-multipart top level, or in the trailer of the
                    # top level multipart
                    while not self._input.is_done():
                        yield None
                    data = list(self._input)
                    body = EMPTYSTRING.join(data)
                    self._attach_trailer(last, body)

    def _attach_trailer(self, obj, trailer):
        """Attach text that follows obj.

        For a multipart or message object the text becomes its epilogue; for
        anything else it becomes the payload.
        """
        if obj.get_content_maintype() in ("multipart", "message"):
            obj.epilogue = trailer
        else:
            obj.set_payload(trailer)

    def _attach_preamble(self, obj, trailer):
        """Attach text that precedes the first boundary of obj.

        For a multipart or message object the text becomes its preamble; for
        anything else it becomes the payload.  (The parameter keeps its
        historical name "trailer" although it carries the preamble text.)
        """
        if obj.get_content_maintype() in ("multipart", "message"):
            obj.preamble = trailer
        else:
            obj.set_payload(trailer)

    def _new_sub_object(self):
        """Create a new message object and make it the current one.

        The new object is attached to the object on top of the stack (if
        there is one) and then pushed onto the stack itself.
        """
        new = self._class()
        if self._objectstack:
            self._objectstack[-1].attach(new)
        self._objectstack.append(new)
        # Parser bookkeeping kept on the message object itself.
        new._boundaryRE = None
        new._finishing = False
        self._cur = new

    def _pop_container(self):
        """Move the current-object pointer to the container of the current
        object and return the object popped off the stack.

        When the stack becomes empty the current object is left in place and
        flagged as finishing instead.
        """
        last = self._objectstack.pop()
        if self._objectstack:
            self._cur = self._objectstack[-1]
        else:
            self._cur._finishing = True
        return last
|
||||
|
||||
|
Loading…
Reference in New Issue