gh-106628: email parsing speedup (gh-106629)

This commit is contained in:
CF Bolz-Tereick 2023-07-13 08:12:56 +02:00 committed by GitHub
parent af51bd7cda
commit 7e6ce48872
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 11 additions and 6 deletions

View File

@ -37,6 +37,8 @@ NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# Matches the beginning of a line that is either the literal 'From ', a run
# of printable ASCII characters other than space and colon (octal 041-071
# and 073-176) followed by ':', or a single leading tab or space.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'
# Matches everything that may follow a boundary separator up to the end of
# the line: an optional '--' (group 'end'), optional spaces/tabs (group
# 'ws') and an optional line ending (group 'linesep').  It is compiled once
# at module level and applied at the offset just past the separator, so no
# boundary-specific pattern has to be compiled while parsing.
boundaryendRE = re.compile(
r'(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
# Unique sentinel object; the parsing code compares against it with `is`.
NeedMoreData = object()
@ -327,9 +329,10 @@ class FeedParser:
# this onto the input stream until we've scanned past the
# preamble.
separator = '--' + boundary
boundaryre = re.compile(
'(?P<sep>' + re.escape(separator) +
r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
def boundarymatch(line):
    """Test whether *line* is a boundary line for the current separator.

    Returns ``None`` when *line* does not start with ``separator``.
    Otherwise returns the result of matching ``boundaryendRE`` against
    the remainder of the line (a match object, or ``None`` if the text
    after the separator is not a valid boundary tail).
    """
    if line.startswith(separator):
        # The literal prefix is already verified; only the tail after it
        # needs to go through the shared, precompiled regex.
        return boundaryendRE.match(line, len(separator))
    return None
capturing_preamble = True
preamble = []
linesep = False
@ -341,7 +344,7 @@ class FeedParser:
continue
if line == '':
break
mo = boundaryre.match(line)
mo = boundarymatch(line)
if mo:
# If we're looking at the end boundary, we're done with
# this multipart. If there was a newline at the end of
@ -373,13 +376,13 @@ class FeedParser:
if line is NeedMoreData:
yield NeedMoreData
continue
mo = boundaryre.match(line)
mo = boundarymatch(line)
if not mo:
self._input.unreadline(line)
break
# Recurse to parse this subpart; the input stream points
# at the subpart's first line.
self._input.push_eof_matcher(boundaryre.match)
self._input.push_eof_matcher(boundarymatch)
for retval in self._parsegen():
if retval is NeedMoreData:
yield NeedMoreData

View File

@ -0,0 +1,2 @@
Speed up parsing of emails by about 20% by not compiling a new regular
expression for every single email.