bpo-30340: Enhanced regular expressions optimization. (#1542)

This increased the performance of matching some patterns by up to 25 times.
This commit is contained in:
Serhiy Storchaka 2017-05-14 08:32:33 +03:00 committed by GitHub
parent cbddf58c79
commit 821a9d146b
4 changed files with 95 additions and 54 deletions

View File

@ -20,6 +20,7 @@ _LITERAL_CODES = {LITERAL, NOT_LITERAL}
_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT} _REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
_SUCCESS_CODES = {SUCCESS, FAILURE} _SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT} _ASSERT_CODES = {ASSERT, ASSERT_NOT}
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
# Sets of lowercase characters which have the same uppercase. # Sets of lowercase characters which have the same uppercase.
_equivalences = ( _equivalences = (
@ -125,7 +126,7 @@ def _compile(code, pattern, flags):
elif op in REPEATING_CODES: elif op in REPEATING_CODES:
if flags & SRE_FLAG_TEMPLATE: if flags & SRE_FLAG_TEMPLATE:
raise error("internal: unsupported template operator %r" % (op,)) raise error("internal: unsupported template operator %r" % (op,))
elif _simple(av) and op is not REPEAT: if _simple(av[2]):
if op is MAX_REPEAT: if op is MAX_REPEAT:
emit(REPEAT_ONE) emit(REPEAT_ONE)
else: else:
@ -404,10 +405,14 @@ def _bytes_to_codes(b):
assert len(a) * a.itemsize == len(b) assert len(a) * a.itemsize == len(b)
return a.tolist() return a.tolist()
def _simple(av): def _simple(p):
# check if av is a "simple" operator # check if this subpattern is a "simple" operator
lo, hi = av[2].getwidth() if len(p) != 1:
return lo == hi == 1 and av[2][0][0] != SUBPATTERN return False
op, av = p[0]
if op is SUBPATTERN:
return av[0] is None and _simple(av[-1])
return op in _UNIT_CODES
def _generate_overlap_table(prefix): def _generate_overlap_table(prefix):
""" """

View File

@ -114,6 +114,7 @@ class SubPattern:
data = [] data = []
self.data = data self.data = data
self.width = None self.width = None
def dump(self, level=0): def dump(self, level=0):
nl = True nl = True
seqtypes = (tuple, list) seqtypes = (tuple, list)
@ -404,6 +405,15 @@ def _escape(source, escape, state):
pass pass
raise source.error("bad escape %s" % escape, len(escape)) raise source.error("bad escape %s" % escape, len(escape))
def _uniq(items):
if len(set(items)) == len(items):
return items
newitems = []
for item in items:
if item not in newitems:
newitems.append(item)
return newitems
def _parse_sub(source, state, verbose, nested=True): def _parse_sub(source, state, verbose, nested=True):
# parse an alternation: a|b|c # parse an alternation: a|b|c
@ -420,7 +430,6 @@ def _parse_sub(source, state, verbose, nested=True):
return items[0] return items[0]
subpattern = SubPattern(state) subpattern = SubPattern(state)
subpatternappend = subpattern.append
# check if all items share a common prefix # check if all items share a common prefix
while True: while True:
@ -437,35 +446,31 @@ def _parse_sub(source, state, verbose, nested=True):
# move it out of the branch # move it out of the branch
for item in items: for item in items:
del item[0] del item[0]
subpatternappend(prefix) subpattern.append(prefix)
continue # check next one continue # check next one
break break
# check if the branch can be replaced by a character set # check if the branch can be replaced by a character set
set = []
for item in items: for item in items:
if len(item) != 1 or item[0][0] is not LITERAL: if len(item) != 1:
break
op, av = item[0]
if op is LITERAL:
set.append((op, av))
elif op is IN and av[0][0] is not NEGATE:
set.extend(av)
else:
break break
else: else:
# we can store this as a character set instead of a # we can store this as a character set instead of a
# branch (the compiler may optimize this even more) # branch (the compiler may optimize this even more)
subpatternappend((IN, [item[0] for item in items])) subpattern.append((IN, _uniq(set)))
return subpattern return subpattern
subpattern.append((BRANCH, (None, items))) subpattern.append((BRANCH, (None, items)))
return subpattern return subpattern
def _parse_sub_cond(source, state, condgroup, verbose):
    """Parse the body of a conditional backreference group.

    The "yes" branch is parsed with _parse().  If source.match("|")
    succeeds, a "no" branch is parsed the same way; otherwise the "no"
    branch is None.  A further "|" right after the "no" branch is
    reported through source.error().

    Returns a new SubPattern whose only item is
    (GROUPREF_EXISTS, (condgroup, yes-branch, no-branch)).
    """
    yes_branch = _parse(source, state, verbose)
    no_branch = None
    if source.match("|"):
        no_branch = _parse(source, state, verbose)
        # only "yes|no" is allowed; a second "|" would start a third branch
        if source.next == "|":
            raise source.error("conditional backref with more than two branches")
    result = SubPattern(state)
    result.append((GROUPREF_EXISTS, (condgroup, yes_branch, no_branch)))
    return result
def _parse(source, state, verbose, first=False): def _parse(source, state, verbose, first=False):
# parse a simple pattern # parse a simple pattern
subpattern = SubPattern(state) subpattern = SubPattern(state)
@ -511,16 +516,14 @@ def _parse(source, state, verbose, first=False):
setappend = set.append setappend = set.append
## if sourcematch(":"): ## if sourcematch(":"):
## pass # handle character classes ## pass # handle character classes
if sourcematch("^"): negate = sourcematch("^")
setappend((NEGATE, None))
# check remaining characters # check remaining characters
start = set[:]
while True: while True:
this = sourceget() this = sourceget()
if this is None: if this is None:
raise source.error("unterminated character set", raise source.error("unterminated character set",
source.tell() - here) source.tell() - here)
if this == "]" and set != start: if this == "]" and set:
break break
elif this[0] == "\\": elif this[0] == "\\":
code1 = _class_escape(source, this) code1 = _class_escape(source, this)
@ -556,13 +559,19 @@ def _parse(source, state, verbose, first=False):
code1 = code1[1][0] code1 = code1[1][0]
setappend(code1) setappend(code1)
set = _uniq(set)
# XXX: <fl> should move set optimization to compiler! # XXX: <fl> should move set optimization to compiler!
if _len(set)==1 and set[0][0] is LITERAL: if _len(set) == 1 and set[0][0] is LITERAL:
subpatternappend(set[0]) # optimization # optimization
elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: if negate:
subpatternappend((NOT_LITERAL, set[1][1])) # optimization subpatternappend((NOT_LITERAL, set[0][1]))
else:
subpatternappend(set[0])
else: else:
# XXX: <fl> should add charmap optimization here if negate:
set.insert(0, (NEGATE, None))
# charmap optimization can't be added here because
# global flags still are not known
subpatternappend((IN, set)) subpatternappend((IN, set))
elif this in REPEAT_CHARS: elif this in REPEAT_CHARS:
@ -579,6 +588,7 @@ def _parse(source, state, verbose, first=False):
if source.next == "}": if source.next == "}":
subpatternappend((LITERAL, _ord(this))) subpatternappend((LITERAL, _ord(this)))
continue continue
min, max = 0, MAXREPEAT min, max = 0, MAXREPEAT
lo = hi = "" lo = hi = ""
while source.next in DIGITS: while source.next in DIGITS:
@ -592,6 +602,7 @@ def _parse(source, state, verbose, first=False):
subpatternappend((LITERAL, _ord(this))) subpatternappend((LITERAL, _ord(this)))
source.seek(here) source.seek(here)
continue continue
if lo: if lo:
min = int(lo) min = int(lo)
if min >= MAXREPEAT: if min >= MAXREPEAT:
@ -610,12 +621,16 @@ def _parse(source, state, verbose, first=False):
item = subpattern[-1:] item = subpattern[-1:]
else: else:
item = None item = None
if not item or (_len(item) == 1 and item[0][0] is AT): if not item or item[0][0] is AT:
raise source.error("nothing to repeat", raise source.error("nothing to repeat",
source.tell() - here + len(this)) source.tell() - here + len(this))
if item[0][0] in _REPEATCODES: if item[0][0] in _REPEATCODES:
raise source.error("multiple repeat", raise source.error("multiple repeat",
source.tell() - here + len(this)) source.tell() - here + len(this))
if item[0][0] is SUBPATTERN:
group, add_flags, del_flags, p = item[0][1]
if group is None and not add_flags and not del_flags:
item = p
if sourcematch("?"): if sourcematch("?"):
subpattern[-1] = (MIN_REPEAT, (min, max, item)) subpattern[-1] = (MIN_REPEAT, (min, max, item))
else: else:
@ -628,7 +643,6 @@ def _parse(source, state, verbose, first=False):
start = source.tell() - 1 start = source.tell() - 1
group = True group = True
name = None name = None
condgroup = None
add_flags = 0 add_flags = 0
del_flags = 0 del_flags = 0
if sourcematch("?"): if sourcematch("?"):
@ -660,6 +674,7 @@ def _parse(source, state, verbose, first=False):
state.checklookbehindgroup(gid, source) state.checklookbehindgroup(gid, source)
subpatternappend((GROUPREF, gid)) subpatternappend((GROUPREF, gid))
continue continue
else: else:
char = sourceget() char = sourceget()
if char is None: if char is None:
@ -678,6 +693,7 @@ def _parse(source, state, verbose, first=False):
if sourceget() == ")": if sourceget() == ")":
break break
continue continue
elif char in "=!<": elif char in "=!<":
# lookahead assertions # lookahead assertions
dir = 1 dir = 1
@ -704,10 +720,10 @@ def _parse(source, state, verbose, first=False):
else: else:
subpatternappend((ASSERT_NOT, (dir, p))) subpatternappend((ASSERT_NOT, (dir, p)))
continue continue
elif char == "(": elif char == "(":
# conditional backreference group # conditional backreference group
condname = source.getuntil(")") condname = source.getuntil(")")
group = None
if condname.isidentifier(): if condname.isidentifier():
condgroup = state.groupdict.get(condname) condgroup = state.groupdict.get(condname)
if condgroup is None: if condgroup is None:
@ -728,6 +744,19 @@ def _parse(source, state, verbose, first=False):
msg = "invalid group reference %d" % condgroup msg = "invalid group reference %d" % condgroup
raise source.error(msg, len(condname) + 1) raise source.error(msg, len(condname) + 1)
state.checklookbehindgroup(condgroup, source) state.checklookbehindgroup(condgroup, source)
item_yes = _parse(source, state, verbose)
if source.match("|"):
item_no = _parse(source, state, verbose)
if source.next == "|":
raise source.error("conditional backref with more than two branches")
else:
item_no = None
if not source.match(")"):
raise source.error("missing ), unterminated subpattern",
source.tell() - start)
subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
continue
elif char in FLAGS or char == "-": elif char in FLAGS or char == "-":
# flags # flags
flags = _parse_flags(source, state, char) flags = _parse_flags(source, state, char)
@ -744,6 +773,7 @@ def _parse(source, state, verbose, first=False):
if (state.flags & SRE_FLAG_VERBOSE) and not verbose: if (state.flags & SRE_FLAG_VERBOSE) and not verbose:
raise Verbose raise Verbose
continue continue
add_flags, del_flags = flags add_flags, del_flags = flags
group = None group = None
else: else:
@ -756,12 +786,9 @@ def _parse(source, state, verbose, first=False):
group = state.opengroup(name) group = state.opengroup(name)
except error as err: except error as err:
raise source.error(err.msg, len(name) + 1) from None raise source.error(err.msg, len(name) + 1) from None
if condgroup: sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
p = _parse_sub_cond(source, state, condgroup, verbose) not (del_flags & SRE_FLAG_VERBOSE))
else: p = _parse_sub(source, state, sub_verbose)
sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
not (del_flags & SRE_FLAG_VERBOSE))
p = _parse_sub(source, state, sub_verbose)
if not source.match(")"): if not source.match(")"):
raise source.error("missing ), unterminated subpattern", raise source.error("missing ), unterminated subpattern",
source.tell() - start) source.tell() - start)
@ -773,11 +800,19 @@ def _parse(source, state, verbose, first=False):
subpatternappend((AT, AT_BEGINNING)) subpatternappend((AT, AT_BEGINNING))
elif this == "$": elif this == "$":
subpattern.append((AT, AT_END)) subpatternappend((AT, AT_END))
else: else:
raise AssertionError("unsupported special character %r" % (char,)) raise AssertionError("unsupported special character %r" % (char,))
# unpack non-capturing groups
for i in range(len(subpattern))[::-1]:
op, av = subpattern[i]
if op is SUBPATTERN:
group, add_flags, del_flags, p = av
if group is None and not add_flags and not del_flags:
subpattern[i: i+1] = p
return subpattern return subpattern
def _parse_flags(source, state, char): def _parse_flags(source, state, char):

View File

@ -1695,20 +1695,18 @@ class ReTests(unittest.TestCase):
dump = '''\ dump = '''\
SUBPATTERN 1 0 0 SUBPATTERN 1 0 0
LITERAL 46 LITERAL 46
SUBPATTERN None 0 0 BRANCH
BRANCH IN
IN LITERAL 99
LITERAL 99 LITERAL 104
LITERAL 104 OR
OR LITERAL 112
LITERAL 112 LITERAL 121
LITERAL 121 GROUPREF_EXISTS 1
SUBPATTERN None 0 0 AT AT_END
GROUPREF_EXISTS 1 ELSE
AT AT_END LITERAL 58
ELSE LITERAL 32
LITERAL 58
LITERAL 32
''' '''
self.assertEqual(out.getvalue(), dump) self.assertEqual(out.getvalue(), dump)
# Debug output is output again even a second time (bypassing # Debug output is output again even a second time (bypassing

View File

@ -326,6 +326,9 @@ Library
- bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is - bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is
running coroutine and the coroutine returned without any more ``await``. running coroutine and the coroutine returned without any more ``await``.
- bpo-30340: Enhanced regular expressions optimization. This increased
the performance of matching some patterns by up to 25 times.
- bpo-30298: Weaken the condition of deprecation warnings for inline modifiers. - bpo-30298: Weaken the condition of deprecation warnings for inline modifiers.
Now allowed several subsequential inline modifiers at the start of the Now allowed several subsequential inline modifiers at the start of the
pattern (e.g. ``'(?i)(?s)...'``). In verbose mode whitespaces and comments pattern (e.g. ``'(?i)(?s)...'``). In verbose mode whitespaces and comments