mirror of https://github.com/python/cpython
bpo-30340: Enhanced regular expressions optimization. (#1542)
This increased the performance of matching some patterns by up to 25 times.
This commit is contained in:
parent
cbddf58c79
commit
821a9d146b
|
@ -20,6 +20,7 @@ _LITERAL_CODES = {LITERAL, NOT_LITERAL}
|
||||||
_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
|
_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
|
||||||
_SUCCESS_CODES = {SUCCESS, FAILURE}
|
_SUCCESS_CODES = {SUCCESS, FAILURE}
|
||||||
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
|
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
|
||||||
|
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
|
||||||
|
|
||||||
# Sets of lowercase characters which have the same uppercase.
|
# Sets of lowercase characters which have the same uppercase.
|
||||||
_equivalences = (
|
_equivalences = (
|
||||||
|
@ -125,7 +126,7 @@ def _compile(code, pattern, flags):
|
||||||
elif op in REPEATING_CODES:
|
elif op in REPEATING_CODES:
|
||||||
if flags & SRE_FLAG_TEMPLATE:
|
if flags & SRE_FLAG_TEMPLATE:
|
||||||
raise error("internal: unsupported template operator %r" % (op,))
|
raise error("internal: unsupported template operator %r" % (op,))
|
||||||
elif _simple(av) and op is not REPEAT:
|
if _simple(av[2]):
|
||||||
if op is MAX_REPEAT:
|
if op is MAX_REPEAT:
|
||||||
emit(REPEAT_ONE)
|
emit(REPEAT_ONE)
|
||||||
else:
|
else:
|
||||||
|
@ -404,10 +405,14 @@ def _bytes_to_codes(b):
|
||||||
assert len(a) * a.itemsize == len(b)
|
assert len(a) * a.itemsize == len(b)
|
||||||
return a.tolist()
|
return a.tolist()
|
||||||
|
|
||||||
def _simple(av):
|
def _simple(p):
|
||||||
# check if av is a "simple" operator
|
# check if this subpattern is a "simple" operator
|
||||||
lo, hi = av[2].getwidth()
|
if len(p) != 1:
|
||||||
return lo == hi == 1 and av[2][0][0] != SUBPATTERN
|
return False
|
||||||
|
op, av = p[0]
|
||||||
|
if op is SUBPATTERN:
|
||||||
|
return av[0] is None and _simple(av[-1])
|
||||||
|
return op in _UNIT_CODES
|
||||||
|
|
||||||
def _generate_overlap_table(prefix):
|
def _generate_overlap_table(prefix):
|
||||||
"""
|
"""
|
||||||
|
|
105
Lib/sre_parse.py
105
Lib/sre_parse.py
|
@ -114,6 +114,7 @@ class SubPattern:
|
||||||
data = []
|
data = []
|
||||||
self.data = data
|
self.data = data
|
||||||
self.width = None
|
self.width = None
|
||||||
|
|
||||||
def dump(self, level=0):
|
def dump(self, level=0):
|
||||||
nl = True
|
nl = True
|
||||||
seqtypes = (tuple, list)
|
seqtypes = (tuple, list)
|
||||||
|
@ -404,6 +405,15 @@ def _escape(source, escape, state):
|
||||||
pass
|
pass
|
||||||
raise source.error("bad escape %s" % escape, len(escape))
|
raise source.error("bad escape %s" % escape, len(escape))
|
||||||
|
|
||||||
|
def _uniq(items):
|
||||||
|
if len(set(items)) == len(items):
|
||||||
|
return items
|
||||||
|
newitems = []
|
||||||
|
for item in items:
|
||||||
|
if item not in newitems:
|
||||||
|
newitems.append(item)
|
||||||
|
return newitems
|
||||||
|
|
||||||
def _parse_sub(source, state, verbose, nested=True):
|
def _parse_sub(source, state, verbose, nested=True):
|
||||||
# parse an alternation: a|b|c
|
# parse an alternation: a|b|c
|
||||||
|
|
||||||
|
@ -420,7 +430,6 @@ def _parse_sub(source, state, verbose, nested=True):
|
||||||
return items[0]
|
return items[0]
|
||||||
|
|
||||||
subpattern = SubPattern(state)
|
subpattern = SubPattern(state)
|
||||||
subpatternappend = subpattern.append
|
|
||||||
|
|
||||||
# check if all items share a common prefix
|
# check if all items share a common prefix
|
||||||
while True:
|
while True:
|
||||||
|
@ -437,35 +446,31 @@ def _parse_sub(source, state, verbose, nested=True):
|
||||||
# move it out of the branch
|
# move it out of the branch
|
||||||
for item in items:
|
for item in items:
|
||||||
del item[0]
|
del item[0]
|
||||||
subpatternappend(prefix)
|
subpattern.append(prefix)
|
||||||
continue # check next one
|
continue # check next one
|
||||||
break
|
break
|
||||||
|
|
||||||
# check if the branch can be replaced by a character set
|
# check if the branch can be replaced by a character set
|
||||||
|
set = []
|
||||||
for item in items:
|
for item in items:
|
||||||
if len(item) != 1 or item[0][0] is not LITERAL:
|
if len(item) != 1:
|
||||||
|
break
|
||||||
|
op, av = item[0]
|
||||||
|
if op is LITERAL:
|
||||||
|
set.append((op, av))
|
||||||
|
elif op is IN and av[0][0] is not NEGATE:
|
||||||
|
set.extend(av)
|
||||||
|
else:
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
# we can store this as a character set instead of a
|
# we can store this as a character set instead of a
|
||||||
# branch (the compiler may optimize this even more)
|
# branch (the compiler may optimize this even more)
|
||||||
subpatternappend((IN, [item[0] for item in items]))
|
subpattern.append((IN, _uniq(set)))
|
||||||
return subpattern
|
return subpattern
|
||||||
|
|
||||||
subpattern.append((BRANCH, (None, items)))
|
subpattern.append((BRANCH, (None, items)))
|
||||||
return subpattern
|
return subpattern
|
||||||
|
|
||||||
def _parse_sub_cond(source, state, condgroup, verbose):
|
|
||||||
item_yes = _parse(source, state, verbose)
|
|
||||||
if source.match("|"):
|
|
||||||
item_no = _parse(source, state, verbose)
|
|
||||||
if source.next == "|":
|
|
||||||
raise source.error("conditional backref with more than two branches")
|
|
||||||
else:
|
|
||||||
item_no = None
|
|
||||||
subpattern = SubPattern(state)
|
|
||||||
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
|
|
||||||
return subpattern
|
|
||||||
|
|
||||||
def _parse(source, state, verbose, first=False):
|
def _parse(source, state, verbose, first=False):
|
||||||
# parse a simple pattern
|
# parse a simple pattern
|
||||||
subpattern = SubPattern(state)
|
subpattern = SubPattern(state)
|
||||||
|
@ -511,16 +516,14 @@ def _parse(source, state, verbose, first=False):
|
||||||
setappend = set.append
|
setappend = set.append
|
||||||
## if sourcematch(":"):
|
## if sourcematch(":"):
|
||||||
## pass # handle character classes
|
## pass # handle character classes
|
||||||
if sourcematch("^"):
|
negate = sourcematch("^")
|
||||||
setappend((NEGATE, None))
|
|
||||||
# check remaining characters
|
# check remaining characters
|
||||||
start = set[:]
|
|
||||||
while True:
|
while True:
|
||||||
this = sourceget()
|
this = sourceget()
|
||||||
if this is None:
|
if this is None:
|
||||||
raise source.error("unterminated character set",
|
raise source.error("unterminated character set",
|
||||||
source.tell() - here)
|
source.tell() - here)
|
||||||
if this == "]" and set != start:
|
if this == "]" and set:
|
||||||
break
|
break
|
||||||
elif this[0] == "\\":
|
elif this[0] == "\\":
|
||||||
code1 = _class_escape(source, this)
|
code1 = _class_escape(source, this)
|
||||||
|
@ -556,13 +559,19 @@ def _parse(source, state, verbose, first=False):
|
||||||
code1 = code1[1][0]
|
code1 = code1[1][0]
|
||||||
setappend(code1)
|
setappend(code1)
|
||||||
|
|
||||||
|
set = _uniq(set)
|
||||||
# XXX: <fl> should move set optimization to compiler!
|
# XXX: <fl> should move set optimization to compiler!
|
||||||
if _len(set)==1 and set[0][0] is LITERAL:
|
if _len(set) == 1 and set[0][0] is LITERAL:
|
||||||
subpatternappend(set[0]) # optimization
|
# optimization
|
||||||
elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
|
if negate:
|
||||||
subpatternappend((NOT_LITERAL, set[1][1])) # optimization
|
subpatternappend((NOT_LITERAL, set[0][1]))
|
||||||
|
else:
|
||||||
|
subpatternappend(set[0])
|
||||||
else:
|
else:
|
||||||
# XXX: <fl> should add charmap optimization here
|
if negate:
|
||||||
|
set.insert(0, (NEGATE, None))
|
||||||
|
# charmap optimization can't be added here because
|
||||||
|
# global flags still are not known
|
||||||
subpatternappend((IN, set))
|
subpatternappend((IN, set))
|
||||||
|
|
||||||
elif this in REPEAT_CHARS:
|
elif this in REPEAT_CHARS:
|
||||||
|
@ -579,6 +588,7 @@ def _parse(source, state, verbose, first=False):
|
||||||
if source.next == "}":
|
if source.next == "}":
|
||||||
subpatternappend((LITERAL, _ord(this)))
|
subpatternappend((LITERAL, _ord(this)))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
min, max = 0, MAXREPEAT
|
min, max = 0, MAXREPEAT
|
||||||
lo = hi = ""
|
lo = hi = ""
|
||||||
while source.next in DIGITS:
|
while source.next in DIGITS:
|
||||||
|
@ -592,6 +602,7 @@ def _parse(source, state, verbose, first=False):
|
||||||
subpatternappend((LITERAL, _ord(this)))
|
subpatternappend((LITERAL, _ord(this)))
|
||||||
source.seek(here)
|
source.seek(here)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if lo:
|
if lo:
|
||||||
min = int(lo)
|
min = int(lo)
|
||||||
if min >= MAXREPEAT:
|
if min >= MAXREPEAT:
|
||||||
|
@ -610,12 +621,16 @@ def _parse(source, state, verbose, first=False):
|
||||||
item = subpattern[-1:]
|
item = subpattern[-1:]
|
||||||
else:
|
else:
|
||||||
item = None
|
item = None
|
||||||
if not item or (_len(item) == 1 and item[0][0] is AT):
|
if not item or item[0][0] is AT:
|
||||||
raise source.error("nothing to repeat",
|
raise source.error("nothing to repeat",
|
||||||
source.tell() - here + len(this))
|
source.tell() - here + len(this))
|
||||||
if item[0][0] in _REPEATCODES:
|
if item[0][0] in _REPEATCODES:
|
||||||
raise source.error("multiple repeat",
|
raise source.error("multiple repeat",
|
||||||
source.tell() - here + len(this))
|
source.tell() - here + len(this))
|
||||||
|
if item[0][0] is SUBPATTERN:
|
||||||
|
group, add_flags, del_flags, p = item[0][1]
|
||||||
|
if group is None and not add_flags and not del_flags:
|
||||||
|
item = p
|
||||||
if sourcematch("?"):
|
if sourcematch("?"):
|
||||||
subpattern[-1] = (MIN_REPEAT, (min, max, item))
|
subpattern[-1] = (MIN_REPEAT, (min, max, item))
|
||||||
else:
|
else:
|
||||||
|
@ -628,7 +643,6 @@ def _parse(source, state, verbose, first=False):
|
||||||
start = source.tell() - 1
|
start = source.tell() - 1
|
||||||
group = True
|
group = True
|
||||||
name = None
|
name = None
|
||||||
condgroup = None
|
|
||||||
add_flags = 0
|
add_flags = 0
|
||||||
del_flags = 0
|
del_flags = 0
|
||||||
if sourcematch("?"):
|
if sourcematch("?"):
|
||||||
|
@ -660,6 +674,7 @@ def _parse(source, state, verbose, first=False):
|
||||||
state.checklookbehindgroup(gid, source)
|
state.checklookbehindgroup(gid, source)
|
||||||
subpatternappend((GROUPREF, gid))
|
subpatternappend((GROUPREF, gid))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
else:
|
else:
|
||||||
char = sourceget()
|
char = sourceget()
|
||||||
if char is None:
|
if char is None:
|
||||||
|
@ -678,6 +693,7 @@ def _parse(source, state, verbose, first=False):
|
||||||
if sourceget() == ")":
|
if sourceget() == ")":
|
||||||
break
|
break
|
||||||
continue
|
continue
|
||||||
|
|
||||||
elif char in "=!<":
|
elif char in "=!<":
|
||||||
# lookahead assertions
|
# lookahead assertions
|
||||||
dir = 1
|
dir = 1
|
||||||
|
@ -704,10 +720,10 @@ def _parse(source, state, verbose, first=False):
|
||||||
else:
|
else:
|
||||||
subpatternappend((ASSERT_NOT, (dir, p)))
|
subpatternappend((ASSERT_NOT, (dir, p)))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
elif char == "(":
|
elif char == "(":
|
||||||
# conditional backreference group
|
# conditional backreference group
|
||||||
condname = source.getuntil(")")
|
condname = source.getuntil(")")
|
||||||
group = None
|
|
||||||
if condname.isidentifier():
|
if condname.isidentifier():
|
||||||
condgroup = state.groupdict.get(condname)
|
condgroup = state.groupdict.get(condname)
|
||||||
if condgroup is None:
|
if condgroup is None:
|
||||||
|
@ -728,6 +744,19 @@ def _parse(source, state, verbose, first=False):
|
||||||
msg = "invalid group reference %d" % condgroup
|
msg = "invalid group reference %d" % condgroup
|
||||||
raise source.error(msg, len(condname) + 1)
|
raise source.error(msg, len(condname) + 1)
|
||||||
state.checklookbehindgroup(condgroup, source)
|
state.checklookbehindgroup(condgroup, source)
|
||||||
|
item_yes = _parse(source, state, verbose)
|
||||||
|
if source.match("|"):
|
||||||
|
item_no = _parse(source, state, verbose)
|
||||||
|
if source.next == "|":
|
||||||
|
raise source.error("conditional backref with more than two branches")
|
||||||
|
else:
|
||||||
|
item_no = None
|
||||||
|
if not source.match(")"):
|
||||||
|
raise source.error("missing ), unterminated subpattern",
|
||||||
|
source.tell() - start)
|
||||||
|
subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
|
||||||
|
continue
|
||||||
|
|
||||||
elif char in FLAGS or char == "-":
|
elif char in FLAGS or char == "-":
|
||||||
# flags
|
# flags
|
||||||
flags = _parse_flags(source, state, char)
|
flags = _parse_flags(source, state, char)
|
||||||
|
@ -744,6 +773,7 @@ def _parse(source, state, verbose, first=False):
|
||||||
if (state.flags & SRE_FLAG_VERBOSE) and not verbose:
|
if (state.flags & SRE_FLAG_VERBOSE) and not verbose:
|
||||||
raise Verbose
|
raise Verbose
|
||||||
continue
|
continue
|
||||||
|
|
||||||
add_flags, del_flags = flags
|
add_flags, del_flags = flags
|
||||||
group = None
|
group = None
|
||||||
else:
|
else:
|
||||||
|
@ -756,12 +786,9 @@ def _parse(source, state, verbose, first=False):
|
||||||
group = state.opengroup(name)
|
group = state.opengroup(name)
|
||||||
except error as err:
|
except error as err:
|
||||||
raise source.error(err.msg, len(name) + 1) from None
|
raise source.error(err.msg, len(name) + 1) from None
|
||||||
if condgroup:
|
sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
|
||||||
p = _parse_sub_cond(source, state, condgroup, verbose)
|
not (del_flags & SRE_FLAG_VERBOSE))
|
||||||
else:
|
p = _parse_sub(source, state, sub_verbose)
|
||||||
sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
|
|
||||||
not (del_flags & SRE_FLAG_VERBOSE))
|
|
||||||
p = _parse_sub(source, state, sub_verbose)
|
|
||||||
if not source.match(")"):
|
if not source.match(")"):
|
||||||
raise source.error("missing ), unterminated subpattern",
|
raise source.error("missing ), unterminated subpattern",
|
||||||
source.tell() - start)
|
source.tell() - start)
|
||||||
|
@ -773,11 +800,19 @@ def _parse(source, state, verbose, first=False):
|
||||||
subpatternappend((AT, AT_BEGINNING))
|
subpatternappend((AT, AT_BEGINNING))
|
||||||
|
|
||||||
elif this == "$":
|
elif this == "$":
|
||||||
subpattern.append((AT, AT_END))
|
subpatternappend((AT, AT_END))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise AssertionError("unsupported special character %r" % (char,))
|
raise AssertionError("unsupported special character %r" % (char,))
|
||||||
|
|
||||||
|
# unpack non-capturing groups
|
||||||
|
for i in range(len(subpattern))[::-1]:
|
||||||
|
op, av = subpattern[i]
|
||||||
|
if op is SUBPATTERN:
|
||||||
|
group, add_flags, del_flags, p = av
|
||||||
|
if group is None and not add_flags and not del_flags:
|
||||||
|
subpattern[i: i+1] = p
|
||||||
|
|
||||||
return subpattern
|
return subpattern
|
||||||
|
|
||||||
def _parse_flags(source, state, char):
|
def _parse_flags(source, state, char):
|
||||||
|
|
|
@ -1695,20 +1695,18 @@ class ReTests(unittest.TestCase):
|
||||||
dump = '''\
|
dump = '''\
|
||||||
SUBPATTERN 1 0 0
|
SUBPATTERN 1 0 0
|
||||||
LITERAL 46
|
LITERAL 46
|
||||||
SUBPATTERN None 0 0
|
BRANCH
|
||||||
BRANCH
|
IN
|
||||||
IN
|
LITERAL 99
|
||||||
LITERAL 99
|
LITERAL 104
|
||||||
LITERAL 104
|
OR
|
||||||
OR
|
LITERAL 112
|
||||||
LITERAL 112
|
LITERAL 121
|
||||||
LITERAL 121
|
GROUPREF_EXISTS 1
|
||||||
SUBPATTERN None 0 0
|
AT AT_END
|
||||||
GROUPREF_EXISTS 1
|
ELSE
|
||||||
AT AT_END
|
LITERAL 58
|
||||||
ELSE
|
LITERAL 32
|
||||||
LITERAL 58
|
|
||||||
LITERAL 32
|
|
||||||
'''
|
'''
|
||||||
self.assertEqual(out.getvalue(), dump)
|
self.assertEqual(out.getvalue(), dump)
|
||||||
# Debug output is output again even a second time (bypassing
|
# Debug output is output again even a second time (bypassing
|
||||||
|
|
|
@ -326,6 +326,9 @@ Library
|
||||||
- bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is
|
- bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is
|
||||||
running coroutine and the coroutine returned without any more ``await``.
|
running coroutine and the coroutine returned without any more ``await``.
|
||||||
|
|
||||||
|
- bpo-30340: Enhanced regular expressions optimization. This increased
|
||||||
|
the performance of matching some patterns by up to 25 times.
|
||||||
|
|
||||||
- bpo-30298: Weaken the condition of deprecation warnings for inline modifiers.
|
- bpo-30298: Weaken the condition of deprecation warnings for inline modifiers.
|
||||||
Now allowed several subsequential inline modifiers at the start of the
|
Now allowed several subsequential inline modifiers at the start of the
|
||||||
pattern (e.g. ``'(?i)(?s)...'``). In verbose mode whitespaces and comments
|
pattern (e.g. ``'(?i)(?s)...'``). In verbose mode whitespaces and comments
|
||||||
|
|
Loading…
Reference in New Issue