From 821a9d146bc04a1bc1a9807962990a1f59d692b8 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 14 May 2017 08:32:33 +0300 Subject: [PATCH] bpo-30340: Enhanced regular expressions optimization. (#1542) This increased the performance of matching some patterns up to 25 times. --- Lib/sre_compile.py | 15 ++++--- Lib/sre_parse.py | 105 +++++++++++++++++++++++++++++--------------- Lib/test/test_re.py | 26 +++++------ Misc/NEWS | 3 ++ 4 files changed, 95 insertions(+), 54 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index cebecb93c0a..aeb89bcc7b4 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -20,6 +20,7 @@ _LITERAL_CODES = {LITERAL, NOT_LITERAL} _REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT} _SUCCESS_CODES = {SUCCESS, FAILURE} _ASSERT_CODES = {ASSERT, ASSERT_NOT} +_UNIT_CODES = _LITERAL_CODES | {ANY, IN} # Sets of lowercase characters which have the same uppercase. _equivalences = ( @@ -125,7 +126,7 @@ def _compile(code, pattern, flags): elif op in REPEATING_CODES: if flags & SRE_FLAG_TEMPLATE: raise error("internal: unsupported template operator %r" % (op,)) - elif _simple(av) and op is not REPEAT: + if _simple(av[2]): if op is MAX_REPEAT: emit(REPEAT_ONE) else: @@ -404,10 +405,14 @@ def _bytes_to_codes(b): assert len(a) * a.itemsize == len(b) return a.tolist() -def _simple(av): - # check if av is a "simple" operator - lo, hi = av[2].getwidth() - return lo == hi == 1 and av[2][0][0] != SUBPATTERN +def _simple(p): + # check if this subpattern is a "simple" operator + if len(p) != 1: + return False + op, av = p[0] + if op is SUBPATTERN: + return av[0] is None and _simple(av[-1]) + return op in _UNIT_CODES def _generate_overlap_table(prefix): """ diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index d8d1bd552fb..f72408f010b 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -114,6 +114,7 @@ class SubPattern: data = [] self.data = data self.width = None + def dump(self, level=0): nl = True seqtypes = (tuple, list) @@ -404,6 +405,15 @@ def _escape(source, escape, state): pass raise source.error("bad escape %s" % escape, len(escape)) +def _uniq(items): + if len(set(items)) == len(items): + return items + newitems = [] + for item in items: + if item not in newitems: + newitems.append(item) + return newitems + def _parse_sub(source, state, verbose, nested=True): # parse an alternation: a|b|c @@ -420,7 +430,6 @@ def _parse_sub(source, state, verbose, nested=True): return items[0] subpattern = SubPattern(state) - subpatternappend = subpattern.append # check if all items share a common prefix while True: @@ -437,35 +446,31 @@ def _parse_sub(source, state, verbose, nested=True): # move it out of the branch for item in items: del item[0] - subpatternappend(prefix) + subpattern.append(prefix) continue # check next one break # check if the branch can be replaced by a character set + set = [] for item in items: - if len(item) != 1 or item[0][0] is not LITERAL: + if len(item) != 1: + break + op, av = item[0] + if op is LITERAL: + set.append((op, av)) + elif op is IN and av[0][0] is not NEGATE: + set.extend(av) + else: break else: # we can store this as a character set instead of a # branch (the compiler may optimize this even more) - subpatternappend((IN, [item[0] for item in items])) + subpattern.append((IN, _uniq(set))) return subpattern subpattern.append((BRANCH, (None, items))) return subpattern -def _parse_sub_cond(source, state, condgroup, verbose): - item_yes = _parse(source, state, verbose) - if source.match("|"): - item_no = _parse(source, state, verbose) - if source.next == "|": - raise source.error("conditional backref with more than two branches") - else: - item_no = None - subpattern = SubPattern(state) - subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) - return subpattern - def _parse(source, state, verbose, first=False): # parse a simple pattern subpattern = SubPattern(state) @@ -511,16 +516,14 @@ def _parse(source, state, verbose, first=False): setappend = set.append ## if sourcematch(":"): ## pass # handle character classes - if sourcematch("^"): - setappend((NEGATE, None)) + negate = sourcematch("^") # check remaining characters - start = set[:] while True: this = sourceget() if this is None: raise source.error("unterminated character set", source.tell() - here) - if this == "]" and set != start: + if this == "]" and set: break elif this[0] == "\\": code1 = _class_escape(source, this) @@ -556,13 +559,19 @@ def _parse(source, state, verbose, first=False): code1 = code1[1][0] setappend(code1) + set = _uniq(set) # XXX: should move set optimization to compiler! - if _len(set)==1 and set[0][0] is LITERAL: - subpatternappend(set[0]) # optimization - elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: - subpatternappend((NOT_LITERAL, set[1][1])) # optimization + if _len(set) == 1 and set[0][0] is LITERAL: + # optimization + if negate: + subpatternappend((NOT_LITERAL, set[0][1])) + else: + subpatternappend(set[0]) else: - # XXX: should add charmap optimization here + if negate: + set.insert(0, (NEGATE, None)) + # charmap optimization can't be added here because + # global flags still are not known subpatternappend((IN, set)) elif this in REPEAT_CHARS: @@ -579,6 +588,7 @@ def _parse(source, state, verbose, first=False): if source.next == "}": subpatternappend((LITERAL, _ord(this))) continue + min, max = 0, MAXREPEAT lo = hi = "" while source.next in DIGITS: @@ -592,6 +602,7 @@ def _parse(source, state, verbose, first=False): subpatternappend((LITERAL, _ord(this))) source.seek(here) continue + if lo: min = int(lo) if min >= MAXREPEAT: @@ -610,12 +621,16 @@ def _parse(source, state, verbose, first=False): item = subpattern[-1:] else: item = None - if not item or (_len(item) == 1 and item[0][0] is AT): + if not item or item[0][0] is AT: raise source.error("nothing to repeat", source.tell() - here + len(this)) if item[0][0] in _REPEATCODES: raise source.error("multiple repeat", source.tell() - here + len(this)) + if item[0][0] is SUBPATTERN: + group, add_flags, del_flags, p = item[0][1] + if group is None and not add_flags and not del_flags: + item = p if sourcematch("?"): subpattern[-1] = (MIN_REPEAT, (min, max, item)) else: @@ -628,7 +643,6 @@ def _parse(source, state, verbose, first=False): start = source.tell() - 1 group = True name = None - condgroup = None add_flags = 0 del_flags = 0 if sourcematch("?"): @@ -660,6 +674,7 @@ def _parse(source, state, verbose, first=False): state.checklookbehindgroup(gid, source) subpatternappend((GROUPREF, gid)) continue + else: char = sourceget() if char is None: @@ -678,6 +693,7 @@ def _parse(source, state, verbose, first=False): if sourceget() == ")": break continue + elif char in "=!<": # lookahead assertions dir = 1 @@ -704,10 +720,10 @@ def _parse(source, state, verbose, first=False): else: subpatternappend((ASSERT_NOT, (dir, p))) continue + elif char == "(": # conditional backreference group condname = source.getuntil(")") - group = None if condname.isidentifier(): condgroup = state.groupdict.get(condname) if condgroup is None: @@ -728,6 +744,19 @@ def _parse(source, state, verbose, first=False): msg = "invalid group reference %d" % condgroup raise source.error(msg, len(condname) + 1) state.checklookbehindgroup(condgroup, source) + item_yes = _parse(source, state, verbose) + if source.match("|"): + item_no = _parse(source, state, verbose) + if source.next == "|": + raise source.error("conditional backref with more than two branches") + else: + item_no = None + if not source.match(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) + subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) + continue + elif char in FLAGS or char == "-": # flags flags = _parse_flags(source, state, char) @@ -744,6 +773,7 @@ def _parse(source, state, verbose, first=False): if (state.flags & SRE_FLAG_VERBOSE) and not verbose: raise Verbose continue + add_flags, del_flags = flags group = None else: @@ -756,12 +786,9 @@ def _parse(source, state, verbose, first=False): group = state.opengroup(name) except error as err: raise source.error(err.msg, len(name) + 1) from None - if condgroup: - p = _parse_sub_cond(source, state, condgroup, verbose) - else: - sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and - not (del_flags & SRE_FLAG_VERBOSE)) - p = _parse_sub(source, state, sub_verbose) + sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and + not (del_flags & SRE_FLAG_VERBOSE)) + p = _parse_sub(source, state, sub_verbose) if not source.match(")"): raise source.error("missing ), unterminated subpattern", source.tell() - start) @@ -773,11 +800,19 @@ def _parse(source, state, verbose, first=False): subpatternappend((AT, AT_BEGINNING)) elif this == "$": - subpattern.append((AT, AT_END)) + subpatternappend((AT, AT_END)) else: raise AssertionError("unsupported special character %r" % (char,)) + # unpack non-capturing groups + for i in range(len(subpattern))[::-1]: + op, av = subpattern[i] + if op is SUBPATTERN: + group, add_flags, del_flags, p = av + if group is None and not add_flags and not del_flags: + subpattern[i: i+1] = p + return subpattern def _parse_flags(source, state, char): diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 4d71eea517e..5d36b54680d 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1695,20 +1695,18 @@ class ReTests(unittest.TestCase): dump = '''\ SUBPATTERN 1 0 0 LITERAL 46 -SUBPATTERN None 0 0 - BRANCH - IN - LITERAL 99 - LITERAL 104 - OR - LITERAL 112 - LITERAL 121 -SUBPATTERN None 0 0 - GROUPREF_EXISTS 1 - AT AT_END - ELSE - LITERAL 58 - LITERAL 32 +BRANCH + IN + LITERAL 99 + LITERAL 104 +OR + LITERAL 112 + LITERAL 121 +GROUPREF_EXISTS 1 + AT AT_END +ELSE + LITERAL 58 + LITERAL 32 ''' self.assertEqual(out.getvalue(), dump) # Debug output is output again even a second time (bypassing diff --git a/Misc/NEWS b/Misc/NEWS index 73cd82c8b6b..e6b4ced1a26 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -326,6 +326,9 @@ Library - bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is running coroutine and the coroutine returned without any more ``await``. +- bpo-30340: Enhanced regular expressions optimization. This increased + the performance of matching some patterns up to 25 times. + - bpo-30298: Weaken the condition of deprecation warnings for inline modifiers. Now allowed several subsequential inline modifiers at the start of the pattern (e.g. ``'(?i)(?s)...'``). In verbose mode whitespaces and comments