bpo-30340: Enhanced regular expressions optimization. (#1542)

This increased the performance of matching some patterns up to 25 times.
2017-05-14 08:32:33 +03:00 · 2017-05-14 08:32:33 +03:00 · 821a9d146b
parent cbddf58c79
commit 821a9d146b
4 changed files with 95 additions and 54 deletions
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@ -20,6 +20,7 @@ _LITERAL_CODES = {LITERAL, NOT_LITERAL}
 _REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
 _SUCCESS_CODES = {SUCCESS, FAILURE}
 _ASSERT_CODES = {ASSERT, ASSERT_NOT}
+_UNIT_CODES = _LITERAL_CODES | {ANY, IN}

 # Sets of lowercase characters which have the same uppercase.
 _equivalences = (
@ -125,7 +126,7 @@ def _compile(code, pattern, flags):
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error("internal: unsupported template operator %r" % (op,))
-            elif _simple(av) and op is not REPEAT:
+            if _simple(av[2]):
                if op is MAX_REPEAT:
                    emit(REPEAT_ONE)
                else:
@ -404,10 +405,14 @@ def _bytes_to_codes(b):
    assert len(a) * a.itemsize == len(b)
    return a.tolist()

-def _simple(av):
-    # check if av is a "simple" operator
-    lo, hi = av[2].getwidth()
-    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
+def _simple(p):
+    # check if this subpattern is a "simple" operator
+    if len(p) != 1:
+        return False
+    op, av = p[0]
+    if op is SUBPATTERN:
+        return av[0] is None and _simple(av[-1])
+    return op in _UNIT_CODES

 def _generate_overlap_table(prefix):
    """
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@ -114,6 +114,7 @@ class SubPattern:
            data = []
        self.data = data
        self.width = None
+
    def dump(self, level=0):
        nl = True
        seqtypes = (tuple, list)
@ -404,6 +405,15 @@ def _escape(source, escape, state):
        pass
    raise source.error("bad escape %s" % escape, len(escape))

+def _uniq(items):
+    if len(set(items)) == len(items):
+        return items
+    newitems = []
+    for item in items:
+        if item not in newitems:
+            newitems.append(item)
+    return newitems
+
 def _parse_sub(source, state, verbose, nested=True):
    # parse an alternation: a|b|c

@ -420,7 +430,6 @@ def _parse_sub(source, state, verbose, nested=True):
        return items[0]

    subpattern = SubPattern(state)
-    subpatternappend = subpattern.append

    # check if all items share a common prefix
    while True:
@ -437,35 +446,31 @@ def _parse_sub(source, state, verbose, nested=True):
            # move it out of the branch
            for item in items:
                del item[0]
-            subpatternappend(prefix)
+            subpattern.append(prefix)
            continue # check next one
        break

    # check if the branch can be replaced by a character set
+    set = []
    for item in items:
-        if len(item) != 1 or item[0][0] is not LITERAL:
+        if len(item) != 1:
+            break
+        op, av = item[0]
+        if op is LITERAL:
+            set.append((op, av))
+        elif op is IN and av[0][0] is not NEGATE:
+            set.extend(av)
+        else:
            break
    else:
        # we can store this as a character set instead of a
        # branch (the compiler may optimize this even more)
-        subpatternappend((IN, [item[0] for item in items]))
+        subpattern.append((IN, _uniq(set)))
        return subpattern

    subpattern.append((BRANCH, (None, items)))
    return subpattern

-def _parse_sub_cond(source, state, condgroup, verbose):
-    item_yes = _parse(source, state, verbose)
-    if source.match("|"):
-        item_no = _parse(source, state, verbose)
-        if source.next == "|":
-            raise source.error("conditional backref with more than two branches")
-    else:
-        item_no = None
-    subpattern = SubPattern(state)
-    subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
-    return subpattern
-
 def _parse(source, state, verbose, first=False):
    # parse a simple pattern
    subpattern = SubPattern(state)
@ -511,16 +516,14 @@ def _parse(source, state, verbose, first=False):
            setappend = set.append
 ##          if sourcematch(":"):
 ##              pass # handle character classes
-            if sourcematch("^"):
-                setappend((NEGATE, None))
+            negate = sourcematch("^")
            # check remaining characters
-            start = set[:]
            while True:
                this = sourceget()
                if this is None:
                    raise source.error("unterminated character set",
                                       source.tell() - here)
-                if this == "]" and set != start:
+                if this == "]" and set:
                    break
                elif this[0] == "\\":
                    code1 = _class_escape(source, this)
@ -556,13 +559,19 @@ def _parse(source, state, verbose, first=False):
                        code1 = code1[1][0]
                    setappend(code1)

+            set = _uniq(set)
            # XXX: <fl> should move set optimization to compiler!
-            if _len(set)==1 and set[0][0] is LITERAL:
-                subpatternappend(set[0]) # optimization
-            elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
-                subpatternappend((NOT_LITERAL, set[1][1])) # optimization
+            if _len(set) == 1 and set[0][0] is LITERAL:
+                # optimization
+                if negate:
+                    subpatternappend((NOT_LITERAL, set[0][1]))
+                else:
+                    subpatternappend(set[0])
            else:
-                # XXX: <fl> should add charmap optimization here
+                if negate:
+                    set.insert(0, (NEGATE, None))
+                # charmap optimization can't be added here because
+                # global flags still are not known
                subpatternappend((IN, set))

        elif this in REPEAT_CHARS:
@ -579,6 +588,7 @@ def _parse(source, state, verbose, first=False):
                if source.next == "}":
                    subpatternappend((LITERAL, _ord(this)))
                    continue
+
                min, max = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
@ -592,6 +602,7 @@ def _parse(source, state, verbose, first=False):
                    subpatternappend((LITERAL, _ord(this)))
                    source.seek(here)
                    continue
+
                if lo:
                    min = int(lo)
                    if min >= MAXREPEAT:
@ -610,12 +621,16 @@ def _parse(source, state, verbose, first=False):
                item = subpattern[-1:]
            else:
                item = None
-            if not item or (_len(item) == 1 and item[0][0] is AT):
+            if not item or item[0][0] is AT:
                raise source.error("nothing to repeat",
                                   source.tell() - here + len(this))
            if item[0][0] in _REPEATCODES:
                raise source.error("multiple repeat",
                                   source.tell() - here + len(this))
+            if item[0][0] is SUBPATTERN:
+                group, add_flags, del_flags, p = item[0][1]
+                if group is None and not add_flags and not del_flags:
+                    item = p
            if sourcematch("?"):
                subpattern[-1] = (MIN_REPEAT, (min, max, item))
            else:
@ -628,7 +643,6 @@ def _parse(source, state, verbose, first=False):
            start = source.tell() - 1
            group = True
            name = None
-            condgroup = None
            add_flags = 0
            del_flags = 0
            if sourcematch("?"):
@ -660,6 +674,7 @@ def _parse(source, state, verbose, first=False):
                        state.checklookbehindgroup(gid, source)
                        subpatternappend((GROUPREF, gid))
                        continue
+
                    else:
                        char = sourceget()
                        if char is None:
@ -678,6 +693,7 @@ def _parse(source, state, verbose, first=False):
                        if sourceget() == ")":
                            break
                    continue
+
                elif char in "=!<":
                    # lookahead assertions
                    dir = 1
@ -704,10 +720,10 @@ def _parse(source, state, verbose, first=False):
                    else:
                        subpatternappend((ASSERT_NOT, (dir, p)))
                    continue
+
                elif char == "(":
                    # conditional backreference group
                    condname = source.getuntil(")")
-                    group = None
                    if condname.isidentifier():
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
@ -728,6 +744,19 @@ def _parse(source, state, verbose, first=False):
                            msg = "invalid group reference %d" % condgroup
                            raise source.error(msg, len(condname) + 1)
                    state.checklookbehindgroup(condgroup, source)
+                    item_yes = _parse(source, state, verbose)
+                    if source.match("|"):
+                        item_no = _parse(source, state, verbose)
+                        if source.next == "|":
+                            raise source.error("conditional backref with more than two branches")
+                    else:
+                        item_no = None
+                    if not source.match(")"):
+                        raise source.error("missing ), unterminated subpattern",
+                                           source.tell() - start)
+                    subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
+                    continue
+
                elif char in FLAGS or char == "-":
                    # flags
                    flags = _parse_flags(source, state, char)
@ -744,6 +773,7 @@ def _parse(source, state, verbose, first=False):
                        if (state.flags & SRE_FLAG_VERBOSE) and not verbose:
                            raise Verbose
                        continue
+
                    add_flags, del_flags = flags
                    group = None
                else:
@ -756,12 +786,9 @@ def _parse(source, state, verbose, first=False):
                    group = state.opengroup(name)
                except error as err:
                    raise source.error(err.msg, len(name) + 1) from None
-            if condgroup:
-                p = _parse_sub_cond(source, state, condgroup, verbose)
-            else:
-                sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
-                               not (del_flags & SRE_FLAG_VERBOSE))
-                p = _parse_sub(source, state, sub_verbose)
+            sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
+                           not (del_flags & SRE_FLAG_VERBOSE))
+            p = _parse_sub(source, state, sub_verbose)
            if not source.match(")"):
                raise source.error("missing ), unterminated subpattern",
                                   source.tell() - start)
@ -773,11 +800,19 @@ def _parse(source, state, verbose, first=False):
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
-            subpattern.append((AT, AT_END))
+            subpatternappend((AT, AT_END))

        else:
            raise AssertionError("unsupported special character %r" % (char,))

+    # unpack non-capturing groups
+    for i in range(len(subpattern))[::-1]:
+        op, av = subpattern[i]
+        if op is SUBPATTERN:
+            group, add_flags, del_flags, p = av
+            if group is None and not add_flags and not del_flags:
+                subpattern[i: i+1] = p
+
    return subpattern

 def _parse_flags(source, state, char):
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@ -1695,20 +1695,18 @@ class ReTests(unittest.TestCase):
        dump = '''\
 SUBPATTERN 1 0 0
  LITERAL 46
-SUBPATTERN None 0 0
-  BRANCH
-    IN
-      LITERAL 99
-      LITERAL 104
-  OR
-    LITERAL 112
-    LITERAL 121
-SUBPATTERN None 0 0
-  GROUPREF_EXISTS 1
-    AT AT_END
-  ELSE
-    LITERAL 58
-    LITERAL 32
+BRANCH
+  IN
+    LITERAL 99
+    LITERAL 104
+OR
+  LITERAL 112
+  LITERAL 121
+GROUPREF_EXISTS 1
+  AT AT_END
+ELSE
+  LITERAL 58
+  LITERAL 32
 '''
        self.assertEqual(out.getvalue(), dump)
        # Debug output is output again even a second time (bypassing
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -326,6 +326,9 @@ Library
 - bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is
  running coroutine and the coroutine returned without any more ``await``.

+- bpo-30340: Enhanced regular expressions optimization. This increased
+  the performance of matching some patterns up to 25 times.
+
 - bpo-30298: Weaken the condition of deprecation warnings for inline modifiers.
  Now allowed several subsequential inline modifiers at the start of the
  pattern (e.g. ``'(?i)(?s)...'``).  In verbose mode whitespaces and comments