Issue #19380: Optimized parsing of regular expressions.

This commit is contained in:
Serhiy Storchaka 2014-10-10 11:14:49 +03:00
parent 5aa47443c6
commit e2ccf5608c
2 changed files with 122 additions and 150 deletions

View File

@ -18,12 +18,15 @@ from _sre import MAXREPEAT
SPECIAL_CHARS = ".\\[{()*+?^$|" SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{" REPEAT_CHARS = "*+?{"
DIGITS = set("0123456789") DIGITS = frozenset("0123456789")
OCTDIGITS = set("01234567") OCTDIGITS = frozenset("01234567")
HEXDIGITS = set("0123456789abcdefABCDEF") HEXDIGITS = frozenset("0123456789abcdefABCDEF")
WHITESPACE = set(" \t\n\r\v\f") WHITESPACE = frozenset(" \t\n\r\v\f")
_REPEATCODES = frozenset((MIN_REPEAT, MAX_REPEAT))
_UNITCODES = frozenset((ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY))
ESCAPES = { ESCAPES = {
r"\a": (LITERAL, ord("\a")), r"\a": (LITERAL, ord("\a")),
@ -153,11 +156,9 @@ class SubPattern:
self.data.append(code) self.data.append(code)
def getwidth(self): def getwidth(self):
# determine the width (min, max) for this subpattern # determine the width (min, max) for this subpattern
if self.width: if self.width is not None:
return self.width return self.width
lo = hi = 0 lo = hi = 0
UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
for op, av in self.data: for op, av in self.data:
if op is BRANCH: if op is BRANCH:
i = MAXREPEAT - 1 i = MAXREPEAT - 1
@ -176,11 +177,11 @@ class SubPattern:
i, j = av[1].getwidth() i, j = av[1].getwidth()
lo = lo + i lo = lo + i
hi = hi + j hi = hi + j
elif op in REPEATCODES: elif op in _REPEATCODES:
i, j = av[2].getwidth() i, j = av[2].getwidth()
lo = lo + i * av[0] lo = lo + i * av[0]
hi = hi + j * av[1] hi = hi + j * av[1]
elif op in UNITCODES: elif op in _UNITCODES:
lo = lo + 1 lo = lo + 1
hi = hi + 1 hi = hi + 1
elif op == SUCCESS: elif op == SUCCESS:
@ -191,34 +192,31 @@ class SubPattern:
class Tokenizer: class Tokenizer:
def __init__(self, string): def __init__(self, string):
self.istext = isinstance(string, str) self.istext = isinstance(string, str)
if not self.istext:
string = str(string, 'latin1')
self.string = string self.string = string
self.index = 0 self.index = 0
self.__next() self.__next()
def __next(self): def __next(self):
if self.index >= len(self.string): index = self.index
try:
char = self.string[index]
except IndexError:
self.next = None self.next = None
return return
char = self.string[self.index:self.index+1]
# Special case for the str8, since indexing returns a integer
# XXX This is only needed for test_bug_926075 in test_re.py
if char and not self.istext:
char = chr(char[0])
if char == "\\": if char == "\\":
index += 1
try: try:
c = self.string[self.index + 1] char += self.string[index]
except IndexError: except IndexError:
raise error("bogus escape (end of line)") raise error("bogus escape (end of line)")
if not self.istext: self.index = index + 1
c = chr(c)
char = char + c
self.index = self.index + len(char)
self.next = char self.next = char
def match(self, char, skip=1): def match(self, char):
if char == self.next: if char == self.next:
if skip: self.__next()
self.__next() return True
return 1 return False
return 0
def get(self): def get(self):
this = self.next this = self.next
self.__next() self.__next()
@ -232,6 +230,17 @@ class Tokenizer:
result += c result += c
self.__next() self.__next()
return result return result
def getuntil(self, terminator):
result = ''
while True:
c = self.next
self.__next()
if c is None:
raise error("unterminated name")
if c == terminator:
break
result += c
return result
def tell(self): def tell(self):
return self.index, self.next return self.index, self.next
def seek(self, index): def seek(self, index):
@ -270,7 +279,7 @@ def _class_escape(source, escape):
if code: if code:
return code return code
code = CATEGORIES.get(escape) code = CATEGORIES.get(escape)
if code and code[0] == IN: if code and code[0] is IN:
return code return code
try: try:
c = escape[1:2] c = escape[1:2]
@ -279,7 +288,7 @@ def _class_escape(source, escape):
escape += source.getwhile(2, HEXDIGITS) escape += source.getwhile(2, HEXDIGITS)
if len(escape) != 4: if len(escape) != 4:
raise ValueError raise ValueError
return LITERAL, int(escape[2:], 16) & 0xff return LITERAL, int(escape[2:], 16)
elif c == "u" and source.istext: elif c == "u" and source.istext:
# unicode escape (exactly four digits) # unicode escape (exactly four digits)
escape += source.getwhile(4, HEXDIGITS) escape += source.getwhile(4, HEXDIGITS)
@ -325,7 +334,7 @@ def _escape(source, escape, state):
escape += source.getwhile(2, HEXDIGITS) escape += source.getwhile(2, HEXDIGITS)
if len(escape) != 4: if len(escape) != 4:
raise ValueError raise ValueError
return LITERAL, int(escape[2:], 16) & 0xff return LITERAL, int(escape[2:], 16)
elif c == "u" and source.istext: elif c == "u" and source.istext:
# unicode escape (exactly four digits) # unicode escape (exactly four digits)
escape += source.getwhile(4, HEXDIGITS) escape += source.getwhile(4, HEXDIGITS)
@ -347,11 +356,11 @@ def _escape(source, escape, state):
elif c in DIGITS: elif c in DIGITS:
# octal escape *or* decimal group reference (sigh) # octal escape *or* decimal group reference (sigh)
if source.next in DIGITS: if source.next in DIGITS:
escape = escape + source.get() escape += source.get()
if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
source.next in OCTDIGITS): source.next in OCTDIGITS):
# got three octal digits; this is an octal escape # got three octal digits; this is an octal escape
escape = escape + source.get() escape += source.get()
c = int(escape[1:], 8) c = int(escape[1:], 8)
if c > 0o377: if c > 0o377:
raise error('octal escape value %r outside of ' raise error('octal escape value %r outside of '
@ -370,22 +379,18 @@ def _escape(source, escape, state):
pass pass
raise error("bogus escape: %s" % repr(escape)) raise error("bogus escape: %s" % repr(escape))
def _parse_sub(source, state, nested=1): def _parse_sub(source, state, nested=True):
# parse an alternation: a|b|c # parse an alternation: a|b|c
items = [] items = []
itemsappend = items.append itemsappend = items.append
sourcematch = source.match sourcematch = source.match
while 1: while True:
itemsappend(_parse(source, state)) itemsappend(_parse(source, state))
if sourcematch("|"): if not sourcematch("|"):
continue
if not nested:
break break
if not source.next or sourcematch(")", 0): if nested and source.next is not None and source.next != ")":
break raise error("pattern not properly closed")
else:
raise error("pattern not properly closed")
if len(items) == 1: if len(items) == 1:
return items[0] return items[0]
@ -394,7 +399,7 @@ def _parse_sub(source, state, nested=1):
subpatternappend = subpattern.append subpatternappend = subpattern.append
# check if all items share a common prefix # check if all items share a common prefix
while 1: while True:
prefix = None prefix = None
for item in items: for item in items:
if not item: if not item:
@ -414,16 +419,12 @@ def _parse_sub(source, state, nested=1):
# check if the branch can be replaced by a character set # check if the branch can be replaced by a character set
for item in items: for item in items:
if len(item) != 1 or item[0][0] != LITERAL: if len(item) != 1 or item[0][0] is not LITERAL:
break break
else: else:
# we can store this as a character set instead of a # we can store this as a character set instead of a
# branch (the compiler may optimize this even more) # branch (the compiler may optimize this even more)
set = [] subpatternappend((IN, [item[0] for item in items]))
setappend = set.append
for item in items:
setappend(item[0])
subpatternappend((IN, set))
return subpattern return subpattern
subpattern.append((BRANCH, (None, items))) subpattern.append((BRANCH, (None, items)))
@ -433,21 +434,16 @@ def _parse_sub_cond(source, state, condgroup):
item_yes = _parse(source, state) item_yes = _parse(source, state)
if source.match("|"): if source.match("|"):
item_no = _parse(source, state) item_no = _parse(source, state)
if source.match("|"): if source.next == "|":
raise error("conditional backref with more than two branches") raise error("conditional backref with more than two branches")
else: else:
item_no = None item_no = None
if source.next and not source.match(")", 0): if source.next is not None and source.next != ")":
raise error("pattern not properly closed") raise error("pattern not properly closed")
subpattern = SubPattern(state) subpattern = SubPattern(state)
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
return subpattern return subpattern
_PATTERNENDERS = set("|)")
_ASSERTCHARS = set("=!<")
_LOOKBEHINDASSERTCHARS = set("=!")
_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])
def _parse(source, state): def _parse(source, state):
# parse a simple pattern # parse a simple pattern
subpattern = SubPattern(state) subpattern = SubPattern(state)
@ -457,32 +453,35 @@ def _parse(source, state):
sourceget = source.get sourceget = source.get
sourcematch = source.match sourcematch = source.match
_len = len _len = len
PATTERNENDERS = _PATTERNENDERS _ord = ord
ASSERTCHARS = _ASSERTCHARS verbose = state.flags & SRE_FLAG_VERBOSE
LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
REPEATCODES = _REPEATCODES
while 1: while True:
if source.next in PATTERNENDERS: this = source.next
break # end of subpattern
this = sourceget()
if this is None: if this is None:
break # end of pattern break # end of pattern
if this in "|)":
break # end of subpattern
sourceget()
if state.flags & SRE_FLAG_VERBOSE: if verbose:
# skip whitespace and comments # skip whitespace and comments
if this in WHITESPACE: if this in WHITESPACE:
continue continue
if this == "#": if this == "#":
while 1: while True:
this = sourceget() this = sourceget()
if this in (None, "\n"): if this is None or this == "\n":
break break
continue continue
if this and this[0] not in SPECIAL_CHARS: if this[0] == "\\":
subpatternappend((LITERAL, ord(this))) code = _escape(source, this, state)
subpatternappend(code)
elif this not in SPECIAL_CHARS:
subpatternappend((LITERAL, _ord(this)))
elif this == "[": elif this == "[":
# character set # character set
@ -494,39 +493,38 @@ def _parse(source, state):
setappend((NEGATE, None)) setappend((NEGATE, None))
# check remaining characters # check remaining characters
start = set[:] start = set[:]
while 1: while True:
this = sourceget() this = sourceget()
if this is None:
raise error("unexpected end of regular expression")
if this == "]" and set != start: if this == "]" and set != start:
break break
elif this and this[0] == "\\": elif this[0] == "\\":
code1 = _class_escape(source, this) code1 = _class_escape(source, this)
elif this:
code1 = LITERAL, ord(this)
else: else:
raise error("unexpected end of regular expression") code1 = LITERAL, _ord(this)
if sourcematch("-"): if sourcematch("-"):
# potential range # potential range
this = sourceget() this = sourceget()
if this is None:
raise error("unexpected end of regular expression")
if this == "]": if this == "]":
if code1[0] is IN: if code1[0] is IN:
code1 = code1[1][0] code1 = code1[1][0]
setappend(code1) setappend(code1)
setappend((LITERAL, ord("-"))) setappend((LITERAL, _ord("-")))
break break
elif this: if this[0] == "\\":
if this[0] == "\\": code2 = _class_escape(source, this)
code2 = _class_escape(source, this)
else:
code2 = LITERAL, ord(this)
if code1[0] != LITERAL or code2[0] != LITERAL:
raise error("bad character range")
lo = code1[1]
hi = code2[1]
if hi < lo:
raise error("bad character range")
setappend((RANGE, (lo, hi)))
else: else:
raise error("unexpected end of regular expression") code2 = LITERAL, _ord(this)
if code1[0] != LITERAL or code2[0] != LITERAL:
raise error("bad character range")
lo = code1[1]
hi = code2[1]
if hi < lo:
raise error("bad character range")
setappend((RANGE, (lo, hi)))
else: else:
if code1[0] is IN: if code1[0] is IN:
code1 = code1[1][0] code1 = code1[1][0]
@ -541,7 +539,7 @@ def _parse(source, state):
# XXX: <fl> should add charmap optimization here # XXX: <fl> should add charmap optimization here
subpatternappend((IN, set)) subpatternappend((IN, set))
elif this and this[0] in REPEAT_CHARS: elif this in REPEAT_CHARS:
# repeat previous item # repeat previous item
if this == "?": if this == "?":
min, max = 0, 1 min, max = 0, 1
@ -552,20 +550,20 @@ def _parse(source, state):
min, max = 1, MAXREPEAT min, max = 1, MAXREPEAT
elif this == "{": elif this == "{":
if source.next == "}": if source.next == "}":
subpatternappend((LITERAL, ord(this))) subpatternappend((LITERAL, _ord(this)))
continue continue
here = source.tell() here = source.tell()
min, max = 0, MAXREPEAT min, max = 0, MAXREPEAT
lo = hi = "" lo = hi = ""
while source.next in DIGITS: while source.next in DIGITS:
lo = lo + source.get() lo += sourceget()
if sourcematch(","): if sourcematch(","):
while source.next in DIGITS: while source.next in DIGITS:
hi = hi + sourceget() hi += sourceget()
else: else:
hi = lo hi = lo
if not sourcematch("}"): if not sourcematch("}"):
subpatternappend((LITERAL, ord(this))) subpatternappend((LITERAL, _ord(this)))
source.seek(here) source.seek(here)
continue continue
if lo: if lo:
@ -587,7 +585,7 @@ def _parse(source, state):
item = None item = None
if not item or (_len(item) == 1 and item[0][0] == AT): if not item or (_len(item) == 1 and item[0][0] == AT):
raise error("nothing to repeat") raise error("nothing to repeat")
if item[0][0] in REPEATCODES: if item[0][0] in _REPEATCODES:
raise error("multiple repeat") raise error("multiple repeat")
if sourcematch("?"): if sourcematch("?"):
subpattern[-1] = (MIN_REPEAT, (min, max, item)) subpattern[-1] = (MIN_REPEAT, (min, max, item))
@ -604,18 +602,14 @@ def _parse(source, state):
if sourcematch("?"): if sourcematch("?"):
group = 0 group = 0
# options # options
if sourcematch("P"): char = sourceget()
if char is None:
raise error("unexpected end of pattern")
if char == "P":
# python extensions # python extensions
if sourcematch("<"): if sourcematch("<"):
# named group: skip forward to end of name # named group: skip forward to end of name
name = "" name = source.getuntil(">")
while 1:
char = sourceget()
if char is None:
raise error("unterminated name")
if char == ">":
break
name = name + char
group = 1 group = 1
if not name: if not name:
raise error("missing group name") raise error("missing group name")
@ -623,14 +617,7 @@ def _parse(source, state):
raise error("bad character in group name %r" % name) raise error("bad character in group name %r" % name)
elif sourcematch("="): elif sourcematch("="):
# named backreference # named backreference
name = "" name = source.getuntil(")")
while 1:
char = sourceget()
if char is None:
raise error("unterminated name")
if char == ")":
break
name = name + char
if not name: if not name:
raise error("missing group name") raise error("missing group name")
if not name.isidentifier(): if not name.isidentifier():
@ -647,27 +634,25 @@ def _parse(source, state):
if char is None: if char is None:
raise error("unexpected end of pattern") raise error("unexpected end of pattern")
raise error("unknown specifier: ?P%s" % char) raise error("unknown specifier: ?P%s" % char)
elif sourcematch(":"): elif char == ":":
# non-capturing group # non-capturing group
group = 2 group = 2
elif sourcematch("#"): elif char == "#":
# comment # comment
while 1: while True:
if source.next is None or source.next == ")": if source.next is None:
raise error("unbalanced parenthesis")
if sourceget() == ")":
break break
sourceget()
if not sourcematch(")"):
raise error("unbalanced parenthesis")
continue continue
elif source.next in ASSERTCHARS: elif char in "=!<":
# lookahead assertions # lookahead assertions
char = sourceget()
dir = 1 dir = 1
if char == "<": if char == "<":
if source.next not in LOOKBEHINDASSERTCHARS: char = sourceget()
if char is None or char not in "=!":
raise error("syntax error") raise error("syntax error")
dir = -1 # lookbehind dir = -1 # lookbehind
char = sourceget()
p = _parse_sub(source, state) p = _parse_sub(source, state)
if not sourcematch(")"): if not sourcematch(")"):
raise error("unbalanced parenthesis") raise error("unbalanced parenthesis")
@ -676,16 +661,9 @@ def _parse(source, state):
else: else:
subpatternappend((ASSERT_NOT, (dir, p))) subpatternappend((ASSERT_NOT, (dir, p)))
continue continue
elif sourcematch("("): elif char == "(":
# conditional backreference group # conditional backreference group
condname = "" condname = source.getuntil(")")
while 1:
char = sourceget()
if char is None:
raise error("unterminated name")
if char == ")":
break
condname = condname + char
group = 2 group = 2
if not condname: if not condname:
raise error("missing group name") raise error("missing group name")
@ -705,12 +683,14 @@ def _parse(source, state):
raise error("bad group number") raise error("bad group number")
if condgroup >= MAXGROUPS: if condgroup >= MAXGROUPS:
raise error("the group number is too large") raise error("the group number is too large")
else: elif char in FLAGS:
# flags # flags
if not source.next in FLAGS: state.flags |= FLAGS[char]
raise error("unexpected end of pattern")
while source.next in FLAGS: while source.next in FLAGS:
state.flags = state.flags | FLAGS[sourceget()] state.flags |= FLAGS[sourceget()]
verbose = state.flags & SRE_FLAG_VERBOSE
else:
raise error("unexpected end of pattern " + char)
if group: if group:
# parse group contents # parse group contents
if group == 2: if group == 2:
@ -728,7 +708,7 @@ def _parse(source, state):
state.closegroup(group) state.closegroup(group)
subpatternappend((SUBPATTERN, (group, p))) subpatternappend((SUBPATTERN, (group, p)))
else: else:
while 1: while True:
char = sourceget() char = sourceget()
if char is None: if char is None:
raise error("unexpected end of pattern") raise error("unexpected end of pattern")
@ -742,10 +722,6 @@ def _parse(source, state):
elif this == "$": elif this == "$":
subpattern.append((AT, AT_END)) subpattern.append((AT, AT_END))
elif this and this[0] == "\\":
code = _escape(source, this, state)
subpatternappend(code)
else: else:
raise error("parser error") raise error("parser error")
@ -776,11 +752,11 @@ def parse(str, flags=0, pattern=None):
p = _parse_sub(source, pattern, 0) p = _parse_sub(source, pattern, 0)
p.pattern.flags = fix_flags(str, p.pattern.flags) p.pattern.flags = fix_flags(str, p.pattern.flags)
tail = source.get() if source.next is not None:
if tail == ")": if source.next == ")":
raise error("unbalanced parenthesis") raise error("unbalanced parenthesis")
elif tail: else:
raise error("bogus characters at end of regular expression") raise error("bogus characters at end of regular expression")
if flags & SRE_FLAG_DEBUG: if flags & SRE_FLAG_DEBUG:
p.dump() p.dump()
@ -817,13 +793,7 @@ def parse_template(source, pattern):
if c == "g": if c == "g":
name = "" name = ""
if s.match("<"): if s.match("<"):
while True: name = s.getuntil(">")
char = sget()
if char is None:
raise error("unterminated group name")
if char == ">":
break
name += char
if not name: if not name:
raise error("missing group name") raise error("missing group name")
try: try:

View File

@ -166,7 +166,9 @@ Core and Builtins
Library Library
------- -------
- Issue 1519638: Now unmatched groups are replaced with empty strings in re.sub() - Issue #19380: Optimized parsing of regular expressions.
- Issue #1519638: Now unmatched groups are replaced with empty strings in re.sub()
and re.subn(). and re.subn().
- Issue #18615: sndhdr.what/whathdr now return a namedtuple. - Issue #18615: sndhdr.what/whathdr now return a namedtuple.