bpo-30285: Optimize case-insensitive matching and searching of regular expressions (#1482)
This commit is contained in:
Serhiy Storchaka 2017-05-09 23:37:14 +03:00 committed by GitHub
parent f93234bb8a
commit 6d336a0279
6 changed files with 215 additions and 70 deletions

View File

@ -208,6 +208,10 @@ Optimizations
using the :func:`os.scandir` function. using the :func:`os.scandir` function.
(Contributed by Serhiy Storchaka in :issue:`25996`.) (Contributed by Serhiy Storchaka in :issue:`25996`.)
* Optimized case-insensitive matching and searching of :mod:`regular
expressions <re>`. Searching some patterns can now be up to 20 times faster.
(Contributed by Serhiy Storchaka in :issue:`30285`.)
Build and C API Changes Build and C API Changes
======================= =======================

View File

@ -69,13 +69,16 @@ def _compile(code, pattern, flags):
REPEATING_CODES = _REPEATING_CODES REPEATING_CODES = _REPEATING_CODES
SUCCESS_CODES = _SUCCESS_CODES SUCCESS_CODES = _SUCCESS_CODES
ASSERT_CODES = _ASSERT_CODES ASSERT_CODES = _ASSERT_CODES
iscased = None
tolower = None tolower = None
fixes = None fixes = None
if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE: if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII: if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
iscased = _sre.unicode_iscased
tolower = _sre.unicode_tolower tolower = _sre.unicode_tolower
fixes = _ignorecase_fixes fixes = _ignorecase_fixes
else: else:
iscased = _sre.ascii_iscased
tolower = _sre.ascii_tolower tolower = _sre.ascii_tolower
for op, av in pattern: for op, av in pattern:
if op in LITERAL_CODES: if op in LITERAL_CODES:
@ -85,6 +88,9 @@ def _compile(code, pattern, flags):
elif flags & SRE_FLAG_LOCALE: elif flags & SRE_FLAG_LOCALE:
emit(OP_LOC_IGNORE[op]) emit(OP_LOC_IGNORE[op])
emit(av) emit(av)
elif not iscased(av):
emit(op)
emit(av)
else: else:
lo = tolower(av) lo = tolower(av)
if fixes and lo in fixes: if fixes and lo in fixes:
@ -101,14 +107,15 @@ def _compile(code, pattern, flags):
emit(OP_IGNORE[op]) emit(OP_IGNORE[op])
emit(lo) emit(lo)
elif op is IN: elif op is IN:
if not flags & SRE_FLAG_IGNORECASE: charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
emit(op) if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
elif flags & SRE_FLAG_LOCALE:
emit(IN_LOC_IGNORE) emit(IN_LOC_IGNORE)
else: elif hascased:
emit(IN_IGNORE) emit(IN_IGNORE)
else:
emit(IN)
skip = _len(code); emit(0) skip = _len(code); emit(0)
_compile_charset(av, flags, code, tolower, fixes) _compile_charset(charset, flags, code)
code[skip] = _len(code) - skip code[skip] = _len(code) - skip
elif op is ANY: elif op is ANY:
if flags & SRE_FLAG_DOTALL: if flags & SRE_FLAG_DOTALL:
@ -223,10 +230,10 @@ def _compile(code, pattern, flags):
else: else:
raise error("internal: unsupported operand type %r" % (op,)) raise error("internal: unsupported operand type %r" % (op,))
def _compile_charset(charset, flags, code, fixup=None, fixes=None): def _compile_charset(charset, flags, code):
# compile charset subprogram # compile charset subprogram
emit = code.append emit = code.append
for op, av in _optimize_charset(charset, fixup, fixes): for op, av in charset:
emit(op) emit(op)
if op is NEGATE: if op is NEGATE:
pass pass
@ -250,11 +257,12 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
raise error("internal: unsupported set operator %r" % (op,)) raise error("internal: unsupported set operator %r" % (op,))
emit(FAILURE) emit(FAILURE)
def _optimize_charset(charset, fixup, fixes): def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
# internal: optimize character set # internal: optimize character set
out = [] out = []
tail = [] tail = []
charmap = bytearray(256) charmap = bytearray(256)
hascased = False
for op, av in charset: for op, av in charset:
while True: while True:
try: try:
@ -265,18 +273,24 @@ def _optimize_charset(charset, fixup, fixes):
if fixes and lo in fixes: if fixes and lo in fixes:
for k in fixes[lo]: for k in fixes[lo]:
charmap[k] = 1 charmap[k] = 1
if not hascased and iscased(av):
hascased = True
else: else:
charmap[av] = 1 charmap[av] = 1
elif op is RANGE: elif op is RANGE:
r = range(av[0], av[1]+1) r = range(av[0], av[1]+1)
if fixup: if fixup:
r = map(fixup, r) if fixes:
if fixup and fixes: for i in map(fixup, r):
for i in r: charmap[i] = 1
charmap[i] = 1 if i in fixes:
if i in fixes: for k in fixes[i]:
for k in fixes[i]: charmap[k] = 1
charmap[k] = 1 else:
for i in map(fixup, r):
charmap[i] = 1
if not hascased:
hascased = any(map(iscased, r))
else: else:
for i in r: for i in r:
charmap[i] = 1 charmap[i] = 1
@ -290,11 +304,13 @@ def _optimize_charset(charset, fixup, fixes):
charmap += b'\0' * 0xff00 charmap += b'\0' * 0xff00
continue continue
# Character set contains non-BMP character codes. # Character set contains non-BMP character codes.
# There are only two ranges of cased non-BMP characters: if fixup:
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), hascased = True
# and for both ranges RANGE_IGNORE works. # There are only two ranges of cased non-BMP characters:
if fixup and op is RANGE: # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
op = RANGE_IGNORE # and for both ranges RANGE_IGNORE works.
if op is RANGE:
op = RANGE_IGNORE
tail.append((op, av)) tail.append((op, av))
break break
@ -322,17 +338,17 @@ def _optimize_charset(charset, fixup, fixes):
out.append((RANGE, (p, q - 1))) out.append((RANGE, (p, q - 1)))
out += tail out += tail
# if the case was changed or new representation is more compact # if the case was changed or new representation is more compact
if fixup or len(out) < len(charset): if hascased or len(out) < len(charset):
return out return out, hascased
# else original character set is good enough # else original character set is good enough
return charset return charset, hascased
# use bitmap # use bitmap
if len(charmap) == 256: if len(charmap) == 256:
data = _mk_bitmap(charmap) data = _mk_bitmap(charmap)
out.append((CHARSET, data)) out.append((CHARSET, data))
out += tail out += tail
return out return out, hascased
# To represent a big charset, first a bitmap of all characters in the # To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256 # set is constructed. Then, this bitmap is sliced into chunks of 256
@ -371,7 +387,7 @@ def _optimize_charset(charset, fixup, fixes):
data[0:0] = [block] + _bytes_to_codes(mapping) data[0:0] = [block] + _bytes_to_codes(mapping)
out.append((BIGCHARSET, data)) out.append((BIGCHARSET, data))
out += tail out += tail
return out return out, hascased
_CODEBITS = _sre.CODESIZE * 8 _CODEBITS = _sre.CODESIZE * 8
MAXCODE = (1 << _CODEBITS) - 1 MAXCODE = (1 << _CODEBITS) - 1
@ -414,19 +430,31 @@ def _generate_overlap_table(prefix):
table[i] = idx + 1 table[i] = idx + 1
return table return table
def _get_literal_prefix(pattern): def _get_iscased(flags):
if not flags & SRE_FLAG_IGNORECASE:
return None
elif flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
return _sre.unicode_iscased
else:
return _sre.ascii_iscased
def _get_literal_prefix(pattern, flags):
# look for literal prefix # look for literal prefix
prefix = [] prefix = []
prefixappend = prefix.append prefixappend = prefix.append
prefix_skip = None prefix_skip = None
iscased = _get_iscased(flags)
for op, av in pattern.data: for op, av in pattern.data:
if op is LITERAL: if op is LITERAL:
if iscased and iscased(av):
break
prefixappend(av) prefixappend(av)
elif op is SUBPATTERN: elif op is SUBPATTERN:
group, add_flags, del_flags, p = av group, add_flags, del_flags, p = av
if add_flags & SRE_FLAG_IGNORECASE: flags1 = (flags | add_flags) & ~del_flags
if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
break break
prefix1, prefix_skip1, got_all = _get_literal_prefix(p) prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
if prefix_skip is None: if prefix_skip is None:
if group is not None: if group is not None:
prefix_skip = len(prefix) prefix_skip = len(prefix)
@ -441,46 +469,49 @@ def _get_literal_prefix(pattern):
return prefix, prefix_skip, True return prefix, prefix_skip, True
return prefix, prefix_skip, False return prefix, prefix_skip, False
def _get_charset_prefix(pattern): def _get_charset_prefix(pattern, flags):
charset = [] # not used while True:
charsetappend = charset.append if not pattern.data:
if pattern.data: return None
op, av = pattern.data[0] op, av = pattern.data[0]
if op is SUBPATTERN: if op is not SUBPATTERN:
group, add_flags, del_flags, p = av break
if p and not (add_flags & SRE_FLAG_IGNORECASE): group, add_flags, del_flags, pattern = av
op, av = p[0] flags = (flags | add_flags) & ~del_flags
if op is LITERAL: if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
charsetappend((op, av)) return None
elif op is BRANCH:
c = [] iscased = _get_iscased(flags)
cappend = c.append if op is LITERAL:
for p in av[1]: if iscased and iscased(av):
if not p: return None
break return [(op, av)]
op, av = p[0] elif op is BRANCH:
if op is LITERAL: charset = []
cappend((op, av)) charsetappend = charset.append
else: for p in av[1]:
break if not p:
else: return None
charset = c op, av = p[0]
elif op is BRANCH: if op is LITERAL and not (iscased and iscased(av)):
c = [] charsetappend((op, av))
cappend = c.append
for p in av[1]:
if not p:
break
op, av = p[0]
if op is LITERAL:
cappend((op, av))
else:
break
else: else:
charset = c return None
elif op is IN: return charset
charset = av elif op is IN:
return charset charset = av
if iscased:
for op, av in charset:
if op is LITERAL:
if iscased(av):
return None
elif op is RANGE:
if av[1] > 0xffff:
return None
if any(map(iscased, range(av[0], av[1]+1))):
return None
return charset
return None
def _compile_info(code, pattern, flags): def _compile_info(code, pattern, flags):
# internal: compile an info block. in the current version, # internal: compile an info block. in the current version,
@ -496,12 +527,12 @@ def _compile_info(code, pattern, flags):
prefix = [] prefix = []
prefix_skip = 0 prefix_skip = 0
charset = [] # not used charset = [] # not used
if not (flags & SRE_FLAG_IGNORECASE): if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
# look for literal prefix # look for literal prefix
prefix, prefix_skip, got_all = _get_literal_prefix(pattern) prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
# if no prefix, look for charset prefix # if no prefix, look for charset prefix
if not prefix: if not prefix:
charset = _get_charset_prefix(pattern) charset = _get_charset_prefix(pattern, flags)
## if prefix: ## if prefix:
## print("*** PREFIX", prefix, prefix_skip) ## print("*** PREFIX", prefix, prefix_skip)
## if charset: ## if charset:
@ -536,6 +567,8 @@ def _compile_info(code, pattern, flags):
# generate overlap table # generate overlap table
code.extend(_generate_overlap_table(prefix)) code.extend(_generate_overlap_table(prefix))
elif charset: elif charset:
charset, hascased = _optimize_charset(charset)
assert not hascased
_compile_charset(charset, flags, code) _compile_charset(charset, flags, code)
code[skip] = len(code) - skip code[skip] = len(code) - skip

View File

@ -891,15 +891,24 @@ class ReTests(unittest.TestCase):
lo = ord(c.lower()) lo = ord(c.lower())
self.assertEqual(_sre.ascii_tolower(i), lo) self.assertEqual(_sre.ascii_tolower(i), lo)
self.assertEqual(_sre.unicode_tolower(i), lo) self.assertEqual(_sre.unicode_tolower(i), lo)
iscased = c in string.ascii_letters
self.assertEqual(_sre.ascii_iscased(i), iscased)
self.assertEqual(_sre.unicode_iscased(i), iscased)
for i in list(range(128, 0x1000)) + [0x10400, 0x10428]: for i in list(range(128, 0x1000)) + [0x10400, 0x10428]:
c = chr(i) c = chr(i)
self.assertEqual(_sre.ascii_tolower(i), i) self.assertEqual(_sre.ascii_tolower(i), i)
if i != 0x0130: if i != 0x0130:
self.assertEqual(_sre.unicode_tolower(i), ord(c.lower())) self.assertEqual(_sre.unicode_tolower(i), ord(c.lower()))
iscased = c != c.lower() or c != c.upper()
self.assertFalse(_sre.ascii_iscased(i))
self.assertEqual(_sre.unicode_iscased(i),
c != c.lower() or c != c.upper())
self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130) self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130)
self.assertEqual(_sre.unicode_tolower(0x0130), ord('i')) self.assertEqual(_sre.unicode_tolower(0x0130), ord('i'))
self.assertFalse(_sre.ascii_iscased(0x0130))
self.assertTrue(_sre.unicode_iscased(0x0130))
def test_not_literal(self): def test_not_literal(self):
self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b") self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")

View File

@ -320,6 +320,9 @@ Extension Modules
Library Library
------- -------
- bpo-30285: Optimized case-insensitive matching and searching of regular
expressions.
- bpo-29990: Fix range checking in GB18030 decoder. Original patch by Ma Lin. - bpo-29990: Fix range checking in GB18030 decoder. Original patch by Ma Lin.
- bpo-29979: rewrite cgi.parse_multipart, reusing the FieldStorage class and - bpo-29979: rewrite cgi.parse_multipart, reusing the FieldStorage class and

View File

@ -273,6 +273,38 @@ _sre_getcodesize_impl(PyObject *module)
return sizeof(SRE_CODE); return sizeof(SRE_CODE);
} }
/*[clinic input]
_sre.ascii_iscased -> bool
character: int
/
[clinic start generated code]*/
static int
_sre_ascii_iscased_impl(PyObject *module, int character)
/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
{
    /* Report whether `character` differs from its sre_lower() or its
       sre_upper() mapping; either difference gives a nonzero result. */
    const unsigned int code = (unsigned int)character;

    if (code != sre_lower(code)) {
        return 1;
    }
    return code != sre_upper(code);
}
/*[clinic input]
_sre.unicode_iscased -> bool
character: int
/
[clinic start generated code]*/
static int
_sre_unicode_iscased_impl(PyObject *module, int character)
/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
{
    /* Report whether `character` differs from its sre_lower_unicode() or
       its sre_upper_unicode() mapping; either difference gives a nonzero
       result. */
    const unsigned int code = (unsigned int)character;

    if (code != sre_lower_unicode(code)) {
        return 1;
    }
    return code != sre_upper_unicode(code);
}
/*[clinic input] /*[clinic input]
_sre.ascii_tolower -> int _sre.ascii_tolower -> int
@ -2750,6 +2782,8 @@ static PyTypeObject Scanner_Type = {
static PyMethodDef _functions[] = { static PyMethodDef _functions[] = {
_SRE_COMPILE_METHODDEF _SRE_COMPILE_METHODDEF
_SRE_GETCODESIZE_METHODDEF _SRE_GETCODESIZE_METHODDEF
_SRE_ASCII_ISCASED_METHODDEF
_SRE_UNICODE_ISCASED_METHODDEF
_SRE_ASCII_TOLOWER_METHODDEF _SRE_ASCII_TOLOWER_METHODDEF
_SRE_UNICODE_TOLOWER_METHODDEF _SRE_UNICODE_TOLOWER_METHODDEF
{NULL, NULL} {NULL, NULL}

View File

@ -29,6 +29,68 @@ exit:
return return_value; return return_value;
} }
PyDoc_STRVAR(_sre_ascii_iscased__doc__,
"ascii_iscased($module, character, /)\n"
"--\n"
"\n");
#define _SRE_ASCII_ISCASED_METHODDEF \
{"ascii_iscased", (PyCFunction)_sre_ascii_iscased, METH_O, _sre_ascii_iscased__doc__},
static int
_sre_ascii_iscased_impl(PyObject *module, int character);
static PyObject *
_sre_ascii_iscased(PyObject *module, PyObject *arg)
{
PyObject *return_value = NULL;
int character;
int _return_value;
if (!PyArg_Parse(arg, "i:ascii_iscased", &character)) {
goto exit;
}
_return_value = _sre_ascii_iscased_impl(module, character);
if ((_return_value == -1) && PyErr_Occurred()) {
goto exit;
}
return_value = PyBool_FromLong((long)_return_value);
exit:
return return_value;
}
PyDoc_STRVAR(_sre_unicode_iscased__doc__,
"unicode_iscased($module, character, /)\n"
"--\n"
"\n");
#define _SRE_UNICODE_ISCASED_METHODDEF \
{"unicode_iscased", (PyCFunction)_sre_unicode_iscased, METH_O, _sre_unicode_iscased__doc__},
static int
_sre_unicode_iscased_impl(PyObject *module, int character);
static PyObject *
_sre_unicode_iscased(PyObject *module, PyObject *arg)
{
PyObject *return_value = NULL;
int character;
int _return_value;
if (!PyArg_Parse(arg, "i:unicode_iscased", &character)) {
goto exit;
}
_return_value = _sre_unicode_iscased_impl(module, character);
if ((_return_value == -1) && PyErr_Occurred()) {
goto exit;
}
return_value = PyBool_FromLong((long)_return_value);
exit:
return return_value;
}
PyDoc_STRVAR(_sre_ascii_tolower__doc__, PyDoc_STRVAR(_sre_ascii_tolower__doc__,
"ascii_tolower($module, character, /)\n" "ascii_tolower($module, character, /)\n"
"--\n" "--\n"
@ -715,4 +777,4 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyObject *Py_UNUSED(ignored))
{ {
return _sre_SRE_Scanner_search_impl(self); return _sre_SRE_Scanner_search_impl(self);
} }
/*[clinic end generated code: output=811e67d7f8f5052e input=a9049054013a1b77]*/ /*[clinic end generated code: output=5fe47c49e475cccb input=a9049054013a1b77]*/