mirror of https://github.com/python/cpython
gh-111259: Optimize complementary character sets in RE (GH-120742)
Patterns like "[\s\S]" or "\s|\S" which match any character are now compiled to the same effective code as a dot with the DOTALL modifier ("(?s:.)").
This commit is contained in:
parent
3846fcfb92
commit
8bc76ae45f
|
@ -28,6 +28,8 @@ _REPEATING_CODES = {
|
||||||
POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
|
POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_CHARSET_ALL = [(NEGATE, None)]
|
||||||
|
|
||||||
def _combine_flags(flags, add_flags, del_flags,
|
def _combine_flags(flags, add_flags, del_flags,
|
||||||
TYPE_FLAGS=_parser.TYPE_FLAGS):
|
TYPE_FLAGS=_parser.TYPE_FLAGS):
|
||||||
if add_flags & TYPE_FLAGS:
|
if add_flags & TYPE_FLAGS:
|
||||||
|
@ -84,17 +86,22 @@ def _compile(code, pattern, flags):
|
||||||
code[skip] = _len(code) - skip
|
code[skip] = _len(code) - skip
|
||||||
elif op is IN:
|
elif op is IN:
|
||||||
charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
|
charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
|
||||||
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
|
if not charset:
|
||||||
emit(IN_LOC_IGNORE)
|
emit(FAILURE)
|
||||||
elif not hascased:
|
elif charset == _CHARSET_ALL:
|
||||||
emit(IN)
|
emit(ANY_ALL)
|
||||||
elif not fixes: # ascii
|
|
||||||
emit(IN_IGNORE)
|
|
||||||
else:
|
else:
|
||||||
emit(IN_UNI_IGNORE)
|
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
|
||||||
skip = _len(code); emit(0)
|
emit(IN_LOC_IGNORE)
|
||||||
_compile_charset(charset, flags, code)
|
elif not hascased:
|
||||||
code[skip] = _len(code) - skip
|
emit(IN)
|
||||||
|
elif not fixes: # ascii
|
||||||
|
emit(IN_IGNORE)
|
||||||
|
else:
|
||||||
|
emit(IN_UNI_IGNORE)
|
||||||
|
skip = _len(code); emit(0)
|
||||||
|
_compile_charset(charset, flags, code)
|
||||||
|
code[skip] = _len(code) - skip
|
||||||
elif op is ANY:
|
elif op is ANY:
|
||||||
if flags & SRE_FLAG_DOTALL:
|
if flags & SRE_FLAG_DOTALL:
|
||||||
emit(ANY_ALL)
|
emit(ANY_ALL)
|
||||||
|
@ -277,6 +284,10 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
|
||||||
charmap[i] = 1
|
charmap[i] = 1
|
||||||
elif op is NEGATE:
|
elif op is NEGATE:
|
||||||
out.append((op, av))
|
out.append((op, av))
|
||||||
|
elif op is CATEGORY and tail and (CATEGORY, CH_NEGATE[av]) in tail:
|
||||||
|
# Optimize [\s\S] etc.
|
||||||
|
out = [] if out else _CHARSET_ALL
|
||||||
|
return out, False
|
||||||
else:
|
else:
|
||||||
tail.append((op, av))
|
tail.append((op, av))
|
||||||
except IndexError:
|
except IndexError:
|
||||||
|
@ -519,13 +530,18 @@ def _compile_info(code, pattern, flags):
|
||||||
# look for a literal prefix
|
# look for a literal prefix
|
||||||
prefix = []
|
prefix = []
|
||||||
prefix_skip = 0
|
prefix_skip = 0
|
||||||
charset = [] # not used
|
charset = None # not used
|
||||||
if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
|
if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
|
||||||
# look for literal prefix
|
# look for literal prefix
|
||||||
prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
|
prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
|
||||||
# if no prefix, look for charset prefix
|
# if no prefix, look for charset prefix
|
||||||
if not prefix:
|
if not prefix:
|
||||||
charset = _get_charset_prefix(pattern, flags)
|
charset = _get_charset_prefix(pattern, flags)
|
||||||
|
if charset:
|
||||||
|
charset, hascased = _optimize_charset(charset)
|
||||||
|
assert not hascased
|
||||||
|
if charset == _CHARSET_ALL:
|
||||||
|
charset = None
|
||||||
## if prefix:
|
## if prefix:
|
||||||
## print("*** PREFIX", prefix, prefix_skip)
|
## print("*** PREFIX", prefix, prefix_skip)
|
||||||
## if charset:
|
## if charset:
|
||||||
|
@ -560,8 +576,6 @@ def _compile_info(code, pattern, flags):
|
||||||
# generate overlap table
|
# generate overlap table
|
||||||
code.extend(_generate_overlap_table(prefix))
|
code.extend(_generate_overlap_table(prefix))
|
||||||
elif charset:
|
elif charset:
|
||||||
charset, hascased = _optimize_charset(charset)
|
|
||||||
assert not hascased
|
|
||||||
_compile_charset(charset, flags, code)
|
_compile_charset(charset, flags, code)
|
||||||
code[skip] = len(code) - skip
|
code[skip] = len(code) - skip
|
||||||
|
|
||||||
|
|
|
@ -206,6 +206,8 @@ CH_UNICODE = {
|
||||||
CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK
|
CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK
|
||||||
}
|
}
|
||||||
|
|
||||||
|
CH_NEGATE = dict(zip(CHCODES[::2] + CHCODES[1::2], CHCODES[1::2] + CHCODES[::2]))
|
||||||
|
|
||||||
# flags
|
# flags
|
||||||
SRE_FLAG_IGNORECASE = 2 # case insensitive
|
SRE_FLAG_IGNORECASE = 2 # case insensitive
|
||||||
SRE_FLAG_LOCALE = 4 # honour system locale
|
SRE_FLAG_LOCALE = 4 # honour system locale
|
||||||
|
|
|
@ -2473,6 +2473,24 @@ class ReTests(unittest.TestCase):
|
||||||
def test_fail(self):
|
def test_fail(self):
|
||||||
self.assertEqual(re.search(r'12(?!)|3', '123')[0], '3')
|
self.assertEqual(re.search(r'12(?!)|3', '123')[0], '3')
|
||||||
|
|
||||||
|
def test_character_set_any(self):
|
||||||
|
# The union of complementary character sets matches any character
|
||||||
|
# and is equivalent to "(?s:.)".
|
||||||
|
s = '1x\n'
|
||||||
|
for p in r'[\s\S]', r'[\d\D]', r'[\w\W]', r'[\S\s]', r'\s|\S':
|
||||||
|
with self.subTest(pattern=p):
|
||||||
|
self.assertEqual(re.findall(p, s), list(s))
|
||||||
|
self.assertEqual(re.fullmatch('(?:' + p + ')+', s).group(), s)
|
||||||
|
|
||||||
|
def test_character_set_none(self):
|
||||||
|
# Negation of the union of complementary character sets does not match
|
||||||
|
# any character.
|
||||||
|
s = '1x\n'
|
||||||
|
for p in r'[^\s\S]', r'[^\d\D]', r'[^\w\W]', r'[^\S\s]':
|
||||||
|
with self.subTest(pattern=p):
|
||||||
|
self.assertIsNone(re.search(p, s))
|
||||||
|
self.assertIsNone(re.search('(?s:.)' + p, s))
|
||||||
|
|
||||||
|
|
||||||
def get_debug_out(pat):
|
def get_debug_out(pat):
|
||||||
with captured_stdout() as out:
|
with captured_stdout() as out:
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
:mod:`re` now handles patterns like ``"[\s\S]"`` or ``"\s|\S"`` which match
|
||||||
|
any character as effectively as a dot with the ``DOTALL`` modifier
|
||||||
|
(``"(?s:.)"``).
|
Loading…
Reference in New Issue