gh-111259: Optimize complementary character sets in RE (GH-120742)

Patterns like "[\s\S]" or "\s|\S" which match any character are now compiled
to the same effective code as a dot with the DOTALL modifier ("(?s:.)").
This commit is contained in:
Serhiy Storchaka 2024-06-20 10:19:32 +03:00 committed by GitHub
parent 3846fcfb92
commit 8bc76ae45f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 50 additions and 13 deletions

View File

@ -28,6 +28,8 @@ _REPEATING_CODES = {
POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
} }
_CHARSET_ALL = [(NEGATE, None)]
def _combine_flags(flags, add_flags, del_flags, def _combine_flags(flags, add_flags, del_flags,
TYPE_FLAGS=_parser.TYPE_FLAGS): TYPE_FLAGS=_parser.TYPE_FLAGS):
if add_flags & TYPE_FLAGS: if add_flags & TYPE_FLAGS:
@ -84,17 +86,22 @@ def _compile(code, pattern, flags):
code[skip] = _len(code) - skip code[skip] = _len(code) - skip
elif op is IN: elif op is IN:
charset, hascased = _optimize_charset(av, iscased, tolower, fixes) charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: if not charset:
emit(IN_LOC_IGNORE) emit(FAILURE)
elif not hascased: elif charset == _CHARSET_ALL:
emit(IN) emit(ANY_ALL)
elif not fixes: # ascii
emit(IN_IGNORE)
else: else:
emit(IN_UNI_IGNORE) if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
skip = _len(code); emit(0) emit(IN_LOC_IGNORE)
_compile_charset(charset, flags, code) elif not hascased:
code[skip] = _len(code) - skip emit(IN)
elif not fixes: # ascii
emit(IN_IGNORE)
else:
emit(IN_UNI_IGNORE)
skip = _len(code); emit(0)
_compile_charset(charset, flags, code)
code[skip] = _len(code) - skip
elif op is ANY: elif op is ANY:
if flags & SRE_FLAG_DOTALL: if flags & SRE_FLAG_DOTALL:
emit(ANY_ALL) emit(ANY_ALL)
@ -277,6 +284,10 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
charmap[i] = 1 charmap[i] = 1
elif op is NEGATE: elif op is NEGATE:
out.append((op, av)) out.append((op, av))
elif op is CATEGORY and tail and (CATEGORY, CH_NEGATE[av]) in tail:
# Optimize [\s\S] etc.
out = [] if out else _CHARSET_ALL
return out, False
else: else:
tail.append((op, av)) tail.append((op, av))
except IndexError: except IndexError:
@ -519,13 +530,18 @@ def _compile_info(code, pattern, flags):
# look for a literal prefix # look for a literal prefix
prefix = [] prefix = []
prefix_skip = 0 prefix_skip = 0
charset = [] # not used charset = None # not used
if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE): if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
# look for literal prefix # look for literal prefix
prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags) prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
# if no prefix, look for charset prefix # if no prefix, look for charset prefix
if not prefix: if not prefix:
charset = _get_charset_prefix(pattern, flags) charset = _get_charset_prefix(pattern, flags)
if charset:
charset, hascased = _optimize_charset(charset)
assert not hascased
if charset == _CHARSET_ALL:
charset = None
## if prefix: ## if prefix:
## print("*** PREFIX", prefix, prefix_skip) ## print("*** PREFIX", prefix, prefix_skip)
## if charset: ## if charset:
@ -560,8 +576,6 @@ def _compile_info(code, pattern, flags):
# generate overlap table # generate overlap table
code.extend(_generate_overlap_table(prefix)) code.extend(_generate_overlap_table(prefix))
elif charset: elif charset:
charset, hascased = _optimize_charset(charset)
assert not hascased
_compile_charset(charset, flags, code) _compile_charset(charset, flags, code)
code[skip] = len(code) - skip code[skip] = len(code) - skip

View File

@ -206,6 +206,8 @@ CH_UNICODE = {
CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK
} }
CH_NEGATE = dict(zip(CHCODES[::2] + CHCODES[1::2], CHCODES[1::2] + CHCODES[::2]))
# flags # flags
SRE_FLAG_IGNORECASE = 2 # case insensitive SRE_FLAG_IGNORECASE = 2 # case insensitive
SRE_FLAG_LOCALE = 4 # honour system locale SRE_FLAG_LOCALE = 4 # honour system locale

View File

@ -2473,6 +2473,24 @@ class ReTests(unittest.TestCase):
def test_fail(self): def test_fail(self):
self.assertEqual(re.search(r'12(?!)|3', '123')[0], '3') self.assertEqual(re.search(r'12(?!)|3', '123')[0], '3')
def test_character_set_any(self):
# The union of complementary character sets matches any character
# and is equivalent to "(?s:.)".
s = '1x\n'
for p in r'[\s\S]', r'[\d\D]', r'[\w\W]', r'[\S\s]', r'\s|\S':
with self.subTest(pattern=p):
self.assertEqual(re.findall(p, s), list(s))
self.assertEqual(re.fullmatch('(?:' + p + ')+', s).group(), s)
def test_character_set_none(self):
# Negation of the union of complementary character sets does not match
# any character.
s = '1x\n'
for p in r'[^\s\S]', r'[^\d\D]', r'[^\w\W]', r'[^\S\s]':
with self.subTest(pattern=p):
self.assertIsNone(re.search(p, s))
self.assertIsNone(re.search('(?s:.)' + p, s))
def get_debug_out(pat): def get_debug_out(pat):
with captured_stdout() as out: with captured_stdout() as out:

View File

@ -0,0 +1,3 @@
:mod:`re` now handles patterns like ``"[\s\S]"`` or ``"\s|\S"``, which match
any character, as efficiently as a dot with the ``DOTALL`` modifier
(``"(?s:.)"``).