diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py index 7b888f877eb..29109f8812e 100644 --- a/Lib/re/_compiler.py +++ b/Lib/re/_compiler.py @@ -28,6 +28,8 @@ _REPEATING_CODES = { POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), } +_CHARSET_ALL = [(NEGATE, None)] + def _combine_flags(flags, add_flags, del_flags, TYPE_FLAGS=_parser.TYPE_FLAGS): if add_flags & TYPE_FLAGS: @@ -84,17 +86,22 @@ def _compile(code, pattern, flags): code[skip] = _len(code) - skip elif op is IN: charset, hascased = _optimize_charset(av, iscased, tolower, fixes) - if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: - emit(IN_LOC_IGNORE) - elif not hascased: - emit(IN) - elif not fixes: # ascii - emit(IN_IGNORE) + if not charset: + emit(FAILURE) + elif charset == _CHARSET_ALL: + emit(ANY_ALL) else: - emit(IN_UNI_IGNORE) - skip = _len(code); emit(0) - _compile_charset(charset, flags, code) - code[skip] = _len(code) - skip + if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: + emit(IN_LOC_IGNORE) + elif not hascased: + emit(IN) + elif not fixes: # ascii + emit(IN_IGNORE) + else: + emit(IN_UNI_IGNORE) + skip = _len(code); emit(0) + _compile_charset(charset, flags, code) + code[skip] = _len(code) - skip elif op is ANY: if flags & SRE_FLAG_DOTALL: emit(ANY_ALL) @@ -277,6 +284,10 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): charmap[i] = 1 elif op is NEGATE: out.append((op, av)) + elif op is CATEGORY and tail and (CATEGORY, CH_NEGATE[av]) in tail: + # Optimize [\s\S] etc. + out = [] if out else _CHARSET_ALL + return out, False else: tail.append((op, av)) except IndexError: @@ -519,13 +530,18 @@ def _compile_info(code, pattern, flags): # look for a literal prefix prefix = [] prefix_skip = 0 - charset = [] # not used + charset = None # not used if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE): # look for literal prefix prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags) # if no prefix, look for charset prefix if not prefix: charset = _get_charset_prefix(pattern, flags) + if charset: + charset, hascased = _optimize_charset(charset) + assert not hascased + if charset == _CHARSET_ALL: + charset = None ## if prefix: ## print("*** PREFIX", prefix, prefix_skip) ## if charset: @@ -560,8 +576,6 @@ def _compile_info(code, pattern, flags): # generate overlap table code.extend(_generate_overlap_table(prefix)) elif charset: - charset, hascased = _optimize_charset(charset) - assert not hascased _compile_charset(charset, flags, code) code[skip] = len(code) - skip diff --git a/Lib/re/_constants.py b/Lib/re/_constants.py index 4cb88c96d92..d6f32302d37 100644 --- a/Lib/re/_constants.py +++ b/Lib/re/_constants.py @@ -206,6 +206,8 @@ CH_UNICODE = { CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK } +CH_NEGATE = dict(zip(CHCODES[::2] + CHCODES[1::2], CHCODES[1::2] + CHCODES[::2])) + # flags SRE_FLAG_IGNORECASE = 2 # case insensitive SRE_FLAG_LOCALE = 4 # honour system locale diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 1f2ab6028b5..a93c2aef170 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2473,6 +2473,24 @@ class ReTests(unittest.TestCase): def test_fail(self): self.assertEqual(re.search(r'12(?!)|3', '123')[0], '3') + def test_character_set_any(self): + # The union of complementary character sets mathes any character + # and is equivalent to "(?s:.)". + s = '1x\n' + for p in r'[\s\S]', r'[\d\D]', r'[\w\W]', r'[\S\s]', r'\s|\S': + with self.subTest(pattern=p): + self.assertEqual(re.findall(p, s), list(s)) + self.assertEqual(re.fullmatch('(?:' + p + ')+', s).group(), s) + + def test_character_set_none(self): + # Negation of the union of complementary character sets does not match + # any character. + s = '1x\n' + for p in r'[^\s\S]', r'[^\d\D]', r'[^\w\W]', r'[^\S\s]': + with self.subTest(pattern=p): + self.assertIsNone(re.search(p, s)) + self.assertIsNone(re.search('(?s:.)' + p, s)) + def get_debug_out(pat): with captured_stdout() as out: diff --git a/Misc/NEWS.d/next/Library/2024-06-19-13-20-01.gh-issue-111259.Wki5PV.rst b/Misc/NEWS.d/next/Library/2024-06-19-13-20-01.gh-issue-111259.Wki5PV.rst new file mode 100644 index 00000000000..91ed5f550e4 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-06-19-13-20-01.gh-issue-111259.Wki5PV.rst @@ -0,0 +1,3 @@ +:mod:`re` now handles patterns like ``"[\s\S]"`` or ``"\s|\S"`` which match +any character as effectively as a dot with the ``DOTALL`` modifier +(``"(?s:.)"``).