From 3557b05c5a7dfd7d97ddfd3b79aefd53d25e5132 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 24 Oct 2017 23:31:42 +0300 Subject: [PATCH] bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885) --- Doc/library/re.rst | 58 ++++---- Doc/whatsnew/3.7.rst | 7 + Lib/sre_compile.py | 59 +++++--- Lib/sre_constants.py | 40 ++++-- Lib/sre_parse.py | 24 +++- Lib/test/test_re.py | 22 +-- .../2017-10-05-15-14-46.bpo-31690.f0XteV.rst | 2 + Modules/_sre.c | 37 ++--- Modules/sre.h | 4 - Modules/sre_constants.h | 51 ++++--- Modules/sre_lib.h | 136 +++++++++++++++--- 11 files changed, 300 insertions(+), 140 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2017-10-05-15-14-46.bpo-31690.f0XteV.rst diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 3dd3a0f80d2..e0cb626305d 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -245,16 +245,32 @@ The special characters are: *cannot* be retrieved after performing a match or referenced later in the pattern. -``(?imsx-imsx:...)`` - (Zero or more letters from the set ``'i'``, ``'m'``, ``'s'``, ``'x'``, - optionally followed by ``'-'`` followed by one or more letters from the - same set.) The letters set or removes the corresponding flags: - :const:`re.I` (ignore case), :const:`re.M` (multi-line), :const:`re.S` - (dot matches all), and :const:`re.X` (verbose), for the part of the - expression. (The flags are described in :ref:`contents-of-module-re`.) +``(?aiLmsux-imsx:...)`` + (Zero or more letters from the set ``'a'``, ``'i'``, ``'L'``, ``'m'``, + ``'s'``, ``'u'``, ``'x'``, optionally followed by ``'-'`` followed by + one or more letters from the ``'i'``, ``'m'``, ``'s'``, ``'x'``.) 
+ The letters set or remove the corresponding flags: + :const:`re.A` (ASCII-only matching), :const:`re.I` (ignore case), + :const:`re.L` (locale dependent), :const:`re.M` (multi-line), + :const:`re.S` (dot matches all), :const:`re.U` (Unicode matching), + and :const:`re.X` (verbose), for the part of the expression. + (The flags are described in :ref:`contents-of-module-re`.) + + The letters ``'a'``, ``'L'`` and ``'u'`` are mutually exclusive when used + as inline flags, so they can't be combined or follow ``'-'``. Instead, + when one of them appears in an inline group, it overrides the matching mode + in the enclosing group. In Unicode patterns ``(?a:...)`` switches to + ASCII-only matching, and ``(?u:...)`` switches to Unicode matching + (default). In bytes patterns ``(?L:...)`` switches to locale dependent + matching, and ``(?a:...)`` switches to ASCII-only matching (default). + This override is only in effect for the narrow inline group, and the + original matching mode is restored outside of the group. .. versionadded:: 3.6 + .. versionchanged:: 3.7 + The letters ``'a'``, ``'L'`` and ``'u'`` also can be used in a group. + ``(?P<name>...)`` Similar to regular parentheses, but the substring matched by the group is accessible via the symbolic group name *name*. Group names must be valid @@ -384,9 +400,7 @@ character ``'$'``. Matches any Unicode decimal digit (that is, any character in Unicode character category [Nd]). This includes ``[0-9]``, and also many other digit characters. If the :const:`ASCII` flag is - used only ``[0-9]`` is matched (but the flag affects the entire - regular expression, so in such cases using an explicit ``[0-9]`` - may be a better choice). + used only ``[0-9]`` is matched. For 8-bit (bytes) patterns: Matches any decimal digit; this is equivalent to ``[0-9]``. @@ -394,9 +408,7 @@ character ``'$'``. ``\D`` Matches any character which is not a decimal digit. This is the opposite of ``\d``. 
If the :const:`ASCII` flag is used this - becomes the equivalent of ``[^0-9]`` (but the flag affects the entire - regular expression, so in such cases using an explicit ``[^0-9]`` may - be a better choice). + becomes the equivalent of ``[^0-9]``. ``\s`` For Unicode (str) patterns: @@ -404,9 +416,7 @@ character ``'$'``. ``[ \t\n\r\f\v]``, and also many other characters, for example the non-breaking spaces mandated by typography rules in many languages). If the :const:`ASCII` flag is used, only - ``[ \t\n\r\f\v]`` is matched (but the flag affects the entire - regular expression, so in such cases using an explicit - ``[ \t\n\r\f\v]`` may be a better choice). + ``[ \t\n\r\f\v]`` is matched. For 8-bit (bytes) patterns: Matches characters considered whitespace in the ASCII character set; @@ -415,18 +425,14 @@ character ``'$'``. ``\S`` Matches any character which is not a whitespace character. This is the opposite of ``\s``. If the :const:`ASCII` flag is used this - becomes the equivalent of ``[^ \t\n\r\f\v]`` (but the flag affects the entire - regular expression, so in such cases using an explicit ``[^ \t\n\r\f\v]`` may - be a better choice). + becomes the equivalent of ``[^ \t\n\r\f\v]``. ``\w`` For Unicode (str) patterns: Matches Unicode word characters; this includes most characters that can be part of a word in any language, as well as numbers and the underscore. If the :const:`ASCII` flag is used, only - ``[a-zA-Z0-9_]`` is matched (but the flag affects the entire - regular expression, so in such cases using an explicit - ``[a-zA-Z0-9_]`` may be a better choice). + ``[a-zA-Z0-9_]`` is matched. For 8-bit (bytes) patterns: Matches characters considered alphanumeric in the ASCII character set; @@ -437,9 +443,7 @@ character ``'$'``. ``\W`` Matches any character which is not a word character. This is the opposite of ``\w``. 
If the :const:`ASCII` flag is used this - becomes the equivalent of ``[^a-zA-Z0-9_]`` (but the flag affects the - entire regular expression, so in such cases using an explicit - ``[^a-zA-Z0-9_]`` may be a better choice). If the :const:`LOCALE` flag is + becomes the equivalent of ``[^a-zA-Z0-9_]``. If the :const:`LOCALE` flag is used, matches characters considered alphanumeric in the current locale and the underscore. @@ -563,9 +567,7 @@ form. letter I with dot above), 'ı' (U+0131, Latin small letter dotless i), 'ſ' (U+017F, Latin small letter long s) and 'K' (U+212A, Kelvin sign). If the :const:`ASCII` flag is used, only letters 'a' to 'z' - and 'A' to 'Z' are matched (but the flag affects the entire regular - expression, so in such cases using an explicit ``(?-i:[a-zA-Z])`` may be - a better choice). + and 'A' to 'Z' are matched. .. data:: L LOCALE diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index 46121dcf300..17e4e0a8813 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -296,6 +296,13 @@ pdb argument. If given, this is printed to the console just before debugging begins. +re +-- + +The flags :const:`re.ASCII`, :const:`re.LOCALE` and :const:`re.UNICODE` +can be set within the scope of a group. +(Contributed by Serhiy Storchaka in :issue:`31690`.) 
+ string ------ diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 144620c6d1b..e5216b792f6 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -62,6 +62,12 @@ _equivalences = ( _ignorecase_fixes = {i: tuple(j for j in t if i != j) for t in _equivalences for i in t} +def _combine_flags(flags, add_flags, del_flags, + TYPE_FLAGS=sre_parse.TYPE_FLAGS): + if add_flags & TYPE_FLAGS: + flags &= ~TYPE_FLAGS + return (flags | add_flags) & ~del_flags + def _compile(code, pattern, flags): # internal: compile a (sub)pattern emit = code.append @@ -87,15 +93,21 @@ def _compile(code, pattern, flags): emit(op) emit(av) elif flags & SRE_FLAG_LOCALE: - emit(OP_LOC_IGNORE[op]) + emit(OP_LOCALE_IGNORE[op]) emit(av) elif not iscased(av): emit(op) emit(av) else: lo = tolower(av) - if fixes and lo in fixes: - emit(IN_IGNORE) + if not fixes: # ascii + emit(OP_IGNORE[op]) + emit(lo) + elif lo not in fixes: + emit(OP_UNICODE_IGNORE[op]) + emit(lo) + else: + emit(IN_UNI_IGNORE) skip = _len(code); emit(0) if op is NOT_LITERAL: emit(NEGATE) @@ -104,17 +116,16 @@ def _compile(code, pattern, flags): emit(k) emit(FAILURE) code[skip] = _len(code) - skip - else: - emit(OP_IGNORE[op]) - emit(lo) elif op is IN: charset, hascased = _optimize_charset(av, iscased, tolower, fixes) if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: emit(IN_LOC_IGNORE) - elif hascased: + elif not hascased: + emit(IN) + elif not fixes: # ascii emit(IN_IGNORE) else: - emit(IN) + emit(IN_UNI_IGNORE) skip = _len(code); emit(0) _compile_charset(charset, flags, code) code[skip] = _len(code) - skip @@ -153,8 +164,8 @@ def _compile(code, pattern, flags): if group: emit(MARK) emit((group-1)*2) - # _compile_info(code, p, (flags | add_flags) & ~del_flags) - _compile(code, p, (flags | add_flags) & ~del_flags) + # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags)) + _compile(code, p, _combine_flags(flags, add_flags, del_flags)) if group: emit(MARK) emit((group-1)*2+1) @@ -210,10 +221,14 @@ def 
_compile(code, pattern, flags): av = CH_UNICODE[av] emit(av) elif op is GROUPREF: - if flags & SRE_FLAG_IGNORECASE: - emit(OP_IGNORE[op]) - else: + if not flags & SRE_FLAG_IGNORECASE: emit(op) + elif flags & SRE_FLAG_LOCALE: + emit(GROUPREF_LOC_IGNORE) + elif not fixes: # ascii + emit(GROUPREF_IGNORE) + else: + emit(GROUPREF_UNI_IGNORE) emit(av-1) elif op is GROUPREF_EXISTS: emit(op) @@ -240,7 +255,7 @@ def _compile_charset(charset, flags, code): pass elif op is LITERAL: emit(av) - elif op is RANGE or op is RANGE_IGNORE: + elif op is RANGE or op is RANGE_UNI_IGNORE: emit(av[0]) emit(av[1]) elif op is CHARSET: @@ -309,9 +324,9 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): hascased = True # There are only two ranges of cased non-BMP characters: # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), - # and for both ranges RANGE_IGNORE works. + # and for both ranges RANGE_UNI_IGNORE works. if op is RANGE: - op = RANGE_IGNORE + op = RANGE_UNI_IGNORE tail.append((op, av)) break @@ -456,7 +471,7 @@ def _get_literal_prefix(pattern, flags): prefixappend(av) elif op is SUBPATTERN: group, add_flags, del_flags, p = av - flags1 = (flags | add_flags) & ~del_flags + flags1 = _combine_flags(flags, add_flags, del_flags) if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE: break prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1) @@ -482,7 +497,7 @@ def _get_charset_prefix(pattern, flags): if op is not SUBPATTERN: break group, add_flags, del_flags, pattern = av - flags = (flags | add_flags) & ~del_flags + flags = _combine_flags(flags, add_flags, del_flags) if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: return None @@ -631,6 +646,7 @@ def dis(code): print_(op) elif op in (LITERAL, NOT_LITERAL, LITERAL_IGNORE, NOT_LITERAL_IGNORE, + LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE, LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE): arg = code[i] i += 1 @@ -647,12 +663,12 @@ def dis(code): arg = str(CHCODES[arg]) assert arg[:9] == 
'CATEGORY_' print_(op, arg[9:]) - elif op in (IN, IN_IGNORE, IN_LOC_IGNORE): + elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE): skip = code[i] print_(op, skip, to=i+skip) dis_(i+1, i+skip) i += skip - elif op in (RANGE, RANGE_IGNORE): + elif op in (RANGE, RANGE_UNI_IGNORE): lo, hi = code[i: i+2] i += 2 print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi))) @@ -671,7 +687,8 @@ def dis(code): print_2(_hex_code(code[i: i + 256//_CODEBITS])) i += 256//_CODEBITS level -= 1 - elif op in (MARK, GROUPREF, GROUPREF_IGNORE): + elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE, + GROUPREF_LOC_IGNORE): arg = code[i] i += 1 print_(op, arg) diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 1daa7bd00f4..13deb00bc81 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20170530 +MAGIC = 20171005 from _sre import MAXREPEAT, MAXGROUPS @@ -84,25 +84,37 @@ OPCODES = _makecodes(""" CALL CATEGORY CHARSET BIGCHARSET - GROUPREF GROUPREF_EXISTS GROUPREF_IGNORE - IN IN_IGNORE + GROUPREF GROUPREF_EXISTS + IN INFO JUMP - LITERAL LITERAL_IGNORE + LITERAL MARK MAX_UNTIL MIN_UNTIL - NOT_LITERAL NOT_LITERAL_IGNORE + NOT_LITERAL NEGATE RANGE REPEAT REPEAT_ONE SUBPATTERN MIN_REPEAT_ONE - RANGE_IGNORE + + GROUPREF_IGNORE + IN_IGNORE + LITERAL_IGNORE + NOT_LITERAL_IGNORE + + GROUPREF_LOC_IGNORE + IN_LOC_IGNORE LITERAL_LOC_IGNORE NOT_LITERAL_LOC_IGNORE - IN_LOC_IGNORE + + GROUPREF_UNI_IGNORE + IN_UNI_IGNORE + LITERAL_UNI_IGNORE + NOT_LITERAL_UNI_IGNORE + RANGE_UNI_IGNORE MIN_REPEAT MAX_REPEAT """) @@ -113,7 +125,9 @@ ATCODES = _makecodes(""" AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING AT_BOUNDARY AT_NON_BOUNDARY AT_END AT_END_LINE AT_END_STRING + AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY + AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY """) @@ -123,7 +137,9 @@ CHCODES = _makecodes(""" CATEGORY_SPACE CATEGORY_NOT_SPACE CATEGORY_WORD CATEGORY_NOT_WORD CATEGORY_LINEBREAK 
CATEGORY_NOT_LINEBREAK + CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD + CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD @@ -133,18 +149,20 @@ CHCODES = _makecodes(""" # replacement operations for "ignore case" mode OP_IGNORE = { - GROUPREF: GROUPREF_IGNORE, - IN: IN_IGNORE, LITERAL: LITERAL_IGNORE, NOT_LITERAL: NOT_LITERAL_IGNORE, - RANGE: RANGE_IGNORE, } -OP_LOC_IGNORE = { +OP_LOCALE_IGNORE = { LITERAL: LITERAL_LOC_IGNORE, NOT_LITERAL: NOT_LITERAL_LOC_IGNORE, } +OP_UNICODE_IGNORE = { + LITERAL: LITERAL_UNI_IGNORE, + NOT_LITERAL: NOT_LITERAL_UNI_IGNORE, +} + AT_MULTILINE = { AT_BEGINNING: AT_BEGINNING_LINE, AT_END: AT_END_LINE diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 545252074f6..85274122938 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -65,8 +65,8 @@ FLAGS = { "u": SRE_FLAG_UNICODE, } -GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE | - SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE) +TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE +GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE class Verbose(Exception): pass @@ -822,7 +822,19 @@ def _parse_flags(source, state, char): del_flags = 0 if char != "-": while True: - add_flags |= FLAGS[char] + flag = FLAGS[char] + if source.istext: + if char == 'L': + msg = "bad inline flags: cannot use 'L' flag with a str pattern" + raise source.error(msg) + else: + if char == 'u': + msg = "bad inline flags: cannot use 'u' flag with a bytes pattern" + raise source.error(msg) + add_flags |= flag + if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag: + msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible" + raise source.error(msg) char = sourceget() if char is None: raise source.error("missing -, : or )") @@ -844,7 +856,11 @@ def _parse_flags(source, state, char): msg = "unknown flag" if char.isalpha() else "missing flag" raise source.error(msg, len(char)) while True: - del_flags |= FLAGS[char] + 
flag = FLAGS[char] + if flag & TYPE_FLAGS: + msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'" + raise source.error(msg) + del_flags |= flag char = sourceget() if char is None: raise source.error("missing :") diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 9cb426a04dc..fc015e4ed9b 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1470,11 +1470,11 @@ class ReTests(unittest.TestCase): self.assertIsNone(pat.match(b'\xe0')) # Incompatibilities self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE) - self.assertRaises(ValueError, re.compile, br'(?u)\w') + self.assertRaises(re.error, re.compile, br'(?u)\w') self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII) self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII) self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE) - self.assertRaises(ValueError, re.compile, r'(?au)\w') + self.assertRaises(re.error, re.compile, r'(?au)\w') def test_locale_flag(self): import locale @@ -1516,11 +1516,11 @@ class ReTests(unittest.TestCase): self.assertIsNone(pat.match(bletter)) # Incompatibilities self.assertRaises(ValueError, re.compile, '', re.LOCALE) - self.assertRaises(ValueError, re.compile, '(?L)') + self.assertRaises(re.error, re.compile, '(?L)') self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII) self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII) self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE) - self.assertRaises(ValueError, re.compile, b'(?aL)') + self.assertRaises(re.error, re.compile, b'(?aL)') def test_scoped_flags(self): self.assertTrue(re.match(r'(?i:a)b', 'Ab')) @@ -1535,12 +1535,18 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE)) self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE)) - self.checkPatternError(r'(?a:\w)', - 'bad inline flags: cannot turn on global flag', 3) + self.assertTrue(re.match(r'\w(?a:\W)\w', '\xe0\xe0\xe0')) + 
self.assertTrue(re.match(r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0')) + self.assertTrue(re.match(r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII)) + self.checkPatternError(r'(?a)(?-a:\w)', - 'bad inline flags: cannot turn off global flag', 8) + "bad inline flags: cannot turn off flags 'a', 'u' and 'L'", 8) self.checkPatternError(r'(?i-i:a)', - 'bad inline flags: flag turned on and off', 5) + 'bad inline flags: flag turned on and off', 5) + self.checkPatternError(r'(?au:a)', + "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4) + self.checkPatternError(br'(?aL:a)', + "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4) self.checkPatternError(r'(?-', 'missing flag', 3) self.checkPatternError(r'(?-+', 'missing flag', 3) diff --git a/Misc/NEWS.d/next/Library/2017-10-05-15-14-46.bpo-31690.f0XteV.rst b/Misc/NEWS.d/next/Library/2017-10-05-15-14-46.bpo-31690.f0XteV.rst new file mode 100644 index 00000000000..1505615d270 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2017-10-05-15-14-46.bpo-31690.f0XteV.rst @@ -0,0 +1,2 @@ +Allow the flags re.ASCII, re.LOCALE, and re.UNICODE to be used as group flags +for regular expressions. diff --git a/Modules/_sre.c b/Modules/_sre.c index c42ab2668fd..a9b6b50e84e 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -97,12 +97,12 @@ static const char copyright[] = #define SRE_IS_WORD(ch)\ ((ch) < 128 && (Py_ISALNUM(ch) || (ch) == '_')) -static unsigned int sre_lower(unsigned int ch) +static unsigned int sre_lower_ascii(unsigned int ch) { return ((ch) < 128 ? Py_TOLOWER(ch) : ch); } -static unsigned int sre_upper(unsigned int ch) +static unsigned int sre_upper_ascii(unsigned int ch) { return ((ch) < 128 ? 
Py_TOUPPER(ch) : ch); } @@ -188,6 +188,15 @@ sre_category(SRE_CODE category, unsigned int ch) return 0; } +LOCAL(int) +char_loc_ignore(SRE_CODE pattern, SRE_CODE ch) +{ + return ch == pattern + || (SRE_CODE) sre_lower_locale(ch) == pattern + || (SRE_CODE) sre_upper_locale(ch) == pattern; +} + + /* helpers */ static void @@ -286,7 +295,7 @@ _sre_ascii_iscased_impl(PyObject *module, int character) /*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/ { unsigned int ch = (unsigned int)character; - return ch != sre_lower(ch) || ch != sre_upper(ch); + return ch != sre_lower_ascii(ch) || ch != sre_upper_ascii(ch); } /*[clinic input] @@ -317,7 +326,7 @@ static int _sre_ascii_tolower_impl(PyObject *module, int character) /*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/ { - return sre_lower(character); + return sre_lower_ascii(character); } /*[clinic input] @@ -448,19 +457,6 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, state->pos = start; state->endpos = end; - if (pattern->flags & SRE_FLAG_LOCALE) { - state->lower = sre_lower_locale; - state->upper = sre_upper_locale; - } - else if (pattern->flags & SRE_FLAG_UNICODE) { - state->lower = sre_lower_unicode; - state->upper = sre_upper_unicode; - } - else { - state->lower = sre_lower; - state->upper = sre_upper; - } - return string; err: PyMem_Del(state->mark); @@ -1533,7 +1529,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end) break; case SRE_OP_RANGE: - case SRE_OP_RANGE_IGNORE: + case SRE_OP_RANGE_UNI_IGNORE: GET_ARG; GET_ARG; break; @@ -1630,6 +1626,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) case SRE_OP_NOT_LITERAL: case SRE_OP_LITERAL_IGNORE: case SRE_OP_NOT_LITERAL_IGNORE: + case SRE_OP_LITERAL_UNI_IGNORE: + case SRE_OP_NOT_LITERAL_UNI_IGNORE: case SRE_OP_LITERAL_LOC_IGNORE: case SRE_OP_NOT_LITERAL_LOC_IGNORE: GET_ARG; @@ -1669,6 +1667,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) 
case SRE_OP_IN: case SRE_OP_IN_IGNORE: + case SRE_OP_IN_UNI_IGNORE: case SRE_OP_IN_LOC_IGNORE: GET_SKIP; /* Stop 1 before the end; we check the FAILURE below */ @@ -1805,6 +1804,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) case SRE_OP_GROUPREF: case SRE_OP_GROUPREF_IGNORE: + case SRE_OP_GROUPREF_UNI_IGNORE: + case SRE_OP_GROUPREF_LOC_IGNORE: GET_ARG; if (arg >= (size_t)groups) FAIL; diff --git a/Modules/sre.h b/Modules/sre.h index 9af5e405749..585d2841a66 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -52,8 +52,6 @@ typedef struct { Py_ssize_t mark[1]; } MatchObject; -typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch); - typedef struct SRE_REPEAT_T { Py_ssize_t count; SRE_CODE* pattern; /* points to REPEAT operator arguments */ @@ -83,8 +81,6 @@ typedef struct { Py_buffer buffer; /* current repeat context */ SRE_REPEAT *repeat; - /* hooks */ - SRE_TOLOWER_HOOK lower, upper; } SRE_STATE; typedef struct { diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 6d6d21efd04..c8ccb32d21d 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. 
*/ -#define SRE_MAGIC 20170530 +#define SRE_MAGIC 20171005 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 @@ -26,28 +26,33 @@ #define SRE_OP_BIGCHARSET 11 #define SRE_OP_GROUPREF 12 #define SRE_OP_GROUPREF_EXISTS 13 -#define SRE_OP_GROUPREF_IGNORE 14 -#define SRE_OP_IN 15 -#define SRE_OP_IN_IGNORE 16 -#define SRE_OP_INFO 17 -#define SRE_OP_JUMP 18 -#define SRE_OP_LITERAL 19 -#define SRE_OP_LITERAL_IGNORE 20 -#define SRE_OP_MARK 21 -#define SRE_OP_MAX_UNTIL 22 -#define SRE_OP_MIN_UNTIL 23 -#define SRE_OP_NOT_LITERAL 24 -#define SRE_OP_NOT_LITERAL_IGNORE 25 -#define SRE_OP_NEGATE 26 -#define SRE_OP_RANGE 27 -#define SRE_OP_REPEAT 28 -#define SRE_OP_REPEAT_ONE 29 -#define SRE_OP_SUBPATTERN 30 -#define SRE_OP_MIN_REPEAT_ONE 31 -#define SRE_OP_RANGE_IGNORE 32 -#define SRE_OP_LITERAL_LOC_IGNORE 33 -#define SRE_OP_NOT_LITERAL_LOC_IGNORE 34 -#define SRE_OP_IN_LOC_IGNORE 35 +#define SRE_OP_IN 14 +#define SRE_OP_INFO 15 +#define SRE_OP_JUMP 16 +#define SRE_OP_LITERAL 17 +#define SRE_OP_MARK 18 +#define SRE_OP_MAX_UNTIL 19 +#define SRE_OP_MIN_UNTIL 20 +#define SRE_OP_NOT_LITERAL 21 +#define SRE_OP_NEGATE 22 +#define SRE_OP_RANGE 23 +#define SRE_OP_REPEAT 24 +#define SRE_OP_REPEAT_ONE 25 +#define SRE_OP_SUBPATTERN 26 +#define SRE_OP_MIN_REPEAT_ONE 27 +#define SRE_OP_GROUPREF_IGNORE 28 +#define SRE_OP_IN_IGNORE 29 +#define SRE_OP_LITERAL_IGNORE 30 +#define SRE_OP_NOT_LITERAL_IGNORE 31 +#define SRE_OP_GROUPREF_LOC_IGNORE 32 +#define SRE_OP_IN_LOC_IGNORE 33 +#define SRE_OP_LITERAL_LOC_IGNORE 34 +#define SRE_OP_NOT_LITERAL_LOC_IGNORE 35 +#define SRE_OP_GROUPREF_UNI_IGNORE 36 +#define SRE_OP_IN_UNI_IGNORE 37 +#define SRE_OP_LITERAL_UNI_IGNORE 38 +#define SRE_OP_NOT_LITERAL_UNI_IGNORE 39 +#define SRE_OP_RANGE_UNI_IGNORE 40 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_STRING 2 diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index b540d219dde..e13b90e8bc0 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -100,14 
+100,6 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) return 0; } -LOCAL(int) -SRE(char_loc_ignore)(SRE_STATE* state, SRE_CODE pattern, SRE_CODE ch) -{ - return ch == pattern - || (SRE_CODE) state->lower(ch) == pattern - || (SRE_CODE) state->upper(ch) == pattern; -} - LOCAL(int) SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch) { @@ -150,14 +142,14 @@ SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch) set += 2; break; - case SRE_OP_RANGE_IGNORE: - /* <RANGE_IGNORE> <lower> <upper> */ + case SRE_OP_RANGE_UNI_IGNORE: + /* <RANGE_UNI_IGNORE> <lower> <upper> */ { SRE_CODE uch; /* ch is already lower cased */ if (set[0] <= ch && ch <= set[1]) return ok; - uch = state->upper(ch); + uch = sre_upper_unicode(ch); if (set[0] <= uch && uch <= set[1]) return ok; set += 2; @@ -199,11 +191,11 @@ LOCAL(int) SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch) { SRE_CODE lo, up; - lo = state->lower(ch); + lo = sre_lower_locale(ch); if (SRE(charset)(state, set, lo)) return 1; - up = state->upper(ch); + up = sre_upper_locale(ch); return up != lo && SRE(charset)(state, set, up); } @@ -263,7 +255,15 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) /* repeated literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr)); - while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr) + while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) == chr) + ptr++; + break; + + case SRE_OP_LITERAL_UNI_IGNORE: + /* repeated literal */ + chr = pattern[1]; + TRACE(("|%p|%p|COUNT LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr)); + while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) == chr) ptr++; break; @@ -271,7 +271,7 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) /* repeated literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr)); - while (ptr < end && SRE(char_loc_ignore)(state, chr, *ptr)) + while (ptr < end && char_loc_ignore(chr, *ptr)) ptr++; break; @@ -293,7 +293,15 @@ SRE(count)(SRE_STATE* state, 
SRE_CODE* pattern, Py_ssize_t maxcount) /* repeated non-literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr)); - while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr) + while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) != chr) + ptr++; + break; + + case SRE_OP_NOT_LITERAL_UNI_IGNORE: + /* repeated non-literal */ + chr = pattern[1]; + TRACE(("|%p|%p|COUNT NOT_LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr)); + while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) != chr) ptr++; break; @@ -301,7 +309,7 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) /* repeated non-literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr)); - while (ptr < end && !SRE(char_loc_ignore)(state, chr, *ptr)) + while (ptr < end && !char_loc_ignore(chr, *ptr)) ptr++; break; @@ -687,7 +695,17 @@ entrance: TRACE(("|%p|%p|LITERAL_IGNORE %d\n", ctx->pattern, ctx->ptr, ctx->pattern[0])); if (ctx->ptr >= end || - state->lower(*ctx->ptr) != *ctx->pattern) + sre_lower_ascii(*ctx->ptr) != *ctx->pattern) + RETURN_FAILURE; + ctx->pattern++; + ctx->ptr++; + break; + + case SRE_OP_LITERAL_UNI_IGNORE: + TRACE(("|%p|%p|LITERAL_UNI_IGNORE %d\n", + ctx->pattern, ctx->ptr, ctx->pattern[0])); + if (ctx->ptr >= end || + sre_lower_unicode(*ctx->ptr) != *ctx->pattern) RETURN_FAILURE; ctx->pattern++; ctx->ptr++; @@ -697,7 +715,7 @@ entrance: TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n", ctx->pattern, ctx->ptr, ctx->pattern[0])); if (ctx->ptr >= end - || !SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr)) + || !char_loc_ignore(*ctx->pattern, *ctx->ptr)) RETURN_FAILURE; ctx->pattern++; ctx->ptr++; @@ -707,7 +725,17 @@ entrance: TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", ctx->pattern, ctx->ptr, *ctx->pattern)); if (ctx->ptr >= end || - state->lower(*ctx->ptr) == *ctx->pattern) + sre_lower_ascii(*ctx->ptr) == *ctx->pattern) + RETURN_FAILURE; + ctx->pattern++; + ctx->ptr++; + break; + + case 
SRE_OP_NOT_LITERAL_UNI_IGNORE: + TRACE(("|%p|%p|NOT_LITERAL_UNI_IGNORE %d\n", + ctx->pattern, ctx->ptr, *ctx->pattern)); + if (ctx->ptr >= end || + sre_lower_unicode(*ctx->ptr) == *ctx->pattern) RETURN_FAILURE; ctx->pattern++; ctx->ptr++; @@ -717,7 +745,7 @@ entrance: TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n", ctx->pattern, ctx->ptr, *ctx->pattern)); if (ctx->ptr >= end - || SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr)) + || char_loc_ignore(*ctx->pattern, *ctx->ptr)) RETURN_FAILURE; ctx->pattern++; ctx->ptr++; @@ -727,7 +755,17 @@ entrance: TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr)); if (ctx->ptr >= end || !SRE(charset)(state, ctx->pattern+1, - (SRE_CODE)state->lower(*ctx->ptr))) + (SRE_CODE)sre_lower_ascii(*ctx->ptr))) + RETURN_FAILURE; + ctx->pattern += ctx->pattern[0]; + ctx->ptr++; + break; + + case SRE_OP_IN_UNI_IGNORE: + TRACE(("|%p|%p|IN_UNI_IGNORE\n", ctx->pattern, ctx->ptr)); + if (ctx->ptr >= end + || !SRE(charset)(state, ctx->pattern+1, + (SRE_CODE)sre_lower_unicode(*ctx->ptr))) RETURN_FAILURE; ctx->pattern += ctx->pattern[0]; ctx->ptr++; @@ -1135,7 +1173,59 @@ entrance: RETURN_FAILURE; while (p < e) { if (ctx->ptr >= end || - state->lower(*ctx->ptr) != state->lower(*p)) + sre_lower_ascii(*ctx->ptr) != sre_lower_ascii(*p)) + RETURN_FAILURE; + p++; + ctx->ptr++; + } + } + } + ctx->pattern++; + break; + + case SRE_OP_GROUPREF_UNI_IGNORE: + /* match backreference */ + TRACE(("|%p|%p|GROUPREF_UNI_IGNORE %d\n", ctx->pattern, + ctx->ptr, ctx->pattern[0])); + i = ctx->pattern[0]; + { + Py_ssize_t groupref = i+i; + if (groupref >= state->lastmark) { + RETURN_FAILURE; + } else { + SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; + SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; + if (!p || !e || e < p) + RETURN_FAILURE; + while (p < e) { + if (ctx->ptr >= end || + sre_lower_unicode(*ctx->ptr) != sre_lower_unicode(*p)) + RETURN_FAILURE; + p++; + ctx->ptr++; + } + } + } + ctx->pattern++; + break; + + case SRE_OP_GROUPREF_LOC_IGNORE: + /* 
match backreference */ + TRACE(("|%p|%p|GROUPREF_LOC_IGNORE %d\n", ctx->pattern, + ctx->ptr, ctx->pattern[0])); + i = ctx->pattern[0]; + { + Py_ssize_t groupref = i+i; + if (groupref >= state->lastmark) { + RETURN_FAILURE; + } else { + SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; + SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; + if (!p || !e || e < p) + RETURN_FAILURE; + while (p < e) { + if (ctx->ptr >= end || + sre_lower_locale(*ctx->ptr) != sre_lower_locale(*p)) RETURN_FAILURE; p++; ctx->ptr++;