bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)
This commit is contained in:
parent
fdd9b217c6
commit
3557b05c5a
|
@ -245,16 +245,32 @@ The special characters are:
|
||||||
*cannot* be retrieved after performing a match or referenced later in the
|
*cannot* be retrieved after performing a match or referenced later in the
|
||||||
pattern.
|
pattern.
|
||||||
|
|
||||||
``(?imsx-imsx:...)``
|
``(?aiLmsux-imsx:...)``
|
||||||
(Zero or more letters from the set ``'i'``, ``'m'``, ``'s'``, ``'x'``,
|
(Zero or more letters from the set ``'a'``, ``'i'``, ``'L'``, ``'m'``,
|
||||||
optionally followed by ``'-'`` followed by one or more letters from the
|
``'s'``, ``'u'``, ``'x'``, optionally followed by ``'-'`` followed by
|
||||||
same set.) The letters set or removes the corresponding flags:
|
one or more letters from the set ``'i'``, ``'m'``, ``'s'``, ``'x'``.)
|
||||||
:const:`re.I` (ignore case), :const:`re.M` (multi-line), :const:`re.S`
|
The letters set or remove the corresponding flags:
|
||||||
(dot matches all), and :const:`re.X` (verbose), for the part of the
|
:const:`re.A` (ASCII-only matching), :const:`re.I` (ignore case),
|
||||||
expression. (The flags are described in :ref:`contents-of-module-re`.)
|
:const:`re.L` (locale dependent), :const:`re.M` (multi-line),
|
||||||
|
:const:`re.S` (dot matches all), :const:`re.U` (Unicode matching),
|
||||||
|
and :const:`re.X` (verbose), for the part of the expression.
|
||||||
|
(The flags are described in :ref:`contents-of-module-re`.)
|
||||||
|
|
||||||
|
The letters ``'a'``, ``'L'`` and ``'u'`` are mutually exclusive when used
|
||||||
|
as inline flags, so they can't be combined or follow ``'-'``. Instead,
|
||||||
|
when one of them appears in an inline group, it overrides the matching mode
|
||||||
|
in the enclosing group. In Unicode patterns ``(?a:...)`` switches to
|
||||||
|
ASCII-only matching, and ``(?u:...)`` switches to Unicode matching
|
||||||
|
(default). In bytes patterns ``(?L:...)`` switches to locale-dependent
|
||||||
|
matching, and ``(?a:...)`` switches to ASCII-only matching (default).
|
||||||
|
This override is only in effect for the narrow inline group, and the
|
||||||
|
original matching mode is restored outside of the group.
|
||||||
|
|
||||||
.. versionadded:: 3.6
|
.. versionadded:: 3.6
|
||||||
|
|
||||||
|
.. versionchanged:: 3.7
|
||||||
|
The letters ``'a'``, ``'L'`` and ``'u'`` can also be used in a group.
|
||||||
|
|
||||||
``(?P<name>...)``
|
``(?P<name>...)``
|
||||||
Similar to regular parentheses, but the substring matched by the group is
|
Similar to regular parentheses, but the substring matched by the group is
|
||||||
accessible via the symbolic group name *name*. Group names must be valid
|
accessible via the symbolic group name *name*. Group names must be valid
|
||||||
|
@ -384,9 +400,7 @@ character ``'$'``.
|
||||||
Matches any Unicode decimal digit (that is, any character in
|
Matches any Unicode decimal digit (that is, any character in
|
||||||
Unicode character category [Nd]). This includes ``[0-9]``, and
|
Unicode character category [Nd]). This includes ``[0-9]``, and
|
||||||
also many other digit characters. If the :const:`ASCII` flag is
|
also many other digit characters. If the :const:`ASCII` flag is
|
||||||
used only ``[0-9]`` is matched (but the flag affects the entire
|
used only ``[0-9]`` is matched.
|
||||||
regular expression, so in such cases using an explicit ``[0-9]``
|
|
||||||
may be a better choice).
|
|
||||||
|
|
||||||
For 8-bit (bytes) patterns:
|
For 8-bit (bytes) patterns:
|
||||||
Matches any decimal digit; this is equivalent to ``[0-9]``.
|
Matches any decimal digit; this is equivalent to ``[0-9]``.
|
||||||
|
@ -394,9 +408,7 @@ character ``'$'``.
|
||||||
``\D``
|
``\D``
|
||||||
Matches any character which is not a decimal digit. This is
|
Matches any character which is not a decimal digit. This is
|
||||||
the opposite of ``\d``. If the :const:`ASCII` flag is used this
|
the opposite of ``\d``. If the :const:`ASCII` flag is used this
|
||||||
becomes the equivalent of ``[^0-9]`` (but the flag affects the entire
|
becomes the equivalent of ``[^0-9]``.
|
||||||
regular expression, so in such cases using an explicit ``[^0-9]`` may
|
|
||||||
be a better choice).
|
|
||||||
|
|
||||||
``\s``
|
``\s``
|
||||||
For Unicode (str) patterns:
|
For Unicode (str) patterns:
|
||||||
|
@ -404,9 +416,7 @@ character ``'$'``.
|
||||||
``[ \t\n\r\f\v]``, and also many other characters, for example the
|
``[ \t\n\r\f\v]``, and also many other characters, for example the
|
||||||
non-breaking spaces mandated by typography rules in many
|
non-breaking spaces mandated by typography rules in many
|
||||||
languages). If the :const:`ASCII` flag is used, only
|
languages). If the :const:`ASCII` flag is used, only
|
||||||
``[ \t\n\r\f\v]`` is matched (but the flag affects the entire
|
``[ \t\n\r\f\v]`` is matched.
|
||||||
regular expression, so in such cases using an explicit
|
|
||||||
``[ \t\n\r\f\v]`` may be a better choice).
|
|
||||||
|
|
||||||
For 8-bit (bytes) patterns:
|
For 8-bit (bytes) patterns:
|
||||||
Matches characters considered whitespace in the ASCII character set;
|
Matches characters considered whitespace in the ASCII character set;
|
||||||
|
@ -415,18 +425,14 @@ character ``'$'``.
|
||||||
``\S``
|
``\S``
|
||||||
Matches any character which is not a whitespace character. This is
|
Matches any character which is not a whitespace character. This is
|
||||||
the opposite of ``\s``. If the :const:`ASCII` flag is used this
|
the opposite of ``\s``. If the :const:`ASCII` flag is used this
|
||||||
becomes the equivalent of ``[^ \t\n\r\f\v]`` (but the flag affects the entire
|
becomes the equivalent of ``[^ \t\n\r\f\v]``.
|
||||||
regular expression, so in such cases using an explicit ``[^ \t\n\r\f\v]`` may
|
|
||||||
be a better choice).
|
|
||||||
|
|
||||||
``\w``
|
``\w``
|
||||||
For Unicode (str) patterns:
|
For Unicode (str) patterns:
|
||||||
Matches Unicode word characters; this includes most characters
|
Matches Unicode word characters; this includes most characters
|
||||||
that can be part of a word in any language, as well as numbers and
|
that can be part of a word in any language, as well as numbers and
|
||||||
the underscore. If the :const:`ASCII` flag is used, only
|
the underscore. If the :const:`ASCII` flag is used, only
|
||||||
``[a-zA-Z0-9_]`` is matched (but the flag affects the entire
|
``[a-zA-Z0-9_]`` is matched.
|
||||||
regular expression, so in such cases using an explicit
|
|
||||||
``[a-zA-Z0-9_]`` may be a better choice).
|
|
||||||
|
|
||||||
For 8-bit (bytes) patterns:
|
For 8-bit (bytes) patterns:
|
||||||
Matches characters considered alphanumeric in the ASCII character set;
|
Matches characters considered alphanumeric in the ASCII character set;
|
||||||
|
@ -437,9 +443,7 @@ character ``'$'``.
|
||||||
``\W``
|
``\W``
|
||||||
Matches any character which is not a word character. This is
|
Matches any character which is not a word character. This is
|
||||||
the opposite of ``\w``. If the :const:`ASCII` flag is used this
|
the opposite of ``\w``. If the :const:`ASCII` flag is used this
|
||||||
becomes the equivalent of ``[^a-zA-Z0-9_]`` (but the flag affects the
|
becomes the equivalent of ``[^a-zA-Z0-9_]``. If the :const:`LOCALE` flag is
|
||||||
entire regular expression, so in such cases using an explicit
|
|
||||||
``[^a-zA-Z0-9_]`` may be a better choice). If the :const:`LOCALE` flag is
|
|
||||||
used, matches characters considered alphanumeric in the current locale
|
used, matches characters considered alphanumeric in the current locale
|
||||||
and the underscore.
|
and the underscore.
|
||||||
|
|
||||||
|
@ -563,9 +567,7 @@ form.
|
||||||
letter I with dot above), 'ı' (U+0131, Latin small letter dotless i),
|
letter I with dot above), 'ı' (U+0131, Latin small letter dotless i),
|
||||||
'ſ' (U+017F, Latin small letter long s) and 'K' (U+212A, Kelvin sign).
|
'ſ' (U+017F, Latin small letter long s) and 'K' (U+212A, Kelvin sign).
|
||||||
If the :const:`ASCII` flag is used, only letters 'a' to 'z'
|
If the :const:`ASCII` flag is used, only letters 'a' to 'z'
|
||||||
and 'A' to 'Z' are matched (but the flag affects the entire regular
|
and 'A' to 'Z' are matched.
|
||||||
expression, so in such cases using an explicit ``(?-i:[a-zA-Z])`` may be
|
|
||||||
a better choice).
|
|
||||||
|
|
||||||
.. data:: L
|
.. data:: L
|
||||||
LOCALE
|
LOCALE
|
||||||
|
|
|
@ -296,6 +296,13 @@ pdb
|
||||||
argument. If given, this is printed to the console just before debugging
|
argument. If given, this is printed to the console just before debugging
|
||||||
begins.
|
begins.
|
||||||
|
|
||||||
|
re
|
||||||
|
--
|
||||||
|
|
||||||
|
The flags :const:`re.ASCII`, :const:`re.LOCALE` and :const:`re.UNICODE`
|
||||||
|
can be set within the scope of a group.
|
||||||
|
(Contributed by Serhiy Storchaka in :issue:`31690`.)
|
||||||
|
|
||||||
string
|
string
|
||||||
------
|
------
|
||||||
|
|
||||||
|
|
|
@ -62,6 +62,12 @@ _equivalences = (
|
||||||
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
|
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
|
||||||
for t in _equivalences for i in t}
|
for t in _equivalences for i in t}
|
||||||
|
|
||||||
|
# Combine the current flags with a group's add/del flags.  When add_flags
# sets any TYPE_FLAGS bit, every TYPE_FLAGS bit already present in flags is
# cleared first, so the newly added type flag replaces the previous one;
# the bits in del_flags are then removed from the combined result.
def _combine_flags(flags, add_flags, del_flags,
|
||||||
|
TYPE_FLAGS=sre_parse.TYPE_FLAGS):
|
||||||
|
if add_flags & TYPE_FLAGS:
|
||||||
|
flags &= ~TYPE_FLAGS
|
||||||
|
return (flags | add_flags) & ~del_flags
|
||||||
|
|
||||||
def _compile(code, pattern, flags):
|
def _compile(code, pattern, flags):
|
||||||
# internal: compile a (sub)pattern
|
# internal: compile a (sub)pattern
|
||||||
emit = code.append
|
emit = code.append
|
||||||
|
@ -87,15 +93,21 @@ def _compile(code, pattern, flags):
|
||||||
emit(op)
|
emit(op)
|
||||||
emit(av)
|
emit(av)
|
||||||
elif flags & SRE_FLAG_LOCALE:
|
elif flags & SRE_FLAG_LOCALE:
|
||||||
emit(OP_LOC_IGNORE[op])
|
emit(OP_LOCALE_IGNORE[op])
|
||||||
emit(av)
|
emit(av)
|
||||||
elif not iscased(av):
|
elif not iscased(av):
|
||||||
emit(op)
|
emit(op)
|
||||||
emit(av)
|
emit(av)
|
||||||
else:
|
else:
|
||||||
lo = tolower(av)
|
lo = tolower(av)
|
||||||
if fixes and lo in fixes:
|
if not fixes: # ascii
|
||||||
emit(IN_IGNORE)
|
emit(OP_IGNORE[op])
|
||||||
|
emit(lo)
|
||||||
|
elif lo not in fixes:
|
||||||
|
emit(OP_UNICODE_IGNORE[op])
|
||||||
|
emit(lo)
|
||||||
|
else:
|
||||||
|
emit(IN_UNI_IGNORE)
|
||||||
skip = _len(code); emit(0)
|
skip = _len(code); emit(0)
|
||||||
if op is NOT_LITERAL:
|
if op is NOT_LITERAL:
|
||||||
emit(NEGATE)
|
emit(NEGATE)
|
||||||
|
@ -104,17 +116,16 @@ def _compile(code, pattern, flags):
|
||||||
emit(k)
|
emit(k)
|
||||||
emit(FAILURE)
|
emit(FAILURE)
|
||||||
code[skip] = _len(code) - skip
|
code[skip] = _len(code) - skip
|
||||||
else:
|
|
||||||
emit(OP_IGNORE[op])
|
|
||||||
emit(lo)
|
|
||||||
elif op is IN:
|
elif op is IN:
|
||||||
charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
|
charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
|
||||||
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
|
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
|
||||||
emit(IN_LOC_IGNORE)
|
emit(IN_LOC_IGNORE)
|
||||||
elif hascased:
|
elif not hascased:
|
||||||
|
emit(IN)
|
||||||
|
elif not fixes: # ascii
|
||||||
emit(IN_IGNORE)
|
emit(IN_IGNORE)
|
||||||
else:
|
else:
|
||||||
emit(IN)
|
emit(IN_UNI_IGNORE)
|
||||||
skip = _len(code); emit(0)
|
skip = _len(code); emit(0)
|
||||||
_compile_charset(charset, flags, code)
|
_compile_charset(charset, flags, code)
|
||||||
code[skip] = _len(code) - skip
|
code[skip] = _len(code) - skip
|
||||||
|
@ -153,8 +164,8 @@ def _compile(code, pattern, flags):
|
||||||
if group:
|
if group:
|
||||||
emit(MARK)
|
emit(MARK)
|
||||||
emit((group-1)*2)
|
emit((group-1)*2)
|
||||||
# _compile_info(code, p, (flags | add_flags) & ~del_flags)
|
# _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
|
||||||
_compile(code, p, (flags | add_flags) & ~del_flags)
|
_compile(code, p, _combine_flags(flags, add_flags, del_flags))
|
||||||
if group:
|
if group:
|
||||||
emit(MARK)
|
emit(MARK)
|
||||||
emit((group-1)*2+1)
|
emit((group-1)*2+1)
|
||||||
|
@ -210,10 +221,14 @@ def _compile(code, pattern, flags):
|
||||||
av = CH_UNICODE[av]
|
av = CH_UNICODE[av]
|
||||||
emit(av)
|
emit(av)
|
||||||
elif op is GROUPREF:
|
elif op is GROUPREF:
|
||||||
if flags & SRE_FLAG_IGNORECASE:
|
if not flags & SRE_FLAG_IGNORECASE:
|
||||||
emit(OP_IGNORE[op])
|
|
||||||
else:
|
|
||||||
emit(op)
|
emit(op)
|
||||||
|
elif flags & SRE_FLAG_LOCALE:
|
||||||
|
emit(GROUPREF_LOC_IGNORE)
|
||||||
|
elif not fixes: # ascii
|
||||||
|
emit(GROUPREF_IGNORE)
|
||||||
|
else:
|
||||||
|
emit(GROUPREF_UNI_IGNORE)
|
||||||
emit(av-1)
|
emit(av-1)
|
||||||
elif op is GROUPREF_EXISTS:
|
elif op is GROUPREF_EXISTS:
|
||||||
emit(op)
|
emit(op)
|
||||||
|
@ -240,7 +255,7 @@ def _compile_charset(charset, flags, code):
|
||||||
pass
|
pass
|
||||||
elif op is LITERAL:
|
elif op is LITERAL:
|
||||||
emit(av)
|
emit(av)
|
||||||
elif op is RANGE or op is RANGE_IGNORE:
|
elif op is RANGE or op is RANGE_UNI_IGNORE:
|
||||||
emit(av[0])
|
emit(av[0])
|
||||||
emit(av[1])
|
emit(av[1])
|
||||||
elif op is CHARSET:
|
elif op is CHARSET:
|
||||||
|
@ -309,9 +324,9 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
|
||||||
hascased = True
|
hascased = True
|
||||||
# There are only two ranges of cased non-BMP characters:
|
# There are only two ranges of cased non-BMP characters:
|
||||||
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
|
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
|
||||||
# and for both ranges RANGE_IGNORE works.
|
# and for both ranges RANGE_UNI_IGNORE works.
|
||||||
if op is RANGE:
|
if op is RANGE:
|
||||||
op = RANGE_IGNORE
|
op = RANGE_UNI_IGNORE
|
||||||
tail.append((op, av))
|
tail.append((op, av))
|
||||||
break
|
break
|
||||||
|
|
||||||
|
@ -456,7 +471,7 @@ def _get_literal_prefix(pattern, flags):
|
||||||
prefixappend(av)
|
prefixappend(av)
|
||||||
elif op is SUBPATTERN:
|
elif op is SUBPATTERN:
|
||||||
group, add_flags, del_flags, p = av
|
group, add_flags, del_flags, p = av
|
||||||
flags1 = (flags | add_flags) & ~del_flags
|
flags1 = _combine_flags(flags, add_flags, del_flags)
|
||||||
if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
|
if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
|
||||||
break
|
break
|
||||||
prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
|
prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
|
||||||
|
@ -482,7 +497,7 @@ def _get_charset_prefix(pattern, flags):
|
||||||
if op is not SUBPATTERN:
|
if op is not SUBPATTERN:
|
||||||
break
|
break
|
||||||
group, add_flags, del_flags, pattern = av
|
group, add_flags, del_flags, pattern = av
|
||||||
flags = (flags | add_flags) & ~del_flags
|
flags = _combine_flags(flags, add_flags, del_flags)
|
||||||
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
|
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@ -631,6 +646,7 @@ def dis(code):
|
||||||
print_(op)
|
print_(op)
|
||||||
elif op in (LITERAL, NOT_LITERAL,
|
elif op in (LITERAL, NOT_LITERAL,
|
||||||
LITERAL_IGNORE, NOT_LITERAL_IGNORE,
|
LITERAL_IGNORE, NOT_LITERAL_IGNORE,
|
||||||
|
LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
|
||||||
LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
|
LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
|
||||||
arg = code[i]
|
arg = code[i]
|
||||||
i += 1
|
i += 1
|
||||||
|
@ -647,12 +663,12 @@ def dis(code):
|
||||||
arg = str(CHCODES[arg])
|
arg = str(CHCODES[arg])
|
||||||
assert arg[:9] == 'CATEGORY_'
|
assert arg[:9] == 'CATEGORY_'
|
||||||
print_(op, arg[9:])
|
print_(op, arg[9:])
|
||||||
elif op in (IN, IN_IGNORE, IN_LOC_IGNORE):
|
elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
|
||||||
skip = code[i]
|
skip = code[i]
|
||||||
print_(op, skip, to=i+skip)
|
print_(op, skip, to=i+skip)
|
||||||
dis_(i+1, i+skip)
|
dis_(i+1, i+skip)
|
||||||
i += skip
|
i += skip
|
||||||
elif op in (RANGE, RANGE_IGNORE):
|
elif op in (RANGE, RANGE_UNI_IGNORE):
|
||||||
lo, hi = code[i: i+2]
|
lo, hi = code[i: i+2]
|
||||||
i += 2
|
i += 2
|
||||||
print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
|
print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
|
||||||
|
@ -671,7 +687,8 @@ def dis(code):
|
||||||
print_2(_hex_code(code[i: i + 256//_CODEBITS]))
|
print_2(_hex_code(code[i: i + 256//_CODEBITS]))
|
||||||
i += 256//_CODEBITS
|
i += 256//_CODEBITS
|
||||||
level -= 1
|
level -= 1
|
||||||
elif op in (MARK, GROUPREF, GROUPREF_IGNORE):
|
elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
|
||||||
|
GROUPREF_LOC_IGNORE):
|
||||||
arg = code[i]
|
arg = code[i]
|
||||||
i += 1
|
i += 1
|
||||||
print_(op, arg)
|
print_(op, arg)
|
||||||
|
|
|
@ -13,7 +13,7 @@
|
||||||
|
|
||||||
# update when constants are added or removed
|
# update when constants are added or removed
|
||||||
|
|
||||||
MAGIC = 20170530
|
MAGIC = 20171005
|
||||||
|
|
||||||
from _sre import MAXREPEAT, MAXGROUPS
|
from _sre import MAXREPEAT, MAXGROUPS
|
||||||
|
|
||||||
|
@ -84,25 +84,37 @@ OPCODES = _makecodes("""
|
||||||
CALL
|
CALL
|
||||||
CATEGORY
|
CATEGORY
|
||||||
CHARSET BIGCHARSET
|
CHARSET BIGCHARSET
|
||||||
GROUPREF GROUPREF_EXISTS GROUPREF_IGNORE
|
GROUPREF GROUPREF_EXISTS
|
||||||
IN IN_IGNORE
|
IN
|
||||||
INFO
|
INFO
|
||||||
JUMP
|
JUMP
|
||||||
LITERAL LITERAL_IGNORE
|
LITERAL
|
||||||
MARK
|
MARK
|
||||||
MAX_UNTIL
|
MAX_UNTIL
|
||||||
MIN_UNTIL
|
MIN_UNTIL
|
||||||
NOT_LITERAL NOT_LITERAL_IGNORE
|
NOT_LITERAL
|
||||||
NEGATE
|
NEGATE
|
||||||
RANGE
|
RANGE
|
||||||
REPEAT
|
REPEAT
|
||||||
REPEAT_ONE
|
REPEAT_ONE
|
||||||
SUBPATTERN
|
SUBPATTERN
|
||||||
MIN_REPEAT_ONE
|
MIN_REPEAT_ONE
|
||||||
RANGE_IGNORE
|
|
||||||
|
GROUPREF_IGNORE
|
||||||
|
IN_IGNORE
|
||||||
|
LITERAL_IGNORE
|
||||||
|
NOT_LITERAL_IGNORE
|
||||||
|
|
||||||
|
GROUPREF_LOC_IGNORE
|
||||||
|
IN_LOC_IGNORE
|
||||||
LITERAL_LOC_IGNORE
|
LITERAL_LOC_IGNORE
|
||||||
NOT_LITERAL_LOC_IGNORE
|
NOT_LITERAL_LOC_IGNORE
|
||||||
IN_LOC_IGNORE
|
|
||||||
|
GROUPREF_UNI_IGNORE
|
||||||
|
IN_UNI_IGNORE
|
||||||
|
LITERAL_UNI_IGNORE
|
||||||
|
NOT_LITERAL_UNI_IGNORE
|
||||||
|
RANGE_UNI_IGNORE
|
||||||
|
|
||||||
MIN_REPEAT MAX_REPEAT
|
MIN_REPEAT MAX_REPEAT
|
||||||
""")
|
""")
|
||||||
|
@ -113,7 +125,9 @@ ATCODES = _makecodes("""
|
||||||
AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING
|
AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING
|
||||||
AT_BOUNDARY AT_NON_BOUNDARY
|
AT_BOUNDARY AT_NON_BOUNDARY
|
||||||
AT_END AT_END_LINE AT_END_STRING
|
AT_END AT_END_LINE AT_END_STRING
|
||||||
|
|
||||||
AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY
|
AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY
|
||||||
|
|
||||||
AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY
|
AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY
|
||||||
""")
|
""")
|
||||||
|
|
||||||
|
@ -123,7 +137,9 @@ CHCODES = _makecodes("""
|
||||||
CATEGORY_SPACE CATEGORY_NOT_SPACE
|
CATEGORY_SPACE CATEGORY_NOT_SPACE
|
||||||
CATEGORY_WORD CATEGORY_NOT_WORD
|
CATEGORY_WORD CATEGORY_NOT_WORD
|
||||||
CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK
|
CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK
|
||||||
|
|
||||||
CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD
|
CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD
|
||||||
|
|
||||||
CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT
|
CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT
|
||||||
CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE
|
CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE
|
||||||
CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD
|
CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD
|
||||||
|
@ -133,18 +149,20 @@ CHCODES = _makecodes("""
|
||||||
|
|
||||||
# replacement operations for "ignore case" mode
|
# replacement operations for "ignore case" mode
|
||||||
OP_IGNORE = {
|
OP_IGNORE = {
|
||||||
GROUPREF: GROUPREF_IGNORE,
|
|
||||||
IN: IN_IGNORE,
|
|
||||||
LITERAL: LITERAL_IGNORE,
|
LITERAL: LITERAL_IGNORE,
|
||||||
NOT_LITERAL: NOT_LITERAL_IGNORE,
|
NOT_LITERAL: NOT_LITERAL_IGNORE,
|
||||||
RANGE: RANGE_IGNORE,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
OP_LOC_IGNORE = {
|
OP_LOCALE_IGNORE = {
|
||||||
LITERAL: LITERAL_LOC_IGNORE,
|
LITERAL: LITERAL_LOC_IGNORE,
|
||||||
NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
|
NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
OP_UNICODE_IGNORE = {
|
||||||
|
LITERAL: LITERAL_UNI_IGNORE,
|
||||||
|
NOT_LITERAL: NOT_LITERAL_UNI_IGNORE,
|
||||||
|
}
|
||||||
|
|
||||||
AT_MULTILINE = {
|
AT_MULTILINE = {
|
||||||
AT_BEGINNING: AT_BEGINNING_LINE,
|
AT_BEGINNING: AT_BEGINNING_LINE,
|
||||||
AT_END: AT_END_LINE
|
AT_END: AT_END_LINE
|
||||||
|
|
|
@ -65,8 +65,8 @@ FLAGS = {
|
||||||
"u": SRE_FLAG_UNICODE,
|
"u": SRE_FLAG_UNICODE,
|
||||||
}
|
}
|
||||||
|
|
||||||
GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE |
|
TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
|
||||||
SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE)
|
GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
|
||||||
|
|
||||||
class Verbose(Exception):
|
class Verbose(Exception):
|
||||||
pass
|
pass
|
||||||
|
@ -822,7 +822,19 @@ def _parse_flags(source, state, char):
|
||||||
del_flags = 0
|
del_flags = 0
|
||||||
if char != "-":
|
if char != "-":
|
||||||
while True:
|
while True:
|
||||||
add_flags |= FLAGS[char]
|
flag = FLAGS[char]
|
||||||
|
if source.istext:
|
||||||
|
if char == 'L':
|
||||||
|
msg = "bad inline flags: cannot use 'L' flag with a str pattern"
|
||||||
|
raise source.error(msg)
|
||||||
|
else:
|
||||||
|
if char == 'u':
|
||||||
|
msg = "bad inline flags: cannot use 'u' flag with a bytes pattern"
|
||||||
|
raise source.error(msg)
|
||||||
|
add_flags |= flag
|
||||||
|
if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag:
|
||||||
|
msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible"
|
||||||
|
raise source.error(msg)
|
||||||
char = sourceget()
|
char = sourceget()
|
||||||
if char is None:
|
if char is None:
|
||||||
raise source.error("missing -, : or )")
|
raise source.error("missing -, : or )")
|
||||||
|
@ -844,7 +856,11 @@ def _parse_flags(source, state, char):
|
||||||
msg = "unknown flag" if char.isalpha() else "missing flag"
|
msg = "unknown flag" if char.isalpha() else "missing flag"
|
||||||
raise source.error(msg, len(char))
|
raise source.error(msg, len(char))
|
||||||
while True:
|
while True:
|
||||||
del_flags |= FLAGS[char]
|
flag = FLAGS[char]
|
||||||
|
if flag & TYPE_FLAGS:
|
||||||
|
msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'"
|
||||||
|
raise source.error(msg)
|
||||||
|
del_flags |= flag
|
||||||
char = sourceget()
|
char = sourceget()
|
||||||
if char is None:
|
if char is None:
|
||||||
raise source.error("missing :")
|
raise source.error("missing :")
|
||||||
|
|
|
@ -1470,11 +1470,11 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertIsNone(pat.match(b'\xe0'))
|
self.assertIsNone(pat.match(b'\xe0'))
|
||||||
# Incompatibilities
|
# Incompatibilities
|
||||||
self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE)
|
self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE)
|
||||||
self.assertRaises(ValueError, re.compile, br'(?u)\w')
|
self.assertRaises(re.error, re.compile, br'(?u)\w')
|
||||||
self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)
|
self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)
|
||||||
self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)
|
self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)
|
||||||
self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE)
|
self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE)
|
||||||
self.assertRaises(ValueError, re.compile, r'(?au)\w')
|
self.assertRaises(re.error, re.compile, r'(?au)\w')
|
||||||
|
|
||||||
def test_locale_flag(self):
|
def test_locale_flag(self):
|
||||||
import locale
|
import locale
|
||||||
|
@ -1516,11 +1516,11 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertIsNone(pat.match(bletter))
|
self.assertIsNone(pat.match(bletter))
|
||||||
# Incompatibilities
|
# Incompatibilities
|
||||||
self.assertRaises(ValueError, re.compile, '', re.LOCALE)
|
self.assertRaises(ValueError, re.compile, '', re.LOCALE)
|
||||||
self.assertRaises(ValueError, re.compile, '(?L)')
|
self.assertRaises(re.error, re.compile, '(?L)')
|
||||||
self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII)
|
self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII)
|
||||||
self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII)
|
self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII)
|
||||||
self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
|
self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
|
||||||
self.assertRaises(ValueError, re.compile, b'(?aL)')
|
self.assertRaises(re.error, re.compile, b'(?aL)')
|
||||||
|
|
||||||
def test_scoped_flags(self):
|
def test_scoped_flags(self):
|
||||||
self.assertTrue(re.match(r'(?i:a)b', 'Ab'))
|
self.assertTrue(re.match(r'(?i:a)b', 'Ab'))
|
||||||
|
@ -1535,12 +1535,18 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE))
|
self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE))
|
||||||
self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE))
|
self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE))
|
||||||
|
|
||||||
self.checkPatternError(r'(?a:\w)',
|
self.assertTrue(re.match(r'\w(?a:\W)\w', '\xe0\xe0\xe0'))
|
||||||
'bad inline flags: cannot turn on global flag', 3)
|
self.assertTrue(re.match(r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0'))
|
||||||
|
self.assertTrue(re.match(r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII))
|
||||||
|
|
||||||
self.checkPatternError(r'(?a)(?-a:\w)',
|
self.checkPatternError(r'(?a)(?-a:\w)',
|
||||||
'bad inline flags: cannot turn off global flag', 8)
|
"bad inline flags: cannot turn off flags 'a', 'u' and 'L'", 8)
|
||||||
self.checkPatternError(r'(?i-i:a)',
|
self.checkPatternError(r'(?i-i:a)',
|
||||||
'bad inline flags: flag turned on and off', 5)
|
'bad inline flags: flag turned on and off', 5)
|
||||||
|
self.checkPatternError(r'(?au:a)',
|
||||||
|
"bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)
|
||||||
|
self.checkPatternError(br'(?aL:a)',
|
||||||
|
"bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)
|
||||||
|
|
||||||
self.checkPatternError(r'(?-', 'missing flag', 3)
|
self.checkPatternError(r'(?-', 'missing flag', 3)
|
||||||
self.checkPatternError(r'(?-+', 'missing flag', 3)
|
self.checkPatternError(r'(?-+', 'missing flag', 3)
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
Allow the flags re.ASCII, re.LOCALE, and re.UNICODE to be used as group flags
|
||||||
|
for regular expressions.
|
|
@ -97,12 +97,12 @@ static const char copyright[] =
|
||||||
#define SRE_IS_WORD(ch)\
|
#define SRE_IS_WORD(ch)\
|
||||||
((ch) < 128 && (Py_ISALNUM(ch) || (ch) == '_'))
|
((ch) < 128 && (Py_ISALNUM(ch) || (ch) == '_'))
|
||||||
|
|
||||||
static unsigned int sre_lower(unsigned int ch)
|
static unsigned int sre_lower_ascii(unsigned int ch)
|
||||||
{
|
{
|
||||||
return ((ch) < 128 ? Py_TOLOWER(ch) : ch);
|
return ((ch) < 128 ? Py_TOLOWER(ch) : ch);
|
||||||
}
|
}
|
||||||
|
|
||||||
static unsigned int sre_upper(unsigned int ch)
|
static unsigned int sre_upper_ascii(unsigned int ch)
|
||||||
{
|
{
|
||||||
return ((ch) < 128 ? Py_TOUPPER(ch) : ch);
|
return ((ch) < 128 ? Py_TOUPPER(ch) : ch);
|
||||||
}
|
}
|
||||||
|
@ -188,6 +188,15 @@ sre_category(SRE_CODE category, unsigned int ch)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Return nonzero when ch equals pattern as-is, or when the result of
   sre_lower_locale(ch) or sre_upper_locale(ch) equals pattern. */
LOCAL(int)
|
||||||
|
char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
|
||||||
|
{
|
||||||
|
return ch == pattern
|
||||||
|
|| (SRE_CODE) sre_lower_locale(ch) == pattern
|
||||||
|
|| (SRE_CODE) sre_upper_locale(ch) == pattern;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* helpers */
|
/* helpers */
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
@ -286,7 +295,7 @@ _sre_ascii_iscased_impl(PyObject *module, int character)
|
||||||
/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
|
/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
|
||||||
{
|
{
|
||||||
unsigned int ch = (unsigned int)character;
|
unsigned int ch = (unsigned int)character;
|
||||||
return ch != sre_lower(ch) || ch != sre_upper(ch);
|
return ch != sre_lower_ascii(ch) || ch != sre_upper_ascii(ch);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*[clinic input]
|
/*[clinic input]
|
||||||
|
@ -317,7 +326,7 @@ static int
|
||||||
_sre_ascii_tolower_impl(PyObject *module, int character)
|
_sre_ascii_tolower_impl(PyObject *module, int character)
|
||||||
/*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
|
/*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
|
||||||
{
|
{
|
||||||
return sre_lower(character);
|
return sre_lower_ascii(character);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*[clinic input]
|
/*[clinic input]
|
||||||
|
@ -448,19 +457,6 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
|
||||||
state->pos = start;
|
state->pos = start;
|
||||||
state->endpos = end;
|
state->endpos = end;
|
||||||
|
|
||||||
if (pattern->flags & SRE_FLAG_LOCALE) {
|
|
||||||
state->lower = sre_lower_locale;
|
|
||||||
state->upper = sre_upper_locale;
|
|
||||||
}
|
|
||||||
else if (pattern->flags & SRE_FLAG_UNICODE) {
|
|
||||||
state->lower = sre_lower_unicode;
|
|
||||||
state->upper = sre_upper_unicode;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
state->lower = sre_lower;
|
|
||||||
state->upper = sre_upper;
|
|
||||||
}
|
|
||||||
|
|
||||||
return string;
|
return string;
|
||||||
err:
|
err:
|
||||||
PyMem_Del(state->mark);
|
PyMem_Del(state->mark);
|
||||||
|
@ -1533,7 +1529,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case SRE_OP_RANGE:
|
case SRE_OP_RANGE:
|
||||||
case SRE_OP_RANGE_IGNORE:
|
case SRE_OP_RANGE_UNI_IGNORE:
|
||||||
GET_ARG;
|
GET_ARG;
|
||||||
GET_ARG;
|
GET_ARG;
|
||||||
break;
|
break;
|
||||||
|
@ -1630,6 +1626,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
|
||||||
case SRE_OP_NOT_LITERAL:
|
case SRE_OP_NOT_LITERAL:
|
||||||
case SRE_OP_LITERAL_IGNORE:
|
case SRE_OP_LITERAL_IGNORE:
|
||||||
case SRE_OP_NOT_LITERAL_IGNORE:
|
case SRE_OP_NOT_LITERAL_IGNORE:
|
||||||
|
case SRE_OP_LITERAL_UNI_IGNORE:
|
||||||
|
case SRE_OP_NOT_LITERAL_UNI_IGNORE:
|
||||||
case SRE_OP_LITERAL_LOC_IGNORE:
|
case SRE_OP_LITERAL_LOC_IGNORE:
|
||||||
case SRE_OP_NOT_LITERAL_LOC_IGNORE:
|
case SRE_OP_NOT_LITERAL_LOC_IGNORE:
|
||||||
GET_ARG;
|
GET_ARG;
|
||||||
|
@ -1669,6 +1667,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
|
||||||
|
|
||||||
case SRE_OP_IN:
|
case SRE_OP_IN:
|
||||||
case SRE_OP_IN_IGNORE:
|
case SRE_OP_IN_IGNORE:
|
||||||
|
case SRE_OP_IN_UNI_IGNORE:
|
||||||
case SRE_OP_IN_LOC_IGNORE:
|
case SRE_OP_IN_LOC_IGNORE:
|
||||||
GET_SKIP;
|
GET_SKIP;
|
||||||
/* Stop 1 before the end; we check the FAILURE below */
|
/* Stop 1 before the end; we check the FAILURE below */
|
||||||
|
@ -1805,6 +1804,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
|
||||||
|
|
||||||
case SRE_OP_GROUPREF:
|
case SRE_OP_GROUPREF:
|
||||||
case SRE_OP_GROUPREF_IGNORE:
|
case SRE_OP_GROUPREF_IGNORE:
|
||||||
|
case SRE_OP_GROUPREF_UNI_IGNORE:
|
||||||
|
case SRE_OP_GROUPREF_LOC_IGNORE:
|
||||||
GET_ARG;
|
GET_ARG;
|
||||||
if (arg >= (size_t)groups)
|
if (arg >= (size_t)groups)
|
||||||
FAIL;
|
FAIL;
|
||||||
|
|
|
@ -52,8 +52,6 @@ typedef struct {
|
||||||
Py_ssize_t mark[1];
|
Py_ssize_t mark[1];
|
||||||
} MatchObject;
|
} MatchObject;
|
||||||
|
|
||||||
typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
|
|
||||||
|
|
||||||
typedef struct SRE_REPEAT_T {
|
typedef struct SRE_REPEAT_T {
|
||||||
Py_ssize_t count;
|
Py_ssize_t count;
|
||||||
SRE_CODE* pattern; /* points to REPEAT operator arguments */
|
SRE_CODE* pattern; /* points to REPEAT operator arguments */
|
||||||
|
@ -83,8 +81,6 @@ typedef struct {
|
||||||
Py_buffer buffer;
|
Py_buffer buffer;
|
||||||
/* current repeat context */
|
/* current repeat context */
|
||||||
SRE_REPEAT *repeat;
|
SRE_REPEAT *repeat;
|
||||||
/* hooks */
|
|
||||||
SRE_TOLOWER_HOOK lower, upper;
|
|
||||||
} SRE_STATE;
|
} SRE_STATE;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
|
|
@ -11,7 +11,7 @@
|
||||||
* See the _sre.c file for information on usage and redistribution.
|
* See the _sre.c file for information on usage and redistribution.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define SRE_MAGIC 20170530
|
#define SRE_MAGIC 20171005
|
||||||
#define SRE_OP_FAILURE 0
|
#define SRE_OP_FAILURE 0
|
||||||
#define SRE_OP_SUCCESS 1
|
#define SRE_OP_SUCCESS 1
|
||||||
#define SRE_OP_ANY 2
|
#define SRE_OP_ANY 2
|
||||||
|
@ -26,28 +26,33 @@
|
||||||
#define SRE_OP_BIGCHARSET 11
|
#define SRE_OP_BIGCHARSET 11
|
||||||
#define SRE_OP_GROUPREF 12
|
#define SRE_OP_GROUPREF 12
|
||||||
#define SRE_OP_GROUPREF_EXISTS 13
|
#define SRE_OP_GROUPREF_EXISTS 13
|
||||||
#define SRE_OP_GROUPREF_IGNORE 14
|
#define SRE_OP_IN 14
|
||||||
#define SRE_OP_IN 15
|
#define SRE_OP_INFO 15
|
||||||
#define SRE_OP_IN_IGNORE 16
|
#define SRE_OP_JUMP 16
|
||||||
#define SRE_OP_INFO 17
|
#define SRE_OP_LITERAL 17
|
||||||
#define SRE_OP_JUMP 18
|
#define SRE_OP_MARK 18
|
||||||
#define SRE_OP_LITERAL 19
|
#define SRE_OP_MAX_UNTIL 19
|
||||||
#define SRE_OP_LITERAL_IGNORE 20
|
#define SRE_OP_MIN_UNTIL 20
|
||||||
#define SRE_OP_MARK 21
|
#define SRE_OP_NOT_LITERAL 21
|
||||||
#define SRE_OP_MAX_UNTIL 22
|
#define SRE_OP_NEGATE 22
|
||||||
#define SRE_OP_MIN_UNTIL 23
|
#define SRE_OP_RANGE 23
|
||||||
#define SRE_OP_NOT_LITERAL 24
|
#define SRE_OP_REPEAT 24
|
||||||
#define SRE_OP_NOT_LITERAL_IGNORE 25
|
#define SRE_OP_REPEAT_ONE 25
|
||||||
#define SRE_OP_NEGATE 26
|
#define SRE_OP_SUBPATTERN 26
|
||||||
#define SRE_OP_RANGE 27
|
#define SRE_OP_MIN_REPEAT_ONE 27
|
||||||
#define SRE_OP_REPEAT 28
|
#define SRE_OP_GROUPREF_IGNORE 28
|
||||||
#define SRE_OP_REPEAT_ONE 29
|
#define SRE_OP_IN_IGNORE 29
|
||||||
#define SRE_OP_SUBPATTERN 30
|
#define SRE_OP_LITERAL_IGNORE 30
|
||||||
#define SRE_OP_MIN_REPEAT_ONE 31
|
#define SRE_OP_NOT_LITERAL_IGNORE 31
|
||||||
#define SRE_OP_RANGE_IGNORE 32
|
#define SRE_OP_GROUPREF_LOC_IGNORE 32
|
||||||
#define SRE_OP_LITERAL_LOC_IGNORE 33
|
#define SRE_OP_IN_LOC_IGNORE 33
|
||||||
#define SRE_OP_NOT_LITERAL_LOC_IGNORE 34
|
#define SRE_OP_LITERAL_LOC_IGNORE 34
|
||||||
#define SRE_OP_IN_LOC_IGNORE 35
|
#define SRE_OP_NOT_LITERAL_LOC_IGNORE 35
|
||||||
|
#define SRE_OP_GROUPREF_UNI_IGNORE 36
|
||||||
|
#define SRE_OP_IN_UNI_IGNORE 37
|
||||||
|
#define SRE_OP_LITERAL_UNI_IGNORE 38
|
||||||
|
#define SRE_OP_NOT_LITERAL_UNI_IGNORE 39
|
||||||
|
#define SRE_OP_RANGE_UNI_IGNORE 40
|
||||||
#define SRE_AT_BEGINNING 0
|
#define SRE_AT_BEGINNING 0
|
||||||
#define SRE_AT_BEGINNING_LINE 1
|
#define SRE_AT_BEGINNING_LINE 1
|
||||||
#define SRE_AT_BEGINNING_STRING 2
|
#define SRE_AT_BEGINNING_STRING 2
|
||||||
|
|
|
@ -100,14 +100,6 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOCAL(int)
|
|
||||||
SRE(char_loc_ignore)(SRE_STATE* state, SRE_CODE pattern, SRE_CODE ch)
|
|
||||||
{
|
|
||||||
return ch == pattern
|
|
||||||
|| (SRE_CODE) state->lower(ch) == pattern
|
|
||||||
|| (SRE_CODE) state->upper(ch) == pattern;
|
|
||||||
}
|
|
||||||
|
|
||||||
LOCAL(int)
|
LOCAL(int)
|
||||||
SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
|
SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
|
||||||
{
|
{
|
||||||
|
@ -150,14 +142,14 @@ SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
|
||||||
set += 2;
|
set += 2;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case SRE_OP_RANGE_IGNORE:
|
case SRE_OP_RANGE_UNI_IGNORE:
|
||||||
/* <RANGE_IGNORE> <lower> <upper> */
|
/* <RANGE_UNI_IGNORE> <lower> <upper> */
|
||||||
{
|
{
|
||||||
SRE_CODE uch;
|
SRE_CODE uch;
|
||||||
/* ch is already lower cased */
|
/* ch is already lower cased */
|
||||||
if (set[0] <= ch && ch <= set[1])
|
if (set[0] <= ch && ch <= set[1])
|
||||||
return ok;
|
return ok;
|
||||||
uch = state->upper(ch);
|
uch = sre_upper_unicode(ch);
|
||||||
if (set[0] <= uch && uch <= set[1])
|
if (set[0] <= uch && uch <= set[1])
|
||||||
return ok;
|
return ok;
|
||||||
set += 2;
|
set += 2;
|
||||||
|
@ -199,11 +191,11 @@ LOCAL(int)
|
||||||
SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
|
SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
|
||||||
{
|
{
|
||||||
SRE_CODE lo, up;
|
SRE_CODE lo, up;
|
||||||
lo = state->lower(ch);
|
lo = sre_lower_locale(ch);
|
||||||
if (SRE(charset)(state, set, lo))
|
if (SRE(charset)(state, set, lo))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
up = state->upper(ch);
|
up = sre_upper_locale(ch);
|
||||||
return up != lo && SRE(charset)(state, set, up);
|
return up != lo && SRE(charset)(state, set, up);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -263,7 +255,15 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
|
||||||
/* repeated literal */
|
/* repeated literal */
|
||||||
chr = pattern[1];
|
chr = pattern[1];
|
||||||
TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
|
TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
|
||||||
while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
|
while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) == chr)
|
||||||
|
ptr++;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_LITERAL_UNI_IGNORE:
|
||||||
|
/* repeated literal */
|
||||||
|
chr = pattern[1];
|
||||||
|
TRACE(("|%p|%p|COUNT LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr));
|
||||||
|
while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) == chr)
|
||||||
ptr++;
|
ptr++;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -271,7 +271,7 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
|
||||||
/* repeated literal */
|
/* repeated literal */
|
||||||
chr = pattern[1];
|
chr = pattern[1];
|
||||||
TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
|
TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
|
||||||
while (ptr < end && SRE(char_loc_ignore)(state, chr, *ptr))
|
while (ptr < end && char_loc_ignore(chr, *ptr))
|
||||||
ptr++;
|
ptr++;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -293,7 +293,15 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
|
||||||
/* repeated non-literal */
|
/* repeated non-literal */
|
||||||
chr = pattern[1];
|
chr = pattern[1];
|
||||||
TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
|
TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
|
||||||
while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
|
while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) != chr)
|
||||||
|
ptr++;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_NOT_LITERAL_UNI_IGNORE:
|
||||||
|
/* repeated non-literal */
|
||||||
|
chr = pattern[1];
|
||||||
|
TRACE(("|%p|%p|COUNT NOT_LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr));
|
||||||
|
while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) != chr)
|
||||||
ptr++;
|
ptr++;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -301,7 +309,7 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
|
||||||
/* repeated non-literal */
|
/* repeated non-literal */
|
||||||
chr = pattern[1];
|
chr = pattern[1];
|
||||||
TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
|
TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
|
||||||
while (ptr < end && !SRE(char_loc_ignore)(state, chr, *ptr))
|
while (ptr < end && !char_loc_ignore(chr, *ptr))
|
||||||
ptr++;
|
ptr++;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -687,7 +695,17 @@ entrance:
|
||||||
TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
|
TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
|
||||||
ctx->pattern, ctx->ptr, ctx->pattern[0]));
|
ctx->pattern, ctx->ptr, ctx->pattern[0]));
|
||||||
if (ctx->ptr >= end ||
|
if (ctx->ptr >= end ||
|
||||||
state->lower(*ctx->ptr) != *ctx->pattern)
|
sre_lower_ascii(*ctx->ptr) != *ctx->pattern)
|
||||||
|
RETURN_FAILURE;
|
||||||
|
ctx->pattern++;
|
||||||
|
ctx->ptr++;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_LITERAL_UNI_IGNORE:
|
||||||
|
TRACE(("|%p|%p|LITERAL_UNI_IGNORE %d\n",
|
||||||
|
ctx->pattern, ctx->ptr, ctx->pattern[0]));
|
||||||
|
if (ctx->ptr >= end ||
|
||||||
|
sre_lower_unicode(*ctx->ptr) != *ctx->pattern)
|
||||||
RETURN_FAILURE;
|
RETURN_FAILURE;
|
||||||
ctx->pattern++;
|
ctx->pattern++;
|
||||||
ctx->ptr++;
|
ctx->ptr++;
|
||||||
|
@ -697,7 +715,7 @@ entrance:
|
||||||
TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",
|
TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",
|
||||||
ctx->pattern, ctx->ptr, ctx->pattern[0]));
|
ctx->pattern, ctx->ptr, ctx->pattern[0]));
|
||||||
if (ctx->ptr >= end
|
if (ctx->ptr >= end
|
||||||
|| !SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
|
|| !char_loc_ignore(*ctx->pattern, *ctx->ptr))
|
||||||
RETURN_FAILURE;
|
RETURN_FAILURE;
|
||||||
ctx->pattern++;
|
ctx->pattern++;
|
||||||
ctx->ptr++;
|
ctx->ptr++;
|
||||||
|
@ -707,7 +725,17 @@ entrance:
|
||||||
TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
|
TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
|
||||||
ctx->pattern, ctx->ptr, *ctx->pattern));
|
ctx->pattern, ctx->ptr, *ctx->pattern));
|
||||||
if (ctx->ptr >= end ||
|
if (ctx->ptr >= end ||
|
||||||
state->lower(*ctx->ptr) == *ctx->pattern)
|
sre_lower_ascii(*ctx->ptr) == *ctx->pattern)
|
||||||
|
RETURN_FAILURE;
|
||||||
|
ctx->pattern++;
|
||||||
|
ctx->ptr++;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_NOT_LITERAL_UNI_IGNORE:
|
||||||
|
TRACE(("|%p|%p|NOT_LITERAL_UNI_IGNORE %d\n",
|
||||||
|
ctx->pattern, ctx->ptr, *ctx->pattern));
|
||||||
|
if (ctx->ptr >= end ||
|
||||||
|
sre_lower_unicode(*ctx->ptr) == *ctx->pattern)
|
||||||
RETURN_FAILURE;
|
RETURN_FAILURE;
|
||||||
ctx->pattern++;
|
ctx->pattern++;
|
||||||
ctx->ptr++;
|
ctx->ptr++;
|
||||||
|
@ -717,7 +745,7 @@ entrance:
|
||||||
TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",
|
TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",
|
||||||
ctx->pattern, ctx->ptr, *ctx->pattern));
|
ctx->pattern, ctx->ptr, *ctx->pattern));
|
||||||
if (ctx->ptr >= end
|
if (ctx->ptr >= end
|
||||||
|| SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
|
|| char_loc_ignore(*ctx->pattern, *ctx->ptr))
|
||||||
RETURN_FAILURE;
|
RETURN_FAILURE;
|
||||||
ctx->pattern++;
|
ctx->pattern++;
|
||||||
ctx->ptr++;
|
ctx->ptr++;
|
||||||
|
@ -727,7 +755,17 @@ entrance:
|
||||||
TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
|
TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
|
||||||
if (ctx->ptr >= end
|
if (ctx->ptr >= end
|
||||||
|| !SRE(charset)(state, ctx->pattern+1,
|
|| !SRE(charset)(state, ctx->pattern+1,
|
||||||
(SRE_CODE)state->lower(*ctx->ptr)))
|
(SRE_CODE)sre_lower_ascii(*ctx->ptr)))
|
||||||
|
RETURN_FAILURE;
|
||||||
|
ctx->pattern += ctx->pattern[0];
|
||||||
|
ctx->ptr++;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_IN_UNI_IGNORE:
|
||||||
|
TRACE(("|%p|%p|IN_UNI_IGNORE\n", ctx->pattern, ctx->ptr));
|
||||||
|
if (ctx->ptr >= end
|
||||||
|
|| !SRE(charset)(state, ctx->pattern+1,
|
||||||
|
(SRE_CODE)sre_lower_unicode(*ctx->ptr)))
|
||||||
RETURN_FAILURE;
|
RETURN_FAILURE;
|
||||||
ctx->pattern += ctx->pattern[0];
|
ctx->pattern += ctx->pattern[0];
|
||||||
ctx->ptr++;
|
ctx->ptr++;
|
||||||
|
@ -1135,7 +1173,59 @@ entrance:
|
||||||
RETURN_FAILURE;
|
RETURN_FAILURE;
|
||||||
while (p < e) {
|
while (p < e) {
|
||||||
if (ctx->ptr >= end ||
|
if (ctx->ptr >= end ||
|
||||||
state->lower(*ctx->ptr) != state->lower(*p))
|
sre_lower_ascii(*ctx->ptr) != sre_lower_ascii(*p))
|
||||||
|
RETURN_FAILURE;
|
||||||
|
p++;
|
||||||
|
ctx->ptr++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ctx->pattern++;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_GROUPREF_UNI_IGNORE:
|
||||||
|
/* match backreference */
|
||||||
|
TRACE(("|%p|%p|GROUPREF_UNI_IGNORE %d\n", ctx->pattern,
|
||||||
|
ctx->ptr, ctx->pattern[0]));
|
||||||
|
i = ctx->pattern[0];
|
||||||
|
{
|
||||||
|
Py_ssize_t groupref = i+i;
|
||||||
|
if (groupref >= state->lastmark) {
|
||||||
|
RETURN_FAILURE;
|
||||||
|
} else {
|
||||||
|
SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
|
||||||
|
SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
|
||||||
|
if (!p || !e || e < p)
|
||||||
|
RETURN_FAILURE;
|
||||||
|
while (p < e) {
|
||||||
|
if (ctx->ptr >= end ||
|
||||||
|
sre_lower_unicode(*ctx->ptr) != sre_lower_unicode(*p))
|
||||||
|
RETURN_FAILURE;
|
||||||
|
p++;
|
||||||
|
ctx->ptr++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ctx->pattern++;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_GROUPREF_LOC_IGNORE:
|
||||||
|
/* match backreference */
|
||||||
|
TRACE(("|%p|%p|GROUPREF_LOC_IGNORE %d\n", ctx->pattern,
|
||||||
|
ctx->ptr, ctx->pattern[0]));
|
||||||
|
i = ctx->pattern[0];
|
||||||
|
{
|
||||||
|
Py_ssize_t groupref = i+i;
|
||||||
|
if (groupref >= state->lastmark) {
|
||||||
|
RETURN_FAILURE;
|
||||||
|
} else {
|
||||||
|
SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
|
||||||
|
SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
|
||||||
|
if (!p || !e || e < p)
|
||||||
|
RETURN_FAILURE;
|
||||||
|
while (p < e) {
|
||||||
|
if (ctx->ptr >= end ||
|
||||||
|
sre_lower_locale(*ctx->ptr) != sre_lower_locale(*p))
|
||||||
RETURN_FAILURE;
|
RETURN_FAILURE;
|
||||||
p++;
|
p++;
|
||||||
ctx->ptr++;
|
ctx->ptr++;
|
||||||
|
|
Loading…
Reference in New Issue