Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.
Added new opcode RANGE_IGNORE.
This commit is contained in:
parent
455de40a6e
commit
4b8f8949b4
|
@ -22,9 +22,6 @@ if _sre.CODESIZE == 2:
|
||||||
else:
|
else:
|
||||||
MAXCODE = 0xFFFFFFFF
|
MAXCODE = 0xFFFFFFFF
|
||||||
|
|
||||||
def _identityfunction(x):
|
|
||||||
return x
|
|
||||||
|
|
||||||
_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
|
_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
|
||||||
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
|
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
|
||||||
_SUCCESS_CODES = set([SUCCESS, FAILURE])
|
_SUCCESS_CODES = set([SUCCESS, FAILURE])
|
||||||
|
@ -53,7 +50,7 @@ def _compile(code, pattern, flags):
|
||||||
return _sre.getlower(literal, flags)
|
return _sre.getlower(literal, flags)
|
||||||
else:
|
else:
|
||||||
emit(OPCODES[op])
|
emit(OPCODES[op])
|
||||||
fixup = _identityfunction
|
fixup = None
|
||||||
skip = _len(code); emit(0)
|
skip = _len(code); emit(0)
|
||||||
_compile_charset(av, flags, code, fixup)
|
_compile_charset(av, flags, code, fixup)
|
||||||
code[skip] = _len(code) - skip
|
code[skip] = _len(code) - skip
|
||||||
|
@ -172,17 +169,15 @@ def _compile(code, pattern, flags):
|
||||||
def _compile_charset(charset, flags, code, fixup=None):
|
def _compile_charset(charset, flags, code, fixup=None):
|
||||||
# compile charset subprogram
|
# compile charset subprogram
|
||||||
emit = code.append
|
emit = code.append
|
||||||
if fixup is None:
|
|
||||||
fixup = _identityfunction
|
|
||||||
for op, av in _optimize_charset(charset, fixup):
|
for op, av in _optimize_charset(charset, fixup):
|
||||||
emit(OPCODES[op])
|
emit(OPCODES[op])
|
||||||
if op is NEGATE:
|
if op is NEGATE:
|
||||||
pass
|
pass
|
||||||
elif op is LITERAL:
|
elif op is LITERAL:
|
||||||
emit(fixup(av))
|
emit(av)
|
||||||
elif op is RANGE:
|
elif op is RANGE or op is RANGE_IGNORE:
|
||||||
emit(fixup(av[0]))
|
emit(av[0])
|
||||||
emit(fixup(av[1]))
|
emit(av[1])
|
||||||
elif op is CHARSET:
|
elif op is CHARSET:
|
||||||
code.extend(av)
|
code.extend(av)
|
||||||
elif op is BIGCHARSET:
|
elif op is BIGCHARSET:
|
||||||
|
@ -207,9 +202,14 @@ def _optimize_charset(charset, fixup):
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
if op is LITERAL:
|
if op is LITERAL:
|
||||||
charmap[fixup(av)] = 1
|
if fixup:
|
||||||
|
av = fixup(av)
|
||||||
|
charmap[av] = 1
|
||||||
elif op is RANGE:
|
elif op is RANGE:
|
||||||
for i in range(fixup(av[0]), fixup(av[1])+1):
|
r = range(av[0], av[1]+1)
|
||||||
|
if fixup:
|
||||||
|
r = map(fixup, r)
|
||||||
|
for i in r:
|
||||||
charmap[i] = 1
|
charmap[i] = 1
|
||||||
elif op is NEGATE:
|
elif op is NEGATE:
|
||||||
out.append((op, av))
|
out.append((op, av))
|
||||||
|
@ -220,7 +220,12 @@ def _optimize_charset(charset, fixup):
|
||||||
# character set contains non-UCS1 character codes
|
# character set contains non-UCS1 character codes
|
||||||
charmap += b'\0' * 0xff00
|
charmap += b'\0' * 0xff00
|
||||||
continue
|
continue
|
||||||
# character set contains non-BMP character codes
|
# Character set contains non-BMP character codes.
|
||||||
|
# There are only two ranges of cased non-BMP characters:
|
||||||
|
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
|
||||||
|
# and for both ranges RANGE_IGNORE works.
|
||||||
|
if fixup and op is RANGE:
|
||||||
|
op = RANGE_IGNORE
|
||||||
tail.append((op, av))
|
tail.append((op, av))
|
||||||
break
|
break
|
||||||
|
|
||||||
|
@ -247,8 +252,10 @@ def _optimize_charset(charset, fixup):
|
||||||
else:
|
else:
|
||||||
out.append((RANGE, (p, q - 1)))
|
out.append((RANGE, (p, q - 1)))
|
||||||
out += tail
|
out += tail
|
||||||
if len(out) < len(charset):
|
# if the case was changed or new representation is more compact
|
||||||
|
if fixup or len(out) < len(charset):
|
||||||
return out
|
return out
|
||||||
|
# else original character set is good enough
|
||||||
return charset
|
return charset
|
||||||
|
|
||||||
# use bitmap
|
# use bitmap
|
||||||
|
|
|
@ -13,7 +13,7 @@
|
||||||
|
|
||||||
# update when constants are added or removed
|
# update when constants are added or removed
|
||||||
|
|
||||||
MAGIC = 20031017
|
MAGIC = 20140917
|
||||||
|
|
||||||
from _sre import MAXREPEAT, MAXGROUPS
|
from _sre import MAXREPEAT, MAXGROUPS
|
||||||
|
|
||||||
|
@ -56,6 +56,7 @@ NEGATE = "negate"
|
||||||
NOT_LITERAL = "not_literal"
|
NOT_LITERAL = "not_literal"
|
||||||
NOT_LITERAL_IGNORE = "not_literal_ignore"
|
NOT_LITERAL_IGNORE = "not_literal_ignore"
|
||||||
RANGE = "range"
|
RANGE = "range"
|
||||||
|
RANGE_IGNORE = "range_ignore"
|
||||||
REPEAT = "repeat"
|
REPEAT = "repeat"
|
||||||
REPEAT_ONE = "repeat_one"
|
REPEAT_ONE = "repeat_one"
|
||||||
SUBPATTERN = "subpattern"
|
SUBPATTERN = "subpattern"
|
||||||
|
@ -121,7 +122,8 @@ OPCODES = [
|
||||||
REPEAT,
|
REPEAT,
|
||||||
REPEAT_ONE,
|
REPEAT_ONE,
|
||||||
SUBPATTERN,
|
SUBPATTERN,
|
||||||
MIN_REPEAT_ONE
|
MIN_REPEAT_ONE,
|
||||||
|
RANGE_IGNORE,
|
||||||
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -159,7 +161,8 @@ OP_IGNORE = {
|
||||||
GROUPREF: GROUPREF_IGNORE,
|
GROUPREF: GROUPREF_IGNORE,
|
||||||
IN: IN_IGNORE,
|
IN: IN_IGNORE,
|
||||||
LITERAL: LITERAL_IGNORE,
|
LITERAL: LITERAL_IGNORE,
|
||||||
NOT_LITERAL: NOT_LITERAL_IGNORE
|
NOT_LITERAL: NOT_LITERAL_IGNORE,
|
||||||
|
RANGE: RANGE_IGNORE,
|
||||||
}
|
}
|
||||||
|
|
||||||
AT_MULTILINE = {
|
AT_MULTILINE = {
|
||||||
|
|
|
@ -601,6 +601,25 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
|
self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
|
||||||
self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
|
self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
|
||||||
|
|
||||||
|
def test_ignore_case_range(self):
|
||||||
|
# Issues #3511, #17381.
|
||||||
|
self.assertTrue(re.match(r'[9-a]', '_', re.I))
|
||||||
|
self.assertIsNone(re.match(r'[9-A]', '_', re.I))
|
||||||
|
self.assertTrue(re.match(br'[9-a]', b'_', re.I))
|
||||||
|
self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
|
||||||
|
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
|
||||||
|
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
|
||||||
|
|
||||||
def test_category(self):
|
def test_category(self):
|
||||||
self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
|
self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
|
||||||
|
|
||||||
|
|
|
@ -180,6 +180,9 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #17381: Fixed handling of case-insensitive ranges in regular
|
||||||
|
expressions.
|
||||||
|
|
||||||
- Issue #22410: Module level functions in the re module now cache compiled
|
- Issue #22410: Module level functions in the re module now cache compiled
|
||||||
locale-dependent regular expressions taking into account the locale.
|
locale-dependent regular expressions taking into account the locale.
|
||||||
|
|
||||||
|
|
|
@ -113,6 +113,11 @@ static unsigned int sre_lower(unsigned int ch)
|
||||||
return ((ch) < 128 ? Py_TOLOWER(ch) : ch);
|
return ((ch) < 128 ? Py_TOLOWER(ch) : ch);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static unsigned int sre_upper(unsigned int ch)
|
||||||
|
{
|
||||||
|
return ((ch) < 128 ? Py_TOUPPER(ch) : ch);
|
||||||
|
}
|
||||||
|
|
||||||
/* locale-specific character predicates */
|
/* locale-specific character predicates */
|
||||||
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
|
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
|
||||||
* warnings when c's type supports only numbers < N+1 */
|
* warnings when c's type supports only numbers < N+1 */
|
||||||
|
@ -124,6 +129,11 @@ static unsigned int sre_lower_locale(unsigned int ch)
|
||||||
return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
|
return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static unsigned int sre_upper_locale(unsigned int ch)
|
||||||
|
{
|
||||||
|
return ((ch) < 256 ? (unsigned int)toupper((ch)) : ch);
|
||||||
|
}
|
||||||
|
|
||||||
/* unicode-specific character predicates */
|
/* unicode-specific character predicates */
|
||||||
|
|
||||||
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
|
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
|
||||||
|
@ -137,6 +147,11 @@ static unsigned int sre_lower_unicode(unsigned int ch)
|
||||||
return (unsigned int) Py_UNICODE_TOLOWER(ch);
|
return (unsigned int) Py_UNICODE_TOLOWER(ch);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static unsigned int sre_upper_unicode(unsigned int ch)
|
||||||
|
{
|
||||||
|
return (unsigned int) Py_UNICODE_TOUPPER(ch);
|
||||||
|
}
|
||||||
|
|
||||||
LOCAL(int)
|
LOCAL(int)
|
||||||
sre_category(SRE_CODE category, unsigned int ch)
|
sre_category(SRE_CODE category, unsigned int ch)
|
||||||
{
|
{
|
||||||
|
@ -377,12 +392,18 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
|
||||||
state->pos = start;
|
state->pos = start;
|
||||||
state->endpos = end;
|
state->endpos = end;
|
||||||
|
|
||||||
if (pattern->flags & SRE_FLAG_LOCALE)
|
if (pattern->flags & SRE_FLAG_LOCALE) {
|
||||||
state->lower = sre_lower_locale;
|
state->lower = sre_lower_locale;
|
||||||
else if (pattern->flags & SRE_FLAG_UNICODE)
|
state->upper = sre_upper_locale;
|
||||||
|
}
|
||||||
|
else if (pattern->flags & SRE_FLAG_UNICODE) {
|
||||||
state->lower = sre_lower_unicode;
|
state->lower = sre_lower_unicode;
|
||||||
else
|
state->upper = sre_upper_unicode;
|
||||||
|
}
|
||||||
|
else {
|
||||||
state->lower = sre_lower;
|
state->lower = sre_lower;
|
||||||
|
state->upper = sre_upper;
|
||||||
|
}
|
||||||
|
|
||||||
return string;
|
return string;
|
||||||
err:
|
err:
|
||||||
|
@ -1567,6 +1588,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case SRE_OP_RANGE:
|
case SRE_OP_RANGE:
|
||||||
|
case SRE_OP_RANGE_IGNORE:
|
||||||
GET_ARG;
|
GET_ARG;
|
||||||
GET_ARG;
|
GET_ARG;
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -84,7 +84,7 @@ typedef struct {
|
||||||
/* current repeat context */
|
/* current repeat context */
|
||||||
SRE_REPEAT *repeat;
|
SRE_REPEAT *repeat;
|
||||||
/* hooks */
|
/* hooks */
|
||||||
SRE_TOLOWER_HOOK lower;
|
SRE_TOLOWER_HOOK lower, upper;
|
||||||
} SRE_STATE;
|
} SRE_STATE;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
|
|
@ -11,7 +11,7 @@
|
||||||
* See the _sre.c file for information on usage and redistribution.
|
* See the _sre.c file for information on usage and redistribution.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define SRE_MAGIC 20031017
|
#define SRE_MAGIC 20140917
|
||||||
#define SRE_OP_FAILURE 0
|
#define SRE_OP_FAILURE 0
|
||||||
#define SRE_OP_SUCCESS 1
|
#define SRE_OP_SUCCESS 1
|
||||||
#define SRE_OP_ANY 2
|
#define SRE_OP_ANY 2
|
||||||
|
@ -44,6 +44,7 @@
|
||||||
#define SRE_OP_REPEAT_ONE 29
|
#define SRE_OP_REPEAT_ONE 29
|
||||||
#define SRE_OP_SUBPATTERN 30
|
#define SRE_OP_SUBPATTERN 30
|
||||||
#define SRE_OP_MIN_REPEAT_ONE 31
|
#define SRE_OP_MIN_REPEAT_ONE 31
|
||||||
|
#define SRE_OP_RANGE_IGNORE 32
|
||||||
#define SRE_AT_BEGINNING 0
|
#define SRE_AT_BEGINNING 0
|
||||||
#define SRE_AT_BEGINNING_LINE 1
|
#define SRE_AT_BEGINNING_LINE 1
|
||||||
#define SRE_AT_BEGINNING_STRING 2
|
#define SRE_AT_BEGINNING_STRING 2
|
||||||
|
|
|
@ -101,7 +101,7 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
|
||||||
}
|
}
|
||||||
|
|
||||||
LOCAL(int)
|
LOCAL(int)
|
||||||
SRE(charset)(SRE_CODE* set, SRE_CODE ch)
|
SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
|
||||||
{
|
{
|
||||||
/* check if character is a member of the given set */
|
/* check if character is a member of the given set */
|
||||||
|
|
||||||
|
@ -142,6 +142,20 @@ SRE(charset)(SRE_CODE* set, SRE_CODE ch)
|
||||||
set += 2;
|
set += 2;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case SRE_OP_RANGE_IGNORE:
|
||||||
|
/* <RANGE_IGNORE> <lower> <upper> */
|
||||||
|
{
|
||||||
|
SRE_CODE uch;
|
||||||
|
/* ch is already lower cased */
|
||||||
|
if (set[0] <= ch && ch <= set[1])
|
||||||
|
return ok;
|
||||||
|
uch = state->upper(ch);
|
||||||
|
if (set[0] <= uch && uch <= set[1])
|
||||||
|
return ok;
|
||||||
|
set += 2;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
case SRE_OP_NEGATE:
|
case SRE_OP_NEGATE:
|
||||||
ok = !ok;
|
ok = !ok;
|
||||||
break;
|
break;
|
||||||
|
@ -193,7 +207,7 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
|
||||||
case SRE_OP_IN:
|
case SRE_OP_IN:
|
||||||
/* repeated set */
|
/* repeated set */
|
||||||
TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
|
TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
|
||||||
while (ptr < end && SRE(charset)(pattern + 2, *ptr))
|
while (ptr < end && SRE(charset)(state, pattern + 2, *ptr))
|
||||||
ptr++;
|
ptr++;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -628,7 +642,8 @@ entrance:
|
||||||
/* match set member (or non_member) */
|
/* match set member (or non_member) */
|
||||||
/* <IN> <skip> <set> */
|
/* <IN> <skip> <set> */
|
||||||
TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
|
TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
|
||||||
if (ctx->ptr >= end || !SRE(charset)(ctx->pattern + 1, *ctx->ptr))
|
if (ctx->ptr >= end ||
|
||||||
|
!SRE(charset)(state, ctx->pattern + 1, *ctx->ptr))
|
||||||
RETURN_FAILURE;
|
RETURN_FAILURE;
|
||||||
ctx->pattern += ctx->pattern[0];
|
ctx->pattern += ctx->pattern[0];
|
||||||
ctx->ptr++;
|
ctx->ptr++;
|
||||||
|
@ -657,7 +672,7 @@ entrance:
|
||||||
case SRE_OP_IN_IGNORE:
|
case SRE_OP_IN_IGNORE:
|
||||||
TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
|
TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
|
||||||
if (ctx->ptr >= end
|
if (ctx->ptr >= end
|
||||||
|| !SRE(charset)(ctx->pattern+1,
|
|| !SRE(charset)(state, ctx->pattern+1,
|
||||||
(SRE_CODE)state->lower(*ctx->ptr)))
|
(SRE_CODE)state->lower(*ctx->ptr)))
|
||||||
RETURN_FAILURE;
|
RETURN_FAILURE;
|
||||||
ctx->pattern += ctx->pattern[0];
|
ctx->pattern += ctx->pattern[0];
|
||||||
|
@ -688,7 +703,8 @@ entrance:
|
||||||
continue;
|
continue;
|
||||||
if (ctx->pattern[1] == SRE_OP_IN &&
|
if (ctx->pattern[1] == SRE_OP_IN &&
|
||||||
(ctx->ptr >= end ||
|
(ctx->ptr >= end ||
|
||||||
!SRE(charset)(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
|
!SRE(charset)(state, ctx->pattern + 3,
|
||||||
|
(SRE_CODE) *ctx->ptr)))
|
||||||
continue;
|
continue;
|
||||||
state->ptr = ctx->ptr;
|
state->ptr = ctx->ptr;
|
||||||
DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
|
DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
|
||||||
|
@ -1310,7 +1326,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
|
||||||
/* pattern starts with a character from a known set */
|
/* pattern starts with a character from a known set */
|
||||||
end = (SRE_CHAR *)state->end;
|
end = (SRE_CHAR *)state->end;
|
||||||
for (;;) {
|
for (;;) {
|
||||||
while (ptr < end && !SRE(charset)(charset, *ptr))
|
while (ptr < end && !SRE(charset)(state, charset, *ptr))
|
||||||
ptr++;
|
ptr++;
|
||||||
if (ptr >= end)
|
if (ptr >= end)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
Loading…
Reference in New Issue