Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.

Added new opcode RANGE_IGNORE.
This commit is contained in:
Serhiy Storchaka 2014-10-31 12:36:56 +02:00
parent 455de40a6e
commit 4b8f8949b4
8 changed files with 99 additions and 28 deletions

View File

@ -22,9 +22,6 @@ if _sre.CODESIZE == 2:
else: else:
MAXCODE = 0xFFFFFFFF MAXCODE = 0xFFFFFFFF
def _identityfunction(x):
return x
_LITERAL_CODES = set([LITERAL, NOT_LITERAL]) _LITERAL_CODES = set([LITERAL, NOT_LITERAL])
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT]) _REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
_SUCCESS_CODES = set([SUCCESS, FAILURE]) _SUCCESS_CODES = set([SUCCESS, FAILURE])
@ -53,7 +50,7 @@ def _compile(code, pattern, flags):
return _sre.getlower(literal, flags) return _sre.getlower(literal, flags)
else: else:
emit(OPCODES[op]) emit(OPCODES[op])
fixup = _identityfunction fixup = None
skip = _len(code); emit(0) skip = _len(code); emit(0)
_compile_charset(av, flags, code, fixup) _compile_charset(av, flags, code, fixup)
code[skip] = _len(code) - skip code[skip] = _len(code) - skip
@ -172,17 +169,15 @@ def _compile(code, pattern, flags):
def _compile_charset(charset, flags, code, fixup=None): def _compile_charset(charset, flags, code, fixup=None):
# compile charset subprogram # compile charset subprogram
emit = code.append emit = code.append
if fixup is None:
fixup = _identityfunction
for op, av in _optimize_charset(charset, fixup): for op, av in _optimize_charset(charset, fixup):
emit(OPCODES[op]) emit(OPCODES[op])
if op is NEGATE: if op is NEGATE:
pass pass
elif op is LITERAL: elif op is LITERAL:
emit(fixup(av)) emit(av)
elif op is RANGE: elif op is RANGE or op is RANGE_IGNORE:
emit(fixup(av[0])) emit(av[0])
emit(fixup(av[1])) emit(av[1])
elif op is CHARSET: elif op is CHARSET:
code.extend(av) code.extend(av)
elif op is BIGCHARSET: elif op is BIGCHARSET:
@ -207,9 +202,14 @@ def _optimize_charset(charset, fixup):
while True: while True:
try: try:
if op is LITERAL: if op is LITERAL:
charmap[fixup(av)] = 1 if fixup:
av = fixup(av)
charmap[av] = 1
elif op is RANGE: elif op is RANGE:
for i in range(fixup(av[0]), fixup(av[1])+1): r = range(av[0], av[1]+1)
if fixup:
r = map(fixup, r)
for i in r:
charmap[i] = 1 charmap[i] = 1
elif op is NEGATE: elif op is NEGATE:
out.append((op, av)) out.append((op, av))
@ -220,7 +220,12 @@ def _optimize_charset(charset, fixup):
# character set contains non-UCS1 character codes # character set contains non-UCS1 character codes
charmap += b'\0' * 0xff00 charmap += b'\0' * 0xff00
continue continue
# character set contains non-BMP character codes # Character set contains non-BMP character codes.
# There are only two ranges of cased non-BMP characters:
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
# and for both ranges RANGE_IGNORE works.
if fixup and op is RANGE:
op = RANGE_IGNORE
tail.append((op, av)) tail.append((op, av))
break break
@ -247,8 +252,10 @@ def _optimize_charset(charset, fixup):
else: else:
out.append((RANGE, (p, q - 1))) out.append((RANGE, (p, q - 1)))
out += tail out += tail
if len(out) < len(charset): # if the case was changed or new representation is more compact
if fixup or len(out) < len(charset):
return out return out
# else original character set is good enough
return charset return charset
# use bitmap # use bitmap

View File

@ -13,7 +13,7 @@
# update when constants are added or removed # update when constants are added or removed
MAGIC = 20031017 MAGIC = 20140917
from _sre import MAXREPEAT, MAXGROUPS from _sre import MAXREPEAT, MAXGROUPS
@ -56,6 +56,7 @@ NEGATE = "negate"
NOT_LITERAL = "not_literal" NOT_LITERAL = "not_literal"
NOT_LITERAL_IGNORE = "not_literal_ignore" NOT_LITERAL_IGNORE = "not_literal_ignore"
RANGE = "range" RANGE = "range"
RANGE_IGNORE = "range_ignore"
REPEAT = "repeat" REPEAT = "repeat"
REPEAT_ONE = "repeat_one" REPEAT_ONE = "repeat_one"
SUBPATTERN = "subpattern" SUBPATTERN = "subpattern"
@ -121,7 +122,8 @@ OPCODES = [
REPEAT, REPEAT,
REPEAT_ONE, REPEAT_ONE,
SUBPATTERN, SUBPATTERN,
MIN_REPEAT_ONE MIN_REPEAT_ONE,
RANGE_IGNORE,
] ]
@ -159,7 +161,8 @@ OP_IGNORE = {
GROUPREF: GROUPREF_IGNORE, GROUPREF: GROUPREF_IGNORE,
IN: IN_IGNORE, IN: IN_IGNORE,
LITERAL: LITERAL_IGNORE, LITERAL: LITERAL_IGNORE,
NOT_LITERAL: NOT_LITERAL_IGNORE NOT_LITERAL: NOT_LITERAL_IGNORE,
RANGE: RANGE_IGNORE,
} }
AT_MULTILINE = { AT_MULTILINE = {

View File

@ -601,6 +601,25 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a") self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa") self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
def test_ignore_case_range(self):
# Issues #3511, #17381.
self.assertTrue(re.match(r'[9-a]', '_', re.I))
self.assertIsNone(re.match(r'[9-A]', '_', re.I))
self.assertTrue(re.match(br'[9-a]', b'_', re.I))
self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
def test_category(self): def test_category(self):
self.assertEqual(re.match(r"(\s)", " ").group(1), " ") self.assertEqual(re.match(r"(\s)", " ").group(1), " ")

View File

@ -180,6 +180,9 @@ Core and Builtins
Library Library
------- -------
- Issue #17381: Fixed handling of case-insensitive ranges in regular
expressions.
- Issue #22410: Module level functions in the re module now cache compiled - Issue #22410: Module level functions in the re module now cache compiled
locale-dependent regular expressions taking into account the locale. locale-dependent regular expressions taking into account the locale.

View File

@ -113,6 +113,11 @@ static unsigned int sre_lower(unsigned int ch)
return ((ch) < 128 ? Py_TOLOWER(ch) : ch); return ((ch) < 128 ? Py_TOLOWER(ch) : ch);
} }
static unsigned int sre_upper(unsigned int ch)
{
return ((ch) < 128 ? Py_TOUPPER(ch) : ch);
}
/* locale-specific character predicates */ /* locale-specific character predicates */
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
* warnings when c's type supports only numbers < N+1 */ * warnings when c's type supports only numbers < N+1 */
@ -124,6 +129,11 @@ static unsigned int sre_lower_locale(unsigned int ch)
return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch); return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
} }
static unsigned int sre_upper_locale(unsigned int ch)
{
return ((ch) < 256 ? (unsigned int)toupper((ch)) : ch);
}
/* unicode-specific character predicates */ /* unicode-specific character predicates */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch) #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
@ -137,6 +147,11 @@ static unsigned int sre_lower_unicode(unsigned int ch)
return (unsigned int) Py_UNICODE_TOLOWER(ch); return (unsigned int) Py_UNICODE_TOLOWER(ch);
} }
static unsigned int sre_upper_unicode(unsigned int ch)
{
return (unsigned int) Py_UNICODE_TOUPPER(ch);
}
LOCAL(int) LOCAL(int)
sre_category(SRE_CODE category, unsigned int ch) sre_category(SRE_CODE category, unsigned int ch)
{ {
@ -377,12 +392,18 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
state->pos = start; state->pos = start;
state->endpos = end; state->endpos = end;
if (pattern->flags & SRE_FLAG_LOCALE) if (pattern->flags & SRE_FLAG_LOCALE) {
state->lower = sre_lower_locale; state->lower = sre_lower_locale;
else if (pattern->flags & SRE_FLAG_UNICODE) state->upper = sre_upper_locale;
}
else if (pattern->flags & SRE_FLAG_UNICODE) {
state->lower = sre_lower_unicode; state->lower = sre_lower_unicode;
else state->upper = sre_upper_unicode;
}
else {
state->lower = sre_lower; state->lower = sre_lower;
state->upper = sre_upper;
}
return string; return string;
err: err:
@ -1567,6 +1588,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end)
break; break;
case SRE_OP_RANGE: case SRE_OP_RANGE:
case SRE_OP_RANGE_IGNORE:
GET_ARG; GET_ARG;
GET_ARG; GET_ARG;
break; break;

View File

@ -84,7 +84,7 @@ typedef struct {
/* current repeat context */ /* current repeat context */
SRE_REPEAT *repeat; SRE_REPEAT *repeat;
/* hooks */ /* hooks */
SRE_TOLOWER_HOOK lower; SRE_TOLOWER_HOOK lower, upper;
} SRE_STATE; } SRE_STATE;
typedef struct { typedef struct {

View File

@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution. * See the _sre.c file for information on usage and redistribution.
*/ */
#define SRE_MAGIC 20031017 #define SRE_MAGIC 20140917
#define SRE_OP_FAILURE 0 #define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1 #define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2 #define SRE_OP_ANY 2
@ -44,6 +44,7 @@
#define SRE_OP_REPEAT_ONE 29 #define SRE_OP_REPEAT_ONE 29
#define SRE_OP_SUBPATTERN 30 #define SRE_OP_SUBPATTERN 30
#define SRE_OP_MIN_REPEAT_ONE 31 #define SRE_OP_MIN_REPEAT_ONE 31
#define SRE_OP_RANGE_IGNORE 32
#define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2 #define SRE_AT_BEGINNING_STRING 2

View File

@ -101,7 +101,7 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
} }
LOCAL(int) LOCAL(int)
SRE(charset)(SRE_CODE* set, SRE_CODE ch) SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
{ {
/* check if character is a member of the given set */ /* check if character is a member of the given set */
@ -142,6 +142,20 @@ SRE(charset)(SRE_CODE* set, SRE_CODE ch)
set += 2; set += 2;
break; break;
case SRE_OP_RANGE_IGNORE:
/* <RANGE_IGNORE> <lower> <upper> */
{
SRE_CODE uch;
/* ch is already lower cased */
if (set[0] <= ch && ch <= set[1])
return ok;
uch = state->upper(ch);
if (set[0] <= uch && uch <= set[1])
return ok;
set += 2;
break;
}
case SRE_OP_NEGATE: case SRE_OP_NEGATE:
ok = !ok; ok = !ok;
break; break;
@ -193,7 +207,7 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
case SRE_OP_IN: case SRE_OP_IN:
/* repeated set */ /* repeated set */
TRACE(("|%p|%p|COUNT IN\n", pattern, ptr)); TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
while (ptr < end && SRE(charset)(pattern + 2, *ptr)) while (ptr < end && SRE(charset)(state, pattern + 2, *ptr))
ptr++; ptr++;
break; break;
@ -628,7 +642,8 @@ entrance:
/* match set member (or non_member) */ /* match set member (or non_member) */
/* <IN> <skip> <set> */ /* <IN> <skip> <set> */
TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr)); TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
if (ctx->ptr >= end || !SRE(charset)(ctx->pattern + 1, *ctx->ptr)) if (ctx->ptr >= end ||
!SRE(charset)(state, ctx->pattern + 1, *ctx->ptr))
RETURN_FAILURE; RETURN_FAILURE;
ctx->pattern += ctx->pattern[0]; ctx->pattern += ctx->pattern[0];
ctx->ptr++; ctx->ptr++;
@ -657,7 +672,7 @@ entrance:
case SRE_OP_IN_IGNORE: case SRE_OP_IN_IGNORE:
TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr)); TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
if (ctx->ptr >= end if (ctx->ptr >= end
|| !SRE(charset)(ctx->pattern+1, || !SRE(charset)(state, ctx->pattern+1,
(SRE_CODE)state->lower(*ctx->ptr))) (SRE_CODE)state->lower(*ctx->ptr)))
RETURN_FAILURE; RETURN_FAILURE;
ctx->pattern += ctx->pattern[0]; ctx->pattern += ctx->pattern[0];
@ -688,7 +703,8 @@ entrance:
continue; continue;
if (ctx->pattern[1] == SRE_OP_IN && if (ctx->pattern[1] == SRE_OP_IN &&
(ctx->ptr >= end || (ctx->ptr >= end ||
!SRE(charset)(ctx->pattern + 3, (SRE_CODE) *ctx->ptr))) !SRE(charset)(state, ctx->pattern + 3,
(SRE_CODE) *ctx->ptr)))
continue; continue;
state->ptr = ctx->ptr; state->ptr = ctx->ptr;
DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1); DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
@ -1310,7 +1326,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
/* pattern starts with a character from a known set */ /* pattern starts with a character from a known set */
end = (SRE_CHAR *)state->end; end = (SRE_CHAR *)state->end;
for (;;) { for (;;) {
while (ptr < end && !SRE(charset)(charset, *ptr)) while (ptr < end && !SRE(charset)(state, charset, *ptr))
ptr++; ptr++;
if (ptr >= end) if (ptr >= end)
return 0; return 0;