mirror of https://github.com/python/cpython
Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.
This commit is contained in:
parent
36ac510956
commit
b1847e7541
|
@ -22,9 +22,6 @@ if _sre.CODESIZE == 2:
|
||||||
else:
|
else:
|
||||||
MAXCODE = 0xFFFFFFFF
|
MAXCODE = 0xFFFFFFFF
|
||||||
|
|
||||||
def _identityfunction(x):
|
|
||||||
return x
|
|
||||||
|
|
||||||
_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
|
_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
|
||||||
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
|
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
|
||||||
_SUCCESS_CODES = set([SUCCESS, FAILURE])
|
_SUCCESS_CODES = set([SUCCESS, FAILURE])
|
||||||
|
@ -53,7 +50,7 @@ def _compile(code, pattern, flags):
|
||||||
return _sre.getlower(literal, flags)
|
return _sre.getlower(literal, flags)
|
||||||
else:
|
else:
|
||||||
emit(OPCODES[op])
|
emit(OPCODES[op])
|
||||||
fixup = _identityfunction
|
fixup = None
|
||||||
skip = _len(code); emit(0)
|
skip = _len(code); emit(0)
|
||||||
_compile_charset(av, flags, code, fixup)
|
_compile_charset(av, flags, code, fixup)
|
||||||
code[skip] = _len(code) - skip
|
code[skip] = _len(code) - skip
|
||||||
|
@ -172,17 +169,15 @@ def _compile(code, pattern, flags):
|
||||||
def _compile_charset(charset, flags, code, fixup=None):
|
def _compile_charset(charset, flags, code, fixup=None):
|
||||||
# compile charset subprogram
|
# compile charset subprogram
|
||||||
emit = code.append
|
emit = code.append
|
||||||
if fixup is None:
|
for op, av in _optimize_charset(charset, fixup, flags & SRE_FLAG_UNICODE):
|
||||||
fixup = _identityfunction
|
|
||||||
for op, av in _optimize_charset(charset, fixup):
|
|
||||||
emit(OPCODES[op])
|
emit(OPCODES[op])
|
||||||
if op is NEGATE:
|
if op is NEGATE:
|
||||||
pass
|
pass
|
||||||
elif op is LITERAL:
|
elif op is LITERAL:
|
||||||
emit(fixup(av))
|
emit(av)
|
||||||
elif op is RANGE:
|
elif op is RANGE:
|
||||||
emit(fixup(av[0]))
|
emit(av[0])
|
||||||
emit(fixup(av[1]))
|
emit(av[1])
|
||||||
elif op is CHARSET:
|
elif op is CHARSET:
|
||||||
code.extend(av)
|
code.extend(av)
|
||||||
elif op is BIGCHARSET:
|
elif op is BIGCHARSET:
|
||||||
|
@ -198,7 +193,7 @@ def _compile_charset(charset, flags, code, fixup=None):
|
||||||
raise error("internal: unsupported set operator")
|
raise error("internal: unsupported set operator")
|
||||||
emit(OPCODES[FAILURE])
|
emit(OPCODES[FAILURE])
|
||||||
|
|
||||||
def _optimize_charset(charset, fixup):
|
def _optimize_charset(charset, fixup, isunicode):
|
||||||
# internal: optimize character set
|
# internal: optimize character set
|
||||||
out = []
|
out = []
|
||||||
tail = []
|
tail = []
|
||||||
|
@ -207,9 +202,15 @@ def _optimize_charset(charset, fixup):
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
if op is LITERAL:
|
if op is LITERAL:
|
||||||
charmap[fixup(av)] = 1
|
i = av
|
||||||
|
if fixup:
|
||||||
|
i = fixup(i)
|
||||||
|
charmap[i] = 1
|
||||||
elif op is RANGE:
|
elif op is RANGE:
|
||||||
for i in range(fixup(av[0]), fixup(av[1])+1):
|
r = range(av[0], av[1]+1)
|
||||||
|
if fixup:
|
||||||
|
r = map(fixup, r)
|
||||||
|
for i in r:
|
||||||
charmap[i] = 1
|
charmap[i] = 1
|
||||||
elif op is NEGATE:
|
elif op is NEGATE:
|
||||||
out.append((op, av))
|
out.append((op, av))
|
||||||
|
@ -221,7 +222,20 @@ def _optimize_charset(charset, fixup):
|
||||||
charmap += b'\0' * 0xff00
|
charmap += b'\0' * 0xff00
|
||||||
continue
|
continue
|
||||||
# character set contains non-BMP character codes
|
# character set contains non-BMP character codes
|
||||||
tail.append((op, av))
|
if fixup and isunicode and op is RANGE:
|
||||||
|
lo, hi = av
|
||||||
|
ranges = [av]
|
||||||
|
# There are only two ranges of cased astral characters:
|
||||||
|
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).
|
||||||
|
_fixup_range(max(0x10000, lo), min(0x11fff, hi),
|
||||||
|
ranges, fixup)
|
||||||
|
for lo, hi in ranges:
|
||||||
|
if lo == hi:
|
||||||
|
tail.append((LITERAL, hi))
|
||||||
|
else:
|
||||||
|
tail.append((RANGE, (lo, hi)))
|
||||||
|
else:
|
||||||
|
tail.append((op, av))
|
||||||
break
|
break
|
||||||
|
|
||||||
# compress character map
|
# compress character map
|
||||||
|
@ -247,8 +261,10 @@ def _optimize_charset(charset, fixup):
|
||||||
else:
|
else:
|
||||||
out.append((RANGE, (p, q - 1)))
|
out.append((RANGE, (p, q - 1)))
|
||||||
out += tail
|
out += tail
|
||||||
if len(out) < len(charset):
|
# if the case was changed or new representation is more compact
|
||||||
|
if fixup or len(out) < len(charset):
|
||||||
return out
|
return out
|
||||||
|
# else original character set is good enough
|
||||||
return charset
|
return charset
|
||||||
|
|
||||||
# use bitmap
|
# use bitmap
|
||||||
|
@ -297,6 +313,24 @@ def _optimize_charset(charset, fixup):
|
||||||
out += tail
|
out += tail
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
def _fixup_range(lo, hi, ranges, fixup):
|
||||||
|
for i in map(fixup, range(lo, hi+1)):
|
||||||
|
for k, (lo, hi) in enumerate(ranges):
|
||||||
|
if i < lo:
|
||||||
|
if l == lo - 1:
|
||||||
|
ranges[k] = (i, hi)
|
||||||
|
else:
|
||||||
|
ranges.insert(k, (i, i))
|
||||||
|
break
|
||||||
|
elif i > hi:
|
||||||
|
if i == hi + 1:
|
||||||
|
ranges[k] = (lo, i)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
ranges.append((i, i))
|
||||||
|
|
||||||
_CODEBITS = _sre.CODESIZE * 8
|
_CODEBITS = _sre.CODESIZE * 8
|
||||||
_BITS_TRANS = b'0' + b'1' * 255
|
_BITS_TRANS = b'0' + b'1' * 255
|
||||||
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
|
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
|
||||||
|
|
|
@ -583,6 +583,25 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
|
self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
|
||||||
self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
|
self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
|
||||||
|
|
||||||
|
def test_ignore_case_range(self):
|
||||||
|
# Issues #3511, #17381.
|
||||||
|
self.assertTrue(re.match(r'[9-a]', '_', re.I))
|
||||||
|
self.assertIsNone(re.match(r'[9-A]', '_', re.I))
|
||||||
|
self.assertTrue(re.match(br'[9-a]', b'_', re.I))
|
||||||
|
self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
|
||||||
|
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
|
||||||
|
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
|
||||||
|
|
||||||
def test_category(self):
|
def test_category(self):
|
||||||
self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
|
self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
|
||||||
|
|
||||||
|
|
|
@ -33,6 +33,9 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #17381: Fixed handling of case-insensitive ranges in regular
|
||||||
|
expressions.
|
||||||
|
|
||||||
- Issue #22410: Module level functions in the re module now cache compiled
|
- Issue #22410: Module level functions in the re module now cache compiled
|
||||||
locale-dependent regular expressions taking into account the locale.
|
locale-dependent regular expressions taking into account the locale.
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue