bpo-30215: Make re.compile() locale agnostic. (#1361)
Compiled regular expression objects with the re.LOCALE flag no longer depend on the locale at compile time. Only the locale at matching time affects the result of matching.
This commit is contained in:
parent
647c3d381e
commit
898ff03e1e
|
@ -559,6 +559,11 @@ form.
|
|||
:const:`re.LOCALE` can be used only with bytes patterns and is
|
||||
not compatible with :const:`re.ASCII`.
|
||||
|
||||
.. versionchanged:: 3.7
|
||||
Compiled regular expression objects with the :const:`re.LOCALE` flag no
|
||||
longer depend on the locale at compile time. Only the locale at
|
||||
matching time affects the result of matching.
|
||||
|
||||
|
||||
.. data:: M
|
||||
MULTILINE
|
||||
|
|
12
Lib/re.py
12
Lib/re.py
|
@ -268,9 +268,7 @@ _MAXCACHE = 512
|
|||
def _compile(pattern, flags):
|
||||
# internal: compile pattern
|
||||
try:
|
||||
p, loc = _cache[type(pattern), pattern, flags]
|
||||
if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE):
|
||||
return p
|
||||
return _cache[type(pattern), pattern, flags]
|
||||
except KeyError:
|
||||
pass
|
||||
if isinstance(pattern, _pattern_type):
|
||||
|
@ -284,13 +282,7 @@ def _compile(pattern, flags):
|
|||
if not (flags & DEBUG):
|
||||
if len(_cache) >= _MAXCACHE:
|
||||
_cache.clear()
|
||||
if p.flags & LOCALE:
|
||||
if not _locale:
|
||||
return p
|
||||
loc = _locale.setlocale(_locale.LC_CTYPE)
|
||||
else:
|
||||
loc = None
|
||||
_cache[type(pattern), pattern, flags] = p, loc
|
||||
_cache[type(pattern), pattern, flags] = p
|
||||
return p
|
||||
|
||||
@functools.lru_cache(_MAXCACHE)
|
||||
|
|
|
@ -78,7 +78,13 @@ def _compile(code, pattern, flags):
|
|||
fixes = None
|
||||
for op, av in pattern:
|
||||
if op in LITERAL_CODES:
|
||||
if flags & SRE_FLAG_IGNORECASE:
|
||||
if not flags & SRE_FLAG_IGNORECASE:
|
||||
emit(op)
|
||||
emit(av)
|
||||
elif flags & SRE_FLAG_LOCALE:
|
||||
emit(OP_LOC_IGNORE[op])
|
||||
emit(av)
|
||||
else:
|
||||
lo = _sre.getlower(av, flags)
|
||||
if fixes and lo in fixes:
|
||||
emit(IN_IGNORE)
|
||||
|
@ -93,17 +99,17 @@ def _compile(code, pattern, flags):
|
|||
else:
|
||||
emit(OP_IGNORE[op])
|
||||
emit(lo)
|
||||
else:
|
||||
emit(op)
|
||||
emit(av)
|
||||
elif op is IN:
|
||||
if flags & SRE_FLAG_IGNORECASE:
|
||||
emit(OP_IGNORE[op])
|
||||
def fixup(literal, flags=flags):
|
||||
return _sre.getlower(literal, flags)
|
||||
else:
|
||||
if not flags & SRE_FLAG_IGNORECASE:
|
||||
emit(op)
|
||||
fixup = None
|
||||
elif flags & SRE_FLAG_LOCALE:
|
||||
emit(IN_LOC_IGNORE)
|
||||
fixup = None
|
||||
else:
|
||||
emit(IN_IGNORE)
|
||||
def fixup(literal, flags=flags):
|
||||
return _sre.getlower(literal, flags)
|
||||
skip = _len(code); emit(0)
|
||||
_compile_charset(av, flags, code, fixup, fixes)
|
||||
code[skip] = _len(code) - skip
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
# update when constants are added or removed
|
||||
|
||||
MAGIC = 20140917
|
||||
MAGIC = 20170530
|
||||
|
||||
from _sre import MAXREPEAT, MAXGROUPS
|
||||
|
||||
|
@ -87,6 +87,9 @@ OPCODES = _makecodes("""
|
|||
SUBPATTERN
|
||||
MIN_REPEAT_ONE
|
||||
RANGE_IGNORE
|
||||
LITERAL_LOC_IGNORE
|
||||
NOT_LITERAL_LOC_IGNORE
|
||||
IN_LOC_IGNORE
|
||||
|
||||
MIN_REPEAT MAX_REPEAT
|
||||
""")
|
||||
|
@ -124,6 +127,11 @@ OP_IGNORE = {
|
|||
RANGE: RANGE_IGNORE,
|
||||
}
|
||||
|
||||
# Opcode substitutions the compiler applies to literal opcodes when both
# the IGNORECASE and LOCALE flags are set.
OP_LOC_IGNORE = {
    LITERAL: LITERAL_LOC_IGNORE,
    NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
}
|
||||
|
||||
AT_MULTILINE = {
|
||||
AT_BEGINNING: AT_BEGINNING_LINE,
|
||||
AT_END: AT_END_LINE
|
||||
|
|
|
@ -1730,6 +1730,38 @@ SUBPATTERN None 0 0
|
|||
self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
|
||||
self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
|
||||
|
||||
def test_locale_compiled(self):
    """Check that LOCALE patterns follow the locale active at match time.

    The patterns are compiled once under 'en_US.iso88591' and then matched
    under both 'en_US.iso88591' and 'en_US.utf8'; the expected results
    differ between the two locales.
    """
    saved_ctype = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)

    # Probe both locales first so the test is skipped before any
    # pattern is compiled if either one is unavailable.
    for name in 'en_US.iso88591', 'en_US.utf8':
        try:
            locale.setlocale(locale.LC_CTYPE, name)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % name)

    # Compile every pattern while the ISO 8859-1 locale is in effect.
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    p1 = re.compile(b'\xc5\xe5', re.L|re.I)
    p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
    p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
    p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
    positive_patterns = (p1, p2, p3)

    # Still in ISO 8859-1: b'\xc5' and b'\xe5' are expected to match each
    # other case-insensitively.
    for pat in positive_patterns:
        self.assertTrue(pat.match(b'\xc5\xe5'))
        self.assertTrue(pat.match(b'\xe5\xe5'))
        self.assertTrue(pat.match(b'\xc5\xc5'))
    self.assertIsNone(p4.match(b'\xe5\xc5'))
    self.assertIsNone(p4.match(b'\xe5\xe5'))
    self.assertIsNone(p4.match(b'\xc5\xc5'))

    # Same compiled objects, now matched under UTF-8: the two bytes are
    # expected to match only themselves.
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    for pat in positive_patterns:
        self.assertTrue(pat.match(b'\xc5\xe5'))
        self.assertIsNone(pat.match(b'\xe5\xe5'))
        self.assertIsNone(pat.match(b'\xc5\xc5'))
    self.assertTrue(p4.match(b'\xe5\xc5'))
    self.assertIsNone(p4.match(b'\xe5\xe5'))
    self.assertIsNone(p4.match(b'\xc5\xc5'))
|
||||
|
||||
def test_error(self):
|
||||
with self.assertRaises(re.error) as cm:
|
||||
re.compile('(\u20ac))')
|
||||
|
|
|
@ -317,6 +317,10 @@ Extension Modules
|
|||
Library
|
||||
-------
|
||||
|
||||
- bpo-30215: Compiled regular expression objects with the re.LOCALE flag no
|
||||
longer depend on the locale at compile time. Only the locale at matching
|
||||
time affects the result of matching.
|
||||
|
||||
- bpo-30185: Avoid KeyboardInterrupt tracebacks in forkserver helper process
|
||||
when Ctrl-C is received.
|
||||
|
||||
|
|
|
@ -1588,6 +1588,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
|
|||
case SRE_OP_NOT_LITERAL:
|
||||
case SRE_OP_LITERAL_IGNORE:
|
||||
case SRE_OP_NOT_LITERAL_IGNORE:
|
||||
case SRE_OP_LITERAL_LOC_IGNORE:
|
||||
case SRE_OP_NOT_LITERAL_LOC_IGNORE:
|
||||
GET_ARG;
|
||||
/* The arg is just a character, nothing to check */
|
||||
break;
|
||||
|
@ -1625,6 +1627,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
|
|||
|
||||
case SRE_OP_IN:
|
||||
case SRE_OP_IN_IGNORE:
|
||||
case SRE_OP_IN_LOC_IGNORE:
|
||||
GET_SKIP;
|
||||
/* Stop 1 before the end; we check the FAILURE below */
|
||||
if (!_validate_charset(code, code+skip-2))
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
* See the _sre.c file for information on usage and redistribution.
|
||||
*/
|
||||
|
||||
#define SRE_MAGIC 20140917
|
||||
#define SRE_MAGIC 20170530
|
||||
#define SRE_OP_FAILURE 0
|
||||
#define SRE_OP_SUCCESS 1
|
||||
#define SRE_OP_ANY 2
|
||||
|
@ -45,6 +45,9 @@
|
|||
#define SRE_OP_SUBPATTERN 30
|
||||
#define SRE_OP_MIN_REPEAT_ONE 31
|
||||
#define SRE_OP_RANGE_IGNORE 32
|
||||
#define SRE_OP_LITERAL_LOC_IGNORE 33
|
||||
#define SRE_OP_NOT_LITERAL_LOC_IGNORE 34
|
||||
#define SRE_OP_IN_LOC_IGNORE 35
|
||||
#define SRE_AT_BEGINNING 0
|
||||
#define SRE_AT_BEGINNING_LINE 1
|
||||
#define SRE_AT_BEGINNING_STRING 2
|
||||
|
|
|
@ -100,6 +100,14 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
|
|||
return 0;
|
||||
}
|
||||
|
||||
LOCAL(int)
|
||||
SRE(char_loc_ignore)(SRE_STATE* state, SRE_CODE pattern, SRE_CODE ch)
|
||||
{
|
||||
return ch == pattern
|
||||
|| (SRE_CODE) state->lower(ch) == pattern
|
||||
|| (SRE_CODE) state->upper(ch) == pattern;
|
||||
}
|
||||
|
||||
LOCAL(int)
|
||||
SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
|
||||
{
|
||||
|
@ -187,6 +195,18 @@ SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
|
|||
}
|
||||
}
|
||||
|
||||
LOCAL(int)
|
||||
SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
|
||||
{
|
||||
SRE_CODE lo, up;
|
||||
lo = state->lower(ch);
|
||||
if (SRE(charset)(state, set, lo))
|
||||
return 1;
|
||||
|
||||
up = state->upper(ch);
|
||||
return up != lo && SRE(charset)(state, set, up);
|
||||
}
|
||||
|
||||
LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all);
|
||||
|
||||
LOCAL(Py_ssize_t)
|
||||
|
@ -247,6 +267,14 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
|
|||
ptr++;
|
||||
break;
|
||||
|
||||
case SRE_OP_LITERAL_LOC_IGNORE:
|
||||
/* repeated literal */
|
||||
chr = pattern[1];
|
||||
TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
|
||||
while (ptr < end && SRE(char_loc_ignore)(state, chr, *ptr))
|
||||
ptr++;
|
||||
break;
|
||||
|
||||
case SRE_OP_NOT_LITERAL:
|
||||
/* repeated non-literal */
|
||||
chr = pattern[1];
|
||||
|
@ -269,6 +297,14 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
|
|||
ptr++;
|
||||
break;
|
||||
|
||||
case SRE_OP_NOT_LITERAL_LOC_IGNORE:
|
||||
/* repeated non-literal */
|
||||
chr = pattern[1];
|
||||
TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
|
||||
while (ptr < end && !SRE(char_loc_ignore)(state, chr, *ptr))
|
||||
ptr++;
|
||||
break;
|
||||
|
||||
default:
|
||||
/* repeated single character pattern */
|
||||
TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
|
||||
|
@ -651,7 +687,17 @@ entrance:
|
|||
TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
|
||||
ctx->pattern, ctx->ptr, ctx->pattern[0]));
|
||||
if (ctx->ptr >= end ||
|
||||
state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
|
||||
state->lower(*ctx->ptr) != *ctx->pattern)
|
||||
RETURN_FAILURE;
|
||||
ctx->pattern++;
|
||||
ctx->ptr++;
|
||||
break;
|
||||
|
||||
case SRE_OP_LITERAL_LOC_IGNORE:
|
||||
TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",
|
||||
ctx->pattern, ctx->ptr, ctx->pattern[0]));
|
||||
if (ctx->ptr >= end
|
||||
|| !SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
|
||||
RETURN_FAILURE;
|
||||
ctx->pattern++;
|
||||
ctx->ptr++;
|
||||
|
@ -661,7 +707,17 @@ entrance:
|
|||
TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
|
||||
ctx->pattern, ctx->ptr, *ctx->pattern));
|
||||
if (ctx->ptr >= end ||
|
||||
state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
|
||||
state->lower(*ctx->ptr) == *ctx->pattern)
|
||||
RETURN_FAILURE;
|
||||
ctx->pattern++;
|
||||
ctx->ptr++;
|
||||
break;
|
||||
|
||||
case SRE_OP_NOT_LITERAL_LOC_IGNORE:
|
||||
TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",
|
||||
ctx->pattern, ctx->ptr, *ctx->pattern));
|
||||
if (ctx->ptr >= end
|
||||
|| SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
|
||||
RETURN_FAILURE;
|
||||
ctx->pattern++;
|
||||
ctx->ptr++;
|
||||
|
@ -677,6 +733,15 @@ entrance:
|
|||
ctx->ptr++;
|
||||
break;
|
||||
|
||||
case SRE_OP_IN_LOC_IGNORE:
|
||||
TRACE(("|%p|%p|IN_LOC_IGNORE\n", ctx->pattern, ctx->ptr));
|
||||
if (ctx->ptr >= end
|
||||
|| !SRE(charset_loc_ignore)(state, ctx->pattern+1, *ctx->ptr))
|
||||
RETURN_FAILURE;
|
||||
ctx->pattern += ctx->pattern[0];
|
||||
ctx->ptr++;
|
||||
break;
|
||||
|
||||
case SRE_OP_JUMP:
|
||||
case SRE_OP_INFO:
|
||||
/* jump forward */
|
||||
|
|
Loading…
Reference in New Issue