bpo-30215: Make re.compile() locale agnostic. (#1361)

Compiled regular expression objects with the re.LOCALE flag no longer
depend on the locale at compile time.  Only the locale at matching
time affects the result of matching.
This commit is contained in:
Serhiy Storchaka 2017-05-05 08:53:40 +03:00 committed by GitHub
parent 647c3d381e
commit 898ff03e1e
9 changed files with 141 additions and 23 deletions

View File

@ -559,6 +559,11 @@ form.
:const:`re.LOCALE` can be used only with bytes patterns and is
not compatible with :const:`re.ASCII`.
.. versionchanged:: 3.7
Compiled regular expression objects with the :const:`re.LOCALE` flag no
longer depend on the locale at compile time. Only the locale at
matching time affects the result of matching.
.. data:: M
MULTILINE

View File

@ -268,9 +268,7 @@ _MAXCACHE = 512
def _compile(pattern, flags):
# internal: compile pattern
try:
p, loc = _cache[type(pattern), pattern, flags]
if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE):
return p
return _cache[type(pattern), pattern, flags]
except KeyError:
pass
if isinstance(pattern, _pattern_type):
@ -284,13 +282,7 @@ def _compile(pattern, flags):
if not (flags & DEBUG):
if len(_cache) >= _MAXCACHE:
_cache.clear()
if p.flags & LOCALE:
if not _locale:
return p
loc = _locale.setlocale(_locale.LC_CTYPE)
else:
loc = None
_cache[type(pattern), pattern, flags] = p, loc
_cache[type(pattern), pattern, flags] = p
return p
@functools.lru_cache(_MAXCACHE)

View File

@ -78,7 +78,13 @@ def _compile(code, pattern, flags):
fixes = None
for op, av in pattern:
if op in LITERAL_CODES:
if flags & SRE_FLAG_IGNORECASE:
if not flags & SRE_FLAG_IGNORECASE:
emit(op)
emit(av)
elif flags & SRE_FLAG_LOCALE:
emit(OP_LOC_IGNORE[op])
emit(av)
else:
lo = _sre.getlower(av, flags)
if fixes and lo in fixes:
emit(IN_IGNORE)
@ -93,17 +99,17 @@ def _compile(code, pattern, flags):
else:
emit(OP_IGNORE[op])
emit(lo)
else:
emit(op)
emit(av)
elif op is IN:
if flags & SRE_FLAG_IGNORECASE:
emit(OP_IGNORE[op])
def fixup(literal, flags=flags):
return _sre.getlower(literal, flags)
else:
if not flags & SRE_FLAG_IGNORECASE:
emit(op)
fixup = None
elif flags & SRE_FLAG_LOCALE:
emit(IN_LOC_IGNORE)
fixup = None
else:
emit(IN_IGNORE)
def fixup(literal, flags=flags):
return _sre.getlower(literal, flags)
skip = _len(code); emit(0)
_compile_charset(av, flags, code, fixup, fixes)
code[skip] = _len(code) - skip

View File

@ -13,7 +13,7 @@
# update when constants are added or removed
MAGIC = 20140917
MAGIC = 20170530
from _sre import MAXREPEAT, MAXGROUPS
@ -87,6 +87,9 @@ OPCODES = _makecodes("""
SUBPATTERN
MIN_REPEAT_ONE
RANGE_IGNORE
LITERAL_LOC_IGNORE
NOT_LITERAL_LOC_IGNORE
IN_LOC_IGNORE
MIN_REPEAT MAX_REPEAT
""")
@ -124,6 +127,11 @@ OP_IGNORE = {
RANGE: RANGE_IGNORE,
}
# Maps each plain literal opcode to its *_LOC_IGNORE counterpart. The
# compiler indexes this table with the literal opcode when both the
# IGNORECASE and LOCALE flags are set on the pattern.
OP_LOC_IGNORE = {
LITERAL: LITERAL_LOC_IGNORE,
NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
}
AT_MULTILINE = {
AT_BEGINNING: AT_BEGINNING_LINE,
AT_END: AT_END_LINE

View File

@ -1730,6 +1730,38 @@ SUBPATTERN None 0 0
self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
def test_locale_compiled(self):
# Compiles re.LOCALE|re.IGNORECASE byte patterns once, then matches the
# same compiled objects under two different LC_CTYPE settings and checks
# that the case-insensitive results differ between the two locales.
oldlocale = locale.setlocale(locale.LC_CTYPE)
# Restore the LC_CTYPE value captured above when the test finishes.
self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
# Probe both locales up front; the test is skipped if either one cannot
# be set on this system.
for loc in 'en_US.iso88591', 'en_US.utf8':
try:
locale.setlocale(locale.LC_CTYPE, loc)
except locale.Error:
# Unsupported locale on this system
self.skipTest('test needs %s locale' % loc)
# All four patterns are compiled while en_US.iso88591 is the active locale.
locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
# p1: two literals; p2/p3: character sets of different sizes containing
# the same bytes; p4: negated sets.
p1 = re.compile(b'\xc5\xe5', re.L|re.I)
p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
# Under en_US.iso88591 the assertions expect 0xC5 and 0xE5 to match each
# other case-insensitively, in either order.
for p in p1, p2, p3:
self.assertTrue(p.match(b'\xc5\xe5'))
self.assertTrue(p.match(b'\xe5\xe5'))
self.assertTrue(p.match(b'\xc5\xc5'))
# ...so the negated sets reject every combination of the two bytes.
self.assertIsNone(p4.match(b'\xe5\xc5'))
self.assertIsNone(p4.match(b'\xe5\xe5'))
self.assertIsNone(p4.match(b'\xc5\xc5'))
# Switch locale WITHOUT recompiling: p1..p4 are the objects built above.
locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
# Under en_US.utf8 the assertions expect only exact byte matches: 0xC5
# and 0xE5 are no longer treated as the same letter.
for p in p1, p2, p3:
self.assertTrue(p.match(b'\xc5\xe5'))
self.assertIsNone(p.match(b'\xe5\xe5'))
self.assertIsNone(p.match(b'\xc5\xc5'))
# The negated sets now accept the swapped pair but still reject inputs in
# which one position holds exactly the excluded byte.
self.assertTrue(p4.match(b'\xe5\xc5'))
self.assertIsNone(p4.match(b'\xe5\xe5'))
self.assertIsNone(p4.match(b'\xc5\xc5'))
def test_error(self):
with self.assertRaises(re.error) as cm:
re.compile('(\u20ac))')

View File

@ -317,6 +317,10 @@ Extension Modules
Library
-------
- bpo-30215: Compiled regular expression objects with the re.LOCALE flag no
longer depend on the locale at compile time. Only the locale at matching
time affects the result of matching.
- bpo-30185: Avoid KeyboardInterrupt tracebacks in forkserver helper process
when Ctrl-C is received.

View File

@ -1588,6 +1588,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
case SRE_OP_NOT_LITERAL:
case SRE_OP_LITERAL_IGNORE:
case SRE_OP_NOT_LITERAL_IGNORE:
case SRE_OP_LITERAL_LOC_IGNORE:
case SRE_OP_NOT_LITERAL_LOC_IGNORE:
GET_ARG;
/* The arg is just a character, nothing to check */
break;
@ -1625,6 +1627,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
case SRE_OP_IN:
case SRE_OP_IN_IGNORE:
case SRE_OP_IN_LOC_IGNORE:
GET_SKIP;
/* Stop 1 before the end; we check the FAILURE below */
if (!_validate_charset(code, code+skip-2))

View File

@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution.
*/
#define SRE_MAGIC 20140917
#define SRE_MAGIC 20170530
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
@ -45,6 +45,9 @@
#define SRE_OP_SUBPATTERN 30
#define SRE_OP_MIN_REPEAT_ONE 31
#define SRE_OP_RANGE_IGNORE 32
#define SRE_OP_LITERAL_LOC_IGNORE 33
#define SRE_OP_NOT_LITERAL_LOC_IGNORE 34
#define SRE_OP_IN_LOC_IGNORE 35
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2

View File

@ -100,6 +100,14 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
return 0;
}
LOCAL(int)
SRE(char_loc_ignore)(SRE_STATE* state, SRE_CODE pattern, SRE_CODE ch)
{
    /* Case-insensitive comparison of a subject character against a pattern
       literal.  Returns 1 when `ch` equals `pattern` as-is, or after being
       passed through state->lower() or state->upper(); otherwise 0.  Only
       the subject character is converted; `pattern` is compared unchanged. */

    /* Exact match: no conversion callback needs to be invoked. */
    if (ch == pattern)
        return 1;

    /* Try the lower-case form of the subject character first... */
    if ((SRE_CODE) state->lower(ch) == pattern)
        return 1;

    /* ...and fall back to the upper-case form. */
    return (SRE_CODE) state->upper(ch) == pattern;
}
LOCAL(int)
SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
{
@ -187,6 +195,18 @@ SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
}
}
LOCAL(int)
SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
{
    /* Case-insensitive membership test of `ch` in the character set `set`.
       Returns 1 when either state->lower(ch) or state->upper(ch) is accepted
       by SRE(charset); otherwise 0. */
    SRE_CODE lowered, raised;

    lowered = state->lower(ch);
    if (SRE(charset)(state, set, lowered))
        return 1;

    raised = state->upper(ch);
    /* Both conversions gave the same code: the set was already consulted
       for it above, so a second lookup cannot succeed. */
    if (raised == lowered)
        return 0;

    /* Normalise to 0/1 so the result does not depend on the exact non-zero
       value SRE(charset) returns. */
    return SRE(charset)(state, set, raised) != 0;
}
LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all);
LOCAL(Py_ssize_t)
@ -247,6 +267,14 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
ptr++;
break;
case SRE_OP_LITERAL_LOC_IGNORE:
/* repeated literal */
chr = pattern[1];
TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
while (ptr < end && SRE(char_loc_ignore)(state, chr, *ptr))
ptr++;
break;
case SRE_OP_NOT_LITERAL:
/* repeated non-literal */
chr = pattern[1];
@ -269,6 +297,14 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
ptr++;
break;
case SRE_OP_NOT_LITERAL_LOC_IGNORE:
/* repeated non-literal */
chr = pattern[1];
TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
while (ptr < end && !SRE(char_loc_ignore)(state, chr, *ptr))
ptr++;
break;
default:
/* repeated single character pattern */
TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
@ -651,7 +687,17 @@ entrance:
TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
ctx->pattern, ctx->ptr, ctx->pattern[0]));
if (ctx->ptr >= end ||
state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
state->lower(*ctx->ptr) != *ctx->pattern)
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
break;
case SRE_OP_LITERAL_LOC_IGNORE:
TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",
ctx->pattern, ctx->ptr, ctx->pattern[0]));
if (ctx->ptr >= end
|| !SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
@ -661,7 +707,17 @@ entrance:
TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
ctx->pattern, ctx->ptr, *ctx->pattern));
if (ctx->ptr >= end ||
state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
state->lower(*ctx->ptr) == *ctx->pattern)
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
break;
case SRE_OP_NOT_LITERAL_LOC_IGNORE:
TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",
ctx->pattern, ctx->ptr, *ctx->pattern));
if (ctx->ptr >= end
|| SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
@ -677,6 +733,15 @@ entrance:
ctx->ptr++;
break;
case SRE_OP_IN_LOC_IGNORE:
TRACE(("|%p|%p|IN_LOC_IGNORE\n", ctx->pattern, ctx->ptr));
if (ctx->ptr >= end
|| !SRE(charset_loc_ignore)(state, ctx->pattern+1, *ctx->ptr))
RETURN_FAILURE;
ctx->pattern += ctx->pattern[0];
ctx->ptr++;
break;
case SRE_OP_JUMP:
case SRE_OP_INFO:
/* jump forward */