sre 2.1b2 update:
- take locale into account for word boundary anchors (#410271)
- restored 2.0's *? behaviour (#233283, #408936 and others)
- speed up re.sub/re.subn
This commit is contained in:
parent
8e9972c215
commit
b25e1ad253
22
Lib/sre.py
22
Lib/sre.py
|
@ -23,6 +23,8 @@ __all__ = [ "match", "search", "sub", "subn", "split", "findall",
|
|||
"U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
|
||||
"UNICODE", "error" ]
|
||||
|
||||
__version__ = "2.1b2"
|
||||
|
||||
# this module works under 1.5.2 and later. don't use string methods
|
||||
import string
|
||||
|
||||
|
@ -90,6 +92,7 @@ def compile(pattern, flags=0):
|
|||
def purge():
|
||||
"Clear the regular expression cache"
|
||||
_cache.clear()
|
||||
_cache_repl.clear()
|
||||
|
||||
def template(pattern, flags=0):
|
||||
"Compile a template pattern, returning a pattern object"
|
||||
|
@ -111,6 +114,8 @@ def escape(pattern):
|
|||
# internals
|
||||
|
||||
_cache = {}
|
||||
_cache_repl = {}
|
||||
|
||||
_MAXCACHE = 100
|
||||
|
||||
def _join(seq, sep):
|
||||
|
@ -134,6 +139,21 @@ def _compile(*key):
|
|||
_cache[key] = p
|
||||
return p
|
||||
|
||||
def _compile_repl(*key):
|
||||
# internal: compile replacement pattern
|
||||
p = _cache_repl.get(key)
|
||||
if p is not None:
|
||||
return p
|
||||
repl, pattern = key
|
||||
try:
|
||||
p = sre_parse.parse_template(repl, pattern)
|
||||
except error, v:
|
||||
raise error, v # invalid expression
|
||||
if len(_cache_repl) >= _MAXCACHE:
|
||||
_cache_repl.clear()
|
||||
_cache_repl[key] = p
|
||||
return p
|
||||
|
||||
def _expand(pattern, match, template):
|
||||
# internal: match.expand implementation hook
|
||||
template = sre_parse.parse_template(template, pattern)
|
||||
|
@ -148,7 +168,7 @@ def _subn(pattern, template, string, count=0):
|
|||
if callable(template):
|
||||
filter = template
|
||||
else:
|
||||
template = sre_parse.parse_template(template, pattern)
|
||||
template = _compile_repl(template, pattern)
|
||||
def filter(match, template=template):
|
||||
return sre_parse.expand_template(template, match)
|
||||
n = i = 0
|
||||
|
|
|
@ -105,8 +105,11 @@ def _compile(code, pattern, flags):
|
|||
elif op is AT:
|
||||
emit(OPCODES[op])
|
||||
if flags & SRE_FLAG_MULTILINE:
|
||||
emit(ATCODES[AT_MULTILINE.get(av, av)])
|
||||
else:
|
||||
av = AT_MULTILINE.get(av, av)
|
||||
if flags & SRE_FLAG_LOCALE:
|
||||
av = AT_LOCALE.get(av, av)
|
||||
elif flags & SRE_FLAG_UNICODE:
|
||||
av = AT_UNICODE.get(av, av)
|
||||
emit(ATCODES[av])
|
||||
elif op is BRANCH:
|
||||
emit(OPCODES[op])
|
||||
|
@ -124,10 +127,9 @@ def _compile(code, pattern, flags):
|
|||
elif op is CATEGORY:
|
||||
emit(OPCODES[op])
|
||||
if flags & SRE_FLAG_LOCALE:
|
||||
emit(CHCODES[CH_LOCALE[av]])
|
||||
av = CH_LOCALE[av]
|
||||
elif flags & SRE_FLAG_UNICODE:
|
||||
emit(CHCODES[CH_UNICODE[av]])
|
||||
else:
|
||||
av = CH_UNICODE[av]
|
||||
emit(CHCODES[av])
|
||||
elif op is GROUPREF:
|
||||
if flags & SRE_FLAG_IGNORECASE:
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
|
||||
# update when constants are added or removed
|
||||
|
||||
MAGIC = 20010115
|
||||
MAGIC = 20010320
|
||||
|
||||
# max code word in this release
|
||||
|
||||
|
@ -67,6 +67,10 @@ AT_NON_BOUNDARY = "at_non_boundary"
|
|||
AT_END = "at_end"
|
||||
AT_END_LINE = "at_end_line"
|
||||
AT_END_STRING = "at_end_string"
|
||||
AT_LOC_BOUNDARY = "at_loc_boundary"
|
||||
AT_LOC_NON_BOUNDARY = "at_loc_non_boundary"
|
||||
AT_UNI_BOUNDARY = "at_uni_boundary"
|
||||
AT_UNI_NON_BOUNDARY = "at_uni_non_boundary"
|
||||
|
||||
# categories
|
||||
CATEGORY_DIGIT = "category_digit"
|
||||
|
@ -119,7 +123,9 @@ OPCODES = [
|
|||
|
||||
ATCODES = [
|
||||
AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY,
|
||||
AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING
|
||||
AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING,
|
||||
AT_LOC_BOUNDARY, AT_LOC_NON_BOUNDARY, AT_UNI_BOUNDARY,
|
||||
AT_UNI_NON_BOUNDARY
|
||||
]
|
||||
|
||||
CHCODES = [
|
||||
|
@ -157,6 +163,16 @@ AT_MULTILINE = {
|
|||
AT_END: AT_END_LINE
|
||||
}
|
||||
|
||||
AT_LOCALE = {
|
||||
AT_BOUNDARY: AT_LOC_BOUNDARY,
|
||||
AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY
|
||||
}
|
||||
|
||||
AT_UNICODE = {
|
||||
AT_BOUNDARY: AT_UNI_BOUNDARY,
|
||||
AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY
|
||||
}
|
||||
|
||||
CH_LOCALE = {
|
||||
CATEGORY_DIGIT: CATEGORY_DIGIT,
|
||||
CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT,
|
||||
|
|
|
@ -638,6 +638,16 @@ def parse_template(source, pattern):
|
|||
s = Tokenizer(source)
|
||||
p = []
|
||||
a = p.append
|
||||
def literal(literal, p=p):
|
||||
if p and p[-1][0] is LITERAL:
|
||||
p[-1] = LITERAL, p[-1][1] + literal
|
||||
else:
|
||||
p.append((LITERAL, literal))
|
||||
sep = source[:0]
|
||||
if type(sep) is type(""):
|
||||
char = chr
|
||||
else:
|
||||
char = unichr
|
||||
while 1:
|
||||
this = s.get()
|
||||
if this is None:
|
||||
|
@ -681,33 +691,42 @@ def parse_template(source, pattern):
|
|||
break
|
||||
if not code:
|
||||
this = this[1:]
|
||||
code = LITERAL, atoi(this[-6:], 8) & 0xff
|
||||
code = LITERAL, char(atoi(this[-6:], 8) & 0xff)
|
||||
if code[0] is LITERAL:
|
||||
literal(code[1])
|
||||
else:
|
||||
a(code)
|
||||
else:
|
||||
try:
|
||||
a(ESCAPES[this])
|
||||
this = char(ESCAPES[this][1])
|
||||
except KeyError:
|
||||
for c in this:
|
||||
a((LITERAL, ord(c)))
|
||||
pass
|
||||
literal(this)
|
||||
else:
|
||||
a((LITERAL, ord(this)))
|
||||
return p
|
||||
literal(this)
|
||||
# convert template to groups and literals lists
|
||||
i = 0
|
||||
groups = []
|
||||
literals = []
|
||||
for c, s in p:
|
||||
if c is MARK:
|
||||
groups.append((i, s))
|
||||
literals.append(None)
|
||||
else:
|
||||
literals.append(s)
|
||||
i = i + 1
|
||||
return groups, literals
|
||||
|
||||
def expand_template(template, match):
|
||||
# XXX: <fl> this is sooooo slow. drop in the slicelist code instead
|
||||
p = []
|
||||
a = p.append
|
||||
g = match.group
|
||||
sep = match.string[:0]
|
||||
if type(sep) is type(""):
|
||||
char = chr
|
||||
else:
|
||||
char = unichr
|
||||
for c, s in template:
|
||||
if c is LITERAL:
|
||||
a(char(s))
|
||||
elif c is MARK:
|
||||
s = match.group(s)
|
||||
groups, literals = template
|
||||
literals = literals[:]
|
||||
try:
|
||||
for index, group in groups:
|
||||
literals[index] = s = g(group)
|
||||
if s is None:
|
||||
raise IndexError
|
||||
except IndexError:
|
||||
raise error, "empty group"
|
||||
a(s)
|
||||
return string.join(p, sep)
|
||||
return string.join(literals, sep)
|
||||
|
|
|
@ -639,3 +639,14 @@ xyzabc
|
|||
# bug 130748: ^* should be an error (nothing to repeat)
|
||||
(r'^*', '', SYNTAX_ERROR),
|
||||
]
|
||||
|
||||
try:
|
||||
u = eval("u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}'")
|
||||
except SyntaxError:
|
||||
pass
|
||||
else:
|
||||
tests.extend([
|
||||
# bug 410271: \b broken under locales
|
||||
(r'\b.\b', 'a', SUCCEED, 'found', 'a'),
|
||||
(r'(?u)\b.\b', u, SUCCEED, 'found', u),
|
||||
])
|
||||
|
|
|
@ -329,6 +329,8 @@ for t in tests:
|
|||
u = unicode(s, "latin-1")
|
||||
except NameError:
|
||||
pass
|
||||
except TypeError:
|
||||
continue # skip unicode test strings
|
||||
else:
|
||||
result=obj.search(u)
|
||||
if result==None:
|
||||
|
|
|
@ -24,8 +24,9 @@
|
|||
* 2000-10-24 fl really fixed assert_not; reset groups in findall
|
||||
* 2000-12-21 fl fixed memory leak in groupdict
|
||||
* 2001-01-02 fl properly reset pointer after failed assertion in MIN_UNTIL
|
||||
* 2001-01-15 fl avoid recursion for MIN_UTIL; fixed uppercase literal bug
|
||||
* 2001-01-15 fl avoid recursion for MIN_UNTIL; fixed uppercase literal bug
|
||||
* 2001-01-16 fl fixed memory leak in pattern destructor
|
||||
* 2001-03-20 fl lots of fixes for 2.1b2
|
||||
*
|
||||
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
|
||||
*
|
||||
|
@ -40,7 +41,7 @@
|
|||
|
||||
#ifndef SRE_RECURSIVE
|
||||
|
||||
char copyright[] = " SRE 2.1 Copyright (c) 1997-2001 by Secret Labs AB ";
|
||||
char copyright[] = " SRE 2.1b2 Copyright (c) 1997-2001 by Secret Labs AB ";
|
||||
|
||||
#include "Python.h"
|
||||
|
||||
|
@ -141,11 +142,6 @@ static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
|
|||
106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
|
||||
120, 121, 122, 123, 124, 125, 126, 127 };
|
||||
|
||||
static unsigned int sre_lower(unsigned int ch)
|
||||
{
|
||||
return ((ch) < 128 ? sre_char_lower[ch] : ch);
|
||||
}
|
||||
|
||||
#define SRE_IS_DIGIT(ch)\
|
||||
((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
|
||||
#define SRE_IS_SPACE(ch)\
|
||||
|
@ -157,30 +153,39 @@ static unsigned int sre_lower(unsigned int ch)
|
|||
#define SRE_IS_WORD(ch)\
|
||||
((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
|
||||
|
||||
static unsigned int sre_lower(unsigned int ch)
|
||||
{
|
||||
return ((ch) < 128 ? sre_char_lower[ch] : ch);
|
||||
}
|
||||
|
||||
/* locale-specific character predicates */
|
||||
|
||||
static unsigned int sre_lower_locale(unsigned int ch)
|
||||
{
|
||||
return ((ch) < 256 ? tolower((ch)) : ch);
|
||||
}
|
||||
#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
|
||||
#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
|
||||
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
|
||||
#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
|
||||
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
|
||||
|
||||
static unsigned int sre_lower_locale(unsigned int ch)
|
||||
{
|
||||
return ((ch) < 256 ? tolower((ch)) : ch);
|
||||
}
|
||||
|
||||
/* unicode-specific character predicates */
|
||||
|
||||
#if defined(HAVE_UNICODE)
|
||||
static unsigned int sre_lower_unicode(unsigned int ch)
|
||||
{
|
||||
return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
|
||||
}
|
||||
|
||||
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
|
||||
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
|
||||
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
|
||||
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
|
||||
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
|
||||
|
||||
static unsigned int sre_lower_unicode(unsigned int ch)
|
||||
{
|
||||
return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
LOCAL(int)
|
||||
|
@ -418,6 +423,42 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
|
|||
this = ((void*) ptr < state->end) ?
|
||||
SRE_IS_WORD((int) ptr[0]) : 0;
|
||||
return this == that;
|
||||
|
||||
case SRE_AT_LOC_BOUNDARY:
|
||||
if (state->beginning == state->end)
|
||||
return 0;
|
||||
that = ((void*) ptr > state->beginning) ?
|
||||
SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
|
||||
this = ((void*) ptr < state->end) ?
|
||||
SRE_LOC_IS_WORD((int) ptr[0]) : 0;
|
||||
return this != that;
|
||||
|
||||
case SRE_AT_LOC_NON_BOUNDARY:
|
||||
if (state->beginning == state->end)
|
||||
return 0;
|
||||
that = ((void*) ptr > state->beginning) ?
|
||||
SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
|
||||
this = ((void*) ptr < state->end) ?
|
||||
SRE_LOC_IS_WORD((int) ptr[0]) : 0;
|
||||
return this == that;
|
||||
|
||||
case SRE_AT_UNI_BOUNDARY:
|
||||
if (state->beginning == state->end)
|
||||
return 0;
|
||||
that = ((void*) ptr > state->beginning) ?
|
||||
SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
|
||||
this = ((void*) ptr < state->end) ?
|
||||
SRE_UNI_IS_WORD((int) ptr[0]) : 0;
|
||||
return this != that;
|
||||
|
||||
case SRE_AT_UNI_NON_BOUNDARY:
|
||||
if (state->beginning == state->end)
|
||||
return 0;
|
||||
that = ((void*) ptr > state->beginning) ?
|
||||
SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
|
||||
this = ((void*) ptr < state->end) ?
|
||||
SRE_UNI_IS_WORD((int) ptr[0]) : 0;
|
||||
return this == that;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -1037,7 +1078,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
|
|||
|
||||
/* see if the tail matches */
|
||||
state->repeat = rp->prev;
|
||||
if (rp->pattern[2] == 65535) {
|
||||
/* FIXME: the following fix doesn't always work (#133283) */
|
||||
if (0 && rp->pattern[2] == 65535) {
|
||||
/* unbounded repeat */
|
||||
for (;;) {
|
||||
i = SRE_MATCH(state, pattern, level + 1);
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
* See the _sre.c file for information on usage and redistribution.
|
||||
*/
|
||||
|
||||
#define SRE_MAGIC 20010115
|
||||
#define SRE_MAGIC 20010320
|
||||
#define SRE_OP_FAILURE 0
|
||||
#define SRE_OP_SUCCESS 1
|
||||
#define SRE_OP_ANY 2
|
||||
|
@ -49,6 +49,10 @@
|
|||
#define SRE_AT_END 5
|
||||
#define SRE_AT_END_LINE 6
|
||||
#define SRE_AT_END_STRING 7
|
||||
#define SRE_AT_LOC_BOUNDARY 8
|
||||
#define SRE_AT_LOC_NON_BOUNDARY 9
|
||||
#define SRE_AT_UNI_BOUNDARY 10
|
||||
#define SRE_AT_UNI_NON_BOUNDARY 11
|
||||
#define SRE_CATEGORY_DIGIT 0
|
||||
#define SRE_CATEGORY_NOT_DIGIT 1
|
||||
#define SRE_CATEGORY_SPACE 2
|
||||
|
|
Loading…
Reference in New Issue