-- SRE 0.9.6 sync. this includes:
+ added "regs" attribute + fixed "pos" and "endpos" attributes + reset "lastindex" and "lastgroup" in scanner methods + removed (?P#id) syntax; the "lastindex" and "lastgroup" attributes are now always set + removed string module dependencies in sre_parse + better debugging support in sre_parse + various tweaks to build under 1.5.2
This commit is contained in:
parent
4f1b2081e9
commit
8a3ebf8ca8
33
Lib/sre.py
33
Lib/sre.py
|
@ -10,9 +10,13 @@
|
|||
# other compatibility work.
|
||||
#
|
||||
|
||||
# FIXME: change all FIXME's to XXX ;-)
|
||||
|
||||
import sre_compile
|
||||
import sre_parse
|
||||
|
||||
import string
|
||||
|
||||
# flags
|
||||
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE
|
||||
L = LOCALE = sre_compile.SRE_FLAG_LOCALE
|
||||
|
@ -53,6 +57,9 @@ def findall(pattern, string, maxsplit=0):
|
|||
def compile(pattern, flags=0):
|
||||
return _compile(pattern, flags)
|
||||
|
||||
def purge():
|
||||
_cache.clear()
|
||||
|
||||
def template(pattern, flags=0):
|
||||
return _compile(pattern, flags|T)
|
||||
|
||||
|
@ -65,7 +72,7 @@ def escape(pattern):
|
|||
s[i] = "\\000"
|
||||
else:
|
||||
s[i] = "\\" + c
|
||||
return pattern[:0].join(s)
|
||||
return _join(s, pattern)
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# internals
|
||||
|
@ -73,10 +80,14 @@ def escape(pattern):
|
|||
_cache = {}
|
||||
_MAXCACHE = 100
|
||||
|
||||
def _join(seq, sep):
|
||||
# internal: join into string having the same type as sep
|
||||
return string.join(seq, sep[:0])
|
||||
|
||||
def _compile(pattern, flags=0):
|
||||
# internal: compile pattern
|
||||
tp = type(pattern)
|
||||
if tp not in (type(""), type(u"")):
|
||||
if tp not in sre_compile.STRING_TYPES:
|
||||
return pattern
|
||||
key = (tp, pattern, flags)
|
||||
try:
|
||||
|
@ -89,10 +100,6 @@ def _compile(pattern, flags=0):
|
|||
_cache[key] = p
|
||||
return p
|
||||
|
||||
def purge():
|
||||
# clear pattern cache
|
||||
_cache.clear()
|
||||
|
||||
def _sub(pattern, template, string, count=0):
|
||||
# internal: pattern.sub implementation hook
|
||||
return _subn(pattern, template, string, count)[0]
|
||||
|
@ -120,7 +127,7 @@ def _subn(pattern, template, string, count=0):
|
|||
i = e
|
||||
n = n + 1
|
||||
append(string[i:])
|
||||
return string[:0].join(s), n
|
||||
return _join(s, string[:0]), n
|
||||
|
||||
def _split(pattern, string, maxsplit=0):
|
||||
# internal: pattern.split implementation hook
|
||||
|
@ -161,11 +168,19 @@ copy_reg.pickle(type(_compile("")), _pickle, _compile)
|
|||
|
||||
class Scanner:
|
||||
def __init__(self, lexicon):
|
||||
from sre_constants import BRANCH, SUBPATTERN, INDEX
|
||||
self.lexicon = lexicon
|
||||
# combine phrases into a compound pattern
|
||||
p = []
|
||||
s = sre_parse.Pattern()
|
||||
for phrase, action in lexicon:
|
||||
p.append("(?:%s)(?P#%d)" % (phrase, len(p)))
|
||||
self.scanner = _compile("|".join(p))
|
||||
p.append(sre_parse.SubPattern(s, [
|
||||
(SUBPATTERN, (None, sre_parse.parse(phrase))),
|
||||
(INDEX, len(p))
|
||||
]))
|
||||
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
|
||||
s.groups = len(p)
|
||||
self.scanner = sre_compile.compile(p)
|
||||
def scan(self, string):
|
||||
result = []
|
||||
append = result.append
|
||||
|
|
|
@ -197,10 +197,11 @@ def _compile(code, pattern, flags):
|
|||
else:
|
||||
emit(ATCODES[av])
|
||||
elif op is BRANCH:
|
||||
emit(OPCODES[op])
|
||||
tail = []
|
||||
for av in av[1]:
|
||||
emit(OPCODES[op])
|
||||
skip = len(code); emit(0)
|
||||
emit(MAXCODE) # save mark
|
||||
_compile(code, av, flags)
|
||||
emit(OPCODES[JUMP])
|
||||
tail.append(len(code)); emit(0)
|
||||
|
@ -286,11 +287,18 @@ def _compile_info(code, pattern, flags):
|
|||
emit(OPCODES[FAILURE])
|
||||
code[skip] = len(code) - skip
|
||||
|
||||
STRING_TYPES = [type("")]
|
||||
|
||||
try:
|
||||
STRING_TYPES.append(type(unicode("")))
|
||||
except NameError:
|
||||
pass
|
||||
|
||||
def compile(p, flags=0):
|
||||
# internal: convert pattern list to internal format
|
||||
|
||||
# compile, as necessary
|
||||
if type(p) in (type(""), type(u"")):
|
||||
if type(p) in STRING_TYPES:
|
||||
import sre_parse
|
||||
pattern = p
|
||||
p = sre_parse.parse(p, flags)
|
||||
|
@ -308,6 +316,8 @@ def compile(p, flags=0):
|
|||
|
||||
code.append(OPCODES[SUCCESS])
|
||||
|
||||
# print code
|
||||
|
||||
# FIXME: <fl> get rid of this limitation!
|
||||
assert p.pattern.groups <= 100,\
|
||||
"sorry, but this version only supports 100 named groups"
|
||||
|
|
|
@ -172,7 +172,7 @@ CH_UNICODE = {
|
|||
# flags
|
||||
SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
|
||||
SRE_FLAG_IGNORECASE = 2 # case insensitive
|
||||
SRE_FLAG_LOCALE = 4 # honor system locale
|
||||
SRE_FLAG_LOCALE = 4 # honour system locale
|
||||
SRE_FLAG_MULTILINE = 8 # treat target as multiline string
|
||||
SRE_FLAG_DOTALL = 16 # treat target as a single string
|
||||
SRE_FLAG_UNICODE = 32 # use unicode locale
|
||||
|
|
155
Lib/sre_parse.py
155
Lib/sre_parse.py
|
@ -25,12 +25,12 @@ CHARMASK = 0xff
|
|||
SPECIAL_CHARS = ".\\[{()*+?^$|"
|
||||
REPEAT_CHARS = "*+?{"
|
||||
|
||||
DIGITS = tuple(string.digits)
|
||||
DIGITS = tuple("012345689")
|
||||
|
||||
OCTDIGITS = tuple("01234567")
|
||||
HEXDIGITS = tuple("0123456789abcdefABCDEF")
|
||||
|
||||
WHITESPACE = tuple(string.whitespace)
|
||||
WHITESPACE = tuple(" \t\n\r\v\f")
|
||||
|
||||
ESCAPES = {
|
||||
r"\a": (LITERAL, 7),
|
||||
|
@ -68,7 +68,8 @@ FLAGS = {
|
|||
"u": SRE_FLAG_UNICODE,
|
||||
}
|
||||
|
||||
class State:
|
||||
class Pattern:
|
||||
# master pattern object. keeps track of global attributes
|
||||
def __init__(self):
|
||||
self.flags = 0
|
||||
self.groups = 1
|
||||
|
@ -88,6 +89,33 @@ class SubPattern:
|
|||
data = []
|
||||
self.data = data
|
||||
self.width = None
|
||||
def dump(self, level=0):
|
||||
nl = 1
|
||||
for op, av in self.data:
|
||||
print level*" " + op,; nl = 0
|
||||
if op == "in":
|
||||
# member sublanguage
|
||||
print; nl = 1
|
||||
for op, a in av:
|
||||
print (level+1)*" " + op, a
|
||||
elif op == "branch":
|
||||
print; nl = 1
|
||||
i = 0
|
||||
for a in av[1]:
|
||||
if i > 0:
|
||||
print level*" " + "or"
|
||||
a.dump(level+1); nl = 1
|
||||
i = i + 1
|
||||
elif type(av) in (type(()), type([])):
|
||||
for a in av:
|
||||
if isinstance(a, SubPattern):
|
||||
if not nl: print
|
||||
a.dump(level+1); nl = 1
|
||||
else:
|
||||
print a, ; nl = 0
|
||||
else:
|
||||
print av, ; nl = 0
|
||||
if not nl: print
|
||||
def __repr__(self):
|
||||
return repr(self.data)
|
||||
def __len__(self):
|
||||
|
@ -255,10 +283,25 @@ def _escape(source, escape, state):
|
|||
pass
|
||||
raise error, "bogus escape: %s" % repr(escape)
|
||||
|
||||
def _branch(pattern, items):
|
||||
# form a branch operator from a set of items
|
||||
def _parse_sub(source, state, nested=1):
|
||||
# parse an alternation: a|b|c
|
||||
|
||||
subpattern = SubPattern(pattern)
|
||||
items = []
|
||||
while 1:
|
||||
items.append(_parse(source, state))
|
||||
if source.match("|"):
|
||||
continue
|
||||
if not nested:
|
||||
break
|
||||
if not source.next or source.match(")"):
|
||||
break
|
||||
else:
|
||||
raise error, "pattern not properly closed"
|
||||
|
||||
if len(items) == 1:
|
||||
return items[0]
|
||||
|
||||
subpattern = SubPattern(state)
|
||||
|
||||
# check if all items share a common prefix
|
||||
while 1:
|
||||
|
@ -285,7 +328,7 @@ def _branch(pattern, items):
|
|||
break
|
||||
else:
|
||||
# we can store this as a character set instead of a
|
||||
# branch (FIXME: use a range if possible)
|
||||
# branch (the compiler may optimize this even more)
|
||||
set = []
|
||||
for item in items:
|
||||
set.append(item[0])
|
||||
|
@ -296,8 +339,7 @@ def _branch(pattern, items):
|
|||
return subpattern
|
||||
|
||||
def _parse(source, state):
|
||||
|
||||
# parse regular expression pattern into an operator list.
|
||||
# parse a simple pattern
|
||||
|
||||
subpattern = SubPattern(state)
|
||||
|
||||
|
@ -451,22 +493,6 @@ def _parse(source, state):
|
|||
if gid is None:
|
||||
raise error, "unknown group name"
|
||||
subpattern.append((GROUPREF, gid))
|
||||
elif source.match("#"):
|
||||
index = ""
|
||||
while 1:
|
||||
char = source.get()
|
||||
if char is None:
|
||||
raise error, "unterminated index"
|
||||
if char == ")":
|
||||
break
|
||||
index = index + char
|
||||
try:
|
||||
index = int(index)
|
||||
if index < 0 or index > MAXREPEAT:
|
||||
raise ValueError
|
||||
except ValueError:
|
||||
raise error, "illegal index"
|
||||
subpattern.append((INDEX, index))
|
||||
continue
|
||||
else:
|
||||
char = source.get()
|
||||
|
@ -491,48 +517,27 @@ def _parse(source, state):
|
|||
raise error, "syntax error"
|
||||
dir = -1 # lookbehind
|
||||
char = source.get()
|
||||
b = []
|
||||
while 1:
|
||||
p = _parse(source, state)
|
||||
if source.next == ")":
|
||||
if b:
|
||||
b.append(p)
|
||||
p = _branch(state, b)
|
||||
if char == "=":
|
||||
subpattern.append((ASSERT, (dir, p)))
|
||||
else:
|
||||
subpattern.append((ASSERT_NOT, (dir, p)))
|
||||
break
|
||||
elif source.match("|"):
|
||||
b.append(p)
|
||||
else:
|
||||
raise error, "pattern not properly closed"
|
||||
p = _parse_sub(source, state)
|
||||
if char == "=":
|
||||
subpattern.append((ASSERT, (dir, p)))
|
||||
else:
|
||||
subpattern.append((ASSERT_NOT, (dir, p)))
|
||||
continue
|
||||
else:
|
||||
# flags
|
||||
while FLAGS.has_key(source.next):
|
||||
state.flags = state.flags | FLAGS[source.get()]
|
||||
if group:
|
||||
# parse group contents
|
||||
b = []
|
||||
if group == 2:
|
||||
# anonymous group
|
||||
group = None
|
||||
else:
|
||||
group = state.getgroup(name)
|
||||
while 1:
|
||||
p = _parse(source, state)
|
||||
if group is not None:
|
||||
p.append((INDEX, group))
|
||||
if source.match(")"):
|
||||
if b:
|
||||
b.append(p)
|
||||
p = _branch(state, b)
|
||||
subpattern.append((SUBPATTERN, (group, p)))
|
||||
break
|
||||
elif source.match("|"):
|
||||
b.append(p)
|
||||
else:
|
||||
raise error, "group not properly closed"
|
||||
p = _parse_sub(source, state)
|
||||
subpattern.append((SUBPATTERN, (group, p)))
|
||||
if group is not None:
|
||||
p.append((INDEX, group))
|
||||
else:
|
||||
while 1:
|
||||
char = source.get()
|
||||
|
@ -555,26 +560,24 @@ def _parse(source, state):
|
|||
|
||||
return subpattern
|
||||
|
||||
def parse(pattern, flags=0):
|
||||
def parse(str, flags=0):
|
||||
# parse 're' pattern into list of (opcode, argument) tuples
|
||||
source = Tokenizer(pattern)
|
||||
state = State()
|
||||
state.flags = flags
|
||||
b = []
|
||||
while 1:
|
||||
p = _parse(source, state)
|
||||
tail = source.get()
|
||||
if tail == "|":
|
||||
b.append(p)
|
||||
elif tail == ")":
|
||||
raise error, "unbalanced parenthesis"
|
||||
elif tail is None:
|
||||
if b:
|
||||
b.append(p)
|
||||
p = _branch(state, b)
|
||||
break
|
||||
else:
|
||||
raise error, "bogus characters at end of regular expression"
|
||||
|
||||
source = Tokenizer(str)
|
||||
|
||||
pattern = Pattern()
|
||||
pattern.flags = flags
|
||||
|
||||
p = _parse_sub(source, pattern, 0)
|
||||
|
||||
tail = source.get()
|
||||
if tail == ")":
|
||||
raise error, "unbalanced parenthesis"
|
||||
elif tail:
|
||||
raise error, "bogus characters at end of regular expression"
|
||||
|
||||
# p.dump()
|
||||
|
||||
return p
|
||||
|
||||
def parse_template(source, pattern):
|
||||
|
@ -656,4 +659,4 @@ def expand_template(template, match):
|
|||
if s is None:
|
||||
raise error, "empty group"
|
||||
a(s)
|
||||
return sep.join(p)
|
||||
return string.join(p, sep)
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
test_sre
|
||||
=== Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')
|
||||
=== Failed incorrectly ('(a+)+\\1', 'aa', 0, 'found+"-"+g1', 'aa-a')
|
||||
=== grouping error ('(a)(b)c|ab', 'ab', 0, 'found+"-"+g1+"-"+g2', 'ab-None-None') 'ab-None-b' should be 'ab-None-None'
|
||||
=== grouping error ('(a)+b|aac', 'aac', 0, 'found+"-"+g1', 'aac-None') 'aac-a' should be 'aac-None'
|
||||
=== Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')
|
||||
|
|
2191
Modules/_sre.c
2191
Modules/_sre.c
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,5 @@
|
|||
/*
|
||||
*
|
||||
* Secret Labs' Regular Expression Engine
|
||||
*
|
||||
* regular expression matching engine
|
||||
|
@ -33,6 +34,7 @@ typedef struct {
|
|||
typedef struct {
|
||||
PyObject_VAR_HEAD
|
||||
PyObject* string; /* link to the target string */
|
||||
PyObject* regs; /* cached list of matching spans */
|
||||
PatternObject* pattern; /* link to the regex (pattern) object */
|
||||
int pos, endpos; /* current target slice */
|
||||
int lastindex; /* last index marker seen by the engine (-1 if none) */
|
||||
|
@ -60,6 +62,9 @@ typedef struct {
|
|||
void* beginning; /* start of original string */
|
||||
void* start; /* start of current slice */
|
||||
void* end; /* end of original string */
|
||||
/* attributes for the match object */
|
||||
PyObject* string;
|
||||
int pos, endpos;
|
||||
/* character size */
|
||||
int charsize;
|
||||
/* registers */
|
||||
|
@ -78,7 +83,6 @@ typedef struct {
|
|||
/* scanner (internal helper object) */
|
||||
PyObject_HEAD
|
||||
PyObject* pattern;
|
||||
PyObject* string;
|
||||
SRE_STATE state;
|
||||
} ScannerObject;
|
||||
|
||||
|
|
Loading…
Reference in New Issue