SRE 0.9.8: passes the entire test suite
-- reverted REPEAT operator to use "repeat context" strategy (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default
This commit is contained in:
parent
19c6afb42b
commit
29c4ba9ada
13
Lib/sre.py
13
Lib/sre.py
|
@ -5,8 +5,12 @@
|
|||
#
|
||||
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
|
||||
#
|
||||
# This version of the SRE library can be redistributed under CNRI's
|
||||
# Python 1.6 license. For any other use, please contact Secret Labs
|
||||
# AB (info@pythonware.com).
|
||||
#
|
||||
# Portions of this engine have been developed in cooperation with
|
||||
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
|
||||
# CNRI. Hewlett-Packard provided funding for 1.6 integration and
|
||||
# other compatibility work.
|
||||
#
|
||||
|
||||
|
@ -24,7 +28,7 @@ M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE
|
|||
S = DOTALL = sre_compile.SRE_FLAG_DOTALL
|
||||
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE
|
||||
|
||||
# sre extensions (may or may not be in 2.0 final)
|
||||
# sre extensions (may or may not be in 1.6/2.0 final)
|
||||
T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE
|
||||
U = UNICODE = sre_compile.SRE_FLAG_UNICODE
|
||||
|
||||
|
@ -168,15 +172,14 @@ copy_reg.pickle(type(_compile("")), _pickle, _compile)
|
|||
|
||||
class Scanner:
|
||||
def __init__(self, lexicon):
|
||||
from sre_constants import BRANCH, SUBPATTERN, INDEX
|
||||
from sre_constants import BRANCH, SUBPATTERN
|
||||
self.lexicon = lexicon
|
||||
# combine phrases into a compound pattern
|
||||
p = []
|
||||
s = sre_parse.Pattern()
|
||||
for phrase, action in lexicon:
|
||||
p.append(sre_parse.SubPattern(s, [
|
||||
(SUBPATTERN, (None, sre_parse.parse(phrase))),
|
||||
(INDEX, len(p))
|
||||
(SUBPATTERN, (len(p), sre_parse.parse(phrase))),
|
||||
]))
|
||||
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
|
||||
s.groups = len(p)
|
||||
|
|
|
@ -5,9 +5,7 @@
|
|||
#
|
||||
# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
|
||||
#
|
||||
# Portions of this engine have been developed in cooperation with
|
||||
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
|
||||
# other compatibility work.
|
||||
# See the sre.py file for information on usage and redistribution.
|
||||
#
|
||||
|
||||
import _sre
|
||||
|
@ -124,6 +122,7 @@ def _compile(code, pattern, flags):
|
|||
emit(CHCODES[CATEGORY_NOT_LINEBREAK])
|
||||
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
|
||||
if flags & SRE_FLAG_TEMPLATE:
|
||||
raise error, "internal: unsupported template operator"
|
||||
emit(OPCODES[REPEAT])
|
||||
skip = len(code); emit(0)
|
||||
emit(av[0])
|
||||
|
@ -136,9 +135,8 @@ def _compile(code, pattern, flags):
|
|||
if lo == 0:
|
||||
raise error, "nothing to repeat"
|
||||
if 0 and lo == hi == 1 and op is MAX_REPEAT:
|
||||
# FIXME: <fl> need a better way to figure out when
|
||||
# it's safe to use this one (in the parser, probably)
|
||||
emit(OPCODES[MAX_REPEAT_ONE])
|
||||
# FIXME: <fl> fast and wrong (but we'll fix that)
|
||||
emit(OPCODES[REPEAT_ONE])
|
||||
skip = len(code); emit(0)
|
||||
emit(av[0])
|
||||
emit(av[1])
|
||||
|
@ -146,29 +144,24 @@ def _compile(code, pattern, flags):
|
|||
emit(OPCODES[SUCCESS])
|
||||
code[skip] = len(code) - skip
|
||||
else:
|
||||
emit(OPCODES[op])
|
||||
emit(OPCODES[REPEAT])
|
||||
skip = len(code); emit(0)
|
||||
emit(av[0])
|
||||
emit(av[1])
|
||||
mark = MAXCODE
|
||||
if av[2][0][0] == SUBPATTERN:
|
||||
# repeated subpattern
|
||||
gid, foo = av[2][0][1]
|
||||
if gid:
|
||||
mark = (gid-1)*2
|
||||
emit(mark)
|
||||
_compile(code, av[2], flags)
|
||||
emit(OPCODES[SUCCESS])
|
||||
code[skip] = len(code) - skip
|
||||
if op == MAX_REPEAT:
|
||||
emit(OPCODES[MAX_UNTIL])
|
||||
else:
|
||||
emit(OPCODES[MIN_UNTIL])
|
||||
elif op is SUBPATTERN:
|
||||
gid = av[0]
|
||||
if gid:
|
||||
if av[0]:
|
||||
emit(OPCODES[MARK])
|
||||
emit((gid-1)*2)
|
||||
emit((av[0]-1)*2)
|
||||
_compile(code, av[1], flags)
|
||||
if gid:
|
||||
if av[0]:
|
||||
emit(OPCODES[MARK])
|
||||
emit((gid-1)*2+1)
|
||||
emit((av[0]-1)*2+1)
|
||||
elif op in (SUCCESS, FAILURE):
|
||||
emit(OPCODES[op])
|
||||
elif op in (ASSERT, ASSERT_NOT):
|
||||
|
@ -197,11 +190,10 @@ def _compile(code, pattern, flags):
|
|||
else:
|
||||
emit(ATCODES[av])
|
||||
elif op is BRANCH:
|
||||
emit(OPCODES[op])
|
||||
tail = []
|
||||
for av in av[1]:
|
||||
emit(OPCODES[op])
|
||||
skip = len(code); emit(0)
|
||||
emit(MAXCODE) # save mark
|
||||
_compile(code, av, flags)
|
||||
emit(OPCODES[JUMP])
|
||||
tail.append(len(code)); emit(0)
|
||||
|
@ -223,9 +215,6 @@ def _compile(code, pattern, flags):
|
|||
else:
|
||||
emit(OPCODES[op])
|
||||
emit(av-1)
|
||||
elif op in (MARK, INDEX):
|
||||
emit(OPCODES[op])
|
||||
emit(av)
|
||||
else:
|
||||
raise ValueError, ("unsupported operand type", op)
|
||||
|
||||
|
@ -294,16 +283,7 @@ try:
|
|||
except NameError:
|
||||
pass
|
||||
|
||||
def compile(p, flags=0):
|
||||
# internal: convert pattern list to internal format
|
||||
|
||||
# compile, as necessary
|
||||
if type(p) in STRING_TYPES:
|
||||
import sre_parse
|
||||
pattern = p
|
||||
p = sre_parse.parse(p, flags)
|
||||
else:
|
||||
pattern = None
|
||||
def _compile1(p, flags):
|
||||
|
||||
flags = p.pattern.flags | flags
|
||||
code = []
|
||||
|
@ -316,6 +296,20 @@ def compile(p, flags=0):
|
|||
|
||||
code.append(OPCODES[SUCCESS])
|
||||
|
||||
return code
|
||||
|
||||
def compile(p, flags=0):
|
||||
# internal: convert pattern list to internal format
|
||||
|
||||
if type(p) in STRING_TYPES:
|
||||
import sre_parse
|
||||
pattern = p
|
||||
p = sre_parse.parse(p, flags)
|
||||
else:
|
||||
pattern = None
|
||||
|
||||
code = _compile1(p, flags)
|
||||
|
||||
# print code
|
||||
|
||||
# FIXME: <fl> get rid of this limitation!
|
||||
|
|
|
@ -6,9 +6,7 @@
|
|||
#
|
||||
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
|
||||
#
|
||||
# Portions of this engine have been developed in cooperation with
|
||||
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
|
||||
# other compatibility work.
|
||||
# See the sre.py file for information on usage and redistribution.
|
||||
#
|
||||
|
||||
# should this really be here?
|
||||
|
@ -33,15 +31,15 @@ GROUPREF = "groupref"
|
|||
GROUPREF_IGNORE = "groupref_ignore"
|
||||
IN = "in"
|
||||
IN_IGNORE = "in_ignore"
|
||||
INDEX = "index"
|
||||
INFO = "info"
|
||||
JUMP = "jump"
|
||||
LITERAL = "literal"
|
||||
LITERAL_IGNORE = "literal_ignore"
|
||||
MARK = "mark"
|
||||
MAX_REPEAT = "max_repeat"
|
||||
MAX_REPEAT_ONE = "max_repeat_one"
|
||||
MAX_UNTIL = "max_until"
|
||||
MIN_REPEAT = "min_repeat"
|
||||
MIN_UNTIL = "min_until"
|
||||
NEGATE = "negate"
|
||||
NOT_LITERAL = "not_literal"
|
||||
NOT_LITERAL_IGNORE = "not_literal_ignore"
|
||||
|
@ -91,19 +89,19 @@ OPCODES = [
|
|||
CATEGORY,
|
||||
CHARSET,
|
||||
GROUPREF, GROUPREF_IGNORE,
|
||||
INDEX,
|
||||
IN, IN_IGNORE,
|
||||
INFO,
|
||||
JUMP,
|
||||
LITERAL, LITERAL_IGNORE,
|
||||
MARK,
|
||||
MAX_REPEAT,
|
||||
MAX_REPEAT_ONE,
|
||||
MIN_REPEAT,
|
||||
MAX_UNTIL,
|
||||
MIN_UNTIL,
|
||||
NOT_LITERAL, NOT_LITERAL_IGNORE,
|
||||
NEGATE,
|
||||
RANGE,
|
||||
REPEAT
|
||||
REPEAT,
|
||||
REPEAT_ONE,
|
||||
SUBPATTERN
|
||||
|
||||
]
|
||||
|
||||
|
|
|
@ -5,9 +5,7 @@
|
|||
#
|
||||
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
|
||||
#
|
||||
# Portions of this engine have been developed in cooperation with
|
||||
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
|
||||
# other compatibility work.
|
||||
# See the sre.py file for information on usage and redistribution.
|
||||
#
|
||||
|
||||
import string, sys
|
||||
|
@ -536,8 +534,6 @@ def _parse(source, state):
|
|||
group = state.getgroup(name)
|
||||
p = _parse_sub(source, state)
|
||||
subpattern.append((SUBPATTERN, (group, p)))
|
||||
if group is not None:
|
||||
p.append((INDEX, group))
|
||||
else:
|
||||
while 1:
|
||||
char = source.get()
|
||||
|
|
779
Modules/_sre.c
779
Modules/_sre.c
File diff suppressed because it is too large
Load Diff
|
@ -1,5 +1,4 @@
|
|||
/*
|
||||
*
|
||||
* Secret Labs' Regular Expression Engine
|
||||
*
|
||||
* regular expression matching engine
|
||||
|
@ -44,18 +43,15 @@ typedef struct {
|
|||
|
||||
typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
|
||||
|
||||
typedef struct {
|
||||
/* stack elements */
|
||||
SRE_CODE* pattern;
|
||||
void* ptr;
|
||||
int mark;
|
||||
void* mark0;
|
||||
void* mark1;
|
||||
} SRE_STACK;
|
||||
|
||||
/* FIXME: <fl> shouldn't be a constant, really... */
|
||||
#define SRE_MARK_SIZE 200
|
||||
|
||||
typedef struct SRE_REPEAT_T {
|
||||
int count;
|
||||
SRE_CODE* pattern; /* points to REPEAT operator arguments */
|
||||
struct SRE_REPEAT_T *prev; /* points to previous repeat context */
|
||||
} SRE_REPEAT;
|
||||
|
||||
typedef struct {
|
||||
/* string pointers */
|
||||
void* ptr; /* current position (also end of current slice) */
|
||||
|
@ -71,16 +67,16 @@ typedef struct {
|
|||
int lastindex;
|
||||
int lastmark;
|
||||
void* mark[SRE_MARK_SIZE];
|
||||
/* backtracking stack */
|
||||
SRE_STACK* stack;
|
||||
int stacksize;
|
||||
int stackbase;
|
||||
/* dynamically allocated stuff */
|
||||
void** mark_stack;
|
||||
int mark_stack_size;
|
||||
int mark_stack_base;
|
||||
SRE_REPEAT *repeat; /* current repeat context */
|
||||
/* hooks */
|
||||
SRE_TOLOWER_HOOK lower;
|
||||
} SRE_STATE;
|
||||
|
||||
typedef struct {
|
||||
/* scanner (internal helper object) */
|
||||
PyObject_HEAD
|
||||
PyObject* pattern;
|
||||
SRE_STATE state;
|
||||
|
|
|
@ -21,24 +21,24 @@
|
|||
#define SRE_OP_CALL 7
|
||||
#define SRE_OP_CATEGORY 8
|
||||
#define SRE_OP_CHARSET 9
|
||||
#define SRE_OP_GROUP 10
|
||||
#define SRE_OP_GROUP_IGNORE 11
|
||||
#define SRE_OP_INDEX 12
|
||||
#define SRE_OP_IN 13
|
||||
#define SRE_OP_IN_IGNORE 14
|
||||
#define SRE_OP_INFO 15
|
||||
#define SRE_OP_JUMP 16
|
||||
#define SRE_OP_LITERAL 17
|
||||
#define SRE_OP_LITERAL_IGNORE 18
|
||||
#define SRE_OP_MARK 19
|
||||
#define SRE_OP_MAX_REPEAT 20
|
||||
#define SRE_OP_MAX_REPEAT_ONE 21
|
||||
#define SRE_OP_MIN_REPEAT 22
|
||||
#define SRE_OP_NOT_LITERAL 23
|
||||
#define SRE_OP_NOT_LITERAL_IGNORE 24
|
||||
#define SRE_OP_NEGATE 25
|
||||
#define SRE_OP_RANGE 26
|
||||
#define SRE_OP_REPEAT 27
|
||||
#define SRE_OP_GROUPREF 10
|
||||
#define SRE_OP_GROUPREF_IGNORE 11
|
||||
#define SRE_OP_IN 12
|
||||
#define SRE_OP_IN_IGNORE 13
|
||||
#define SRE_OP_INFO 14
|
||||
#define SRE_OP_JUMP 15
|
||||
#define SRE_OP_LITERAL 16
|
||||
#define SRE_OP_LITERAL_IGNORE 17
|
||||
#define SRE_OP_MARK 18
|
||||
#define SRE_OP_MAX_UNTIL 19
|
||||
#define SRE_OP_MIN_UNTIL 20
|
||||
#define SRE_OP_NOT_LITERAL 21
|
||||
#define SRE_OP_NOT_LITERAL_IGNORE 22
|
||||
#define SRE_OP_NEGATE 23
|
||||
#define SRE_OP_RANGE 24
|
||||
#define SRE_OP_REPEAT 25
|
||||
#define SRE_OP_REPEAT_ONE 26
|
||||
#define SRE_OP_SUBPATTERN 27
|
||||
#define SRE_AT_BEGINNING 0
|
||||
#define SRE_AT_BEGINNING_LINE 1
|
||||
#define SRE_AT_BOUNDARY 2
|
||||
|
|
Loading…
Reference in New Issue