SRE 0.9.8: passes the entire test suite

-- reverted REPEAT operator to use "repeat context" strategy
   (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default
This commit is contained in:
Fredrik Lundh 2000-08-01 18:20:07 +00:00
parent 19c6afb42b
commit 29c4ba9ada
7 changed files with 391 additions and 557 deletions

View File

@ -5,8 +5,12 @@
#
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
#
# This version of the SRE library can be redistributed under CNRI's
# Python 1.6 license. For any other use, please contact Secret Labs
# AB (info@pythonware.com).
#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
# CNRI. Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work.
#
@ -24,7 +28,7 @@ M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE
S = DOTALL = sre_compile.SRE_FLAG_DOTALL
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE
# sre extensions (may or may not be in 2.0 final)
# sre extensions (may or may not be in 1.6/2.0 final)
T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE
U = UNICODE = sre_compile.SRE_FLAG_UNICODE
@ -168,15 +172,14 @@ copy_reg.pickle(type(_compile("")), _pickle, _compile)
class Scanner:
def __init__(self, lexicon):
from sre_constants import BRANCH, SUBPATTERN, INDEX
from sre_constants import BRANCH, SUBPATTERN
self.lexicon = lexicon
# combine phrases into a compound pattern
p = []
s = sre_parse.Pattern()
for phrase, action in lexicon:
p.append(sre_parse.SubPattern(s, [
(SUBPATTERN, (None, sre_parse.parse(phrase))),
(INDEX, len(p))
(SUBPATTERN, (len(p), sre_parse.parse(phrase))),
]))
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
s.groups = len(p)

View File

@ -5,9 +5,7 @@
#
# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
# other compatibility work.
# See the sre.py file for information on usage and redistribution.
#
import _sre
@ -124,6 +122,7 @@ def _compile(code, pattern, flags):
emit(CHCODES[CATEGORY_NOT_LINEBREAK])
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
if flags & SRE_FLAG_TEMPLATE:
raise error, "internal: unsupported template operator"
emit(OPCODES[REPEAT])
skip = len(code); emit(0)
emit(av[0])
@ -136,9 +135,8 @@ def _compile(code, pattern, flags):
if lo == 0:
raise error, "nothing to repeat"
if 0 and lo == hi == 1 and op is MAX_REPEAT:
# FIXME: <fl> need a better way to figure out when
# it's safe to use this one (in the parser, probably)
emit(OPCODES[MAX_REPEAT_ONE])
# FIXME: <fl> fast and wrong (but we'll fix that)
emit(OPCODES[REPEAT_ONE])
skip = len(code); emit(0)
emit(av[0])
emit(av[1])
@ -146,29 +144,24 @@ def _compile(code, pattern, flags):
emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip
else:
emit(OPCODES[op])
emit(OPCODES[REPEAT])
skip = len(code); emit(0)
emit(av[0])
emit(av[1])
mark = MAXCODE
if av[2][0][0] == SUBPATTERN:
# repeated subpattern
gid, foo = av[2][0][1]
if gid:
mark = (gid-1)*2
emit(mark)
_compile(code, av[2], flags)
emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip
if op == MAX_REPEAT:
emit(OPCODES[MAX_UNTIL])
else:
emit(OPCODES[MIN_UNTIL])
elif op is SUBPATTERN:
gid = av[0]
if gid:
if av[0]:
emit(OPCODES[MARK])
emit((gid-1)*2)
emit((av[0]-1)*2)
_compile(code, av[1], flags)
if gid:
if av[0]:
emit(OPCODES[MARK])
emit((gid-1)*2+1)
emit((av[0]-1)*2+1)
elif op in (SUCCESS, FAILURE):
emit(OPCODES[op])
elif op in (ASSERT, ASSERT_NOT):
@ -197,11 +190,10 @@ def _compile(code, pattern, flags):
else:
emit(ATCODES[av])
elif op is BRANCH:
emit(OPCODES[op])
tail = []
for av in av[1]:
emit(OPCODES[op])
skip = len(code); emit(0)
emit(MAXCODE) # save mark
_compile(code, av, flags)
emit(OPCODES[JUMP])
tail.append(len(code)); emit(0)
@ -223,9 +215,6 @@ def _compile(code, pattern, flags):
else:
emit(OPCODES[op])
emit(av-1)
elif op in (MARK, INDEX):
emit(OPCODES[op])
emit(av)
else:
raise ValueError, ("unsupported operand type", op)
@ -294,16 +283,7 @@ try:
except NameError:
pass
def compile(p, flags=0):
# internal: convert pattern list to internal format
# compile, as necessary
if type(p) in STRING_TYPES:
import sre_parse
pattern = p
p = sre_parse.parse(p, flags)
else:
pattern = None
def _compile1(p, flags):
flags = p.pattern.flags | flags
code = []
@ -316,6 +296,20 @@ def compile(p, flags=0):
code.append(OPCODES[SUCCESS])
return code
def compile(p, flags=0):
# internal: convert pattern list to internal format
if type(p) in STRING_TYPES:
import sre_parse
pattern = p
p = sre_parse.parse(p, flags)
else:
pattern = None
code = _compile1(p, flags)
# print code
# FIXME: <fl> get rid of this limitation!

View File

@ -6,9 +6,7 @@
#
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
# other compatibility work.
# See the sre.py file for information on usage and redistribution.
#
# should this really be here?
@ -33,15 +31,15 @@ GROUPREF = "groupref"
GROUPREF_IGNORE = "groupref_ignore"
IN = "in"
IN_IGNORE = "in_ignore"
INDEX = "index"
INFO = "info"
JUMP = "jump"
LITERAL = "literal"
LITERAL_IGNORE = "literal_ignore"
MARK = "mark"
MAX_REPEAT = "max_repeat"
MAX_REPEAT_ONE = "max_repeat_one"
MAX_UNTIL = "max_until"
MIN_REPEAT = "min_repeat"
MIN_UNTIL = "min_until"
NEGATE = "negate"
NOT_LITERAL = "not_literal"
NOT_LITERAL_IGNORE = "not_literal_ignore"
@ -91,19 +89,19 @@ OPCODES = [
CATEGORY,
CHARSET,
GROUPREF, GROUPREF_IGNORE,
INDEX,
IN, IN_IGNORE,
INFO,
JUMP,
LITERAL, LITERAL_IGNORE,
MARK,
MAX_REPEAT,
MAX_REPEAT_ONE,
MIN_REPEAT,
MAX_UNTIL,
MIN_UNTIL,
NOT_LITERAL, NOT_LITERAL_IGNORE,
NEGATE,
RANGE,
REPEAT
REPEAT,
REPEAT_ONE,
SUBPATTERN
]

View File

@ -5,9 +5,7 @@
#
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
# other compatibility work.
# See the sre.py file for information on usage and redistribution.
#
import string, sys
@ -536,8 +534,6 @@ def _parse(source, state):
group = state.getgroup(name)
p = _parse_sub(source, state)
subpattern.append((SUBPATTERN, (group, p)))
if group is not None:
p.append((INDEX, group))
else:
while 1:
char = source.get()

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,4 @@
/*
*
* Secret Labs' Regular Expression Engine
*
* regular expression matching engine
@ -44,18 +43,15 @@ typedef struct {
typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
typedef struct {
/* stack elements */
SRE_CODE* pattern;
void* ptr;
int mark;
void* mark0;
void* mark1;
} SRE_STACK;
/* FIXME: <fl> shouldn't be a constant, really... */
#define SRE_MARK_SIZE 200
/* Repeat context: one node in a singly linked chain of contexts, linked
   through `prev`.  SRE_STATE keeps a pointer to the current context in
   its `repeat` field. */
typedef struct SRE_REPEAT_T {
int count; /* NOTE(review): presumably the iteration count for this repeat -- confirm against the matcher */
SRE_CODE* pattern; /* points to REPEAT operator arguments */
struct SRE_REPEAT_T *prev; /* points to previous repeat context */
} SRE_REPEAT;
typedef struct {
/* string pointers */
void* ptr; /* current position (also end of current slice) */
@ -71,16 +67,16 @@ typedef struct {
int lastindex;
int lastmark;
void* mark[SRE_MARK_SIZE];
/* backtracking stack */
SRE_STACK* stack;
int stacksize;
int stackbase;
/* dynamically allocated stuff */
void** mark_stack;
int mark_stack_size;
int mark_stack_base;
SRE_REPEAT *repeat; /* current repeat context */
/* hooks */
SRE_TOLOWER_HOOK lower;
} SRE_STATE;
typedef struct {
/* scanner (internal helper object) */
PyObject_HEAD
PyObject* pattern;
SRE_STATE state;

View File

@ -21,24 +21,24 @@
#define SRE_OP_CALL 7
#define SRE_OP_CATEGORY 8
#define SRE_OP_CHARSET 9
#define SRE_OP_GROUP 10
#define SRE_OP_GROUP_IGNORE 11
#define SRE_OP_INDEX 12
#define SRE_OP_IN 13
#define SRE_OP_IN_IGNORE 14
#define SRE_OP_INFO 15
#define SRE_OP_JUMP 16
#define SRE_OP_LITERAL 17
#define SRE_OP_LITERAL_IGNORE 18
#define SRE_OP_MARK 19
#define SRE_OP_MAX_REPEAT 20
#define SRE_OP_MAX_REPEAT_ONE 21
#define SRE_OP_MIN_REPEAT 22
#define SRE_OP_NOT_LITERAL 23
#define SRE_OP_NOT_LITERAL_IGNORE 24
#define SRE_OP_NEGATE 25
#define SRE_OP_RANGE 26
#define SRE_OP_REPEAT 27
#define SRE_OP_GROUPREF 10
#define SRE_OP_GROUPREF_IGNORE 11
#define SRE_OP_IN 12
#define SRE_OP_IN_IGNORE 13
#define SRE_OP_INFO 14
#define SRE_OP_JUMP 15
#define SRE_OP_LITERAL 16
#define SRE_OP_LITERAL_IGNORE 17
#define SRE_OP_MARK 18
#define SRE_OP_MAX_UNTIL 19
#define SRE_OP_MIN_UNTIL 20
#define SRE_OP_NOT_LITERAL 21
#define SRE_OP_NOT_LITERAL_IGNORE 22
#define SRE_OP_NEGATE 23
#define SRE_OP_RANGE 24
#define SRE_OP_REPEAT 25
#define SRE_OP_REPEAT_ONE 26
#define SRE_OP_SUBPATTERN 27
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BOUNDARY 2