SRE 0.9.8: passes the entire test suite

-- reverted the REPEAT operator to use the "repeat context" strategy
   (from 0.8.X), but implemented correctly this time.
-- got rid of the backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-) )
-- properly reset state in scanner mode
-- don't use aggressive inlining by default
This commit is contained in:
Fredrik Lundh 2000-08-01 18:20:07 +00:00
parent 19c6afb42b
commit 29c4ba9ada
7 changed files with 391 additions and 557 deletions

View File

@ -5,8 +5,12 @@
# #
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved. # Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
# #
# This version of the SRE library can be redistributed under CNRI's
# Python 1.6 license. For any other use, please contact Secret Labs
# AB (info@pythonware.com).
#
# Portions of this engine have been developed in cooperation with # Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 2.0 integration and # CNRI. Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work. # other compatibility work.
# #
@ -24,7 +28,7 @@ M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE
S = DOTALL = sre_compile.SRE_FLAG_DOTALL S = DOTALL = sre_compile.SRE_FLAG_DOTALL
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE
# sre extensions (may or may not be in 2.0 final) # sre extensions (may or may not be in 1.6/2.0 final)
T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE
U = UNICODE = sre_compile.SRE_FLAG_UNICODE U = UNICODE = sre_compile.SRE_FLAG_UNICODE
@ -168,15 +172,14 @@ copy_reg.pickle(type(_compile("")), _pickle, _compile)
class Scanner: class Scanner:
def __init__(self, lexicon): def __init__(self, lexicon):
from sre_constants import BRANCH, SUBPATTERN, INDEX from sre_constants import BRANCH, SUBPATTERN
self.lexicon = lexicon self.lexicon = lexicon
# combine phrases into a compound pattern # combine phrases into a compound pattern
p = [] p = []
s = sre_parse.Pattern() s = sre_parse.Pattern()
for phrase, action in lexicon: for phrase, action in lexicon:
p.append(sre_parse.SubPattern(s, [ p.append(sre_parse.SubPattern(s, [
(SUBPATTERN, (None, sre_parse.parse(phrase))), (SUBPATTERN, (len(p), sre_parse.parse(phrase))),
(INDEX, len(p))
])) ]))
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
s.groups = len(p) s.groups = len(p)

View File

@ -5,9 +5,7 @@
# #
# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. # Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
# #
# Portions of this engine have been developed in cooperation with # See the sre.py file for information on usage and redistribution.
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
# other compatibility work.
# #
import _sre import _sre
@ -124,6 +122,7 @@ def _compile(code, pattern, flags):
emit(CHCODES[CATEGORY_NOT_LINEBREAK]) emit(CHCODES[CATEGORY_NOT_LINEBREAK])
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
if flags & SRE_FLAG_TEMPLATE: if flags & SRE_FLAG_TEMPLATE:
raise error, "internal: unsupported template operator"
emit(OPCODES[REPEAT]) emit(OPCODES[REPEAT])
skip = len(code); emit(0) skip = len(code); emit(0)
emit(av[0]) emit(av[0])
@ -136,9 +135,8 @@ def _compile(code, pattern, flags):
if lo == 0: if lo == 0:
raise error, "nothing to repeat" raise error, "nothing to repeat"
if 0 and lo == hi == 1 and op is MAX_REPEAT: if 0 and lo == hi == 1 and op is MAX_REPEAT:
# FIXME: <fl> need a better way to figure out when # FIXME: <fl> fast and wrong (but we'll fix that)
# it's safe to use this one (in the parser, probably) emit(OPCODES[REPEAT_ONE])
emit(OPCODES[MAX_REPEAT_ONE])
skip = len(code); emit(0) skip = len(code); emit(0)
emit(av[0]) emit(av[0])
emit(av[1]) emit(av[1])
@ -146,29 +144,24 @@ def _compile(code, pattern, flags):
emit(OPCODES[SUCCESS]) emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip code[skip] = len(code) - skip
else: else:
emit(OPCODES[op]) emit(OPCODES[REPEAT])
skip = len(code); emit(0) skip = len(code); emit(0)
emit(av[0]) emit(av[0])
emit(av[1]) emit(av[1])
mark = MAXCODE
if av[2][0][0] == SUBPATTERN:
# repeated subpattern
gid, foo = av[2][0][1]
if gid:
mark = (gid-1)*2
emit(mark)
_compile(code, av[2], flags) _compile(code, av[2], flags)
emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip code[skip] = len(code) - skip
if op == MAX_REPEAT:
emit(OPCODES[MAX_UNTIL])
else:
emit(OPCODES[MIN_UNTIL])
elif op is SUBPATTERN: elif op is SUBPATTERN:
gid = av[0] if av[0]:
if gid:
emit(OPCODES[MARK]) emit(OPCODES[MARK])
emit((gid-1)*2) emit((av[0]-1)*2)
_compile(code, av[1], flags) _compile(code, av[1], flags)
if gid: if av[0]:
emit(OPCODES[MARK]) emit(OPCODES[MARK])
emit((gid-1)*2+1) emit((av[0]-1)*2+1)
elif op in (SUCCESS, FAILURE): elif op in (SUCCESS, FAILURE):
emit(OPCODES[op]) emit(OPCODES[op])
elif op in (ASSERT, ASSERT_NOT): elif op in (ASSERT, ASSERT_NOT):
@ -197,11 +190,10 @@ def _compile(code, pattern, flags):
else: else:
emit(ATCODES[av]) emit(ATCODES[av])
elif op is BRANCH: elif op is BRANCH:
emit(OPCODES[op])
tail = [] tail = []
for av in av[1]: for av in av[1]:
emit(OPCODES[op])
skip = len(code); emit(0) skip = len(code); emit(0)
emit(MAXCODE) # save mark
_compile(code, av, flags) _compile(code, av, flags)
emit(OPCODES[JUMP]) emit(OPCODES[JUMP])
tail.append(len(code)); emit(0) tail.append(len(code)); emit(0)
@ -223,9 +215,6 @@ def _compile(code, pattern, flags):
else: else:
emit(OPCODES[op]) emit(OPCODES[op])
emit(av-1) emit(av-1)
elif op in (MARK, INDEX):
emit(OPCODES[op])
emit(av)
else: else:
raise ValueError, ("unsupported operand type", op) raise ValueError, ("unsupported operand type", op)
@ -294,16 +283,7 @@ try:
except NameError: except NameError:
pass pass
def compile(p, flags=0): def _compile1(p, flags):
# internal: convert pattern list to internal format
# compile, as necessary
if type(p) in STRING_TYPES:
import sre_parse
pattern = p
p = sre_parse.parse(p, flags)
else:
pattern = None
flags = p.pattern.flags | flags flags = p.pattern.flags | flags
code = [] code = []
@ -316,6 +296,20 @@ def compile(p, flags=0):
code.append(OPCODES[SUCCESS]) code.append(OPCODES[SUCCESS])
return code
def compile(p, flags=0):
# internal: convert pattern list to internal format
if type(p) in STRING_TYPES:
import sre_parse
pattern = p
p = sre_parse.parse(p, flags)
else:
pattern = None
code = _compile1(p, flags)
# print code # print code
# FIXME: <fl> get rid of this limitation! # FIXME: <fl> get rid of this limitation!

View File

@ -6,9 +6,7 @@
# #
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved. # Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
# #
# Portions of this engine have been developed in cooperation with # See the sre.py file for information on usage and redistribution.
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
# other compatibility work.
# #
# should this really be here? # should this really be here?
@ -33,15 +31,15 @@ GROUPREF = "groupref"
GROUPREF_IGNORE = "groupref_ignore" GROUPREF_IGNORE = "groupref_ignore"
IN = "in" IN = "in"
IN_IGNORE = "in_ignore" IN_IGNORE = "in_ignore"
INDEX = "index"
INFO = "info" INFO = "info"
JUMP = "jump" JUMP = "jump"
LITERAL = "literal" LITERAL = "literal"
LITERAL_IGNORE = "literal_ignore" LITERAL_IGNORE = "literal_ignore"
MARK = "mark" MARK = "mark"
MAX_REPEAT = "max_repeat" MAX_REPEAT = "max_repeat"
MAX_REPEAT_ONE = "max_repeat_one" MAX_UNTIL = "max_until"
MIN_REPEAT = "min_repeat" MIN_REPEAT = "min_repeat"
MIN_UNTIL = "min_until"
NEGATE = "negate" NEGATE = "negate"
NOT_LITERAL = "not_literal" NOT_LITERAL = "not_literal"
NOT_LITERAL_IGNORE = "not_literal_ignore" NOT_LITERAL_IGNORE = "not_literal_ignore"
@ -91,19 +89,19 @@ OPCODES = [
CATEGORY, CATEGORY,
CHARSET, CHARSET,
GROUPREF, GROUPREF_IGNORE, GROUPREF, GROUPREF_IGNORE,
INDEX,
IN, IN_IGNORE, IN, IN_IGNORE,
INFO, INFO,
JUMP, JUMP,
LITERAL, LITERAL_IGNORE, LITERAL, LITERAL_IGNORE,
MARK, MARK,
MAX_REPEAT, MAX_UNTIL,
MAX_REPEAT_ONE, MIN_UNTIL,
MIN_REPEAT,
NOT_LITERAL, NOT_LITERAL_IGNORE, NOT_LITERAL, NOT_LITERAL_IGNORE,
NEGATE, NEGATE,
RANGE, RANGE,
REPEAT REPEAT,
REPEAT_ONE,
SUBPATTERN
] ]

View File

@ -5,9 +5,7 @@
# #
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved. # Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
# #
# Portions of this engine have been developed in cooperation with # See the sre.py file for information on usage and redistribution.
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
# other compatibility work.
# #
import string, sys import string, sys
@ -536,8 +534,6 @@ def _parse(source, state):
group = state.getgroup(name) group = state.getgroup(name)
p = _parse_sub(source, state) p = _parse_sub(source, state)
subpattern.append((SUBPATTERN, (group, p))) subpattern.append((SUBPATTERN, (group, p)))
if group is not None:
p.append((INDEX, group))
else: else:
while 1: while 1:
char = source.get() char = source.get()

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,4 @@
/* /*
*
* Secret Labs' Regular Expression Engine * Secret Labs' Regular Expression Engine
* *
* regular expression matching engine * regular expression matching engine
@ -44,18 +43,15 @@ typedef struct {
typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch); typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
typedef struct {
/* stack elements */
SRE_CODE* pattern;
void* ptr;
int mark;
void* mark0;
void* mark1;
} SRE_STACK;
/* FIXME: <fl> shouldn't be a constant, really... */ /* FIXME: <fl> shouldn't be a constant, really... */
#define SRE_MARK_SIZE 200 #define SRE_MARK_SIZE 200
typedef struct SRE_REPEAT_T {
int count;
SRE_CODE* pattern; /* points to REPEAT operator arguments */
struct SRE_REPEAT_T *prev; /* points to previous repeat context */
} SRE_REPEAT;
typedef struct { typedef struct {
/* string pointers */ /* string pointers */
void* ptr; /* current position (also end of current slice) */ void* ptr; /* current position (also end of current slice) */
@ -71,16 +67,16 @@ typedef struct {
int lastindex; int lastindex;
int lastmark; int lastmark;
void* mark[SRE_MARK_SIZE]; void* mark[SRE_MARK_SIZE];
/* backtracking stack */ /* dynamically allocated stuff */
SRE_STACK* stack; void** mark_stack;
int stacksize; int mark_stack_size;
int stackbase; int mark_stack_base;
SRE_REPEAT *repeat; /* current repeat context */
/* hooks */ /* hooks */
SRE_TOLOWER_HOOK lower; SRE_TOLOWER_HOOK lower;
} SRE_STATE; } SRE_STATE;
typedef struct { typedef struct {
/* scanner (internal helper object) */
PyObject_HEAD PyObject_HEAD
PyObject* pattern; PyObject* pattern;
SRE_STATE state; SRE_STATE state;

View File

@ -21,24 +21,24 @@
#define SRE_OP_CALL 7 #define SRE_OP_CALL 7
#define SRE_OP_CATEGORY 8 #define SRE_OP_CATEGORY 8
#define SRE_OP_CHARSET 9 #define SRE_OP_CHARSET 9
#define SRE_OP_GROUP 10 #define SRE_OP_GROUPREF 10
#define SRE_OP_GROUP_IGNORE 11 #define SRE_OP_GROUPREF_IGNORE 11
#define SRE_OP_INDEX 12 #define SRE_OP_IN 12
#define SRE_OP_IN 13 #define SRE_OP_IN_IGNORE 13
#define SRE_OP_IN_IGNORE 14 #define SRE_OP_INFO 14
#define SRE_OP_INFO 15 #define SRE_OP_JUMP 15
#define SRE_OP_JUMP 16 #define SRE_OP_LITERAL 16
#define SRE_OP_LITERAL 17 #define SRE_OP_LITERAL_IGNORE 17
#define SRE_OP_LITERAL_IGNORE 18 #define SRE_OP_MARK 18
#define SRE_OP_MARK 19 #define SRE_OP_MAX_UNTIL 19
#define SRE_OP_MAX_REPEAT 20 #define SRE_OP_MIN_UNTIL 20
#define SRE_OP_MAX_REPEAT_ONE 21 #define SRE_OP_NOT_LITERAL 21
#define SRE_OP_MIN_REPEAT 22 #define SRE_OP_NOT_LITERAL_IGNORE 22
#define SRE_OP_NOT_LITERAL 23 #define SRE_OP_NEGATE 23
#define SRE_OP_NOT_LITERAL_IGNORE 24 #define SRE_OP_RANGE 24
#define SRE_OP_NEGATE 25 #define SRE_OP_REPEAT 25
#define SRE_OP_RANGE 26 #define SRE_OP_REPEAT_ONE 26
#define SRE_OP_REPEAT 27 #define SRE_OP_SUBPATTERN 27
#define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BOUNDARY 2 #define SRE_AT_BOUNDARY 2