SRE 0.9.8: passes the entire test suite
-- reverted REPEAT operator to use "repeat context" strategy (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default
This commit is contained in:
parent
19c6afb42b
commit
29c4ba9ada
13
Lib/sre.py
13
Lib/sre.py
|
@ -5,8 +5,12 @@
|
||||||
#
|
#
|
||||||
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
|
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
|
||||||
#
|
#
|
||||||
|
# This version of the SRE library can be redistributed under CNRI's
|
||||||
|
# Python 1.6 license. For any other use, please contact Secret Labs
|
||||||
|
# AB (info@pythonware.com).
|
||||||
|
#
|
||||||
# Portions of this engine have been developed in cooperation with
|
# Portions of this engine have been developed in cooperation with
|
||||||
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
|
# CNRI. Hewlett-Packard provided funding for 1.6 integration and
|
||||||
# other compatibility work.
|
# other compatibility work.
|
||||||
#
|
#
|
||||||
|
|
||||||
|
@ -24,7 +28,7 @@ M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE
|
||||||
S = DOTALL = sre_compile.SRE_FLAG_DOTALL
|
S = DOTALL = sre_compile.SRE_FLAG_DOTALL
|
||||||
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE
|
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE
|
||||||
|
|
||||||
# sre extensions (may or may not be in 2.0 final)
|
# sre extensions (may or may not be in 1.6/2.0 final)
|
||||||
T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE
|
T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE
|
||||||
U = UNICODE = sre_compile.SRE_FLAG_UNICODE
|
U = UNICODE = sre_compile.SRE_FLAG_UNICODE
|
||||||
|
|
||||||
|
@ -168,15 +172,14 @@ copy_reg.pickle(type(_compile("")), _pickle, _compile)
|
||||||
|
|
||||||
class Scanner:
|
class Scanner:
|
||||||
def __init__(self, lexicon):
|
def __init__(self, lexicon):
|
||||||
from sre_constants import BRANCH, SUBPATTERN, INDEX
|
from sre_constants import BRANCH, SUBPATTERN
|
||||||
self.lexicon = lexicon
|
self.lexicon = lexicon
|
||||||
# combine phrases into a compound pattern
|
# combine phrases into a compound pattern
|
||||||
p = []
|
p = []
|
||||||
s = sre_parse.Pattern()
|
s = sre_parse.Pattern()
|
||||||
for phrase, action in lexicon:
|
for phrase, action in lexicon:
|
||||||
p.append(sre_parse.SubPattern(s, [
|
p.append(sre_parse.SubPattern(s, [
|
||||||
(SUBPATTERN, (None, sre_parse.parse(phrase))),
|
(SUBPATTERN, (len(p), sre_parse.parse(phrase))),
|
||||||
(INDEX, len(p))
|
|
||||||
]))
|
]))
|
||||||
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
|
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
|
||||||
s.groups = len(p)
|
s.groups = len(p)
|
||||||
|
|
|
@ -5,9 +5,7 @@
|
||||||
#
|
#
|
||||||
# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
|
# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
|
||||||
#
|
#
|
||||||
# Portions of this engine have been developed in cooperation with
|
# See the sre.py file for information on usage and redistribution.
|
||||||
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
|
|
||||||
# other compatibility work.
|
|
||||||
#
|
#
|
||||||
|
|
||||||
import _sre
|
import _sre
|
||||||
|
@ -124,6 +122,7 @@ def _compile(code, pattern, flags):
|
||||||
emit(CHCODES[CATEGORY_NOT_LINEBREAK])
|
emit(CHCODES[CATEGORY_NOT_LINEBREAK])
|
||||||
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
|
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
|
||||||
if flags & SRE_FLAG_TEMPLATE:
|
if flags & SRE_FLAG_TEMPLATE:
|
||||||
|
raise error, "internal: unsupported template operator"
|
||||||
emit(OPCODES[REPEAT])
|
emit(OPCODES[REPEAT])
|
||||||
skip = len(code); emit(0)
|
skip = len(code); emit(0)
|
||||||
emit(av[0])
|
emit(av[0])
|
||||||
|
@ -136,9 +135,8 @@ def _compile(code, pattern, flags):
|
||||||
if lo == 0:
|
if lo == 0:
|
||||||
raise error, "nothing to repeat"
|
raise error, "nothing to repeat"
|
||||||
if 0 and lo == hi == 1 and op is MAX_REPEAT:
|
if 0 and lo == hi == 1 and op is MAX_REPEAT:
|
||||||
# FIXME: <fl> need a better way to figure out when
|
# FIXME: <fl> fast and wrong (but we'll fix that)
|
||||||
# it's safe to use this one (in the parser, probably)
|
emit(OPCODES[REPEAT_ONE])
|
||||||
emit(OPCODES[MAX_REPEAT_ONE])
|
|
||||||
skip = len(code); emit(0)
|
skip = len(code); emit(0)
|
||||||
emit(av[0])
|
emit(av[0])
|
||||||
emit(av[1])
|
emit(av[1])
|
||||||
|
@ -146,29 +144,24 @@ def _compile(code, pattern, flags):
|
||||||
emit(OPCODES[SUCCESS])
|
emit(OPCODES[SUCCESS])
|
||||||
code[skip] = len(code) - skip
|
code[skip] = len(code) - skip
|
||||||
else:
|
else:
|
||||||
emit(OPCODES[op])
|
emit(OPCODES[REPEAT])
|
||||||
skip = len(code); emit(0)
|
skip = len(code); emit(0)
|
||||||
emit(av[0])
|
emit(av[0])
|
||||||
emit(av[1])
|
emit(av[1])
|
||||||
mark = MAXCODE
|
|
||||||
if av[2][0][0] == SUBPATTERN:
|
|
||||||
# repeated subpattern
|
|
||||||
gid, foo = av[2][0][1]
|
|
||||||
if gid:
|
|
||||||
mark = (gid-1)*2
|
|
||||||
emit(mark)
|
|
||||||
_compile(code, av[2], flags)
|
_compile(code, av[2], flags)
|
||||||
emit(OPCODES[SUCCESS])
|
|
||||||
code[skip] = len(code) - skip
|
code[skip] = len(code) - skip
|
||||||
|
if op == MAX_REPEAT:
|
||||||
|
emit(OPCODES[MAX_UNTIL])
|
||||||
|
else:
|
||||||
|
emit(OPCODES[MIN_UNTIL])
|
||||||
elif op is SUBPATTERN:
|
elif op is SUBPATTERN:
|
||||||
gid = av[0]
|
if av[0]:
|
||||||
if gid:
|
|
||||||
emit(OPCODES[MARK])
|
emit(OPCODES[MARK])
|
||||||
emit((gid-1)*2)
|
emit((av[0]-1)*2)
|
||||||
_compile(code, av[1], flags)
|
_compile(code, av[1], flags)
|
||||||
if gid:
|
if av[0]:
|
||||||
emit(OPCODES[MARK])
|
emit(OPCODES[MARK])
|
||||||
emit((gid-1)*2+1)
|
emit((av[0]-1)*2+1)
|
||||||
elif op in (SUCCESS, FAILURE):
|
elif op in (SUCCESS, FAILURE):
|
||||||
emit(OPCODES[op])
|
emit(OPCODES[op])
|
||||||
elif op in (ASSERT, ASSERT_NOT):
|
elif op in (ASSERT, ASSERT_NOT):
|
||||||
|
@ -197,11 +190,10 @@ def _compile(code, pattern, flags):
|
||||||
else:
|
else:
|
||||||
emit(ATCODES[av])
|
emit(ATCODES[av])
|
||||||
elif op is BRANCH:
|
elif op is BRANCH:
|
||||||
|
emit(OPCODES[op])
|
||||||
tail = []
|
tail = []
|
||||||
for av in av[1]:
|
for av in av[1]:
|
||||||
emit(OPCODES[op])
|
|
||||||
skip = len(code); emit(0)
|
skip = len(code); emit(0)
|
||||||
emit(MAXCODE) # save mark
|
|
||||||
_compile(code, av, flags)
|
_compile(code, av, flags)
|
||||||
emit(OPCODES[JUMP])
|
emit(OPCODES[JUMP])
|
||||||
tail.append(len(code)); emit(0)
|
tail.append(len(code)); emit(0)
|
||||||
|
@ -223,9 +215,6 @@ def _compile(code, pattern, flags):
|
||||||
else:
|
else:
|
||||||
emit(OPCODES[op])
|
emit(OPCODES[op])
|
||||||
emit(av-1)
|
emit(av-1)
|
||||||
elif op in (MARK, INDEX):
|
|
||||||
emit(OPCODES[op])
|
|
||||||
emit(av)
|
|
||||||
else:
|
else:
|
||||||
raise ValueError, ("unsupported operand type", op)
|
raise ValueError, ("unsupported operand type", op)
|
||||||
|
|
||||||
|
@ -294,16 +283,7 @@ try:
|
||||||
except NameError:
|
except NameError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def compile(p, flags=0):
|
def _compile1(p, flags):
|
||||||
# internal: convert pattern list to internal format
|
|
||||||
|
|
||||||
# compile, as necessary
|
|
||||||
if type(p) in STRING_TYPES:
|
|
||||||
import sre_parse
|
|
||||||
pattern = p
|
|
||||||
p = sre_parse.parse(p, flags)
|
|
||||||
else:
|
|
||||||
pattern = None
|
|
||||||
|
|
||||||
flags = p.pattern.flags | flags
|
flags = p.pattern.flags | flags
|
||||||
code = []
|
code = []
|
||||||
|
@ -316,6 +296,20 @@ def compile(p, flags=0):
|
||||||
|
|
||||||
code.append(OPCODES[SUCCESS])
|
code.append(OPCODES[SUCCESS])
|
||||||
|
|
||||||
|
return code
|
||||||
|
|
||||||
|
def compile(p, flags=0):
|
||||||
|
# internal: convert pattern list to internal format
|
||||||
|
|
||||||
|
if type(p) in STRING_TYPES:
|
||||||
|
import sre_parse
|
||||||
|
pattern = p
|
||||||
|
p = sre_parse.parse(p, flags)
|
||||||
|
else:
|
||||||
|
pattern = None
|
||||||
|
|
||||||
|
code = _compile1(p, flags)
|
||||||
|
|
||||||
# print code
|
# print code
|
||||||
|
|
||||||
# FIXME: <fl> get rid of this limitation!
|
# FIXME: <fl> get rid of this limitation!
|
||||||
|
|
|
@ -6,9 +6,7 @@
|
||||||
#
|
#
|
||||||
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
|
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
|
||||||
#
|
#
|
||||||
# Portions of this engine have been developed in cooperation with
|
# See the sre.py file for information on usage and redistribution.
|
||||||
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
|
|
||||||
# other compatibility work.
|
|
||||||
#
|
#
|
||||||
|
|
||||||
# should this really be here?
|
# should this really be here?
|
||||||
|
@ -33,15 +31,15 @@ GROUPREF = "groupref"
|
||||||
GROUPREF_IGNORE = "groupref_ignore"
|
GROUPREF_IGNORE = "groupref_ignore"
|
||||||
IN = "in"
|
IN = "in"
|
||||||
IN_IGNORE = "in_ignore"
|
IN_IGNORE = "in_ignore"
|
||||||
INDEX = "index"
|
|
||||||
INFO = "info"
|
INFO = "info"
|
||||||
JUMP = "jump"
|
JUMP = "jump"
|
||||||
LITERAL = "literal"
|
LITERAL = "literal"
|
||||||
LITERAL_IGNORE = "literal_ignore"
|
LITERAL_IGNORE = "literal_ignore"
|
||||||
MARK = "mark"
|
MARK = "mark"
|
||||||
MAX_REPEAT = "max_repeat"
|
MAX_REPEAT = "max_repeat"
|
||||||
MAX_REPEAT_ONE = "max_repeat_one"
|
MAX_UNTIL = "max_until"
|
||||||
MIN_REPEAT = "min_repeat"
|
MIN_REPEAT = "min_repeat"
|
||||||
|
MIN_UNTIL = "min_until"
|
||||||
NEGATE = "negate"
|
NEGATE = "negate"
|
||||||
NOT_LITERAL = "not_literal"
|
NOT_LITERAL = "not_literal"
|
||||||
NOT_LITERAL_IGNORE = "not_literal_ignore"
|
NOT_LITERAL_IGNORE = "not_literal_ignore"
|
||||||
|
@ -91,19 +89,19 @@ OPCODES = [
|
||||||
CATEGORY,
|
CATEGORY,
|
||||||
CHARSET,
|
CHARSET,
|
||||||
GROUPREF, GROUPREF_IGNORE,
|
GROUPREF, GROUPREF_IGNORE,
|
||||||
INDEX,
|
|
||||||
IN, IN_IGNORE,
|
IN, IN_IGNORE,
|
||||||
INFO,
|
INFO,
|
||||||
JUMP,
|
JUMP,
|
||||||
LITERAL, LITERAL_IGNORE,
|
LITERAL, LITERAL_IGNORE,
|
||||||
MARK,
|
MARK,
|
||||||
MAX_REPEAT,
|
MAX_UNTIL,
|
||||||
MAX_REPEAT_ONE,
|
MIN_UNTIL,
|
||||||
MIN_REPEAT,
|
|
||||||
NOT_LITERAL, NOT_LITERAL_IGNORE,
|
NOT_LITERAL, NOT_LITERAL_IGNORE,
|
||||||
NEGATE,
|
NEGATE,
|
||||||
RANGE,
|
RANGE,
|
||||||
REPEAT
|
REPEAT,
|
||||||
|
REPEAT_ONE,
|
||||||
|
SUBPATTERN
|
||||||
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -5,9 +5,7 @@
|
||||||
#
|
#
|
||||||
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
|
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
|
||||||
#
|
#
|
||||||
# Portions of this engine have been developed in cooperation with
|
# See the sre.py file for information on usage and redistribution.
|
||||||
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
|
|
||||||
# other compatibility work.
|
|
||||||
#
|
#
|
||||||
|
|
||||||
import string, sys
|
import string, sys
|
||||||
|
@ -536,8 +534,6 @@ def _parse(source, state):
|
||||||
group = state.getgroup(name)
|
group = state.getgroup(name)
|
||||||
p = _parse_sub(source, state)
|
p = _parse_sub(source, state)
|
||||||
subpattern.append((SUBPATTERN, (group, p)))
|
subpattern.append((SUBPATTERN, (group, p)))
|
||||||
if group is not None:
|
|
||||||
p.append((INDEX, group))
|
|
||||||
else:
|
else:
|
||||||
while 1:
|
while 1:
|
||||||
char = source.get()
|
char = source.get()
|
||||||
|
|
785
Modules/_sre.c
785
Modules/_sre.c
File diff suppressed because it is too large
Load Diff
|
@ -1,5 +1,4 @@
|
||||||
/*
|
/*
|
||||||
*
|
|
||||||
* Secret Labs' Regular Expression Engine
|
* Secret Labs' Regular Expression Engine
|
||||||
*
|
*
|
||||||
* regular expression matching engine
|
* regular expression matching engine
|
||||||
|
@ -44,18 +43,15 @@ typedef struct {
|
||||||
|
|
||||||
typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
|
typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
/* stack elements */
|
|
||||||
SRE_CODE* pattern;
|
|
||||||
void* ptr;
|
|
||||||
int mark;
|
|
||||||
void* mark0;
|
|
||||||
void* mark1;
|
|
||||||
} SRE_STACK;
|
|
||||||
|
|
||||||
/* FIXME: <fl> shouldn't be a constant, really... */
|
/* FIXME: <fl> shouldn't be a constant, really... */
|
||||||
#define SRE_MARK_SIZE 200
|
#define SRE_MARK_SIZE 200
|
||||||
|
|
||||||
|
typedef struct SRE_REPEAT_T {
|
||||||
|
int count;
|
||||||
|
SRE_CODE* pattern; /* points to REPEAT operator arguments */
|
||||||
|
struct SRE_REPEAT_T *prev; /* points to previous repeat context */
|
||||||
|
} SRE_REPEAT;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
/* string pointers */
|
/* string pointers */
|
||||||
void* ptr; /* current position (also end of current slice) */
|
void* ptr; /* current position (also end of current slice) */
|
||||||
|
@ -71,16 +67,16 @@ typedef struct {
|
||||||
int lastindex;
|
int lastindex;
|
||||||
int lastmark;
|
int lastmark;
|
||||||
void* mark[SRE_MARK_SIZE];
|
void* mark[SRE_MARK_SIZE];
|
||||||
/* backtracking stack */
|
/* dynamically allocated stuff */
|
||||||
SRE_STACK* stack;
|
void** mark_stack;
|
||||||
int stacksize;
|
int mark_stack_size;
|
||||||
int stackbase;
|
int mark_stack_base;
|
||||||
|
SRE_REPEAT *repeat; /* current repeat context */
|
||||||
/* hooks */
|
/* hooks */
|
||||||
SRE_TOLOWER_HOOK lower;
|
SRE_TOLOWER_HOOK lower;
|
||||||
} SRE_STATE;
|
} SRE_STATE;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
/* scanner (internal helper object) */
|
|
||||||
PyObject_HEAD
|
PyObject_HEAD
|
||||||
PyObject* pattern;
|
PyObject* pattern;
|
||||||
SRE_STATE state;
|
SRE_STATE state;
|
||||||
|
|
|
@ -21,24 +21,24 @@
|
||||||
#define SRE_OP_CALL 7
|
#define SRE_OP_CALL 7
|
||||||
#define SRE_OP_CATEGORY 8
|
#define SRE_OP_CATEGORY 8
|
||||||
#define SRE_OP_CHARSET 9
|
#define SRE_OP_CHARSET 9
|
||||||
#define SRE_OP_GROUP 10
|
#define SRE_OP_GROUPREF 10
|
||||||
#define SRE_OP_GROUP_IGNORE 11
|
#define SRE_OP_GROUPREF_IGNORE 11
|
||||||
#define SRE_OP_INDEX 12
|
#define SRE_OP_IN 12
|
||||||
#define SRE_OP_IN 13
|
#define SRE_OP_IN_IGNORE 13
|
||||||
#define SRE_OP_IN_IGNORE 14
|
#define SRE_OP_INFO 14
|
||||||
#define SRE_OP_INFO 15
|
#define SRE_OP_JUMP 15
|
||||||
#define SRE_OP_JUMP 16
|
#define SRE_OP_LITERAL 16
|
||||||
#define SRE_OP_LITERAL 17
|
#define SRE_OP_LITERAL_IGNORE 17
|
||||||
#define SRE_OP_LITERAL_IGNORE 18
|
#define SRE_OP_MARK 18
|
||||||
#define SRE_OP_MARK 19
|
#define SRE_OP_MAX_UNTIL 19
|
||||||
#define SRE_OP_MAX_REPEAT 20
|
#define SRE_OP_MIN_UNTIL 20
|
||||||
#define SRE_OP_MAX_REPEAT_ONE 21
|
#define SRE_OP_NOT_LITERAL 21
|
||||||
#define SRE_OP_MIN_REPEAT 22
|
#define SRE_OP_NOT_LITERAL_IGNORE 22
|
||||||
#define SRE_OP_NOT_LITERAL 23
|
#define SRE_OP_NEGATE 23
|
||||||
#define SRE_OP_NOT_LITERAL_IGNORE 24
|
#define SRE_OP_RANGE 24
|
||||||
#define SRE_OP_NEGATE 25
|
#define SRE_OP_REPEAT 25
|
||||||
#define SRE_OP_RANGE 26
|
#define SRE_OP_REPEAT_ONE 26
|
||||||
#define SRE_OP_REPEAT 27
|
#define SRE_OP_SUBPATTERN 27
|
||||||
#define SRE_AT_BEGINNING 0
|
#define SRE_AT_BEGINNING 0
|
||||||
#define SRE_AT_BEGINNING_LINE 1
|
#define SRE_AT_BEGINNING_LINE 1
|
||||||
#define SRE_AT_BOUNDARY 2
|
#define SRE_AT_BOUNDARY 2
|
||||||
|
|
Loading…
Reference in New Issue