- fixed lookahead assertions (#10, #11, #12)

- untabified sre_constants.py
This commit is contained in:
Fredrik Lundh 2000-06-30 10:41:31 +00:00
parent a4657f736c
commit 43b3b49b5a
6 changed files with 146 additions and 90 deletions

View File

@ -26,52 +26,12 @@ def _compile(code, pattern, flags):
# internal: compile a (sub)pattern
emit = code.append
for op, av in pattern:
if op is ANY:
if flags & SRE_FLAG_DOTALL:
emit(OPCODES[op])
else:
emit(OPCODES[CATEGORY])
emit(CHCODES[CATEGORY_NOT_LINEBREAK])
elif op in (SUCCESS, FAILURE):
emit(OPCODES[op])
elif op is AT:
emit(OPCODES[op])
if flags & SRE_FLAG_MULTILINE:
emit(ATCODES[AT_MULTILINE[av]])
else:
emit(ATCODES[av])
elif op is BRANCH:
emit(OPCODES[op])
tail = []
for av in av[1]:
skip = len(code); emit(0)
_compile(code, av, flags)
emit(OPCODES[JUMP])
tail.append(len(code)); emit(0)
code[skip] = len(code) - skip
emit(0) # end of branch
for tail in tail:
code[tail] = len(code) - tail
elif op is CALL:
emit(OPCODES[op])
skip = len(code); emit(0)
_compile(code, av, flags)
emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip
elif op is CATEGORY:
emit(OPCODES[op])
if flags & SRE_FLAG_LOCALE:
emit(CHCODES[CH_LOCALE[av]])
elif flags & SRE_FLAG_UNICODE:
emit(CHCODES[CH_UNICODE[av]])
else:
emit(CHCODES[av])
elif op is GROUP:
if op in (LITERAL, NOT_LITERAL):
if flags & SRE_FLAG_IGNORECASE:
emit(OPCODES[OP_IGNORE[op]])
else:
emit(OPCODES[op])
emit(av-1)
emit(ord(av))
elif op is IN:
if flags & SRE_FLAG_IGNORECASE:
emit(OPCODES[OP_IGNORE[op]])
@ -101,15 +61,12 @@ def _compile(code, pattern, flags):
raise error, "internal: unsupported set operator"
emit(OPCODES[FAILURE])
code[skip] = len(code) - skip
elif op in (LITERAL, NOT_LITERAL):
if flags & SRE_FLAG_IGNORECASE:
emit(OPCODES[OP_IGNORE[op]])
else:
elif op is ANY:
if flags & SRE_FLAG_DOTALL:
emit(OPCODES[op])
emit(ord(av))
elif op is MARK:
emit(OPCODES[op])
emit(av)
else:
emit(OPCODES[CATEGORY])
emit(CHCODES[CATEGORY_NOT_LINEBREAK])
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
if flags & SRE_FLAG_TEMPLATE:
emit(OPCODES[REPEAT])
@ -150,6 +107,49 @@ def _compile(code, pattern, flags):
if group:
emit(OPCODES[MARK])
emit((group-1)*2+1)
elif op in (SUCCESS, FAILURE):
emit(OPCODES[op])
elif op in (ASSERT, ASSERT_NOT, CALL):
emit(OPCODES[op])
skip = len(code); emit(0)
_compile(code, av, flags)
emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip
elif op is AT:
emit(OPCODES[op])
if flags & SRE_FLAG_MULTILINE:
emit(ATCODES[AT_MULTILINE[av]])
else:
emit(ATCODES[av])
elif op is BRANCH:
emit(OPCODES[op])
tail = []
for av in av[1]:
skip = len(code); emit(0)
_compile(code, av, flags)
emit(OPCODES[JUMP])
tail.append(len(code)); emit(0)
code[skip] = len(code) - skip
emit(0) # end of branch
for tail in tail:
code[tail] = len(code) - tail
elif op is CATEGORY:
emit(OPCODES[op])
if flags & SRE_FLAG_LOCALE:
emit(CHCODES[CH_LOCALE[av]])
elif flags & SRE_FLAG_UNICODE:
emit(CHCODES[CH_UNICODE[av]])
else:
emit(CHCODES[av])
elif op is GROUP:
if flags & SRE_FLAG_IGNORECASE:
emit(OPCODES[OP_IGNORE[op]])
else:
emit(OPCODES[op])
emit(av-1)
elif op is MARK:
emit(OPCODES[op])
emit(av)
else:
raise ValueError, ("unsupported operand type", op)

View File

@ -23,6 +23,7 @@ SUCCESS = "success"
ANY = "any"
ASSERT = "assert"
ASSERT_NOT = "assert_not"
AT = "at"
BRANCH = "branch"
CALL = "call"
@ -81,7 +82,7 @@ OPCODES = [
FAILURE, SUCCESS,
ANY,
ASSERT,
ASSERT, ASSERT_NOT,
AT,
BRANCH,
CALL,
@ -121,8 +122,8 @@ def makedict(list):
d = {}
i = 0
for item in list:
d[item] = i
i = i + 1
d[item] = i
i = i + 1
return d
OPCODES = makedict(OPCODES)
@ -176,12 +177,27 @@ SRE_FLAG_VERBOSE = 64
if __name__ == "__main__":
import string
def dump(f, d, prefix):
items = d.items()
items.sort(lambda a, b: cmp(a[1], b[1]))
for k, v in items:
f.write("#define %s_%s %s\n" % (prefix, string.upper(k), v))
items = d.items()
items.sort(lambda a, b: cmp(a[1], b[1]))
for k, v in items:
f.write("#define %s_%s %s\n" % (prefix, string.upper(k), v))
f = open("sre_constants.h", "w")
f.write("/* generated from sre_constants.py */\n")
f.write("""\
/*
* Secret Labs' Regular Expression Engine
*
* regular expression matching engine
*
* NOTE: This file is generated by sre_constants.py. If you need
* to change anything in here, edit sre_constants.py and run it.
*
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
*
* See the _sre.c file for information on usage and redistribution.
*/
""")
dump(f, OPCODES, "SRE_OP")
dump(f, ATCODES, "SRE")
dump(f, CHCODES, "SRE")

View File

@ -470,6 +470,25 @@ def _parse(source, state, flags=0):
if source.next is None or source.next == ")":
break
source.get()
elif source.next in ("=", "!"):
# lookahead assertions
char = source.get()
b = []
while 1:
p = _parse(source, state, flags)
if source.next == ")":
if b:
b.append(p)
p = _branch(state, b)
if char == "=":
subpattern.append((ASSERT, p))
else:
subpattern.append((ASSERT_NOT, p))
break
elif source.match("|"):
b.append(p)
else:
raise error, "pattern not properly closed"
else:
# flags
while FLAGS.has_key(source.next):

View File

@ -6,7 +6,4 @@ test_support -- test failed re module cPickle
=== grouping error ('([^/]*/)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', 0, 'found+"-"+g1', 'd:msgs/tdir/sub1/-tdir/') 'd:msgs/tdir/sub1/-trial/' should be 'd:msgs/tdir/sub1/-tdir/'
=== grouping error ('([abc])*bcd', 'abcd', 0, 'found+"-"+g1', 'abcd-a') 'abcd-c' should be 'abcd-a'
=== grouping error ('(?i)([abc])*bcd', 'ABCD', 0, 'found+"-"+g1', 'ABCD-A') 'ABCD-C' should be 'ABCD-A'
=== Syntax error: ('a(?!b).', 'abad', 0, 'found', 'ad')
=== Syntax error: ('a(?=d).', 'abad', 0, 'found', 'ad')
=== Syntax error: ('a(?=c|d).', 'abad', 0, 'found', 'ad')
=== Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')

View File

@ -20,6 +20,7 @@
* 00-06-28 fl fixed findall (0.9.1)
* 00-06-29 fl fixed split, added more scanner features (0.9.2)
* 00-06-30 fl tuning, fast search (0.9.3)
* 00-06-30 fl added assert (lookahead) primitives (0.9.4)
*
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
*
@ -30,7 +31,7 @@
#ifndef SRE_RECURSIVE
char copyright[] = " SRE 0.9.3 Copyright (c) 1997-2000 by Secret Labs AB ";
char copyright[] = " SRE 0.9.4 Copyright (c) 1997-2000 by Secret Labs AB ";
#include "Python.h"
@ -576,11 +577,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
pattern += pattern[0];
break;
#if 0
case SRE_OP_CALL:
/* match subpattern, without backtracking */
case SRE_OP_ASSERT:
/* assert subpattern */
/* args: <skip> <pattern> */
TRACE(("%8d: subpattern\n", PTR(ptr)));
TRACE(("%8d: assert subpattern\n", PTR(ptr)));
state->ptr = ptr;
i = SRE_MATCH(state, pattern + 1);
if (i < 0)
@ -588,9 +588,20 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
if (!i)
goto failure;
pattern += pattern[0];
ptr = state->ptr;
break;
#endif
case SRE_OP_ASSERT_NOT:
/* assert not subpattern */
/* args: <skip> <pattern> */
TRACE(("%8d: assert not subpattern\n", PTR(ptr)));
state->ptr = ptr;
i = SRE_MATCH(state, pattern + 1);
if (i < 0)
return i;
if (i)
goto failure;
pattern += pattern[0];
break;
#if 0
case SRE_OP_MAX_REPEAT_ONE:

View File

@ -1,29 +1,42 @@
/* generated from sre_constants.py */
/*
* Secret Labs' Regular Expression Engine
*
* regular expression matching engine
*
* NOTE: This file is generated by sre_constants.py. If you need
* to change anything in here, edit sre_constants.py and run it.
*
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
*
* See the _sre.c file for information on usage and redistribution.
*/
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
#define SRE_OP_ASSERT 3
#define SRE_OP_AT 4
#define SRE_OP_BRANCH 5
#define SRE_OP_CALL 6
#define SRE_OP_CATEGORY 7
#define SRE_OP_GROUP 8
#define SRE_OP_GROUP_IGNORE 9
#define SRE_OP_IN 10
#define SRE_OP_IN_IGNORE 11
#define SRE_OP_INFO 12
#define SRE_OP_JUMP 13
#define SRE_OP_LITERAL 14
#define SRE_OP_LITERAL_IGNORE 15
#define SRE_OP_MARK 16
#define SRE_OP_MAX_REPEAT 17
#define SRE_OP_MAX_REPEAT_ONE 18
#define SRE_OP_MIN_REPEAT 19
#define SRE_OP_NOT_LITERAL 20
#define SRE_OP_NOT_LITERAL_IGNORE 21
#define SRE_OP_NEGATE 22
#define SRE_OP_RANGE 23
#define SRE_OP_REPEAT 24
#define SRE_OP_ASSERT_NOT 4
#define SRE_OP_AT 5
#define SRE_OP_BRANCH 6
#define SRE_OP_CALL 7
#define SRE_OP_CATEGORY 8
#define SRE_OP_GROUP 9
#define SRE_OP_GROUP_IGNORE 10
#define SRE_OP_IN 11
#define SRE_OP_IN_IGNORE 12
#define SRE_OP_INFO 13
#define SRE_OP_JUMP 14
#define SRE_OP_LITERAL 15
#define SRE_OP_LITERAL_IGNORE 16
#define SRE_OP_MARK 17
#define SRE_OP_MAX_REPEAT 18
#define SRE_OP_MAX_REPEAT_ONE 19
#define SRE_OP_MIN_REPEAT 20
#define SRE_OP_NOT_LITERAL 21
#define SRE_OP_NOT_LITERAL_IGNORE 22
#define SRE_OP_NEGATE 23
#define SRE_OP_RANGE 24
#define SRE_OP_REPEAT 25
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BOUNDARY 2