SF patch #720991 by Gary Herron:

A small fix for bug #545855 and Greg Chapman's
addition of op code SRE_OP_MIN_REPEAT_ONE for
eliminating recursion on simple uses of pattern '*?' on a
long string.
This commit is contained in:
Guido van Rossum 2003-04-14 17:59:34 +00:00
parent 44c62ef5ee
commit 41c99e7f96
7 changed files with 89 additions and 4 deletions

View File

@ -55,8 +55,11 @@ def _compile(code, pattern, flags):
_compile(code, av[2], flags)
emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip
elif _simple(av) and op == MAX_REPEAT:
elif _simple(av) and op != REPEAT:
if op == MAX_REPEAT:
emit(OPCODES[REPEAT_ONE])
else:
emit(OPCODES[MIN_REPEAT_ONE])
skip = len(code); emit(0)
emit(av[0])
emit(av[1])

View File

@ -60,6 +60,7 @@ RANGE = "range"
REPEAT = "repeat"
REPEAT_ONE = "repeat_one"
SUBPATTERN = "subpattern"
MIN_REPEAT_ONE = "min_repeat_one"
# positions
AT_BEGINNING = "at_beginning"
@ -120,7 +121,8 @@ OPCODES = [
RANGE,
REPEAT,
REPEAT_ONE,
SUBPATTERN
SUBPATTERN,
MIN_REPEAT_ONE
]

View File

@ -419,7 +419,7 @@ def _parse(source, state):
set.append(code1)
set.append((LITERAL, ord("-")))
break
else:
elif this:
if this[0] == "\\":
code2 = _class_escape(source, this)
else:
@ -431,6 +431,8 @@ def _parse(source, state):
if hi < lo:
raise error, "bad character range"
set.append((RANGE, (lo, hi)))
else:
raise error, "unexpected end of regular expression"
else:
if code1[0] is IN:
code1 = code1[1][0]

View File

@ -83,6 +83,19 @@ test(r"""sre.match(r'(a)?a','a').lastindex""", None)
test(r"""sre.match(r'(a)(b)?b','ab').lastindex""", 1)
test(r"""sre.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup""", 'a')
# bug 545855 -- This pattern failed to cause a compile error as it
# should, instead provoking a TypeError.
test(r"""sre.compile('foo[a-')""", None, sre.error)
# bugs 418626 at al. -- Testing Greg Chapman's addition of op code
# SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
# pattern '*?' on a long string.
test(r"""sre.match('.*?c', 10000*'ab'+'cd').end(0)""", 20001)
test(r"""sre.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0)""", 20003)
test(r"""sre.match('.*?cd', 20000*'abc'+'de').end(0)""", 60001)
# non-simple '*?' still recurses and hits the recursion limit
test(r"""sre.search('(a|b)*?c', 10000*'ab'+'cd').end(0)""", None, RuntimeError)
if verbose:
print 'Running tests on sre.sub'

View File

@ -54,6 +54,10 @@ Core and builtins
Extension modules
-----------------
- The .*? pattern in the re module is now special-cased to avoid the
recursion limit. (SF patch #720991 -- many thanks to Gary Herron
and Greg Chapman.)
- New function sys.call_tracing() allows pdb to debug code
recursively.

View File

@ -993,6 +993,66 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
}
return 0;
case SRE_OP_MIN_REPEAT_ONE:
/* match repeated sequence (minimizing regexp) */
/* this operator only works if the repeated item is
exactly one character wide, and we're not already
collecting backtracking points. for other cases,
use the MIN_REPEAT operator */
/* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", pattern, ptr,
pattern[1], pattern[2]));
if (ptr + pattern[1] > end)
return 0; /* cannot match */
state->ptr = ptr;
if (pattern[1] == 0)
count = 0;
else {
/* count using pattern min as the maximum */
count = SRE_COUNT(state, pattern + 3, pattern[1], level + 1);
if (count < 0)
return count; /* exception */
if (count < (int) pattern[1])
return 0; /* did not match minimum number of times */
ptr += count; /* advance past minimum matches of repeat */
}
if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
/* tail is empty. we're finished */
state->ptr = ptr;
return 1;
} else {
/* general case */
int matchmax = ((int)pattern[2] == 65535);
int c;
lastmark = state->lastmark;
while (matchmax || count <= (int) pattern[2]) {
state->ptr = ptr;
i = SRE_MATCH(state, pattern + pattern[0], level + 1);
if (i)
return i;
state->ptr = ptr;
c = SRE_COUNT(state, pattern+3, 1, level+1);
if (c < 0)
return c;
if (c == 0)
break;
assert(c == 1);
ptr++;
count++;
}
lastmark_restore(state, lastmark);
}
return 0;
case SRE_OP_REPEAT:
/* create repeat context. all the hard work is done
by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */

View File

@ -42,6 +42,7 @@
#define SRE_OP_REPEAT 27
#define SRE_OP_REPEAT_ONE 28
#define SRE_OP_SUBPATTERN 29
#define SRE_OP_MIN_REPEAT_ONE 30
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2