-- fixed width calculations for alternations
-- fixed literal check in branch operator (this broke test_tokenize, as reported by Mark Favas)
-- added REPEAT_ONE operator (still not enabled, though)
-- added some debugging stuff (maxlevel)
This commit is contained in:
parent
329e29198d
commit
2f2c67d7e5
|
@ -270,6 +270,7 @@ def _compile_info(code, pattern, flags):
|
|||
table[i+1] = table[table[i+1]-1]+1
|
||||
code.extend(table[1:]) # don't store first entry
|
||||
elif charset:
|
||||
# FIXME: use charset optimizer!
|
||||
for char in charset:
|
||||
emit(OPCODES[LITERAL])
|
||||
emit(char)
|
||||
|
@ -283,7 +284,7 @@ try:
|
|||
except NameError:
|
||||
pass
|
||||
|
||||
def _compile1(p, flags):
|
||||
def _code(p, flags):
|
||||
|
||||
flags = p.pattern.flags | flags
|
||||
code = []
|
||||
|
@ -308,7 +309,7 @@ def compile(p, flags=0):
|
|||
else:
|
||||
pattern = None
|
||||
|
||||
code = _compile1(p, flags)
|
||||
code = _code(p, flags)
|
||||
|
||||
# print code
|
||||
|
||||
|
|
|
@ -137,12 +137,12 @@ class SubPattern:
|
|||
lo = hi = 0L
|
||||
for op, av in self.data:
|
||||
if op is BRANCH:
|
||||
l = sys.maxint
|
||||
h = 0
|
||||
i = sys.maxint
|
||||
j = 0
|
||||
for av in av[1]:
|
||||
i, j = av.getwidth()
|
||||
l = min(l, i)
|
||||
h = min(h, j)
|
||||
l, h = av.getwidth()
|
||||
i = min(i, l)
|
||||
j = min(j, h)
|
||||
lo = lo + i
|
||||
hi = hi + j
|
||||
elif op is CALL:
|
||||
|
|
217
Modules/_sre.c
217
Modules/_sre.c
|
@ -219,6 +219,14 @@ mark_init(SRE_STATE* state)
|
|||
static void
|
||||
mark_fini(SRE_STATE* state)
|
||||
{
|
||||
#if 0
|
||||
/* FIXME: debugging */
|
||||
if (state->maxlevel > 0)
|
||||
printf("max %d\n", state->maxlevel);
|
||||
if (state->mark_stack_base > 0)
|
||||
printf("mark stack %d\n", state->mark_stack_base);
|
||||
#endif
|
||||
|
||||
if (state->mark_stack)
|
||||
free(state->mark_stack);
|
||||
mark_init(state);
|
||||
|
@ -430,7 +438,7 @@ SRE_MEMBER(SRE_CODE* set, SRE_CODE ch)
|
|||
}
|
||||
|
||||
LOCAL(int)
|
||||
SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
||||
SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
|
||||
{
|
||||
/* check if string matches the given pattern. returns -1 for
|
||||
error, 0 for failure, and 1 for success */
|
||||
|
@ -443,7 +451,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
|
||||
SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
|
||||
|
||||
TRACE(("%8d: enter\n", PTR(ptr)));
|
||||
TRACE(("%8d: enter %d\n", PTR(ptr), level));
|
||||
|
||||
if (pattern[0] == SRE_OP_INFO) {
|
||||
/* optimization info block */
|
||||
|
@ -456,6 +464,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
pattern += pattern[1] + 1;
|
||||
}
|
||||
|
||||
/* FIXME: debugging */
|
||||
if (level > state->maxlevel)
|
||||
state->maxlevel = level;
|
||||
|
||||
for (;;) {
|
||||
|
||||
switch (*pattern++) {
|
||||
|
@ -623,7 +635,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
state->ptr = ptr - pattern[1];
|
||||
if (state->ptr < state->beginning)
|
||||
return 0;
|
||||
i = SRE_MATCH(state, pattern + 2);
|
||||
i = SRE_MATCH(state, pattern + 2, level + 1);
|
||||
if (i <= 0)
|
||||
return i;
|
||||
if (pattern[1] > 0 && state->ptr != ptr)
|
||||
|
@ -638,7 +650,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
state->ptr = ptr - pattern[1];
|
||||
if (state->ptr < state->beginning)
|
||||
return 0;
|
||||
i = SRE_MATCH(state, pattern + 2);
|
||||
i = SRE_MATCH(state, pattern + 2, level + 1);
|
||||
if (i < 0)
|
||||
return i;
|
||||
if (i)
|
||||
|
@ -656,10 +668,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
lastmark = state->lastmark;
|
||||
while (pattern[0]) {
|
||||
TRACE(("%8d: try branch\n", PTR(ptr)));
|
||||
if (pattern[2] != SRE_OP_LITERAL ||
|
||||
(ptr < end && (SRE_CODE) ptr[0] == pattern[3])) {
|
||||
if (pattern[1] != SRE_OP_LITERAL ||
|
||||
(ptr < end && (SRE_CODE) ptr[0] == pattern[2])) {
|
||||
state->ptr = ptr;
|
||||
i = SRE_MATCH(state, pattern + 1);
|
||||
i = SRE_MATCH(state, pattern + 1, level + 1);
|
||||
if (i)
|
||||
return i;
|
||||
}
|
||||
|
@ -670,6 +682,155 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
}
|
||||
return 0;
|
||||
|
||||
case SRE_OP_REPEAT_ONE:
|
||||
/* match repeated sequence (maximizing regexp) */
|
||||
|
||||
/* this operator only works if the repeated item is
|
||||
exactly one character wide, and we're not already
|
||||
collecting backtracking points. for other cases,
|
||||
use the MAX_REPEAT operator instead */
|
||||
|
||||
/* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
|
||||
|
||||
TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
|
||||
pattern[1], pattern[2]));
|
||||
|
||||
count = 0;
|
||||
|
||||
if (pattern[3] == SRE_OP_ANY) {
|
||||
/* repeated wildcard. skip to the end of the target
|
||||
string, and backtrack from there */
|
||||
/* FIXME: must look for line endings */
|
||||
if (ptr + pattern[1] > end)
|
||||
return 0; /* cannot match */
|
||||
count = pattern[2];
|
||||
if (count > end - ptr)
|
||||
count = end - ptr;
|
||||
ptr += count;
|
||||
|
||||
} else if (pattern[3] == SRE_OP_LITERAL) {
|
||||
/* repeated literal */
|
||||
SRE_CODE chr = pattern[4];
|
||||
while (count < (int) pattern[2]) {
|
||||
if (ptr >= end || (SRE_CODE) ptr[0] != chr)
|
||||
break;
|
||||
ptr++;
|
||||
count++;
|
||||
}
|
||||
|
||||
} else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
|
||||
/* repeated literal */
|
||||
SRE_CODE chr = pattern[4];
|
||||
while (count < (int) pattern[2]) {
|
||||
if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr)
|
||||
break;
|
||||
ptr++;
|
||||
count++;
|
||||
}
|
||||
|
||||
} else if (pattern[3] == SRE_OP_NOT_LITERAL) {
|
||||
/* repeated non-literal */
|
||||
SRE_CODE chr = pattern[4];
|
||||
while (count < (int) pattern[2]) {
|
||||
if (ptr >= end || (SRE_CODE) ptr[0] == chr)
|
||||
break;
|
||||
ptr++;
|
||||
count++;
|
||||
}
|
||||
|
||||
} else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
|
||||
/* repeated non-literal */
|
||||
SRE_CODE chr = pattern[4];
|
||||
while (count < (int) pattern[2]) {
|
||||
if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr)
|
||||
break;
|
||||
ptr++;
|
||||
count++;
|
||||
}
|
||||
|
||||
} else if (pattern[3] == SRE_OP_IN) {
|
||||
/* repeated set */
|
||||
while (count < (int) pattern[2]) {
|
||||
if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
|
||||
break;
|
||||
ptr++;
|
||||
count++;
|
||||
}
|
||||
|
||||
} else {
|
||||
/* repeated single character pattern */
|
||||
state->ptr = ptr;
|
||||
while (count < (int) pattern[2]) {
|
||||
i = SRE_MATCH(state, pattern + 3, level + 1);
|
||||
if (i < 0)
|
||||
return i;
|
||||
if (!i)
|
||||
break;
|
||||
count++;
|
||||
}
|
||||
state->ptr = ptr;
|
||||
ptr += count;
|
||||
}
|
||||
|
||||
/* when we arrive here, count contains the number of
|
||||
matches, and ptr points to the tail of the target
|
||||
string. check if the rest of the pattern matches,
|
||||
and backtrack if not. */
|
||||
|
||||
TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
|
||||
|
||||
if (count < (int) pattern[1])
|
||||
return 0;
|
||||
|
||||
if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
|
||||
/* tail is empty. we're finished */
|
||||
TRACE(("%8d: tail is empty\n", PTR(ptr)));
|
||||
state->ptr = ptr;
|
||||
return 1;
|
||||
|
||||
} else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
|
||||
/* tail starts with a literal. skip positions where
|
||||
the rest of the pattern cannot possibly match */
|
||||
SRE_CODE chr = pattern[pattern[0]+1];
|
||||
TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
|
||||
for (;;) {
|
||||
TRACE(("%8d: scan for tail match\n", PTR(ptr)));
|
||||
while (count >= (int) pattern[1] &&
|
||||
(ptr >= end || *ptr != chr)) {
|
||||
ptr--;
|
||||
count--;
|
||||
}
|
||||
TRACE(("%8d: check tail\n", PTR(ptr)));
|
||||
if (count < (int) pattern[1])
|
||||
break;
|
||||
state->ptr = ptr;
|
||||
i = SRE_MATCH(state, pattern + pattern[0], level + 1);
|
||||
if (i > 0) {
|
||||
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
|
||||
return 1;
|
||||
}
|
||||
ptr--;
|
||||
count--;
|
||||
}
|
||||
|
||||
} else {
|
||||
/* general case */
|
||||
TRACE(("%8d: tail is pattern\n", PTR(ptr)));
|
||||
while (count >= (int) pattern[1]) {
|
||||
state->ptr = ptr;
|
||||
i = SRE_MATCH(state, pattern + pattern[0], level + 1);
|
||||
if (i < 0)
|
||||
return i;
|
||||
if (i) {
|
||||
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
|
||||
return 1;
|
||||
}
|
||||
ptr--;
|
||||
count--;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
|
||||
case SRE_OP_REPEAT:
|
||||
/* create repeat context. all the hard work is done
|
||||
by the UNTIL operator */
|
||||
|
@ -677,8 +838,6 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
TRACE(("%8d: repeat {%d,%d}\n", PTR(ptr),
|
||||
pattern[1], pattern[2]));
|
||||
|
||||
state->ptr = ptr;
|
||||
|
||||
rep.count = -1;
|
||||
rep.pattern = pattern;
|
||||
|
||||
|
@ -686,10 +845,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
rep.prev = state->repeat;
|
||||
state->repeat = &rep;
|
||||
|
||||
i = SRE_MATCH(state, pattern + pattern[0]);
|
||||
state->ptr = ptr;
|
||||
i = SRE_MATCH(state, pattern + pattern[0], level + 1);
|
||||
|
||||
state->repeat = rep.prev;
|
||||
/* free(rp); */
|
||||
|
||||
return i;
|
||||
|
||||
|
@ -714,7 +873,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
/* not enough matches */
|
||||
TRACE(("%8d: match item (required)\n", PTR(ptr)));
|
||||
rp->count = count;
|
||||
i = SRE_MATCH(state, rp->pattern + 3);
|
||||
i = SRE_MATCH(state, rp->pattern + 3, level + 1);
|
||||
if (i)
|
||||
return i;
|
||||
rp->count = count - 1;
|
||||
|
@ -729,7 +888,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
rp->count = count;
|
||||
lastmark = state->lastmark;
|
||||
mark_save(state, 0, lastmark);
|
||||
i = SRE_MATCH(state, rp->pattern + 3);
|
||||
i = SRE_MATCH(state, rp->pattern + 3, level + 1);
|
||||
if (i)
|
||||
return i;
|
||||
mark_restore(state, 0, lastmark);
|
||||
|
@ -741,11 +900,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
tail matches */
|
||||
TRACE(("%8d: match tail\n", PTR(ptr)));
|
||||
state->repeat = rp->prev;
|
||||
i = SRE_MATCH(state, pattern);
|
||||
if (i) {
|
||||
/* free(rp); */
|
||||
i = SRE_MATCH(state, pattern, level + 1);
|
||||
if (i)
|
||||
return i;
|
||||
}
|
||||
state->repeat = rp;
|
||||
return 0;
|
||||
|
||||
|
@ -767,7 +924,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
/* not enough matches */
|
||||
TRACE(("%8d: match item (required)\n", PTR(ptr)));
|
||||
rp->count = count;
|
||||
i = SRE_MATCH(state, rp->pattern + 3);
|
||||
i = SRE_MATCH(state, rp->pattern + 3, level + 1);
|
||||
if (i)
|
||||
return i;
|
||||
rp->count = count-1;
|
||||
|
@ -778,7 +935,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
/* see if the tail matches */
|
||||
TRACE(("%8d: match tail\n", PTR(ptr)));
|
||||
state->repeat = rp->prev;
|
||||
i = SRE_MATCH(state, pattern);
|
||||
i = SRE_MATCH(state, pattern, level + 1);
|
||||
if (i) {
|
||||
/* free(rp); */
|
||||
return i;
|
||||
|
@ -790,7 +947,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
|
||||
TRACE(("%8d: match item (optional)\n", PTR(ptr)));
|
||||
rp->count = count;
|
||||
i = SRE_MATCH(state, rp->pattern + 3);
|
||||
i = SRE_MATCH(state, rp->pattern + 3, level + 1);
|
||||
if (i)
|
||||
return i;
|
||||
rp->count = count - 1;
|
||||
|
@ -865,7 +1022,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
state->ptr = ptr + 1;
|
||||
if (flags & SRE_INFO_LITERAL)
|
||||
return 1; /* we got all of it */
|
||||
status = SRE_MATCH(state, pattern + 2*prefix_len);
|
||||
status = SRE_MATCH(state, pattern + 2*prefix_len, 1);
|
||||
if (status != 0)
|
||||
return status;
|
||||
/* close but no cigar -- try again */
|
||||
|
@ -893,7 +1050,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
TRACE(("%8d: === SEARCH === literal\n", PTR(ptr)));
|
||||
state->start = ptr;
|
||||
state->ptr = ++ptr;
|
||||
status = SRE_MATCH(state, pattern + 2);
|
||||
status = SRE_MATCH(state, pattern + 2, 1);
|
||||
if (status != 0)
|
||||
break;
|
||||
}
|
||||
|
@ -907,7 +1064,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
TRACE(("%8d: === SEARCH === charset\n", PTR(ptr)));
|
||||
state->start = ptr;
|
||||
state->ptr = ptr;
|
||||
status = SRE_MATCH(state, pattern);
|
||||
status = SRE_MATCH(state, pattern, 1);
|
||||
if (status != 0)
|
||||
break;
|
||||
}
|
||||
|
@ -916,7 +1073,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
while (ptr <= end) {
|
||||
TRACE(("%8d: === SEARCH ===\n", PTR(ptr)));
|
||||
state->start = state->ptr = ptr++;
|
||||
status = SRE_MATCH(state, pattern);
|
||||
status = SRE_MATCH(state, pattern, 1);
|
||||
if (status != 0)
|
||||
break;
|
||||
}
|
||||
|
@ -1032,6 +1189,9 @@ state_reset(SRE_STATE* state)
|
|||
|
||||
state->repeat = NULL;
|
||||
|
||||
/* FIXME: debugging */
|
||||
state->maxlevel = 0;
|
||||
|
||||
mark_fini(state);
|
||||
}
|
||||
|
||||
|
@ -1110,6 +1270,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
|
|||
state->lower = sre_lower;
|
||||
|
||||
state->mark_stack = NULL;
|
||||
state->mark_stack_base = 0;
|
||||
|
||||
state_reset(state);
|
||||
|
||||
|
@ -1262,10 +1423,10 @@ pattern_match(PatternObject* self, PyObject* args)
|
|||
state.ptr = state.start;
|
||||
|
||||
if (state.charsize == 1) {
|
||||
status = sre_match(&state, PatternObject_GetCode(self));
|
||||
status = sre_match(&state, PatternObject_GetCode(self), 1);
|
||||
} else {
|
||||
#if defined(HAVE_UNICODE)
|
||||
status = sre_umatch(&state, PatternObject_GetCode(self));
|
||||
status = sre_umatch(&state, PatternObject_GetCode(self), 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -1941,10 +2102,10 @@ scanner_match(ScannerObject* self, PyObject* args)
|
|||
state->ptr = state->start;
|
||||
|
||||
if (state->charsize == 1) {
|
||||
status = sre_match(state, PatternObject_GetCode(self->pattern));
|
||||
status = sre_match(state, PatternObject_GetCode(self->pattern), 1);
|
||||
} else {
|
||||
#if defined(HAVE_UNICODE)
|
||||
status = sre_umatch(state, PatternObject_GetCode(self->pattern));
|
||||
status = sre_umatch(state, PatternObject_GetCode(self->pattern), 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -74,6 +74,8 @@ typedef struct {
|
|||
SRE_REPEAT *repeat; /* current repeat context */
|
||||
/* hooks */
|
||||
SRE_TOLOWER_HOOK lower;
|
||||
/* debugging */
|
||||
int maxlevel;
|
||||
} SRE_STATE;
|
||||
|
||||
typedef struct {
|
||||
|
|
Loading…
Reference in New Issue