final 0.9.8 updates:
-- added REPEAT_ONE operator
-- added ANY_ALL operator (used to represent "(?s).")
This commit is contained in:
parent
fb06539e99
commit
e186983842
|
@ -98,7 +98,10 @@ def _compile(pattern, flags=0):
|
|||
return _cache[key]
|
||||
except KeyError:
|
||||
pass
|
||||
p = sre_compile.compile(pattern, flags)
|
||||
try:
|
||||
p = sre_compile.compile(pattern, flags)
|
||||
except error, v:
|
||||
raise error, v # invalid expression
|
||||
if len(_cache) >= _MAXCACHE:
|
||||
_cache.clear()
|
||||
_cache[key] = p
|
||||
|
|
|
@ -73,6 +73,13 @@ def _charset(charset, fixup=None):
|
|||
return out
|
||||
return charset
|
||||
|
||||
def _simple(av):
|
||||
# check if av is a "simple" operator
|
||||
lo, hi = av[2].getwidth()
|
||||
if lo == 0:
|
||||
raise error, "nothing to repeat"
|
||||
return lo == hi == 1 and av[2][0][0] != SUBPATTERN
|
||||
|
||||
def _compile(code, pattern, flags):
|
||||
# internal: compile a (sub)pattern
|
||||
emit = code.append
|
||||
|
@ -116,10 +123,9 @@ def _compile(code, pattern, flags):
|
|||
code[skip] = len(code) - skip
|
||||
elif op is ANY:
|
||||
if flags & SRE_FLAG_DOTALL:
|
||||
emit(OPCODES[op])
|
||||
emit(OPCODES[ANY_ALL])
|
||||
else:
|
||||
emit(OPCODES[CATEGORY])
|
||||
emit(CHCODES[CATEGORY_NOT_LINEBREAK])
|
||||
emit(OPCODES[ANY])
|
||||
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
|
||||
if flags & SRE_FLAG_TEMPLATE:
|
||||
raise error, "internal: unsupported template operator"
|
||||
|
@ -130,30 +136,25 @@ def _compile(code, pattern, flags):
|
|||
_compile(code, av[2], flags)
|
||||
emit(OPCODES[SUCCESS])
|
||||
code[skip] = len(code) - skip
|
||||
elif _simple(av) and op == MAX_REPEAT:
|
||||
emit(OPCODES[REPEAT_ONE])
|
||||
skip = len(code); emit(0)
|
||||
emit(av[0])
|
||||
emit(av[1])
|
||||
_compile(code, av[2], flags)
|
||||
emit(OPCODES[SUCCESS])
|
||||
code[skip] = len(code) - skip
|
||||
else:
|
||||
lo, hi = av[2].getwidth()
|
||||
if lo == 0:
|
||||
raise error, "nothing to repeat"
|
||||
if 0 and lo == hi == 1 and op is MAX_REPEAT:
|
||||
# FIXME: <fl> fast and wrong (but we'll fix that)
|
||||
emit(OPCODES[REPEAT_ONE])
|
||||
skip = len(code); emit(0)
|
||||
emit(av[0])
|
||||
emit(av[1])
|
||||
_compile(code, av[2], flags)
|
||||
emit(OPCODES[SUCCESS])
|
||||
code[skip] = len(code) - skip
|
||||
emit(OPCODES[REPEAT])
|
||||
skip = len(code); emit(0)
|
||||
emit(av[0])
|
||||
emit(av[1])
|
||||
_compile(code, av[2], flags)
|
||||
code[skip] = len(code) - skip
|
||||
if op == MAX_REPEAT:
|
||||
emit(OPCODES[MAX_UNTIL])
|
||||
else:
|
||||
emit(OPCODES[REPEAT])
|
||||
skip = len(code); emit(0)
|
||||
emit(av[0])
|
||||
emit(av[1])
|
||||
_compile(code, av[2], flags)
|
||||
code[skip] = len(code) - skip
|
||||
if op == MAX_REPEAT:
|
||||
emit(OPCODES[MAX_UNTIL])
|
||||
else:
|
||||
emit(OPCODES[MIN_UNTIL])
|
||||
emit(OPCODES[MIN_UNTIL])
|
||||
elif op is SUBPATTERN:
|
||||
if av[0]:
|
||||
emit(OPCODES[MARK])
|
||||
|
|
|
@ -20,6 +20,7 @@ FAILURE = "failure"
|
|||
SUCCESS = "success"
|
||||
|
||||
ANY = "any"
|
||||
ANY_ALL = "any_all"
|
||||
ASSERT = "assert"
|
||||
ASSERT_NOT = "assert_not"
|
||||
AT = "at"
|
||||
|
@ -81,7 +82,7 @@ OPCODES = [
|
|||
# failure=0 success=1 (just because it looks better that way :-)
|
||||
FAILURE, SUCCESS,
|
||||
|
||||
ANY,
|
||||
ANY, ANY_ALL,
|
||||
ASSERT, ASSERT_NOT,
|
||||
AT,
|
||||
BRANCH,
|
||||
|
|
|
@ -142,7 +142,7 @@ class SubPattern:
|
|||
for av in av[1]:
|
||||
l, h = av.getwidth()
|
||||
i = min(i, l)
|
||||
j = min(j, h)
|
||||
j = max(j, h)
|
||||
lo = lo + i
|
||||
hi = hi + j
|
||||
elif op is CALL:
|
||||
|
|
|
@ -448,6 +448,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
|
|||
int i, count;
|
||||
SRE_REPEAT* rp;
|
||||
int lastmark;
|
||||
SRE_CODE chr;
|
||||
|
||||
SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
|
||||
|
||||
|
@ -525,8 +526,17 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
|
|||
break;
|
||||
|
||||
case SRE_OP_ANY:
|
||||
/* match anything */
|
||||
/* match anything (except a newline) */
|
||||
/* <ANY> */
|
||||
TRACE(("%8d: anything (except newline)\n", PTR(ptr)));
|
||||
if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
|
||||
return 0;
|
||||
ptr++;
|
||||
break;
|
||||
|
||||
case SRE_OP_ANY_ALL:
|
||||
/* match anything */
|
||||
/* <ANY_ALL> */
|
||||
TRACE(("%8d: anything\n", PTR(ptr)));
|
||||
if (ptr >= end)
|
||||
return 0;
|
||||
|
@ -695,60 +705,79 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
|
|||
TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
|
||||
pattern[1], pattern[2]));
|
||||
|
||||
if (ptr + pattern[1] > end)
|
||||
return 0; /* cannot match */
|
||||
|
||||
count = 0;
|
||||
|
||||
if (pattern[3] == SRE_OP_ANY) {
|
||||
switch (pattern[3]) {
|
||||
|
||||
case SRE_OP_ANY:
|
||||
/* repeated wildcard. */
|
||||
while (count < (int) pattern[2]) {
|
||||
if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
|
||||
break;
|
||||
ptr++;
|
||||
count++;
|
||||
}
|
||||
break;
|
||||
|
||||
case SRE_OP_ANY_ALL:
|
||||
/* repeated wildcard. skip to the end of the target
|
||||
string, and backtrack from there */
|
||||
/* FIXME: must look for line endings */
|
||||
if (ptr + pattern[1] > end)
|
||||
return 0; /* cannot match */
|
||||
count = pattern[2];
|
||||
if (count > end - ptr)
|
||||
count = end - ptr;
|
||||
ptr += count;
|
||||
break;
|
||||
|
||||
} else if (pattern[3] == SRE_OP_LITERAL) {
|
||||
case SRE_OP_LITERAL:
|
||||
/* repeated literal */
|
||||
SRE_CODE chr = pattern[4];
|
||||
chr = pattern[4];
|
||||
while (count < (int) pattern[2]) {
|
||||
if (ptr >= end || (SRE_CODE) ptr[0] != chr)
|
||||
break;
|
||||
ptr++;
|
||||
count++;
|
||||
}
|
||||
break;
|
||||
|
||||
} else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
|
||||
case SRE_OP_LITERAL_IGNORE:
|
||||
/* repeated literal */
|
||||
SRE_CODE chr = pattern[4];
|
||||
chr = pattern[4];
|
||||
while (count < (int) pattern[2]) {
|
||||
if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr)
|
||||
break;
|
||||
ptr++;
|
||||
count++;
|
||||
}
|
||||
break;
|
||||
|
||||
} else if (pattern[3] == SRE_OP_NOT_LITERAL) {
|
||||
case SRE_OP_NOT_LITERAL:
|
||||
/* repeated non-literal */
|
||||
SRE_CODE chr = pattern[4];
|
||||
chr = pattern[4];
|
||||
while (count < (int) pattern[2]) {
|
||||
if (ptr >= end || (SRE_CODE) ptr[0] == chr)
|
||||
break;
|
||||
ptr++;
|
||||
count++;
|
||||
}
|
||||
|
||||
} else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
|
||||
break;
|
||||
|
||||
case SRE_OP_NOT_LITERAL_IGNORE:
|
||||
/* repeated non-literal */
|
||||
SRE_CODE chr = pattern[4];
|
||||
chr = pattern[4];
|
||||
while (count < (int) pattern[2]) {
|
||||
if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr)
|
||||
break;
|
||||
ptr++;
|
||||
count++;
|
||||
}
|
||||
break;
|
||||
|
||||
} else if (pattern[3] == SRE_OP_IN) {
|
||||
case SRE_OP_IN:
|
||||
/* repeated set */
|
||||
while (count < (int) pattern[2]) {
|
||||
if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
|
||||
|
@ -756,8 +785,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
|
|||
ptr++;
|
||||
count++;
|
||||
}
|
||||
break;
|
||||
|
||||
} else {
|
||||
default:
|
||||
/* repeated single character pattern */
|
||||
state->ptr = ptr;
|
||||
while (count < (int) pattern[2]) {
|
||||
|
@ -770,6 +800,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
|
|||
}
|
||||
state->ptr = ptr;
|
||||
ptr += count;
|
||||
break;
|
||||
}
|
||||
|
||||
/* when we arrive here, count contains the number of
|
||||
|
@ -791,7 +822,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
|
|||
} else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
|
||||
/* tail starts with a literal. skip positions where
|
||||
the rest of the pattern cannot possibly match */
|
||||
SRE_CODE chr = pattern[pattern[0]+1];
|
||||
chr = pattern[pattern[0]+1];
|
||||
TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
|
||||
for (;;) {
|
||||
TRACE(("%8d: scan for tail match\n", PTR(ptr)));
|
||||
|
|
|
@ -14,31 +14,32 @@
|
|||
#define SRE_OP_FAILURE 0
|
||||
#define SRE_OP_SUCCESS 1
|
||||
#define SRE_OP_ANY 2
|
||||
#define SRE_OP_ASSERT 3
|
||||
#define SRE_OP_ASSERT_NOT 4
|
||||
#define SRE_OP_AT 5
|
||||
#define SRE_OP_BRANCH 6
|
||||
#define SRE_OP_CALL 7
|
||||
#define SRE_OP_CATEGORY 8
|
||||
#define SRE_OP_CHARSET 9
|
||||
#define SRE_OP_GROUPREF 10
|
||||
#define SRE_OP_GROUPREF_IGNORE 11
|
||||
#define SRE_OP_IN 12
|
||||
#define SRE_OP_IN_IGNORE 13
|
||||
#define SRE_OP_INFO 14
|
||||
#define SRE_OP_JUMP 15
|
||||
#define SRE_OP_LITERAL 16
|
||||
#define SRE_OP_LITERAL_IGNORE 17
|
||||
#define SRE_OP_MARK 18
|
||||
#define SRE_OP_MAX_UNTIL 19
|
||||
#define SRE_OP_MIN_UNTIL 20
|
||||
#define SRE_OP_NOT_LITERAL 21
|
||||
#define SRE_OP_NOT_LITERAL_IGNORE 22
|
||||
#define SRE_OP_NEGATE 23
|
||||
#define SRE_OP_RANGE 24
|
||||
#define SRE_OP_REPEAT 25
|
||||
#define SRE_OP_REPEAT_ONE 26
|
||||
#define SRE_OP_SUBPATTERN 27
|
||||
#define SRE_OP_ANY_ALL 3
|
||||
#define SRE_OP_ASSERT 4
|
||||
#define SRE_OP_ASSERT_NOT 5
|
||||
#define SRE_OP_AT 6
|
||||
#define SRE_OP_BRANCH 7
|
||||
#define SRE_OP_CALL 8
|
||||
#define SRE_OP_CATEGORY 9
|
||||
#define SRE_OP_CHARSET 10
|
||||
#define SRE_OP_GROUPREF 11
|
||||
#define SRE_OP_GROUPREF_IGNORE 12
|
||||
#define SRE_OP_IN 13
|
||||
#define SRE_OP_IN_IGNORE 14
|
||||
#define SRE_OP_INFO 15
|
||||
#define SRE_OP_JUMP 16
|
||||
#define SRE_OP_LITERAL 17
|
||||
#define SRE_OP_LITERAL_IGNORE 18
|
||||
#define SRE_OP_MARK 19
|
||||
#define SRE_OP_MAX_UNTIL 20
|
||||
#define SRE_OP_MIN_UNTIL 21
|
||||
#define SRE_OP_NOT_LITERAL 22
|
||||
#define SRE_OP_NOT_LITERAL_IGNORE 23
|
||||
#define SRE_OP_NEGATE 24
|
||||
#define SRE_OP_RANGE 25
|
||||
#define SRE_OP_REPEAT 26
|
||||
#define SRE_OP_REPEAT_ONE 27
|
||||
#define SRE_OP_SUBPATTERN 28
|
||||
#define SRE_AT_BEGINNING 0
|
||||
#define SRE_AT_BEGINNING_LINE 1
|
||||
#define SRE_AT_BOUNDARY 2
|
||||
|
|
Loading…
Reference in New Issue