From e186983842f0b27606b141010513fa8e3d0cc5db Mon Sep 17 00:00:00 2001 From: Fredrik Lundh Date: Tue, 1 Aug 2000 22:47:49 +0000 Subject: [PATCH] final 0.9.8 updates: -- added REPEAT_ONE operator -- added ANY_ALL operator (used to represent "(?s).") --- Lib/sre.py | 5 +++- Lib/sre_compile.py | 51 +++++++++++++++++----------------- Lib/sre_constants.py | 3 +- Lib/sre_parse.py | 2 +- Modules/_sre.c | 61 +++++++++++++++++++++++++++++++---------- Modules/sre_constants.h | 51 +++++++++++++++++----------------- 6 files changed, 105 insertions(+), 68 deletions(-) diff --git a/Lib/sre.py b/Lib/sre.py index 3e125a783a4..edfefc12b77 100644 --- a/Lib/sre.py +++ b/Lib/sre.py @@ -98,7 +98,10 @@ def _compile(pattern, flags=0): return _cache[key] except KeyError: pass - p = sre_compile.compile(pattern, flags) + try: + p = sre_compile.compile(pattern, flags) + except error, v: + raise error, v # invalid expression if len(_cache) >= _MAXCACHE: _cache.clear() _cache[key] = p diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 8fdcecf953a..abd619e1e9b 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -73,6 +73,13 @@ def _charset(charset, fixup=None): return out return charset +def _simple(av): + # check if av is a "simple" operator + lo, hi = av[2].getwidth() + if lo == 0: + raise error, "nothing to repeat" + return lo == hi == 1 and av[2][0][0] != SUBPATTERN + def _compile(code, pattern, flags): # internal: compile a (sub)pattern emit = code.append @@ -116,10 +123,9 @@ def _compile(code, pattern, flags): code[skip] = len(code) - skip elif op is ANY: if flags & SRE_FLAG_DOTALL: - emit(OPCODES[op]) + emit(OPCODES[ANY_ALL]) else: - emit(OPCODES[CATEGORY]) - emit(CHCODES[CATEGORY_NOT_LINEBREAK]) + emit(OPCODES[ANY]) elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): if flags & SRE_FLAG_TEMPLATE: raise error, "internal: unsupported template operator" @@ -130,30 +136,25 @@ def _compile(code, pattern, flags): _compile(code, av[2], flags) emit(OPCODES[SUCCESS]) code[skip] = 
len(code) - skip + elif _simple(av) and op == MAX_REPEAT: + emit(OPCODES[REPEAT_ONE]) + skip = len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + emit(OPCODES[SUCCESS]) + code[skip] = len(code) - skip else: - lo, hi = av[2].getwidth() - if lo == 0: - raise error, "nothing to repeat" - if 0 and lo == hi == 1 and op is MAX_REPEAT: - # FIXME: fast and wrong (but we'll fix that) - emit(OPCODES[REPEAT_ONE]) - skip = len(code); emit(0) - emit(av[0]) - emit(av[1]) - _compile(code, av[2], flags) - emit(OPCODES[SUCCESS]) - code[skip] = len(code) - skip + emit(OPCODES[REPEAT]) + skip = len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + code[skip] = len(code) - skip + if op == MAX_REPEAT: + emit(OPCODES[MAX_UNTIL]) else: - emit(OPCODES[REPEAT]) - skip = len(code); emit(0) - emit(av[0]) - emit(av[1]) - _compile(code, av[2], flags) - code[skip] = len(code) - skip - if op == MAX_REPEAT: - emit(OPCODES[MAX_UNTIL]) - else: - emit(OPCODES[MIN_UNTIL]) + emit(OPCODES[MIN_UNTIL]) elif op is SUBPATTERN: if av[0]: emit(OPCODES[MARK]) diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index e5959150df8..5a20930ce1d 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -20,6 +20,7 @@ FAILURE = "failure" SUCCESS = "success" ANY = "any" +ANY_ALL = "any_all" ASSERT = "assert" ASSERT_NOT = "assert_not" AT = "at" @@ -81,7 +82,7 @@ OPCODES = [ # failure=0 success=1 (just because it looks better that way :-) FAILURE, SUCCESS, - ANY, + ANY, ANY_ALL, ASSERT, ASSERT_NOT, AT, BRANCH, diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 1eec3d3d192..1c1d0d5d44d 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -142,7 +142,7 @@ class SubPattern: for av in av[1]: l, h = av.getwidth() i = min(i, l) - j = min(j, h) + j = max(j, h) lo = lo + i hi = hi + j elif op is CALL: diff --git a/Modules/_sre.c b/Modules/_sre.c index 69bc17114e2..677edb8842e 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -448,6 +448,7 @@ 
SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) int i, count; SRE_REPEAT* rp; int lastmark; + SRE_CODE chr; SRE_REPEAT rep; /* FIXME: allocate in STATE instead */ @@ -525,8 +526,17 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) break; case SRE_OP_ANY: - /* match anything */ + /* match anything (except a newline) */ /* <ANY> */ + TRACE(("%8d: anything (except newline)\n", PTR(ptr))); + if (ptr >= end || SRE_IS_LINEBREAK(ptr[0])) + return 0; + ptr++; + break; + + case SRE_OP_ANY_ALL: + /* match anything */ + /* <ANY_ALL> */ TRACE(("%8d: anything\n", PTR(ptr))); if (ptr >= end) return 0; @@ -695,60 +705,79 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr), pattern[1], pattern[2])); + if (ptr + pattern[1] > end) + return 0; /* cannot match */ + count = 0; - if (pattern[3] == SRE_OP_ANY) { + switch (pattern[3]) { + + case SRE_OP_ANY: + /* repeated wildcard. */ + while (count < (int) pattern[2]) { + if (ptr >= end || SRE_IS_LINEBREAK(ptr[0])) + break; + ptr++; + count++; + } + break; + + case SRE_OP_ANY_ALL: /* repeated wildcard.
skip to the end of the target string, and backtrack from there */ - /* FIXME: must look for line endings */ if (ptr + pattern[1] > end) return 0; /* cannot match */ count = pattern[2]; if (count > end - ptr) count = end - ptr; ptr += count; + break; - } else if (pattern[3] == SRE_OP_LITERAL) { + case SRE_OP_LITERAL: /* repeated literal */ - SRE_CODE chr = pattern[4]; + chr = pattern[4]; while (count < (int) pattern[2]) { if (ptr >= end || (SRE_CODE) ptr[0] != chr) break; ptr++; count++; } + break; - } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) { + case SRE_OP_LITERAL_IGNORE: /* repeated literal */ - SRE_CODE chr = pattern[4]; + chr = pattern[4]; while (count < (int) pattern[2]) { if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr) break; ptr++; count++; } + break; - } else if (pattern[3] == SRE_OP_NOT_LITERAL) { + case SRE_OP_NOT_LITERAL: /* repeated non-literal */ - SRE_CODE chr = pattern[4]; + chr = pattern[4]; while (count < (int) pattern[2]) { if (ptr >= end || (SRE_CODE) ptr[0] == chr) break; ptr++; count++; } - - } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) { + break; + + case SRE_OP_NOT_LITERAL_IGNORE: /* repeated non-literal */ - SRE_CODE chr = pattern[4]; + chr = pattern[4]; while (count < (int) pattern[2]) { if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr) break; ptr++; count++; } + break; - } else if (pattern[3] == SRE_OP_IN) { + case SRE_OP_IN: /* repeated set */ while (count < (int) pattern[2]) { if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr)) @@ -756,8 +785,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) ptr++; count++; } + break; - } else { + default: /* repeated single character pattern */ state->ptr = ptr; while (count < (int) pattern[2]) { @@ -770,6 +800,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) } state->ptr = ptr; ptr += count; + break; } /* when we arrive here, count contains the number of @@ -791,7 +822,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) } else if 
(pattern[pattern[0]] == SRE_OP_LITERAL) { /* tail starts with a literal. skip positions where the rest of the pattern cannot possibly match */ - SRE_CODE chr = pattern[pattern[0]+1]; + chr = pattern[pattern[0]+1]; TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr)); for (;;) { TRACE(("%8d: scan for tail match\n", PTR(ptr))); diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 5cfe49570b6..5c55c3dbd91 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -14,31 +14,32 @@ #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 -#define SRE_OP_ASSERT 3 -#define SRE_OP_ASSERT_NOT 4 -#define SRE_OP_AT 5 -#define SRE_OP_BRANCH 6 -#define SRE_OP_CALL 7 -#define SRE_OP_CATEGORY 8 -#define SRE_OP_CHARSET 9 -#define SRE_OP_GROUPREF 10 -#define SRE_OP_GROUPREF_IGNORE 11 -#define SRE_OP_IN 12 -#define SRE_OP_IN_IGNORE 13 -#define SRE_OP_INFO 14 -#define SRE_OP_JUMP 15 -#define SRE_OP_LITERAL 16 -#define SRE_OP_LITERAL_IGNORE 17 -#define SRE_OP_MARK 18 -#define SRE_OP_MAX_UNTIL 19 -#define SRE_OP_MIN_UNTIL 20 -#define SRE_OP_NOT_LITERAL 21 -#define SRE_OP_NOT_LITERAL_IGNORE 22 -#define SRE_OP_NEGATE 23 -#define SRE_OP_RANGE 24 -#define SRE_OP_REPEAT 25 -#define SRE_OP_REPEAT_ONE 26 -#define SRE_OP_SUBPATTERN 27 +#define SRE_OP_ANY_ALL 3 +#define SRE_OP_ASSERT 4 +#define SRE_OP_ASSERT_NOT 5 +#define SRE_OP_AT 6 +#define SRE_OP_BRANCH 7 +#define SRE_OP_CALL 8 +#define SRE_OP_CATEGORY 9 +#define SRE_OP_CHARSET 10 +#define SRE_OP_GROUPREF 11 +#define SRE_OP_GROUPREF_IGNORE 12 +#define SRE_OP_IN 13 +#define SRE_OP_IN_IGNORE 14 +#define SRE_OP_INFO 15 +#define SRE_OP_JUMP 16 +#define SRE_OP_LITERAL 17 +#define SRE_OP_LITERAL_IGNORE 18 +#define SRE_OP_MARK 19 +#define SRE_OP_MAX_UNTIL 20 +#define SRE_OP_MIN_UNTIL 21 +#define SRE_OP_NOT_LITERAL 22 +#define SRE_OP_NOT_LITERAL_IGNORE 23 +#define SRE_OP_NEGATE 24 +#define SRE_OP_RANGE 25 +#define SRE_OP_REPEAT 26 +#define SRE_OP_REPEAT_ONE 27 +#define SRE_OP_SUBPATTERN 28 #define 
SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BOUNDARY 2