From 3562f1176403653ebfbef6275d449ad42d1b843a Mon Sep 17 00:00:00 2001 From: Fredrik Lundh Date: Sun, 2 Jul 2000 12:00:07 +0000 Subject: [PATCH] -- use charset bitmaps where appropriate. this gives a 5-10% speedup for some tests, including the python tokenizer. -- added support for an optional charset anchor to the engine (currently unused by the code generator). -- removed workaround for array module bug. --- Lib/sre_compile.py | 110 +++++++++++++++++++++++++++++++++------- Lib/sre_constants.py | 27 +++++++--- Lib/sre_parse.py | 7 ++- Modules/_sre.c | 63 ++++++++++++++++------- Modules/sre_constants.h | 38 +++++++------- 5 files changed, 182 insertions(+), 63 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 14b1970d56c..a593ee73f05 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -16,12 +16,71 @@ import _sre from sre_constants import * # find an array type code that matches the engine's code size -for WORDSIZE in "BHil": +for WORDSIZE in "Hil": if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize(): break else: raise RuntimeError, "cannot find a useable array type" +MAXCODE = 65535 + +def _charset(charset, fixup): + # internal: optimize character set + out = [] + charmap = [0]*256 + try: + for op, av in charset: + if op is NEGATE: + out.append((op, av)) + elif op is LITERAL: + charmap[fixup(av)] = 1 + elif op is RANGE: + for i in range(fixup(av[0]), fixup(av[1])+1): + charmap[i] = 1 + elif op is CATEGORY: + # FIXME: could append to charmap tail + return charset # cannot compress + except IndexError: + # unicode + return charset + # compress character map + i = p = n = 0 + runs = [] + for c in charmap: + if c: + if n == 0: + p = i + n = n + 1 + elif n: + runs.append((p, n)) + n = 0 + i = i + 1 + if n: + runs.append((p, n)) + if len(runs) <= 2: + # use literal/range + for p, n in runs: + if n == 1: + out.append((LITERAL, p)) + else: + out.append((RANGE, (p, p+n-1))) + if len(out) < len(charset): + return out + else: + # 
use bitmap + data = [] + m = 1; v = 0 + for c in charmap: + if c: + v = v + m + m = m << 1 + if m > MAXCODE: + data.append(v) + m = 1; v = 0 + out.append((CHARSET, data)) + return out + return charset + def _compile(code, pattern, flags): # internal: compile a (sub)pattern emit = code.append @@ -41,7 +100,7 @@ def _compile(code, pattern, flags): emit(OPCODES[op]) fixup = lambda x: x skip = len(code); emit(0) - for op, av in av: + for op, av in _charset(av, fixup): emit(OPCODES[op]) if op is NEGATE: pass @@ -50,6 +109,8 @@ def _compile(code, pattern, flags): elif op is RANGE: emit(fixup(av[0])) emit(fixup(av[1])) + elif op is CHARSET: + code.extend(av) elif op is CATEGORY: if flags & SRE_FLAG_LOCALE: emit(CHCODES[CH_LOCALE[av]]) @@ -155,13 +216,14 @@ def _compile(code, pattern, flags): def _compile_info(code, pattern, flags): # internal: compile an info block. in the current version, - # this contains min/max pattern width and a literal prefix, - # if any + # this contains min/max pattern width, and an optional literal + # prefix or a character map lo, hi = pattern.getwidth() if lo == 0: return # not worth it # look for a literal prefix prefix = [] + charset = [] # not used if not (flags & SRE_FLAG_IGNORECASE): for op, av in pattern.data: if op is LITERAL: @@ -174,26 +236,40 @@ def _compile_info(code, pattern, flags): skip = len(code); emit(0) # literal flag mask = 0 - if len(prefix) == len(pattern.data): - mask = 1 + if prefix: + mask = SRE_INFO_PREFIX + if len(prefix) == len(pattern.data): + mask = mask + SRE_INFO_LITERAL + elif charset: + mask = mask + SRE_INFO_CHARSET emit(mask) # pattern length - emit(lo) - if hi < 32768: + if lo < MAXCODE: + emit(lo) + else: + emit(MAXCODE) + prefix = prefix[:MAXCODE] + if hi < MAXCODE: emit(hi) else: emit(0) # add literal prefix - emit(len(prefix)) if prefix: - code.extend(prefix) - # generate overlap table - table = [-1] + ([0]*len(prefix)) - for i in range(len(prefix)): - table[i+1] = table[i]+1 - while table[i+1] > 0 and 
prefix[i] != prefix[table[i+1]-1]: - table[i+1] = table[table[i+1]-1]+1 - code.extend(table[1:]) # don't store first entry + emit(len(prefix)) + if prefix: + code.extend(prefix) + # generate overlap table + table = [-1] + ([0]*len(prefix)) + for i in range(len(prefix)): + table[i+1] = table[i]+1 + while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: + table[i+1] = table[table[i+1]-1]+1 + code.extend(table[1:]) # don't store first entry + elif charset: + for char in charset: + emit(OPCODES[LITERAL]) + emit(char) + emit(OPCODES[FAILURE]) code[skip] = len(code) - skip def compile(p, flags=0): diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 39db58fd4f9..f0e45ea5105 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -28,6 +28,7 @@ AT = "at" BRANCH = "branch" CALL = "call" CATEGORY = "category" +CHARSET = "charset" GROUP = "group" GROUP_IGNORE = "group_ignore" IN = "in" @@ -87,6 +88,7 @@ OPCODES = [ BRANCH, CALL, CATEGORY, + CHARSET, GROUP, GROUP_IGNORE, IN, IN_IGNORE, INFO, @@ -166,13 +168,18 @@ CH_UNICODE = { } # flags -SRE_FLAG_TEMPLATE = 1 -SRE_FLAG_IGNORECASE = 2 -SRE_FLAG_LOCALE = 4 -SRE_FLAG_MULTILINE = 8 -SRE_FLAG_DOTALL = 16 -SRE_FLAG_UNICODE = 32 -SRE_FLAG_VERBOSE = 64 +SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking) +SRE_FLAG_IGNORECASE = 2 # case insensitive +SRE_FLAG_LOCALE = 4 # honour system locale +SRE_FLAG_MULTILINE = 8 # treat target as multiline string +SRE_FLAG_DOTALL = 16 # treat target as a single string +SRE_FLAG_UNICODE = 32 # use unicode locale +SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments + +# flags for INFO primitive +SRE_INFO_PREFIX = 1 # has prefix +SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix) +SRE_INFO_CHARSET = 4 # pattern starts with character from given set if __name__ == "__main__": import string @@ -201,6 +208,7 @@ if __name__ == "__main__": dump(f, OPCODES, "SRE_OP") dump(f, ATCODES, "SRE") dump(f, CHCODES, "SRE") + f.write("#define SRE_FLAG_TEMPLATE %d\n" 
% SRE_FLAG_TEMPLATE) f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE) f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE) @@ -208,5 +216,10 @@ if __name__ == "__main__": f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL) f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE) f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE) + + f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX) + f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL) + f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET) + f.close() print "done" diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 12f49c3620d..b2632563c75 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -16,11 +16,10 @@ import _sre from sre_constants import * -# FIXME: should be 65535, but the arraymodule is still broken -MAXREPEAT = 32767 +MAXREPEAT = 65535 -# FIXME: might change in 2.0 final. but for now, this seems -# to be the best way to be compatible with 1.5.2 +# FIXME: the following might change in 2.0 final. 
but for now, this +# seems to be the best way to be compatible with 1.5.2 CHARMASK = 0xff SPECIAL_CHARS = ".\\[{()*+?^$|" diff --git a/Modules/_sre.c b/Modules/_sre.c index 7206b9570e2..3bc023789a2 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -378,6 +378,13 @@ SRE_MEMBER(SRE_CODE* set, SRE_CODE ch) set += 2; break; + case SRE_OP_CHARSET: + /* args: <bitmap> (16 bits per code word) */ + if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15)))) + return ok; + set += 16; + break; + case SRE_OP_CATEGORY: /* args: <category> */ if (sre_category(set[0], (int) ch)) @@ -952,35 +959,38 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) SRE_CHAR* ptr = state->start; SRE_CHAR* end = state->end; int status = 0; - int prefix_len = 0; - SRE_CODE* prefix; - SRE_CODE* overlap; - int literal = 0; + int prefix_len; + SRE_CODE* prefix = NULL; + SRE_CODE* charset = NULL; + SRE_CODE* overlap = NULL; + int flags = 0; if (pattern[0] == SRE_OP_INFO) { /* optimization info block */ - /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix> <6=data...> */ + /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ + + flags = pattern[2]; if (pattern[3] > 0) { /* adjust end point (but make sure we leave at least one - character in there) */ + character in there, so literal search will work) */ end -= pattern[3]-1; if (end <= ptr) end = ptr+1; } - literal = pattern[2]; - - prefix = pattern + 6; - prefix_len = pattern[5]; - - overlap = prefix + prefix_len - 1; + if (flags & SRE_INFO_PREFIX) { + prefix_len = pattern[5]; + prefix = pattern + 6; + overlap = prefix + prefix_len - 1; + } else if (flags & SRE_INFO_CHARSET) + charset = pattern + 5; pattern += 1 + pattern[1]; } #if defined(USE_FAST_SEARCH) - if (prefix_len > 1) { + if (prefix && overlap && prefix_len > 1) { /* pattern starts with a known prefix. 
use the overlap table to skip forward as fast as we possibly can */ int i = 0; @@ -998,8 +1008,8 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) TRACE(("%8d: === SEARCH === hit\n", PTR(ptr))); state->start = ptr - prefix_len + 1; state->ptr = ptr + 1; - if (literal) - return 1; /* all of it */ + if (flags & SRE_INFO_LITERAL) + return 1; /* we got all of it */ status = SRE_MATCH(state, pattern + 2*prefix_len); if (status != 0) return status; @@ -1016,9 +1026,9 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) } #endif - if (pattern[0] == SRE_OP_LITERAL) { - /* pattern starts with a literal character. this is used for - short prefixes, and if fast search is disabled*/ + if (pattern[0] == SRE_OP_LITERAL) { + /* pattern starts with a literal character. this is used + for short prefixes, and if fast search is disabled */ SRE_CODE chr = pattern[1]; for (;;) { while (ptr < end && (SRE_CODE) ptr[0] != chr) @@ -1032,6 +1042,22 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) if (status != 0) break; } +#if 0 + } else if (charset) { + /* pattern starts with a character from a known set */ + for (;;) { + while (ptr < end && !SRE_MEMBER(charset, ptr[0])) + ptr++; + if (ptr == end) + return 0; + TRACE(("%8d: === SEARCH === charset\n", PTR(ptr))); + state->start = ptr; + state->ptr = ptr; + status = SRE_MATCH(state, pattern); + if (status != 0) + break; + } +#endif } else /* general case */ while (ptr <= end) { @@ -1044,6 +1070,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) return status; } + #if !defined(SRE_RECURSIVE) diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 2ec00bada47..da25ec4bb35 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -20,23 +20,24 @@ #define SRE_OP_BRANCH 6 #define SRE_OP_CALL 7 #define SRE_OP_CATEGORY 8 -#define SRE_OP_GROUP 9 -#define SRE_OP_GROUP_IGNORE 10 -#define SRE_OP_IN 11 -#define SRE_OP_IN_IGNORE 12 -#define SRE_OP_INFO 13 -#define SRE_OP_JUMP 14 -#define SRE_OP_LITERAL 15 -#define 
SRE_OP_LITERAL_IGNORE 16 -#define SRE_OP_MARK 17 -#define SRE_OP_MAX_REPEAT 18 -#define SRE_OP_MAX_REPEAT_ONE 19 -#define SRE_OP_MIN_REPEAT 20 -#define SRE_OP_NOT_LITERAL 21 -#define SRE_OP_NOT_LITERAL_IGNORE 22 -#define SRE_OP_NEGATE 23 -#define SRE_OP_RANGE 24 -#define SRE_OP_REPEAT 25 +#define SRE_OP_CHARSET 9 +#define SRE_OP_GROUP 10 +#define SRE_OP_GROUP_IGNORE 11 +#define SRE_OP_IN 12 +#define SRE_OP_IN_IGNORE 13 +#define SRE_OP_INFO 14 +#define SRE_OP_JUMP 15 +#define SRE_OP_LITERAL 16 +#define SRE_OP_LITERAL_IGNORE 17 +#define SRE_OP_MARK 18 +#define SRE_OP_MAX_REPEAT 19 +#define SRE_OP_MAX_REPEAT_ONE 20 +#define SRE_OP_MIN_REPEAT 21 +#define SRE_OP_NOT_LITERAL 22 +#define SRE_OP_NOT_LITERAL_IGNORE 23 +#define SRE_OP_NEGATE 24 +#define SRE_OP_RANGE 25 +#define SRE_OP_REPEAT 26 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BOUNDARY 2 @@ -68,3 +69,6 @@ #define SRE_FLAG_DOTALL 16 #define SRE_FLAG_UNICODE 32 #define SRE_FLAG_VERBOSE 64 +#define SRE_INFO_PREFIX 1 +#define SRE_INFO_LITERAL 2 +#define SRE_INFO_CHARSET 4