still trying to figure out how to fix the remaining

group reset problem.  in the meantime, I added some
optimizations:

- added "inline" directive to LOCAL

  (this assumes that AC_C_INLINE does what it's
  supposed to do).  to compile SRE on a non-unix
  platform that doesn't support inline, you have
  to add a "#define inline" somewhere...

- added code to generate a SRE_OP_INFO primitive

- added code to do fast prefix search

  (enabled by the USE_FAST_SEARCH define; default
  is on, in this release)
This commit is contained in:
Fredrik Lundh 2000-06-29 23:33:12 +00:00
parent 22e1bf7da5
commit 29c08beab0
2 changed files with 134 additions and 13 deletions

View File

@ -23,6 +23,7 @@ else:
raise RuntimeError, "cannot find a useable array type"
def _compile(code, pattern, flags):
# internal: compile a (sub)pattern
emit = code.append
for op, av in pattern:
if op is ANY:
@ -152,21 +153,75 @@ def _compile(code, pattern, flags):
else:
raise ValueError, ("unsupported operand type", op)
def _compile_info(code, pattern, flags):
    # internal: append an INFO block describing `pattern` to `code`.
    # layout of the emitted words:
    #   INFO opcode, skip, literal flag, min width, max width,
    #   prefix length, prefix characters..., overlap table...
    # nothing is emitted when the minimum width is zero.
    min_width, max_width = pattern.getwidth()
    if min_width == 0:
        return # not worth it

    # collect the leading run of LITERAL items as character codes;
    # no prefix is collected when matching case-insensitively
    literal_prefix = []
    if not (flags & SRE_FLAG_IGNORECASE):
        for op, av in pattern.data:
            if op is not LITERAL:
                break
            literal_prefix.append(ord(av))
    prefix_len = len(literal_prefix)

    # opcode, followed by a placeholder for the skip count which
    # is filled in once the size of the block is known
    code.append(OPCODES[INFO])
    skip_at = len(code)
    code.append(0)

    # literal flag: 1 if the prefix covers every item in the pattern
    if prefix_len == len(pattern.data):
        code.append(1)
    else:
        code.append(0)

    # pattern width; a maximum of 32768 or more is recorded as 0
    code.append(min_width)
    if max_width < 32768:
        code.append(max_width)
    else:
        code.append(0)

    # literal prefix: length, then the characters and overlap table
    code.append(prefix_len)
    if literal_prefix:
        code.extend(literal_prefix)
        # overlap table: overlap[k] is derived from overlap[k-1]+1,
        # falling back through earlier entries while the character
        # at k-1 differs from the one the candidate points at
        overlap = [-1] + [0] * prefix_len
        for pos in range(prefix_len):
            candidate = overlap[pos] + 1
            while candidate > 0 and \
                  literal_prefix[pos] != literal_prefix[candidate - 1]:
                candidate = overlap[candidate - 1] + 1
            overlap[pos + 1] = candidate
        # the leading -1 sentinel is not stored
        code.extend(overlap[1:])

    # skip count: number of words from the skip slot to the end
    code[skip_at] = len(code) - skip_at
def compile(p, flags=0):
# internal: convert pattern list to internal format
# compile, as necessary
if type(p) in (type(""), type(u"")):
import sre_parse
pattern = p
p = sre_parse.parse(p)
else:
pattern = None
flags = p.pattern.flags | flags
code = []
# compile info block
_compile_info(code, p, flags)
# compile the pattern
_compile(code, p.data, flags)
code.append(OPCODES[SUCCESS])
# FIXME: <fl> get rid of this limitation
# FIXME: <fl> get rid of this limitation!
assert p.pattern.groups <= 100,\
"sorry, but this version only supports 100 named groups"
return _sre.compile(
pattern, flags,
array.array(WORDSIZE, code).tostring(),

View File

@ -19,6 +19,7 @@
* 00-06-25 fl major changes to better deal with nested repeats (0.9)
* 00-06-28 fl fixed findall (0.9.1)
* 00-06-29 fl fixed split, added more scanner features (0.9.2)
* 00-06-30 fl tuning, fast search (0.9.3)
*
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
*
@ -29,8 +30,7 @@
#ifndef SRE_RECURSIVE
static char
copyright[] = " SRE 0.9.2 Copyright (c) 1997-2000 by Secret Labs AB ";
char copyright[] = " SRE 0.9.3 Copyright (c) 1997-2000 by Secret Labs AB ";
#include "Python.h"
@ -55,12 +55,15 @@ copyright[] = " SRE 0.9.2 Copyright (c) 1997-2000 by Secret Labs AB ";
#define HAVE_UNICODE
#endif
/* optional features */
#define USE_FAST_SEARCH
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#else
#define LOCAL(type) static type
#define LOCAL(type) static inline type
#endif
/* error codes */
@ -396,6 +399,17 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: enter\n", PTR(ptr)));
if (pattern[0] == SRE_OP_INFO) {
/* optimization info block */
/* args: <1=skip> <2=flags> <3=min> ... */
if (pattern[3] && (end - ptr) < pattern[3]) {
TRACE(("reject (got %d chars, need %d)\n",
(end - ptr), pattern[3]));
return 0;
}
pattern += pattern[1] + 1;
}
stackbase = stack = state->stackbase;
lastmark = state->lastmark;
@ -917,20 +931,72 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
SRE_CHAR* end = state->end;
int status = 0;
int prefix_len = 0;
SRE_CODE* prefix = NULL;
SRE_CODE* prefix;
SRE_CODE* overlap;
int literal = 0;
if (pattern[0] == SRE_OP_INFO) {
/* args: <skip> <min> <max> <prefix> <prefix data...> */
end -= pattern[2];
prefix_len = pattern[4];
prefix = pattern + 5;
pattern += pattern[1];
/* optimization info block */
/* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix> <6=data...> */
if (pattern[3] > 0) {
/* adjust end point (but make sure we leave at least one
character in there) */
end -= pattern[3]-1;
if (end <= ptr)
end = ptr+1;
}
literal = pattern[2];
prefix = pattern + 6;
prefix_len = pattern[5];
overlap = prefix + prefix_len - 1;
pattern += 1 + pattern[1];
}
/* if (prefix_len > 0) ... */
#if defined(USE_FAST_SEARCH)
if (prefix_len > 1) {
/* pattern starts with a known prefix. use the overlap
table to skip forward as fast as we possibly can */
int i = 0;
end = state->end;
while (ptr < end) {
for (;;) {
if (*ptr != (SRE_CHAR) prefix[i]) {
if (!i)
break;
else
i = overlap[i];
} else {
if (++i == prefix_len) {
/* found a potential match */
TRACE(("%8d: === SEARCH === hit\n", PTR(ptr)));
state->start = ptr - prefix_len + 1;
state->ptr = ptr + 1;
if (literal)
return 1; /* all of it */
status = SRE_MATCH(state, pattern + 2*prefix_len);
if (status != 0)
return status;
/* close but no cigar -- try again */
i = overlap[i];
}
break;
}
}
ptr++;
}
return 0;
}
#endif
if (pattern[0] == SRE_OP_LITERAL) {
/* pattern starts with a literal */
/* pattern starts with a literal character. this is used for
short prefixes, and if fast search is disabled */
SRE_CHAR chr = (SRE_CHAR) pattern[1];
for (;;) {
while (ptr < end && *ptr != chr)
@ -944,8 +1010,8 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
if (status != 0)
break;
}
} else
/* general case */
while (ptr <= end) {
TRACE(("%8d: === SEARCH ===\n", PTR(ptr)));
state->start = state->ptr = ptr++;