Issue #24426: Fast searching optimization in regular expressions now works
for patterns that start with capturing groups. Fast searching optimization can no longer be disabled at compile time.
This commit is contained in:
parent
6ee588f14e
commit
66dc4648fc
|
@ -409,57 +409,39 @@ def _generate_overlap_table(prefix):
|
|||
table[i] = idx + 1
|
||||
return table
|
||||
|
||||
def _compile_info(code, pattern, flags):
|
||||
# internal: compile an info block. in the current version,
|
||||
# this contains min/max pattern width, and an optional literal
|
||||
# prefix or a character map
|
||||
lo, hi = pattern.getwidth()
|
||||
if hi > MAXCODE:
|
||||
hi = MAXCODE
|
||||
if lo == 0:
|
||||
code.extend([INFO, 4, 0, lo, hi])
|
||||
return
|
||||
# look for a literal prefix
|
||||
def _get_literal_prefix(pattern):
    # internal: gather the run of LITERAL items that starts pattern.data,
    # descending into SUBPATTERN items.  Returns a 3-tuple
    # (literals, skip, complete):
    #   literals -- list of the literal arguments collected, in order
    #   skip     -- number of literals collected before the first
    #               SUBPATTERN item whose av[0] is not None (looked for
    #               recursively), or None if no such item was seen
    #   complete -- True if every item was consumed, False if the scan
    #               stopped at an item that is neither LITERAL nor
    #               SUBPATTERN
    # NOTE(review): av[0] is presumably the group number, with None
    # meaning a non-capturing group -- confirm against the parser.
    literals = []
    skip = None
    complete = True
    for op, av in pattern.data:
        if op is LITERAL:
            literals.append(av)
            continue
        if op is not SUBPATTERN:
            # anything else ends the literal run
            complete = False
            break
        inner, inner_skip, complete = _get_literal_prefix(av[1])
        if skip is None:
            # only the first qualifying subpattern fixes the skip offset
            if av[0] is not None:
                skip = len(literals)
            elif inner_skip is not None:
                # offset found inside the nested subpattern; rebase it
                # onto the literals already collected at this level
                skip = len(literals) + inner_skip
        literals.extend(inner)
        if not complete:
            # the nested scan stopped early, so this one must stop too
            break
    return literals, skip, complete
|
||||
|
||||
def _get_charset_prefix(pattern):
|
||||
charset = [] # not used
|
||||
charsetappend = charset.append
|
||||
if not (flags & SRE_FLAG_IGNORECASE):
|
||||
# look for literal prefix
|
||||
for op, av in pattern.data:
|
||||
if pattern.data:
|
||||
op, av = pattern.data[0]
|
||||
if op is SUBPATTERN and av[1]:
|
||||
op, av = av[1][0]
|
||||
if op is LITERAL:
|
||||
if len(prefix) == prefix_skip:
|
||||
prefix_skip = prefix_skip + 1
|
||||
prefixappend(av)
|
||||
elif op is SUBPATTERN and len(av[1]) == 1:
|
||||
op, av = av[1][0]
|
||||
if op is LITERAL:
|
||||
prefixappend(av)
|
||||
else:
|
||||
break
|
||||
else:
|
||||
break
|
||||
# if no prefix, look for charset prefix
|
||||
if not prefix and pattern.data:
|
||||
op, av = pattern.data[0]
|
||||
if op is SUBPATTERN and av[1]:
|
||||
op, av = av[1][0]
|
||||
if op is LITERAL:
|
||||
charsetappend((op, av))
|
||||
elif op is BRANCH:
|
||||
c = []
|
||||
cappend = c.append
|
||||
for p in av[1]:
|
||||
if not p:
|
||||
break
|
||||
op, av = p[0]
|
||||
if op is LITERAL:
|
||||
cappend((op, av))
|
||||
else:
|
||||
break
|
||||
else:
|
||||
charset = c
|
||||
charsetappend((op, av))
|
||||
elif op is BRANCH:
|
||||
c = []
|
||||
cappend = c.append
|
||||
|
@ -473,8 +455,43 @@ def _compile_info(code, pattern, flags):
|
|||
break
|
||||
else:
|
||||
charset = c
|
||||
elif op is IN:
|
||||
charset = av
|
||||
elif op is BRANCH:
|
||||
c = []
|
||||
cappend = c.append
|
||||
for p in av[1]:
|
||||
if not p:
|
||||
break
|
||||
op, av = p[0]
|
||||
if op is LITERAL:
|
||||
cappend((op, av))
|
||||
else:
|
||||
break
|
||||
else:
|
||||
charset = c
|
||||
elif op is IN:
|
||||
charset = av
|
||||
return charset
|
||||
|
||||
def _compile_info(code, pattern, flags):
|
||||
# internal: compile an info block. in the current version,
|
||||
# this contains min/max pattern width, and an optional literal
|
||||
# prefix or a character map
|
||||
lo, hi = pattern.getwidth()
|
||||
if hi > MAXCODE:
|
||||
hi = MAXCODE
|
||||
if lo == 0:
|
||||
code.extend([INFO, 4, 0, lo, hi])
|
||||
return
|
||||
# look for a literal prefix
|
||||
prefix = []
|
||||
prefix_skip = 0
|
||||
charset = [] # not used
|
||||
if not (flags & SRE_FLAG_IGNORECASE):
|
||||
# look for literal prefix
|
||||
prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
|
||||
# if no prefix, look for charset prefix
|
||||
if not prefix:
|
||||
charset = _get_charset_prefix(pattern)
|
||||
## if prefix:
|
||||
## print("*** PREFIX", prefix, prefix_skip)
|
||||
## if charset:
|
||||
|
@ -487,7 +504,7 @@ def _compile_info(code, pattern, flags):
|
|||
mask = 0
|
||||
if prefix:
|
||||
mask = SRE_INFO_PREFIX
|
||||
if len(prefix) == prefix_skip == len(pattern.data):
|
||||
if prefix_skip is None and got_all:
|
||||
mask = mask | SRE_INFO_LITERAL
|
||||
elif charset:
|
||||
mask = mask | SRE_INFO_CHARSET
|
||||
|
@ -502,6 +519,8 @@ def _compile_info(code, pattern, flags):
|
|||
# add literal prefix
|
||||
if prefix:
|
||||
emit(len(prefix)) # length
|
||||
if prefix_skip is None:
|
||||
prefix_skip = len(prefix)
|
||||
emit(prefix_skip) # skip
|
||||
code.extend(prefix)
|
||||
# generate overlap table
|
||||
|
|
|
@ -13,6 +13,10 @@ Core and Builtins
|
|||
Library
|
||||
-------
|
||||
|
||||
- Issue #24426: Fast searching optimization in regular expressions now works
|
||||
for patterns that start with capturing groups. Fast searching optimization
|
||||
can no longer be disabled at compile time.
|
||||
|
||||
Documentation
|
||||
-------------
|
||||
|
||||
|
|
|
@ -62,9 +62,6 @@ static char copyright[] =
|
|||
/* -------------------------------------------------------------------- */
|
||||
/* optional features */
|
||||
|
||||
/* enables fast searching */
|
||||
#define USE_FAST_SEARCH
|
||||
|
||||
/* enables copy/deepcopy handling (work in progress) */
|
||||
#undef USE_BUILTIN_COPY
|
||||
|
||||
|
|
|
@ -1248,7 +1248,32 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
|
|||
prefix, prefix_len, prefix_skip));
|
||||
TRACE(("charset = %p\n", charset));
|
||||
|
||||
#if defined(USE_FAST_SEARCH)
|
||||
if (prefix_len == 1) {
|
||||
/* pattern starts with a literal character */
|
||||
SRE_CHAR c = (SRE_CHAR) prefix[0];
|
||||
#if SIZEOF_SRE_CHAR < 4
|
||||
if ((SRE_CODE) c != prefix[0])
|
||||
return 0; /* literal can't match: doesn't fit in char width */
|
||||
#endif
|
||||
end = (SRE_CHAR *)state->end;
|
||||
while (ptr < end) {
|
||||
while (*ptr != c) {
|
||||
if (++ptr >= end)
|
||||
return 0;
|
||||
}
|
||||
TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
|
||||
state->start = ptr;
|
||||
state->ptr = ptr + prefix_skip;
|
||||
if (flags & SRE_INFO_LITERAL)
|
||||
return 1; /* we got all of it */
|
||||
status = SRE(match)(state, pattern + 2*prefix_skip, 0);
|
||||
if (status != 0)
|
||||
return status;
|
||||
++ptr;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (prefix_len > 1) {
|
||||
/* pattern starts with a known prefix. use the overlap
|
||||
table to skip forward as fast as we possibly can */
|
||||
|
@ -1297,32 +1322,8 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (pattern[0] == SRE_OP_LITERAL) {
|
||||
/* pattern starts with a literal character. this is used
|
||||
for short prefixes, and if fast search is disabled */
|
||||
SRE_CHAR c = (SRE_CHAR) pattern[1];
|
||||
#if SIZEOF_SRE_CHAR < 4
|
||||
if ((SRE_CODE) c != pattern[1])
|
||||
return 0; /* literal can't match: doesn't fit in char width */
|
||||
#endif
|
||||
end = (SRE_CHAR *)state->end;
|
||||
while (ptr < end) {
|
||||
while (*ptr != c) {
|
||||
if (++ptr >= end)
|
||||
return 0;
|
||||
}
|
||||
TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
|
||||
state->start = ptr;
|
||||
state->ptr = ++ptr;
|
||||
if (flags & SRE_INFO_LITERAL)
|
||||
return 1; /* we got all of it */
|
||||
status = SRE(match)(state, pattern + 2, 0);
|
||||
if (status != 0)
|
||||
break;
|
||||
}
|
||||
} else if (charset) {
|
||||
if (charset) {
|
||||
/* pattern starts with a character from a known set */
|
||||
end = (SRE_CHAR *)state->end;
|
||||
for (;;) {
|
||||
|
|
Loading…
Reference in New Issue