mirror of https://github.com/python/cpython
bpo-41972: Use the two-way algorithm for string searching (GH-22904)
Implement an enhanced variant of Crochemore and Perrin's Two-Way string searching algorithm, which reduces worst-case time from quadratic (the product of the string and pattern lengths) to linear. This applies to forward searches (like``find``, ``index``, ``replace``); the algorithm for reverse searches (like ``rfind``) is not changed. Co-authored-by: Tim Peters <tim.peters@gmail.com>
This commit is contained in:
parent
2183d06bc8
commit
73a85c4e1d
@@ -5,6 +5,7 @@ Common tests shared by test_unicode, test_userstring and test_bytes.
import unittest, string, sys, struct
from test import support
from collections import UserList
import random

class Sequence:
    def __init__(self, seq='wxyz'): self.seq = seq

@@ -317,6 +318,44 @@ class BaseTest:
        else:
            self.checkraises(TypeError, 'hello', 'rindex', 42)

    def test_find_periodic_pattern(self):
        """Cover the special path for periodic patterns."""
        def reference_find(p, s):
            for i in range(len(s)):
                if s.startswith(p, i):
                    return i
            return -1

        rr = random.randrange
        choices = random.choices
        for _ in range(1000):
            p0 = ''.join(choices('abcde', k=rr(10))) * rr(10, 20)
            p = p0[:len(p0) - rr(10)]  # pop off some characters
            left = ''.join(choices('abcdef', k=rr(2000)))
            right = ''.join(choices('abcdef', k=rr(2000)))
            text = left + p + right
            with self.subTest(p=p, text=text):
                self.checkequal(reference_find(p, text),
                                text, 'find', p)

    def test_find_shift_table_overflow(self):
        """When the table of 8-bit shifts overflows."""
        N = 2**8 + 100

        # first check the periodic case
        # here, the shift for 'b' is N + 1.
        pattern1 = 'a' * N + 'b' + 'a' * N
        text1 = 'babbaa' * N + pattern1
        self.checkequal(len(text1)-len(pattern1),
                        text1, 'find', pattern1)

        # now check the non-periodic case
        # here, the shift for 'd' is 3*(N+1)+1
        pattern2 = 'ddd' + 'abc' * N + "eee"
        text2 = pattern2[:-1] + "ddeede" * 2 * N + pattern2 + "de" * N
        self.checkequal(len(text2) - N*len("de") - len(pattern2),
                        text2, 'find', pattern2)

    def test_lower(self):
        self.checkequal('hello', 'HeLLo', 'lower')
        self.checkequal('hello', 'hello', 'lower')
@@ -0,0 +1 @@
Substring search functions such as ``str1 in str2`` and ``str2.find(str1)`` now sometimes use the "Two-Way" string searching algorithm to avoid quadratic behavior on long strings.
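Purely as an illustration of the behavior change (not text from the commit; the exact length thresholds at which the new algorithm is chosen are internal implementation details), this is the kind of degenerate search that benefits:

    # Before this change, a search like this could take time proportional
    # to len(haystack) * len(needle); now it completes in roughly linear time.
    haystack = 'a' * 10_000_000
    needle = 'a' * 50_000 + 'b'
    print(needle in haystack)       # False
    print(haystack.find(needle))    # -1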
@@ -9,10 +9,16 @@
/* note: fastsearch may access s[n], which isn't a problem when using
   Python's ordinary string types, but may cause problems if you're
   using this code in other contexts.  also, the count mode returns -1
   if there cannot possibly be a match in the target string, and 0 if
   it has actually checked for matches, but didn't find any.  callers
   beware! */

/* If the strings are long enough, use Crochemore and Perrin's Two-Way
   algorithm, which has worst-case O(n) runtime and best-case O(n/k).
   Also compute a table of shifts to achieve O(n/k) in more cases,
   and often (data dependent) deduce larger shifts than pure C&P can
   deduce. */

#define FAST_COUNT 0
#define FAST_SEARCH 1
#define FAST_RSEARCH 2
@@ -160,6 +166,353 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)

#undef MEMCHR_CUT_OFF

/* Change to a 1 to see logging comments walk through the algorithm. */
#if 0 && STRINGLIB_SIZEOF_CHAR == 1
# define LOG(...) printf(__VA_ARGS__)
# define LOG_STRING(s, n) printf("\"%.*s\"", n, s)
#else
# define LOG(...)
# define LOG_STRING(s, n)
#endif

Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle,
                       Py_ssize_t *return_period, int invert_alphabet)
{
    /* Do a lexicographic search. Essentially this:
           >>> max(needle[i:] for i in range(len(needle)+1))
       Also find the period of the right half. */
    Py_ssize_t max_suffix = 0;
    Py_ssize_t candidate = 1;
    Py_ssize_t k = 0;
    // The period of the right half.
    Py_ssize_t period = 1;

    while (candidate + k < len_needle) {
        // each loop increases candidate + k + max_suffix
        STRINGLIB_CHAR a = needle[candidate + k];
        STRINGLIB_CHAR b = needle[max_suffix + k];
        // check if the suffix at candidate is better than max_suffix
        if (invert_alphabet ? (b < a) : (a < b)) {
            // Fell short of max_suffix.
            // The next k + 1 characters are non-increasing
            // from candidate, so they won't start a maximal suffix.
            candidate += k + 1;
            k = 0;
            // We've ruled out any period smaller than what's
            // been scanned since max_suffix.
            period = candidate - max_suffix;
        }
        else if (a == b) {
            if (k + 1 != period) {
                // Keep scanning the equal strings
                k++;
            }
            else {
                // Matched a whole period.
                // Start matching the next period.
                candidate += period;
                k = 0;
            }
        }
        else {
            // Did better than max_suffix, so replace it.
            max_suffix = candidate;
            candidate++;
            k = 0;
            period = 1;
        }
    }
    *return_period = period;
    return max_suffix;
}

Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle,
                      Py_ssize_t len_needle,
                      Py_ssize_t *return_period)
{
    /* Do a "critical factorization", making it so that:
       >>> needle = (left := needle[:cut]) + (right := needle[cut:])
       where the "local period" of the cut is maximal.

       The local period of the cut is the minimal length of a string w
       such that (left endswith w or w endswith left)
       and (right startswith w or w startswith right).

       The Critical Factorization Theorem says that this maximal local
       period is the global period of the string.

       Crochemore and Perrin (1991) show that this cut can be computed
       as the later of two cuts: one that gives a lexicographically
       maximal right half, and one that gives the same with respect
       to a reversed alphabet-ordering.

       This is what we want to happen:
           >>> x = "GCAGAGAG"
           >>> cut, period = factorize(x)
           >>> x[:cut], (right := x[cut:])
           ('GC', 'AGAGAG')
           >>> period  # right half period
           2
           >>> right[period:] == right[:-period]
           True

       This is how the local period lines up in the above example:
                GC | AGAGAG
           AGAGAGC = AGAGAGC
       The length of this minimal repetition is 7, which is indeed the
       period of the original string. */

    Py_ssize_t cut1, period1, cut2, period2, cut, period;
    cut1 = STRINGLIB(_lex_search)(needle, len_needle, &period1, 0);
    cut2 = STRINGLIB(_lex_search)(needle, len_needle, &period2, 1);

    // Take the later cut.
    if (cut1 > cut2) {
        period = period1;
        cut = cut1;
    }
    else {
        period = period2;
        cut = cut2;
    }

    LOG("split: "); LOG_STRING(needle, cut);
    LOG(" + "); LOG_STRING(needle + cut, len_needle - cut);
    LOG("\n");

    *return_period = period;
    return cut;
}

#define SHIFT_TYPE uint8_t
#define NOT_FOUND ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U)
#define SHIFT_OVERFLOW (NOT_FOUND - 1U)

#define TABLE_SIZE_BITS 6
#define TABLE_SIZE (1U << TABLE_SIZE_BITS)
#define TABLE_MASK (TABLE_SIZE - 1U)

typedef struct STRINGLIB(_pre) {
    const STRINGLIB_CHAR *needle;
    Py_ssize_t len_needle;
    Py_ssize_t cut;
    Py_ssize_t period;
    int is_periodic;
    SHIFT_TYPE table[TABLE_SIZE];
} STRINGLIB(prework);


Py_LOCAL_INLINE(void)
STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle,
                       STRINGLIB(prework) *p)
{
    p->needle = needle;
    p->len_needle = len_needle;
    p->cut = STRINGLIB(_factorize)(needle, len_needle, &(p->period));
    assert(p->period + p->cut <= len_needle);
    p->is_periodic = (0 == memcmp(needle,
                                  needle + p->period,
                                  p->cut * STRINGLIB_SIZEOF_CHAR));
    if (p->is_periodic) {
        assert(p->cut <= len_needle/2);
        assert(p->cut < p->period);
    }
    else {
        // A lower bound on the period
        p->period = Py_MAX(p->cut, len_needle - p->cut) + 1;
    }
    // Now fill up a table
    memset(&(p->table[0]), 0xff, TABLE_SIZE*sizeof(SHIFT_TYPE));
    assert(p->table[0] == NOT_FOUND);
    assert(p->table[TABLE_MASK] == NOT_FOUND);
    for (Py_ssize_t i = 0; i < len_needle; i++) {
        Py_ssize_t shift = len_needle - i;
        if (shift > SHIFT_OVERFLOW) {
            shift = SHIFT_OVERFLOW;
        }
        p->table[needle[i] & TABLE_MASK] = Py_SAFE_DOWNCAST(shift,
                                                            Py_ssize_t,
                                                            SHIFT_TYPE);
    }
}

Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
                    STRINGLIB(prework) *p)
{
    // Crochemore and Perrin's (1991) Two-Way algorithm.
    // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
    Py_ssize_t len_needle = p->len_needle;
    Py_ssize_t cut = p->cut;
    Py_ssize_t period = p->period;
    const STRINGLIB_CHAR *needle = p->needle;
    const STRINGLIB_CHAR *window = haystack;
    const STRINGLIB_CHAR *last_window = haystack + len_haystack - len_needle;
    SHIFT_TYPE *table = p->table;
    LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack);

    if (p->is_periodic) {
        LOG("Needle is periodic.\n");
        Py_ssize_t memory = 0;
      periodicwindowloop:
        while (window <= last_window) {
            Py_ssize_t i = Py_MAX(cut, memory);

            // Visualize the line-up:
            LOG("> "); LOG_STRING(haystack, len_haystack);
            LOG("\n> "); LOG("%*s", window - haystack, "");
            LOG_STRING(needle, len_needle);
            LOG("\n> "); LOG("%*s", window - haystack + i, "");
            LOG(" ^ <-- cut\n");

            if (window[i] != needle[i]) {
                // Sunday's trick: if we're going to jump, we might
                // as well jump to line up the character *after* the
                // current window.
                STRINGLIB_CHAR first_outside = window[len_needle];
                SHIFT_TYPE shift = table[first_outside & TABLE_MASK];
                if (shift == NOT_FOUND) {
                    LOG("\"%c\" not found. Skipping entirely.\n",
                        first_outside);
                    window += len_needle + 1;
                }
                else {
                    LOG("Shifting to line up \"%c\".\n", first_outside);
                    Py_ssize_t memory_shift = i - cut + 1;
                    window += Py_MAX(shift, memory_shift);
                }
                memory = 0;
                goto periodicwindowloop;
            }
            for (i = i + 1; i < len_needle; i++) {
                if (needle[i] != window[i]) {
                    LOG("Right half does not match. Jump ahead by %d.\n",
                        i - cut + 1);
                    window += i - cut + 1;
                    memory = 0;
                    goto periodicwindowloop;
                }
            }
            for (i = memory; i < cut; i++) {
                if (needle[i] != window[i]) {
                    LOG("Left half does not match. Jump ahead by period %d.\n",
                        period);
                    window += period;
                    memory = len_needle - period;
                    goto periodicwindowloop;
                }
            }
            LOG("Left half matches. Returning %d.\n",
                window - haystack);
            return window - haystack;
        }
    }
    else {
        LOG("Needle is not periodic.\n");
        assert(cut < len_needle);
        STRINGLIB_CHAR needle_cut = needle[cut];
      windowloop:
        while (window <= last_window) {

            // Visualize the line-up:
            LOG("> "); LOG_STRING(haystack, len_haystack);
            LOG("\n> "); LOG("%*s", window - haystack, "");
            LOG_STRING(needle, len_needle);
            LOG("\n> "); LOG("%*s", window - haystack + cut, "");
            LOG(" ^ <-- cut\n");

            if (window[cut] != needle_cut) {
                // Sunday's trick: if we're going to jump, we might
                // as well jump to line up the character *after* the
                // current window.
                STRINGLIB_CHAR first_outside = window[len_needle];
                SHIFT_TYPE shift = table[first_outside & TABLE_MASK];
                if (shift == NOT_FOUND) {
                    LOG("\"%c\" not found. Skipping entirely.\n",
                        first_outside);
                    window += len_needle + 1;
                }
                else {
                    LOG("Shifting to line up \"%c\".\n", first_outside);
                    window += shift;
                }
                goto windowloop;
            }
            for (Py_ssize_t i = cut + 1; i < len_needle; i++) {
                if (needle[i] != window[i]) {
                    LOG("Right half does not match. Advance by %d.\n",
                        i - cut + 1);
                    window += i - cut + 1;
                    goto windowloop;
                }
            }
            for (Py_ssize_t i = 0; i < cut; i++) {
                if (needle[i] != window[i]) {
                    LOG("Left half does not match. Advance by period %d.\n",
                        period);
                    window += period;
                    goto windowloop;
                }
            }
            LOG("Left half matches. Returning %d.\n", window - haystack);
            return window - haystack;
        }
    }
    LOG("Not found. Returning -1.\n");
    return -1;
}


Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(_two_way_find)(const STRINGLIB_CHAR *haystack,
                         Py_ssize_t len_haystack,
                         const STRINGLIB_CHAR *needle,
                         Py_ssize_t len_needle)
{
    LOG("###### Finding \"%s\" in \"%s\".\n", needle, haystack);
    STRINGLIB(prework) p;
    STRINGLIB(_preprocess)(needle, len_needle, &p);
    return STRINGLIB(_two_way)(haystack, len_haystack, &p);
}


Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(_two_way_count)(const STRINGLIB_CHAR *haystack,
                          Py_ssize_t len_haystack,
                          const STRINGLIB_CHAR *needle,
                          Py_ssize_t len_needle,
                          Py_ssize_t maxcount)
{
    LOG("###### Counting \"%s\" in \"%s\".\n", needle, haystack);
    STRINGLIB(prework) p;
    STRINGLIB(_preprocess)(needle, len_needle, &p);
    Py_ssize_t index = 0, count = 0;
    while (1) {
        Py_ssize_t result;
        result = STRINGLIB(_two_way)(haystack + index,
                                     len_haystack - index, &p);
        if (result == -1) {
            return count;
        }
        count++;
        if (count == maxcount) {
            return maxcount;
        }
        index += result + len_needle;
    }
    return count;
}

#undef SHIFT_TYPE
#undef NOT_FOUND
#undef SHIFT_OVERFLOW
#undef TABLE_SIZE_BITS
#undef TABLE_SIZE
#undef TABLE_MASK

#undef LOG
#undef LOG_STRING

Py_LOCAL_INLINE(Py_ssize_t)
FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
           const STRINGLIB_CHAR* p, Py_ssize_t m,
@@ -195,10 +548,22 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
    }

    mlast = m - 1;
    skip = mlast;
    mask = 0;

    if (mode != FAST_RSEARCH) {
        if (m >= 100 && w >= 2000 && w / m >= 5) {
            /* For larger problems where the needle isn't a huge
               percentage of the size of the haystack, the relatively
               expensive O(m) startup cost of the two-way algorithm
               will surely pay off. */
            if (mode == FAST_SEARCH) {
                return STRINGLIB(_two_way_find)(s, n, p, m);
            }
            else {
                return STRINGLIB(_two_way_count)(s, n, p, m, maxcount);
            }
        }
        const STRINGLIB_CHAR *ss = s + m - 1;
        const STRINGLIB_CHAR *pp = p + m - 1;
@@ -207,41 +572,118 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
        /* process pattern[:-1] */
        for (i = 0; i < mlast; i++) {
            STRINGLIB_BLOOM_ADD(mask, p[i]);
            if (p[i] == p[mlast]) {
                skip = mlast - i - 1;
            }
        }
        /* process pattern[-1] outside the loop */
        STRINGLIB_BLOOM_ADD(mask, p[mlast]);

        if (m >= 100 && w >= 8000) {
            /* To ensure that we have good worst-case behavior,
               here's an adaptive version of the algorithm, where if
               we match O(m) characters without any matches of the
               entire needle, then we predict that the startup cost of
               the two-way algorithm will probably be worth it. */
            Py_ssize_t hits = 0;
            for (i = 0; i <= w; i++) {
                if (ss[i] == pp[0]) {
                    /* candidate match */
                    for (j = 0; j < mlast; j++) {
                        if (s[i+j] != p[j]) {
                            break;
                        }
                    }
                    if (j == mlast) {
                        /* got a match! */
                        if (mode != FAST_COUNT) {
                            return i;
                        }
                        count++;
                        if (count == maxcount) {
                            return maxcount;
                        }
                        i = i + mlast;
                        continue;
                    }
                    /* miss: check if next character is part of pattern */
                    if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
                        i = i + m;
                    }
                    else {
                        i = i + skip;
                    }
                    hits += j + 1;
                    if (hits >= m / 4 && i < w - 1000) {
                        /* We've done O(m) fruitless comparisons
                           anyway, so spend the O(m) cost on the
                           setup for the two-way algorithm. */
                        Py_ssize_t res;
                        if (mode == FAST_COUNT) {
                            res = STRINGLIB(_two_way_count)(
                                s+i, n-i, p, m, maxcount-count);
                            return count + res;
                        }
                        else {
                            res = STRINGLIB(_two_way_find)(s+i, n-i, p, m);
                            if (res == -1) {
                                return -1;
                            }
                            return i + res;
                        }
                    }
                }
                else {
                    /* skip: check if next character is part of pattern */
                    if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
                        i = i + m;
                    }
                }
            }
            if (mode != FAST_COUNT) {
                return -1;
            }
            return count;
        }
        /* The standard, non-adaptive version of the algorithm. */
        for (i = 0; i <= w; i++) {
            /* note: using mlast in the skip path slows things down on x86 */
            if (ss[i] == pp[0]) {
                /* candidate match */
                for (j = 0; j < mlast; j++) {
                    if (s[i+j] != p[j]) {
                        break;
                    }
                }
                if (j == mlast) {
                    /* got a match! */
                    if (mode != FAST_COUNT) {
                        return i;
                    }
                    count++;
                    if (count == maxcount) {
                        return maxcount;
                    }
                    i = i + mlast;
                    continue;
                }
                /* miss: check if next character is part of pattern */
                if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
                    i = i + m;
                }
                else {
                    i = i + skip;
                }
            }
            else {
                /* skip: check if next character is part of pattern */
                if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
                    i = i + m;
                }
            }
        }
    }
    else {   /* FAST_RSEARCH */

        /* create compressed boyer-moore delta 1 table */
@@ -250,28 +692,36 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
        /* process pattern[:0:-1] */
        for (i = mlast; i > 0; i--) {
            STRINGLIB_BLOOM_ADD(mask, p[i]);
            if (p[i] == p[0]) {
                skip = i - 1;
            }
        }

        for (i = w; i >= 0; i--) {
            if (s[i] == p[0]) {
                /* candidate match */
                for (j = mlast; j > 0; j--) {
                    if (s[i+j] != p[j]) {
                        break;
                    }
                }
                if (j == 0) {
                    /* got a match! */
                    return i;
                }
                /* miss: check if previous character is part of pattern */
                if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) {
                    i = i - m;
                }
                else {
                    i = i - skip;
                }
            }
            else {
                /* skip: check if previous character is part of pattern */
                if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) {
                    i = i - m;
                }
            }
        }
    }
@@ -0,0 +1,431 @@
This document explains Crochemore and Perrin's Two-Way string matching
algorithm, in which a smaller string (the "pattern" or "needle")
is searched for in a longer string (the "text" or "haystack"),
determining whether the needle is a substring of the haystack, and if
so, at what index(es). It is to be used by Python's string
(and bytes-like) objects when calling `find`, `index`, `__contains__`,
or implicitly in methods like `replace` or `partition`.

This is essentially a re-telling of the paper

    Crochemore M., Perrin D., 1991, Two-way string-matching,
    Journal of the ACM 38(3):651-675.

focused more on understanding and examples than on rigor. See also
the code sample here:

    http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260

The algorithm runs in O(len(needle) + len(haystack)) time and with
O(1) space. However, since there is a larger preprocessing cost than
simpler algorithms, this Two-Way algorithm is to be used only when the
needle and haystack lengths meet certain thresholds.


These are the basic steps of the algorithm (a rough Python sketch of
the simplest case follows the list):

 * "Very carefully" cut the needle in two.
 * For each alignment attempted:
   1. match the right part
      * On failure, jump by the amount matched + 1
   2. then match the left part.
      * On failure, jump by max(len(left), len(right)) + 1
 * If the needle is periodic, don't re-do comparisons; maintain
   a "memory" of how many characters you already know match.

-------- Matching the right part --------

We first scan the right part of the needle to check if it matches
the aligned characters in the haystack. We scan left-to-right,
and if a mismatch occurs, we jump ahead by the amount matched plus 1.

Example:

    text:    ........EFGX...................
    pattern: ....abcdEFGH....
    cut:         <<<<>>>>

Matched 3, so jump ahead by 4:

    text:    ........EFGX...................
    pattern:     ....abcdEFGH....
    cut:             <<<<>>>>

Why are we allowed to do this? Because we cut the needle very
carefully, in such a way that if the cut is ...abcd + EFGH... then
we have

       d != E
      cd != EF
     bcd != EFG
    abcd != EFGH
    ... and so on.
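For instance, these inequalities can be checked mechanically for the
example cut (a doctest-style illustration, not part of the algorithm):

    >>> left, right = "abcd", "EFGH"
    >>> all(left[-k:] != right[:k] for k in range(1, len(right) + 1))
    True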

If this is true for every pair of equal-length substrings around the
cut, then the following alignments do not work, so we can skip them:

    text:    ........EFG....................
    pattern:  ....abcdEFGH....
                     ^ (Bad because d != E)
    text:    ........EFG....................
    pattern:   ....abcdEFGH....
                     ^^ (Bad because cd != EF)
    text:    ........EFG....................
    pattern:    ....abcdEFGH....
                     ^^^ (Bad because bcd != EFG)

Skip 3 alignments => increment alignment by 4.


-------- If len(left_part) < len(right_part) --------

Above is the core idea, and it begins to suggest how the algorithm can
be linear-time. There is one bit of subtlety involving what to do
around the end of the needle: if the left half is shorter than the
right, then we could run into something like this:

    text:    .....EFG......
    pattern:    cdEFGH

The same argument holds that we can skip ahead by 4, so long as

      d != E
     cd != EF
    ?cd != EFG
   ??cd != EFGH
   etc.

The question marks represent "wildcards" that always match; they're
outside the limits of the needle, so there's no way for them to
invalidate a match. To ensure that the inequalities above are always
true, we need them to be true for all possible '?' values. We thus
need cd != FG and cd != GH, etc.


-------- Matching the left part --------

Once we have ensured the right part matches, we scan the left part
(order doesn't matter, but traditionally right-to-left), and if we
find a mismatch, we jump ahead by
max(len(left_part), len(right_part)) + 1. That we can jump by
at least len(right_part) + 1 we have already seen:

    text:    .....EFG.....
    pattern:  abcdEFG

Matched 3, so jump by 4,
using the fact that d != E, cd != EF, and bcd != EFG.

But we can also jump by at least len(left_part) + 1:

    text:    ....cdEF.....
    pattern:   abcdEF

Jump by len('abcd') + 1 = 5.

Skip the alignments:

    text:    ....cdEF.....
    pattern:    abcdEF
    text:    ....cdEF.....
    pattern:     abcdEF
    text:    ....cdEF.....
    pattern:      abcdEF
    text:    ....cdEF.....
    pattern:       abcdEF

This requires the following facts:

       d != E
      cd != EF
     bcd != EF?
    abcd != EF??
    etc., for all values of ?s, as above.

If we have both sets of inequalities, then we can indeed jump by
max(len(left_part), len(right_part)) + 1. Under the assumption of such
a nice splitting of the needle, we now have enough to prove linear
time for the search: consider the forward-progress/comparisons ratio
at each alignment position. If a mismatch occurs in the right part,
the ratio is 1 position forward per comparison. On the other hand,
if a mismatch occurs in the left half, we advance by more than
len(needle)//2 positions for at most len(needle) comparisons,
so this ratio is more than 1/2. This average "movement speed" is
bounded below by the constant "1 position per 2 comparisons", so we
have linear time.


-------- The periodic case --------

The sets of inequalities listed so far seem too good to be true in
the general case. Indeed, they fail when a needle is periodic:
there's no way to split 'AAbAAbAAbA' in two such that

    (the stuff n characters to the left of the split)
    cannot equal
    (the stuff n characters to the right of the split)

for all n.

This is because no matter how you cut it, you'll get
s[cut-3:cut] == s[cut:cut+3]. So what do we do? We still cut the
needle in two so that n can be as big as possible. If we were to
split it as

    AAbA + AbAAbA

then A == A at the split, so this is bad (we failed at length 1), but
if we split it as

    AA + bAAbAAbA

we at least have A != b and AA != bA, and we fail at length 3
since ?AA == bAA. We already knew that a cut to make length-3
mismatch was impossible due to the period, but we now see that the
bound is sharp; we can get length-1 and length-2 to mismatch.
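As a concrete check (a doctest-style illustration, not part of the
original argument), the lengths that *are* periods of this needle can
be enumerated directly, confirming that 3 is the smallest:

    >>> s = 'AAbAAbAAbA'
    >>> [p for p in range(1, len(s) + 1) if s[p:] == s[:len(s) - p]]
    [3, 6, 9, 10]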

This is exactly the content of the *critical factorization theorem*:
that no matter the period of the original needle, you can cut it in
such a way that (with the appropriate question marks),
needle[cut-k:cut] mismatches needle[cut:cut+k] for all k < the period.

Even "non-periodic" strings are periodic with a period equal to
their length, so for such needles, the CFT already guarantees that
the algorithm described so far will work, since we can cut the needle
so that the length-k chunks on either side of the cut mismatch for all
k < len(needle). Looking closer at the algorithm, we only actually
require that k go up to max(len(left_part), len(right_part)).
So long as the period exceeds that, we're good.

The more general shorter-period case is a bit harder. The essentials
are the same, except we use the periodicity to our advantage by
"remembering" periods that we've already compared. In our running
example, say we're computing

    "AAbAAbAAbA" in "bbbAbbAAbAAbAAbbbAAbAAbAAbAA"

We cut as AA + bAAbAAbA, and then the algorithm runs as follows:

    First alignment:
        bbbAbbAAbAAbAAbbbAAbAAbAAbAA
        AAbAAbAAbA
        ^^X
        - Mismatch at third position, so jump by 3.
        - This requires that A != b and AA != bA.

    Second alignment:
        bbbAbbAAbAAbAAbbbAAbAAbAAbAA
        AAbAAbAAbA
        ^^^^^^^^
        X
        - Matched entire right part.
        - Mismatch at left part.
        - Jump forward a period, remembering the existing comparisons.

    Third alignment:
        bbbAbbAAbAAbAAbbbAAbAAbAAbAA
        AAbAAbAAbA
        mmmmmmm^^X
        - There's "memory": a bunch of characters were already matched.
        - Two more characters match beyond that.
        - The 8th character of the right part mismatched, so jump by 8.
        - The above rule is more complicated than usual: we don't have
          the right inequalities for lengths 1 through 7, but we do have
          shifted copies of the length-1 and length-2 inequalities,
          along with knowledge of the mismatch. We can skip all of these
          alignments at once:

            bbbAbbAAbAAbAAbbbAAbAAbAAbAA
            AAbAAbAAbA
            ~ A != b at the cut
            bbbAbbAAbAAbAAbbbAAbAAbAAbAA
            AAbAAbAAbA
            ~~ AA != bA at the cut
            bbbAbbAAbAAbAAbbbAAbAAbAAbAA
            AAbAAbAAbA
            ^^^^X 7-3=4 match, and the 5th misses.
            bbbAbbAAbAAbAAbbbAAbAAbAAbAA
            AAbAAbAAbA
            ~ A != b at the cut
            bbbAbbAAbAAbAAbbbAAbAAbAAbAA
            AAbAAbAAbA
            ~~ AA != bA at the cut
            bbbAbbAAbAAbAAbbbAAbAAbAAbAA
            AAbAAbAAbA
            ^X 7-3-3=1 match and the 2nd misses.
            bbbAbbAAbAAbAAbbbAAbAAbAAbAA
            AAbAAbAAbA
            ~ A != b at the cut

    Fourth alignment:
        bbbAbbAAbAAbAAbbbAAbAAbAAbAA
        AAbAAbAAbA
        ^X
        - Second character mismatches, so jump by 2.

    Fifth alignment:
        bbbAbbAAbAAbAAbbbAAbAAbAAbAA
        AAbAAbAAbA
        ^^^^^^^^
        X
        - Right half matches, so use memory and skip ahead by period=3.

    Sixth alignment:
        bbbAbbAAbAAbAAbbbAAbAAbAAbAA
        AAbAAbAAbA
        mmmmmmmm^^
        - Right part matches, left part is remembered, found a match!

The one tricky skip by 8 here generalizes: if we have a period of p,
then the CFT says we can ensure the cut has the inequality property
for lengths 1 through p-1, and jumping by p would line up the
matching characters and mismatched character one period earlier.
Inductively, this proves that we can skip by the number of characters
matched in the right half, plus 1, just as in the original algorithm.

To make it explicit, the memory is set whenever the entire right part
is matched and is then used as a starting point in the next alignment.
In such a case, the alignment jumps forward one period, and the right
half matches all except possibly the last period. Additionally,
if we cut so that the left part has a length strictly less than the
period (we always can!), then we can know that the left part already
matches. The memory is reset to 0 whenever there is a mismatch in the
right part.
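Here is a rough Python sketch of the periodic case with this "memory"
(an illustration with made-up names, not the code Python runs; it
assumes cut < period and that the whole needle really has that period,
and it omits the shift-table trick):

    def two_way_search_periodic(haystack, needle, cut, period):
        n, m = len(haystack), len(needle)
        window = 0
        memory = 0                     # prefix of the window known to match
        while window <= n - m:
            # 1. match the right part, starting past anything remembered
            i = max(cut, memory)
            while i < m and needle[i] == haystack[window + i]:
                i += 1
            if i < m:
                window += i - cut + 1  # jump by the amount matched + 1
                memory = 0             # remembered prefix no longer lines up
                continue
            # 2. match the left part, skipping remembered characters
            j = memory
            while j < cut and needle[j] == haystack[window + j]:
                j += 1
            if j == cut:
                return window
            window += period           # shift by one period...
            memory = m - period        # ...so this much still matches
        return -1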

To prove linearity for the periodic case, note that if a right-part
character mismatches, then we advance forward 1 unit per comparison.
On the other hand, if the entire right part matches, then the skipping
forward by one period "defers" some of the comparisons to the next
alignment, where they will then be spent at the usual rate of
one comparison per step forward. Even if left-half comparisons
are always "wasted", they constitute less than half of all
comparisons, so the average rate is certainly at least 1 move forward
per 2 comparisons.


-------- When to choose the periodic algorithm --------

The periodic algorithm is always valid but has an overhead of one
more "memory" register and some memory computation steps, so the
here-described-first non-periodic/long-period algorithm -- skipping by
max(len(left_part), len(right_part)) + 1 rather than the period --
should be preferred when possible.

Interestingly, the long-period algorithm does not require an exact
computation of the period; it works even with some long-period, but
undeniably "periodic" needles:

    Cut: AbcdefAbc == Abcde + fAbc

This cut gives these inequalities:

        e != f
       de != fA
      cde != fAb
     bcde != fAbc
    Abcde != fAbc?

The first failure is a period long, per the CFT:

    ?Abcde == fAbc??

A sufficient condition for using the long-period algorithm is having
the period of the needle be greater than
max(len(left_part), len(right_part)). This way, after choosing a good
split, we get all of the max(len(left_part), len(right_part))
inequalities around the cut that were required in the long-period
version of the algorithm.

With all of this in mind, here's how we choose (a small Python sketch
of this decision follows the steps):

    (1) Choose a "critical factorization" of the needle -- a cut
        where we have period minus 1 inequalities in a row.
        More specifically, choose a cut so that the left_part
        is less than one period long.
    (2) Determine the period P_r of the right_part.
    (3) Check if the left part is just an extension of the pattern of
        the right part, so that the whole needle has period P_r.
        Explicitly, check if

            needle[0:cut] == needle[0+P_r:cut+P_r]

        If so, we use the periodic algorithm. If not equal, we use the
        long-period algorithm.
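A small, illustrative Python sketch of that decision, mirroring the
preprocessing done in fastsearch.h (the names are made up here, and
factorize() is sketched in the "Computing the cut" section below):

    def preprocess(needle):
        cut, period = factorize(needle)     # critical factorization, cut < period
        if needle[:cut] == needle[period:cut + period]:
            # The whole needle has this period: use the periodic algorithm.
            is_periodic = True
        else:
            # Long-period case: this lower bound on the period is all we need.
            is_periodic = False
            period = max(cut, len(needle) - cut) + 1
        return cut, period, is_periodic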

Note that if equality holds in (3), then the period of the whole
string is P_r. On the other hand, suppose equality does not hold.
The period of the needle is then strictly greater than P_r. Here's
a general fact:

    If p is a substring of s and p has period r, then the period
    of s is either equal to r or greater than len(p).

We know that needle_period != P_r, and therefore
needle_period > len(right_part). Additionally, we'll choose the cut
(see below) so that len(left_part) < needle_period.

Thus, in the case where equality does not hold, we have that
needle_period >= max(len(left_part), len(right_part)) + 1,
so the long-period algorithm works, but otherwise, we know the period
of the needle.

Note that this decision process doesn't always require an exact
computation of the period -- we can get away with only computing P_r!


-------- Computing the cut --------

Our remaining tasks are now to compute a cut of the needle with as
many inequalities as possible, ensuring that cut < needle_period.
Meanwhile, we must also compute the period P_r of the right_part.

The computation is relatively simple, essentially doing this:

    suffix1 = max(needle[i:] for i in range(len(needle)))
    suffix2 = ...  # the same as above, but invert the alphabet
    cut1 = len(needle) - len(suffix1)
    cut2 = len(needle) - len(suffix2)
    cut = max(cut1, cut2)  # the later cut

For cut2, "invert the alphabet" is different than saying min(...),
since in lexicographic order, we still put "py" < "python", even
if the alphabet is inverted. Computing these, along with the method
of computing the period of the right half, is easiest to read directly
from the source code in fastsearch.h, in which these are computed
in linear time.
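For reference, here is a rough Python transcription of those two
routines (the _lex_search and _factorize functions in fastsearch.h);
it is an illustration only, not the code Python actually runs:

    def lex_search(needle, invert_alphabet=False):
        """Return (cut, period): start of the lexicographically maximal
        suffix and the period of that suffix."""
        max_suffix, candidate, k, period = 0, 1, 0, 1
        while candidate + k < len(needle):
            a, b = needle[candidate + k], needle[max_suffix + k]
            if (b < a) if invert_alphabet else (a < b):
                # Fell short of the current best suffix.
                candidate += k + 1
                k = 0
                period = candidate - max_suffix
            elif a == b:
                if k + 1 != period:
                    k += 1            # keep scanning the equal strings
                else:
                    candidate += period
                    k = 0             # matched a whole period; start the next
            else:
                # Did better than the current best suffix, so replace it.
                max_suffix = candidate
                candidate += 1
                k = 0
                period = 1
        return max_suffix, period

    def factorize(needle):
        """Return (cut, period) for a critical factorization of needle."""
        cut1, period1 = lex_search(needle)
        cut2, period2 = lex_search(needle, invert_alphabet=True)
        # Take the later cut.
        return (cut1, period1) if cut1 > cut2 else (cut2, period2)

For the example in the code comments, factorize("GCAGAGAG") gives
(2, 2), i.e. the cut 'GC' + 'AGAGAG' with right-half period 2.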

Crochemore & Perrin's Theorem 3.1 gives that the "cut" above is a
critical factorization less than the period, but a very brief sketch
of their proof goes something like this (this is far from complete):

    * If this cut splits the needle as some
      needle == (a + w) + (w + b), meaning there's a bad equality
      w == w, it's impossible for w + b to be bigger than both
      b and w + w + b, so this can't happen. We thus have all of
      the inequalities with no question marks.
    * By maximality, the right part is not a substring of the left
      part. Thus, we have all of the inequalities involving no
      left-side question marks.
    * If you have all of the inequalities without right-side question
      marks, we have a critical factorization.
    * If one such inequality fails, then there's a smaller period,
      but the factorization is nonetheless critical. Here's where
      you need the redundancy coming from computing both cuts and
      choosing the later one.


-------- Some more Bells and Whistles --------

Beyond Crochemore & Perrin's original algorithm, we can use a couple
more tricks for speed in fastsearch.h:

    1. Even though C&P has a best-case O(n/m) time, this doesn't occur
       very often, so we add a Boyer-Moore bad character table to
       achieve sublinear time in more cases (a rough sketch of such a
       table follows this list).

    2. The prework of computing the cut/period is expensive per
       needle character, so we shouldn't do it if it won't pay off.
       For this reason, if the needle and haystack are long enough,
       we only automatically start with the two-way algorithm if the
       needle's length is a small percentage of the length of the
       haystack.

    3. In cases where the needle and haystack are large but the needle
       makes up a significant percentage of the length of the
       haystack, don't pay the expensive two-way preprocessing cost
       if you don't need to. Instead, keep track of how many
       character comparisons are equal, and if that exceeds
       O(len(needle)), then pay that cost, since the simpler algorithm
       isn't doing very well.
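A rough Python sketch of the compressed bad-character table from
point 1, mirroring the 64-entry, byte-sized table built in the
_preprocess function of fastsearch.h (names and the helper itself are
illustrative, not actual CPython API):

    def build_shift_table(needle, table_size=64, max_shift=254):
        # One bucket per (character code mod table_size). 255 means the
        # character does not occur in the needle at all, so the search
        # may skip past an entire window; real shifts are capped so
        # they fit in a single byte.
        NOT_FOUND = 255
        table = [NOT_FOUND] * table_size
        for i, ch in enumerate(needle):
            table[ord(ch) % table_size] = min(len(needle) - i, max_shift)
        return table

During the search, on a mismatch the character just past the current
window indexes this table (Sunday's trick in the C code above); a
value of 255 lets the window jump past that character entirely.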