bpo-41972: Use the two-way algorithm for string searching (GH-22904)

Implement an enhanced variant of Crochemore and Perrin's Two-Way string searching algorithm, which reduces worst-case time from quadratic (the product of the string and pattern lengths) to linear. This applies to forward searches (like ``find``, ``index``, ``replace``); the algorithm for reverse searches (like ``rfind``) is not changed.

Co-authored-by: Tim Peters <tim.peters@gmail.com>
This commit is contained in:
Dennis Sweeney 2021-02-28 13:20:50 -05:00 committed by GitHub
parent 2183d06bc8
commit 73a85c4e1d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 941 additions and 20 deletions

View File

@ -5,6 +5,7 @@ Common tests shared by test_unicode, test_userstring and test_bytes.
import unittest, string, sys, struct import unittest, string, sys, struct
from test import support from test import support
from collections import UserList from collections import UserList
import random
class Sequence: class Sequence:
def __init__(self, seq='wxyz'): self.seq = seq def __init__(self, seq='wxyz'): self.seq = seq
@ -317,6 +318,44 @@ class BaseTest:
else: else:
self.checkraises(TypeError, 'hello', 'rindex', 42) self.checkraises(TypeError, 'hello', 'rindex', 42)
def test_find_periodic_pattern(self):
    """Cover the special path for periodic patterns."""
    def naive_find(pattern, s):
        # Brute-force reference: first index where pattern occurs, else -1.
        return next((i for i in range(len(s))
                     if s.startswith(pattern, i)), -1)
    randrange = random.randrange
    pick = random.choices
    for _ in range(1000):
        # Build a highly periodic needle, then clip its tail a bit.
        base = ''.join(pick('abcde', k=randrange(10))) * randrange(10, 20)
        needle = base[:len(base) - randrange(10)]  # pop off some characters
        prefix = ''.join(pick('abcdef', k=randrange(2000)))
        suffix = ''.join(pick('abcdef', k=randrange(2000)))
        haystack = prefix + needle + suffix
        with self.subTest(p=needle, text=haystack):
            self.checkequal(naive_find(needle, haystack),
                            haystack, 'find', needle)
def test_find_shift_table_overflow(self):
    """When the table of 8-bit shifts overflows."""
    n = 2**8 + 100

    # First check the periodic case:
    # here, the shift for 'b' is n + 1.
    periodic_needle = 'a' * n + 'b' + 'a' * n
    periodic_hay = 'babbaa' * n + periodic_needle
    expected = len(periodic_hay) - len(periodic_needle)
    self.checkequal(expected, periodic_hay, 'find', periodic_needle)

    # Now check the non-periodic case:
    # here, the shift for 'd' is 3*(n+1)+1.
    plain_needle = 'ddd' + 'abc' * n + "eee"
    plain_hay = plain_needle[:-1] + "ddeede" * 2 * n + plain_needle + "de" * n
    expected = len(plain_hay) - n * len("de") - len(plain_needle)
    self.checkequal(expected, plain_hay, 'find', plain_needle)
def test_lower(self): def test_lower(self):
self.checkequal('hello', 'HeLLo', 'lower') self.checkequal('hello', 'HeLLo', 'lower')
self.checkequal('hello', 'hello', 'lower') self.checkequal('hello', 'hello', 'lower')

View File

@ -0,0 +1 @@
Substring search functions such as ``str1 in str2`` and ``str2.find(str1)`` now sometimes use the "Two-Way" string comparison algorithm to avoid quadratic behavior on long strings.

View File

@ -9,10 +9,16 @@
/* note: fastsearch may access s[n], which isn't a problem when using /* note: fastsearch may access s[n], which isn't a problem when using
Python's ordinary string types, but may cause problems if you're Python's ordinary string types, but may cause problems if you're
using this code in other contexts. also, the count mode returns -1 using this code in other contexts. also, the count mode returns -1
if there cannot possible be a match in the target string, and 0 if if there cannot possibly be a match in the target string, and 0 if
it has actually checked for matches, but didn't find any. callers it has actually checked for matches, but didn't find any. callers
beware! */ beware! */
/* If the strings are long enough, use Crochemore and Perrin's Two-Way
algorithm, which has worst-case O(n) runtime and best-case O(n/k).
Also compute a table of shifts to achieve O(n/k) in more cases,
and often (data dependent) deduce larger shifts than pure C&P can
deduce. */
#define FAST_COUNT 0 #define FAST_COUNT 0
#define FAST_SEARCH 1 #define FAST_SEARCH 1
#define FAST_RSEARCH 2 #define FAST_RSEARCH 2
@ -160,6 +166,353 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
#undef MEMCHR_CUT_OFF #undef MEMCHR_CUT_OFF
/* Change to a 1 to see logging comments walk through the algorithm. */
/* Tracing is restricted to 1-byte strings so the %s / %.*s printf
   formats below are valid; for wider character widths both macros
   compile away to nothing. */
#if 0 && STRINGLIB_SIZEOF_CHAR == 1
# define LOG(...) printf(__VA_ARGS__)
# define LOG_STRING(s, n) printf("\"%.*s\"", n, s)
#else
# define LOG(...)
# define LOG_STRING(s, n)
#endif
Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle,
                       Py_ssize_t *return_period, int invert_alphabet)
{
    /* Do a lexicographic search. Essentially this:
           >>> max(needle[i:] for i in range(len(needle)+1))
       Also find the period of the right half.

       If invert_alphabet is nonzero, the character ordering is
       reversed for the comparisons (length-based tie-breaking is
       unchanged).  Returns the start index of the maximal suffix and
       stores that suffix's period through *return_period. */
    Py_ssize_t max_suffix = 0;   // start of the best (maximal) suffix so far
    Py_ssize_t candidate = 1;    // start of the suffix being compared to it
    Py_ssize_t k = 0;            // length of the current common prefix
    // The period of the right half.
    Py_ssize_t period = 1;
    while (candidate + k < len_needle) {
        // each loop increases candidate + k + max_suffix
        STRINGLIB_CHAR a = needle[candidate + k];
        STRINGLIB_CHAR b = needle[max_suffix + k];
        // check if the suffix at candidate is better than max_suffix
        if (invert_alphabet ? (b < a) : (a < b)) {
            // Fell short of max_suffix.
            // The next k + 1 characters are non-increasing
            // from candidate, so they won't start a maximal suffix.
            candidate += k + 1;
            k = 0;
            // We've ruled out any period smaller than what's
            // been scanned since max_suffix.
            period = candidate - max_suffix;
        }
        else if (a == b) {
            if (k + 1 != period) {
                // Keep scanning the equal strings
                k++;
            }
            else {
                // Matched a whole period.
                // Start matching the next period.
                candidate += period;
                k = 0;
            }
        }
        else {
            // Did better than max_suffix, so replace it.
            max_suffix = candidate;
            candidate++;
            k = 0;
            period = 1;
        }
    }
    *return_period = period;
    return max_suffix;
}
Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle,
                      Py_ssize_t len_needle,
                      Py_ssize_t *return_period)
{
    /* Do a "critical factorization", making it so that:
       >>> needle = (left := needle[:cut]) + (right := needle[cut:])
       where the "local period" of the cut is maximal.

       The local period of the cut is the minimal length of a string w
       such that (left endswith w or w endswith left)
       and (right startswith w or w startswith right).

       The Critical Factorization Theorem says that this maximal local
       period is the global period of the string.

       Crochemore and Perrin (1991) show that this cut can be computed
       as the later of two cuts: one that gives a lexicographically
       maximal right half, and one that gives the same with
       respect to a reversed alphabet-ordering.

       This is what we want to happen:
           >>> x = "GCAGAGAG"
           >>> cut, period = factorize(x)
           >>> x[:cut], (right := x[cut:])
           ('GC', 'AGAGAG')
           >>> period  # right half period
           2
           >>> right[period:] == right[:-period]
           True

       This is how the local period lines up in the above example:
                GC | AGAGAG
           AGAGAGC = AGAGAGC
       The length of this minimal repetition is 7, which is indeed the
       period of the original string. */
    Py_ssize_t cut1, period1, cut2, period2, cut, period;
    // Maximal suffix under the normal alphabet ordering ...
    cut1 = STRINGLIB(_lex_search)(needle, len_needle, &period1, 0);
    // ... and under the inverted ordering.
    cut2 = STRINGLIB(_lex_search)(needle, len_needle, &period2, 1);
    // Take the later cut.
    if (cut1 > cut2) {
        period = period1;
        cut = cut1;
    }
    else {
        period = period2;
        cut = cut2;
    }
    LOG("split: "); LOG_STRING(needle, cut);
    LOG(" + "); LOG_STRING(needle + cut, len_needle - cut);
    LOG("\n");
    *return_period = period;
    return cut;
}
#define SHIFT_TYPE uint8_t   /* element type of the bad-character table */
#define NOT_FOUND ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U)  /* 255: char absent */
#define SHIFT_OVERFLOW (NOT_FOUND - 1U)  /* 254: true shift won't fit in 8 bits */
#define TABLE_SIZE_BITS 6
#define TABLE_SIZE (1U << TABLE_SIZE_BITS)  /* 64 buckets */
#define TABLE_MASK (TABLE_SIZE - 1U)        /* hash a char by its low 6 bits */

/* Preprocessed needle data shared by the Two-Way search routines. */
typedef struct STRINGLIB(_pre) {
    const STRINGLIB_CHAR *needle;
    Py_ssize_t len_needle;
    Py_ssize_t cut;                 // index of the critical factorization
    Py_ssize_t period;              // period (or a lower bound on it)
    int is_periodic;                // whole needle has period `period`?
    SHIFT_TYPE table[TABLE_SIZE];   // Sunday/Boyer-Moore style shifts
} STRINGLIB(prework);
Py_LOCAL_INLINE(void)
STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle,
                       STRINGLIB(prework) *p)
{
    /* Fill in *p for `needle`: compute the critical factorization,
       decide whether the needle is periodic, and build the
       bad-character shift table used for Sunday-style skips. */
    p->needle = needle;
    p->len_needle = len_needle;
    p->cut = STRINGLIB(_factorize)(needle, len_needle, &(p->period));
    assert(p->period + p->cut <= len_needle);
    // Periodic iff the left part equals the text one period later,
    // i.e. the whole needle repeats with period p->period.
    p->is_periodic = (0 == memcmp(needle,
                                  needle + p->period,
                                  p->cut * STRINGLIB_SIZEOF_CHAR));
    if (p->is_periodic) {
        assert(p->cut <= len_needle/2);
        assert(p->cut < p->period);
    }
    else {
        // A lower bound on the period
        p->period = Py_MAX(p->cut, len_needle - p->cut) + 1;
    }
    // Now fill up a table
    memset(&(p->table[0]), 0xff, TABLE_SIZE*sizeof(SHIFT_TYPE));
    assert(p->table[0] == NOT_FOUND);
    assert(p->table[TABLE_MASK] == NOT_FOUND);
    for (Py_ssize_t i = 0; i < len_needle; i++) {
        // Distance from this (bucketed) character to just past the end
        // of the needle; capped so the value fits in SHIFT_TYPE and
        // stays distinct from the NOT_FOUND sentinel.
        Py_ssize_t shift = len_needle - i;
        if (shift > SHIFT_OVERFLOW) {
            shift = SHIFT_OVERFLOW;
        }
        p->table[needle[i] & TABLE_MASK] = Py_SAFE_DOWNCAST(shift,
                                                            Py_ssize_t,
                                                            SHIFT_TYPE);
    }
}
Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
                    STRINGLIB(prework) *p)
{
    // Crochemore and Perrin's (1991) Two-Way algorithm.
    // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
    // Returns the index of the first match of the preprocessed needle
    // in haystack, or -1 if there is no match.
    Py_ssize_t len_needle = p->len_needle;
    Py_ssize_t cut = p->cut;
    Py_ssize_t period = p->period;
    const STRINGLIB_CHAR *needle = p->needle;
    const STRINGLIB_CHAR *window = haystack;    // current alignment attempt
    const STRINGLIB_CHAR *last_window = haystack + len_haystack - len_needle;
    SHIFT_TYPE *table = p->table;
    LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack);
    if (p->is_periodic) {
        LOG("Needle is periodic.\n");
        // Count of leading needle characters already known to match at
        // the current alignment (carried across period-sized jumps so
        // comparisons are never repeated).
        Py_ssize_t memory = 0;
      periodicwindowloop:
        while (window <= last_window) {
            // Resume comparisons past whatever memory guarantees.
            Py_ssize_t i = Py_MAX(cut, memory);
            // Visualize the line-up:
            LOG("> "); LOG_STRING(haystack, len_haystack);
            LOG("\n> "); LOG("%*s", window - haystack, "");
            LOG_STRING(needle, len_needle);
            LOG("\n> "); LOG("%*s", window - haystack + i, "");
            LOG(" ^ <-- cut\n");
            if (window[i] != needle[i]) {
                // Sunday's trick: if we're going to jump, we might
                // as well jump to line up the character *after* the
                // current window.
                STRINGLIB_CHAR first_outside = window[len_needle];
                SHIFT_TYPE shift = table[first_outside & TABLE_MASK];
                if (shift == NOT_FOUND) {
                    LOG("\"%c\" not found. Skipping entirely.\n",
                        first_outside);
                    window += len_needle + 1;
                }
                else {
                    LOG("Shifting to line up \"%c\".\n", first_outside);
                    // Never jump less than the amount matched plus one.
                    Py_ssize_t memory_shift = i - cut + 1;
                    window += Py_MAX(shift, memory_shift);
                }
                memory = 0;
                goto periodicwindowloop;
            }
            // Match the rest of the right part, left to right.
            for (i = i + 1; i < len_needle; i++) {
                if (needle[i] != window[i]) {
                    LOG("Right half does not match. Jump ahead by %d.\n",
                        i - cut + 1);
                    window += i - cut + 1;
                    memory = 0;
                    goto periodicwindowloop;
                }
            }
            // Right part matched; check the (unremembered part of the)
            // left part.
            for (i = memory; i < cut; i++) {
                if (needle[i] != window[i]) {
                    LOG("Left half does not match. Jump ahead by period %d.\n",
                        period);
                    window += period;
                    // After a period jump, all but the last period of
                    // the needle is already known to match.
                    memory = len_needle - period;
                    goto periodicwindowloop;
                }
            }
            LOG("Left half matches. Returning %d.\n",
                window - haystack);
            return window - haystack;
        }
    }
    else {
        LOG("Needle is not periodic.\n");
        assert(cut < len_needle);
        // Hoist the first right-part character; it's tested every loop.
        STRINGLIB_CHAR needle_cut = needle[cut];
      windowloop:
        while (window <= last_window) {
            // Visualize the line-up:
            LOG("> "); LOG_STRING(haystack, len_haystack);
            LOG("\n> "); LOG("%*s", window - haystack, "");
            LOG_STRING(needle, len_needle);
            LOG("\n> "); LOG("%*s", window - haystack + cut, "");
            LOG(" ^ <-- cut\n");
            if (window[cut] != needle_cut) {
                // Sunday's trick: if we're going to jump, we might
                // as well jump to line up the character *after* the
                // current window.
                STRINGLIB_CHAR first_outside = window[len_needle];
                SHIFT_TYPE shift = table[first_outside & TABLE_MASK];
                if (shift == NOT_FOUND) {
                    LOG("\"%c\" not found. Skipping entirely.\n",
                        first_outside);
                    window += len_needle + 1;
                }
                else {
                    LOG("Shifting to line up \"%c\".\n", first_outside);
                    window += shift;
                }
                goto windowloop;
            }
            // Match the rest of the right part, left to right.
            for (Py_ssize_t i = cut + 1; i < len_needle; i++) {
                if (needle[i] != window[i]) {
                    LOG("Right half does not match. Advance by %d.\n",
                        i - cut + 1);
                    window += i - cut + 1;
                    goto windowloop;
                }
            }
            // Right part matched; now verify the left part.  No memory
            // here: a left-part mismatch jumps a full (lower bound on
            // the) period.
            for (Py_ssize_t i = 0; i < cut; i++) {
                if (needle[i] != window[i]) {
                    LOG("Left half does not match. Advance by period %d.\n",
                        period);
                    window += period;
                    goto windowloop;
                }
            }
            LOG("Left half matches. Returning %d.\n", window - haystack);
            return window - haystack;
        }
    }
    LOG("Not found. Returning -1.\n");
    return -1;
}
Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(_two_way_find)(const STRINGLIB_CHAR *haystack,
                         Py_ssize_t len_haystack,
                         const STRINGLIB_CHAR *needle,
                         Py_ssize_t len_needle)
{
    /* One-shot find: preprocess the needle, then run a single Two-Way
       scan over the whole haystack.  Returns the match index or -1. */
    LOG("###### Finding \"%s\" in \"%s\".\n", needle, haystack);
    STRINGLIB(prework) prework;
    STRINGLIB(_preprocess)(needle, len_needle, &prework);
    Py_ssize_t result = STRINGLIB(_two_way)(haystack, len_haystack, &prework);
    return result;
}
Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(_two_way_count)(const STRINGLIB_CHAR *haystack,
                          Py_ssize_t len_haystack,
                          const STRINGLIB_CHAR *needle,
                          Py_ssize_t len_needle,
                          Py_ssize_t maxcount)
{
    /* Count non-overlapping occurrences of `needle` in `haystack`,
       stopping early once `maxcount` matches have been found.
       The needle is preprocessed once; the Two-Way scan is restarted
       just past each match, so the total work stays linear. */
    LOG("###### Counting \"%s\" in \"%s\".\n", needle, haystack);
    STRINGLIB(prework) p;
    STRINGLIB(_preprocess)(needle, len_needle, &p);
    Py_ssize_t index = 0, count = 0;
    while (1) {
        Py_ssize_t result;
        result = STRINGLIB(_two_way)(haystack + index,
                                     len_haystack - index, &p);
        if (result == -1) {
            return count;
        }
        count++;
        if (count == maxcount) {
            return maxcount;
        }
        // Skip past this whole match: occurrences may not overlap.
        index += result + len_needle;
    }
    // No trailing return: the loop can only exit via the returns above
    // (the old `return count;` here was unreachable dead code).
}
/* Scope cleanup: the helper macros above are private to the Two-Way
   code, so undefine them here to keep them out of the rest of the file. */
#undef SHIFT_TYPE
#undef NOT_FOUND
#undef SHIFT_OVERFLOW
#undef TABLE_SIZE_BITS
#undef TABLE_SIZE
#undef TABLE_MASK

#undef LOG
#undef LOG_STRING
Py_LOCAL_INLINE(Py_ssize_t) Py_LOCAL_INLINE(Py_ssize_t)
FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
const STRINGLIB_CHAR* p, Py_ssize_t m, const STRINGLIB_CHAR* p, Py_ssize_t m,
@ -195,10 +548,22 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
} }
mlast = m - 1; mlast = m - 1;
skip = mlast - 1; skip = mlast;
mask = 0; mask = 0;
if (mode != FAST_RSEARCH) { if (mode != FAST_RSEARCH) {
if (m >= 100 && w >= 2000 && w / m >= 5) {
/* For larger problems where the needle isn't a huge
percentage of the size of the haystack, the relatively
expensive O(m) startup cost of the two-way algorithm
will surely pay off. */
if (mode == FAST_SEARCH) {
return STRINGLIB(_two_way_find)(s, n, p, m);
}
else {
return STRINGLIB(_two_way_count)(s, n, p, m, maxcount);
}
}
const STRINGLIB_CHAR *ss = s + m - 1; const STRINGLIB_CHAR *ss = s + m - 1;
const STRINGLIB_CHAR *pp = p + m - 1; const STRINGLIB_CHAR *pp = p + m - 1;
@ -207,41 +572,118 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
/* process pattern[:-1] */ /* process pattern[:-1] */
for (i = 0; i < mlast; i++) { for (i = 0; i < mlast; i++) {
STRINGLIB_BLOOM_ADD(mask, p[i]); STRINGLIB_BLOOM_ADD(mask, p[i]);
if (p[i] == p[mlast]) if (p[i] == p[mlast]) {
skip = mlast - i - 1; skip = mlast - i - 1;
}
} }
/* process pattern[-1] outside the loop */ /* process pattern[-1] outside the loop */
STRINGLIB_BLOOM_ADD(mask, p[mlast]); STRINGLIB_BLOOM_ADD(mask, p[mlast]);
if (m >= 100 && w >= 8000) {
/* To ensure that we have good worst-case behavior,
here's an adaptive version of the algorithm, where if
we match O(m) characters without any matches of the
entire needle, then we predict that the startup cost of
the two-way algorithm will probably be worth it. */
Py_ssize_t hits = 0;
for (i = 0; i <= w; i++) {
if (ss[i] == pp[0]) {
/* candidate match */
for (j = 0; j < mlast; j++) {
if (s[i+j] != p[j]) {
break;
}
}
if (j == mlast) {
/* got a match! */
if (mode != FAST_COUNT) {
return i;
}
count++;
if (count == maxcount) {
return maxcount;
}
i = i + mlast;
continue;
}
/* miss: check if next character is part of pattern */
if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
i = i + m;
}
else {
i = i + skip;
}
hits += j + 1;
if (hits >= m / 4 && i < w - 1000) {
/* We've done O(m) fruitless comparisons
anyway, so spend the O(m) cost on the
setup for the two-way algorithm. */
Py_ssize_t res;
if (mode == FAST_COUNT) {
res = STRINGLIB(_two_way_count)(
s+i, n-i, p, m, maxcount-count);
return count + res;
}
else {
res = STRINGLIB(_two_way_find)(s+i, n-i, p, m);
if (res == -1) {
return -1;
}
return i + res;
}
}
}
else {
/* skip: check if next character is part of pattern */
if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
i = i + m;
}
}
}
if (mode != FAST_COUNT) {
return -1;
}
return count;
}
/* The standard, non-adaptive version of the algorithm. */
for (i = 0; i <= w; i++) { for (i = 0; i <= w; i++) {
/* note: using mlast in the skip path slows things down on x86 */ /* note: using mlast in the skip path slows things down on x86 */
if (ss[i] == pp[0]) { if (ss[i] == pp[0]) {
/* candidate match */ /* candidate match */
for (j = 0; j < mlast; j++) for (j = 0; j < mlast; j++) {
if (s[i+j] != p[j]) if (s[i+j] != p[j]) {
break; break;
}
}
if (j == mlast) { if (j == mlast) {
/* got a match! */ /* got a match! */
if (mode != FAST_COUNT) if (mode != FAST_COUNT) {
return i; return i;
}
count++; count++;
if (count == maxcount) if (count == maxcount) {
return maxcount; return maxcount;
}
i = i + mlast; i = i + mlast;
continue; continue;
} }
/* miss: check if next character is part of pattern */ /* miss: check if next character is part of pattern */
if (!STRINGLIB_BLOOM(mask, ss[i+1])) if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
i = i + m; i = i + m;
else }
else {
i = i + skip; i = i + skip;
} else { }
}
else {
/* skip: check if next character is part of pattern */ /* skip: check if next character is part of pattern */
if (!STRINGLIB_BLOOM(mask, ss[i+1])) if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
i = i + m; i = i + m;
}
} }
} }
} else { /* FAST_RSEARCH */ }
else { /* FAST_RSEARCH */
/* create compressed boyer-moore delta 1 table */ /* create compressed boyer-moore delta 1 table */
@ -250,28 +692,36 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
/* process pattern[:0:-1] */ /* process pattern[:0:-1] */
for (i = mlast; i > 0; i--) { for (i = mlast; i > 0; i--) {
STRINGLIB_BLOOM_ADD(mask, p[i]); STRINGLIB_BLOOM_ADD(mask, p[i]);
if (p[i] == p[0]) if (p[i] == p[0]) {
skip = i - 1; skip = i - 1;
}
} }
for (i = w; i >= 0; i--) { for (i = w; i >= 0; i--) {
if (s[i] == p[0]) { if (s[i] == p[0]) {
/* candidate match */ /* candidate match */
for (j = mlast; j > 0; j--) for (j = mlast; j > 0; j--) {
if (s[i+j] != p[j]) if (s[i+j] != p[j]) {
break; break;
if (j == 0) }
}
if (j == 0) {
/* got a match! */ /* got a match! */
return i; return i;
}
/* miss: check if previous character is part of pattern */ /* miss: check if previous character is part of pattern */
if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) {
i = i - m; i = i - m;
else }
else {
i = i - skip; i = i - skip;
} else { }
}
else {
/* skip: check if previous character is part of pattern */ /* skip: check if previous character is part of pattern */
if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) {
i = i - m; i = i - m;
}
} }
} }
} }

View File

@ -0,0 +1,431 @@
This document explains Crochemore and Perrin's Two-Way string matching
algorithm, in which a smaller string (the "pattern" or "needle")
is searched for in a longer string (the "text" or "haystack"),
determining whether the needle is a substring of the haystack, and if
so, at what index(es). It is to be used by Python's string
(and bytes-like) objects when calling `find`, `index`, `__contains__`,
or implicitly in methods like `replace` or `partition`.
This is essentially a re-telling of the paper
Crochemore M., Perrin D., 1991, Two-way string-matching,
Journal of the ACM 38(3):651-675.
focused more on understanding and examples than on rigor. See also
the code sample here:
http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
The algorithm runs in O(len(needle) + len(haystack)) time and with
O(1) space. However, since there is a larger preprocessing cost than
simpler algorithms, this Two-Way algorithm is to be used only when the
needle and haystack lengths meet certain thresholds.
These are the basic steps of the algorithm:
* "Very carefully" cut the needle in two.
* For each alignment attempted:
1. match the right part
* On failure, jump by the amount matched + 1
2. then match the left part.
* On failure jump by max(len(left), len(right)) + 1
* If the needle is periodic, don't re-do comparisons; maintain
a "memory" of how many characters you already know match.
-------- Matching the right part --------
We first scan the right part of the needle to check if it matches
the aligned characters in the haystack. We scan left-to-right,
and if a mismatch occurs, we jump ahead by the amount matched plus 1.
Example:
text: ........EFGX...................
pattern: ....abcdEFGH....
cut: <<<<>>>>
Matched 3, so jump ahead by 4:
text: ........EFGX...................
pattern: ....abcdEFGH....
cut: <<<<>>>>
Why are we allowed to do this? Because we cut the needle very
carefully, in such a way that if the cut is ...abcd + EFGH... then
we have
d != E
cd != EF
bcd != EFG
abcd != EFGH
... and so on.
If this is true for every pair of equal-length substrings around the
cut, then the following alignments do not work, so we can skip them:
text: ........EFG....................
pattern: ....abcdEFGH....
^ (Bad because d != E)
text: ........EFG....................
pattern: ....abcdEFGH....
^^ (Bad because cd != EF)
text: ........EFG....................
pattern: ....abcdEFGH....
^^^ (Bad because bcd != EFG)
Skip 3 alignments => increment alignment by 4.
-------- If len(left_part) < len(right_part) --------
Above is the core idea, and it begins to suggest how the algorithm can
be linear-time. There is one bit of subtlety involving what to do
around the end of the needle: if the left half is shorter than the
right, then we could run into something like this:
text: .....EFG......
pattern: cdEFGH
The same argument holds that we can skip ahead by 4, so long as
d != E
cd != EF
?cd != EFG
??cd != EFGH
etc.
The question marks represent "wildcards" that always match; they're
outside the limits of the needle, so there's no way for them to
invalidate a match. To ensure that the inequalities above are always
true, we need them to be true for all possible '?' values. We thus
need cd != FG and cd != GH, etc.
-------- Matching the left part --------
Once we have ensured the right part matches, we scan the left part
(order doesn't matter, but traditionally right-to-left), and if we
find a mismatch, we jump ahead by
max(len(left_part), len(right_part)) + 1. That we can jump by
at least len(right_part) + 1 we have already seen:
text: .....EFG.....
pattern: abcdEFG
Matched 3, so jump by 4,
using the fact that d != E, cd != EF, and bcd != EFG.
But we can also jump by at least len(left_part) + 1:
text: ....cdEF.....
pattern: abcdEF
Jump by len('abcd') + 1 = 5.
Skip the alignments:
text: ....cdEF.....
pattern: abcdEF
text: ....cdEF.....
pattern: abcdEF
text: ....cdEF.....
pattern: abcdEF
text: ....cdEF.....
pattern: abcdEF
This requires the following facts:
d != E
cd != EF
bcd != EF?
abcd != EF??
etc., for all values of ?s, as above.
If we have both sets of inequalities, then we can indeed jump by
max(len(left_part), len(right_part)) + 1. Under the assumption of such
a nice splitting of the needle, we now have enough to prove linear
time for the search: consider the forward-progress/comparisons ratio
at each alignment position. If a mismatch occurs in the right part,
the ratio is 1 position forward per comparison. On the other hand,
if a mismatch occurs in the left half, we advance by more than
len(needle)//2 positions for at most len(needle) comparisons,
so this ratio is more than 1/2. This average "movement speed" is
bounded below by the constant "1 position per 2 comparisons", so we
have linear time.
-------- The periodic case --------
The sets of inequalities listed so far seem too good to be true in
the general case. Indeed, they fail when a needle is periodic:
there's no way to split 'AAbAAbAAbA' in two such that
(the stuff n characters to the left of the split)
cannot equal
(the stuff n characters to the right of the split)
for all n.
This is because no matter how you cut it, you'll get
s[cut-3:cut] == s[cut:cut+3]. So what do we do? We still cut the
needle in two so that n can be as big as possible. If we were to
split it as
AAbA + AbAAbA
then A == A at the split, so this is bad (we failed at length 1), but
if we split it as
AA + bAAbAAbA
we at least have A != b and AA != bA, and we fail at length 3
since ?AA == bAA. We already knew that a cut to make length-3
mismatch was impossible due to the period, but we now see that the
bound is sharp; we can get length-1 and length-2 to mismatch.
This is exactly the content of the *critical factorization theorem*:
that no matter the period of the original needle, you can cut it in
such a way that (with the appropriate question marks),
needle[cut-k:cut] mismatches needle[cut:cut+k] for all k < the period.
Even "non-periodic" strings are periodic with a period equal to
their length, so for such needles, the CFT already guarantees that
the algorithm described so far will work, since we can cut the needle
so that the length-k chunks on either side of the cut mismatch for all
k < len(needle). Looking closer at the algorithm, we only actually
require that k go up to max(len(left_part), len(right_part)).
So long as the period exceeds that, we're good.
The more general shorter-period case is a bit harder. The essentials
are the same, except we use the periodicity to our advantage by
"remembering" periods that we've already compared. In our running
example, say we're computing
"AAbAAbAAbA" in "bbbAbbAAbAAbAAbbbAAbAAbAAbAA".
We cut as AA + bAAbAAbA, and then the algorithm runs as follows:
First alignment:
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
AAbAAbAAbA
^^X
- Mismatch at third position, so jump by 3.
- This requires that A!=b and AA != bA.
Second alignment:
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
AAbAAbAAbA
^^^^^^^^
X
- Matched entire right part
- Mismatch at left part.
- Jump forward a period, remembering the existing comparisons
Third alignment:
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
AAbAAbAAbA
mmmmmmm^^X
- There's "memory": a bunch of characters were already matched.
- Two more characters match beyond that.
- The 8th character of the right part mismatched, so jump by 8
- The above rule is more complicated than usual: we don't have
the right inequalities for lengths 1 through 7, but we do have
shifted copies of the length-1 and length-2 inequalities,
along with knowledge of the mismatch. We can skip all of these
alignments at once:
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
AAbAAbAAbA
~ A != b at the cut
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
AAbAAbAAbA
~~ AA != bA at the cut
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
AAbAAbAAbA
^^^^X 7-3=4 match, and the 5th misses.
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
AAbAAbAAbA
~ A != b at the cut
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
AAbAAbAAbA
~~ AA != bA at the cut
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
AAbAAbAAbA
^X 7-3-3=1 match and the 2nd misses.
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
AAbAAbAAbA
~ A != b at the cut
Fourth alignment:
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
AAbAAbAAbA
^X
- Second character mismatches, so jump by 2.
Fifth alignment:
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
AAbAAbAAbA
^^^^^^^^
X
- Right half matches, so use memory and skip ahead by period=3
Sixth alignment:
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
AAbAAbAAbA
mmmmmmmm^^
- Right part matches, left part is remembered, found a match!
The one tricky skip by 8 here generalizes: if we have a period of p,
then the CFT says we can ensure the cut has the inequality property
for lengths 1 through p-1, and jumping by p would line up the
matching characters and mismatched character one period earlier.
Inductively, this proves that we can skip by the number of characters
matched in the right half, plus 1, just as in the original algorithm.
To make it explicit, the memory is set whenever the entire right part
is matched and is then used as a starting point in the next alignment.
In such a case, the alignment jumps forward one period, and the right
half matches all except possibly the last period. Additionally,
if we cut so that the left part has a length strictly less than the
period (we always can!), then we can know that the left part already
matches. The memory is reset to 0 whenever there is a mismatch in the
right part.
To prove linearity for the periodic case, note that if a right-part
character mismatches, then we advance forward 1 unit per comparison.
On the other hand, if the entire right part matches, then the skipping
forward by one period "defers" some of the comparisons to the next
alignment, where they will then be spent at the usual rate of
one comparison per step forward. Even if left-half comparisons
are always "wasted", they constitute less than half of all
comparisons, so the average rate is certainly at least 1 move forward
per 2 comparisons.
-------- When to choose the periodic algorithm ---------
The periodic algorithm is always valid but has an overhead of one
more "memory" register and some memory computation steps, so the
here-described-first non-periodic/long-period algorithm -- skipping by
max(len(left_part), len(right_part)) + 1 rather than the period --
should be preferred when possible.
Interestingly, the long-period algorithm does not require an exact
computation of the period; it works even with some long-period, but
undeniably "periodic" needles:
Cut: AbcdefAbc == Abcde + fAbc
This cut gives these inequalities:
e != f
de != fA
cde != fAb
bcde != fAbc
Abcde != fAbc?
The first failure is a period long, per the CFT:
?Abcde == fAbc??
A sufficient condition for using the long-period algorithm is having
the period of the needle be greater than
max(len(left_part), len(right_part)). This way, after choosing a good
split, we get all of the max(len(left_part), len(right_part))
inequalities around the cut that were required in the long-period
version of the algorithm.
With all of this in mind, here's how we choose:
(1) Choose a "critical factorization" of the needle -- a cut
where we have period minus 1 inequalities in a row.
More specifically, choose a cut so that the left_part
is less than one period long.
(2) Determine the period P_r of the right_part.
(3) Check if the left part is just an extension of the pattern of
the right part, so that the whole needle has period P_r.
Explicitly, check if
needle[0:cut] == needle[0+P_r:cut+P_r]
If so, we use the periodic algorithm. If not equal, we use the
long-period algorithm.
Note that if equality holds in (3), then the period of the whole
string is P_r. On the other hand, suppose equality does not hold.
The period of the needle is then strictly greater than P_r. Here's
a general fact:
If p is a substring of s and p has period r, then the period
of s is either equal to r or greater than len(p).
We know that needle_period != P_r,
and therefore needle_period > len(right_part).
Additionally, we'll choose the cut (see below)
so that len(left_part) < needle_period.
Thus, in the case where equality does not hold, we have that
needle_period >= max(len(left_part), len(right_part)) + 1,
so the long-period algorithm works, but otherwise, we know the period
of the needle.
Note that this decision process doesn't always require an exact
computation of the period -- we can get away with only computing P_r!
-------- Computing the cut --------
Our remaining tasks are now to compute a cut of the needle with as
many inequalities as possible, ensuring that cut < needle_period.
Meanwhile, we must also compute the period P_r of the right_part.
The computation is relatively simple, essentially doing this:
suffix1 = max(needle[i:] for i in range(len(needle)))
suffix2 = ... # the same as above, but invert the alphabet
cut1 = len(needle) - len(suffix1)
cut2 = len(needle) - len(suffix2)
cut = max(cut1, cut2) # the later cut
For cut2, "invert the alphabet" is different than saying min(...),
since in lexicographic order, we still put "py" < "python", even
if the alphabet is inverted. Computing these, along with the method
of computing the period of the right half, is easiest to read directly
from the source code in fastsearch.h, in which these are computed
in linear time.
Crochemore & Perrin's Theorem 3.1 gives that "cut" above is a
critical factorization less than the period, but a very brief sketch
of their proof goes something like this (this is far from complete):
* If this cut splits the needle as some
needle == (a + w) + (w + b), meaning there's a bad equality
w == w, it's impossible for w + b to be bigger than both
b and w + w + b, so this can't happen. We thus have all of
the inequalities with no question marks.
* By maximality, the right part is not a substring of the left
part. Thus, we have all of the inequalities involving no
left-side question marks.
* If you have all of the inequalities without right-side question
marks, we have a critical factorization.
* If one such inequality fails, then there's a smaller period,
but the factorization is nonetheless critical. Here's where
you need the redundancy coming from computing both cuts and
choosing the later one.
-------- Some more Bells and Whistles --------
Beyond Crochemore & Perrin's original algorithm, we can use a couple
more tricks for speed in fastsearch.h:
1. Even though C&P has a best-case O(n/m) time, this doesn't occur
very often, so we add a Boyer-Moore bad character table to
achieve sublinear time in more cases.
2. The prework of computing the cut/period is expensive per
needle character, so we shouldn't do it if it won't pay off.
For this reason, if the needle and haystack are long enough,
only automatically start with two-way if the needle's length
is a small percentage of the length of the haystack.
3. In cases where the needle and haystack are large but the needle
makes up a significant percentage of the length of the
haystack, don't pay the expensive two-way preprocessing cost
if you don't need to. Instead, keep track of how many
character comparisons are equal, and if that exceeds
O(len(needle)), then pay that cost, since the simpler algorithm
isn't doing very well.