From 73a85c4e1da42db28e3de57c868d24a089b8d277 Mon Sep 17 00:00:00 2001 From: Dennis Sweeney <36520290+sweeneyde@users.noreply.github.com> Date: Sun, 28 Feb 2021 13:20:50 -0500 Subject: [PATCH] bpo-41972: Use the two-way algorithm for string searching (GH-22904) Implement an enhanced variant of Crochemore and Perrin's Two-Way string searching algorithm, which reduces worst-case time from quadratic (the product of the string and pattern lengths) to linear. This applies to forward searches (like``find``, ``index``, ``replace``); the algorithm for reverse searches (like ``rfind``) is not changed. Co-authored-by: Tim Peters --- Lib/test/string_tests.py | 39 ++ .../2020-10-23-23-17-23.bpo-41972.kbAwg4.rst | 1 + Objects/stringlib/fastsearch.h | 490 +++++++++++++++++- .../stringlib_find_two_way_notes.txt | 431 +++++++++++++++ 4 files changed, 941 insertions(+), 20 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-10-23-23-17-23.bpo-41972.kbAwg4.rst create mode 100644 Objects/stringlib/stringlib_find_two_way_notes.txt diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 527f505c016..840d7bb7550 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -5,6 +5,7 @@ Common tests shared by test_unicode, test_userstring and test_bytes. import unittest, string, sys, struct from test import support from collections import UserList +import random class Sequence: def __init__(self, seq='wxyz'): self.seq = seq @@ -317,6 +318,44 @@ class BaseTest: else: self.checkraises(TypeError, 'hello', 'rindex', 42) + def test_find_periodic_pattern(self): + """Cover the special path for periodic patterns.""" + def reference_find(p, s): + for i in range(len(s)): + if s.startswith(p, i): + return i + return -1 + + rr = random.randrange + choices = random.choices + for _ in range(1000): + p0 = ''.join(choices('abcde', k=rr(10))) * rr(10, 20) + p = p0[:len(p0) - rr(10)] # pop off some characters + left = ''.join(choices('abcdef', k=rr(2000))) + right = ''.join(choices('abcdef', k=rr(2000))) + text = left + p + right + with self.subTest(p=p, text=text): + self.checkequal(reference_find(p, text), + text, 'find', p) + + def test_find_shift_table_overflow(self): + """When the table of 8-bit shifts overflows.""" + N = 2**8 + 100 + + # first check the periodic case + # here, the shift for 'b' is N + 1. + pattern1 = 'a' * N + 'b' + 'a' * N + text1 = 'babbaa' * N + pattern1 + self.checkequal(len(text1)-len(pattern1), + text1, 'find', pattern1) + + # now check the non-periodic case + # here, the shift for 'd' is 3*(N+1)+1 + pattern2 = 'ddd' + 'abc' * N + "eee" + text2 = pattern2[:-1] + "ddeede" * 2 * N + pattern2 + "de" * N + self.checkequal(len(text2) - N*len("de") - len(pattern2), + text2, 'find', pattern2) + def test_lower(self): self.checkequal('hello', 'HeLLo', 'lower') self.checkequal('hello', 'hello', 'lower') diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-10-23-23-17-23.bpo-41972.kbAwg4.rst b/Misc/NEWS.d/next/Core and Builtins/2020-10-23-23-17-23.bpo-41972.kbAwg4.rst new file mode 100644 index 00000000000..609a0ff0be2 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-10-23-23-17-23.bpo-41972.kbAwg4.rst @@ -0,0 +1 @@ +Substring search functions such as ``str1 in str2`` and ``str2.find(str1)`` now sometimes use the "Two-Way" string comparison algorithm to avoid quadratic behavior on long strings. \ No newline at end of file diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 56a4467d353..6574720b609 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -9,10 +9,16 @@ /* note: fastsearch may access s[n], which isn't a problem when using Python's ordinary string types, but may cause problems if you're using this code in other contexts. also, the count mode returns -1 - if there cannot possible be a match in the target string, and 0 if + if there cannot possibly be a match in the target string, and 0 if it has actually checked for matches, but didn't find any. callers beware! */ +/* If the strings are long enough, use Crochemore and Perrin's Two-Way + algorithm, which has worst-case O(n) runtime and best-case O(n/k). + Also compute a table of shifts to achieve O(n/k) in more cases, + and often (data dependent) deduce larger shifts than pure C&P can + deduce. */ + #define FAST_COUNT 0 #define FAST_SEARCH 1 #define FAST_RSEARCH 2 @@ -160,6 +166,353 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) #undef MEMCHR_CUT_OFF +/* Change to a 1 to see logging comments walk through the algorithm. */ +#if 0 && STRINGLIB_SIZEOF_CHAR == 1 +# define LOG(...) printf(__VA_ARGS__) +# define LOG_STRING(s, n) printf("\"%.*s\"", n, s) +#else +# define LOG(...) +# define LOG_STRING(s, n) +#endif + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, + Py_ssize_t *return_period, int invert_alphabet) +{ + /* Do a lexicographic search. Essentially this: + >>> max(needle[i:] for i in range(len(needle)+1)) + Also find the period of the right half. */ + Py_ssize_t max_suffix = 0; + Py_ssize_t candidate = 1; + Py_ssize_t k = 0; + // The period of the right half. + Py_ssize_t period = 1; + + while (candidate + k < len_needle) { + // each loop increases candidate + k + max_suffix + STRINGLIB_CHAR a = needle[candidate + k]; + STRINGLIB_CHAR b = needle[max_suffix + k]; + // check if the suffix at candidate is better than max_suffix + if (invert_alphabet ? (b < a) : (a < b)) { + // Fell short of max_suffix. + // The next k + 1 characters are non-increasing + // from candidate, so they won't start a maximal suffix. + candidate += k + 1; + k = 0; + // We've ruled out any period smaller than what's + // been scanned since max_suffix. + period = candidate - max_suffix; + } + else if (a == b) { + if (k + 1 != period) { + // Keep scanning the equal strings + k++; + } + else { + // Matched a whole period. + // Start matching the next period. + candidate += period; + k = 0; + } + } + else { + // Did better than max_suffix, so replace it. + max_suffix = candidate; + candidate++; + k = 0; + period = 1; + } + } + *return_period = period; + return max_suffix; +} + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, + Py_ssize_t len_needle, + Py_ssize_t *return_period) +{ + /* Do a "critical factorization", making it so that: + >>> needle = (left := needle[:cut]) + (right := needle[cut:]) + where the "local period" of the cut is maximal. + + The local period of the cut is the minimal length of a string w + such that (left endswith w or w endswith left) + and (right startswith w or w startswith left). + + The Critical Factorization Theorem says that this maximal local + period is the global period of the string. + + Crochemore and Perrin (1991) show that this cut can be computed + as the later of two cuts: one that gives a lexicographically + maximal right half, and one that gives the same with the + with respect to a reversed alphabet-ordering. + + This is what we want to happen: + >>> x = "GCAGAGAG" + >>> cut, period = factorize(x) + >>> x[:cut], (right := x[cut:]) + ('GC', 'AGAGAG') + >>> period # right half period + 2 + >>> right[period:] == right[:-period] + True + + This is how the local period lines up in the above example: + GC | AGAGAG + AGAGAGC = AGAGAGC + The length of this minimal repetition is 7, which is indeed the + period of the original string. */ + + Py_ssize_t cut1, period1, cut2, period2, cut, period; + cut1 = STRINGLIB(_lex_search)(needle, len_needle, &period1, 0); + cut2 = STRINGLIB(_lex_search)(needle, len_needle, &period2, 1); + + // Take the later cut. + if (cut1 > cut2) { + period = period1; + cut = cut1; + } + else { + period = period2; + cut = cut2; + } + + LOG("split: "); LOG_STRING(needle, cut); + LOG(" + "); LOG_STRING(needle + cut, len_needle - cut); + LOG("\n"); + + *return_period = period; + return cut; +} + +#define SHIFT_TYPE uint8_t +#define NOT_FOUND ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U) +#define SHIFT_OVERFLOW (NOT_FOUND - 1U) + +#define TABLE_SIZE_BITS 6 +#define TABLE_SIZE (1U << TABLE_SIZE_BITS) +#define TABLE_MASK (TABLE_SIZE - 1U) + +typedef struct STRINGLIB(_pre) { + const STRINGLIB_CHAR *needle; + Py_ssize_t len_needle; + Py_ssize_t cut; + Py_ssize_t period; + int is_periodic; + SHIFT_TYPE table[TABLE_SIZE]; +} STRINGLIB(prework); + + +Py_LOCAL_INLINE(void) +STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, + STRINGLIB(prework) *p) +{ + p->needle = needle; + p->len_needle = len_needle; + p->cut = STRINGLIB(_factorize)(needle, len_needle, &(p->period)); + assert(p->period + p->cut <= len_needle); + p->is_periodic = (0 == memcmp(needle, + needle + p->period, + p->cut * STRINGLIB_SIZEOF_CHAR)); + if (p->is_periodic) { + assert(p->cut <= len_needle/2); + assert(p->cut < p->period); + } + else { + // A lower bound on the period + p->period = Py_MAX(p->cut, len_needle - p->cut) + 1; + } + // Now fill up a table + memset(&(p->table[0]), 0xff, TABLE_SIZE*sizeof(SHIFT_TYPE)); + assert(p->table[0] == NOT_FOUND); + assert(p->table[TABLE_MASK] == NOT_FOUND); + for (Py_ssize_t i = 0; i < len_needle; i++) { + Py_ssize_t shift = len_needle - i; + if (shift > SHIFT_OVERFLOW) { + shift = SHIFT_OVERFLOW; + } + p->table[needle[i] & TABLE_MASK] = Py_SAFE_DOWNCAST(shift, + Py_ssize_t, + SHIFT_TYPE); + } +} + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, + STRINGLIB(prework) *p) +{ + // Crochemore and Perrin's (1991) Two-Way algorithm. + // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 + Py_ssize_t len_needle = p->len_needle; + Py_ssize_t cut = p->cut; + Py_ssize_t period = p->period; + const STRINGLIB_CHAR *needle = p->needle; + const STRINGLIB_CHAR *window = haystack; + const STRINGLIB_CHAR *last_window = haystack + len_haystack - len_needle; + SHIFT_TYPE *table = p->table; + LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack); + + if (p->is_periodic) { + LOG("Needle is periodic.\n"); + Py_ssize_t memory = 0; + periodicwindowloop: + while (window <= last_window) { + Py_ssize_t i = Py_MAX(cut, memory); + + // Visualize the line-up: + LOG("> "); LOG_STRING(haystack, len_haystack); + LOG("\n> "); LOG("%*s", window - haystack, ""); + LOG_STRING(needle, len_needle); + LOG("\n> "); LOG("%*s", window - haystack + i, ""); + LOG(" ^ <-- cut\n"); + + if (window[i] != needle[i]) { + // Sunday's trick: if we're going to jump, we might + // as well jump to line up the character *after* the + // current window. + STRINGLIB_CHAR first_outside = window[len_needle]; + SHIFT_TYPE shift = table[first_outside & TABLE_MASK]; + if (shift == NOT_FOUND) { + LOG("\"%c\" not found. Skipping entirely.\n", + first_outside); + window += len_needle + 1; + } + else { + LOG("Shifting to line up \"%c\".\n", first_outside); + Py_ssize_t memory_shift = i - cut + 1; + window += Py_MAX(shift, memory_shift); + } + memory = 0; + goto periodicwindowloop; + } + for (i = i + 1; i < len_needle; i++) { + if (needle[i] != window[i]) { + LOG("Right half does not match. Jump ahead by %d.\n", + i - cut + 1); + window += i - cut + 1; + memory = 0; + goto periodicwindowloop; + } + } + for (i = memory; i < cut; i++) { + if (needle[i] != window[i]) { + LOG("Left half does not match. Jump ahead by period %d.\n", + period); + window += period; + memory = len_needle - period; + goto periodicwindowloop; + } + } + LOG("Left half matches. Returning %d.\n", + window - haystack); + return window - haystack; + } + } + else { + LOG("Needle is not periodic.\n"); + assert(cut < len_needle); + STRINGLIB_CHAR needle_cut = needle[cut]; + windowloop: + while (window <= last_window) { + + // Visualize the line-up: + LOG("> "); LOG_STRING(haystack, len_haystack); + LOG("\n> "); LOG("%*s", window - haystack, ""); + LOG_STRING(needle, len_needle); + LOG("\n> "); LOG("%*s", window - haystack + cut, ""); + LOG(" ^ <-- cut\n"); + + if (window[cut] != needle_cut) { + // Sunday's trick: if we're going to jump, we might + // as well jump to line up the character *after* the + // current window. + STRINGLIB_CHAR first_outside = window[len_needle]; + SHIFT_TYPE shift = table[first_outside & TABLE_MASK]; + if (shift == NOT_FOUND) { + LOG("\"%c\" not found. Skipping entirely.\n", + first_outside); + window += len_needle + 1; + } + else { + LOG("Shifting to line up \"%c\".\n", first_outside); + window += shift; + } + goto windowloop; + } + for (Py_ssize_t i = cut + 1; i < len_needle; i++) { + if (needle[i] != window[i]) { + LOG("Right half does not match. Advance by %d.\n", + i - cut + 1); + window += i - cut + 1; + goto windowloop; + } + } + for (Py_ssize_t i = 0; i < cut; i++) { + if (needle[i] != window[i]) { + LOG("Left half does not match. Advance by period %d.\n", + period); + window += period; + goto windowloop; + } + } + LOG("Left half matches. Returning %d.\n", window - haystack); + return window - haystack; + } + } + LOG("Not found. Returning -1.\n"); + return -1; +} + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(_two_way_find)(const STRINGLIB_CHAR *haystack, + Py_ssize_t len_haystack, + const STRINGLIB_CHAR *needle, + Py_ssize_t len_needle) +{ + LOG("###### Finding \"%s\" in \"%s\".\n", needle, haystack); + STRINGLIB(prework) p; + STRINGLIB(_preprocess)(needle, len_needle, &p); + return STRINGLIB(_two_way)(haystack, len_haystack, &p); +} + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(_two_way_count)(const STRINGLIB_CHAR *haystack, + Py_ssize_t len_haystack, + const STRINGLIB_CHAR *needle, + Py_ssize_t len_needle, + Py_ssize_t maxcount) +{ + LOG("###### Counting \"%s\" in \"%s\".\n", needle, haystack); + STRINGLIB(prework) p; + STRINGLIB(_preprocess)(needle, len_needle, &p); + Py_ssize_t index = 0, count = 0; + while (1) { + Py_ssize_t result; + result = STRINGLIB(_two_way)(haystack + index, + len_haystack - index, &p); + if (result == -1) { + return count; + } + count++; + if (count == maxcount) { + return maxcount; + } + index += result + len_needle; + } + return count; +} + +#undef SHIFT_TYPE +#undef NOT_FOUND +#undef SHIFT_OVERFLOW +#undef TABLE_SIZE_BITS +#undef TABLE_SIZE +#undef TABLE_MASK + +#undef LOG +#undef LOG_STRING + Py_LOCAL_INLINE(Py_ssize_t) FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR* p, Py_ssize_t m, @@ -195,10 +548,22 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, } mlast = m - 1; - skip = mlast - 1; + skip = mlast; mask = 0; if (mode != FAST_RSEARCH) { + if (m >= 100 && w >= 2000 && w / m >= 5) { + /* For larger problems where the needle isn't a huge + percentage of the size of the haystack, the relatively + expensive O(m) startup cost of the two-way algorithm + will surely pay off. */ + if (mode == FAST_SEARCH) { + return STRINGLIB(_two_way_find)(s, n, p, m); + } + else { + return STRINGLIB(_two_way_count)(s, n, p, m, maxcount); + } + } const STRINGLIB_CHAR *ss = s + m - 1; const STRINGLIB_CHAR *pp = p + m - 1; @@ -207,41 +572,118 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, /* process pattern[:-1] */ for (i = 0; i < mlast; i++) { STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == p[mlast]) + if (p[i] == p[mlast]) { skip = mlast - i - 1; + } } /* process pattern[-1] outside the loop */ STRINGLIB_BLOOM_ADD(mask, p[mlast]); + if (m >= 100 && w >= 8000) { + /* To ensure that we have good worst-case behavior, + here's an adaptive version of the algorithm, where if + we match O(m) characters without any matches of the + entire needle, then we predict that the startup cost of + the two-way algorithm will probably be worth it. */ + Py_ssize_t hits = 0; + for (i = 0; i <= w; i++) { + if (ss[i] == pp[0]) { + /* candidate match */ + for (j = 0; j < mlast; j++) { + if (s[i+j] != p[j]) { + break; + } + } + if (j == mlast) { + /* got a match! */ + if (mode != FAST_COUNT) { + return i; + } + count++; + if (count == maxcount) { + return maxcount; + } + i = i + mlast; + continue; + } + /* miss: check if next character is part of pattern */ + if (!STRINGLIB_BLOOM(mask, ss[i+1])) { + i = i + m; + } + else { + i = i + skip; + } + hits += j + 1; + if (hits >= m / 4 && i < w - 1000) { + /* We've done O(m) fruitless comparisons + anyway, so spend the O(m) cost on the + setup for the two-way algorithm. */ + Py_ssize_t res; + if (mode == FAST_COUNT) { + res = STRINGLIB(_two_way_count)( + s+i, n-i, p, m, maxcount-count); + return count + res; + } + else { + res = STRINGLIB(_two_way_find)(s+i, n-i, p, m); + if (res == -1) { + return -1; + } + return i + res; + } + } + } + else { + /* skip: check if next character is part of pattern */ + if (!STRINGLIB_BLOOM(mask, ss[i+1])) { + i = i + m; + } + } + } + if (mode != FAST_COUNT) { + return -1; + } + return count; + } + /* The standard, non-adaptive version of the algorithm. */ for (i = 0; i <= w; i++) { /* note: using mlast in the skip path slows things down on x86 */ if (ss[i] == pp[0]) { /* candidate match */ - for (j = 0; j < mlast; j++) - if (s[i+j] != p[j]) + for (j = 0; j < mlast; j++) { + if (s[i+j] != p[j]) { break; + } + } if (j == mlast) { /* got a match! */ - if (mode != FAST_COUNT) + if (mode != FAST_COUNT) { return i; + } count++; - if (count == maxcount) + if (count == maxcount) { return maxcount; + } i = i + mlast; continue; } /* miss: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) + if (!STRINGLIB_BLOOM(mask, ss[i+1])) { i = i + m; - else + } + else { i = i + skip; - } else { + } + } + else { /* skip: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) + if (!STRINGLIB_BLOOM(mask, ss[i+1])) { i = i + m; + } } } - } else { /* FAST_RSEARCH */ + } + else { /* FAST_RSEARCH */ /* create compressed boyer-moore delta 1 table */ @@ -250,28 +692,36 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, /* process pattern[:0:-1] */ for (i = mlast; i > 0; i--) { STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == p[0]) + if (p[i] == p[0]) { skip = i - 1; + } } for (i = w; i >= 0; i--) { if (s[i] == p[0]) { /* candidate match */ - for (j = mlast; j > 0; j--) - if (s[i+j] != p[j]) + for (j = mlast; j > 0; j--) { + if (s[i+j] != p[j]) { break; - if (j == 0) + } + } + if (j == 0) { /* got a match! */ return i; + } /* miss: check if previous character is part of pattern */ - if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) + if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) { i = i - m; - else + } + else { i = i - skip; - } else { + } + } + else { /* skip: check if previous character is part of pattern */ - if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) + if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) { i = i - m; + } } } } diff --git a/Objects/stringlib/stringlib_find_two_way_notes.txt b/Objects/stringlib/stringlib_find_two_way_notes.txt new file mode 100644 index 00000000000..afe45431a75 --- /dev/null +++ b/Objects/stringlib/stringlib_find_two_way_notes.txt @@ -0,0 +1,431 @@ +This document explains Crochemore and Perrin's Two-Way string matching +algorithm, in which a smaller string (the "pattern" or "needle") +is searched for in a longer string (the "text" or "haystack"), +determining whether the needle is a substring of the haystack, and if +so, at what index(es). It is to be used by Python's string +(and bytes-like) objects when calling `find`, `index`, `__contains__`, +or implicitly in methods like `replace` or `partition`. + +This is essentially a re-telling of the paper + + Crochemore M., Perrin D., 1991, Two-way string-matching, + Journal of the ACM 38(3):651-675. + +focused more on understanding and examples than on rigor. See also +the code sample here: + + http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 + +The algorithm runs in O(len(needle) + len(haystack)) time and with +O(1) space. However, since there is a larger preprocessing cost than +simpler algorithms, this Two-Way algorithm is to be used only when the +needle and haystack lengths meet certain thresholds. + + +These are the basic steps of the algorithm: + + * "Very carefully" cut the needle in two. + * For each alignment attempted: + 1. match the right part + * On failure, jump by the amount matched + 1 + 2. then match the left part. + * On failure jump by max(len(left), len(right)) + 1 + * If the needle is periodic, don't re-do comparisons; maintain + a "memory" of how many characters you already know match. + + +-------- Matching the right part -------- + +We first scan the right part of the needle to check if it matches the +the aligned characters in the haystack. We scan left-to-right, +and if a mismatch occurs, we jump ahead by the amount matched plus 1. + +Example: + + text: ........EFGX................... + pattern: ....abcdEFGH.... + cut: <<<<>>>> + +Matched 3, so jump ahead by 4: + + text: ........EFGX................... + pattern: ....abcdEFGH.... + cut: <<<<>>>> + +Why are we allowed to do this? Because we cut the needle very +carefully, in such a way that if the cut is ...abcd + EFGH... then +we have + + d != E + cd != EF + bcd != EFG + abcd != EFGH + ... and so on. + +If this is true for every pair of equal-length substrings around the +cut, then the following alignments do not work, so we can skip them: + + text: ........EFG.................... + pattern: ....abcdEFGH.... + ^ (Bad because d != E) + text: ........EFG.................... + pattern: ....abcdEFGH.... + ^^ (Bad because cd != EF) + text: ........EFG.................... + pattern: ....abcdEFGH.... + ^^^ (Bad because bcd != EFG) + +Skip 3 alignments => increment alignment by 4. + + +-------- If len(left_part) < len(right_part) -------- + +Above is the core idea, and it begins to suggest how the algorithm can +be linear-time. There is one bit of subtlety involving what to do +around the end of the needle: if the left half is shorter than the +right, then we could run into something like this: + + text: .....EFG...... + pattern: cdEFGH + +The same argument holds that we can skip ahead by 4, so long as + + d != E + cd != EF + ?cd != EFG + ??cd != EFGH + etc. + +The question marks represent "wildcards" that always match; they're +outside the limits of the needle, so there's no way for them to +invalidate a match. To ensure that the inequalities above are always +true, we need them to be true for all possible '?' values. We thus +need cd != FG and cd != GH, etc. + + +-------- Matching the left part -------- + +Once we have ensured the right part matches, we scan the left part +(order doesn't matter, but traditionally right-to-left), and if we +find a mismatch, we jump ahead by +max(len(left_part), len(right_part)) + 1. That we can jump by +at least len(right_part) + 1 we have already seen: + + text: .....EFG..... + pattern: abcdEFG + Matched 3, so jump by 4, + using the fact that d != E, cd != EF, and bcd != EFG. + +But we can also jump by at least len(left_part) + 1: + + text: ....cdEF..... + pattern: abcdEF + Jump by len('abcd') + 1 = 5. + + Skip the alignments: + text: ....cdEF..... + pattern: abcdEF + text: ....cdEF..... + pattern: abcdEF + text: ....cdEF..... + pattern: abcdEF + text: ....cdEF..... + pattern: abcdEF + +This requires the following facts: + d != E + cd != EF + bcd != EF? + abcd != EF?? + etc., for all values of ?s, as above. + +If we have both sets of inequalities, then we can indeed jump by +max(len(left_part), len(right_part)) + 1. Under the assumption of such +a nice splitting of the needle, we now have enough to prove linear +time for the search: consider the forward-progress/comparisons ratio +at each alignment position. If a mismatch occurs in the right part, +the ratio is 1 position forward per comparison. On the other hand, +if a mismatch occurs in the left half, we advance by more than +len(needle)//2 positions for at most len(needle) comparisons, +so this ratio is more than 1/2. This average "movement speed" is +bounded below by the constant "1 position per 2 comparisons", so we +have linear time. + + +-------- The periodic case -------- + +The sets of inequalities listed so far seem too good to be true in +the general case. Indeed, they fail when a needle is periodic: +there's no way to split 'AAbAAbAAbA' in two such that + + (the stuff n characters to the left of the split) + cannot equal + (the stuff n characters to the right of the split) + for all n. + +This is because no matter how you cut it, you'll get +s[cut-3:cut] == s[cut:cut+3]. So what do we do? We still cut the +needle in two so that n can be as big as possible. If we were to +split it as + + AAbA + AbAAbA + +then A == A at the split, so this is bad (we failed at length 1), but +if we split it as + + AA + bAAbAAbA + +we at least have A != b and AA != bA, and we fail at length 3 +since ?AA == bAA. We already knew that a cut to make length-3 +mismatch was impossible due to the period, but we now see that the +bound is sharp; we can get length-1 and length-2 to mismatch. + +This is exactly the content of the *critical factorization theorem*: +that no matter the period of the original needle, you can cut it in +such a way that (with the appropriate question marks), +needle[cut-k:cut] mismatches needle[cut:cut+k] for all k < the period. + +Even "non-periodic" strings are periodic with a period equal to +their length, so for such needles, the CFT already guarantees that +the algorithm described so far will work, since we can cut the needle +so that the length-k chunks on either side of the cut mismatch for all +k < len(needle). Looking closer at the algorithm, we only actually +require that k go up to max(len(left_part), len(right_part)). +So long as the period exceeds that, we're good. + +The more general shorter-period case is a bit harder. The essentials +are the same, except we use the periodicity to our advantage by +"remembering" periods that we've already compared. In our running +example, say we're computing + + "AAbAAbAAbA" in "bbbAbbAAbAAbAAbbbAAbAAbAAbAA". + +We cut as AA + bAAbAAbA, and then the algorithm runs as follows: + + First alignment: + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ^^X + - Mismatch at third position, so jump by 3. + - This requires that A!=b and AA != bA. + + Second alignment: + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ^^^^^^^^ + X + - Matched entire right part + - Mismatch at left part. + - Jump forward a period, remembering the existing comparisons + + Third alignment: + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + mmmmmmm^^X + - There's "memory": a bunch of characters were already matched. + - Two more characters match beyond that. + - The 8th character of the right part mismatched, so jump by 8 + - The above rule is more complicated than usual: we don't have + the right inequalities for lengths 1 through 7, but we do have + shifted copies of the length-1 and length-2 inequalities, + along with knowledge of the mismatch. We can skip all of these + alignments at once: + + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ~ A != b at the cut + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ~~ AA != bA at the cut + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ^^^^X 7-3=4 match, and the 5th misses. + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ~ A != b at the cut + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ~~ AA != bA at the cut + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ^X 7-3-3=1 match and the 2nd misses. + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ~ A != b at the cut + + Fourth alignment: + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ^X + - Second character mismatches, so jump by 2. + + Fifth alignment: + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ^^^^^^^^ + X + - Right half matches, so use memory and skip ahead by period=3 + + Sixth alignment: + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + mmmmmmmm^^ + - Right part matches, left part is remembered, found a match! + +The one tricky skip by 8 here generalizes: if we have a period of p, +then the CFT says we can ensure the cut has the inequality property +for lengths 1 through p-1, and jumping by p would line up the +matching characters and mismatched character one period earlier. +Inductively, this proves that we can skip by the number of characters +matched in the right half, plus 1, just as in the original algorithm. + +To make it explicit, the memory is set whenever the entire right part +is matched and is then used as a starting point in the next alignment. +In such a case, the alignment jumps forward one period, and the right +half matches all except possibly the last period. Additionally, +if we cut so that the left part has a length strictly less than the +period (we always can!), then we can know that the left part already +matches. The memory is reset to 0 whenever there is a mismatch in the +right part. + +To prove linearity for the periodic case, note that if a right-part +character mismatches, then we advance forward 1 unit per comparison. +On the other hand, if the entire right part matches, then the skipping +forward by one period "defers" some of the comparisons to the next +alignment, where they will then be spent at the usual rate of +one comparison per step forward. Even if left-half comparisons +are always "wasted", they constitute less than half of all +comparisons, so the average rate is certainly at least 1 move forward +per 2 comparisons. + + +-------- When to choose the periodic algorithm --------- + +The periodic algorithm is always valid but has an overhead of one +more "memory" register and some memory computation steps, so the +here-described-first non-periodic/long-period algorithm -- skipping by +max(len(left_part), len(right_part)) + 1 rather than the period -- +should be preferred when possible. + +Interestingly, the long-period algorithm does not require an exact +computation of the period; it works even with some long-period, but +undeniably "periodic" needles: + + Cut: AbcdefAbc == Abcde + fAbc + +This cut gives these inequalities: + + e != f + de != fA + cde != fAb + bcde != fAbc + Abcde != fAbc? + The first failure is a period long, per the CFT: + ?Abcde == fAbc?? + +A sufficient condition for using the long-period algorithm is having +the period of the needle be greater than +max(len(left_part), len(right_part)). This way, after choosing a good +split, we get all of the max(len(left_part), len(right_part)) +inequalities around the cut that were required in the long-period +version of the algorithm. + +With all of this in mind, here's how we choose: + + (1) Choose a "critical factorization" of the needle -- a cut + where we have period minus 1 inequalities in a row. + More specifically, choose a cut so that the left_part + is less than one period long. + (2) Determine the period P_r of the right_part. + (3) Check if the left part is just an extension of the pattern of + the right part, so that the whole needle has period P_r. + Explicitly, check if + needle[0:cut] == needle[0+P_r:cut+P_r] + If so, we use the periodic algorithm. If not equal, we use the + long-period algorithm. + +Note that if equality holds in (3), then the period of the whole +string is P_r. On the other hand, suppose equality does not hold. +The period of the needle is then strictly greater than P_r. Here's +a general fact: + + If p is a substring of s and p has period r, then the period + of s is either equal to r or greater than len(p). + +We know that needle_period != P_r, +and therefore needle_period > len(right_part). +Additionally, we'll choose the cut (see below) +so that len(left_part) < needle_period. + +Thus, in the case where equality does not hold, we have that +needle_period >= max(len(left_part), len(right_part)) + 1, +so the long-period algorithm works, but otherwise, we know the period +of the needle. + +Note that this decision process doesn't always require an exact +computation of the period -- we can get away with only computing P_r! + + +-------- Computing the cut -------- + +Our remaining tasks are now to compute a cut of the needle with as +many inequalities as possible, ensuring that cut < needle_period. +Meanwhile, we must also compute the period P_r of the right_part. + +The computation is relatively simple, essentially doing this: + + suffix1 = max(needle[i:] for i in range(len(needle))) + suffix2 = ... # the same as above, but invert the alphabet + cut1 = len(needle) - len(suffix1) + cut2 = len(needle) - len(suffix2) + cut = max(cut1, cut2) # the later cut + +For cut2, "invert the alphabet" is different than saying min(...), +since in lexicographic order, we still put "py" < "python", even +if the alphabet is inverted. Computing these, along with the method +of computing the period of the right half, is easiest to read directly +from the source code in fastsearch.h, in which these are computed +in linear time. + +Crochemore & Perrin's Theorem 3.1 give that "cut" above is a +critical factorization less than the period, but a very brief sketch +of their proof goes something like this (this is far from complete): + + * If this cut splits the needle as some + needle == (a + w) + (w + b), meaning there's a bad equality + w == w, it's impossible for w + b to be bigger than both + b and w + w + b, so this can't happen. We thus have all of + the ineuqalities with no question marks. + * By maximality, the right part is not a substring of the left + part. Thus, we have all of the inequalities involving no + left-side question marks. + * If you have all of the inequalities without right-side question + marks, we have a critical factorization. + * If one such inequality fails, then there's a smaller period, + but the factorization is nonetheless critical. Here's where + you need the redundancy coming from computing both cuts and + choosing the later one. + + +-------- Some more Bells and Whistles -------- + +Beyond Crochemore & Perrin's original algorithm, we can use a couple +more tricks for speed in fastsearch.h: + + 1. Even though C&P has a best-case O(n/m) time, this doesn't occur + very often, so we add a Boyer-Moore bad character table to + achieve sublinear time in more cases. + + 2. The prework of computing the cut/period is expensive per + needle character, so we shouldn't do it if it won't pay off. + For this reason, if the needle and haystack are long enough, + only automatically start with two-way if the needle's length + is a small percentage of the length of the haystack. + + 3. In cases where the needle and haystack are large but the needle + makes up a significant percentage of the length of the + haystack, don't pay the expensive two-way preprocessing cost + if you don't need to. Instead, keep track of how many + character comparisons are equal, and if that exceeds + O(len(needle)), then pay that cost, since the simpler algorithm + isn't doing very well.