/* stringlib: fastsearch implementation */ #define STRINGLIB_FASTSEARCH_H /* FAST_SEARCH and FAST_COUNT use the two-way algorithm. See: http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 https://en.wikipedia.org/wiki/Two-way_string-matching_algorithm Largely influenced by glibc: https://code.woboq.org/userspace/glibc/string/str-two-way.h.html https://code.woboq.org/userspace/glibc/string/memmem.c.html FAST_RSEARCH uses another algorithm, based on a mix between boyer- moore and horspool, with a few more bells and whistles on the top. for some more background, see: http://effbot.org/zone/stringlib.htm */ #define FAST_COUNT 0 #define FAST_SEARCH 1 #define FAST_RSEARCH 2 /* Change to a 1 to see logging comments walk through the algorithm. */ #if 0 && STRINGLIB_SIZEOF_CHAR == 1 #define LOG(...) printf(__VA_ARGS__) #define LOG_STRING(s, n) printf("%.*s", n, s) #else #define LOG(...) #define LOG_STRING(s, n) #endif #if LONG_BIT >= 128 #define STRINGLIB_BLOOM_WIDTH 128 #elif LONG_BIT >= 64 #define STRINGLIB_BLOOM_WIDTH 64 #elif LONG_BIT >= 32 #define STRINGLIB_BLOOM_WIDTH 32 #else #error "LONG_BIT is smaller than 32" #endif #define STRINGLIB_BLOOM_ADD(mask, ch) \ ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) #define STRINGLIB_BLOOM(mask, ch) \ ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) #if STRINGLIB_SIZEOF_CHAR == 1 # define MEMCHR_CUT_OFF 15 #else # define MEMCHR_CUT_OFF 40 #endif Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) { const STRINGLIB_CHAR *p, *e; p = s; e = s + n; if (n > MEMCHR_CUT_OFF) { #if STRINGLIB_SIZEOF_CHAR == 1 p = memchr(s, ch, n); if (p != NULL) return (p - s); return -1; #else /* use memchr if we can choose a needle without too many likely false positives */ const STRINGLIB_CHAR *s1, *e1; unsigned char needle = ch & 0xff; /* If looking for a multiple of 256, we'd have too many false positives looking for the '\0' byte in UCS2 and UCS4 representations. */ if (needle != 0) { do { void *candidate = memchr(p, needle, (e - p) * sizeof(STRINGLIB_CHAR)); if (candidate == NULL) return -1; s1 = p; p = (const STRINGLIB_CHAR *) _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); if (*p == ch) return (p - s); /* False positive */ p++; if (p - s1 > MEMCHR_CUT_OFF) continue; if (e - p <= MEMCHR_CUT_OFF) break; e1 = p + MEMCHR_CUT_OFF; while (p != e1) { if (*p == ch) return (p - s); p++; } } while (e - p > MEMCHR_CUT_OFF); } #endif } while (p < e) { if (*p == ch) return (p - s); p++; } return -1; } Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) { const STRINGLIB_CHAR *p; #ifdef HAVE_MEMRCHR /* memrchr() is a GNU extension, available since glibc 2.1.91. it doesn't seem as optimized as memchr(), but is still quite faster than our hand-written loop below */ if (n > MEMCHR_CUT_OFF) { #if STRINGLIB_SIZEOF_CHAR == 1 p = memrchr(s, ch, n); if (p != NULL) return (p - s); return -1; #else /* use memrchr if we can choose a needle without too many likely false positives */ const STRINGLIB_CHAR *s1; Py_ssize_t n1; unsigned char needle = ch & 0xff; /* If looking for a multiple of 256, we'd have too many false positives looking for the '\0' byte in UCS2 and UCS4 representations. */ if (needle != 0) { do { void *candidate = memrchr(s, needle, n * sizeof(STRINGLIB_CHAR)); if (candidate == NULL) return -1; n1 = n; p = (const STRINGLIB_CHAR *) _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); n = p - s; if (*p == ch) return n; /* False positive */ if (n1 - n > MEMCHR_CUT_OFF) continue; if (n <= MEMCHR_CUT_OFF) break; s1 = p - MEMCHR_CUT_OFF; while (p > s1) { p--; if (*p == ch) return (p - s); } n = p - s; } while (n > MEMCHR_CUT_OFF); } #endif } #endif /* HAVE_MEMRCHR */ p = s + n; while (p > s) { p--; if (*p == ch) return (p - s); } return -1; } #undef MEMCHR_CUT_OFF Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, int inverted, Py_ssize_t *return_period) { // We'll eventually partition needle into // needle[:max_suffix + 1] + needle[max_suffix + 1:] Py_ssize_t max_suffix = -1; Py_ssize_t suffix = 0; // candidate for max_suffix Py_ssize_t period = 1; // candidate for return_period Py_ssize_t k = 1; // working index while (suffix + k < needle_len) { STRINGLIB_CHAR a = needle[suffix + k]; STRINGLIB_CHAR b = needle[max_suffix + k]; if (inverted ? (a < b) : (b < a)) { // Suffix is smaller, period is entire prefix so far. suffix += k; k = 1; period = suffix - max_suffix; } else if (a == b) { // Advance through the repitition of the current period. if (k != period) { k++; } else { suffix += period; k = 1; } } else { // Found a bigger suffix. max_suffix = suffix; suffix += 1; k = 1; period = 1; } } *return_period = period; return max_suffix + 1; } Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_critical_factorization)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, Py_ssize_t *return_period) { /* Morally, this is what we want to happen: >>> x = "GCAGAGAG" >>> suf, period = _critical_factorization(x) >>> x[:suf], x[suf:] ('GC', 'AGAGAG') >>> period 2 */ Py_ssize_t period1, period2, max_suf1, max_suf2; // Search using both forward and inverted character-orderings max_suf1 = STRINGLIB(_lex_search)(needle, needle_len, 0, &period1); max_suf2 = STRINGLIB(_lex_search)(needle, needle_len, 1, &period2); // Choose the later suffix if (max_suf2 < max_suf1) { *return_period = period1; return max_suf1; } else { *return_period = period2; return max_suf2; } } Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len, Py_ssize_t suffix, Py_ssize_t period) { LOG("========================\n"); LOG("Two-way with needle="); LOG_STRING(needle, needle_len); LOG(" and haystack="); LOG_STRING(haystack, haystack_len); LOG("\nSplit "); LOG_STRING(needle, needle_len); LOG(" into "); LOG_STRING(needle, suffix); LOG(" and "); LOG_STRING(needle + suffix, needle_len - suffix); LOG(".\n"); if (memcmp(needle, needle+period, suffix * STRINGLIB_SIZEOF_CHAR) == 0) { LOG("needle is completely periodic.\n"); // a mismatch can only advance by the period. // use memory to avoid re-scanning known occurrences of the period. Py_ssize_t memory = 0; Py_ssize_t j = 0; // index into haystack while (j <= haystack_len - needle_len) { // Visualize the line-up: LOG("> "); LOG_STRING(haystack, haystack_len); LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, needle_len); LOG("\n"); LOG("Scanning right half.\n"); Py_ssize_t i = Py_MAX(suffix, memory); while (i < needle_len && needle[i] == haystack[j+i]) { i++; } if (i >= needle_len) { LOG("Right half matched. Scanning left half.\n"); i = suffix - 1; while (memory < i + 1 && needle[i] == haystack[j+i]) { i--; } if (i + 1 < memory + 1) { LOG("Left half matches. Returning %d.\n", j); return j; } LOG("No match.\n"); // Remember how many periods were scanned on the right j += period; memory = needle_len - period; } else { LOG("Skip without checking left half.\n"); j += i - suffix + 1; memory = 0; } } } else { LOG("needle is NOT completely periodic.\n"); // The two halves are distinct; // no extra memory is required, // and a mismatch results in a maximal shift. period = 1 + Py_MAX(suffix, needle_len - suffix); STRINGLIB_CHAR suffix_start = needle[suffix]; LOG("Using period %d.\n", period); LOG("Right half starts with %c\n", suffix_start); Py_ssize_t j = 0; while (j <= haystack_len - needle_len) { // use faster code looking for first char. Py_ssize_t find; find = STRINGLIB(find_char)(haystack + suffix + j, haystack_len - needle_len - j + 1, suffix_start); if (find == -1) { LOG("Not found. Return -1.\n"); return -1; } j += find; LOG("Found at %d.\n", j); LOG("> "); LOG_STRING(haystack, haystack_len); LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, needle_len); LOG("\n"); LOG("Checking the right half.\n"); Py_ssize_t i = suffix+1; for (; i < needle_len; i++) { if (needle[i] != haystack[j + i]){ LOG("No match.\n"); break; } } if (needle_len <= i) { LOG("Matches. Checking the left half.\n"); i = suffix - 1; for (i = suffix - 1; i >= 0; i--) { if (needle[i] != haystack[j + i]) { break; } } if (i == -1) { LOG("Match! (at %d)\n", j); return j; } j += period; } else { LOG("Jump forward without checking left half.\n"); j += i - suffix + 1; } } } LOG("Reached end. Returning -1.\n"); return -1; } Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_fastsearch)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len) { LOG("memmem: needle="); LOG_STRING(needle, needle_len); LOG(" and haystack="); LOG_STRING(haystack, haystack_len); LOG("\n"); if (needle_len == 0) { LOG("Easy out: empty.\n"); return 0; } STRINGLIB_CHAR first = needle[0]; Py_ssize_t index = STRINGLIB(find_char)(haystack, haystack_len, first); if (index == -1) { LOG("Easy out: empty.\n"); return -1; } if (haystack_len < needle_len + index) { LOG("Easy out: no room left after first found.\n"); return -1; } // Do a fast compare to maybe avoid the initialization overhead if (memcmp(haystack+index, needle, needle_len*STRINGLIB_SIZEOF_CHAR) == 0) { LOG("Easy out: Naive guess was correct.\n"); return index; } // Start later. index++; Py_ssize_t period, suffix; suffix = STRINGLIB(_critical_factorization)(needle, needle_len, &period); Py_ssize_t result = STRINGLIB(_two_way)(needle, needle_len, haystack + index, haystack_len - index, suffix, period); if (result == -1) { return -1; } return index + result; } Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_fastcount)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len, Py_ssize_t maxcount) { if (maxcount == 0) { return 0; } if (needle_len == 1) { Py_ssize_t count = 0; for (Py_ssize_t i = 0; i < haystack_len; i++) { if (haystack[i] == needle[0]) { count++; if (count == maxcount) { return maxcount; } } } return count; } if (needle_len == 0) { return haystack_len + 1; } STRINGLIB_CHAR first = needle[0]; Py_ssize_t index = STRINGLIB(find_char)(haystack, haystack_len, first); if (index == -1) { return 0; } if (haystack_len < needle_len + index) { return -1; } Py_ssize_t suffix, period; suffix = STRINGLIB(_critical_factorization)(needle, needle_len, &period); Py_ssize_t count = 0; while (1) { Py_ssize_t result = STRINGLIB(_two_way)(needle, needle_len, haystack + index, haystack_len - index, suffix, period); if (result == -1) { return count; } else { count++; if (count == maxcount) { return maxcount; } index += result + needle_len; } } } Py_LOCAL_INLINE(Py_ssize_t) FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR* p, Py_ssize_t m, Py_ssize_t maxcount, int mode) { if (m > n) { return -1; } if (mode == FAST_SEARCH) { return STRINGLIB(_fastsearch)(p, m, s, n); } else if (mode == FAST_COUNT) { return STRINGLIB(_fastcount)(p, m, s, n, maxcount); } else { /* FAST_RSEARCH */ if (m == 1) { return STRINGLIB(rfind_char)(s, n, p[0]); } Py_ssize_t w = n - m; Py_ssize_t mlast = m - 1; Py_ssize_t skip = mlast - 1; Py_ssize_t mask = 0; Py_ssize_t i, j; /* create compressed boyer-moore delta 1 table */ /* process pattern[0] outside the loop */ STRINGLIB_BLOOM_ADD(mask, p[0]); /* process pattern[:0:-1] */ for (i = mlast; i > 0; i--) { STRINGLIB_BLOOM_ADD(mask, p[i]); if (p[i] == p[0]) skip = i - 1; } for (i = w; i >= 0; i--) { if (s[i] == p[0]) { /* candidate match */ for (j = mlast; j > 0; j--) if (s[i+j] != p[j]) break; if (j == 0) /* got a match! */ return i; /* miss: check if previous character is part of pattern */ if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) i = i - m; else i = i - skip; } else { /* skip: check if previous character is part of pattern */ if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) i = i - m; } } return -1; } } #undef LOG #undef LOG_STRING