Add bitap_string_match algo (#11060)

edy252 · web-flow · commit e3eb9daba415 · 2023-10-29T00:48:50.000+02:00
* Add bitap_string_match algo

* Fix types

* Fix spelling and add ignore word

* Add suggested changes and change return type

* Resolve suggestions
diff --git a/pyproject.toml b/pyproject.toml
@@ -135,5 +135,5 @@ omit = [
 sort = "Cover"
 
 [tool.codespell]
-ignore-words-list = "3rt,ans,crate,damon,fo,followings,hist,iff,kwanza,manuel,mater,secant,som,sur,tim,toi,zar"
+ignore-words-list = "3rt,ans,bitap,crate,damon,fo,followings,hist,iff,kwanza,manuel,mater,secant,som,sur,tim,toi,zar"
 skip = "./.*,*.json,ciphers/prehistoric_men.txt,project_euler/problem_022/p022_names.txt,pyproject.toml,strings/dictionary.txt,strings/words.txt"
diff --git a/strings/bitap_string_match.py b/strings/bitap_string_match.py
@@ -0,0 +1,79 @@
+"""
+Bitap exact string matching
+https://en.wikipedia.org/wiki/Bitap_algorithm
+
+Searches for a pattern inside text, and returns the index of the first occurrence
+of the pattern. Both text and pattern consist of lowercase alphabetical characters only.
+
+Complexity: O(m*n)
+    n = length of text
+    m = length of pattern
+
+Python doctests can be run using this command:
+python3 -m doctest -v bitap_string_match.py
+"""
+
+
def bitap_string_match(text: str, pattern: str) -> int:
    """
    Retrieves the index of the first occurrence of pattern in text.

    Implements the Bitap (shift-or) exact matching algorithm. A bit set to 0
    at position j of the running state means that the first j characters of
    pattern match the text ending at the current character.

    Pattern masks are keyed by the character itself, so text and pattern may
    contain any characters (uppercase, digits, whitespace, non-ASCII, ...),
    not only lowercase letters.

    Args:
        text: The string to search in.
        pattern: The string to search for.

    Returns:
        int: The index where pattern first occurs. Return -1 if not found.
        An empty pattern matches at index 0.

    >>> bitap_string_match('abdabababc', 'ababc')
    5
    >>> bitap_string_match('aaaaaaaaaaaaaaaaaa', 'a')
    0
    >>> bitap_string_match('zxywsijdfosdfnso', 'zxywsijdfosdfnso')
    0
    >>> bitap_string_match('abdabababc', '')
    0
    >>> bitap_string_match('abdabababc', 'c')
    9
    >>> bitap_string_match('abdabababc', 'fofosdfo')
    -1
    >>> bitap_string_match('abdab', 'fofosdfo')
    -1
    >>> bitap_string_match('hello world', 'world')
    6
    >>> bitap_string_match('ABCabc', 'abc')
    3
    >>> bitap_string_match('Z', 'u')
    -1
    """
    if not pattern:
        return 0
    m = len(pattern)
    if m > len(text):
        return -1

    # Mask used for every character that does not occur in pattern: all bits 1.
    no_match = ~0

    # For each character of pattern, bit i is 0 if the character appears at
    # index i of pattern, and 1 otherwise. Keying by the character (instead of
    # an offset from "a") avoids out-of-range and wrapped negative indices.
    pattern_mask: dict[str, int] = {}
    for i, char in enumerate(pattern):
        pattern_mask[char] = pattern_mask.get(char, no_match) & ~(1 << i)

    # Initial state ...1110: only the empty prefix of pattern is matched.
    state = ~1
    # After the shift, a 0 in bit m means all m pattern characters matched.
    found_bit = 1 << m

    for i, char in enumerate(text):
        # If this character does not appear in pattern, its mask is all ones.
        # OR-ing the state with it resets the state to all ones, so matching
        # starts again from the beginning of pattern.
        state |= pattern_mask.get(char, no_match)
        state <<= 1

        if not state & found_bit:
            # i is the index of the last matched character.
            return i - m + 1

    return -1
+
+
if __name__ == "__main__":
    # Running this module as a script executes the doctests embedded in its
    # docstrings.
    from doctest import testmod

    testmod()