From 11ee79a42d6fde68dc00494e8d7bed6cfe896635 Mon Sep 17 00:00:00 2001 From: Edgar Date: Sat, 28 Oct 2023 13:19:32 -0700 Subject: [PATCH 1/5] Add bitap_string_match algo --- strings/bitap_string_match.py | 79 +++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 strings/bitap_string_match.py diff --git a/strings/bitap_string_match.py b/strings/bitap_string_match.py new file mode 100644 index 000000000000..0291a59c415b --- /dev/null +++ b/strings/bitap_string_match.py @@ -0,0 +1,79 @@ +""" +Bitap exact string matching +https://en.wikipedia.org/wiki/Bitap_algorithm + +Searches for a pattern inside text, and returns the index of the first occurence +of the pattern. Both text and pattern consist of lowercase alphabetical characters only. + +Complexity: O(m*n) + n = length of text + m = length of pattern + +Python doctests can be run using this command: +python3 -m doctest -v bitap_string_match.py +""" + + +def bitap_string_match(text: str, pattern: str) -> int | None: + """ + Retrieves the index of the first occurrence of pattern in text. + + Args: + text: A string consisting only of lowercase alphabetical characters. + pattern: A string consisting only of lowercase alphabetical characters. + + Returns: + int: The index where pattern first occurs. + + >>> bitap_string_match('abdabababc', 'ababc') + 5 + >>> bitap_string_match('aaaaaaaaaaaaaaaaaa', 'a') + 0 + >>> bitap_string_match('zxywsijdfosdfnso', 'zxywsijdfosdfnso') + 0 + >>> bitap_string_match('abdabababc', '') + 0 + >>> bitap_string_match('abdabababc', 'c') + 9 + >>> bitap_string_match('abdabababc', 'fofosdfo') is None + True + >>> bitap_string_match('abdab', 'fofosdfo') is None + True + """ + m: int = len(pattern) + if m == 0: + return 0 + if m > len(text): + return None + + # Initial state of bit string 1110 + state: str = ~1 + # Bit = 0 if character appears at index, and 1 otherwise + pattern_mask: [str] = [~0] * 27 # 1111 + + for i in range(m): + # For the pattern mask for this character, set the bit to 0 for each i + # the character appears. + pattern_index: int = ord(pattern[i]) - ord("a") + pattern_mask[pattern_index] &= ~(1 << i) + + for i in range(len(text)): + text_index: int = ord(text[i]) - ord("a") + # If this character does not appear in pattern, it's pattern mask is 1111. + # Performing a bitwise OR between state and 1111 will reset the state to 1111 + # and start searching the start of pattern again. + state |= pattern_mask[text_index] + state <<= 1 + + # If the mth bit (counting right to left) of the state is 0, then we have + # found pattern in text + if (state & (1 << m)) == 0: + return i - m + 1 + + return None + + +if __name__ == "__main__": + import doctest + + doctest.testmod() From 5b46c6c61e7f96d2256c80f96530216d5784bb1d Mon Sep 17 00:00:00 2001 From: Edgar Date: Sat, 28 Oct 2023 13:25:58 -0700 Subject: [PATCH 2/5] Fix types --- strings/bitap_string_match.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/strings/bitap_string_match.py b/strings/bitap_string_match.py index 0291a59c415b..11d91677cc68 100644 --- a/strings/bitap_string_match.py +++ b/strings/bitap_string_match.py @@ -47,9 +47,9 @@ def bitap_string_match(text: str, pattern: str) -> int | None: return None # Initial state of bit string 1110 - state: str = ~1 + state: int = ~1 # Bit = 0 if character appears at index, and 1 otherwise - pattern_mask: [str] = [~0] * 27 # 1111 + pattern_mask: list[int] = [~0] * 27 # 1111 for i in range(m): # For the pattern mask for this character, set the bit to 0 for each i From c803f0e693f27cf4280156e54f88d2ab77f1a597 Mon Sep 17 00:00:00 2001 From: Edgar Date: Sat, 28 Oct 2023 13:47:53 -0700 Subject: [PATCH 3/5] Fix spelling and add ignore word --- pyproject.toml | 2 +- strings/bitap_string_match.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 790a328b3564..f1abb1cc73b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -135,5 +135,5 @@ omit = [ sort = "Cover" [tool.codespell] -ignore-words-list = "3rt,ans,crate,damon,fo,followings,hist,iff,kwanza,manuel,mater,secant,som,sur,tim,toi,zar" +ignore-words-list = "3rt,ans,crate,damon,fo,followings,hist,iff,kwanza,manuel,mater,secant,som,sur,tim,toi,zar,bitap" skip = "./.*,*.json,ciphers/prehistoric_men.txt,project_euler/problem_022/p022_names.txt,pyproject.toml,strings/dictionary.txt,strings/words.txt" diff --git a/strings/bitap_string_match.py b/strings/bitap_string_match.py index 11d91677cc68..df65055a762f 100644 --- a/strings/bitap_string_match.py +++ b/strings/bitap_string_match.py @@ -2,7 +2,7 @@ Bitap exact string matching https://en.wikipedia.org/wiki/Bitap_algorithm -Searches for a pattern inside text, and returns the index of the first occurence +Searches for a pattern inside text, and returns the index of the first occurrence of the pattern. Both text and pattern consist of lowercase alphabetical characters only. Complexity: O(m*n) From 0d7821e9852940251fb9b82060b6f73821d43091 Mon Sep 17 00:00:00 2001 From: Edgar Date: Sat, 28 Oct 2023 13:52:06 -0700 Subject: [PATCH 4/5] Add suggested changes and change return type --- strings/bitap_string_match.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/strings/bitap_string_match.py b/strings/bitap_string_match.py index df65055a762f..96d5e88d4359 100644 --- a/strings/bitap_string_match.py +++ b/strings/bitap_string_match.py @@ -14,7 +14,7 @@ """ -def bitap_string_match(text: str, pattern: str) -> int | None: +def bitap_string_match(text: str, pattern: str) -> int: """ Retrieves the index of the first occurrence of pattern in text. @@ -23,7 +23,7 @@ def bitap_string_match(text: str, pattern: str) -> int | None: pattern: A string consisting only of lowercase alphabetical characters. Returns: - int: The index where pattern first occurs. + int: The index where pattern first occurs. Return -1 if not found. >>> bitap_string_match('abdabababc', 'ababc') 5 @@ -35,30 +35,30 @@ def bitap_string_match(text: str, pattern: str) -> int | None: 0 >>> bitap_string_match('abdabababc', 'c') 9 - >>> bitap_string_match('abdabababc', 'fofosdfo') is None - True - >>> bitap_string_match('abdab', 'fofosdfo') is None - True + >>> bitap_string_match('abdabababc', 'fofosdfo') + -1 + >>> bitap_string_match('abdab', 'fofosdfo') + -1 """ m: int = len(pattern) if m == 0: return 0 if m > len(text): - return None + return -1 # Initial state of bit string 1110 state: int = ~1 # Bit = 0 if character appears at index, and 1 otherwise pattern_mask: list[int] = [~0] * 27 # 1111 - for i in range(m): + for i, char in enumerate(pattern): # For the pattern mask for this character, set the bit to 0 for each i # the character appears. - pattern_index: int = ord(pattern[i]) - ord("a") + pattern_index: int = ord(char) - ord("a") pattern_mask[pattern_index] &= ~(1 << i) - for i in range(len(text)): - text_index: int = ord(text[i]) - ord("a") + for i, char in enumerate(text): + text_index = ord(char) - ord("a") # If this character does not appear in pattern, it's pattern mask is 1111. # Performing a bitwise OR between state and 1111 will reset the state to 1111 # and start searching the start of pattern again. @@ -70,7 +70,7 @@ def bitap_string_match(text: str, pattern: str) -> int | None: if (state & (1 << m)) == 0: return i - m + 1 - return None + return -1 if __name__ == "__main__": From 4c19e29512620d60b70e7d1b3c46b438048b8284 Mon Sep 17 00:00:00 2001 From: Edgar Date: Sat, 28 Oct 2023 15:45:18 -0700 Subject: [PATCH 5/5] Resolve suggestions --- pyproject.toml | 2 +- strings/bitap_string_match.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f1abb1cc73b4..5d27142d16e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -135,5 +135,5 @@ omit = [ sort = "Cover" [tool.codespell] -ignore-words-list = "3rt,ans,crate,damon,fo,followings,hist,iff,kwanza,manuel,mater,secant,som,sur,tim,toi,zar,bitap" +ignore-words-list = "3rt,ans,bitap,crate,damon,fo,followings,hist,iff,kwanza,manuel,mater,secant,som,sur,tim,toi,zar" skip = "./.*,*.json,ciphers/prehistoric_men.txt,project_euler/problem_022/p022_names.txt,pyproject.toml,strings/dictionary.txt,strings/words.txt" diff --git a/strings/bitap_string_match.py b/strings/bitap_string_match.py index 96d5e88d4359..bd8a0f0d73ec 100644 --- a/strings/bitap_string_match.py +++ b/strings/bitap_string_match.py @@ -40,14 +40,14 @@ def bitap_string_match(text: str, pattern: str) -> int: >>> bitap_string_match('abdab', 'fofosdfo') -1 """ - m: int = len(pattern) - if m == 0: + if not pattern: return 0 + m = len(pattern) if m > len(text): return -1 # Initial state of bit string 1110 - state: int = ~1 + state = ~1 # Bit = 0 if character appears at index, and 1 otherwise pattern_mask: list[int] = [~0] * 27 # 1111