Skip to content

Commit e3eb9da

Browse files
authored
Add bitap_string_match algo (#11060)
* Add bitap_string_match algo * Fix types * Fix spelling and add ignore word * Add suggested changes and change return type * Resolve suggestions
1 parent aa5c97d commit e3eb9da

File tree

2 files changed

+80
-1
lines changed

2 files changed

+80
-1
lines changed

Diff for: pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -135,5 +135,5 @@ omit = [
135135
sort = "Cover"
136136

137137
[tool.codespell]
138-
ignore-words-list = "3rt,ans,crate,damon,fo,followings,hist,iff,kwanza,manuel,mater,secant,som,sur,tim,toi,zar"
138+
ignore-words-list = "3rt,ans,bitap,crate,damon,fo,followings,hist,iff,kwanza,manuel,mater,secant,som,sur,tim,toi,zar"
139139
skip = "./.*,*.json,ciphers/prehistoric_men.txt,project_euler/problem_022/p022_names.txt,pyproject.toml,strings/dictionary.txt,strings/words.txt"

Diff for: strings/bitap_string_match.py

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
"""
2+
Bitap exact string matching
3+
https://en.wikipedia.org/wiki/Bitap_algorithm
4+
5+
Searches for a pattern inside text, and returns the index of the first occurrence
6+
of the pattern. Both text and pattern consist of lowercase alphabetical characters only.
7+
8+
Complexity: O(m*n)
9+
n = length of text
10+
m = length of pattern
11+
12+
Python doctests can be run using this command:
13+
python3 -m doctest -v bitap_string_match.py
14+
"""
15+
16+
17+
def bitap_string_match(text: str, pattern: str) -> int:
18+
"""
19+
Retrieves the index of the first occurrence of pattern in text.
20+
21+
Args:
22+
text: A string consisting only of lowercase alphabetical characters.
23+
pattern: A string consisting only of lowercase alphabetical characters.
24+
25+
Returns:
26+
int: The index where pattern first occurs. Return -1 if not found.
27+
28+
>>> bitap_string_match('abdabababc', 'ababc')
29+
5
30+
>>> bitap_string_match('aaaaaaaaaaaaaaaaaa', 'a')
31+
0
32+
>>> bitap_string_match('zxywsijdfosdfnso', 'zxywsijdfosdfnso')
33+
0
34+
>>> bitap_string_match('abdabababc', '')
35+
0
36+
>>> bitap_string_match('abdabababc', 'c')
37+
9
38+
>>> bitap_string_match('abdabababc', 'fofosdfo')
39+
-1
40+
>>> bitap_string_match('abdab', 'fofosdfo')
41+
-1
42+
"""
43+
if not pattern:
44+
return 0
45+
m = len(pattern)
46+
if m > len(text):
47+
return -1
48+
49+
# Initial state of bit string 1110
50+
state = ~1
51+
# Bit = 0 if character appears at index, and 1 otherwise
52+
pattern_mask: list[int] = [~0] * 27 # 1111
53+
54+
for i, char in enumerate(pattern):
55+
# For the pattern mask for this character, set the bit to 0 for each i
56+
# the character appears.
57+
pattern_index: int = ord(char) - ord("a")
58+
pattern_mask[pattern_index] &= ~(1 << i)
59+
60+
for i, char in enumerate(text):
61+
text_index = ord(char) - ord("a")
62+
# If this character does not appear in pattern, it's pattern mask is 1111.
63+
# Performing a bitwise OR between state and 1111 will reset the state to 1111
64+
# and start searching the start of pattern again.
65+
state |= pattern_mask[text_index]
66+
state <<= 1
67+
68+
# If the mth bit (counting right to left) of the state is 0, then we have
69+
# found pattern in text
70+
if (state & (1 << m)) == 0:
71+
return i - m + 1
72+
73+
return -1
74+
75+
76+
if __name__ == "__main__":
77+
import doctest
78+
79+
doctest.testmod()

0 commit comments

Comments
 (0)