Skip to content

Commit e662a5a

Browse files
brunohadlich and cclauss
authored and committed
Added Burrows-Wheeler transform algorithm. (#1029)
* Added doctest and more explanation about Dijkstra execution. * Tests were not passing with Python 2 due to a missing __init__.py file in the number_theory folder. * Removed the dot at the beginning of the imported module names because 'python3 -m doctest -v data_structures/hashing/*.py' and 'python3 -m doctest -v data_structures/stacks/*.py' were failing to find the hash_table.py and stack.py modules. * Moved global code to main scope and added doctests for Project Euler problems 1 to 14. * Added test case for negative input. * Changed the N variable so that it does not use end-of-line escapes, because a space after the escape would break the script, making it much more error prone. * Added problem descriptions and doctests to the ones that were missing. Limited line length to 79 and ran Python black over all scripts. * Changed the way files are loaded to support pytest calls. * Added __init__.py to problems to make them modules and allow pytest execution. * Added the project_euler folder to unit test execution. * Changed 'os.path.split(os.path.realpath(__file__))' to 'os.path.dirname()'. * Added Burrows-Wheeler transform algorithm. * Added changes suggested by cclauss.
1 parent f64b602 commit e662a5a

File tree

1 file changed

+176
-0
lines changed

1 file changed

+176
-0
lines changed

Diff for: compression/burrows_wheeler.py

+176
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
"""
https://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform

The Burrows–Wheeler transform (BWT, also called block-sorting compression)
rearranges a character string into runs of similar characters. This is useful
for compression, since it tends to be easy to compress a string that has runs
of repeated characters by techniques such as move-to-front transform and
run-length encoding. More importantly, the transformation is reversible,
without needing to store any additional data except the position of the first
original character. The BWT is thus a "free" method of improving the efficiency
of text compression algorithms, costing only some extra computation.
"""
13+
from typing import List, Dict
14+
15+
16+
def all_rotations(s: str) -> List[str]:
    """
    Produce every left rotation of ``s``, starting with ``s`` itself.

    Rotation number ``k`` moves the first ``k`` characters of ``s`` to the
    end, so the result holds exactly ``len(s)`` strings (an empty list for
    the empty string).

    :param s: The string that will be rotated len(s) times.
    :return: A list with the rotations, in increasing shift order.
    :raises TypeError: If s is not an instance of str.
    Examples:

    >>> all_rotations("^BANANA|") # doctest: +NORMALIZE_WHITESPACE
    ['^BANANA|', 'BANANA|^', 'ANANA|^B', 'NANA|^BA', 'ANA|^BAN', 'NA|^BANA',
    'A|^BANAN', '|^BANANA']
    >>> all_rotations("a_asa_da_casa") # doctest: +NORMALIZE_WHITESPACE
    ['a_asa_da_casa', '_asa_da_casaa', 'asa_da_casaa_', 'sa_da_casaa_a',
    'a_da_casaa_as', '_da_casaa_asa', 'da_casaa_asa_', 'a_casaa_asa_d',
    '_casaa_asa_da', 'casaa_asa_da_', 'asaa_asa_da_c', 'saa_asa_da_ca',
    'aa_asa_da_cas']
    >>> all_rotations("panamabanana") # doctest: +NORMALIZE_WHITESPACE
    ['panamabanana', 'anamabananap', 'namabananapa', 'amabananapan',
    'mabananapana', 'abananapanam', 'bananapanama', 'ananapanamab',
    'nanapanamaba', 'anapanamaban', 'napanamabana', 'apanamabanan']
    >>> all_rotations(5)
    Traceback (most recent call last):
        ...
    TypeError: The parameter s type must be str.
    """
    if not isinstance(s, str):
        raise TypeError("The parameter s type must be str.")

    size = len(s)
    # Every rotation of s is a window of length len(s) inside s + s.
    doubled = s + s
    return [doubled[shift:shift + size] for shift in range(size)]
44+
45+
46+
def bwt_transform(s: str) -> Dict:
    """
    Compute the Burrows-Wheeler transform of ``s``.

    All rotations of ``s`` are generated with ``all_rotations`` and sorted
    using the default string ordering; the transform is the last character
    of each sorted rotation, read top to bottom.

    :param s: The string that will be used at bwt algorithm
    :return: a dict with two keys: ``bwt_string``, the string composed of the
    last char of each row of the ordered rotations, and
    ``idx_original_string``, the index of the original string at the ordered
    rotations list
    :raises TypeError: If the s parameter type is not str
    :raises ValueError: If the s parameter is empty
    Examples:

    >>> bwt_transform("^BANANA")
    {'bwt_string': 'BNN^AAA', 'idx_original_string': 6}
    >>> bwt_transform("a_asa_da_casa")
    {'bwt_string': 'aaaadss_c__aa', 'idx_original_string': 3}
    >>> bwt_transform("panamabanana")
    {'bwt_string': 'mnpbnnaaaaaa', 'idx_original_string': 11}
    >>> bwt_transform(4)
    Traceback (most recent call last):
        ...
    TypeError: The parameter s type must be str.
    >>> bwt_transform('')
    Traceback (most recent call last):
        ...
    ValueError: The parameter s must not be empty.
    """
    if not isinstance(s, str):
        raise TypeError("The parameter s type must be str.")
    if not s:
        raise ValueError("The parameter s must not be empty.")

    sorted_rotations = sorted(all_rotations(s))
    # The transform is the last column of the sorted rotation table.
    last_column = "".join(rotation[-1] for rotation in sorted_rotations)
    # index() returns the first row equal to s, which is the row needed to
    # undo the transform later.
    original_row = sorted_rotations.index(s)
    return {
        "bwt_string": last_column,
        "idx_original_string": original_row,
    }
82+
83+
84+
def reverse_bwt(bwt_string: str, idx_original_string: int) -> str:
    """
    Rebuild the original string from its Burrows-Wheeler transform.

    The table of sorted rotations is reconstructed column by column: on each
    of the ``len(bwt_string)`` passes, ``bwt_string`` is prepended as a new
    first column and the rows are sorted again. The row at
    ``idx_original_string`` of the final table is returned.

    :param bwt_string: The string returned from bwt algorithm execution
    :param idx_original_string: A 0-based index of the string that was used to
    generate bwt_string at ordered rotations list. It is converted with
    ``int()``, so a float such as ``11.4`` is truncated to ``11``.
    :return: The string used to generate bwt_string when bwt was executed
    :raises TypeError: If the bwt_string parameter type is not str
    :raises ValueError: If the bwt_string parameter is empty
    :raises TypeError: If the idx_original_string type is not int or if not
    possible to cast it to int (this includes ``None``, containers and
    infinite floats)
    :raises ValueError: If the idx_original_string value is lower than 0 or
    greater than len(bwt_string) - 1

    >>> reverse_bwt("BNN^AAA", 6)
    '^BANANA'
    >>> reverse_bwt("aaaadss_c__aa", 3)
    'a_asa_da_casa'
    >>> reverse_bwt("mnpbnnaaaaaa", 11)
    'panamabanana'
    >>> reverse_bwt(4, 11)
    Traceback (most recent call last):
        ...
    TypeError: The parameter bwt_string type must be str.
    >>> reverse_bwt("", 11)
    Traceback (most recent call last):
        ...
    ValueError: The parameter bwt_string must not be empty.
    >>> reverse_bwt("mnpbnnaaaaaa", "asd") # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
        ...
    TypeError: The parameter idx_original_string type must be int or passive
    of cast to int.
    >>> reverse_bwt("mnpbnnaaaaaa", None) # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
        ...
    TypeError: The parameter idx_original_string type must be int or passive
    of cast to int.
    >>> reverse_bwt("mnpbnnaaaaaa", -1)
    Traceback (most recent call last):
        ...
    ValueError: The parameter idx_original_string must not be lower than 0.
    >>> reverse_bwt("mnpbnnaaaaaa", 12) # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
        ...
    ValueError: The parameter idx_original_string must be lower than
    len(bwt_string).
    >>> reverse_bwt("mnpbnnaaaaaa", 11.0)
    'panamabanana'
    >>> reverse_bwt("mnpbnnaaaaaa", 11.4)
    'panamabanana'
    """
    if not isinstance(bwt_string, str):
        raise TypeError("The parameter bwt_string type must be str.")
    if not bwt_string:
        raise ValueError("The parameter bwt_string must not be empty.")
    try:
        idx_original_string = int(idx_original_string)
    except (TypeError, ValueError, OverflowError) as err:
        # int() raises ValueError for text such as "asd", TypeError for
        # objects such as None or a list, and OverflowError for infinite
        # floats. All of them mean "not castable to int", so report them
        # uniformly while keeping the original cause attached.
        raise TypeError(
            "The parameter idx_original_string type must be int or passive"
            " of cast to int."
        ) from err
    if idx_original_string < 0:
        raise ValueError(
            "The parameter idx_original_string must not be lower than 0."
        )
    if idx_original_string >= len(bwt_string):
        raise ValueError(
            "The parameter idx_original_string must be lower than"
            " len(bwt_string)."
        )

    ordered_rotations = [""] * len(bwt_string)
    # Each pass prepends the BWT column to the partially rebuilt rows and
    # re-sorts them; after len(bwt_string) passes every row is a complete
    # rotation.
    for _ in range(len(bwt_string)):
        ordered_rotations = sorted(
            char + rotation
            for char, rotation in zip(bwt_string, ordered_rotations)
        )
    return ordered_rotations[idx_original_string]
161+
162+
163+
if __name__ == "__main__":
    # Interactive demo: read a string, print its Burrows-Wheeler transform,
    # then invert the transform and print the recovered string.
    entry_msg = "Provide a string that I will generate its BWT transform: "
    # Surrounding whitespace is stripped; if nothing is left, bwt_transform
    # raises ValueError because it rejects empty input.
    s = input(entry_msg).strip()
    result = bwt_transform(s)
    bwt_output_msg = (
        "Burrows Wheeler transform for string '{}' results in '{}'"
    )
    print(bwt_output_msg.format(s, result["bwt_string"]))
    original_string = reverse_bwt(
        result["bwt_string"], result["idx_original_string"]
    )
    fmt = (
        "Reversing Burrows Wheeler transform for entry '{}' we get original"
        " string '{}'"
    )
    print(fmt.format(result["bwt_string"], original_string))

0 commit comments

Comments
 (0)