Skip to content

Commit dfb26b2

Browse files
committed
Improve Huffman algorithm by providing a way to compress strings not in a file, and adding doctests
1 parent ed1900f commit dfb26b2

File tree

2 files changed

+111
-30
lines changed

2 files changed

+111
-30
lines changed

compression/huffman.py

Lines changed: 110 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import sys
4+
import doctest
45

56

67
class Letter:
@@ -20,25 +21,71 @@ def __init__(self, freq: int, left: Letter | TreeNode, right: Letter | TreeNode)
2021
self.right: Letter | TreeNode = right
2122

2223

24+
def parse_string(string: str) -> list[Letter]:
    """
    Count every character of the string and return one Letter per
    distinct character, ordered by ascending frequency
    (characters with equal frequency keep their first-seen order)
    >>> string_in1 = "goose"
    >>> out1 = parse_string(string_in1)
    >>> out1
    [g:1, s:1, e:1, o:2]
    >>> string_in2 = ""
    >>> out2 = parse_string(string_in2)
    >>> out2
    []
    >>> string_in3 = "abbcccd"
    >>> out3 = parse_string(string_in3)
    >>> out3
    [a:1, d:1, b:2, c:3]
    """
    # Tally plain integer counts first; dicts preserve insertion order,
    # so the first occurrence of a character fixes its position.
    counts: dict[str, int] = {}
    for symbol in string:
        counts[symbol] = counts.get(symbol, 0) + 1

    letters = [Letter(symbol, count) for symbol, count in counts.items()]
    # list.sort is stable, so ties stay in first-seen order.
    letters.sort(key=lambda entry: entry.freq)
    return letters
47+
48+
2349
def parse_file(file_path: str) -> list[Letter]:
    """
    Load the whole file as UTF-8 text and return its characters as a
    list of Letter objects ordered by ascending frequency
    >>> test_file_path_in1 = "text_data/text_original.txt"
    >>> out1 = parse_file(test_file_path_in1)
    >>> out1
    [T:1, h:1, a:1, e:1, i:2, t:2, s:3,  :3, .:3]
    """
    with open(file_path, "r", encoding="utf8") as handle:
        contents = handle.read()
    # The frequency counting itself is delegated to parse_string.
    return parse_string(contents)
3660

3761

3862
def build_tree(letters: list[Letter]) -> Letter | TreeNode:
3963
"""
40-
Run through the list of Letters and build the min heap
41-
for the Huffman Tree.
64+
Build the min heap for the Huffman Tree; return root node
65+
>>> letters_in1 = [Letter('g', 1), Letter('s', 1), Letter('e', 1), Letter('o', 2)]
66+
>>> out1 = build_tree(letters_in1)
67+
>>> out1.freq
68+
5
69+
>>> out1.left.freq
70+
2
71+
>>> out1.left.left
72+
g:1
73+
>>> out1.left.right
74+
s:1
75+
>>> out1.right.freq
76+
3
77+
>>> out1.right.left
78+
e:1
79+
>>> out1.right.right
80+
o:2
81+
>>> letters_in2 = [Letter('a', 1), Letter('b', 1)]
82+
>>> out2 = build_tree(letters_in2)
83+
>>> out2.freq
84+
2
85+
>>> out2.left
86+
a:1
87+
>>> out2.right
88+
b:1
4289
"""
4390
response: list[Letter | TreeNode] = list(letters)
4491
while len(response) > 1:
@@ -51,10 +98,30 @@ def build_tree(letters: list[Letter]) -> Letter | TreeNode:
5198
return response[0]
5299

53100

54-
def traverse_tree(root: Letter | TreeNode, bitstring: str) -> list[Letter]:
101+
def traverse_tree(root: Letter | TreeNode, bitstring: str = "") -> list[Letter]:
55102
"""
56103
Recursively traverse the Huffman Tree to set each
57104
Letter's bitstring dictionary, and return the list of Letters
105+
>>> root_in1 = build_tree(parse_string("goose"))
106+
>>> out1 = traverse_tree(root_in1, "")
107+
>>> out1
108+
[g:1, s:1, e:1, o:2]
109+
>>> out1[0].bitstring['g']
110+
'00'
111+
>>> out1[1].bitstring['s']
112+
'01'
113+
>>> out1[2].bitstring['e']
114+
'10'
115+
>>> out1[3].bitstring['o']
116+
'11'
117+
>>> root_in2 = build_tree(parse_file("text_data/text_original.txt"))
118+
>>> out2 = traverse_tree(root_in2)
119+
>>> out2
120+
[.:3, i:2, t:2, T:1, h:1, a:1, e:1, s:3, :3]
121+
>>> out2[0].bitstring['.']
122+
'00'
123+
>>> out2[4].bitstring['h']
124+
'1001'
58125
"""
59126
if isinstance(root, Letter):
60127
root.bitstring[root.letter] = bitstring
@@ -66,27 +133,40 @@ def traverse_tree(root: Letter | TreeNode, bitstring: str) -> list[Letter]:
66133
return letters
67134

68135

69-
def huffman(file_path: str) -> None:
136+
def huffman_string(string: str, sep: str = " ") -> str:
    """
    Return the Huffman coding of ``string``, with the bitstring of each
    letter separated by ``sep``.

    An empty string encodes to an empty string. A string made of a single
    distinct letter encodes every occurrence as '0', because a one-leaf
    tree would otherwise give that letter a zero-length code.

    >>> huffman_string("goose")
    '00 11 11 01 10'
    >>> huffman_string("This is a test...", "")
    '1000100101011011101011011110101110111011110011000000'
    >>> huffman_string("")
    ''
    >>> huffman_string("aaa")
    '0 0 0'
    """
    if not string:
        # build_tree() ends with `return response[0]`, which raises
        # IndexError when there are no letters at all.
        return ""

    letters_list = parse_string(string)
    root = build_tree(letters_list)
    letter_bitstrings = {
        k: v for letter in traverse_tree(root) for k, v in letter.bitstring.items()
    }

    if len(letter_bitstrings) == 1:
        # The root is the only leaf, so traversal assigned it the empty
        # bitstring; give it a one-bit code so the output stays decodable.
        letter_bitstrings = dict.fromkeys(letter_bitstrings, "0")

    return sep.join(letter_bitstrings[char] for char in string)
151+
152+
153+
def huffman(file_path: str) -> None:
    """
    Read the file as UTF-8 text, Huffman-code its contents with
    space-separated bitstrings, and print a header line plus the result
    >>> huffman("text_data/text_original.txt")
    Huffman Coding of text_data/text_original.txt:
    1000 1001 010 110 111 010 110 111 1010 111 011 1011 110 011 00 00 00
    """
    with open(file_path, "r", encoding="utf8") as handle:
        contents = handle.read()
    encoded = huffman_string(contents, " ")
    print(f"Huffman Coding of {file_path}:\n{encoded}")
88164

89165

90166
if __name__ == "__main__":
    if len(sys.argv) >= 2:
        # a file path was given: Huffman-code that file and print it
        huffman(sys.argv[1])
    else:
        # no file path given: run the module's doctests instead
        doctest.testmod()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This is a test...

0 commit comments

Comments
 (0)