1
1
from __future__ import annotations
2
2
3
3
import sys
4
+ import doctest
4
5
5
6
6
7
class Letter :
@@ -20,25 +21,71 @@ def __init__(self, freq: int, left: Letter | TreeNode, right: Letter | TreeNode)
20
21
self .right : Letter | TreeNode = right
21
22
22
23
24
def parse_string(string: str) -> list[Letter]:
    """
    Count every character of string and return one Letter per distinct
    character, ordered by ascending frequency. Characters with equal
    frequency keep the order in which they first appear.

    >>> string_in1 = "goose"
    >>> out1 = parse_string(string_in1)
    >>> out1
    [g:1, s:1, e:1, o:2]
    >>> string_in2 = ""
    >>> out2 = parse_string(string_in2)
    >>> out2
    []
    >>> string_in3 = "abbcccd"
    >>> out3 = parse_string(string_in3)
    >>> out3
    [a:1, d:1, b:2, c:3]
    """
    seen: dict[str, Letter] = {}
    for symbol in string:
        entry = seen.get(symbol)
        if entry is None:
            # first occurrence: start the count at one
            seen[symbol] = Letter(symbol, 1)
        else:
            entry.freq += 1
    # list.sort is stable, so ties stay in first-seen order
    ordered = list(seen.values())
    ordered.sort(key=lambda item: item.freq)
    return ordered
47
+
48
+
23
49
def parse_file(file_path: str) -> list[Letter]:
    """
    Read the whole file at file_path as UTF-8 text and return the
    result of parse_string on its contents.

    >>> test_file_path_in1 = "text_data/text_original.txt"
    >>> out1 = parse_file(test_file_path_in1)
    >>> out1
    [T:1, h:1, a:1, e:1, i:2, t:2, s:3, :3, .:3]
    """
    with open(file_path, "r", encoding="utf8") as handle:
        contents = handle.read()
    # the counting itself is delegated to parse_string
    return parse_string(contents)
36
60
37
61
38
62
def build_tree (letters : list [Letter ]) -> Letter | TreeNode :
39
63
"""
40
- Run through the list of Letters and build the min heap
41
- for the Huffman Tree.
64
+ Build the min heap for the Huffman Tree; return root node
65
+ >>> letters_in1 = [Letter('g', 1), Letter('s', 1), Letter('e', 1), Letter('o', 2)]
66
+ >>> out1 = build_tree(letters_in1)
67
+ >>> out1.freq
68
+ 5
69
+ >>> out1.left.freq
70
+ 2
71
+ >>> out1.left.left
72
+ g:1
73
+ >>> out1.left.right
74
+ s:1
75
+ >>> out1.right.freq
76
+ 3
77
+ >>> out1.right.left
78
+ e:1
79
+ >>> out1.right.right
80
+ o:2
81
+ >>> letters_in2 = [Letter('a', 1), Letter('b', 1)]
82
+ >>> out2 = build_tree(letters_in2)
83
+ >>> out2.freq
84
+ 2
85
+ >>> out2.left
86
+ a:1
87
+ >>> out2.right
88
+ b:1
42
89
"""
43
90
response : list [Letter | TreeNode ] = list (letters )
44
91
while len (response ) > 1 :
@@ -51,10 +98,30 @@ def build_tree(letters: list[Letter]) -> Letter | TreeNode:
51
98
return response [0 ]
52
99
53
100
54
- def traverse_tree (root : Letter | TreeNode , bitstring : str ) -> list [Letter ]:
101
+ def traverse_tree (root : Letter | TreeNode , bitstring : str = "" ) -> list [Letter ]:
55
102
"""
56
103
Recursively traverse the Huffman Tree to set each
57
104
Letter's bitstring dictionary, and return the list of Letters
105
+ >>> root_in1 = build_tree(parse_string("goose"))
106
+ >>> out1 = traverse_tree(root_in1, "")
107
+ >>> out1
108
+ [g:1, s:1, e:1, o:2]
109
+ >>> out1[0].bitstring['g']
110
+ '00'
111
+ >>> out1[1].bitstring['s']
112
+ '01'
113
+ >>> out1[2].bitstring['e']
114
+ '10'
115
+ >>> out1[3].bitstring['o']
116
+ '11'
117
+ >>> root_in2 = build_tree(parse_file("text_data/text_original.txt"))
118
+ >>> out2 = traverse_tree(root_in2)
119
+ >>> out2
120
+ [.:3, i:2, t:2, T:1, h:1, a:1, e:1, s:3, :3]
121
+ >>> out2[0].bitstring['.']
122
+ '00'
123
+ >>> out2[4].bitstring['h']
124
+ '1001'
58
125
"""
59
126
if isinstance (root , Letter ):
60
127
root .bitstring [root .letter ] = bitstring
@@ -66,27 +133,40 @@ def traverse_tree(root: Letter | TreeNode, bitstring: str) -> list[Letter]:
66
133
return letters
67
134
68
135
69
- def huffman ( file_path : str ) -> None :
136
def huffman_string(string: str, sep: str = " ") -> str:
    """
    Return the Huffman coding of string, with the bitstring of each
    character joined by sep.

    An empty string has nothing to encode and yields an empty result;
    without this guard the empty letter list would reach build_tree,
    which indexes the first element of its working list.

    >>> huffman_string("goose")
    '00 11 11 01 10'
    >>> huffman_string("This is a test...", "")
    '1000100101011011101011011110101110111011110011000000'
    >>> huffman_string("")
    ''
    """
    if not string:
        return ""
    letters_list = parse_string(string)
    root = build_tree(letters_list)
    # NOTE(review): when the input has a single distinct character the
    # root is that Letter itself, so traverse_tree gives it the default
    # empty bitstring and the result consists of separators only.
    letter_bitstrings = {
        k: v for letter in traverse_tree(root) for k, v in letter.bitstring.items()
    }
    return sep.join(letter_bitstrings[char] for char in string)
151
+
152
+
153
def huffman(file_path: str) -> None:
    """
    Read the file at file_path as UTF-8 text, Huffman-code its contents
    with a single space between letter bitstrings, and print a header
    line followed by the coded text.

    >>> huffman("text_data/text_original.txt")
    Huffman Coding of text_data/text_original.txt:
    1000 1001 010 110 111 010 110 111 1010 111 011 1011 110 011 00 00 00
    """
    with open(file_path, "r", encoding="utf8") as source:
        contents = source.read()
    encoded = huffman_string(contents, " ")
    # header and coded text go out on separate lines
    print(f"Huffman Coding of {file_path}:", encoded, sep="\n")
88
164
89
165
90
166
if __name__ == "__main__":
    if len(sys.argv) >= 2:
        # a file path was supplied: Huffman-code that file
        huffman(sys.argv[1])
    else:
        # no file path on the command line: run the module's doctests
        doctest.testmod()
0 commit comments