Skip to content

Commit 8269a63

Browse files
authored
Create lz78.py
added lz78 compression algorithm
1 parent fcf82a1 commit 8269a63

File tree

1 file changed

+117
-0
lines changed

1 file changed

+117
-0
lines changed

compression/lz78.py

+117
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
"""
2+
Sources:
3+
https://en.wikipedia.org/wiki/LZ77_and_LZ78#LZ78
4+
"""
5+
6+
from dataclasses import dataclass
7+
8+
__version__ = "1.0"
9+
__author__ = "Ahmed Tamer"
10+
11+
@dataclass
class Token:
    """
    One LZ78 output pair.

    ``index`` refers to an entry of the phrase dictionary and ``char`` is the
    single character that follows that phrase.
    """

    index: int
    char: str

    def __repr__(self) -> str:
        """
        Render the token as ``(index, char)``; ``str()`` uses the same form.

        >>> token = Token(1, "c")
        >>> repr(token)
        '(1, c)'
        >>> str(token)
        '(1, c)'
        """
        return "({}, {})".format(self.index, self.char)
29+
30+
31+
class LZ78Compressor:
    """
    Class containing compress and decompress methods using the LZ78
    compression algorithm.
    """

    def compress(self, text: str) -> list[Token]:
        """
        Compress the given string text using LZ78 compression algorithm.

        Each emitted token is a pair ``(index, char)``: ``index`` is the
        1-based number of a phrase already stored in the dictionary (0 stands
        for the empty phrase) and ``char`` is the character that follows it.
        When the input ends in the middle of a phrase that is already in the
        dictionary, one last token with an empty ``char`` is emitted so that
        the tail of the input is not lost.

        Args:
            text: string to be compressed

        Returns:
            output: the compressed text as a list of Tokens (integer indices)

        Tests:
        >>> lz78_compressor = LZ78Compressor()
        >>> str(lz78_compressor.compress("ababcbababaa"))
        '[(0, a), (0, b), (1, b), (0, c), (2, a), (5, b), (1, a)]'
        >>> str(lz78_compressor.compress("aacaacabcabaaa"))
        '[(0, a), (1, c), (1, a), (0, c), (1, b), (4, a), (0, b), (3, a)]'
        >>> str(lz78_compressor.compress("aacaacabcabaaac"))
        '[(0, a), (1, c), (1, a), (0, c), (1, b), (4, a), (0, b), (3, a), (4, )]'
        >>> lz78_compressor.compress("")
        []
        >>> lz78_compressor.compress("ab") == [Token(0, "a"), Token(0, "b")]
        True
        """
        # Maps every known phrase to its code; the empty phrase is code 0, so
        # single-character phrases need no special case.
        phrase_dict: dict[str, int] = {"": 0}
        tokens: list[Token] = []
        phrase = ""
        for char in text:
            candidate = phrase + char
            if candidate in phrase_dict:
                # Keep extending the match until it is no longer known.
                phrase = candidate
                continue
            tokens.append(Token(phrase_dict[phrase], char))
            # len() is the next free code because codes are assigned densely.
            phrase_dict[candidate] = len(phrase_dict)
            phrase = ""
        if phrase:
            # Input ended inside a known phrase: flush it with an empty char.
            tokens.append(Token(phrase_dict[phrase], ""))
        return tokens

    def decompress(self, tokens: list[Token]) -> str:
        """
        Convert the list of tokens into an output string.

        Args:
            tokens: list containing pairs (index, char); ``index`` may be an
                int or a numeric string, ``char`` may be empty for the final
                token

        Returns:
            output: decompressed text

        Raises:
            KeyError: if a token refers to a phrase index that has not been
                defined by an earlier token

        Tests:
        >>> lz78_compressor = LZ78Compressor()
        >>> lz78_compressor.decompress([
        ...     Token(0, 'c'), Token(0, 'a'), Token(0, 'b'), Token(0, 'r'),
        ...     Token(2, 'c'), Token(2, 'd'), Token(2, 'b'), Token(4, 'a'),
        ...     Token(4, 'r'), Token(2, 'r'), Token(8, 'd')])
        'cabracadabrarrarrad'
        >>> lz78_compressor.decompress([
        ...     Token(0, 'a'), Token(0, 'b'), Token(1, 'b'), Token(0, 'c'),
        ...     Token(2, 'a'), Token(5, 'b'), Token(1, 'a')])
        'ababcbababaa'
        >>> lz78_compressor.decompress([
        ...     Token(0, 'a'), Token(1, 'c'), Token(1, 'a'), Token(0, 'c'),
        ...     Token(1, 'b'), Token(4, 'a'), Token(0, 'b'), Token(3, 'a')])
        'aacaacabcabaaa'
        >>> lz78_compressor.decompress([
        ...     Token(0, 'a'), Token(1, 'c'), Token(1, 'a'), Token(0, 'c'),
        ...     Token(1, 'b'), Token(4, 'a'), Token(0, 'b'), Token(3, 'a'),
        ...     Token(4, '')])
        'aacaacabcabaaac'
        >>> lz78_compressor.decompress([Token('0', 'a'), Token('1', 'b')])
        'aab'
        >>> text = "aacaacabcabaaac"
        >>> lz78_compressor.decompress(lz78_compressor.compress(text)) == text
        True
        >>> lz78_compressor.decompress([])
        ''
        """
        # Code -> phrase; code 0 is the empty phrase, mirroring compress().
        phrase_dict: dict[int, str] = {0: ""}
        pieces: list[str] = []
        for token in tokens:
            # int() keeps token lists with string indices decodable.
            phrase = phrase_dict[int(token.index)] + token.char
            phrase_dict[len(phrase_dict)] = phrase
            pieces.append(phrase)
        # Join once instead of growing the result string piece by piece.
        return "".join(pieces)
103+
104+
105+
if __name__ == "__main__":
    import doctest

    # Run the doctests embedded in the docstrings of this module.
    doctest.testmod()

    # Example: compress a sample string and check that decompressing the
    # resulting tokens gives the sample back.
    compressor = LZ78Compressor()
    sample = "aacaacabcabaaa"
    restored = compressor.decompress(compressor.compress(sample))
    assert restored == sample, "Invalid result."

0 commit comments

Comments
 (0)