Commit 90686e3

Authored by LuciaHarcekova, pre-commit-ci[bot], and cclauss
Add LZ77 compression algorithm (#8059)
* - add "lz77_compressor" class with compress and decompress methods using LZ77 compression algorithm * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * - use "list" instead "List", formatting * - fix spelling * - add Python type hints * - add 'Token' class to represent triplet (offset, length, indicator) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * - add test, hange type rom List to list * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * - remove extra import * - remove extra types in comments * - better test * - edit comments * - add return types * - add tests for __str__ and __repr__ * Update lz77.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Christian Clauss <[email protected]>
1 parent 27d56ba commit 90686e3

File tree: 1 file changed, +227 -0 lines changed

Diff for: compression/lz77.py

@@ -0,0 +1,227 @@
"""
LZ77 compression algorithm
- lossless data compression published in papers by Abraham Lempel and Jacob Ziv in 1977
- also known as LZ1 or sliding-window compression
- forms the basis for many variations including LZW, LZSS, LZMA and others

It uses a “sliding window” method. Within the sliding window we have:
- search buffer
- look ahead buffer
len(sliding_window) = len(search_buffer) + len(look_ahead_buffer)

LZ77 manages a dictionary that uses triples composed of:
- Offset into the search buffer: the distance from the current position back to
  the start of the match in the search buffer.
- Length of the match: the number of characters that make up the matched phrase.
- Indicator: the character that is going to be encoded next, i.e. the first
  character after the match.

As a file is parsed, the dictionary is dynamically updated to reflect the compressed
data contents and size.

Examples:
"cabracadabrarrarrad" <-> [(0, 0, 'c'), (0, 0, 'a'), (0, 0, 'b'), (0, 0, 'r'),
    (3, 1, 'c'), (2, 1, 'd'), (7, 4, 'r'), (3, 5, 'd')]
"ababcbababaa" <-> [(0, 0, 'a'), (0, 0, 'b'), (2, 2, 'c'), (4, 3, 'a'), (2, 2, 'a')]
"aacaacabcabaaac" <-> [(0, 0, 'a'), (1, 1, 'c'), (3, 4, 'b'), (3, 3, 'a'), (1, 2, 'c')]

Sources:
en.wikipedia.org/wiki/LZ77_and_LZ78
"""


from dataclasses import dataclass

__version__ = "0.1"
__author__ = "Lucia Harcekova"


@dataclass
class Token:
    """
    Dataclass representing a triplet, called a token, consisting of offset, length
    and indicator. This triplet is used during LZ77 compression.
    """

    offset: int
    length: int
    indicator: str

    def __repr__(self) -> str:
        """
        >>> token = Token(1, 2, "c")
        >>> repr(token)
        '(1, 2, c)'
        >>> str(token)
        '(1, 2, c)'
        """
        return f"({self.offset}, {self.length}, {self.indicator})"


class LZ77Compressor:
    """
    Class containing compress and decompress methods using the LZ77 compression
    algorithm.
    """

    def __init__(self, window_size: int = 13, lookahead_buffer_size: int = 6) -> None:
        self.window_size = window_size
        self.lookahead_buffer_size = lookahead_buffer_size
        self.search_buffer_size = self.window_size - self.lookahead_buffer_size

    def compress(self, text: str) -> list[Token]:
        """
        Compress the given string text using the LZ77 compression algorithm.

        Args:
            text: string to be compressed

        Returns:
            output: the compressed text as a list of Tokens

        >>> lz77_compressor = LZ77Compressor()
        >>> str(lz77_compressor.compress("ababcbababaa"))
        '[(0, 0, a), (0, 0, b), (2, 2, c), (4, 3, a), (2, 2, a)]'
        >>> str(lz77_compressor.compress("aacaacabcabaaac"))
        '[(0, 0, a), (1, 1, c), (3, 4, b), (3, 3, a), (1, 2, c)]'
        """

        output = []
        search_buffer = ""

        # while there are still characters in text to compress
        while text:
            # find the next encoding phrase
            # - triplet with offset, length, indicator (the next encoding character)
            token = self._find_encoding_token(text, search_buffer)

            # update the search buffer:
            # - add new characters from text into it
            # - check if the size exceeds the max search buffer size; if so, drop the
            #   oldest elements
            search_buffer += text[: token.length + 1]
            if len(search_buffer) > self.search_buffer_size:
                search_buffer = search_buffer[-self.search_buffer_size :]

            # update the text
            text = text[token.length + 1 :]

            # append the token to output
            output.append(token)

        return output

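    # Illustrative trace (comments only): with the default search_buffer_size
    # of 7, compressing "ababcbababaa" proceeds as follows, matching the first
    # doctest above:
    #
    #   remaining text      search_buffer   token emitted
    #   "ababcbababaa"      ""              (0, 0, a)
    #   "babcbababaa"       "a"             (0, 0, b)
    #   "abcbababaa"        "ab"            (2, 2, c)
    #   "bababaa"           "ababc"         (4, 3, a)
    #   "baa"               "abcbaba"       (2, 2, a)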
    def decompress(self, tokens: list[Token]) -> str:
        """
        Convert the list of tokens into an output string.

        Args:
            tokens: list containing triplets (offset, length, char)

        Returns:
            output: decompressed text

        Tests:
            >>> lz77_compressor = LZ77Compressor()
            >>> lz77_compressor.decompress([Token(0, 0, 'c'), Token(0, 0, 'a'),
            ...     Token(0, 0, 'b'), Token(0, 0, 'r'), Token(3, 1, 'c'),
            ...     Token(2, 1, 'd'), Token(7, 4, 'r'), Token(3, 5, 'd')])
            'cabracadabrarrarrad'
            >>> lz77_compressor.decompress([Token(0, 0, 'a'), Token(0, 0, 'b'),
            ...     Token(2, 2, 'c'), Token(4, 3, 'a'), Token(2, 2, 'a')])
            'ababcbababaa'
            >>> lz77_compressor.decompress([Token(0, 0, 'a'), Token(1, 1, 'c'),
            ...     Token(3, 4, 'b'), Token(3, 3, 'a'), Token(1, 2, 'c')])
            'aacaacabcabaaac'
        """

        output = ""

        for token in tokens:
            for _ in range(token.length):
                output += output[-token.offset]
            output += token.indicator

        return output

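    # How the character-by-character copy above works (illustrative note): a
    # token may reference a match that overlaps the text it is still producing.
    # Decoding Token(3, 5, 'd') with output ending in "...dabrar" copies
    # r, a, r, r, a one at a time from 3 positions back, yielding
    # "...dabrarrarra" before appending the indicator 'd'.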
    def _find_encoding_token(self, text: str, search_buffer: str) -> Token:
        """Finds the encoding token for the first character in the text.

        Tests:
            >>> lz77_compressor = LZ77Compressor()
            >>> lz77_compressor._find_encoding_token("abrarrarrad", "abracad").offset
            7
            >>> lz77_compressor._find_encoding_token("adabrarrarrad", "cabrac").length
            1
            >>> lz77_compressor._find_encoding_token("abc", "xyz").offset
            0
            >>> lz77_compressor._find_encoding_token("", "xyz").offset
            Traceback (most recent call last):
            ...
            ValueError: We need some text to work with.
            >>> lz77_compressor._find_encoding_token("abc", "").offset
            0
        """

        if not text:
            raise ValueError("We need some text to work with.")

        # Initialise result parameters to default values
        length, offset = 0, 0

        if not search_buffer:
            return Token(offset, length, text[length])

        for i, character in enumerate(search_buffer):
            found_offset = len(search_buffer) - i
            if character == text[0]:
                found_length = self._match_length_from_index(text, search_buffer, 0, i)
                # if the found length is bigger than the current one, or if it is
                # equal (which means its offset is smaller): update offset and length
                if found_length >= length:
                    offset, length = found_offset, found_length

        # cap the match so an indicator character is always left over; without
        # this, a match reaching the end of the text would make text[length] fail
        length = min(length, len(text) - 1)

        return Token(offset, length, text[length])

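    # Tie-breaking note (illustrative): for text "adab..." and search buffer
    # "cabrac", the character 'a' matches at offsets 5 and 2, each with length 1.
    # The `>=` above lets the later, smaller-offset match win, producing the
    # token (2, 1, 'd') shown in the module docstring example.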
    def _match_length_from_index(
        self, text: str, window: str, text_index: int, window_index: int
    ) -> int:
        """Calculate the longest possible match of text and window characters from
        text_index in text and window_index in window.

        Args:
            text: the text to be compressed
            window: sliding window
            text_index: index of character in text
            window_index: index of character in sliding window

        Returns:
            The maximum match between text and window, from given indexes.

        Tests:
            >>> lz77_compressor = LZ77Compressor(13, 6)
            >>> lz77_compressor._match_length_from_index("rarrad", "adabrar", 0, 4)
            5
            >>> lz77_compressor._match_length_from_index("adabrarrarrad",
            ...     "cabrac", 0, 1)
            1
        """
        # stop when the characters differ, or when text_index runs past the end
        # of the text (so a match at the very end of the input cannot overflow)
        if text_index >= len(text) or text[text_index] != window[window_index]:
            return 0
        return 1 + self._match_length_from_index(
            text, window + text[text_index], text_index + 1, window_index + 1
        )


if __name__ == "__main__":
    from doctest import testmod

    testmod()
    # Initialize compressor class
    lz77_compressor = LZ77Compressor(window_size=13, lookahead_buffer_size=6)

    # Example
    TEXT = "cabracadabrarrarrad"
    compressed_text = lz77_compressor.compress(TEXT)
    print(lz77_compressor.compress("ababcbababaa"))
    decompressed_text = lz77_compressor.decompress(compressed_text)
    assert decompressed_text == TEXT, "The LZ77 algorithm returned an invalid result."
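A quick round-trip sketch of the API added above (illustrative; it assumes the repository root is on the import path, so the new file is importable as compression.lz77):

    from compression.lz77 import LZ77Compressor

    compressor = LZ77Compressor(window_size=13, lookahead_buffer_size=6)
    tokens = compressor.compress("cabracadabrarrarrad")
    print(tokens)  # [(0, 0, c), (0, 0, a), (0, 0, b), (0, 0, r),
                   #  (3, 1, c), (2, 1, d), (7, 4, r), (3, 5, d)]
    assert compressor.decompress(tokens) == "cabracadabrarrarrad"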
