From 8269a63712dd17f0ebd4974e2d294e853ea66e17 Mon Sep 17 00:00:00 2001 From: Ahmed Tamer Date: Mon, 7 Oct 2024 06:49:31 +0300 Subject: [PATCH 01/11] Create lz78.py added lz78 compression algorithm --- compression/lz78.py | 117 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 compression/lz78.py diff --git a/compression/lz78.py b/compression/lz78.py new file mode 100644 index 000000000000..da907c734205 --- /dev/null +++ b/compression/lz78.py @@ -0,0 +1,117 @@ +""" +Sources: +https://en.wikipedia.org/wiki/LZ77_and_LZ78#LZ78 +""" + +from dataclasses import dataclass + +__version__ = "1.0" +__author__ = "Ahmed Tamer" + +@dataclass +class Token: + """ + Dataclass representing pair called token consisting of the dictionary index and a single character that follows the phrase in the dictionary. + """ + + index: int + char: str + + def __repr__(self) -> str: + """ + >>> token = Token(1, "c") + >>> repr(token) + '(1, c)' + >>> str(token) + '(1, c)' + """ + return f"({self.index}, {self.char})" + + +class LZ78Compressor: + """ + Class containing compress and decompress methods using LZ78 compression algorithm. + """ + + def compress(self, text: str) -> list[Token]: + """ + Compress the given string text using LZ78 compression algorithm. + + Args: + text: string to be compressed + + Returns: + output: the compressed text as a list of Tokens + + Tests: + >>> lz78_compressor = LZ78Compressor() + >>> str(lz78_compressor.compress("ababcbababaa")) + '[(0, a), (0, b), (1, b), (0, c), (2, a), (5, b), (1, a)]' + >>> str(lz78_compressor.compress("aacaacabcabaaac")) + '[(0, a), (1, c), (1, a), (0, c), (1, b), (4, a), (0, b), (3, a)]' + """ + + phrase_dict = {} + tokens = [] + code = 1 + phrase = '' + for char in text: + phrase += char + if phrase not in phrase_dict: + phrase_dict[phrase] = str(code) + if len(phrase) == 1: + tokens.append(Token('0', phrase)) + else: + tokens.append(Token(phrase_dict[phrase[:-1]], phrase[-1])) + code += 1 + phrase = '' + return tokens + + + def decompress(self, tokens: list[Token]) -> str: + """ + Convert the list of tokens into an output string. + + Args: + tokens: list containing pairs (index, char) + + Returns: + output: decompressed text + + Tests: + >>> lz78_compressor = LZ78Compressor() + >>> lz78_compressor.decompress([Token(0, 'c'), Token(0, 'a'), Token(0, 'b'), Token(0, 'r'), Token(2, 'c'), + ... Token(2, 'd'), Token(2, 'b'), Token(4, 'a'), Token(4, 'r'), Token(2, 'r'), Token(8, 'd')]) + 'cabracadabrarrarrad' + >>> lz78_compressor.decompress([Token(0, 'a'), Token(0, 'b'), Token(1, 'b'), Token(0, 'c'), + ... Token(2, 'a'), Token(5, 'b'), Token(1, 'a')]) + 'ababcbababaa' + >>> lz78_compressor.decompress([Token(0, 'a'), Token(1, 'c'), Token(1, 'a'), Token(0, 'c'), + ... Token(1, 'b'), Token(4, 'a'), Token(0, 'b'), Token(3, 'a')]) + 'aacaacabcabaaa' + """ + + text = '' + phrase_dict = {'0': ''} + code = 1 + for token in tokens: + phrase = phrase_dict[str(token.index)] + token.char + phrase_dict[str(code)] = phrase + code += 1 + text += phrase + return text + + +if __name__ == '__main__': + from doctest import testmod + + testmod() + + + lz78_compressor = LZ78Compressor() + + # Example + text = 'aacaacabcabaaa' + tokens = lz78_compressor.compress(text) + decompressedText = lz78_compressor.decompress(tokens) + assert decompressedText == text, 'Invalid result.' From 7a2d33e51aefdedd7bae7a7c5c57a762df779617 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Oct 2024 03:53:59 +0000 Subject: [PATCH 02/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- compression/lz78.py | 163 ++++++++++++++++++++++---------------------- 1 file changed, 81 insertions(+), 82 deletions(-) diff --git a/compression/lz78.py b/compression/lz78.py index da907c734205..80df9f1291e5 100644 --- a/compression/lz78.py +++ b/compression/lz78.py @@ -8,68 +8,68 @@ __version__ = "1.0" __author__ = "Ahmed Tamer" + @dataclass class Token: - """ - Dataclass representing pair called token consisting of the dictionary index and a single character that follows the phrase in the dictionary. - """ - - index: int - char: str - - def __repr__(self) -> str: """ - >>> token = Token(1, "c") - >>> repr(token) - '(1, c)' - >>> str(token) - '(1, c)' + Dataclass representing pair called token consisting of the dictionary index and a single character that follows the phrase in the dictionary. """ - return f"({self.index}, {self.char})" - + + index: int + char: str + + def __repr__(self) -> str: + """ + >>> token = Token(1, "c") + >>> repr(token) + '(1, c)' + >>> str(token) + '(1, c)' + """ + return f"({self.index}, {self.char})" + class LZ78Compressor: - """ - Class containing compress and decompress methods using LZ78 compression algorithm. - """ - - def compress(self, text: str) -> list[Token]: """ - Compress the given string text using LZ78 compression algorithm. - - Args: - text: string to be compressed - - Returns: - output: the compressed text as a list of Tokens - - Tests: - >>> lz78_compressor = LZ78Compressor() - >>> str(lz78_compressor.compress("ababcbababaa")) - '[(0, a), (0, b), (1, b), (0, c), (2, a), (5, b), (1, a)]' - >>> str(lz78_compressor.compress("aacaacabcabaaac")) - '[(0, a), (1, c), (1, a), (0, c), (1, b), (4, a), (0, b), (3, a)]' + Class containing compress and decompress methods using LZ78 compression algorithm. """ - phrase_dict = {} - tokens = [] - code = 1 - phrase = '' - for char in text: - phrase += char - if phrase not in phrase_dict: - phrase_dict[phrase] = str(code) - if len(phrase) == 1: - tokens.append(Token('0', phrase)) - else: - tokens.append(Token(phrase_dict[phrase[:-1]], phrase[-1])) - code += 1 - phrase = '' - return tokens - - - def decompress(self, tokens: list[Token]) -> str: - """ + def compress(self, text: str) -> list[Token]: + """ + Compress the given string text using LZ78 compression algorithm. + + Args: + text: string to be compressed + + Returns: + output: the compressed text as a list of Tokens + + Tests: + >>> lz78_compressor = LZ78Compressor() + >>> str(lz78_compressor.compress("ababcbababaa")) + '[(0, a), (0, b), (1, b), (0, c), (2, a), (5, b), (1, a)]' + >>> str(lz78_compressor.compress("aacaacabcabaaac")) + '[(0, a), (1, c), (1, a), (0, c), (1, b), (4, a), (0, b), (3, a)]' + """ + + phrase_dict = {} + tokens = [] + code = 1 + phrase = "" + for char in text: + phrase += char + if phrase not in phrase_dict: + phrase_dict[phrase] = str(code) + if len(phrase) == 1: + tokens.append(Token("0", phrase)) + else: + tokens.append(Token(phrase_dict[phrase[:-1]], phrase[-1])) + code += 1 + phrase = "" + return tokens + + def decompress(self, tokens: list[Token]) -> str: + """ Convert the list of tokens into an output string. Args: @@ -77,41 +77,40 @@ def decompress(self, tokens: list[Token]) -> str: Returns: output: decompressed text - + Tests: >>> lz78_compressor = LZ78Compressor() - >>> lz78_compressor.decompress([Token(0, 'c'), Token(0, 'a'), Token(0, 'b'), Token(0, 'r'), Token(2, 'c'), + >>> lz78_compressor.decompress([Token(0, 'c'), Token(0, 'a'), Token(0, 'b'), Token(0, 'r'), Token(2, 'c'), ... Token(2, 'd'), Token(2, 'b'), Token(4, 'a'), Token(4, 'r'), Token(2, 'r'), Token(8, 'd')]) 'cabracadabrarrarrad' - >>> lz78_compressor.decompress([Token(0, 'a'), Token(0, 'b'), Token(1, 'b'), Token(0, 'c'), + >>> lz78_compressor.decompress([Token(0, 'a'), Token(0, 'b'), Token(1, 'b'), Token(0, 'c'), ... Token(2, 'a'), Token(5, 'b'), Token(1, 'a')]) 'ababcbababaa' - >>> lz78_compressor.decompress([Token(0, 'a'), Token(1, 'c'), Token(1, 'a'), Token(0, 'c'), + >>> lz78_compressor.decompress([Token(0, 'a'), Token(1, 'c'), Token(1, 'a'), Token(0, 'c'), ... Token(1, 'b'), Token(4, 'a'), Token(0, 'b'), Token(3, 'a')]) 'aacaacabcabaaa' - """ + """ + + text = "" + phrase_dict = {"0": ""} + code = 1 + for token in tokens: + phrase = phrase_dict[str(token.index)] + token.char + phrase_dict[str(code)] = phrase + code += 1 + text += phrase + return text + + +if __name__ == "__main__": + from doctest import testmod + + testmod() + + lz78_compressor = LZ78Compressor() - text = '' - phrase_dict = {'0': ''} - code = 1 - for token in tokens: - phrase = phrase_dict[str(token.index)] + token.char - phrase_dict[str(code)] = phrase - code += 1 - text += phrase - return text - - -if __name__ == '__main__': - from doctest import testmod - - testmod() - - - lz78_compressor = LZ78Compressor() - - # Example - text = 'aacaacabcabaaa' - tokens = lz78_compressor.compress(text) - decompressedText = lz78_compressor.decompress(tokens) - assert decompressedText == text, 'Invalid result.' + # Example + text = "aacaacabcabaaa" + tokens = lz78_compressor.compress(text) + decompressedText = lz78_compressor.decompress(tokens) + assert decompressedText == text, "Invalid result." From b544f10039f91b0d1d6003973e3715b5a901d2ad Mon Sep 17 00:00:00 2001 From: Ahmed Tamer Date: Mon, 7 Oct 2024 07:05:24 +0300 Subject: [PATCH 03/11] Update lz78.py fixed some errors --- compression/lz78.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/compression/lz78.py b/compression/lz78.py index 80df9f1291e5..b89b2d19fa4b 100644 --- a/compression/lz78.py +++ b/compression/lz78.py @@ -12,7 +12,8 @@ @dataclass class Token: """ - Dataclass representing pair called token consisting of the dictionary index and a single character that follows the phrase in the dictionary. + Dataclass representing pair called token consisting of the dictionary index + and a single character that follows the phrase in the dictionary. """ index: int @@ -80,14 +81,15 @@ def decompress(self, tokens: list[Token]) -> str: Tests: >>> lz78_compressor = LZ78Compressor() - >>> lz78_compressor.decompress([Token(0, 'c'), Token(0, 'a'), Token(0, 'b'), Token(0, 'r'), Token(2, 'c'), - ... Token(2, 'd'), Token(2, 'b'), Token(4, 'a'), Token(4, 'r'), Token(2, 'r'), Token(8, 'd')]) + >>> lz78_compressor.decompress([Token(0, 'c'), Token(0, 'a'), Token(0, 'b'), + ... Token(0, 'r'), Token(2, 'c'), Token(2, 'd'), Token(2, 'b'), Token(4, 'a'), + ... Token(4, 'r'), Token(2, 'r'), Token(8, 'd')]) 'cabracadabrarrarrad' - >>> lz78_compressor.decompress([Token(0, 'a'), Token(0, 'b'), Token(1, 'b'), Token(0, 'c'), - ... Token(2, 'a'), Token(5, 'b'), Token(1, 'a')]) + >>> lz78_compressor.decompress([Token(0, 'a'), Token(0, 'b'), Token(1, 'b'), + ... Token(0, 'c'), Token(2, 'a'), Token(5, 'b'), Token(1, 'a')]) 'ababcbababaa' - >>> lz78_compressor.decompress([Token(0, 'a'), Token(1, 'c'), Token(1, 'a'), Token(0, 'c'), - ... Token(1, 'b'), Token(4, 'a'), Token(0, 'b'), Token(3, 'a')]) + >>> lz78_compressor.decompress([Token(0, 'a'), Token(1, 'c'), Token(1, 'a'), + ... Token(0, 'c'), Token(1, 'b'), Token(4, 'a'), Token(0, 'b'), Token(3, 'a')]) 'aacaacabcabaaa' """ @@ -112,5 +114,5 @@ def decompress(self, tokens: list[Token]) -> str: # Example text = "aacaacabcabaaa" tokens = lz78_compressor.compress(text) - decompressedText = lz78_compressor.decompress(tokens) - assert decompressedText == text, "Invalid result." + decompressedtext = lz78_compressor.decompress(tokens) + assert decompressedtext == text, "Invalid result." From 1afdf0f887154a577acf74d4d94519e2856094a4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Oct 2024 04:05:45 +0000 Subject: [PATCH 04/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- compression/lz78.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compression/lz78.py b/compression/lz78.py index b89b2d19fa4b..8c22942130c3 100644 --- a/compression/lz78.py +++ b/compression/lz78.py @@ -12,7 +12,7 @@ @dataclass class Token: """ - Dataclass representing pair called token consisting of the dictionary index + Dataclass representing pair called token consisting of the dictionary index and a single character that follows the phrase in the dictionary. """ @@ -81,8 +81,8 @@ def decompress(self, tokens: list[Token]) -> str: Tests: >>> lz78_compressor = LZ78Compressor() - >>> lz78_compressor.decompress([Token(0, 'c'), Token(0, 'a'), Token(0, 'b'), - ... Token(0, 'r'), Token(2, 'c'), Token(2, 'd'), Token(2, 'b'), Token(4, 'a'), + >>> lz78_compressor.decompress([Token(0, 'c'), Token(0, 'a'), Token(0, 'b'), + ... Token(0, 'r'), Token(2, 'c'), Token(2, 'd'), Token(2, 'b'), Token(4, 'a'), ... Token(4, 'r'), Token(2, 'r'), Token(8, 'd')]) 'cabracadabrarrarrad' >>> lz78_compressor.decompress([Token(0, 'a'), Token(0, 'b'), Token(1, 'b'), From c307a603830048cabbeef4d011b07f02485890f6 Mon Sep 17 00:00:00 2001 From: Ahmed Tamer Date: Mon, 7 Oct 2024 07:09:02 +0300 Subject: [PATCH 05/11] Update lz78.py fixed errors --- compression/lz78.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/compression/lz78.py b/compression/lz78.py index 8c22942130c3..6df96683ef73 100644 --- a/compression/lz78.py +++ b/compression/lz78.py @@ -62,9 +62,9 @@ def compress(self, text: str) -> list[Token]: if phrase not in phrase_dict: phrase_dict[phrase] = str(code) if len(phrase) == 1: - tokens.append(Token("0", phrase)) + tokens.append(Token(0, phrase)) else: - tokens.append(Token(phrase_dict[phrase[:-1]], phrase[-1])) + tokens.append(Token(int(phrase_dict[phrase[:-1]]), phrase[-1])) code += 1 phrase = "" return tokens @@ -89,7 +89,8 @@ def decompress(self, tokens: list[Token]) -> str: ... Token(0, 'c'), Token(2, 'a'), Token(5, 'b'), Token(1, 'a')]) 'ababcbababaa' >>> lz78_compressor.decompress([Token(0, 'a'), Token(1, 'c'), Token(1, 'a'), - ... Token(0, 'c'), Token(1, 'b'), Token(4, 'a'), Token(0, 'b'), Token(3, 'a')]) + ... Token(0, 'c'), Token(1, 'b'), Token(4, 'a'), + ... Token(0, 'b'), Token(3, 'a')]) 'aacaacabcabaaa' """ From d85fde21249fd53a9a6f495889c6f46899805192 Mon Sep 17 00:00:00 2001 From: Ahmed Tamer Date: Mon, 7 Oct 2024 14:44:30 +0300 Subject: [PATCH 06/11] Update compression/lz78.py Co-authored-by: Christian Clauss --- compression/lz78.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compression/lz78.py b/compression/lz78.py index 6df96683ef73..8fc6d162ffd9 100644 --- a/compression/lz78.py +++ b/compression/lz78.py @@ -115,5 +115,5 @@ def decompress(self, tokens: list[Token]) -> str: # Example text = "aacaacabcabaaa" tokens = lz78_compressor.compress(text) - decompressedtext = lz78_compressor.decompress(tokens) - assert decompressedtext == text, "Invalid result." + decompressed_text = lz78_compressor.decompress(tokens) + assert decompressed_text == text, "Invalid result." From c7d1ece9973f2adf1ba2cff1b46c9d0eb3c081a7 Mon Sep 17 00:00:00 2001 From: Ahmed Tamer Date: Mon, 7 Oct 2024 14:49:08 +0300 Subject: [PATCH 07/11] Update lz78.py added more tests --- compression/lz78.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/compression/lz78.py b/compression/lz78.py index 8fc6d162ffd9..17f2200fc624 100644 --- a/compression/lz78.py +++ b/compression/lz78.py @@ -51,6 +51,14 @@ def compress(self, text: str) -> list[Token]: '[(0, a), (0, b), (1, b), (0, c), (2, a), (5, b), (1, a)]' >>> str(lz78_compressor.compress("aacaacabcabaaac")) '[(0, a), (1, c), (1, a), (0, c), (1, b), (4, a), (0, b), (3, a)]' + >>> lz78_compressor.compress("") + [] + >>> lz78_compressor.compress([]) + [] + >>> lz78_compressor.compress({}) + [] + >>> len("ababc") >= len(compressor.compress("ababc")) + True """ phrase_dict = {} From 34fe6bf4c77cd20bcfae132152dcce4f4f8c2e1b Mon Sep 17 00:00:00 2001 From: Ahmed Tamer Date: Mon, 7 Oct 2024 14:56:30 +0300 Subject: [PATCH 08/11] Update lz78.py fixed some errors --- compression/lz78.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/compression/lz78.py b/compression/lz78.py index 17f2200fc624..107a31307d41 100644 --- a/compression/lz78.py +++ b/compression/lz78.py @@ -51,13 +51,13 @@ def compress(self, text: str) -> list[Token]: '[(0, a), (0, b), (1, b), (0, c), (2, a), (5, b), (1, a)]' >>> str(lz78_compressor.compress("aacaacabcabaaac")) '[(0, a), (1, c), (1, a), (0, c), (1, b), (4, a), (0, b), (3, a)]' - >>> lz78_compressor.compress("") - [] - >>> lz78_compressor.compress([]) - [] - >>> lz78_compressor.compress({}) - [] - >>> len("ababc") >= len(compressor.compress("ababc")) + >>> str(lz78_compressor.compress("")) + '[]' + >>> str(lz78_compressor.compress([])) + '[]' + >>> str(lz78_compressor.compress({})) + '[]' + >>> len("ababc") >= len(lz78_compressor.compress("ababc")) True """ From 676da5034434cf341547b8fffcaa641767f9ad61 Mon Sep 17 00:00:00 2001 From: Ahmed Tamer Date: Tue, 8 Oct 2024 18:39:00 +0300 Subject: [PATCH 09/11] Update lz78.py adding error handling and more tests --- compression/lz78.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/compression/lz78.py b/compression/lz78.py index 107a31307d41..25db7de37a79 100644 --- a/compression/lz78.py +++ b/compression/lz78.py @@ -53,14 +53,19 @@ def compress(self, text: str) -> list[Token]: '[(0, a), (1, c), (1, a), (0, c), (1, b), (4, a), (0, b), (3, a)]' >>> str(lz78_compressor.compress("")) '[]' - >>> str(lz78_compressor.compress([])) - '[]' - >>> str(lz78_compressor.compress({})) - '[]' + >>> lz78_compressor.compress([]) + Traceback (most recent call last): + TypeError: Expected string. + >>> lz78_compressor.compress({}) + Traceback (most recent call last): + TypeError: Expected string. >>> len("ababc") >= len(lz78_compressor.compress("ababc")) True """ + if not isinstance(text, str): + raise TypeError("Expected string.") + phrase_dict = {} tokens = [] code = 1 From c27a91ebf680bf561028a7c81a8ac3fb2f5d2a53 Mon Sep 17 00:00:00 2001 From: Ahmed Tamer Date: Tue, 8 Oct 2024 18:39:15 +0300 Subject: [PATCH 10/11] Update compression/lz78.py Co-authored-by: Christian Clauss --- compression/lz78.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compression/lz78.py b/compression/lz78.py index 25db7de37a79..c2f2b5f132d4 100644 --- a/compression/lz78.py +++ b/compression/lz78.py @@ -59,7 +59,8 @@ def compress(self, text: str) -> list[Token]: >>> lz78_compressor.compress({}) Traceback (most recent call last): TypeError: Expected string. - >>> len("ababc") >= len(lz78_compressor.compress("ababc")) + >>> all(len(s) >= len(lz78_compressor.compress(x)) for s in ( + ... "", "AA", "AB", "AAA", "ABC", "ABCDEFGH")) True """ From bab8e63c9b089f5890dbec30ed5694b5850411bc Mon Sep 17 00:00:00 2001 From: Ahmed Tamer Date: Tue, 8 Oct 2024 18:47:13 +0300 Subject: [PATCH 11/11] Update lz78.py fixed a bug in a test case --- compression/lz78.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compression/lz78.py b/compression/lz78.py index c2f2b5f132d4..b4987488f81e 100644 --- a/compression/lz78.py +++ b/compression/lz78.py @@ -59,7 +59,7 @@ def compress(self, text: str) -> list[Token]: >>> lz78_compressor.compress({}) Traceback (most recent call last): TypeError: Expected string. - >>> all(len(s) >= len(lz78_compressor.compress(x)) for s in ( + >>> all(len(s) >= len(lz78_compressor.compress(s)) for s in ( ... "", "AA", "AB", "AAA", "ABC", "ABCDEFGH")) True """