|
| 1 | +""" |
| 2 | +Sources: |
| 3 | +https://en.wikipedia.org/wiki/LZ77_and_LZ78#LZ78 |
| 4 | +""" |
| 5 | + |
| 6 | +from dataclasses import dataclass |
| 7 | + |
| 8 | +__version__ = "1.0" |
| 9 | +__author__ = "Ahmed Tamer" |
| 10 | + |
@dataclass
class Token:
    """
    A single LZ78 output pair.

    Attributes are the dictionary index of a phrase and the one character
    that follows that phrase.
    """

    index: int
    char: str

    def __repr__(self) -> str:
        """
        Render the token compactly as ``(index, char)``.

        >>> token = Token(1, "c")
        >>> repr(token)
        '(1, c)'
        >>> str(token)
        '(1, c)'
        """
        return "({}, {})".format(self.index, self.char)
| 29 | + |
| 30 | + |
class LZ78Compressor:
    """
    Compress and decompress text with the LZ78 algorithm.

    Both methods are stateless: the phrase dictionary is rebuilt from scratch
    on every call, so one instance can be reused for any number of inputs.
    """

    def compress(self, text: str) -> list[Token]:
        """
        Compress the given string using the LZ78 compression algorithm.

        Args:
            text: string to be compressed

        Returns:
            The compressed text as a list of Tokens. Each token holds the
            integer dictionary index of the longest already-known phrase
            (0 stands for the empty phrase) and the character following it.

        Tests:
        >>> lz78_compressor = LZ78Compressor()
        >>> str(lz78_compressor.compress("ababcbababaa"))
        '[(0, a), (0, b), (1, b), (0, c), (2, a), (5, b), (1, a)]'
        >>> str(lz78_compressor.compress("aacaacabcabaaac"))
        '[(0, a), (1, c), (1, a), (0, c), (1, b), (4, a), (0, b), (3, a), (0, c)]'
        >>> lz78_compressor.compress("")
        []
        >>> lz78_compressor.compress("aa")
        [(0, a), (0, a)]
        >>> lz78_compressor.compress("ab")[0].index
        0
        """
        phrase_dict: dict[str, int] = {}
        tokens: list[Token] = []
        phrase = ""
        for char in text:
            phrase += char
            if phrase in phrase_dict:
                # Known phrase: keep extending it with the next character.
                continue
            # Codes are handed out sequentially, starting at 1.
            phrase_dict[phrase] = len(phrase_dict) + 1
            tokens.append(self._make_token(phrase, phrase_dict))
            phrase = ""
        if phrase:
            # The input ended inside an already-known phrase. Emit it as
            # (index of its prefix, last character) so that no trailing
            # characters are lost; the prefix is always in the dictionary
            # because every stored phrase is a stored phrase plus one char.
            tokens.append(self._make_token(phrase, phrase_dict))
        return tokens

    @staticmethod
    def _make_token(phrase: str, phrase_dict: dict[str, int]) -> Token:
        """Build the token for ``phrase``: index of its prefix + last char."""
        prefix = phrase[:-1]
        # An empty prefix maps to the reserved index 0.
        index = phrase_dict[prefix] if prefix else 0
        return Token(index, phrase[-1])

    def decompress(self, tokens: list[Token]) -> str:
        """
        Convert the list of tokens back into the original string.

        Args:
            tokens: list containing pairs (index, char)

        Returns:
            The decompressed text.

        Raises:
            KeyError: if a token refers to an index that has not been
                defined by the tokens preceding it.

        Tests:
        >>> lz78_compressor = LZ78Compressor()
        >>> lz78_compressor.decompress([
        ...     Token(0, 'c'), Token(0, 'a'), Token(0, 'b'), Token(0, 'r'),
        ...     Token(2, 'c'), Token(2, 'd'), Token(2, 'b'), Token(4, 'a'),
        ...     Token(4, 'r'), Token(2, 'r'), Token(8, 'd'),
        ... ])
        'cabracadabrarrarrad'
        >>> lz78_compressor.decompress([
        ...     Token(0, 'a'), Token(0, 'b'), Token(1, 'b'), Token(0, 'c'),
        ...     Token(2, 'a'), Token(5, 'b'), Token(1, 'a'),
        ... ])
        'ababcbababaa'
        >>> lz78_compressor.decompress([
        ...     Token(0, 'a'), Token(1, 'c'), Token(1, 'a'), Token(0, 'c'),
        ...     Token(1, 'b'), Token(4, 'a'), Token(0, 'b'), Token(3, 'a'),
        ... ])
        'aacaacabcabaaa'
        >>> lz78_compressor.decompress(
        ...     lz78_compressor.compress("aacaacabcabaaac")
        ... )
        'aacaacabcabaaac'
        >>> lz78_compressor.decompress([Token('0', 'a'), Token('1', 'b')])
        'aab'
        >>> lz78_compressor.decompress([])
        ''
        """
        # Index 0 is reserved for the empty phrase.
        phrase_dict: dict[int, str] = {0: ""}
        pieces: list[str] = []
        for code, token in enumerate(tokens, start=1):
            # int() keeps tokens that carry the index as a numeric string
            # working alongside integer indices.
            phrase = phrase_dict[int(token.index)] + token.char
            phrase_dict[code] = phrase
            pieces.append(phrase)
        return "".join(pieces)
| 103 | + |
| 104 | + |
if __name__ == '__main__':
    import doctest

    # Run every doctest embedded in this module.
    doctest.testmod()

    # Example: compressing and then decompressing must give back the input.
    sample = 'aacaacabcabaaa'
    compressor = LZ78Compressor()
    restored = compressor.decompress(compressor.compress(sample))
    assert restored == sample, 'Invalid result.'
0 commit comments