|
def decrypt_caesar_with_chi_squared(
    ciphertext: str,
    cipher_alphabet=None,
    frequencies_dict=None,
    case_sensetive: bool = False,
) -> tuple:
    """
    Break a Caesar cipher by scoring every shift against letter frequencies.

    Basic Usage
    ===========
    Arguments:
    * ciphertext (str): the text to decode (encoded with the caesar cipher)

    Optional Arguments:
    * cipher_alphabet (list): the alphabet used for the cipher (each letter is
      a string separated by commas). Defaults to the lower-case letters a-z
      when omitted or empty.
    * frequencies_dict (dict): a dictionary of letter frequencies where keys
      are the letters and values are a percentage representation of the
      frequency as a decimal/float. Defaults to English letter frequencies
      when omitted or empty.
    * case_sensetive (bool): a boolean value: True if the case matters during
      decryption, False if it doesn't (the ciphertext is lower-cased first)

    Returns:
    * A tuple in the form of:
        (
            most_likely_cipher,
            most_likely_cipher_chi_squared_value,
            decoded_most_likely_cipher
        )

      where...
      - most_likely_cipher is an integer representing the shift of the
        smallest chi-squared statistic (most likely key)
      - most_likely_cipher_chi_squared_value is a float representing the
        chi-squared statistic of the most likely shift (the integer 0 when no
        character of the text could be scored, float("inf") when an observed
        letter has a frequency of zero)
      - decoded_most_likely_cipher is a string with the decoded cipher
        (decoded by the most_likely_cipher key)

    Raises:
    * AttributeError: if ciphertext has no ``lower`` method (e.g. an int) and
      case_sensetive is False


    The Chi-squared test
    ====================

    The caesar cipher
    -----------------
    The caesar cipher is a very insecure encryption algorithm, however it has
    been used since Julius Caesar. The cipher is a simple substitution cipher
    where each character in the plain text is replaced by a character in the
    alphabet a certain number of characters after the original character. The
    number of characters away is called the shift or key. For example:

    Plain text: hello
    Key: 1
    Cipher text: ifmmp
    (each letter in hello has been shifted one to the right in the eng. alphabet)

    As you can imagine, this doesn't provide lots of security. In fact
    decrypting ciphertext by brute-force is extremely easy even by hand.
    One way to automate that is the chi-squared test.

    The chi-squared test
    --------------------
    Each letter in the english alphabet has a frequency, or the amount of times
    it shows up compared to other letters (usually expressed as a decimal
    representing the percentage likelihood). The most common letter in the
    english language is "e" with a frequency of 0.11162 or 11.162%. The test is
    completed in the following fashion.

    1. The ciphertext is decoded in a brute force way (one candidate per
       possible shift, i.e. one per letter of the alphabet)
    2. For every candidate, each letter that has an entry in the frequency
       table gets a predicted count.

       NOTE: this implementation predicts the count as the letter's frequency
       multiplied by the number of times that same letter occurs in the
       candidate (not by the total message length, as the textbook test
       does). The resulting scores are a heuristic variant of chi-squared;
       the computation is kept as is so that returned values stay stable.

    3. The margin of error between the predicted and the actual count is
       computed with the chi-squared formula:

        Let:
        - n be the number of times the letter actually appears
        - p be the predicted value (see #2)
        - v be the chi-squared test result (referred to here as chi-squared
          value/statistic)

        (n - p)^2
        --------- = v
           p

    4. The value v of a letter is added to the total once for every character
       of the candidate that is this letter. The total is the chi-squared
       statistic for that encryption key.
    5. The encryption key with the lowest chi-squared value is the most likely
       to be the decoded answer.

    Further Reading
    ================

    * http://practicalcryptography.com/cryptanalysis/text-characterisation/chi-squared-statistic/
    * https://en.wikipedia.org/wiki/Letter_frequency
    * https://en.wikipedia.org/wiki/Chi-squared_test
    * https://en.m.wikipedia.org/wiki/Caesar_cipher

    Doctests
    ========
    >>> decrypt_caesar_with_chi_squared('dof pz aol jhlzhy jpwoly zv wvwbshy? pa pz avv lhzf av jyhjr!')
    (7, 3129.228005747531, 'why is the caesar cipher so popular? it is too easy to crack!')

    >>> decrypt_caesar_with_chi_squared('crybd cdbsxq')
    (10, 233.35343938980898, 'short string')

    >>> decrypt_caesar_with_chi_squared(12)
    Traceback (most recent call last):
    AttributeError: 'int' object has no attribute 'lower'
    """
    from collections import Counter

    alphabet_letters = cipher_alphabet or [chr(i) for i in range(97, 123)]
    alphabet_size = len(alphabet_letters)

    # An omitted (or empty) table falls back to the frequencies of letters in
    # the english language (how much they show up).
    frequencies = frequencies_dict or {
        "a": 0.08497,
        "b": 0.01492,
        "c": 0.02202,
        "d": 0.04253,
        "e": 0.11162,
        "f": 0.02228,
        "g": 0.02015,
        "h": 0.06094,
        "i": 0.07546,
        "j": 0.00153,
        "k": 0.01292,
        "l": 0.04025,
        "m": 0.02406,
        "n": 0.06749,
        "o": 0.07507,
        "p": 0.01929,
        "q": 0.00095,
        "r": 0.07587,
        "s": 0.06327,
        "t": 0.09356,
        "u": 0.02758,
        "v": 0.00978,
        "w": 0.02560,
        "x": 0.00150,
        "y": 0.01994,
        "z": 0.00077,
    }

    if not case_sensetive:
        ciphertext = ciphertext.lower()

    # Position of the FIRST occurrence of every alphabet symbol (the same
    # answer list.index would give), so each character is a constant-time
    # lookup instead of a linear search plus an exception for non-letters.
    positions = {}
    for index, symbol in enumerate(alphabet_letters):
        positions.setdefault(symbol, index)

    # shift -> (chi squared statistic, text decrypted with that shift)
    chi_squared_statistic_values = {}

    # cycle through all of the shifts
    for shift in range(alphabet_size):
        # decrypt the message with the shift; characters outside the alphabet
        # (spaces, punctuation, ...) are copied through unchanged
        decrypted_chars = []
        for letter in ciphertext:
            position = positions.get(letter)
            if position is None:
                decrypted_chars.append(letter)
            else:
                decrypted_chars.append(
                    alphabet_letters[(position - shift) % alphabet_size]
                )
        decrypted_with_shift = "".join(decrypted_chars)

        # Score every distinct character once instead of re-counting it for
        # each of its occurrences.
        letter_scores = {}
        for letter, occurrences in Counter(decrypted_with_shift).items():
            # The frequency table is consulted with the exact character when
            # case matters, with its lower-case form otherwise.
            key = letter if case_sensetive else letter.lower()
            if key not in frequencies:
                continue

            # Predicted count of the letter (see the NOTE in the docstring)
            expected = frequencies[key] * occurrences

            if expected == 0:
                # A letter that "never" occurs was observed: worst possible
                # fit, and it avoids dividing by zero.
                letter_scores[letter] = float("inf")
            else:
                letter_scores[letter] = ((occurrences - expected) ** 2) / expected

        # Add the score once per character, in message order. A plain loop is
        # used (rather than sum()) so the floating point result does not
        # depend on the summation strategy of the interpreter.
        chi_squared_statistic = 0
        for letter in decrypted_with_shift:
            if letter in letter_scores:
                chi_squared_statistic += letter_scores[letter]

        chi_squared_statistic_values[shift] = (
            chi_squared_statistic,
            decrypted_with_shift,
        )

    # The most likely shift has the smallest statistic; ties are broken by the
    # decrypted text and then by the lowest shift.
    most_likely_cipher = min(
        chi_squared_statistic_values, key=chi_squared_statistic_values.get
    )
    (
        most_likely_cipher_chi_squared_value,
        decoded_most_likely_cipher,
    ) = chi_squared_statistic_values[most_likely_cipher]

    # Return the data on the most likely shift
    return (
        most_likely_cipher,
        most_likely_cipher_chi_squared_value,
        decoded_most_likely_cipher,
    )
0 commit comments