Added new algorithm: cracking caesar cipher with the chi-squared test (#1950)

mrmaxguns · cclauss · web-flow · commit 4acc28ba55d6 · 2020-05-06T19:42:18.000+02:00
* added decrypt_caesar_with_chi_squared.py and ran all checks

* Updated default parameters

Removed mistake with mutable default arguments

Co-authored-by: Christian Clauss &lt;cclauss@me.com&gt;

* Updated handling for optional arguments

Co-authored-by: Christian Clauss &lt;cclauss@me.com&gt;

* Changed return statement to tuple

Made function return a tuple instead of a list

* Added more doctests

* Fixed spelling mistakes

* black . - reformatted decrypt_caesar_with_chi_squared.py

* Updated if statements to fit the updated code

* Minimized amount of lines in the code.

Co-authored-by: Christian Clauss &lt;cclauss@me.com&gt;
diff --git a/ciphers/decrypt_caesar_with_chi_squared.py b/ciphers/decrypt_caesar_with_chi_squared.py
@@ -0,0 +1,228 @@
+def decrypt_caesar_with_chi_squared(
+    ciphertext: str,
+    cipher_alphabet=None,
+    frequencies_dict=None,
+    case_sensetive: bool = False,
+) -> list:
+    """
+    Basic Usage
+    ===========
+    Arguments:
+    * ciphertext (str): the text to decode (encoded with the caesar cipher)
+
+    Optional Arguments:
+    * cipher_alphabet (list): the alphabet used for the cipher (each letter is
+      a string separated by commas)
+    * frequencies_dict (dict): a dictionary of word frequencies where keys are
+      the letters and values are a percentage representation of the frequency as
+      a decimal/float
+    * case_sensetive (bool): a boolean value: True if the case matters during
+      decryption, False if it doesn't
+
+    Returns:
+    * A tuple in the form of:
+      (
+        most_likely_cipher,
+        most_likely_cipher_chi_squared_value,
+        decoded_most_likely_cipher
+      )
+
+      where...
+      - most_likely_cipher is an integer representing the shift of the smallest
+        chi-squared statistic (most likely key)
+      - most_likely_cipher_chi_squared_value is a float representing the
+        chi-squared statistic of the most likely shift
+      - decoded_most_likely_cipher is a string with the decoded cipher
+        (decoded by the most_likely_cipher key)
+
+
+    The Chi-squared test
+    ====================
+
+    The caesar cipher
+    -----------------
+    The caesar cipher is a very insecure encryption algorithm, however it has
+    been used since Julius Caesar. The cipher is a simple substitution cipher
+    where each character in the plain text is replaced by a character in the
+    alphabet a certain number of characters after the original character. The
+    number of characters away is called the shift or key. For example:
+
+    Plain text: hello
+    Key: 1
+    Cipher text: ifmmp
+    (each letter in hello has been shifted one to the right in the eng. alphabet)
+
+    As you can imagine, this doesn't provide lots of security. In fact
+    decrypting ciphertext by brute-force is extremely easy even by hand. However
+     one way to do that is the chi-squared test.
+
+    The chi-squared test
+    -------------------
+    Each letter in the english alphabet has a frequency, or the amount of times
+    it shows up compared to other letters (usually expressed as a decimal
+    representing the percentage likelihood). The most common letter in the
+    english language is "e" with a frequency of 0.11162 or 11.162%. The test is
+    completed in the following fashion.
+
+    1. The ciphertext is decoded in a brute force way (every combination of the
+       26 possible combinations)
+    2. For every combination, for each letter in the combination, the average
+       amount of times the letter should appear the message is calculated by
+       multiplying the total number of characters by the frequency of the letter
+
+       For example:
+       In a message of 100 characters, e should appear around 11.162 times.
+
+     3. Then, to calculate the margin of error (the amount of times the letter
+        SHOULD appear with the amount of times the letter DOES appear), we use
+        the chi-squared test. The following formula is used:
+
+        Let:
+        - n be the number of times the letter actually appears
+        - p be the predicted value of the number of times the letter should
+          appear (see #2)
+        - let v be the chi-squared test result (referred to here as chi-squared
+          value/statistic)
+
+        (n - p)^2
+        --------- = v
+           p
+
+    4. Each chi squared value for each letter is then added up to the total.
+       The total is the chi-squared statistic for that encryption key.
+    5. The encryption key with the lowest chi-squared value is the most likely
+       to be the decoded answer.
+
+    Further Reading
+    ================
+
+    * http://practicalcryptography.com/cryptanalysis/text-characterisation/chi-squared-statistic/
+    * https://en.wikipedia.org/wiki/Letter_frequency
+    * https://en.wikipedia.org/wiki/Chi-squared_test
+    * https://en.m.wikipedia.org/wiki/Caesar_cipher
+
+    Doctests
+    ========
+    >>> decrypt_caesar_with_chi_squared('dof pz aol jhlzhy jpwoly zv wvwbshy? pa pz avv lhzf av jyhjr!')
+    (7, 3129.228005747531, 'why is the caesar cipher so popular? it is too easy to crack!')
+
+    >>> decrypt_caesar_with_chi_squared('crybd cdbsxq')
+    (10, 233.35343938980898, 'short string')
+
+    >>> decrypt_caesar_with_chi_squared(12)
+    Traceback (most recent call last):
+    AttributeError: 'int' object has no attribute 'lower'
+    """
+    alphabet_letters = cipher_alphabet or [chr(i) for i in range(97, 123)]
+    frequencies_dict = frequencies_dict or {}
+
+    if frequencies_dict == {}:
+        # Frequencies of letters in the english language (how much they show up)
+        frequencies = {
+            "a": 0.08497,
+            "b": 0.01492,
+            "c": 0.02202,
+            "d": 0.04253,
+            "e": 0.11162,
+            "f": 0.02228,
+            "g": 0.02015,
+            "h": 0.06094,
+            "i": 0.07546,
+            "j": 0.00153,
+            "k": 0.01292,
+            "l": 0.04025,
+            "m": 0.02406,
+            "n": 0.06749,
+            "o": 0.07507,
+            "p": 0.01929,
+            "q": 0.00095,
+            "r": 0.07587,
+            "s": 0.06327,
+            "t": 0.09356,
+            "u": 0.02758,
+            "v": 0.00978,
+            "w": 0.02560,
+            "x": 0.00150,
+            "y": 0.01994,
+            "z": 0.00077,
+        }
+    else:
+        # Custom frequencies dictionary
+        frequencies = frequencies_dict
+
+    if not case_sensetive:
+        ciphertext = ciphertext.lower()
+
+    # Chi squared statistic values
+    chi_squared_statistic_values = {}
+
+    # cycle through all of the shifts
+    for shift in range(len(alphabet_letters)):
+        decrypted_with_shift = ""
+
+        # decrypt the message with the shift
+        for letter in ciphertext:
+            try:
+                # Try to index the letter in the alphabet
+                new_key = (alphabet_letters.index(letter) - shift) % len(
+                    alphabet_letters
+                )
+                decrypted_with_shift += alphabet_letters[new_key]
+            except ValueError:
+                # Append the character if it isn't in the alphabet
+                decrypted_with_shift += letter
+
+        chi_squared_statistic = 0
+
+        # Loop through each letter in the decoded message with the shift
+        for letter in decrypted_with_shift:
+            if case_sensetive:
+                if letter in frequencies:
+                    # Get the amount of times the letter occurs in the message
+                    occurrences = decrypted_with_shift.count(letter)
+
+                    # Get the excepcted amount of times the letter should appear based on letter frequencies
+                    expected = frequencies[letter] * occurrences
+
+                    # Complete the chi squared statistic formula
+                    chi_letter_value = ((occurrences - expected) ** 2) / expected
+
+                    # Add the margin of error to the total chi squared statistic
+                    chi_squared_statistic += chi_letter_value
+            else:
+                if letter.lower() in frequencies:
+                    # Get the amount of times the letter occurs in the message
+                    occurrences = decrypted_with_shift.count(letter)
+
+                    # Get the excepcted amount of times the letter should appear based on letter frequencies
+                    expected = frequencies[letter] * occurrences
+
+                    # Complete the chi squared statistic formula
+                    chi_letter_value = ((occurrences - expected) ** 2) / expected
+
+                    # Add the margin of error to the total chi squared statistic
+                    chi_squared_statistic += chi_letter_value
+
+        # Add the data to the chi_squared_statistic_values dictionary
+        chi_squared_statistic_values[shift] = [
+            chi_squared_statistic,
+            decrypted_with_shift,
+        ]
+
+    # Get the most likely cipher by finding the cipher with the smallest chi squared statistic
+    most_likely_cipher = min(
+        chi_squared_statistic_values, key=chi_squared_statistic_values.get
+    )
+
+    # Get all the data from the most likely cipher (key, decoded message)
+    most_likely_cipher_chi_squared_value = chi_squared_statistic_values[
+        most_likely_cipher
+    ][0]
+    decoded_most_likely_cipher = chi_squared_statistic_values[most_likely_cipher][1]
+
+    # Return the data on the most likely shift
+    return (
+        most_likely_cipher,
+        most_likely_cipher_chi_squared_value,
+        decoded_most_likely_cipher,
+    )