|
# The standard Base64 alphabet (RFC 4648, section 4): the character at index i
# represents the 6-bit value i.
B64_CHARSET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"


def base64_encode(data: bytes) -> bytes:
    """Encode ``data`` as Base64 according to RFC 4648.

    The input is first written out as a stream of binary digits (8 per byte).
    Zero bits are then appended until the stream length is a multiple of 6, and
    every group of 6 bits is mapped to the character of ``B64_CHARSET`` at that
    index. Because the input length is a multiple of 8 bits, the number of
    appended bits is always 0, 2 or 4; one "=" sign is emitted for every 2
    appended bits.

    Example:
        b"AA" -> 01000001 01000001 -> 010000 010100 0001
        The last group is 2 bits short, so "00" is appended (000100) and the
        result is "QUE" followed by a single "=".

    This implementation always appends zeros. A different encoder could choose
    any of 00, 01, 10 or 11 for those two bits, which is why the appended bits
    of Base64 text are sometimes used in steganography to hide data.

    Args:
        data: The bytes to encode. ``bytearray`` and ``memoryview`` objects are
            accepted as well.

    Returns:
        The Base64 encoding of ``data`` as ASCII bytes, including padding.

    Raises:
        TypeError: If ``data`` is not a bytes-like object.

    >>> from base64 import b64encode
    >>> a = b"This pull request is part of Hacktoberfest20!"
    >>> b = b"https://tools.ietf.org/html/rfc4648"
    >>> c = b"A"
    >>> base64_encode(a) == b64encode(a)
    True
    >>> base64_encode(b) == b64encode(b)
    True
    >>> base64_encode(c) == b64encode(c)
    True
    >>> base64_encode(b"AA")
    b'QUE='
    >>> base64_encode(bytearray(b"A"))
    b'QQ=='
    >>> base64_encode(b"")
    b''
    >>> base64_encode("abc")
    Traceback (most recent call last):
        ...
    TypeError: a bytes-like object is required, not 'str'
    """
    # Make sure the supplied data is a bytes-like object
    if not isinstance(data, (bytes, bytearray, memoryview)):
        raise TypeError(
            f"a bytes-like object is required, not '{data.__class__.__name__}'"
        )

    # bytes() normalises bytearray/memoryview so that iteration yields ints
    binary_stream = "".join(f"{byte:08b}" for byte in bytes(data))

    # Number of bits missing to reach a multiple of 6 (always 0, 2 or 4)
    filler_bits = -len(binary_stream) % 6
    binary_stream += "0" * filler_bits

    # One "=" per 2 appended bits
    padding = b"=" * (filler_bits // 2)

    # Encode every 6 binary digits to their corresponding Base64 character
    encoded = "".join(
        B64_CHARSET[int(binary_stream[index : index + 6], 2)]
        for index in range(0, len(binary_stream), 6)
    )
    return encoded.encode("ascii") + padding
| 63 | + |
| 64 | + |
def base64_decode(encoded_data: "str | bytes") -> bytes:
    """Decode Base64 ``encoded_data`` according to RFC 4648.

    This does the reverse operation of base64_encode. The encoded characters
    are turned back into a stream of binary digits (6 per character), the
    binary digits that were appended during encoding are dropped (2 for every
    "=" sign), which leaves a stream whose length is a multiple of 8, and
    finally every 8 bits are converted to a byte.

    Args:
        encoded_data: Base64 text as a string or as a bytes-like object
            (``bytes``, ``bytearray`` or ``memoryview``).

    Returns:
        The decoded bytes.

    Raises:
        TypeError: If ``encoded_data`` is neither a string nor bytes-like.
        ValueError: If ``encoded_data`` is bytes-like but cannot be decoded to
            text.
        AssertionError: If a character outside the Base64 alphabet is found,
            or if the length/padding is incorrect. These are raised explicitly
            (not through ``assert`` statements) so the validation is still
            performed when Python runs with ``-O``.

    >>> from base64 import b64decode
    >>> a = "VGhpcyBwdWxsIHJlcXVlc3QgaXMgcGFydCBvZiBIYWNrdG9iZXJmZXN0MjAh"
    >>> b = "aHR0cHM6Ly90b29scy5pZXRmLm9yZy9odG1sL3JmYzQ2NDg="
    >>> c = "QQ=="
    >>> base64_decode(a) == b64decode(a)
    True
    >>> base64_decode(b) == b64decode(b)
    True
    >>> base64_decode(c) == b64decode(c)
    True
    >>> base64_decode(b"QQ==")
    b'A'
    >>> base64_decode("abc")
    Traceback (most recent call last):
        ...
    AssertionError: Incorrect padding
    >>> base64_decode("ab*=")
    Traceback (most recent call last):
        ...
    AssertionError: Invalid base64 character(s) found.
    """
    # Make sure encoded_data is either a string or a bytes-like object
    if not isinstance(encoded_data, (str, bytes, bytearray, memoryview)):
        raise TypeError(
            "argument should be a bytes-like object or ASCII string, not "
            f"'{encoded_data.__class__.__name__}'"
        )

    # In case encoded_data is a bytes-like object, convert it to a string.
    # Text that is valid UTF-8 but not ASCII is rejected by the alphabet check
    # further down.
    if not isinstance(encoded_data, str):
        try:
            encoded_data = bytes(encoded_data).decode("utf-8")
        except UnicodeDecodeError as err:
            raise ValueError(
                "base64 encoded data should only contain ASCII characters"
            ) from err

    padding = encoded_data.count("=")

    # Everything before the last `padding` characters. If any "=" sign is not
    # at the very end, one of them lands in the payload and is rejected below.
    payload = encoded_data[:-padding] if padding else encoded_data

    # Map each Base64 character to its 6-bit value once, instead of scanning
    # the alphabet for every character.
    sextet_of = {char: value for value, char in enumerate(B64_CHARSET)}

    # Check if the encoded string contains non base64 characters
    if any(char not in sextet_of for char in payload):
        raise AssertionError("Invalid base64 character(s) found.")

    # Check the padding
    if len(encoded_data) % 4 != 0 or padding >= 3:
        raise AssertionError("Incorrect padding")

    binary_stream = "".join(f"{sextet_of[char]:06b}" for char in payload)
    if padding:
        # Drop the binary digits that were appended by the encoder
        binary_stream = binary_stream[: -padding * 2]

    return bytes(
        int(binary_stream[index : index + 8], 2)
        for index in range(0, len(binary_stream), 8)
    )
| 137 | + |
| 138 | + |
if __name__ == "__main__":
    # When executed as a script, run the doctest examples embedded in the
    # docstrings of this module.
    from doctest import testmod

    testmod()
0 commit comments