Skip to content

Commit ab71663

Browse files
hfz1337 and cclauss authored
Replace base64_cipher.py with an easy to understand version (TheAlgorithms#3925)
* rename base64_cipher.py to base64_encoding.py
* edit base64_encoding.py
* import necessary modules inside doctests
* make it behave like the official implementation
* replace format with f-string where possible
* replace format with f-string
  Co-authored-by: Christian Clauss <[email protected]>
* fix: syntax error due to closing parenthesis
* reformat code
  Co-authored-by: Christian Clauss <[email protected]>
1 parent 5ef46cb commit ab71663

File tree

2 files changed

+142
-89
lines changed

2 files changed

+142
-89
lines changed

Diff for: ciphers/base64_cipher.py

-89
This file was deleted.

Diff for: ciphers/base64_encoding.py

+142
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
# Base64 alphabet used by the encoder and decoder below: the position of a
# character in this string is the 6-bit value that the character stands for.
B64_CHARSET = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"  # values 0-25
    "abcdefghijklmnopqrstuvwxyz"  # values 26-51
    "0123456789"  # values 52-61
    "+/"  # values 62-63
)
2+
3+
4+
def base64_encode(data: bytes) -> bytes:
    """Encode a bytes-like object as Base64 according to RFC 4648.

    The input is written out as a string of binary digits (8 per byte) and
    right-padded with binary digits until its length is a multiple of 6.  Each
    group of 6 digits is then used as an index into the B64_CHARSET string.
    For every 2 binary digits that had to be appended, one "=" sign is added
    to the output as padding.

    The value of the appended digits does not affect the decoded result, for
    instance:
        b"AA" -> 0100000101000001 -> 010000 010100 0001
    The last group is 2 digits short, so there are 4 possible fillers: 00, 01,
    10 or 11.  That being said, Base64 encoding can be used in steganography
    to hide data in these appended digits.  This implementation always appends
    zeros, which makes its output identical to ``base64.b64encode``.

    :param data: the bytes-like object (bytes, bytearray or memoryview) to encode
    :return: the Base64 representation of ``data`` as bytes, "=" padded
    :raises TypeError: if ``data`` is not a bytes-like object

    >>> from base64 import b64encode
    >>> a = b"This pull request is part of Hacktoberfest20!"
    >>> b = b"https://tools.ietf.org/html/rfc4648"
    >>> c = b"A"
    >>> base64_encode(a) == b64encode(a)
    True
    >>> base64_encode(b) == b64encode(b)
    True
    >>> base64_encode(c) == b64encode(c)
    True
    >>> base64_encode(b"AA")
    b'QUE='
    >>> base64_encode(bytearray(b"A")) == b64encode(bytearray(b"A"))
    True
    >>> base64_encode(b"")
    b''
    >>> base64_encode("abc")
    Traceback (most recent call last):
        ...
    TypeError: a bytes-like object is required, not 'str'
    """
    # Accept the same bytes-like inputs that the error message advertises
    if not isinstance(data, (bytes, bytearray, memoryview)):
        raise TypeError(
            f"a bytes-like object is required, not '{data.__class__.__name__}'"
        )

    # bytes() normalises bytearray/memoryview so that iteration yields ints
    binary_stream = "".join(f"{byte:08b}" for byte in bytes(data))

    # Number of digits missing to reach a multiple of 6; since the stream length
    # is a multiple of 8 this is always 0, 2 or 4.
    missing_digits = -len(binary_stream) % 6

    # Append arbitrary binary digits (0's by default) and derive the padding:
    # one "=" for every 2 appended digits.
    binary_stream += "0" * missing_digits
    padding = b"=" * (missing_digits // 2)

    # Encode every 6 binary digits to their corresponding Base64 character
    encoded = "".join(
        B64_CHARSET[int(binary_stream[index : index + 6], 2)]
        for index in range(0, len(binary_stream), 6)
    )
    return encoded.encode() + padding
63+
64+
65+
def base64_decode(encoded_data: "str | bytes") -> bytes:
    """Decode Base64 data according to RFC 4648.

    This does the reverse operation of base64_encode.
    The encoded characters are first transformed back to a stream of binary
    digits (6 per character), then the digits that were appended while encoding
    are taken off according to the padding (2 digits per "=" sign).  At this
    point the stream length is a multiple of 8 and every 8 digits are converted
    to one byte.

    :param encoded_data: the Base64 text, as a str or as bytes
    :return: the decoded bytes
    :raises TypeError: if ``encoded_data`` is neither str nor bytes
    :raises ValueError: if ``encoded_data`` is bytes that cannot be decoded to text
    :raises AssertionError: if a character outside B64_CHARSET is found, or if
        the length/padding is inconsistent

    >>> from base64 import b64decode
    >>> a = "VGhpcyBwdWxsIHJlcXVlc3QgaXMgcGFydCBvZiBIYWNrdG9iZXJmZXN0MjAh"
    >>> b = "aHR0cHM6Ly90b29scy5pZXRmLm9yZy9odG1sL3JmYzQ2NDg="
    >>> c = "QQ=="
    >>> base64_decode(a) == b64decode(a)
    True
    >>> base64_decode(b) == b64decode(b)
    True
    >>> base64_decode(c) == b64decode(c)
    True
    >>> base64_decode(b"QQ==")
    b'A'
    >>> base64_decode("")
    b''
    >>> base64_decode("abc")
    Traceback (most recent call last):
        ...
    AssertionError: Incorrect padding
    >>> base64_decode("ab$=")
    Traceback (most recent call last):
        ...
    AssertionError: Invalid base64 character(s) found.
    """
    # Make sure encoded_data is either a string or a bytes object
    if not isinstance(encoded_data, (bytes, str)):
        raise TypeError(
            "argument should be a bytes-like object or ASCII string, not "
            f"'{encoded_data.__class__.__name__}'"
        )

    # In case encoded_data is a bytes object, convert it to a string so the
    # rest of the function only deals with one type
    if isinstance(encoded_data, bytes):
        try:
            encoded_data = encoded_data.decode("utf-8")
        except UnicodeDecodeError as error:
            raise ValueError(
                "base64 encoded data should only contain ASCII characters"
            ) from error

    padding = encoded_data.count("=")

    # Everything before the trailing "=" signs.  A "=" located anywhere else
    # stays in the payload and is rejected by the character check below.
    payload = encoded_data[:-padding] if padding else encoded_data

    # NOTE: AssertionError is raised explicitly instead of using `assert` so
    # that the validation still runs under `python -O`, while callers that
    # catch AssertionError keep working.
    if not all(char in B64_CHARSET for char in payload):
        raise AssertionError("Invalid base64 character(s) found.")

    # Check the padding
    if len(encoded_data) % 4 != 0 or padding >= 3:
        raise AssertionError("Incorrect padding")

    binary_stream = "".join(f"{B64_CHARSET.index(char):06b}" for char in payload)

    if padding:
        # Drop the binary digits appended by the encoder: 2 per "=" sign
        binary_stream = binary_stream[: -padding * 2]

    # Convert every 8 binary digits back to a byte
    return bytes(
        int(binary_stream[index : index + 8], 2)
        for index in range(0, len(binary_stream), 8)
    )
137+
138+
139+
# Run the doctests embedded in this module when it is executed as a script.
if __name__ == "__main__":
    from doctest import testmod

    testmod()

0 commit comments

Comments
 (0)