Skip to content

Commit 8b1bec0

Browse files
committed
commented doctest
1 parent 78d19fd commit 8b1bec0

File tree

1 file changed

+66
-38
lines changed

1 file changed

+66
-38
lines changed

data_structures/hashing/bloom_filter.py

+66-38
Original file line numberDiff line numberDiff line change
@@ -1,73 +1,101 @@
11
"""
22
See https://en.wikipedia.org/wiki/Bloom_filter
33
4-
>>> b = Bloom()
4+
This data structure is used to test membership in a set.
5+
Compared to Python's built-in set() it is more space-efficient.
6+
In the following example, only 8 bits of memory will be used:
7+
>>> b = Bloom(size=8)
8+
>>> "Titanic" in b
9+
False
10+
11+
Initially the filter contains all zeros:
12+
>>> b.bitstring
13+
'00000000'
14+
15+
When an element is added, two bits are set to 1
16+
since there are 2 hash functions:
517
>>> b.add("Titanic")
6-
>>> b.add("Avatar")
7-
>>> b.exists("Titanic")
8-
True
9-
>>> b.exists("Avatar")
18+
>>> b.bitstring
19+
'01100000'
20+
>>> "Titanic" in b
1021
True
11-
>>> b.exists("The Goodfather")
22+
23+
However, sometimes only one bit is set
24+
because both hash functions map the value to the same bit position:
25+
>>> b.add("Avatar")
26+
>>> b.format_hash("Avatar")
27+
'00000100'
28+
>>> b.bitstring
29+
'01100100'
30+
31+
Elements that were not added should return False ...
32+
>>> "The Goodfather" in b
1233
False
13-
>>> b.exists("Interstellar")
34+
>>> b.format_hash("The Goodfather")
35+
'00011000'
36+
>>> "Interstellar" in b
1437
False
15-
>>> b.exists("Parasite")
38+
>>> "Parasite" in b
1639
False
17-
>>> b.exists("Pulp Fiction")
40+
>>> "Pulp Fiction" in b
1841
False
42+
43+
but sometimes there are false positives:
44+
>>> "Ratatouille" in b
45+
True
46+
>>> b.format_hash("Ratatouille")
47+
'01100000'
48+
49+
>>> b.estimated_error_rate()
50+
0.140625
1951
"""
2052
from hashlib import md5, sha256
2153
from random import choices
2254
from string import ascii_lowercase
2355

56+
HASH_FUNCTIONS = (sha256, md5)
2457

25-
class Bloom:
26-
# number of hash functions is fixed
27-
HASH_FUNCTIONS = (sha256, md5)
2858

59+
class Bloom:
2960
def __init__(self, size: int = 8) -> None:
30-
self.bitstring = 0b0
61+
self.bitarray = 0b0
3162
self.size = size
3263

3364
def add(self, value: str) -> None:
3465
h = self.hash_(value)
35-
self.bitstring |= h
36-
37-
# print(
38-
# f"""\
39-
# [add] value = {value}
40-
# hash = {self.format_bin(h)}
41-
# filter = {self.format_bin(self.bitstring)}
42-
# """
43-
# )
66+
self.bitarray |= h
4467

4568
def exists(self, value: str) -> bool:
4669
h = self.hash_(value)
47-
res = (h & self.bitstring) == h
48-
49-
# print(
50-
# f"""\
51-
# [exists] value = {value}
52-
# hash = {self.format_bin(h)}
53-
# filter = {self.format_bin(self.bitstring)}
54-
# res = {res}
55-
# """
56-
# )
57-
return res
70+
return (h & self.bitarray) == h
5871

59-
def format_bin(self, value: int) -> str:
60-
res = bin(value)[2:]
72+
def __contains__(self, other):
73+
return self.exists(other)
74+
75+
def format_bin(self, bitarray: int) -> str:
76+
res = bin(bitarray)[2:]
6177
return res.zfill(self.size)
6278

79+
@property
80+
def bitstring(self):
81+
return self.format_bin(self.bitarray)
82+
6383
def hash_(self, value: str) -> int:
6484
res = 0b0
65-
for func in self.HASH_FUNCTIONS:
85+
for func in HASH_FUNCTIONS:
6686
b = func(value.encode()).digest()
6787
position = int.from_bytes(b, "little") % self.size
6888
res |= 2**position
6989
return res
7090

91+
def format_hash(self, value: str) -> str:
92+
return self.format_bin(self.hash_(value))
93+
94+
def estimated_error_rate(self):
95+
n_ones = bin(self.bitarray).count("1")
96+
k = len(HASH_FUNCTIONS)
97+
return (n_ones / self.size) ** k
98+
7199

72100
def random_string(size: int) -> str:
73101
return "".join(choices(ascii_lowercase + " ", k=size))
@@ -76,7 +104,7 @@ def random_string(size: int) -> str:
76104
def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None:
77105
b = Bloom(size=filter_bits)
78106

79-
k = len(b.HASH_FUNCTIONS)
107+
k = len(HASH_FUNCTIONS)
80108
estimated_error_rate_beforehand = (
81109
1 - (1 - 1 / filter_bits) ** (k * added_elements)
82110
) ** k
@@ -85,7 +113,7 @@ def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None:
85113
for _ in range(added_elements):
86114
b.add(not_added.pop())
87115

88-
n_ones = bin(b.bitstring).count("1")
116+
n_ones = bin(b.bitarray).count("1")
89117
estimated_error_rate = (n_ones / filter_bits) ** k
90118

91119
errors = 0

0 commit comments

Comments
 (0)