1
1
"""
2
2
See https://en.wikipedia.org/wiki/Bloom_filter
3
3
4
- >>> b = Bloom()
4
+ The use of this data structure is to test membership in a set.
5
+ Compared to Python's built-in set() it is more space-efficient.
6
+ In the following example, only 8 bits of memory will be used:
7
+ >>> b = Bloom(size=8)
8
+ >>> "Titanic" in b
9
+ False
10
+
11
+ Initially the filter contains all zeros:
12
+ >>> b.bitstring
13
+ '00000000'
14
+
15
+ When an element is added, two bits are set to 1
16
+ since there are 2 hash functions:
5
17
>>> b.add("Titanic")
6
- >>> b.add("Avatar")
7
- >>> b.exists("Titanic")
8
- True
9
- >>> b.exists("Avatar")
18
+ >>> b.bitstring
19
+ '01100000'
20
+ >>> "Titanic" in b
10
21
True
11
- >>> b.exists("The Goodfather")
22
+
23
+ However, sometimes only one bit is added
24
+ because both hash functions return the same value
25
+ >>> b.add("Avatar")
26
+ >>> b.format_hash("Avatar")
27
+ '00000100'
28
+ >>> b.bitstring
29
+ '01100100'
30
+
31
+ Not added elements should return False ...
32
+ >>> "The Goodfather" in b
12
33
False
13
- >>> b.exists("Interstellar")
34
+ >>> b.format_hash("The Goodfather")
35
+ '00011000'
36
+ >>> "Interstellar" in b
14
37
False
15
- >>> b.exists( "Parasite")
38
+ >>> "Parasite" in b
16
39
False
17
- >>> b.exists( "Pulp Fiction")
40
+ >>> "Pulp Fiction" in b
18
41
False
42
+
43
+ but sometimes there are false positives:
44
+ >>> "Ratatouille" in b
45
+ True
46
+ >>> b.format_hash("Ratatouille")
47
+ '01100000'
48
+
49
+ >>> b.estimated_error_rate()
50
+ 0.140625
19
51
"""
20
52
from hashlib import md5 , sha256
21
53
from random import choices
22
54
from string import ascii_lowercase
23
55
56
# The number of hash functions is fixed; each one contributes one bit position
# per value.
HASH_FUNCTIONS = (sha256, md5)


class Bloom:
    """Bloom filter whose bits are packed into a single Python ``int``.

    Membership tests can yield false positives but never false negatives:
    a value that was added always tests as present, while a value that was
    never added may test as present if its bit positions happen to be set
    by other values.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, size: int = 8) -> None:
        """Create an empty filter with *size* bits.

        Raises:
            ValueError: if *size* is not positive. A zero-sized filter would
                otherwise fail later with ``ZeroDivisionError`` when hashing
                or estimating the error rate.
        """
        if size <= 0:
            raise ValueError("size must be a positive number of bits")
        # Integer used as a bit field; bit ``i`` is ``(bitarray >> i) & 1``.
        self.bitarray = 0b0
        # Number of addressable bits in the filter.
        self.size = size

    def add(self, value: str) -> None:
        """Set the bits selected by the hash functions for *value*."""
        self.bitarray |= self.hash_(value)

    def exists(self, value: str) -> bool:
        """Return True if every bit selected for *value* is already set."""
        mask = self.hash_(value)
        return (mask & self.bitarray) == mask

    def __contains__(self, other: str) -> bool:
        """Support ``value in bloom`` by delegating to :meth:`exists`."""
        return self.exists(other)

    def format_bin(self, bitarray: int) -> str:
        """Return *bitarray* as binary digits, left-padded with zeros to ``size``."""
        return bin(bitarray)[2:].zfill(self.size)

    @property
    def bitstring(self) -> str:
        """Binary-digit rendering of the filter's current bits."""
        return self.format_bin(self.bitarray)

    def hash_(self, value: str) -> int:
        """Return a bit mask with one bit set per hash function for *value*.

        Each digest is read as a little-endian integer and reduced modulo
        ``size`` to pick a bit position. Two hash functions may select the
        same position, in which case the mask has fewer set bits than there
        are hash functions.
        """
        mask = 0b0
        for func in HASH_FUNCTIONS:
            digest = func(value.encode()).digest()
            position = int.from_bytes(digest, "little") % self.size
            mask |= 1 << position
        return mask

    def format_hash(self, value: str) -> str:
        """Return the hash mask of *value* as a zero-padded binary string."""
        return self.format_bin(self.hash_(value))

    def estimated_error_rate(self) -> float:
        """Estimate the false-positive rate from the current fill level.

        Computed as ``(set_bits / size) ** k`` where ``k`` is the number of
        hash functions.
        """
        n_ones = bin(self.bitarray).count("1")
        k = len(HASH_FUNCTIONS)
        return (n_ones / self.size) ** k
71
99
72
100
def random_string(size: int) -> str:
    """Return a random string of *size* characters.

    Characters are drawn independently (with replacement) from the lowercase
    ASCII letters plus the space character.
    """
    alphabet = ascii_lowercase + " "
    return "".join(choices(alphabet, k=size))
@@ -76,7 +104,7 @@ def random_string(size: int) -> str:
76
104
def test_probability (filter_bits : int = 64 , added_elements : int = 20 ) -> None :
77
105
b = Bloom (size = filter_bits )
78
106
79
- k = len (b . HASH_FUNCTIONS )
107
+ k = len (HASH_FUNCTIONS )
80
108
estimated_error_rate_beforehand = (
81
109
1 - (1 - 1 / filter_bits ) ** (k * added_elements )
82
110
) ** k
@@ -85,7 +113,7 @@ def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None:
85
113
for _ in range (added_elements ):
86
114
b .add (not_added .pop ())
87
115
88
- n_ones = bin (b .bitstring ).count ("1" )
116
+ n_ones = bin (b .bitarray ).count ("1" )
89
117
estimated_error_rate = (n_ones / filter_bits ) ** k
90
118
91
119
errors = 0
0 commit comments