|
4 | 4 | This data structure is used to test membership in a set.
|
5 | 5 | Compared to Python's built-in set(), it is more space-efficient.
|
6 | 6 | In the following example, only 8 bits of memory will be used:
|
7 |
| ->>> b = Bloom(size=8) |
8 |
| ->>> "Titanic" in b |
| 7 | +>>> bloom = Bloom(size=8) |
| 8 | +>>> "Titanic" in bloom |
9 | 9 | False
|
10 | 10 |
|
11 | 11 | Initially the filter contains all zeros:
|
12 |
| ->>> b.bitstring |
| 12 | +>>> bloom.bitstring |
13 | 13 | '00000000'
|
14 | 14 |
|
15 | 15 | When an element is added, two bits are set to 1
|
16 | 16 | since there are 2 hash functions in this implementation:
|
17 |
| ->>> b.add("Titanic") |
18 |
| ->>> b.bitstring |
| 17 | +>>> bloom.add("Titanic") |
| 18 | +>>> bloom.bitstring |
19 | 19 | '01100000'
|
20 |
| ->>> "Titanic" in b |
| 20 | +>>> "Titanic" in bloom |
21 | 21 | True
|
22 | 22 |
|
23 | 23 | However, sometimes only one bit is set
|
24 | 24 | because both hash functions return the same value:
|
25 |
| ->>> b.add("Avatar") |
26 |
| ->>> b.format_hash("Avatar") |
| 25 | +>>> bloom.add("Avatar") |
| 26 | +>>> bloom.format_hash("Avatar") |
27 | 27 | '00000100'
|
28 |
| ->>> b.bitstring |
| 28 | +>>> bloom.bitstring |
29 | 29 | '01100100'
|
30 | 30 |
|
31 | 31 | Elements that were not added should return False ...
|
32 |
| ->>> "The Goodfather" in b |
| 32 | +>>> "The Goodfather" in bloom |
33 | 33 | False
|
34 |
| ->>> b.format_hash("The Goodfather") |
| 34 | +>>> bloom.format_hash("The Goodfather") |
35 | 35 | '00011000'
|
36 |
| ->>> "Interstellar" in b |
| 36 | +>>> "Interstellar" in bloom |
37 | 37 | False
|
38 |
| ->>> b.format_hash("Interstellar") |
| 38 | +>>> bloom.format_hash("Interstellar") |
39 | 39 | '00000011'
|
40 |
| ->>> "Parasite" in b |
| 40 | +>>> "Parasite" in bloom |
41 | 41 | False
|
42 |
| ->>> b.format_hash("Parasite") |
| 42 | +>>> bloom.format_hash("Parasite") |
43 | 43 | '00010010'
|
44 |
| ->>> "Pulp Fiction" in b |
| 44 | +>>> "Pulp Fiction" in bloom |
45 | 45 | False
|
46 |
| ->>> b.format_hash("Pulp Fiction") |
| 46 | +>>> bloom.format_hash("Pulp Fiction") |
47 | 47 | '10000100'
|
48 | 48 |
|
49 | 49 | but sometimes there are false positives:
|
50 |
| ->>> "Ratatouille" in b |
| 50 | +>>> "Ratatouille" in bloom |
51 | 51 | True
|
52 |
| ->>> b.format_hash("Ratatouille") |
| 52 | +>>> bloom.format_hash("Ratatouille") |
53 | 53 | '01100000'
|
54 | 54 |
|
55 | 55 | The probability of a false positive increases with the number of added elements:
|
56 |
| ->>> b.estimated_error_rate() |
| 56 | +>>> bloom.estimated_error_rate() |
57 | 57 | 0.140625
|
58 |
| ->>> b.add("The Goodfather") |
59 |
| ->>> b.estimated_error_rate() |
| 58 | +>>> bloom.add("The Goodfather") |
| 59 | +>>> bloom.estimated_error_rate() |
60 | 60 | 0.390625
|
61 |
| ->>> b.bitstring |
| 61 | +>>> bloom.bitstring |
62 | 62 | '01111100'
|
63 | 63 | """
|
64 | 64 | from hashlib import md5, sha256
|
|
0 commit comments