From 173ab0ea96b4969b51f4d23f033a45242fe7e80a Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Thu, 6 Apr 2023 10:59:59 +0200 Subject: [PATCH 01/32] Bloom filter with tests --- data_structures/hashing/bloom_filter.py | 103 ++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 data_structures/hashing/bloom_filter.py diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py new file mode 100644 index 000000000000..c34dd4eeac6b --- /dev/null +++ b/data_structures/hashing/bloom_filter.py @@ -0,0 +1,103 @@ +""" +See https://en.wikipedia.org/wiki/Bloom_filter +""" +from hashlib import sha256, md5 +from random import randint, choices +import string + + +class Bloom: + def __init__(self, size=8): + self.bitstring = 0b0 + self.size = size + + def add(self, value): + h = self.hash(value) + self.bitstring |= h + print( + f"""\ +[add] value = {value} + hash = {self.format_bin(h)} + filter = {self.format_bin(self.bitstring)} +""" + ) + + def exists(self, value): + h = self.hash(value) + res = (h & self.bitstring) == h + + print( + f"""\ +[exists] value = {value} + hash = {self.format_bin(h)} + filter = {self.format_bin(self.bitstring)} + res = {res} +""" + ) + return res + + def format_bin(self, value): + res = bin(value)[2:] + return res.zfill(self.size) + + def hash(self, value): + res = 0b0 + for func in (sha256, md5): + b = func(value.encode()).digest() + position = int.from_bytes(b, "little") % self.size + res |= 2**position + return res + + +def test_movies(): + b = Bloom() + b.add("titanic") + b.add("avatar") + + assert b.exists("titanic") + assert b.exists("avatar") + + assert b.exists("the goodfather") in (True, False) + assert b.exists("interstellar") in (True, False) + assert b.exists("Parasite") in (True, False) + assert b.exists("Pulp fiction") in (True, False) + + +def random_string(size): + return "".join(choices(string.ascii_lowercase + " ", k=size)) + + +def test_probability(m=64, n=20): + b = 
Bloom(size=m) + + added = {random_string(10) for i in range(n)} + for a in added: + b.add(a) + + # number of hash functions is fixed + k = 2 + + n_ones = bin(b.bitstring).count("1") + expected_probability = (n_ones / m) ** k + + expected_probability_wikipedia = (1 - (1 - 1 / m) ** (k * n)) ** k + + not_added = {random_string(10) for i in range(1000)} + fails = 0 + for string in not_added: + if b.exists(string): + fails += 1 + fail_rate = fails / len(not_added) + + print(f"total = {len(not_added)}, fails = {fails}, fail_rate = {fail_rate}") + print(f"{expected_probability=}") + print(f"{expected_probability_wikipedia=}") + + assert ( + abs(expected_probability - fail_rate) <= 0.05 + ) # 5% margin calculated experiementally + + +if __name__ == "__main__": + test_movies() + test_probability() From 08bc970b7ad3d80a4c66828e95fcb0e186d7c4d8 Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Thu, 6 Apr 2023 11:20:41 +0200 Subject: [PATCH 02/32] has functions constant --- data_structures/hashing/bloom_filter.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index c34dd4eeac6b..fa3581e95a8f 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -7,6 +7,9 @@ class Bloom: + # number of hash functions is fixed + HASH_FUNCTIONS = (sha256, md5) + def __init__(self, size=8): self.bitstring = 0b0 self.size = size @@ -42,7 +45,7 @@ def format_bin(self, value): def hash(self, value): res = 0b0 - for func in (sha256, md5): + for func in HASH_FUNCTIONS: b = func(value.encode()).digest() position = int.from_bytes(b, "little") % self.size res |= 2**position @@ -74,8 +77,7 @@ def test_probability(m=64, n=20): for a in added: b.add(a) - # number of hash functions is fixed - k = 2 + k = len(b.HASH_FUNCIONS) n_ones = bin(b.bitstring).count("1") expected_probability = (n_ones / m) ** k @@ -95,7 +97,7 @@ def test_probability(m=64, n=20): 
assert ( abs(expected_probability - fail_rate) <= 0.05 - ) # 5% margin calculated experiementally + ) # 5% absolute margin calculated experiementally if __name__ == "__main__": From 044810991ada1bb404180783c7721a3dc8f5209e Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Thu, 6 Apr 2023 11:21:52 +0200 Subject: [PATCH 03/32] fix type --- data_structures/hashing/bloom_filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index fa3581e95a8f..c454c16ff16e 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -45,7 +45,7 @@ def format_bin(self, value): def hash(self, value): res = 0b0 - for func in HASH_FUNCTIONS: + for func in self.HASH_FUNCTIONS: b = func(value.encode()).digest() position = int.from_bytes(b, "little") % self.size res |= 2**position @@ -77,7 +77,7 @@ def test_probability(m=64, n=20): for a in added: b.add(a) - k = len(b.HASH_FUNCIONS) + k = len(b.HASH_FUNCTIONS) n_ones = bin(b.bitstring).count("1") expected_probability = (n_ones / m) ** k From 486dcbc04c89312c56c78d6a5af687e9b4e48494 Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Thu, 6 Apr 2023 13:42:18 +0200 Subject: [PATCH 04/32] isort --- data_structures/hashing/bloom_filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index c454c16ff16e..74db6e2a1120 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -1,9 +1,9 @@ """ See https://en.wikipedia.org/wiki/Bloom_filter """ -from hashlib import sha256, md5 -from random import randint, choices import string +from hashlib import md5, sha256 +from random import choices class Bloom: From 4111807c4708c08fb56408b7e87d9473debfe824 Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Thu, 6 Apr 2023 13:46:21 +0200 Subject: [PATCH 05/32] passing ruff 
--- data_structures/hashing/bloom_filter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 74db6e2a1120..c39c654337cb 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -1,9 +1,9 @@ """ See https://en.wikipedia.org/wiki/Bloom_filter """ -import string from hashlib import md5, sha256 from random import choices +from string import ascii_lowercase class Bloom: @@ -15,7 +15,7 @@ def __init__(self, size=8): self.size = size def add(self, value): - h = self.hash(value) + h = self.hash_(value) self.bitstring |= h print( f"""\ @@ -26,7 +26,7 @@ def add(self, value): ) def exists(self, value): - h = self.hash(value) + h = self.hash_(value) res = (h & self.bitstring) == h print( @@ -43,7 +43,7 @@ def format_bin(self, value): res = bin(value)[2:] return res.zfill(self.size) - def hash(self, value): + def hash_(self, value): res = 0b0 for func in self.HASH_FUNCTIONS: b = func(value.encode()).digest() @@ -67,7 +67,7 @@ def test_movies(): def random_string(size): - return "".join(choices(string.ascii_lowercase + " ", k=size)) + return "".join(choices(ascii_lowercase + " ", k=size)) def test_probability(m=64, n=20): From e6ce09836d66173795cc0b61437f094a2025b201 Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Thu, 6 Apr 2023 13:59:37 +0200 Subject: [PATCH 06/32] type hints --- data_structures/hashing/bloom_filter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index c39c654337cb..d463142cd619 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -14,7 +14,7 @@ def __init__(self, size=8): self.bitstring = 0b0 self.size = size - def add(self, value): + def add(self, value: str): h = self.hash_(value) self.bitstring |= h print( @@ -25,7 +25,7 @@ def add(self, 
value): """ ) - def exists(self, value): + def exists(self, value: str)-> bool: h = self.hash_(value) res = (h & self.bitstring) == h @@ -39,11 +39,11 @@ def exists(self, value): ) return res - def format_bin(self, value): + def format_bin(self, value: int) -> str: res = bin(value)[2:] return res.zfill(self.size) - def hash_(self, value): + def hash_(self, value: str) -> int: res = 0b0 for func in self.HASH_FUNCTIONS: b = func(value.encode()).digest() From e4d39db77ecde7f65489926b58239ded04624c73 Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Thu, 6 Apr 2023 13:59:52 +0200 Subject: [PATCH 07/32] type hints --- data_structures/hashing/bloom_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index d463142cd619..edcb9abdb789 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -25,7 +25,7 @@ def add(self, value: str): """ ) - def exists(self, value: str)-> bool: + def exists(self, value: str) -> bool: h = self.hash_(value) res = (h & self.bitstring) == h From 7629686cc7a18073b21092698a1008732981d00d Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Thu, 6 Apr 2023 14:07:01 +0200 Subject: [PATCH 08/32] from fail to erro --- data_structures/hashing/bloom_filter.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index edcb9abdb789..e2cc224fbbc2 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -73,30 +73,29 @@ def random_string(size): def test_probability(m=64, n=20): b = Bloom(size=m) + k = len(b.HASH_FUNCTIONS) + estimated_error_rate_beforehand = (1 - (1 - 1 / m) ** (k * n)) ** k + added = {random_string(10) for i in range(n)} for a in added: b.add(a) - k = len(b.HASH_FUNCTIONS) - n_ones = bin(b.bitstring).count("1") - 
expected_probability = (n_ones / m) ** k - - expected_probability_wikipedia = (1 - (1 - 1 / m) ** (k * n)) ** k + estimated_error_rate = (n_ones / m) ** k not_added = {random_string(10) for i in range(1000)} - fails = 0 + errors = 0 for string in not_added: if b.exists(string): - fails += 1 - fail_rate = fails / len(not_added) + errors += 1 + error_rate = errors / len(not_added) - print(f"total = {len(not_added)}, fails = {fails}, fail_rate = {fail_rate}") - print(f"{expected_probability=}") - print(f"{expected_probability_wikipedia=}") + print(f"total = {len(not_added)}, errors = {errors}, error_rate = {error_rate}") + print(f"{estimated_error_rate=}") + print(f"{estimated_error_rate_beforehand=}") assert ( - abs(expected_probability - fail_rate) <= 0.05 + abs(estimated_error_rate - error_rate) <= 0.05 ) # 5% absolute margin calculated experiementally From 392616713d17ea555d4425c7d35678402f0c39ae Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Thu, 6 Apr 2023 14:08:04 +0200 Subject: [PATCH 09/32] captital leter --- data_structures/hashing/bloom_filter.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index e2cc224fbbc2..1f08ddaf257a 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -54,16 +54,16 @@ def hash_(self, value: str) -> int: def test_movies(): b = Bloom() - b.add("titanic") - b.add("avatar") + b.add("Titanic") + b.add("Avatar") - assert b.exists("titanic") - assert b.exists("avatar") + assert b.exists("Titanic") + assert b.exists("Avatar") - assert b.exists("the goodfather") in (True, False) - assert b.exists("interstellar") in (True, False) + assert b.exists("The Goodfather") in (True, False) + assert b.exists("Interstellar") in (True, False) assert b.exists("Parasite") in (True, False) - assert b.exists("Pulp fiction") in (True, False) + assert b.exists("Pulp Fiction") in (True, False) 
def random_string(size): From 280ffa0564b21ad1e6d8240dcfca1aa72ad72394 Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Thu, 6 Apr 2023 14:17:32 +0200 Subject: [PATCH 10/32] type hints requested by boot --- data_structures/hashing/bloom_filter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 1f08ddaf257a..029d24a3e6d5 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -10,7 +10,7 @@ class Bloom: # number of hash functions is fixed HASH_FUNCTIONS = (sha256, md5) - def __init__(self, size=8): + def __init__(self, size: int = 8) -> None: self.bitstring = 0b0 self.size = size @@ -52,7 +52,7 @@ def hash_(self, value: str) -> int: return res -def test_movies(): +def test_movies() -> None: b = Bloom() b.add("Titanic") b.add("Avatar") @@ -66,11 +66,11 @@ def test_movies(): assert b.exists("Pulp Fiction") in (True, False) -def random_string(size): +def random_string(size: int) -> str: return "".join(choices(ascii_lowercase + " ", k=size)) -def test_probability(m=64, n=20): +def test_probability(m: int = 64, n: int = 20) -> None: b = Bloom(size=m) k = len(b.HASH_FUNCTIONS) From 5d460aa79e48fa11444aff0e196e3121b548f861 Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Thu, 6 Apr 2023 14:31:19 +0200 Subject: [PATCH 11/32] descriptive name for m --- data_structures/hashing/bloom_filter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 029d24a3e6d5..1c13b382e436 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -14,7 +14,7 @@ def __init__(self, size: int = 8) -> None: self.bitstring = 0b0 self.size = size - def add(self, value: str): + def add(self, value: str) -> None: h = self.hash_(value) self.bitstring |= h print( @@ -70,18 +70,18 @@ def 
random_string(size: int) -> str: return "".join(choices(ascii_lowercase + " ", k=size)) -def test_probability(m: int = 64, n: int = 20) -> None: - b = Bloom(size=m) +def test_probability(bits: int = 64, n: int = 20) -> None: + b = Bloom(size=bits) k = len(b.HASH_FUNCTIONS) - estimated_error_rate_beforehand = (1 - (1 - 1 / m) ** (k * n)) ** k + estimated_error_rate_beforehand = (1 - (1 - 1 / bits) ** (k * n)) ** k added = {random_string(10) for i in range(n)} for a in added: b.add(a) n_ones = bin(b.bitstring).count("1") - estimated_error_rate = (n_ones / m) ** k + estimated_error_rate = (n_ones / bits) ** k not_added = {random_string(10) for i in range(1000)} errors = 0 From cc54095c47bb76d67b4e00a5f36efb7deccf37bf Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Thu, 6 Apr 2023 17:26:25 +0200 Subject: [PATCH 12/32] more descriptibe arguments II --- data_structures/hashing/bloom_filter.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 1c13b382e436..32e98dcebe64 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -70,27 +70,28 @@ def random_string(size: int) -> str: return "".join(choices(ascii_lowercase + " ", k=size)) -def test_probability(bits: int = 64, n: int = 20) -> None: - b = Bloom(size=bits) +def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None: + b = Bloom(size=filter_bits) k = len(b.HASH_FUNCTIONS) - estimated_error_rate_beforehand = (1 - (1 - 1 / bits) ** (k * n)) ** k + estimated_error_rate_beforehand = ( + 1 - (1 - 1 / filter_bits) ** (k * added_elements) + ) ** k - added = {random_string(10) for i in range(n)} - for a in added: - b.add(a) + not_added = {random_string(10) for i in range(1000)} + for _ in range(added_elements): + b.add(not_added.pop()) n_ones = bin(b.bitstring).count("1") - estimated_error_rate = (n_ones / bits) ** k + 
estimated_error_rate = (n_ones / filter_bits) ** k - not_added = {random_string(10) for i in range(1000)} errors = 0 for string in not_added: if b.exists(string): errors += 1 error_rate = errors / len(not_added) - print(f"total = {len(not_added)}, errors = {errors}, error_rate = {error_rate}") + print(f"error_rate = {errors}/{len(not_added)} = {error_rate}") print(f"{estimated_error_rate=}") print(f"{estimated_error_rate_beforehand=}") From 78d19fd19b18b33ea87e9c9a4a61305524261a31 Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Fri, 7 Apr 2023 16:01:35 +0200 Subject: [PATCH 13/32] moved movies_test to doctest --- data_structures/hashing/bloom_filter.py | 62 +++++++++++++------------ 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 32e98dcebe64..02e01713b628 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -1,5 +1,21 @@ """ See https://en.wikipedia.org/wiki/Bloom_filter + +>>> b = Bloom() +>>> b.add("Titanic") +>>> b.add("Avatar") +>>> b.exists("Titanic") +True +>>> b.exists("Avatar") +True +>>> b.exists("The Goodfather") +False +>>> b.exists("Interstellar") +False +>>> b.exists("Parasite") +False +>>> b.exists("Pulp Fiction") +False """ from hashlib import md5, sha256 from random import choices @@ -17,26 +33,27 @@ def __init__(self, size: int = 8) -> None: def add(self, value: str) -> None: h = self.hash_(value) self.bitstring |= h - print( - f"""\ -[add] value = {value} - hash = {self.format_bin(h)} - filter = {self.format_bin(self.bitstring)} -""" - ) + + # print( + # f"""\ + # [add] value = {value} + # hash = {self.format_bin(h)} + # filter = {self.format_bin(self.bitstring)} + # """ + # ) def exists(self, value: str) -> bool: h = self.hash_(value) res = (h & self.bitstring) == h - print( - f"""\ -[exists] value = {value} - hash = {self.format_bin(h)} - filter = {self.format_bin(self.bitstring)} - res = 
{res} -""" - ) + # print( + # f"""\ + # [exists] value = {value} + # hash = {self.format_bin(h)} + # filter = {self.format_bin(self.bitstring)} + # res = {res} + # """ + # ) return res def format_bin(self, value: int) -> str: @@ -52,20 +69,6 @@ def hash_(self, value: str) -> int: return res -def test_movies() -> None: - b = Bloom() - b.add("Titanic") - b.add("Avatar") - - assert b.exists("Titanic") - assert b.exists("Avatar") - - assert b.exists("The Goodfather") in (True, False) - assert b.exists("Interstellar") in (True, False) - assert b.exists("Parasite") in (True, False) - assert b.exists("Pulp Fiction") in (True, False) - - def random_string(size: int) -> str: return "".join(choices(ascii_lowercase + " ", k=size)) @@ -101,5 +104,4 @@ def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None: if __name__ == "__main__": - test_movies() test_probability() From 8b1bec0dc6d5474d749210a70c0221f6527b2258 Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Fri, 7 Apr 2023 17:21:38 +0200 Subject: [PATCH 14/32] commented doctest --- data_structures/hashing/bloom_filter.py | 104 +++++++++++++++--------- 1 file changed, 66 insertions(+), 38 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 02e01713b628..9f6048911a9c 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -1,73 +1,101 @@ """ See https://en.wikipedia.org/wiki/Bloom_filter ->>> b = Bloom() +The use of this data structure is to test membership in a set. +Compared to python built-in set() it is more space-efficent. 
+In the following example, only 8 bits of memory will be used: +>>> b = Bloom(size=8) +>>> "Titanic" in b +False + +Initially the filter contains all zeros: +>>> b.bitstring +'00000000' + +When an element is added, two bits are set to 1 +since there are 2 hash functions: >>> b.add("Titanic") ->>> b.add("Avatar") ->>> b.exists("Titanic") -True ->>> b.exists("Avatar") +>>> b.bitstring +'01100000' +>>> "Titanic" in b True ->>> b.exists("The Goodfather") + +However, sometimes only one bit is added +because both hash functions return the same value +>>> b.add("Avatar") +>>> b.format_hash("Avatar") +'00000100' +>>> b.bitstring +'01100100' + +Not added elements should return False ... +>>> "The Goodfather" in b False ->>> b.exists("Interstellar") +>>> b.format_hash("The Goodfather") +'00011000' +>>> "Interstellar" in b False ->>> b.exists("Parasite") +>>> "Parasite" in b False ->>> b.exists("Pulp Fiction") +>>> "Pulp Fiction" in b False + +but sometimes there are false positives: +>>> "Ratatouille" in b +True +>>> b.format_hash("Ratatouille") +'01100000' + +>>> b.estimated_error_rate() +0.140625 """ from hashlib import md5, sha256 from random import choices from string import ascii_lowercase +HASH_FUNCTIONS = (sha256, md5) -class Bloom: - # number of hash functions is fixed - HASH_FUNCTIONS = (sha256, md5) +class Bloom: def __init__(self, size: int = 8) -> None: - self.bitstring = 0b0 + self.bitarray = 0b0 self.size = size def add(self, value: str) -> None: h = self.hash_(value) - self.bitstring |= h - - # print( - # f"""\ - # [add] value = {value} - # hash = {self.format_bin(h)} - # filter = {self.format_bin(self.bitstring)} - # """ - # ) + self.bitarray |= h def exists(self, value: str) -> bool: h = self.hash_(value) - res = (h & self.bitstring) == h - - # print( - # f"""\ - # [exists] value = {value} - # hash = {self.format_bin(h)} - # filter = {self.format_bin(self.bitstring)} - # res = {res} - # """ - # ) - return res + return (h & self.bitarray) == h - def 
format_bin(self, value: int) -> str: - res = bin(value)[2:] + def __contains__(self, other): + return self.exists(other) + + def format_bin(self, bitarray: int) -> str: + res = bin(bitarray)[2:] return res.zfill(self.size) + @property + def bitstring(self): + return self.format_bin(self.bitarray) + def hash_(self, value: str) -> int: res = 0b0 - for func in self.HASH_FUNCTIONS: + for func in HASH_FUNCTIONS: b = func(value.encode()).digest() position = int.from_bytes(b, "little") % self.size res |= 2**position return res + def format_hash(self, value: str) -> str: + return self.format_bin(self.hash_(value)) + + def estimated_error_rate(self): + n_ones = bin(self.bitarray).count("1") + k = len(HASH_FUNCTIONS) + return (n_ones / self.size) ** k + def random_string(size: int) -> str: return "".join(choices(ascii_lowercase + " ", k=size)) @@ -76,7 +104,7 @@ def random_string(size: int) -> str: def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None: b = Bloom(size=filter_bits) - k = len(b.HASH_FUNCTIONS) + k = len(HASH_FUNCTIONS) estimated_error_rate_beforehand = ( 1 - (1 - 1 / filter_bits) ** (k * added_elements) ) ** k @@ -85,7 +113,7 @@ def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None: for _ in range(added_elements): b.add(not_added.pop()) - n_ones = bin(b.bitstring).count("1") + n_ones = bin(b.bitarray).count("1") estimated_error_rate = (n_ones / filter_bits) ** k errors = 0 From 28e66913b080f4a9231fe2ec9507c55b471949cc Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Fri, 7 Apr 2023 17:23:01 +0200 Subject: [PATCH 15/32] removed test_probability --- data_structures/hashing/bloom_filter.py | 40 ------------------------- 1 file changed, 40 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 9f6048911a9c..8affaabe4948 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -50,8 +50,6 @@ 0.140625 """ from 
hashlib import md5, sha256 -from random import choices -from string import ascii_lowercase HASH_FUNCTIONS = (sha256, md5) @@ -95,41 +93,3 @@ def estimated_error_rate(self): n_ones = bin(self.bitarray).count("1") k = len(HASH_FUNCTIONS) return (n_ones / self.size) ** k - - -def random_string(size: int) -> str: - return "".join(choices(ascii_lowercase + " ", k=size)) - - -def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None: - b = Bloom(size=filter_bits) - - k = len(HASH_FUNCTIONS) - estimated_error_rate_beforehand = ( - 1 - (1 - 1 / filter_bits) ** (k * added_elements) - ) ** k - - not_added = {random_string(10) for i in range(1000)} - for _ in range(added_elements): - b.add(not_added.pop()) - - n_ones = bin(b.bitarray).count("1") - estimated_error_rate = (n_ones / filter_bits) ** k - - errors = 0 - for string in not_added: - if b.exists(string): - errors += 1 - error_rate = errors / len(not_added) - - print(f"error_rate = {errors}/{len(not_added)} = {error_rate}") - print(f"{estimated_error_rate=}") - print(f"{estimated_error_rate_beforehand=}") - - assert ( - abs(estimated_error_rate - error_rate) <= 0.05 - ) # 5% absolute margin calculated experiementally - - -if __name__ == "__main__": - test_probability() From 2fd71965f12a08e6d0b8a4f526266e5f935270cc Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Fri, 7 Apr 2023 17:31:08 +0200 Subject: [PATCH 16/32] estimated error --- data_structures/hashing/bloom_filter.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 8affaabe4948..586d7de301be 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -13,7 +13,7 @@ '00000000' When an element is added, two bits are set to 1 -since there are 2 hash functions: +since there are 2 hash functions in this implementation: >>> b.add("Titanic") >>> b.bitstring '01100000' @@ -35,10 +35,16 @@ 
'00011000' >>> "Interstellar" in b False +>>> b.format_hash("Interstellar") +'00000011' >>> "Parasite" in b False +>>> b.format_hash("Parasite") +'00010010' >>> "Pulp Fiction" in b False +>>> b.format_hash("Pulp Fiction") +'10000100' but sometimes there are false positives: >>> "Ratatouille" in b @@ -46,8 +52,14 @@ >>> b.format_hash("Ratatouille") '01100000' +The probability increases with the number of added elements >>> b.estimated_error_rate() 0.140625 +>>> b.add("The Goodfather") +>>> b.estimated_error_rate() +0.390625 +>>> b.bitstring +'01111100' """ from hashlib import md5, sha256 From 314237d94e7aee8a835cc97d9b2472a1ce1acc44 Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Fri, 7 Apr 2023 17:49:15 +0200 Subject: [PATCH 17/32] added types --- data_structures/hashing/bloom_filter.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 586d7de301be..a822ed818250 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -72,14 +72,14 @@ def __init__(self, size: int = 8) -> None: self.size = size def add(self, value: str) -> None: - h = self.hash_(value) + h = self.hash(value) self.bitarray |= h def exists(self, value: str) -> bool: - h = self.hash_(value) + h = self.hash(value) return (h & self.bitarray) == h - def __contains__(self, other): + def __contains__(self, other: str) -> bool: return self.exists(other) def format_bin(self, bitarray: int) -> str: @@ -87,10 +87,10 @@ def format_bin(self, bitarray: int) -> str: return res.zfill(self.size) @property - def bitstring(self): + def bitstring(self) -> None: return self.format_bin(self.bitarray) - def hash_(self, value: str) -> int: + def hash(self, value: str) -> int: res = 0b0 for func in HASH_FUNCTIONS: b = func(value.encode()).digest() @@ -99,9 +99,9 @@ def hash_(self, value: str) -> int: return res def format_hash(self, value: str) -> str: - return 
self.format_bin(self.hash_(value)) + return self.format_bin(self.hash(value)) - def estimated_error_rate(self): + def estimated_error_rate(self) -> float: n_ones = bin(self.bitarray).count("1") k = len(HASH_FUNCTIONS) return (n_ones / self.size) ** k From 9b014721e492c31af31f0e27d4e94cd7062cde3a Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Fri, 7 Apr 2023 17:53:03 +0200 Subject: [PATCH 18/32] again hash_ --- data_structures/hashing/bloom_filter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index a822ed818250..51ba916a4788 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -72,11 +72,11 @@ def __init__(self, size: int = 8) -> None: self.size = size def add(self, value: str) -> None: - h = self.hash(value) + h = self.hash_(value) self.bitarray |= h def exists(self, value: str) -> bool: - h = self.hash(value) + h = self.hash_(value) return (h & self.bitarray) == h def __contains__(self, other: str) -> bool: @@ -87,10 +87,10 @@ def format_bin(self, bitarray: int) -> str: return res.zfill(self.size) @property - def bitstring(self) -> None: + def bitstring(self) -> str: return self.format_bin(self.bitarray) - def hash(self, value: str) -> int: + def hash_(self, value: str) -> int: res = 0b0 for func in HASH_FUNCTIONS: b = func(value.encode()).digest() @@ -99,7 +99,7 @@ def hash(self, value: str) -> int: return res def format_hash(self, value: str) -> str: - return self.format_bin(self.hash(value)) + return self.format_bin(self.hash_(value)) def estimated_error_rate(self) -> float: n_ones = bin(self.bitarray).count("1") From c132d501b61cf4f45dd8e7761d1320acf388d015 Mon Sep 17 00:00:00 2001 From: isidroas Date: Sat, 8 Apr 2023 16:06:50 +0200 Subject: [PATCH 19/32] Update data_structures/hashing/bloom_filter.py Co-authored-by: Christian Clauss --- data_structures/hashing/bloom_filter.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 51ba916a4788..de4f7dc23e64 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -2,7 +2,7 @@ See https://en.wikipedia.org/wiki/Bloom_filter The use of this data structure is to test membership in a set. -Compared to python built-in set() it is more space-efficent. +Compared to Python's built-in set() it is more space-efficient. In the following example, only 8 bits of memory will be used: >>> b = Bloom(size=8) >>> "Titanic" in b From 313c80c3694687e526205ab02b9c2583a98756fb Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Sat, 8 Apr 2023 16:09:04 +0200 Subject: [PATCH 20/32] from b to bloom --- data_structures/hashing/bloom_filter.py | 46 ++++++++++++------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index de4f7dc23e64..150461a96460 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -4,61 +4,61 @@ The use of this data structure is to test membership in a set. Compared to Python's built-in set() it is more space-efficient. 
In the following example, only 8 bits of memory will be used: ->>> b = Bloom(size=8) ->>> "Titanic" in b +>>> bloom = Bloom(size=8) +>>> "Titanic" in bloom False Initially the filter contains all zeros: ->>> b.bitstring +>>> bloom.bitstring '00000000' When an element is added, two bits are set to 1 since there are 2 hash functions in this implementation: ->>> b.add("Titanic") ->>> b.bitstring +>>> bloom.add("Titanic") +>>> bloom.bitstring '01100000' ->>> "Titanic" in b +>>> "Titanic" in bloom True However, sometimes only one bit is added because both hash functions return the same value ->>> b.add("Avatar") ->>> b.format_hash("Avatar") +>>> bloom.add("Avatar") +>>> bloom.format_hash("Avatar") '00000100' ->>> b.bitstring +>>> bloom.bitstring '01100100' Not added elements should return False ... ->>> "The Goodfather" in b +>>> "The Goodfather" in bloom False ->>> b.format_hash("The Goodfather") +>>> bloom.format_hash("The Goodfather") '00011000' ->>> "Interstellar" in b +>>> "Interstellar" in bloom False ->>> b.format_hash("Interstellar") +>>> bloom.format_hash("Interstellar") '00000011' ->>> "Parasite" in b +>>> "Parasite" in bloom False ->>> b.format_hash("Parasite") +>>> bloom.format_hash("Parasite") '00010010' ->>> "Pulp Fiction" in b +>>> "Pulp Fiction" in bloom False ->>> b.format_hash("Pulp Fiction") +>>> bloom.format_hash("Pulp Fiction") '10000100' but sometimes there are false positives: ->>> "Ratatouille" in b +>>> "Ratatouille" in bloom True ->>> b.format_hash("Ratatouille") +>>> bloom.format_hash("Ratatouille") '01100000' The probability increases with the number of added elements ->>> b.estimated_error_rate() +>>> bloom.estimated_error_rate() 0.140625 ->>> b.add("The Goodfather") ->>> b.estimated_error_rate() +>>> bloom.add("The Goodfather") +>>> bloom.estimated_error_rate() 0.390625 ->>> b.bitstring +>>> bloom.bitstring '01111100' """ from hashlib import md5, sha256 From 18e0dde13bb2c258d90a33c441965ac323bd0cfd Mon Sep 17 00:00:00 2001 From: isidroas 
Date: Sat, 8 Apr 2023 16:48:15 +0200 Subject: [PATCH 21/32] Update data_structures/hashing/bloom_filter.py Co-authored-by: Christian Clauss --- data_structures/hashing/bloom_filter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 150461a96460..526833c014d6 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -5,15 +5,15 @@ Compared to Python's built-in set() it is more space-efficient. In the following example, only 8 bits of memory will be used: >>> bloom = Bloom(size=8) ->>> "Titanic" in bloom -False -Initially the filter contains all zeros: +Initially, the filter contains all zeros: >>> bloom.bitstring '00000000' When an element is added, two bits are set to 1 since there are 2 hash functions in this implementation: +>>> "Titanic" in bloom +False >>> bloom.add("Titanic") >>> bloom.bitstring '01100000' From 54041ff38c8d7302e2492281e1f18ceac28ad03d Mon Sep 17 00:00:00 2001 From: isidroas Date: Sat, 8 Apr 2023 16:49:05 +0200 Subject: [PATCH 22/32] Update data_structures/hashing/bloom_filter.py Co-authored-by: Christian Clauss --- data_structures/hashing/bloom_filter.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 526833c014d6..fc599c3f8716 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -29,22 +29,11 @@ '01100100' Not added elements should return False ... 
->>> "The Goodfather" in bloom +>>> not_present_films = ("The Goodfather", "Interstellar", "Parasite", "Pulp Fiction") +>>> {film: bloom.format_hash(film) for film in not_present_films)} +{'The Goodfather': '00011000', 'Interstellar': '00000011', 'Parasite': '00010010': 'Pulp Fiction': '10000100'} +>>> any(film in bloom for film in not_present_films) False ->>> bloom.format_hash("The Goodfather") -'00011000' ->>> "Interstellar" in bloom -False ->>> bloom.format_hash("Interstellar") -'00000011' ->>> "Parasite" in bloom -False ->>> bloom.format_hash("Parasite") -'00010010' ->>> "Pulp Fiction" in bloom -False ->>> bloom.format_hash("Pulp Fiction") -'10000100' but sometimes there are false positives: >>> "Ratatouille" in bloom From 483a2a0ab2964d2425f09cd2379de89a013fd627 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 8 Apr 2023 14:49:30 +0000 Subject: [PATCH 23/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data_structures/hashing/bloom_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index fc599c3f8716..7ec5a4f35b62 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -30,7 +30,7 @@ Not added elements should return False ... 
>>> not_present_films = ("The Goodfather", "Interstellar", "Parasite", "Pulp Fiction") ->>> {film: bloom.format_hash(film) for film in not_present_films)} +>>> {film: bloom.format_hash(film) for film in not_present_films)} {'The Goodfather': '00011000', 'Interstellar': '00000011', 'Parasite': '00010010': 'Pulp Fiction': '10000100'} >>> any(film in bloom for film in not_present_films) False From 174ce08c731b4254a42f3ab5bea854c10c2f5caa Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Sat, 8 Apr 2023 17:01:16 +0200 Subject: [PATCH 24/32] syntax error in dict comprehension --- data_structures/hashing/bloom_filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 7ec5a4f35b62..a92c5d86b999 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -30,8 +30,8 @@ Not added elements should return False ... >>> not_present_films = ("The Goodfather", "Interstellar", "Parasite", "Pulp Fiction") ->>> {film: bloom.format_hash(film) for film in not_present_films)} -{'The Goodfather': '00011000', 'Interstellar': '00000011', 'Parasite': '00010010': 'Pulp Fiction': '10000100'} +>>> {film: bloom.format_hash(film) for film in not_present_films} +{'The Goodfather': '00011000', 'Interstellar': '00000011', 'Parasite': '00010010', 'Pulp Fiction': '10000100'} >>> any(film in bloom for film in not_present_films) False From 00cc60e23aec5e97aa9b417733c0f47c196ac0bf Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Sat, 8 Apr 2023 17:03:08 +0200 Subject: [PATCH 25/32] from goodfather to godfather --- data_structures/hashing/bloom_filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index a92c5d86b999..0ba1557e6dc7 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -29,9 +29,9 @@ 
'01100100' Not added elements should return False ... ->>> not_present_films = ("The Goodfather", "Interstellar", "Parasite", "Pulp Fiction") +>>> not_present_films = ("The Godfather", "Interstellar", "Parasite", "Pulp Fiction") >>> {film: bloom.format_hash(film) for film in not_present_films} -{'The Goodfather': '00011000', 'Interstellar': '00000011', 'Parasite': '00010010', 'Pulp Fiction': '10000100'} +{'The Godfather': '00000101', 'Interstellar': '00000011', 'Parasite': '00010010', 'Pulp Fiction': '10000100'} >>> any(film in bloom for film in not_present_films) False From 35fa5f5c4bf101d073aad43c37b0a423d8975071 Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Sat, 8 Apr 2023 17:20:08 +0200 Subject: [PATCH 26/32] removed Interstellar --- data_structures/hashing/bloom_filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 0ba1557e6dc7..a659fccf7f86 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -29,9 +29,9 @@ '01100100' Not added elements should return False ... 
->>> not_present_films = ("The Godfather", "Interstellar", "Parasite", "Pulp Fiction") +>>> not_present_films = ("The Godfather", "Parasite", "Pulp Fiction") >>> {film: bloom.format_hash(film) for film in not_present_films} -{'The Godfather': '00000101', 'Interstellar': '00000011', 'Parasite': '00010010', 'Pulp Fiction': '10000100'} +{'The Godfather': '00000101', 'Parasite': '00010010', 'Pulp Fiction': '10000100'} >>> any(film in bloom for film in not_present_films) False From 5cd20ea9976390b46f3784421e21ed63c4f66575 Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Sat, 8 Apr 2023 17:33:51 +0200 Subject: [PATCH 27/32] forgot the last Godfather --- data_structures/hashing/bloom_filter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index a659fccf7f86..c56dd55e5d1f 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -44,11 +44,11 @@ The probability increases with the number of added elements >>> bloom.estimated_error_rate() 0.140625 ->>> bloom.add("The Goodfather") +>>> bloom.add("The Godfather") >>> bloom.estimated_error_rate() -0.390625 +0.25 >>> bloom.bitstring -'01111100' +'01100101' """ from hashlib import md5, sha256 From 7617143cbf56918fd4d1f3a83e3450f909008fb1 Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Sat, 8 Apr 2023 18:42:58 +0200 Subject: [PATCH 28/32] Revert "removed Interestellar" This reverts commit 35fa5f5c4bf101d073aad43c37b0a423d8975071. --- data_structures/hashing/bloom_filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index c56dd55e5d1f..8b0bfa86a159 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -29,9 +29,9 @@ '01100100' Not added elements should return False ... 
->>> not_present_films = ("The Godfather", "Parasite", "Pulp Fiction") +>>> not_present_films = ("The Godfather", "Interstellar", "Parasite", "Pulp Fiction") >>> {film: bloom.format_hash(film) for film in not_present_films} -{'The Godfather': '00000101', 'Parasite': '00010010', 'Pulp Fiction': '10000100'} +{'The Godfather': '00000101', 'Interstellar': '00000011', 'Parasite': '00010010', 'Pulp Fiction': '10000100'} >>> any(film in bloom for film in not_present_films) False From 799171a27316d36a7736a5646304bf0bb9795d12 Mon Sep 17 00:00:00 2001 From: Isidro Arias Date: Sat, 8 Apr 2023 18:54:01 +0200 Subject: [PATCH 29/32] pretty dict --- data_structures/hashing/bloom_filter.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 8b0bfa86a159..ec784aff13e8 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -30,8 +30,14 @@ Not added elements should return False ... >>> not_present_films = ("The Godfather", "Interstellar", "Parasite", "Pulp Fiction") ->>> {film: bloom.format_hash(film) for film in not_present_films} -{'The Godfather': '00000101', 'Interstellar': '00000011', 'Parasite': '00010010', 'Pulp Fiction': '10000100'} +>>> { +... film: bloom.format_hash(film) +... for film in not_present_films +... 
} # doctest: +NORMALIZE_WHITESPACE +{'The Godfather': '00000101', + 'Interstellar': '00000011', + 'Parasite': '00010010', + 'Pulp Fiction': '10000100'} >>> any(film in bloom for film in not_present_films) False From 1a71f4cb6aa22fe587c364fad84808bdd760dd12 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sat, 8 Apr 2023 19:25:59 +0200 Subject: [PATCH 30/32] Apply suggestions from code review --- data_structures/hashing/bloom_filter.py | 28 +++++++++++++------------ 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index ec784aff13e8..eab8de643b87 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -23,6 +23,8 @@ However, sometimes only one bit is added because both hash functions return the same value >>> bloom.add("Avatar") +>>> "Avatar" in bloom +True >>> bloom.format_hash("Avatar") '00000100' >>> bloom.bitstring @@ -31,13 +33,13 @@ Not added elements should return False ... >>> not_present_films = ("The Godfather", "Interstellar", "Parasite", "Pulp Fiction") >>> { -... film: bloom.format_hash(film) -... for film in not_present_films +... film: bloom.format_hash(film) for film in not_present_films ... } # doctest: +NORMALIZE_WHITESPACE -{'The Godfather': '00000101', - 'Interstellar': '00000011', - 'Parasite': '00010010', - 'Pulp Fiction': '10000100'} +{ + 'The Godfather': '00000101', + 'Interstellar': '00000011', + 'Parasite': '00010010', + 'Pulp Fiction': '10000100'} >>> any(film in bloom for film in not_present_films) False @@ -47,11 +49,12 @@ >>> bloom.format_hash("Ratatouille") '01100000' -The probability increases with the number of added elements ->>> bloom.estimated_error_rate() +The probability increases with the number of elements added. +The probability decreases with the number of bits in the bitarray. 
+>>> bloom.estimated_error_rate 0.140625 >>> bloom.add("The Godfather") ->>> bloom.estimated_error_rate() +>>> bloom.estimated_error_rate 0.25 >>> bloom.bitstring '01100101' @@ -88,15 +91,14 @@ def bitstring(self) -> str: def hash_(self, value: str) -> int: res = 0b0 for func in HASH_FUNCTIONS: - b = func(value.encode()).digest() - position = int.from_bytes(b, "little") % self.size + position = int.from_bytes(func(value.encode()).digest(), "little") % self.size res |= 2**position return res def format_hash(self, value: str) -> str: return self.format_bin(self.hash_(value)) + @property def estimated_error_rate(self) -> float: n_ones = bin(self.bitarray).count("1") - k = len(HASH_FUNCTIONS) - return (n_ones / self.size) ** k + return (n_ones / self.size) ** len(HASH_FUNCTIONS) From 4e0263f9a57d67bd1c1c630694d2cdd08262686c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 8 Apr 2023 17:26:23 +0000 Subject: [PATCH 31/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data_structures/hashing/bloom_filter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index eab8de643b87..39455b93c55f 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -91,7 +91,9 @@ def bitstring(self) -> str: def hash_(self, value: str) -> int: res = 0b0 for func in HASH_FUNCTIONS: - position = int.from_bytes(func(value.encode()).digest(), "little") % self.size + position = ( + int.from_bytes(func(value.encode()).digest(), "little") % self.size + ) res |= 2**position return res From e74674605dc20c6a3ac876566cc72774cf857cde Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sat, 8 Apr 2023 19:34:56 +0200 Subject: [PATCH 32/32] Update bloom_filter.py --- data_structures/hashing/bloom_filter.py | 9 ++++----- 1 file changed, 4 
insertions(+), 5 deletions(-) diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py index 39455b93c55f..7fd0985bdc33 100644 --- a/data_structures/hashing/bloom_filter.py +++ b/data_structures/hashing/bloom_filter.py @@ -35,11 +35,10 @@ >>> { ... film: bloom.format_hash(film) for film in not_present_films ... } # doctest: +NORMALIZE_WHITESPACE -{ - 'The Godfather': '00000101', - 'Interstellar': '00000011', - 'Parasite': '00010010', - 'Pulp Fiction': '10000100'} +{'The Godfather': '00000101', + 'Interstellar': '00000011', + 'Parasite': '00010010', + 'Pulp Fiction': '10000100'} >>> any(film in bloom for film in not_present_films) False