From 173ab0ea96b4969b51f4d23f033a45242fe7e80a Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Thu, 6 Apr 2023 10:59:59 +0200
Subject: [PATCH 01/32] Bloom filter with tests

---
 data_structures/hashing/bloom_filter.py | 103 ++++++++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 data_structures/hashing/bloom_filter.py

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
new file mode 100644
index 000000000000..c34dd4eeac6b
--- /dev/null
+++ b/data_structures/hashing/bloom_filter.py
@@ -0,0 +1,103 @@
+"""
+See https://en.wikipedia.org/wiki/Bloom_filter
+"""
+from hashlib import sha256, md5
+from random import randint, choices
+import string
+
+
+class Bloom:
+    def __init__(self, size=8):
+        self.bitstring = 0b0
+        self.size = size
+
+    def add(self, value):
+        h = self.hash(value)
+        self.bitstring |= h
+        print(
+            f"""\
+[add] value =      {value}
+      hash =       {self.format_bin(h)}
+      filter =     {self.format_bin(self.bitstring)}
+"""
+        )
+
+    def exists(self, value):
+        h = self.hash(value)
+        res = (h & self.bitstring) == h
+
+        print(
+            f"""\
+[exists] value =   {value}
+         hash =    {self.format_bin(h)}
+         filter =  {self.format_bin(self.bitstring)}
+         res =     {res}
+"""
+        )
+        return res
+
+    def format_bin(self, value):
+        res = bin(value)[2:]
+        return res.zfill(self.size)
+
+    def hash(self, value):
+        res = 0b0
+        for func in (sha256, md5):
+            b = func(value.encode()).digest()
+            position = int.from_bytes(b, "little") % self.size
+            res |= 2**position
+        return res
+
+
+def test_movies():
+    b = Bloom()
+    b.add("titanic")
+    b.add("avatar")
+
+    assert b.exists("titanic")
+    assert b.exists("avatar")
+
+    assert b.exists("the goodfather") in (True, False)
+    assert b.exists("interstellar") in (True, False)
+    assert b.exists("Parasite") in (True, False)
+    assert b.exists("Pulp fiction") in (True, False)
+
+
+def random_string(size):
+    return "".join(choices(string.ascii_lowercase + " ", k=size))
+
+
+def test_probability(m=64, n=20):
+    b = Bloom(size=m)
+
+    added = {random_string(10) for i in range(n)}
+    for a in added:
+        b.add(a)
+
+    # number of hash functions is fixed
+    k = 2
+
+    n_ones = bin(b.bitstring).count("1")
+    expected_probability = (n_ones / m) ** k
+
+    expected_probability_wikipedia = (1 - (1 - 1 / m) ** (k * n)) ** k
+
+    not_added = {random_string(10) for i in range(1000)}
+    fails = 0
+    for string in not_added:
+        if b.exists(string):
+            fails += 1
+    fail_rate = fails / len(not_added)
+
+    print(f"total = {len(not_added)}, fails = {fails}, fail_rate = {fail_rate}")
+    print(f"{expected_probability=}")
+    print(f"{expected_probability_wikipedia=}")
+
+    assert (
+        abs(expected_probability - fail_rate) <= 0.05
+    )  # 5% margin calculated experiementally
+
+
+if __name__ == "__main__":
+    test_movies()
+    test_probability()

From 08bc970b7ad3d80a4c66828e95fcb0e186d7c4d8 Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Thu, 6 Apr 2023 11:20:41 +0200
Subject: [PATCH 02/32] hash functions constant

---
 data_structures/hashing/bloom_filter.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index c34dd4eeac6b..fa3581e95a8f 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -7,6 +7,9 @@
 
 
 class Bloom:
+    # number of hash functions is fixed
+    HASH_FUNCTIONS = (sha256, md5)
+
     def __init__(self, size=8):
         self.bitstring = 0b0
         self.size = size
@@ -42,7 +45,7 @@ def format_bin(self, value):
 
     def hash(self, value):
         res = 0b0
-        for func in (sha256, md5):
+        for func in HASH_FUNCTIONS:
             b = func(value.encode()).digest()
             position = int.from_bytes(b, "little") % self.size
             res |= 2**position
@@ -74,8 +77,7 @@ def test_probability(m=64, n=20):
     for a in added:
         b.add(a)
 
-    # number of hash functions is fixed
-    k = 2
+    k = len(b.HASH_FUNCIONS)
 
     n_ones = bin(b.bitstring).count("1")
     expected_probability = (n_ones / m) ** k
@@ -95,7 +97,7 @@ def test_probability(m=64, n=20):
 
     assert (
         abs(expected_probability - fail_rate) <= 0.05
-    )  # 5% margin calculated experiementally
+    )  # 5% absolute margin calculated experiementally
 
 
 if __name__ == "__main__":

From 044810991ada1bb404180783c7721a3dc8f5209e Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Thu, 6 Apr 2023 11:21:52 +0200
Subject: [PATCH 03/32] fix typo

---
 data_structures/hashing/bloom_filter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index fa3581e95a8f..c454c16ff16e 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -45,7 +45,7 @@ def format_bin(self, value):
 
     def hash(self, value):
         res = 0b0
-        for func in HASH_FUNCTIONS:
+        for func in self.HASH_FUNCTIONS:
             b = func(value.encode()).digest()
             position = int.from_bytes(b, "little") % self.size
             res |= 2**position
@@ -77,7 +77,7 @@ def test_probability(m=64, n=20):
     for a in added:
         b.add(a)
 
-    k = len(b.HASH_FUNCIONS)
+    k = len(b.HASH_FUNCTIONS)
 
     n_ones = bin(b.bitstring).count("1")
     expected_probability = (n_ones / m) ** k

From 486dcbc04c89312c56c78d6a5af687e9b4e48494 Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Thu, 6 Apr 2023 13:42:18 +0200
Subject: [PATCH 04/32] isort

---
 data_structures/hashing/bloom_filter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index c454c16ff16e..74db6e2a1120 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -1,9 +1,9 @@
 """
 See https://en.wikipedia.org/wiki/Bloom_filter
 """
-from hashlib import sha256, md5
-from random import randint, choices
 import string
+from hashlib import md5, sha256
+from random import choices
 
 
 class Bloom:

From 4111807c4708c08fb56408b7e87d9473debfe824 Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Thu, 6 Apr 2023 13:46:21 +0200
Subject: [PATCH 05/32] passing ruff

---
 data_structures/hashing/bloom_filter.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index 74db6e2a1120..c39c654337cb 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -1,9 +1,9 @@
 """
 See https://en.wikipedia.org/wiki/Bloom_filter
 """
-import string
 from hashlib import md5, sha256
 from random import choices
+from string import ascii_lowercase
 
 
 class Bloom:
@@ -15,7 +15,7 @@ def __init__(self, size=8):
         self.size = size
 
     def add(self, value):
-        h = self.hash(value)
+        h = self.hash_(value)
         self.bitstring |= h
         print(
             f"""\
@@ -26,7 +26,7 @@ def add(self, value):
         )
 
     def exists(self, value):
-        h = self.hash(value)
+        h = self.hash_(value)
         res = (h & self.bitstring) == h
 
         print(
@@ -43,7 +43,7 @@ def format_bin(self, value):
         res = bin(value)[2:]
         return res.zfill(self.size)
 
-    def hash(self, value):
+    def hash_(self, value):
         res = 0b0
         for func in self.HASH_FUNCTIONS:
             b = func(value.encode()).digest()
@@ -67,7 +67,7 @@ def test_movies():
 
 
 def random_string(size):
-    return "".join(choices(string.ascii_lowercase + " ", k=size))
+    return "".join(choices(ascii_lowercase + " ", k=size))
 
 
 def test_probability(m=64, n=20):

From e6ce09836d66173795cc0b61437f094a2025b201 Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Thu, 6 Apr 2023 13:59:37 +0200
Subject: [PATCH 06/32] type hints

---
 data_structures/hashing/bloom_filter.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index c39c654337cb..d463142cd619 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -14,7 +14,7 @@ def __init__(self, size=8):
         self.bitstring = 0b0
         self.size = size
 
-    def add(self, value):
+    def add(self, value: str):
         h = self.hash_(value)
         self.bitstring |= h
         print(
@@ -25,7 +25,7 @@ def add(self, value):
 """
         )
 
-    def exists(self, value):
+    def exists(self, value: str)-> bool:
         h = self.hash_(value)
         res = (h & self.bitstring) == h
 
@@ -39,11 +39,11 @@ def exists(self, value):
         )
         return res
 
-    def format_bin(self, value):
+    def format_bin(self, value: int) -> str:
         res = bin(value)[2:]
         return res.zfill(self.size)
 
-    def hash_(self, value):
+    def hash_(self, value: str) -> int:
         res = 0b0
         for func in self.HASH_FUNCTIONS:
             b = func(value.encode()).digest()

From e4d39db77ecde7f65489926b58239ded04624c73 Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Thu, 6 Apr 2023 13:59:52 +0200
Subject: [PATCH 07/32] type hints

---
 data_structures/hashing/bloom_filter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index d463142cd619..edcb9abdb789 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -25,7 +25,7 @@ def add(self, value: str):
 """
         )
 
-    def exists(self, value: str)-> bool:
+    def exists(self, value: str) -> bool:
         h = self.hash_(value)
         res = (h & self.bitstring) == h
 

From 7629686cc7a18073b21092698a1008732981d00d Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Thu, 6 Apr 2023 14:07:01 +0200
Subject: [PATCH 08/32] from fail to error

---
 data_structures/hashing/bloom_filter.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index edcb9abdb789..e2cc224fbbc2 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -73,30 +73,29 @@ def random_string(size):
 def test_probability(m=64, n=20):
     b = Bloom(size=m)
 
+    k = len(b.HASH_FUNCTIONS)
+    estimated_error_rate_beforehand = (1 - (1 - 1 / m) ** (k * n)) ** k
+
     added = {random_string(10) for i in range(n)}
     for a in added:
         b.add(a)
 
-    k = len(b.HASH_FUNCTIONS)
-
     n_ones = bin(b.bitstring).count("1")
-    expected_probability = (n_ones / m) ** k
-
-    expected_probability_wikipedia = (1 - (1 - 1 / m) ** (k * n)) ** k
+    estimated_error_rate = (n_ones / m) ** k
 
     not_added = {random_string(10) for i in range(1000)}
-    fails = 0
+    errors = 0
     for string in not_added:
         if b.exists(string):
-            fails += 1
-    fail_rate = fails / len(not_added)
+            errors += 1
+    error_rate = errors / len(not_added)
 
-    print(f"total = {len(not_added)}, fails = {fails}, fail_rate = {fail_rate}")
-    print(f"{expected_probability=}")
-    print(f"{expected_probability_wikipedia=}")
+    print(f"total = {len(not_added)}, errors = {errors}, error_rate = {error_rate}")
+    print(f"{estimated_error_rate=}")
+    print(f"{estimated_error_rate_beforehand=}")
 
     assert (
-        abs(expected_probability - fail_rate) <= 0.05
+        abs(estimated_error_rate - error_rate) <= 0.05
     )  # 5% absolute margin calculated experiementally
 
 

From 392616713d17ea555d4425c7d35678402f0c39ae Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Thu, 6 Apr 2023 14:08:04 +0200
Subject: [PATCH 09/32] capital letter

---
 data_structures/hashing/bloom_filter.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index e2cc224fbbc2..1f08ddaf257a 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -54,16 +54,16 @@ def hash_(self, value: str) -> int:
 
 def test_movies():
     b = Bloom()
-    b.add("titanic")
-    b.add("avatar")
+    b.add("Titanic")
+    b.add("Avatar")
 
-    assert b.exists("titanic")
-    assert b.exists("avatar")
+    assert b.exists("Titanic")
+    assert b.exists("Avatar")
 
-    assert b.exists("the goodfather") in (True, False)
-    assert b.exists("interstellar") in (True, False)
+    assert b.exists("The Goodfather") in (True, False)
+    assert b.exists("Interstellar") in (True, False)
     assert b.exists("Parasite") in (True, False)
-    assert b.exists("Pulp fiction") in (True, False)
+    assert b.exists("Pulp Fiction") in (True, False)
 
 
 def random_string(size):

From 280ffa0564b21ad1e6d8240dcfca1aa72ad72394 Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Thu, 6 Apr 2023 14:17:32 +0200
Subject: [PATCH 10/32] type hints requested by bot

---
 data_structures/hashing/bloom_filter.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index 1f08ddaf257a..029d24a3e6d5 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -10,7 +10,7 @@ class Bloom:
     # number of hash functions is fixed
     HASH_FUNCTIONS = (sha256, md5)
 
-    def __init__(self, size=8):
+    def __init__(self, size: int = 8) -> None:
         self.bitstring = 0b0
         self.size = size
 
@@ -52,7 +52,7 @@ def hash_(self, value: str) -> int:
         return res
 
 
-def test_movies():
+def test_movies() -> None:
     b = Bloom()
     b.add("Titanic")
     b.add("Avatar")
@@ -66,11 +66,11 @@ def test_movies():
     assert b.exists("Pulp Fiction") in (True, False)
 
 
-def random_string(size):
+def random_string(size: int) -> str:
     return "".join(choices(ascii_lowercase + " ", k=size))
 
 
-def test_probability(m=64, n=20):
+def test_probability(m: int = 64, n: int = 20) -> None:
     b = Bloom(size=m)
 
     k = len(b.HASH_FUNCTIONS)

From 5d460aa79e48fa11444aff0e196e3121b548f861 Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Thu, 6 Apr 2023 14:31:19 +0200
Subject: [PATCH 11/32] descriptive name for m

---
 data_structures/hashing/bloom_filter.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index 029d24a3e6d5..1c13b382e436 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -14,7 +14,7 @@ def __init__(self, size: int = 8) -> None:
         self.bitstring = 0b0
         self.size = size
 
-    def add(self, value: str):
+    def add(self, value: str) -> None:
         h = self.hash_(value)
         self.bitstring |= h
         print(
@@ -70,18 +70,18 @@ def random_string(size: int) -> str:
     return "".join(choices(ascii_lowercase + " ", k=size))
 
 
-def test_probability(m: int = 64, n: int = 20) -> None:
-    b = Bloom(size=m)
+def test_probability(bits: int = 64, n: int = 20) -> None:
+    b = Bloom(size=bits)
 
     k = len(b.HASH_FUNCTIONS)
-    estimated_error_rate_beforehand = (1 - (1 - 1 / m) ** (k * n)) ** k
+    estimated_error_rate_beforehand = (1 - (1 - 1 / bits) ** (k * n)) ** k
 
     added = {random_string(10) for i in range(n)}
     for a in added:
         b.add(a)
 
     n_ones = bin(b.bitstring).count("1")
-    estimated_error_rate = (n_ones / m) ** k
+    estimated_error_rate = (n_ones / bits) ** k
 
     not_added = {random_string(10) for i in range(1000)}
     errors = 0

From cc54095c47bb76d67b4e00a5f36efb7deccf37bf Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Thu, 6 Apr 2023 17:26:25 +0200
Subject: [PATCH 12/32] more descriptive arguments II

---
 data_structures/hashing/bloom_filter.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index 1c13b382e436..32e98dcebe64 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -70,27 +70,28 @@ def random_string(size: int) -> str:
     return "".join(choices(ascii_lowercase + " ", k=size))
 
 
-def test_probability(bits: int = 64, n: int = 20) -> None:
-    b = Bloom(size=bits)
+def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None:
+    b = Bloom(size=filter_bits)
 
     k = len(b.HASH_FUNCTIONS)
-    estimated_error_rate_beforehand = (1 - (1 - 1 / bits) ** (k * n)) ** k
+    estimated_error_rate_beforehand = (
+        1 - (1 - 1 / filter_bits) ** (k * added_elements)
+    ) ** k
 
-    added = {random_string(10) for i in range(n)}
-    for a in added:
-        b.add(a)
+    not_added = {random_string(10) for i in range(1000)}
+    for _ in range(added_elements):
+        b.add(not_added.pop())
 
     n_ones = bin(b.bitstring).count("1")
-    estimated_error_rate = (n_ones / bits) ** k
+    estimated_error_rate = (n_ones / filter_bits) ** k
 
-    not_added = {random_string(10) for i in range(1000)}
     errors = 0
     for string in not_added:
         if b.exists(string):
             errors += 1
     error_rate = errors / len(not_added)
 
-    print(f"total = {len(not_added)}, errors = {errors}, error_rate = {error_rate}")
+    print(f"error_rate = {errors}/{len(not_added)} = {error_rate}")
     print(f"{estimated_error_rate=}")
     print(f"{estimated_error_rate_beforehand=}")
 

From 78d19fd19b18b33ea87e9c9a4a61305524261a31 Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Fri, 7 Apr 2023 16:01:35 +0200
Subject: [PATCH 13/32] moved movies_test to doctest

---
 data_structures/hashing/bloom_filter.py | 62 +++++++++++++------------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index 32e98dcebe64..02e01713b628 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -1,5 +1,21 @@
 """
 See https://en.wikipedia.org/wiki/Bloom_filter
+
+>>> b = Bloom()
+>>> b.add("Titanic")
+>>> b.add("Avatar")
+>>> b.exists("Titanic")
+True
+>>> b.exists("Avatar")
+True
+>>> b.exists("The Goodfather")
+False
+>>> b.exists("Interstellar")
+False
+>>> b.exists("Parasite")
+False
+>>> b.exists("Pulp Fiction")
+False
 """
 from hashlib import md5, sha256
 from random import choices
@@ -17,26 +33,27 @@ def __init__(self, size: int = 8) -> None:
     def add(self, value: str) -> None:
         h = self.hash_(value)
         self.bitstring |= h
-        print(
-            f"""\
-[add] value =      {value}
-      hash =       {self.format_bin(h)}
-      filter =     {self.format_bin(self.bitstring)}
-"""
-        )
+
+    #        print(
+    #            f"""\
+    # [add] value =      {value}
+    #      hash =       {self.format_bin(h)}
+    #      filter =     {self.format_bin(self.bitstring)}
+    # """
+    #        )
 
     def exists(self, value: str) -> bool:
         h = self.hash_(value)
         res = (h & self.bitstring) == h
 
-        print(
-            f"""\
-[exists] value =   {value}
-         hash =    {self.format_bin(h)}
-         filter =  {self.format_bin(self.bitstring)}
-         res =     {res}
-"""
-        )
+        #        print(
+        #            f"""\
+        # [exists] value =   {value}
+        #         hash =    {self.format_bin(h)}
+        #         filter =  {self.format_bin(self.bitstring)}
+        #         res =     {res}
+        # """
+        #        )
         return res
 
     def format_bin(self, value: int) -> str:
@@ -52,20 +69,6 @@ def hash_(self, value: str) -> int:
         return res
 
 
-def test_movies() -> None:
-    b = Bloom()
-    b.add("Titanic")
-    b.add("Avatar")
-
-    assert b.exists("Titanic")
-    assert b.exists("Avatar")
-
-    assert b.exists("The Goodfather") in (True, False)
-    assert b.exists("Interstellar") in (True, False)
-    assert b.exists("Parasite") in (True, False)
-    assert b.exists("Pulp Fiction") in (True, False)
-
-
 def random_string(size: int) -> str:
     return "".join(choices(ascii_lowercase + " ", k=size))
 
@@ -101,5 +104,4 @@ def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None:
 
 
 if __name__ == "__main__":
-    test_movies()
     test_probability()

From 8b1bec0dc6d5474d749210a70c0221f6527b2258 Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Fri, 7 Apr 2023 17:21:38 +0200
Subject: [PATCH 14/32] commented doctest

---
 data_structures/hashing/bloom_filter.py | 104 +++++++++++++++---------
 1 file changed, 66 insertions(+), 38 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index 02e01713b628..9f6048911a9c 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -1,73 +1,101 @@
 """
 See https://en.wikipedia.org/wiki/Bloom_filter
 
->>> b = Bloom()
+The use of this data structure is to test membership in a set.
+Compared to python built-in set() it is more space-efficent.
+In the following example, only 8 bits of memory will be used:
+>>> b = Bloom(size=8)
+>>> "Titanic" in b
+False
+
+Initially the filter contains all zeros:
+>>> b.bitstring
+'00000000'
+
+When an element is added, two bits are set to 1
+since there are 2 hash functions:
 >>> b.add("Titanic")
->>> b.add("Avatar")
->>> b.exists("Titanic")
-True
->>> b.exists("Avatar")
+>>> b.bitstring
+'01100000'
+>>> "Titanic" in b
 True
->>> b.exists("The Goodfather")
+
+However, sometimes only one bit is added
+because both hash functions return the same value
+>>> b.add("Avatar")
+>>> b.format_hash("Avatar")
+'00000100'
+>>> b.bitstring
+'01100100'
+
+Not added elements should return False ...
+>>> "The Goodfather" in b
 False
->>> b.exists("Interstellar")
+>>> b.format_hash("The Goodfather")
+'00011000'
+>>> "Interstellar" in b
 False
->>> b.exists("Parasite")
+>>> "Parasite" in b
 False
->>> b.exists("Pulp Fiction")
+>>> "Pulp Fiction" in b
 False
+
+but sometimes there are false positives:
+>>> "Ratatouille" in b
+True
+>>> b.format_hash("Ratatouille")
+'01100000'
+
+>>> b.estimated_error_rate()
+0.140625
 """
 from hashlib import md5, sha256
 from random import choices
 from string import ascii_lowercase
 
+HASH_FUNCTIONS = (sha256, md5)
 
-class Bloom:
-    # number of hash functions is fixed
-    HASH_FUNCTIONS = (sha256, md5)
 
+class Bloom:
     def __init__(self, size: int = 8) -> None:
-        self.bitstring = 0b0
+        self.bitarray = 0b0
         self.size = size
 
     def add(self, value: str) -> None:
         h = self.hash_(value)
-        self.bitstring |= h
-
-    #        print(
-    #            f"""\
-    # [add] value =      {value}
-    #      hash =       {self.format_bin(h)}
-    #      filter =     {self.format_bin(self.bitstring)}
-    # """
-    #        )
+        self.bitarray |= h
 
     def exists(self, value: str) -> bool:
         h = self.hash_(value)
-        res = (h & self.bitstring) == h
-
-        #        print(
-        #            f"""\
-        # [exists] value =   {value}
-        #         hash =    {self.format_bin(h)}
-        #         filter =  {self.format_bin(self.bitstring)}
-        #         res =     {res}
-        # """
-        #        )
-        return res
+        return (h & self.bitarray) == h
 
-    def format_bin(self, value: int) -> str:
-        res = bin(value)[2:]
+    def __contains__(self, other):
+        return self.exists(other)
+
+    def format_bin(self, bitarray: int) -> str:
+        res = bin(bitarray)[2:]
         return res.zfill(self.size)
 
+    @property
+    def bitstring(self):
+        return self.format_bin(self.bitarray)
+
     def hash_(self, value: str) -> int:
         res = 0b0
-        for func in self.HASH_FUNCTIONS:
+        for func in HASH_FUNCTIONS:
             b = func(value.encode()).digest()
             position = int.from_bytes(b, "little") % self.size
             res |= 2**position
         return res
 
+    def format_hash(self, value: str) -> str:
+        return self.format_bin(self.hash_(value))
+
+    def estimated_error_rate(self):
+        n_ones = bin(self.bitarray).count("1")
+        k = len(HASH_FUNCTIONS)
+        return (n_ones / self.size) ** k
+
 
 def random_string(size: int) -> str:
     return "".join(choices(ascii_lowercase + " ", k=size))
@@ -76,7 +104,7 @@ def random_string(size: int) -> str:
 def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None:
     b = Bloom(size=filter_bits)
 
-    k = len(b.HASH_FUNCTIONS)
+    k = len(HASH_FUNCTIONS)
     estimated_error_rate_beforehand = (
         1 - (1 - 1 / filter_bits) ** (k * added_elements)
     ) ** k
@@ -85,7 +113,7 @@ def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None:
     for _ in range(added_elements):
         b.add(not_added.pop())
 
-    n_ones = bin(b.bitstring).count("1")
+    n_ones = bin(b.bitarray).count("1")
     estimated_error_rate = (n_ones / filter_bits) ** k
 
     errors = 0

From 28e66913b080f4a9231fe2ec9507c55b471949cc Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Fri, 7 Apr 2023 17:23:01 +0200
Subject: [PATCH 15/32] removed test_probability

---
 data_structures/hashing/bloom_filter.py | 40 -------------------------
 1 file changed, 40 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index 9f6048911a9c..8affaabe4948 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -50,8 +50,6 @@
 0.140625
 """
 from hashlib import md5, sha256
-from random import choices
-from string import ascii_lowercase
 
 HASH_FUNCTIONS = (sha256, md5)
 
@@ -95,41 +93,3 @@ def estimated_error_rate(self):
         n_ones = bin(self.bitarray).count("1")
         k = len(HASH_FUNCTIONS)
         return (n_ones / self.size) ** k
-
-
-def random_string(size: int) -> str:
-    return "".join(choices(ascii_lowercase + " ", k=size))
-
-
-def test_probability(filter_bits: int = 64, added_elements: int = 20) -> None:
-    b = Bloom(size=filter_bits)
-
-    k = len(HASH_FUNCTIONS)
-    estimated_error_rate_beforehand = (
-        1 - (1 - 1 / filter_bits) ** (k * added_elements)
-    ) ** k
-
-    not_added = {random_string(10) for i in range(1000)}
-    for _ in range(added_elements):
-        b.add(not_added.pop())
-
-    n_ones = bin(b.bitarray).count("1")
-    estimated_error_rate = (n_ones / filter_bits) ** k
-
-    errors = 0
-    for string in not_added:
-        if b.exists(string):
-            errors += 1
-    error_rate = errors / len(not_added)
-
-    print(f"error_rate = {errors}/{len(not_added)} = {error_rate}")
-    print(f"{estimated_error_rate=}")
-    print(f"{estimated_error_rate_beforehand=}")
-
-    assert (
-        abs(estimated_error_rate - error_rate) <= 0.05
-    )  # 5% absolute margin calculated experiementally
-
-
-if __name__ == "__main__":
-    test_probability()

From 2fd71965f12a08e6d0b8a4f526266e5f935270cc Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Fri, 7 Apr 2023 17:31:08 +0200
Subject: [PATCH 16/32] estimated error

---
 data_structures/hashing/bloom_filter.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index 8affaabe4948..586d7de301be 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -13,7 +13,7 @@
 '00000000'
 
 When an element is added, two bits are set to 1
-since there are 2 hash functions:
+since there are 2 hash functions in this implementation:
 >>> b.add("Titanic")
 >>> b.bitstring
 '01100000'
@@ -35,10 +35,16 @@
 '00011000'
 >>> "Interstellar" in b
 False
+>>> b.format_hash("Interstellar")
+'00000011'
 >>> "Parasite" in b
 False
+>>> b.format_hash("Parasite")
+'00010010'
 >>> "Pulp Fiction" in b
 False
+>>> b.format_hash("Pulp Fiction")
+'10000100'
 
 but sometimes there are false positives:
 >>> "Ratatouille" in b
@@ -46,8 +52,14 @@
 >>> b.format_hash("Ratatouille")
 '01100000'
 
+The probability increases with the number of added elements
 >>> b.estimated_error_rate()
 0.140625
+>>> b.add("The Goodfather")
+>>> b.estimated_error_rate()
+0.390625
+>>> b.bitstring
+'01111100'
 """
 from hashlib import md5, sha256
 

From 314237d94e7aee8a835cc97d9b2472a1ce1acc44 Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Fri, 7 Apr 2023 17:49:15 +0200
Subject: [PATCH 17/32] added types

---
 data_structures/hashing/bloom_filter.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index 586d7de301be..a822ed818250 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -72,14 +72,14 @@ def __init__(self, size: int = 8) -> None:
         self.size = size
 
     def add(self, value: str) -> None:
-        h = self.hash_(value)
+        h = self.hash(value)
         self.bitarray |= h
 
     def exists(self, value: str) -> bool:
-        h = self.hash_(value)
+        h = self.hash(value)
         return (h & self.bitarray) == h
 
-    def __contains__(self, other):
+    def __contains__(self, other: str) -> bool:
         return self.exists(other)
 
     def format_bin(self, bitarray: int) -> str:
@@ -87,10 +87,10 @@ def format_bin(self, bitarray: int) -> str:
         return res.zfill(self.size)
 
     @property
-    def bitstring(self):
+    def bitstring(self) -> None:
         return self.format_bin(self.bitarray)
 
-    def hash_(self, value: str) -> int:
+    def hash(self, value: str) -> int:
         res = 0b0
         for func in HASH_FUNCTIONS:
             b = func(value.encode()).digest()
@@ -99,9 +99,9 @@ def hash_(self, value: str) -> int:
         return res
 
     def format_hash(self, value: str) -> str:
-        return self.format_bin(self.hash_(value))
+        return self.format_bin(self.hash(value))
 
-    def estimated_error_rate(self):
+    def estimated_error_rate(self) -> float:
         n_ones = bin(self.bitarray).count("1")
         k = len(HASH_FUNCTIONS)
         return (n_ones / self.size) ** k

From 9b014721e492c31af31f0e27d4e94cd7062cde3a Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Fri, 7 Apr 2023 17:53:03 +0200
Subject: [PATCH 18/32] again hash_

---
 data_structures/hashing/bloom_filter.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index a822ed818250..51ba916a4788 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -72,11 +72,11 @@ def __init__(self, size: int = 8) -> None:
         self.size = size
 
     def add(self, value: str) -> None:
-        h = self.hash(value)
+        h = self.hash_(value)
         self.bitarray |= h
 
     def exists(self, value: str) -> bool:
-        h = self.hash(value)
+        h = self.hash_(value)
         return (h & self.bitarray) == h
 
     def __contains__(self, other: str) -> bool:
@@ -87,10 +87,10 @@ def format_bin(self, bitarray: int) -> str:
         return res.zfill(self.size)
 
     @property
-    def bitstring(self) -> None:
+    def bitstring(self) -> str:
         return self.format_bin(self.bitarray)
 
-    def hash(self, value: str) -> int:
+    def hash_(self, value: str) -> int:
         res = 0b0
         for func in HASH_FUNCTIONS:
             b = func(value.encode()).digest()
@@ -99,7 +99,7 @@ def hash(self, value: str) -> int:
         return res
 
     def format_hash(self, value: str) -> str:
-        return self.format_bin(self.hash(value))
+        return self.format_bin(self.hash_(value))
 
     def estimated_error_rate(self) -> float:
         n_ones = bin(self.bitarray).count("1")

From c132d501b61cf4f45dd8e7761d1320acf388d015 Mon Sep 17 00:00:00 2001
From: isidroas <isidroariass@hotmail.es>
Date: Sat, 8 Apr 2023 16:06:50 +0200
Subject: [PATCH 19/32] Update data_structures/hashing/bloom_filter.py

Co-authored-by: Christian Clauss <cclauss@me.com>
---
 data_structures/hashing/bloom_filter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index 51ba916a4788..de4f7dc23e64 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -2,7 +2,7 @@
 See https://en.wikipedia.org/wiki/Bloom_filter
 
 The use of this data structure is to test membership in a set.
-Compared to python built-in set() it is more space-efficent.
+Compared to Python's built-in set() it is more space-efficient.
 In the following example, only 8 bits of memory will be used:
 >>> b = Bloom(size=8)
 >>> "Titanic" in b

From 313c80c3694687e526205ab02b9c2583a98756fb Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Sat, 8 Apr 2023 16:09:04 +0200
Subject: [PATCH 20/32] from b to bloom

---
 data_structures/hashing/bloom_filter.py | 46 ++++++++++++-------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index de4f7dc23e64..150461a96460 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -4,61 +4,61 @@
 The use of this data structure is to test membership in a set.
 Compared to Python's built-in set() it is more space-efficient.
 In the following example, only 8 bits of memory will be used:
->>> b = Bloom(size=8)
->>> "Titanic" in b
+>>> bloom = Bloom(size=8)
+>>> "Titanic" in bloom
 False
 
 Initially the filter contains all zeros:
->>> b.bitstring
+>>> bloom.bitstring
 '00000000'
 
 When an element is added, two bits are set to 1
 since there are 2 hash functions in this implementation:
->>> b.add("Titanic")
->>> b.bitstring
+>>> bloom.add("Titanic")
+>>> bloom.bitstring
 '01100000'
->>> "Titanic" in b
+>>> "Titanic" in bloom
 True
 
 However, sometimes only one bit is added
 because both hash functions return the same value
->>> b.add("Avatar")
->>> b.format_hash("Avatar")
+>>> bloom.add("Avatar")
+>>> bloom.format_hash("Avatar")
 '00000100'
->>> b.bitstring
+>>> bloom.bitstring
 '01100100'
 
 Not added elements should return False ...
->>> "The Goodfather" in b
+>>> "The Goodfather" in bloom
 False
->>> b.format_hash("The Goodfather")
+>>> bloom.format_hash("The Goodfather")
 '00011000'
->>> "Interstellar" in b
+>>> "Interstellar" in bloom
 False
->>> b.format_hash("Interstellar")
+>>> bloom.format_hash("Interstellar")
 '00000011'
->>> "Parasite" in b
+>>> "Parasite" in bloom
 False
->>> b.format_hash("Parasite")
+>>> bloom.format_hash("Parasite")
 '00010010'
->>> "Pulp Fiction" in b
+>>> "Pulp Fiction" in bloom
 False
->>> b.format_hash("Pulp Fiction")
+>>> bloom.format_hash("Pulp Fiction")
 '10000100'
 
 but sometimes there are false positives:
->>> "Ratatouille" in b
+>>> "Ratatouille" in bloom
 True
->>> b.format_hash("Ratatouille")
+>>> bloom.format_hash("Ratatouille")
 '01100000'
 
 The probability increases with the number of added elements
->>> b.estimated_error_rate()
+>>> bloom.estimated_error_rate()
 0.140625
->>> b.add("The Goodfather")
->>> b.estimated_error_rate()
+>>> bloom.add("The Goodfather")
+>>> bloom.estimated_error_rate()
 0.390625
->>> b.bitstring
+>>> bloom.bitstring
 '01111100'
 """
 from hashlib import md5, sha256

From 18e0dde13bb2c258d90a33c441965ac323bd0cfd Mon Sep 17 00:00:00 2001
From: isidroas <isidroariass@hotmail.es>
Date: Sat, 8 Apr 2023 16:48:15 +0200
Subject: [PATCH 21/32] Update data_structures/hashing/bloom_filter.py

Co-authored-by: Christian Clauss <cclauss@me.com>
---
 data_structures/hashing/bloom_filter.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index 150461a96460..526833c014d6 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -5,15 +5,15 @@
 Compared to Python's built-in set() it is more space-efficient.
 In the following example, only 8 bits of memory will be used:
 >>> bloom = Bloom(size=8)
->>> "Titanic" in bloom
-False
 
-Initially the filter contains all zeros:
+Initially, the filter contains all zeros:
 >>> bloom.bitstring
 '00000000'
 
 When an element is added, two bits are set to 1
 since there are 2 hash functions in this implementation:
+>>> "Titanic" in bloom
+False
 >>> bloom.add("Titanic")
 >>> bloom.bitstring
 '01100000'

From 54041ff38c8d7302e2492281e1f18ceac28ad03d Mon Sep 17 00:00:00 2001
From: isidroas <isidroariass@hotmail.es>
Date: Sat, 8 Apr 2023 16:49:05 +0200
Subject: [PATCH 22/32] Update data_structures/hashing/bloom_filter.py

Co-authored-by: Christian Clauss <cclauss@me.com>
---
 data_structures/hashing/bloom_filter.py | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index 526833c014d6..fc599c3f8716 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -29,22 +29,11 @@
 '01100100'
 
 Not added elements should return False ...
->>> "The Goodfather" in bloom
+>>> not_present_films = ("The Goodfather", "Interstellar", "Parasite", "Pulp Fiction")
+>>> {film: bloom.format_hash(film) for film in not_present_films)} 
+{'The Goodfather': '00011000', 'Interstellar': '00000011', 'Parasite': '00010010': 'Pulp Fiction': '10000100'}
+>>> any(film in bloom for film in not_present_films)
 False
->>> bloom.format_hash("The Goodfather")
-'00011000'
->>> "Interstellar" in bloom
-False
->>> bloom.format_hash("Interstellar")
-'00000011'
->>> "Parasite" in bloom
-False
->>> bloom.format_hash("Parasite")
-'00010010'
->>> "Pulp Fiction" in bloom
-False
->>> bloom.format_hash("Pulp Fiction")
-'10000100'
 
 but sometimes there are false positives:
 >>> "Ratatouille" in bloom

From 483a2a0ab2964d2425f09cd2379de89a013fd627 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 8 Apr 2023 14:49:30 +0000
Subject: [PATCH 23/32] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 data_structures/hashing/bloom_filter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index fc599c3f8716..7ec5a4f35b62 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -30,7 +30,7 @@
 
 Not added elements should return False ...
 >>> not_present_films = ("The Goodfather", "Interstellar", "Parasite", "Pulp Fiction")
->>> {film: bloom.format_hash(film) for film in not_present_films)} 
+>>> {film: bloom.format_hash(film) for film in not_present_films)}
 {'The Goodfather': '00011000', 'Interstellar': '00000011', 'Parasite': '00010010': 'Pulp Fiction': '10000100'}
 >>> any(film in bloom for film in not_present_films)
 False

From 174ce08c731b4254a42f3ab5bea854c10c2f5caa Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Sat, 8 Apr 2023 17:01:16 +0200
Subject: [PATCH 24/32] syntax error in dict comprehension

---
 data_structures/hashing/bloom_filter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index 7ec5a4f35b62..a92c5d86b999 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -30,8 +30,8 @@
 
 Not added elements should return False ...
 >>> not_present_films = ("The Goodfather", "Interstellar", "Parasite", "Pulp Fiction")
->>> {film: bloom.format_hash(film) for film in not_present_films)}
-{'The Goodfather': '00011000', 'Interstellar': '00000011', 'Parasite': '00010010': 'Pulp Fiction': '10000100'}
+>>> {film: bloom.format_hash(film) for film in not_present_films}
+{'The Goodfather': '00011000', 'Interstellar': '00000011', 'Parasite': '00010010', 'Pulp Fiction': '10000100'}
 >>> any(film in bloom for film in not_present_films)
 False
 

From 00cc60e23aec5e97aa9b417733c0f47c196ac0bf Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Sat, 8 Apr 2023 17:03:08 +0200
Subject: [PATCH 25/32] from goodfather to godfather

---
 data_structures/hashing/bloom_filter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index a92c5d86b999..0ba1557e6dc7 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -29,9 +29,9 @@
 '01100100'
 
 Not added elements should return False ...
->>> not_present_films = ("The Goodfather", "Interstellar", "Parasite", "Pulp Fiction")
+>>> not_present_films = ("The Godfather", "Interstellar", "Parasite", "Pulp Fiction")
 >>> {film: bloom.format_hash(film) for film in not_present_films}
-{'The Goodfather': '00011000', 'Interstellar': '00000011', 'Parasite': '00010010', 'Pulp Fiction': '10000100'}
+{'The Godfather': '00000101', 'Interstellar': '00000011', 'Parasite': '00010010', 'Pulp Fiction': '10000100'}
 >>> any(film in bloom for film in not_present_films)
 False
 

From 35fa5f5c4bf101d073aad43c37b0a423d8975071 Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Sat, 8 Apr 2023 17:20:08 +0200
Subject: [PATCH 26/32] removed Interestellar

---
 data_structures/hashing/bloom_filter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index 0ba1557e6dc7..a659fccf7f86 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -29,9 +29,9 @@
 '01100100'
 
 Not added elements should return False ...
->>> not_present_films = ("The Godfather", "Interstellar", "Parasite", "Pulp Fiction")
+>>> not_present_films = ("The Godfather", "Parasite", "Pulp Fiction")
 >>> {film: bloom.format_hash(film) for film in not_present_films}
-{'The Godfather': '00000101', 'Interstellar': '00000011', 'Parasite': '00010010', 'Pulp Fiction': '10000100'}
+{'The Godfather': '00000101', 'Parasite': '00010010', 'Pulp Fiction': '10000100'}
 >>> any(film in bloom for film in not_present_films)
 False
 

From 5cd20ea9976390b46f3784421e21ed63c4f66575 Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Sat, 8 Apr 2023 17:33:51 +0200
Subject: [PATCH 27/32] forgot the last Godfather

---
 data_structures/hashing/bloom_filter.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index a659fccf7f86..c56dd55e5d1f 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -44,11 +44,11 @@
 The probability increases with the number of added elements
 >>> bloom.estimated_error_rate()
 0.140625
->>> bloom.add("The Goodfather")
+>>> bloom.add("The Godfather")
 >>> bloom.estimated_error_rate()
-0.390625
+0.25
 >>> bloom.bitstring
-'01111100'
+'01100101'
 """
 from hashlib import md5, sha256
 

From 7617143cbf56918fd4d1f3a83e3450f909008fb1 Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Sat, 8 Apr 2023 18:42:58 +0200
Subject: [PATCH 28/32] Revert "removed Interestellar"

This reverts commit 35fa5f5c4bf101d073aad43c37b0a423d8975071.
---
 data_structures/hashing/bloom_filter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index c56dd55e5d1f..8b0bfa86a159 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -29,9 +29,9 @@
 '01100100'
 
 Not added elements should return False ...
->>> not_present_films = ("The Godfather", "Parasite", "Pulp Fiction")
+>>> not_present_films = ("The Godfather", "Interstellar", "Parasite", "Pulp Fiction")
 >>> {film: bloom.format_hash(film) for film in not_present_films}
-{'The Godfather': '00000101', 'Parasite': '00010010', 'Pulp Fiction': '10000100'}
+{'The Godfather': '00000101', 'Interstellar': '00000011', 'Parasite': '00010010', 'Pulp Fiction': '10000100'}
 >>> any(film in bloom for film in not_present_films)
 False
 

From 799171a27316d36a7736a5646304bf0bb9795d12 Mon Sep 17 00:00:00 2001
From: Isidro Arias <isidroariass@hotmail.es>
Date: Sat, 8 Apr 2023 18:54:01 +0200
Subject: [PATCH 29/32] pretty dict

---
 data_structures/hashing/bloom_filter.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index 8b0bfa86a159..ec784aff13e8 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -30,8 +30,14 @@
 
 Not added elements should return False ...
 >>> not_present_films = ("The Godfather", "Interstellar", "Parasite", "Pulp Fiction")
->>> {film: bloom.format_hash(film) for film in not_present_films}
-{'The Godfather': '00000101', 'Interstellar': '00000011', 'Parasite': '00010010', 'Pulp Fiction': '10000100'}
+>>> {
+...   film: bloom.format_hash(film)
+...   for film in not_present_films
+... } # doctest: +NORMALIZE_WHITESPACE
+{'The Godfather': '00000101',
+ 'Interstellar': '00000011',
+ 'Parasite': '00010010',
+ 'Pulp Fiction': '10000100'}
 >>> any(film in bloom for film in not_present_films)
 False
 

From 1a71f4cb6aa22fe587c364fad84808bdd760dd12 Mon Sep 17 00:00:00 2001
From: Christian Clauss <cclauss@me.com>
Date: Sat, 8 Apr 2023 19:25:59 +0200
Subject: [PATCH 30/32] Apply suggestions from code review

---
 data_structures/hashing/bloom_filter.py | 28 +++++++++++++------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index ec784aff13e8..eab8de643b87 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -23,6 +23,8 @@
 However, sometimes only one bit is added
 because both hash functions return the same value
 >>> bloom.add("Avatar")
+>>> "Avatar" in bloom
+True
 >>> bloom.format_hash("Avatar")
 '00000100'
 >>> bloom.bitstring
@@ -31,13 +33,13 @@
 Not added elements should return False ...
 >>> not_present_films = ("The Godfather", "Interstellar", "Parasite", "Pulp Fiction")
 >>> {
-...   film: bloom.format_hash(film)
-...   for film in not_present_films
+...   film: bloom.format_hash(film) for film in not_present_films
 ... } # doctest: +NORMALIZE_WHITESPACE
-{'The Godfather': '00000101',
- 'Interstellar': '00000011',
- 'Parasite': '00010010',
- 'Pulp Fiction': '10000100'}
+{
+    'The Godfather': '00000101',
+    'Interstellar': '00000011',
+    'Parasite': '00010010',
+    'Pulp Fiction': '10000100'}
 >>> any(film in bloom for film in not_present_films)
 False
 
@@ -47,11 +49,12 @@
 >>> bloom.format_hash("Ratatouille")
 '01100000'
 
-The probability increases with the number of added elements
->>> bloom.estimated_error_rate()
+The probability increases with the number of elements added.
+The probability decreases with the number of bits in the bitarray.
+>>> bloom.estimated_error_rate
 0.140625
 >>> bloom.add("The Godfather")
->>> bloom.estimated_error_rate()
+>>> bloom.estimated_error_rate
 0.25
 >>> bloom.bitstring
 '01100101'
@@ -88,15 +91,14 @@ def bitstring(self) -> str:
     def hash_(self, value: str) -> int:
         res = 0b0
         for func in HASH_FUNCTIONS:
-            b = func(value.encode()).digest()
-            position = int.from_bytes(b, "little") % self.size
+            position = int.from_bytes(func(value.encode()).digest(), "little") % self.size
             res |= 2**position
         return res
 
     def format_hash(self, value: str) -> str:
         return self.format_bin(self.hash_(value))
 
+    @property
     def estimated_error_rate(self) -> float:
         n_ones = bin(self.bitarray).count("1")
-        k = len(HASH_FUNCTIONS)
-        return (n_ones / self.size) ** k
+        return (n_ones / self.size) ** len(HASH_FUNCTIONS)

From 4e0263f9a57d67bd1c1c630694d2cdd08262686c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 8 Apr 2023 17:26:23 +0000
Subject: [PATCH 31/32] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 data_structures/hashing/bloom_filter.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index eab8de643b87..39455b93c55f 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -91,7 +91,9 @@ def bitstring(self) -> str:
     def hash_(self, value: str) -> int:
         res = 0b0
         for func in HASH_FUNCTIONS:
-            position = int.from_bytes(func(value.encode()).digest(), "little") % self.size
+            position = (
+                int.from_bytes(func(value.encode()).digest(), "little") % self.size
+            )
             res |= 2**position
         return res
 

From e74674605dc20c6a3ac876566cc72774cf857cde Mon Sep 17 00:00:00 2001
From: Christian Clauss <cclauss@me.com>
Date: Sat, 8 Apr 2023 19:34:56 +0200
Subject: [PATCH 32/32] Update bloom_filter.py

---
 data_structures/hashing/bloom_filter.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/data_structures/hashing/bloom_filter.py b/data_structures/hashing/bloom_filter.py
index 39455b93c55f..7fd0985bdc33 100644
--- a/data_structures/hashing/bloom_filter.py
+++ b/data_structures/hashing/bloom_filter.py
@@ -35,11 +35,10 @@
 >>> {
 ...   film: bloom.format_hash(film) for film in not_present_films
 ... } # doctest: +NORMALIZE_WHITESPACE
-{
-    'The Godfather': '00000101',
-    'Interstellar': '00000011',
-    'Parasite': '00010010',
-    'Pulp Fiction': '10000100'}
+{'The Godfather': '00000101',
+ 'Interstellar': '00000011',
+ 'Parasite': '00010010',
+ 'Pulp Fiction': '10000100'}
 >>> any(film in bloom for film in not_present_films)
 False