Commit b797e43

Add hashmap implementation (TheAlgorithms#7967)
1 parent 8959211 commit b797e43

File tree

2 files changed: +259 −0 lines changed


Diff for: data_structures/hashing/hash_map.py

+162 lines
"""
Hash map with open addressing.

https://en.wikipedia.org/wiki/Hash_table

Another hash map implementation, with a good explanation:
Modern Dictionaries by Raymond Hettinger
https://www.youtube.com/watch?v=p33CVV29OG8
"""
from collections.abc import Iterator, MutableMapping
from dataclasses import dataclass
from typing import Generic, TypeVar

KEY = TypeVar("KEY")
VAL = TypeVar("VAL")


@dataclass(frozen=True, slots=True)
class _Item(Generic[KEY, VAL]):
    key: KEY
    val: VAL


class _DeletedItem(_Item):
    def __init__(self) -> None:
        super().__init__(None, None)

    def __bool__(self) -> bool:
        return False


# Tombstone marking a deleted slot: probing skips it instead of stopping,
# and because it is falsy, _try_set can reuse the slot for new inserts.
_deleted = _DeletedItem()


class HashMap(MutableMapping[KEY, VAL]):
    """
    Hash map with open addressing.
    """

    def __init__(
        self, initial_block_size: int = 8, capacity_factor: float = 0.75
    ) -> None:
        self._initial_block_size = initial_block_size
        self._buckets: list[_Item | None] = [None] * initial_block_size
        assert 0.0 < capacity_factor < 1.0
        self._capacity_factor = capacity_factor
        self._len = 0

    def _get_bucket_index(self, key: KEY) -> int:
        return hash(key) % len(self._buckets)

    def _get_next_ind(self, ind: int) -> int:
        """
        Get the next index.

        Implements linear open addressing.
        """
        return (ind + 1) % len(self._buckets)

    def _try_set(self, ind: int, key: KEY, val: VAL) -> bool:
        """
        Try to add the value to the bucket.

        If the bucket is empty or holds the same key, insert and return True.

        If the bucket holds another key or a deleted placeholder,
        the caller needs to check the next bucket.
        """
        stored = self._buckets[ind]
        if not stored:
            self._buckets[ind] = _Item(key, val)
            self._len += 1
            return True
        elif stored.key == key:
            self._buckets[ind] = _Item(key, val)
            return True
        else:
            return False

    def _is_full(self) -> bool:
        """
        Return True if we have reached the safe capacity,
        so we need to increase the number of buckets to avoid collisions.
        """
        limit = len(self._buckets) * self._capacity_factor
        return len(self) >= int(limit)

    def _is_sparse(self) -> bool:
        """Return True if we need only half as many buckets as we have now."""
        if len(self._buckets) <= self._initial_block_size:
            return False
        limit = len(self._buckets) * self._capacity_factor / 2
        return len(self) < limit

    def _resize(self, new_size: int) -> None:
        old_buckets = self._buckets
        self._buckets = [None] * new_size
        self._len = 0
        for item in old_buckets:
            if item:
                self._add_item(item.key, item.val)

    def _size_up(self) -> None:
        self._resize(len(self._buckets) * 2)

    def _size_down(self) -> None:
        self._resize(len(self._buckets) // 2)

    def _iterate_buckets(self, key: KEY) -> Iterator[int]:
        # Yield at most one full cycle of candidate bucket indices for the key.
        ind = self._get_bucket_index(key)
        for _ in range(len(self._buckets)):
            yield ind
            ind = self._get_next_ind(ind)

    def _add_item(self, key: KEY, val: VAL) -> None:
        for ind in self._iterate_buckets(key):
            if self._try_set(ind, key, val):
                break

    def __setitem__(self, key: KEY, val: VAL) -> None:
        if self._is_full():
            self._size_up()

        self._add_item(key, val)

    def __delitem__(self, key: KEY) -> None:
        for ind in self._iterate_buckets(key):
            item = self._buckets[ind]
            if item is None:
                raise KeyError(key)
            if item is _deleted:
                continue
            if item.key == key:
                self._buckets[ind] = _deleted
                self._len -= 1
                break
        if self._is_sparse():
            self._size_down()

    def __getitem__(self, key: KEY) -> VAL:
        for ind in self._iterate_buckets(key):
            item = self._buckets[ind]
            if item is None:
                break
            if item is _deleted:
                continue
            if item.key == key:
                return item.val
        raise KeyError(key)

    def __len__(self) -> int:
        return self._len

    def __iter__(self) -> Iterator[KEY]:
        yield from (item.key for item in self._buckets if item)

    def __repr__(self) -> str:
        val_string = ", ".join(
            f"{item.key}: {item.val}" for item in self._buckets if item
        )
        return f"HashMap({val_string})"

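Continuing the sketch above, the linear-probing and tombstone behaviour described in _get_next_ind and _try_set can be observed directly: in a 4-bucket table the integer keys 0 and 4 map to the same bucket (CPython hashes small ints to themselves), so the second insert probes on to the next slot, and deleting the first key leaves the _deleted placeholder so the chain to the second key stays intact. Hypothetical values, assuming CPython's integer hashing:

    colliding = HashMap(initial_block_size=4)
    colliding[0] = "first"            # bucket hash(0) % 4 == 0
    colliding[4] = "second"           # also bucket 0; _get_next_ind() moves it to bucket 1
    del colliding[0]                  # bucket 0 now holds the _deleted tombstone
    assert colliding[4] == "second"   # lookup skips the tombstone instead of stopping
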
Diff for: data_structures/hashing/tests/test_hash_map.py

+97 lines
from operator import delitem, getitem, setitem

import pytest

from data_structures.hashing.hash_map import HashMap


def _get(k):
    return getitem, k


def _set(k, v):
    return setitem, k, v


def _del(k):
    return delitem, k


def _run_operation(obj, fun, *args):
    try:
        return fun(obj, *args), None
    except Exception as e:
        return None, e


_add_items = (
    _set("key_a", "val_a"),
    _set("key_b", "val_b"),
)

_overwrite_items = [
    _set("key_a", "val_a"),
    _set("key_a", "val_b"),
]

_delete_items = [
    _set("key_a", "val_a"),
    _set("key_b", "val_b"),
    _del("key_a"),
    _del("key_b"),
    _set("key_a", "val_a"),
    _del("key_a"),
]

_access_absent_items = [
    _get("key_a"),
    _del("key_a"),
    _set("key_a", "val_a"),
    _del("key_a"),
    _del("key_a"),
    _get("key_a"),
]

_add_with_resize_up = [
    *[_set(x, x) for x in range(5)],  # guaranteed upsize
]

_add_with_resize_down = [
    *[_set(x, x) for x in range(5)],  # guaranteed upsize
    *[_del(x) for x in range(5)],
    _set("key_a", "val_b"),
]


@pytest.mark.parametrize(
    "operations",
    (
        pytest.param(_add_items, id="add items"),
        pytest.param(_overwrite_items, id="overwrite items"),
        pytest.param(_delete_items, id="delete items"),
        pytest.param(_access_absent_items, id="access absent items"),
        pytest.param(_add_with_resize_up, id="add with resize up"),
        pytest.param(_add_with_resize_down, id="add with resize down"),
    ),
)
def test_hash_map_is_the_same_as_dict(operations):
    my = HashMap(initial_block_size=4)
    py = {}
    for fun, *args in operations:
        my_res, my_exc = _run_operation(my, fun, *args)
        py_res, py_exc = _run_operation(py, fun, *args)
        assert my_res == py_res
        assert str(my_exc) == str(py_exc)
        assert set(py) == set(my)
        assert len(py) == len(my)
        assert set(my.items()) == set(py.items())


def test_no_new_methods_was_added_to_api():
    def is_public(name: str) -> bool:
        return not name.startswith("_")

    dict_public_names = {name for name in dir({}) if is_public(name)}
    hash_public_names = {name for name in dir(HashMap()) if is_public(name)}

    assert dict_public_names > hash_public_names
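
Assuming pytest is installed and the command is run from the repository root, python -m pytest data_structures/hashing/tests/test_hash_map.py should exercise all six parametrized cases. The dict-mirroring idea behind test_hash_map_is_the_same_as_dict can also be sketched without pytest (hypothetical values, not part of the commit):

    from data_structures.hashing.hash_map import HashMap

    hm, py = HashMap(initial_block_size=4), {}
    for key in range(5):        # enough inserts to force at least one _size_up()
        hm[key] = py[key] = key * 10
    for key in range(5):        # enough deletes to trigger _size_down()
        del hm[key]
        del py[key]
    assert set(hm.items()) == set(py.items()) == set()
    assert len(hm) == len(py) == 0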
