Add hash map #7967

Merged: 1 commit, merged on Mar 14, 2023
162 changes: 162 additions & 0 deletions data_structures/hashing/hash_map.py
@@ -0,0 +1,162 @@
"""
Hash map with open addressing.

https://en.wikipedia.org/wiki/Hash_table

For a good explanation of another hash map implementation, see
"Modern Dictionaries" by Raymond Hettinger:
https://www.youtube.com/watch?v=p33CVV29OG8
"""
from collections.abc import Iterator, MutableMapping
from dataclasses import dataclass
from typing import Generic, TypeVar

KEY = TypeVar("KEY")
VAL = TypeVar("VAL")


@dataclass(frozen=True, slots=True)
class _Item(Generic[KEY, VAL]):
key: KEY
val: VAL


class _DeletedItem(_Item):
def __init__(self) -> None:
super().__init__(None, None)

def __bool__(self) -> bool:
return False


# Sentinel "tombstone" for deleted slots: it is falsy, so _try_set treats it as
# free for insertion, while lookups that check "item is _deleted" keep probing.
_deleted = _DeletedItem()


class HashMap(MutableMapping[KEY, VAL]):
"""
Hash map with open addressing.
"""

def __init__(
self, initial_block_size: int = 8, capacity_factor: float = 0.75
) -> None:
self._initial_block_size = initial_block_size
self._buckets: list[_Item | None] = [None] * initial_block_size
assert 0.0 < capacity_factor < 1.0
self._capacity_factor = capacity_factor
self._len = 0

def _get_bucket_index(self, key: KEY) -> int:
return hash(key) % len(self._buckets)

def _get_next_ind(self, ind: int) -> int:
"""
Get next index.

Implements linear open addressing.
"""
return (ind + 1) % len(self._buckets)

def _try_set(self, ind: int, key: KEY, val: VAL) -> bool:
"""
Try to add value to the bucket.

        If the bucket is empty or already holds the same key, perform the
        insert and return True.

        If the bucket holds another key or a deleted placeholder, return
        False so the caller probes the next bucket.
"""
stored = self._buckets[ind]
if not stored:
self._buckets[ind] = _Item(key, val)
self._len += 1
return True
elif stored.key == key:
self._buckets[ind] = _Item(key, val)
return True
else:
return False

def _is_full(self) -> bool:
"""
        Return True if we have reached the safe capacity threshold,
        meaning the number of buckets must grow to keep collisions rare.
"""
limit = len(self._buckets) * self._capacity_factor
return len(self) >= int(limit)

def _is_sparse(self) -> bool:
"""Return true if we need twice fewer buckets when we have now."""
if len(self._buckets) <= self._initial_block_size:
return False
limit = len(self._buckets) * self._capacity_factor / 2
return len(self) < limit

def _resize(self, new_size: int) -> None:
old_buckets = self._buckets
self._buckets = [None] * new_size
self._len = 0
for item in old_buckets:
            if item:  # skip empty buckets and falsy _deleted tombstones
self._add_item(item.key, item.val)

def _size_up(self) -> None:
self._resize(len(self._buckets) * 2)

def _size_down(self) -> None:
self._resize(len(self._buckets) // 2)

def _iterate_buckets(self, key: KEY) -> Iterator[int]:
ind = self._get_bucket_index(key)
for _ in range(len(self._buckets)):
yield ind
ind = self._get_next_ind(ind)

def _add_item(self, key: KEY, val: VAL) -> None:
for ind in self._iterate_buckets(key):
if self._try_set(ind, key, val):
break

def __setitem__(self, key: KEY, val: VAL) -> None:
if self._is_full():
self._size_up()

self._add_item(key, val)

def __delitem__(self, key: KEY) -> None:
for ind in self._iterate_buckets(key):
item = self._buckets[ind]
if item is None:
raise KeyError(key)
if item is _deleted:
continue
if item.key == key:
self._buckets[ind] = _deleted
self._len -= 1
break
if self._is_sparse():
self._size_down()

def __getitem__(self, key: KEY) -> VAL:
for ind in self._iterate_buckets(key):
item = self._buckets[ind]
if item is None:
break
if item is _deleted:
continue
if item.key == key:
return item.val
raise KeyError(key)

def __len__(self) -> int:
return self._len

def __iter__(self) -> Iterator[KEY]:
yield from (item.key for item in self._buckets if item)

def __repr__(self) -> str:
        val_string = ", ".join(
f"{item.key}: {item.val}" for item in self._buckets if item
)
return f"HashMap({val_string})"
97 changes: 97 additions & 0 deletions data_structures/hashing/tests/test_hash_map.py
@@ -0,0 +1,97 @@
from operator import delitem, getitem, setitem

import pytest

from data_structures.hashing.hash_map import HashMap


def _get(k):
return getitem, k


def _set(k, v):
return setitem, k, v


def _del(k):
return delitem, k


def _run_operation(obj, fun, *args):
try:
return fun(obj, *args), None
except Exception as e:
return None, e


_add_items = (
_set("key_a", "val_a"),
_set("key_b", "val_b"),
)

_overwrite_items = [
_set("key_a", "val_a"),
_set("key_a", "val_b"),
]

_delete_items = [
_set("key_a", "val_a"),
_set("key_b", "val_b"),
_del("key_a"),
_del("key_b"),
_set("key_a", "val_a"),
_del("key_a"),
]

_access_absent_items = [
_get("key_a"),
_del("key_a"),
_set("key_a", "val_a"),
_del("key_a"),
_del("key_a"),
_get("key_a"),
]

_add_with_resize_up = [
*[_set(x, x) for x in range(5)], # guaranteed upsize
]

_add_with_resize_down = [
*[_set(x, x) for x in range(5)], # guaranteed upsize
    *[_del(x) for x in range(5)],  # then empty it out to trigger a downsize
_set("key_a", "val_b"),
]


@pytest.mark.parametrize(
"operations",
(
pytest.param(_add_items, id="add items"),
pytest.param(_overwrite_items, id="overwrite items"),
pytest.param(_delete_items, id="delete items"),
pytest.param(_access_absent_items, id="access absent items"),
pytest.param(_add_with_resize_up, id="add with resize up"),
pytest.param(_add_with_resize_down, id="add with resize down"),
),
)
def test_hash_map_is_the_same_as_dict(operations):
my = HashMap(initial_block_size=4)
py = {}
    for fun, *args in operations:
my_res, my_exc = _run_operation(my, fun, *args)
py_res, py_exc = _run_operation(py, fun, *args)
assert my_res == py_res
assert str(my_exc) == str(py_exc)
assert set(py) == set(my)
assert len(py) == len(my)
assert set(my.items()) == set(py.items())


def test_no_new_methods_was_added_to_api():
def is_public(name: str) -> bool:
return not name.startswith("_")

dict_public_names = {name for name in dir({}) if is_public(name)}
hash_public_names = {name for name in dir(HashMap()) if is_public(name)}

assert dict_public_names > hash_public_names
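
The comparison harness above works because _run_operation returns a (result, exception) pair for both containers, so ordinary return values and raised KeyErrors flow through the same assertions. To run just this file, the standard pytest invocation from the repository root should be enough, assuming pytest is installed:

python -m pytest data_structures/hashing/tests/test_hash_map.py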