BUG: fixing mixup of bucket and element counts in hash tables (#39457)

realead · web-flow · commit 8f148363a61f · 2021-02-01T20:59:49.000-05:00
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
@@ -19,6 +19,7 @@ from pandas._libs.khash cimport (
     are_equivalent_float64_t,
     are_equivalent_khcomplex64_t,
     are_equivalent_khcomplex128_t,
+    kh_needed_n_buckets,
     kh_str_t,
     khcomplex64_t,
     khcomplex128_t,
@@ -152,7 +153,7 @@ def unique_label_indices(const int64_t[:] labels):
         ndarray[int64_t, ndim=1] arr
         Int64VectorData *ud = idx.data
 
-    kh_resize_int64(table, min(n, SIZE_HINT_LIMIT))
+    kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
 
     with nogil:
         for i in range(n):
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -392,9 +392,8 @@ cdef class {{name}}HashTable(HashTable):
 
     def __cinit__(self, int64_t size_hint=1):
         self.table = kh_init_{{dtype}}()
-        if size_hint is not None:
-            size_hint = min(size_hint, SIZE_HINT_LIMIT)
-            kh_resize_{{dtype}}(self.table, size_hint)
+        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
+        kh_resize_{{dtype}}(self.table, size_hint)
 
     def __len__(self) -> int:
         return self.table.size
@@ -420,6 +419,15 @@ cdef class {{name}}HashTable(HashTable):
                                              sizeof(Py_ssize_t))   # vals
         return overhead + for_flags + for_pairs
 
+    def get_state(self):
+        """ returns infos about the state of the hashtable"""
+        return {
+            'n_buckets' : self.table.n_buckets,
+            'size' : self.table.size,
+            'n_occupied' : self.table.n_occupied,
+            'upper_bound' : self.table.upper_bound,
+        }
+
     cpdef get_item(self, {{dtype}}_t val):
         cdef:
             khiter_t k
@@ -731,9 +739,8 @@ cdef class StringHashTable(HashTable):
 
     def __init__(self, int64_t size_hint=1):
         self.table = kh_init_str()
-        if size_hint is not None:
-            size_hint = min(size_hint, SIZE_HINT_LIMIT)
-            kh_resize_str(self.table, size_hint)
+        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
+        kh_resize_str(self.table, size_hint)
 
     def __dealloc__(self):
         if self.table is not NULL:
@@ -747,6 +754,15 @@ cdef class StringHashTable(HashTable):
                                              sizeof(Py_ssize_t))   # vals
         return overhead + for_flags + for_pairs
 
+    def get_state(self):
+        """ returns infos about the state of the hashtable"""
+        return {
+            'n_buckets' : self.table.n_buckets,
+            'size' : self.table.size,
+            'n_occupied' : self.table.n_occupied,
+            'upper_bound' : self.table.upper_bound,
+        }
+
     cpdef get_item(self, str val):
         cdef:
             khiter_t k
@@ -1044,9 +1060,8 @@ cdef class PyObjectHashTable(HashTable):
 
     def __init__(self, int64_t size_hint=1):
         self.table = kh_init_pymap()
-        if size_hint is not None:
-            size_hint = min(size_hint, SIZE_HINT_LIMIT)
-            kh_resize_pymap(self.table, size_hint)
+        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
+        kh_resize_pymap(self.table, size_hint)
 
     def __dealloc__(self):
         if self.table is not NULL:
@@ -1072,6 +1087,18 @@ cdef class PyObjectHashTable(HashTable):
                                              sizeof(Py_ssize_t))   # vals
         return overhead + for_flags + for_pairs
 
+    def get_state(self):
+        """
+        returns infos about the current state of the hashtable like size,
+        number of buckets and so on.
+        """
+        return {
+            'n_buckets' : self.table.n_buckets,
+            'size' : self.table.size,
+            'n_occupied' : self.table.n_occupied,
+            'upper_bound' : self.table.upper_bound,
+        }
+
     cpdef get_item(self, object val):
         cdef:
             khiter_t k
diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -121,7 +121,7 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
         kh_{{ttype}}_t *table = kh_init_{{ttype}}()
         ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
 
-    kh_resize_{{ttype}}(table, min(n, SIZE_HINT_LIMIT))
+    kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
 
     if keep not in ('last', 'first', False):
         raise ValueError('keep must be either "first", "last" or False')
diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd
@@ -120,4 +120,7 @@ cdef extern from "khash_python.h":
 
     bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil
 
+    khuint_t kh_needed_n_buckets(khuint_t element_n) nogil
+
+
 include "khash_for_primitive_helper.pxi"
diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h
@@ -244,3 +244,13 @@ void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) {
 void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khuint_t val) {
 	kh_resize_str(table->table, val);
 }
+
+// utility function: given the number of elements
+// returns number of necessary buckets
+khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements){
+    khuint_t candidate = n_elements;
+    kroundup32(candidate);
+    khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5);
+    return (upper_bound < n_elements) ? 2*candidate : candidate;
+
+}
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5465,15 +5465,13 @@ def duplicated(
         4     True
         dtype: bool
         """
-        from pandas._libs.hashtable import SIZE_HINT_LIMIT, duplicated_int64
+        from pandas._libs.hashtable import duplicated_int64
 
         if self.empty:
             return self._constructor_sliced(dtype=bool)
 
         def f(vals):
-            labels, shape = algorithms.factorize(
-                vals, size_hint=min(len(self), SIZE_HINT_LIMIT)
-            )
+            labels, shape = algorithms.factorize(vals, size_hint=len(self))
             return labels.astype("i8", copy=False), len(shape)
 
         if subset is None:
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
@@ -593,7 +593,7 @@ def compress_group_index(group_index, sort: bool = True):
     space can be huge, so this function compresses it, by computing offsets
     (comp_ids) into the list of unique labels (obs_group_ids).
     """
-    size_hint = min(len(group_index), hashtable.SIZE_HINT_LIMIT)
+    size_hint = len(group_index)
     table = hashtable.Int64HashTable(size_hint)
 
     group_index = ensure_int64(group_index)
diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
@@ -155,6 +155,28 @@ def test_tracemalloc_for_empty(self, table_type, dtype):
             del table
             assert get_allocated_khash_memory() == 0
 
+    def test_get_state(self, table_type, dtype):
+        table = table_type(1000)
+        state = table.get_state()
+        assert state["size"] == 0
+        assert state["n_occupied"] == 0
+        assert "n_buckets" in state
+        assert "upper_bound" in state
+
+    def test_no_reallocation(self, table_type, dtype):
+        for N in range(1, 110):
+            keys = np.arange(N).astype(dtype)
+            preallocated_table = table_type(N)
+            n_buckets_start = preallocated_table.get_state()["n_buckets"]
+            preallocated_table.map_locations(keys)
+            n_buckets_end = preallocated_table.get_state()["n_buckets"]
+            # orgininal number of buckets was enough:
+            assert n_buckets_start == n_buckets_end
+            # check with clean table (not too much preallocated)
+            clean_table = table_type()
+            clean_table.map_locations(keys)
+            assert n_buckets_start == clean_table.get_state()["n_buckets"]
+
 
 def test_get_labels_groupby_for_Int64(writable):
     table = ht.Int64HashTable()
@@ -190,6 +212,21 @@ def test_tracemalloc_for_empty_StringHashTable():
         assert get_allocated_khash_memory() == 0
 
 
+def test_no_reallocation_StringHashTable():
+    for N in range(1, 110):
+        keys = np.arange(N).astype(np.compat.unicode).astype(np.object_)
+        preallocated_table = ht.StringHashTable(N)
+        n_buckets_start = preallocated_table.get_state()["n_buckets"]
+        preallocated_table.map_locations(keys)
+        n_buckets_end = preallocated_table.get_state()["n_buckets"]
+        # orgininal number of buckets was enough:
+        assert n_buckets_start == n_buckets_end
+        # check with clean table (not too much preallocated)
+        clean_table = ht.StringHashTable()
+        clean_table.map_locations(keys)
+        assert n_buckets_start == clean_table.get_state()["n_buckets"]
+
+
 @pytest.mark.parametrize(
     "table_type, dtype",
     [