Skip to content

Commit 8f14836

Browse files
authored
BUG: fixing mixup of bucket and element counts in hash tables (#39457)
1 parent fc5a1da commit 8f14836

File tree

8 files changed

+92
-16
lines changed

8 files changed

+92
-16
lines changed

pandas/_libs/hashtable.pyx

+2-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ from pandas._libs.khash cimport (
1919
are_equivalent_float64_t,
2020
are_equivalent_khcomplex64_t,
2121
are_equivalent_khcomplex128_t,
22+
kh_needed_n_buckets,
2223
kh_str_t,
2324
khcomplex64_t,
2425
khcomplex128_t,
@@ -152,7 +153,7 @@ def unique_label_indices(const int64_t[:] labels):
152153
ndarray[int64_t, ndim=1] arr
153154
Int64VectorData *ud = idx.data
154155

155-
kh_resize_int64(table, min(n, SIZE_HINT_LIMIT))
156+
kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
156157

157158
with nogil:
158159
for i in range(n):

pandas/_libs/hashtable_class_helper.pxi.in

+36-9
Original file line numberDiff line numberDiff line change
@@ -392,9 +392,8 @@ cdef class {{name}}HashTable(HashTable):
392392

393393
def __cinit__(self, int64_t size_hint=1):
394394
self.table = kh_init_{{dtype}}()
395-
if size_hint is not None:
396-
size_hint = min(size_hint, SIZE_HINT_LIMIT)
397-
kh_resize_{{dtype}}(self.table, size_hint)
395+
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
396+
kh_resize_{{dtype}}(self.table, size_hint)
398397

399398
def __len__(self) -> int:
400399
return self.table.size
@@ -420,6 +419,15 @@ cdef class {{name}}HashTable(HashTable):
420419
sizeof(Py_ssize_t)) # vals
421420
return overhead + for_flags + for_pairs
422421

422+
def get_state(self):
423+
""" returns infos about the state of the hashtable"""
424+
return {
425+
'n_buckets' : self.table.n_buckets,
426+
'size' : self.table.size,
427+
'n_occupied' : self.table.n_occupied,
428+
'upper_bound' : self.table.upper_bound,
429+
}
430+
423431
cpdef get_item(self, {{dtype}}_t val):
424432
cdef:
425433
khiter_t k
@@ -731,9 +739,8 @@ cdef class StringHashTable(HashTable):
731739

732740
def __init__(self, int64_t size_hint=1):
733741
self.table = kh_init_str()
734-
if size_hint is not None:
735-
size_hint = min(size_hint, SIZE_HINT_LIMIT)
736-
kh_resize_str(self.table, size_hint)
742+
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
743+
kh_resize_str(self.table, size_hint)
737744

738745
def __dealloc__(self):
739746
if self.table is not NULL:
@@ -747,6 +754,15 @@ cdef class StringHashTable(HashTable):
747754
sizeof(Py_ssize_t)) # vals
748755
return overhead + for_flags + for_pairs
749756

757+
def get_state(self):
758+
""" returns infos about the state of the hashtable"""
759+
return {
760+
'n_buckets' : self.table.n_buckets,
761+
'size' : self.table.size,
762+
'n_occupied' : self.table.n_occupied,
763+
'upper_bound' : self.table.upper_bound,
764+
}
765+
750766
cpdef get_item(self, str val):
751767
cdef:
752768
khiter_t k
@@ -1044,9 +1060,8 @@ cdef class PyObjectHashTable(HashTable):
10441060

10451061
def __init__(self, int64_t size_hint=1):
10461062
self.table = kh_init_pymap()
1047-
if size_hint is not None:
1048-
size_hint = min(size_hint, SIZE_HINT_LIMIT)
1049-
kh_resize_pymap(self.table, size_hint)
1063+
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
1064+
kh_resize_pymap(self.table, size_hint)
10501065

10511066
def __dealloc__(self):
10521067
if self.table is not NULL:
@@ -1072,6 +1087,18 @@ cdef class PyObjectHashTable(HashTable):
10721087
sizeof(Py_ssize_t)) # vals
10731088
return overhead + for_flags + for_pairs
10741089

1090+
def get_state(self):
1091+
"""
1092+
returns infos about the current state of the hashtable like size,
1093+
number of buckets and so on.
1094+
"""
1095+
return {
1096+
'n_buckets' : self.table.n_buckets,
1097+
'size' : self.table.size,
1098+
'n_occupied' : self.table.n_occupied,
1099+
'upper_bound' : self.table.upper_bound,
1100+
}
1101+
10751102
cpdef get_item(self, object val):
10761103
cdef:
10771104
khiter_t k

pandas/_libs/hashtable_func_helper.pxi.in

+1-1
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
121121
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
122122
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
123123

124-
kh_resize_{{ttype}}(table, min(n, SIZE_HINT_LIMIT))
124+
kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
125125

126126
if keep not in ('last', 'first', False):
127127
raise ValueError('keep must be either "first", "last" or False')

pandas/_libs/khash.pxd

+3
Original file line numberDiff line numberDiff line change
@@ -120,4 +120,7 @@ cdef extern from "khash_python.h":
120120

121121
bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil
122122

123+
khuint_t kh_needed_n_buckets(khuint_t element_n) nogil
124+
125+
123126
include "khash_for_primitive_helper.pxi"

pandas/_libs/src/klib/khash_python.h

+10
Original file line numberDiff line numberDiff line change
@@ -244,3 +244,13 @@ void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) {
244244
void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khuint_t val) {
245245
kh_resize_str(table->table, val);
246246
}
247+
248+
// utility function: given the number of elements
249+
// returns number of necessary buckets
250+
khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements){
251+
khuint_t candidate = n_elements;
252+
kroundup32(candidate);
253+
khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5);
254+
return (upper_bound < n_elements) ? 2*candidate : candidate;
255+
256+
}

pandas/core/frame.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -5465,15 +5465,13 @@ def duplicated(
54655465
4 True
54665466
dtype: bool
54675467
"""
5468-
from pandas._libs.hashtable import SIZE_HINT_LIMIT, duplicated_int64
5468+
from pandas._libs.hashtable import duplicated_int64
54695469

54705470
if self.empty:
54715471
return self._constructor_sliced(dtype=bool)
54725472

54735473
def f(vals):
5474-
labels, shape = algorithms.factorize(
5475-
vals, size_hint=min(len(self), SIZE_HINT_LIMIT)
5476-
)
5474+
labels, shape = algorithms.factorize(vals, size_hint=len(self))
54775475
return labels.astype("i8", copy=False), len(shape)
54785476

54795477
if subset is None:

pandas/core/sorting.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -593,7 +593,7 @@ def compress_group_index(group_index, sort: bool = True):
593593
space can be huge, so this function compresses it, by computing offsets
594594
(comp_ids) into the list of unique labels (obs_group_ids).
595595
"""
596-
size_hint = min(len(group_index), hashtable.SIZE_HINT_LIMIT)
596+
size_hint = len(group_index)
597597
table = hashtable.Int64HashTable(size_hint)
598598

599599
group_index = ensure_int64(group_index)

pandas/tests/libs/test_hashtable.py

+37
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,28 @@ def test_tracemalloc_for_empty(self, table_type, dtype):
155155
del table
156156
assert get_allocated_khash_memory() == 0
157157

158+
def test_get_state(self, table_type, dtype):
159+
table = table_type(1000)
160+
state = table.get_state()
161+
assert state["size"] == 0
162+
assert state["n_occupied"] == 0
163+
assert "n_buckets" in state
164+
assert "upper_bound" in state
165+
166+
def test_no_reallocation(self, table_type, dtype):
167+
for N in range(1, 110):
168+
keys = np.arange(N).astype(dtype)
169+
preallocated_table = table_type(N)
170+
n_buckets_start = preallocated_table.get_state()["n_buckets"]
171+
preallocated_table.map_locations(keys)
172+
n_buckets_end = preallocated_table.get_state()["n_buckets"]
173+
# orgininal number of buckets was enough:
174+
assert n_buckets_start == n_buckets_end
175+
# check with clean table (not too much preallocated)
176+
clean_table = table_type()
177+
clean_table.map_locations(keys)
178+
assert n_buckets_start == clean_table.get_state()["n_buckets"]
179+
158180

159181
def test_get_labels_groupby_for_Int64(writable):
160182
table = ht.Int64HashTable()
@@ -190,6 +212,21 @@ def test_tracemalloc_for_empty_StringHashTable():
190212
assert get_allocated_khash_memory() == 0
191213

192214

215+
def test_no_reallocation_StringHashTable():
216+
for N in range(1, 110):
217+
keys = np.arange(N).astype(np.compat.unicode).astype(np.object_)
218+
preallocated_table = ht.StringHashTable(N)
219+
n_buckets_start = preallocated_table.get_state()["n_buckets"]
220+
preallocated_table.map_locations(keys)
221+
n_buckets_end = preallocated_table.get_state()["n_buckets"]
222+
# orgininal number of buckets was enough:
223+
assert n_buckets_start == n_buckets_end
224+
# check with clean table (not too much preallocated)
225+
clean_table = ht.StringHashTable()
226+
clean_table.map_locations(keys)
227+
assert n_buckets_start == clean_table.get_state()["n_buckets"]
228+
229+
193230
@pytest.mark.parametrize(
194231
"table_type, dtype",
195232
[

0 commit comments

Comments
 (0)