Skip to content

Commit f13fefb

Browse files
committed
BUG: translate number of elements into number of needed buckets, avoiding rehashing for some cases
1 parent d9153d3 commit f13fefb

File tree

6 files changed

+43
-11
lines changed

6 files changed

+43
-11
lines changed

pandas/_libs/hashtable.pyx

+2-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ from pandas._libs.khash cimport (
1919
are_equivalent_float64_t,
2020
are_equivalent_khcomplex64_t,
2121
are_equivalent_khcomplex128_t,
22+
kh_needed_n_buckets,
2223
kh_str_t,
2324
khcomplex64_t,
2425
khcomplex128_t,
@@ -152,7 +153,7 @@ def unique_label_indices(const int64_t[:] labels):
152153
ndarray[int64_t, ndim=1] arr
153154
Int64VectorData *ud = idx.data
154155

155-
kh_resize_int64(table, min(n, SIZE_HINT_LIMIT))
156+
kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
156157

157158
with nogil:
158159
for i in range(n):

pandas/_libs/hashtable_class_helper.pxi.in

+6-9
Original file line numberDiff line numberDiff line change
@@ -392,9 +392,8 @@ cdef class {{name}}HashTable(HashTable):
392392

393393
def __cinit__(self, int64_t size_hint=1):
394394
self.table = kh_init_{{dtype}}()
395-
if size_hint is not None:
396-
size_hint = min(size_hint, SIZE_HINT_LIMIT)
397-
kh_resize_{{dtype}}(self.table, size_hint)
395+
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
396+
kh_resize_{{dtype}}(self.table, size_hint)
398397

399398
def __len__(self) -> int:
400399
return self.table.size
@@ -740,9 +739,8 @@ cdef class StringHashTable(HashTable):
740739

741740
def __init__(self, int64_t size_hint=1):
742741
self.table = kh_init_str()
743-
if size_hint is not None:
744-
size_hint = min(size_hint, SIZE_HINT_LIMIT)
745-
kh_resize_str(self.table, size_hint)
742+
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
743+
kh_resize_str(self.table, size_hint)
746744

747745
def __dealloc__(self):
748746
if self.table is not NULL:
@@ -1062,9 +1060,8 @@ cdef class PyObjectHashTable(HashTable):
10621060

10631061
def __init__(self, int64_t size_hint=1):
10641062
self.table = kh_init_pymap()
1065-
if size_hint is not None:
1066-
size_hint = min(size_hint, SIZE_HINT_LIMIT)
1067-
kh_resize_pymap(self.table, size_hint)
1063+
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
1064+
kh_resize_pymap(self.table, size_hint)
10681065

10691066
def __dealloc__(self):
10701067
if self.table is not NULL:

pandas/_libs/hashtable_func_helper.pxi.in

+1-1
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
121121
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
122122
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
123123

124-
kh_resize_{{ttype}}(table, min(n, SIZE_HINT_LIMIT))
124+
kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
125125

126126
if keep not in ('last', 'first', False):
127127
raise ValueError('keep must be either "first", "last" or False')

pandas/_libs/khash.pxd

+3
Original file line numberDiff line numberDiff line change
@@ -120,4 +120,7 @@ cdef extern from "khash_python.h":
120120

121121
bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil
122122

123+
khuint_t kh_needed_n_buckets(khuint_t element_n) nogil
124+
125+
123126
include "khash_for_primitive_helper.pxi"

pandas/_libs/src/klib/khash_python.h

+10
Original file line numberDiff line numberDiff line change
@@ -244,3 +244,13 @@ void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) {
244244
void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khuint_t val) {
245245
kh_resize_str(table->table, val);
246246
}
247+
248+
// utility function: given the number of elements to be stored,
249+
// returns the number of buckets needed to hold them without rehashing
250+
khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements){
251+
khuint_t candidate = n_elements;
252+
kroundup32(candidate);
253+
khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5);
254+
return (upper_bound < n_elements) ? 2*candidate : candidate;
255+
256+
}

pandas/tests/libs/test_hashtable.py

+21
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,16 @@ def test_get_state(self, table_type, dtype):
163163
assert "n_buckets" in state
164164
assert "upper_bound" in state
165165

166+
def test_no_reallocation(self, table_type, dtype):
167+
N = 110
168+
keys = np.arange(N).astype(dtype)
169+
table = table_type(N)
170+
n_buckets_start = table.get_state()["n_buckets"]
171+
table.map_locations(keys)
172+
n_buckets_end = table.get_state()["n_buckets"]
173+
# original number of buckets was enough:
174+
assert n_buckets_start == n_buckets_end
175+
166176

167177
def test_get_labels_groupby_for_Int64(writable):
168178
table = ht.Int64HashTable()
@@ -198,6 +208,17 @@ def test_tracemalloc_for_empty_StringHashTable():
198208
assert get_allocated_khash_memory() == 0
199209

200210

211+
def test_no_reallocation_StringHashTable():
212+
N = 110
213+
keys = np.arange(N).astype(np.compat.unicode).astype(np.object_)
214+
table = ht.StringHashTable(N)
215+
n_buckets_start = table.get_state()["n_buckets"]
216+
table.map_locations(keys)
217+
n_buckets_end = table.get_state()["n_buckets"]
218+
# original number of buckets was enough:
219+
assert n_buckets_start == n_buckets_end
220+
221+
201222
@pytest.mark.parametrize(
202223
"table_type, dtype",
203224
[

0 commit comments

Comments
 (0)