Skip to content

Commit cd8930b

Browse files
authored
[ENH] introducing IntpHashMap and making unique_label_indices use intp (#40653)
1 parent 11f94a5 commit cd8930b

File tree

6 files changed

+73
-38
lines changed

6 files changed

+73
-38
lines changed

pandas/_libs/hashtable.pyi

+1
Original file line number | Diff line number | Diff line change
@@ -192,6 +192,7 @@ class UInt16HashTable(HashTable): ...
192192
class UInt8HashTable(HashTable): ...
193193
class StringHashTable(HashTable): ...
194194
class PyObjectHashTable(HashTable): ...
195+
class IntpHashTable(HashTable): ...
195196

196197
def duplicated_int64(
197198
values: np.ndarray, # const int64_t[:] values

pandas/_libs/hashtable.pyx

+12 -35
Original file line number | Diff line number | Diff line change
@@ -65,6 +65,18 @@ cdef Py_ssize_t _INIT_VEC_CAP = 128
6565
include "hashtable_class_helper.pxi"
6666
include "hashtable_func_helper.pxi"
6767

68+
69+
# map derived hash-map types onto basic hash-map types:
70+
if np.dtype(np.intp) == np.dtype(np.int64):
71+
IntpHashTable = Int64HashTable
72+
unique_label_indices = _unique_label_indices_int64
73+
elif np.dtype(np.intp) == np.dtype(np.int32):
74+
IntpHashTable = Int32HashTable
75+
unique_label_indices = _unique_label_indices_int32
76+
else:
77+
raise ValueError(np.dtype(np.intp))
78+
79+
6880
cdef class Factorizer:
6981
cdef readonly:
7082
Py_ssize_t count
@@ -168,38 +180,3 @@ cdef class Int64Factorizer(Factorizer):
168180

169181
self.count = len(self.uniques)
170182
return labels
171-
172-
173-
@cython.wraparound(False)
174-
@cython.boundscheck(False)
175-
def unique_label_indices(const int64_t[:] labels) -> ndarray:
176-
"""
177-
Indices of the first occurrences of the unique labels
178-
*excluding* -1. equivalent to:
179-
np.unique(labels, return_index=True)[1]
180-
"""
181-
cdef:
182-
int ret = 0
183-
Py_ssize_t i, n = len(labels)
184-
kh_int64_t *table = kh_init_int64()
185-
Int64Vector idx = Int64Vector()
186-
ndarray[int64_t, ndim=1] arr
187-
Int64VectorData *ud = idx.data
188-
189-
kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
190-
191-
with nogil:
192-
for i in range(n):
193-
kh_put_int64(table, labels[i], &ret)
194-
if ret != 0:
195-
if needs_resize(ud):
196-
with gil:
197-
idx.resize()
198-
append_data_int64(ud, i)
199-
200-
kh_destroy_int64(table)
201-
202-
arr = idx.to_array()
203-
arr = arr[np.asarray(labels)[arr].argsort()]
204-
205-
return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr

pandas/_libs/hashtable_func_helper.pxi.in

+48
Original file line number | Diff line number | Diff line change
@@ -470,3 +470,51 @@ cpdef mode(ndarray[htfunc_t] values, bint dropna):
470470

471471
else:
472472
raise TypeError(values.dtype)
473+
474+
475+
{{py:
476+
477+
# name, dtype, ttype, c_type
478+
dtypes = [('Int64', 'int64', 'int64', 'int64_t'),
479+
('Int32', 'int32', 'int32', 'int32_t'), ]
480+
481+
}}
482+
483+
{{for name, dtype, ttype, c_type in dtypes}}
484+
485+
486+
@cython.wraparound(False)
487+
@cython.boundscheck(False)
488+
def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
489+
"""
490+
Indices of the first occurrences of the unique labels
491+
*excluding* -1. equivalent to:
492+
np.unique(labels, return_index=True)[1]
493+
"""
494+
cdef:
495+
int ret = 0
496+
Py_ssize_t i, n = len(labels)
497+
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
498+
{{name}}Vector idx = {{name}}Vector()
499+
ndarray[{{c_type}}, ndim=1] arr
500+
{{name}}VectorData *ud = idx.data
501+
502+
kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
503+
504+
with nogil:
505+
for i in range(n):
506+
kh_put_{{ttype}}(table, labels[i], &ret)
507+
if ret != 0:
508+
if needs_resize(ud):
509+
with gil:
510+
idx.resize()
511+
append_data_{{ttype}}(ud, i)
512+
513+
kh_destroy_{{ttype}}(table)
514+
515+
arr = idx.to_array()
516+
arr = arr[np.asarray(labels)[arr].argsort()]
517+
518+
return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
519+
520+
{{endfor}}

pandas/core/sorting.py

+1 -2
Original file line number | Diff line number | Diff line change
@@ -261,8 +261,7 @@ def decons_obs_group_ids(
261261
out = decons_group_index(obs_ids, shape)
262262
return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)]
263263

264-
# TODO: unique_label_indices only used here, should take ndarray[np.intp]
265-
indexer = unique_label_indices(ensure_int64(comp_ids))
264+
indexer = unique_label_indices(comp_ids)
266265
return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]
267266

268267

pandas/tests/libs/test_hashtable.py

+10
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,7 @@ def get_allocated_khash_memory():
4444
(ht.UInt16HashTable, np.uint16),
4545
(ht.Int8HashTable, np.int8),
4646
(ht.UInt8HashTable, np.uint8),
47+
(ht.IntpHashTable, np.intp),
4748
],
4849
)
4950
class TestHashTable:
@@ -389,6 +390,7 @@ def get_ht_function(fun_name, type_suffix):
389390
(np.uint16, "uint16"),
390391
(np.int8, "int8"),
391392
(np.uint8, "uint8"),
393+
(np.intp, "intp"),
392394
],
393395
)
394396
class TestHelpFunctions:
@@ -471,6 +473,14 @@ def test_modes_with_nans():
471473
assert np.isnan(modes[0])
472474

473475

476+
def test_unique_label_indices_intp(writable):
477+
keys = np.array([1, 2, 2, 2, 1, 3], dtype=np.intp)
478+
keys.flags.writeable = writable
479+
result = ht.unique_label_indices(keys)
480+
expected = np.array([0, 1, 5], dtype=np.intp)
481+
tm.assert_numpy_array_equal(result, expected)
482+
483+
474484
@pytest.mark.parametrize(
475485
"dtype, type_suffix",
476486
[

pandas/tests/test_algos.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -1741,7 +1741,7 @@ def test_quantile():
17411741

17421742
def test_unique_label_indices():
17431743

1744-
a = np.random.randint(1, 1 << 10, 1 << 15).astype("int64")
1744+
a = np.random.randint(1, 1 << 10, 1 << 15).astype(np.intp)
17451745

17461746
left = ht.unique_label_indices(a)
17471747
right = np.unique(a, return_index=True)[1]

0 commit comments

Comments
 (0)