Skip to content

Commit ca19724

Browse files
committed
introduce unique_label_indices_intp
1 parent 4ac7a5a commit ca19724

File tree

3 files changed

+60
-43
lines changed

3 files changed

+60
-43
lines changed

pandas/_libs/hashtable.pyx

+4-43
Original file line numberDiff line numberDiff line change
@@ -69,16 +69,12 @@ include "hashtable_func_helper.pxi"
6969
# map derived hash-map types onto basic hash-map types:
7070
# Select the hash-table specializations that match the width of np.intp.
if np.dtype(np.intp) == np.dtype(np.int64):
7171
IntpHashTable = Int64HashTable
72-
value_count_intp = value_count_int64
73-
duplicated_intp = duplicated_int64
74-
ismember_intp = ismember_int64
75-
mode_intp = mode_int64
72+
# unique_label_indices is bound to the int64 implementation in both branches.
unique_label_indices = _unique_label_indices_int64
73+
# unique_label_indices_intp is bound to the implementation matching np.intp.
unique_label_indices_intp = _unique_label_indices_int64
7674
elif np.dtype(np.intp) == np.dtype(np.int32):
7775
IntpHashTable = Int32HashTable
78-
value_count_intp = value_count_int32
79-
duplicated_intp = duplicated_int32
80-
ismember_intp = ismember_int32
81-
mode_intp = mode_int32
76+
# Same int64 binding as in the int64 branch above.
unique_label_indices = _unique_label_indices_int64
77+
# np.intp is int32 here, so the intp alias uses the int32 implementation.
unique_label_indices_intp = _unique_label_indices_int32
8278
else:
8379
# np.intp is neither int64 nor int32: there is no specialization to map to.
raise ValueError(np.dtype(np.intp))
8480

@@ -186,38 +182,3 @@ cdef class Int64Factorizer(Factorizer):
186182

187183
self.count = len(self.uniques)
188184
return labels
189-
190-
191-
@cython.wraparound(False)
192-
@cython.boundscheck(False)
193-
def unique_label_indices(const int64_t[:] labels) -> ndarray:
194-
"""
195-
Indices of the first occurrences of the unique labels
196-
*excluding* -1. equivalent to:
197-
np.unique(labels, return_index=True)[1]
198-
"""
199-
cdef:
200-
int ret = 0
201-
Py_ssize_t i, n = len(labels)
202-
kh_int64_t *table = kh_init_int64()
203-
Int64Vector idx = Int64Vector()
204-
ndarray[int64_t, ndim=1] arr
205-
Int64VectorData *ud = idx.data
206-
207-
kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
208-
209-
with nogil:
210-
for i in range(n):
211-
kh_put_int64(table, labels[i], &ret)
212-
if ret != 0:
213-
if needs_resize(ud):
214-
with gil:
215-
idx.resize()
216-
append_data_int64(ud, i)
217-
218-
kh_destroy_int64(table)
219-
220-
arr = idx.to_array()
221-
arr = arr[np.asarray(labels)[arr].argsort()]
222-
223-
return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr

pandas/_libs/hashtable_func_helper.pxi.in

+48
Original file line numberDiff line numberDiff line change
@@ -470,3 +470,51 @@ cpdef mode(ndarray[htfunc_t] values, bint dropna):
470470

471471
else:
472472
raise TypeError(values.dtype)
473+
474+
475+
{{py:
476+
477+
# name, dtype, ttype, c_type
# Each tuple supplies the substitutions used by the template loop below.
478+
dtypes = [('Int64', 'int64', 'int64', 'int64_t'),
479+
('Int32', 'int32', 'int32', 'int32_t'), ]
480+
481+
}}
482+
483+
{{for name, dtype, ttype, c_type in dtypes}}
484+
485+
486+
@cython.wraparound(False)
487+
@cython.boundscheck(False)
488+
def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
489+
"""
490+
Indices of the first occurrences of the unique labels
491+
*excluding* -1. equivalent to:
492+
np.unique(labels, return_index=True)[1]
493+
"""
494+
cdef:
495+
int ret = 0
496+
Py_ssize_t i, n = len(labels)
497+
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
498+
{{name}}Vector idx = {{name}}Vector()
499+
ndarray[{{c_type}}, ndim=1] arr
500+
{{name}}VectorData *ud = idx.data
501+
502+
# Size the table from the input length, capped at SIZE_HINT_LIMIT.
kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
503+
504+
with nogil:
505+
for i in range(n):
506+
# kh_put writes its status into ret.
kh_put_{{ttype}}(table, labels[i], &ret)
507+
# A nonzero ret is treated as a first occurrence, so position i is recorded.
if ret != 0:
508+
if needs_resize(ud):
509+
# idx.resize() is called with the GIL re-acquired.
with gil:
510+
idx.resize()
511+
append_data_{{ttype}}(ud, i)
512+
513+
# The table is not used after the loop; only the collected indices are.
kh_destroy_{{ttype}}(table)
514+
515+
arr = idx.to_array()
516+
# Reorder the collected indices by the label value each one points at.
arr = arr[np.asarray(labels)[arr].argsort()]
517+
518+
# If the first (smallest) label after sorting is -1, drop its index.
return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
519+
520+
{{endfor}}

pandas/tests/libs/test_hashtable.py

+8
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,14 @@ def test_modes_with_nans():
473473
assert np.isnan(modes[0])
474474

475475

476+
def test_unique_label_indices_intp(writable):
    """
    Check that ``unique_label_indices_intp`` returns first-occurrence indices.

    The labels are built with ``np.intp`` so the intp-specific entry point is
    the one under test. ``writable`` is assigned to the array's ``writeable``
    flag before the call (presumably a boolean fixture covering both
    read-only and writable inputs).
    """
    keys = np.array([1, 2, 2, 2, 1, 3], dtype=np.intp)
    keys.flags.writeable = writable
    # Call the intp variant: plain ``unique_label_indices`` is bound to the
    # int64 implementation, which does not match intp where intp is int32.
    result = ht.unique_label_indices_intp(keys)
    # Labels 1, 2 and 3 first appear at positions 0, 1 and 5.
    expected = np.array([0, 1, 5], dtype=np.intp)
    tm.assert_numpy_array_equal(result, expected)
482+
483+
476484
@pytest.mark.parametrize(
477485
"dtype, type_suffix",
478486
[

0 commit comments

Comments
 (0)