Skip to content

Commit af5b539

Browse files
committed
introduce unique_label_indices_intp
1 parent 29129ae commit af5b539

File tree

3 files changed

+60
-43
lines changed

3 files changed

+60
-43
lines changed

pandas/_libs/hashtable.pyx

+4 -43
Original file line number | Diff line number | Diff line change
@@ -59,16 +59,12 @@ include "hashtable_func_helper.pxi"
5959
# map derived hash-map types onto basic hash-map types:
6060
if np.dtype(np.intp) == np.dtype(np.int64):
6161
IntpHashTable = Int64HashTable
62-
value_count_intp = value_count_int64
63-
duplicated_intp = duplicated_int64
64-
ismember_intp = ismember_int64
65-
mode_intp = mode_int64
62+
unique_label_indices = _unique_label_indices_int64
63+
unique_label_indices_intp = _unique_label_indices_int64
6664
elif np.dtype(np.intp) == np.dtype(np.int32):
6765
IntpHashTable = Int32HashTable
68-
value_count_intp = value_count_int32
69-
duplicated_intp = duplicated_int32
70-
ismember_intp = ismember_int32
71-
mode_intp = mode_int32
66+
unique_label_indices = _unique_label_indices_int64
67+
unique_label_indices_intp = _unique_label_indices_int32
7268
else:
7369
raise ValueError(np.dtype(np.intp))
7470

@@ -174,38 +170,3 @@ cdef class Int64Factorizer(Factorizer):
174170

175171
self.count = len(self.uniques)
176172
return labels
177-
178-
179-
@cython.wraparound(False)
180-
@cython.boundscheck(False)
181-
def unique_label_indices(const int64_t[:] labels) -> ndarray:
182-
"""
183-
Indices of the first occurrences of the unique labels
184-
*excluding* -1. equivalent to:
185-
np.unique(labels, return_index=True)[1]
186-
"""
187-
cdef:
188-
int ret = 0
189-
Py_ssize_t i, n = len(labels)
190-
kh_int64_t *table = kh_init_int64()
191-
Int64Vector idx = Int64Vector()
192-
ndarray[int64_t, ndim=1] arr
193-
Int64VectorData *ud = idx.data
194-
195-
kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
196-
197-
with nogil:
198-
for i in range(n):
199-
kh_put_int64(table, labels[i], &ret)
200-
if ret != 0:
201-
if needs_resize(ud):
202-
with gil:
203-
idx.resize()
204-
append_data_int64(ud, i)
205-
206-
kh_destroy_int64(table)
207-
208-
arr = idx.to_array()
209-
arr = arr[np.asarray(labels)[arr].argsort()]
210-
211-
return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr

pandas/_libs/hashtable_func_helper.pxi.in

+48
Original file line number | Diff line number | Diff line change
@@ -470,3 +470,51 @@ cpdef mode(ndarray[htfunc_t] values, bint dropna):
470470

471471
else:
472472
raise TypeError(values.dtype)
473+
474+
475+
{{py:
476+
477+
# name, dtype, ttype, c_type
478+
dtypes = [('Int64', 'int64', 'int64', 'int64_t'),
479+
('Int32', 'int32', 'int32', 'int32_t'), ]
480+
481+
}}
482+
483+
{{for name, dtype, ttype, c_type in dtypes}}
484+
485+
486+
@cython.wraparound(False)
@cython.boundscheck(False)
def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
    """
    Indices of the first occurrences of the unique labels
    *excluding* -1. equivalent to:
        np.unique(labels, return_index=True)[1]

    Parameters
    ----------
    labels : const {{c_type}}[:]
        One-dimensional buffer of labels; read-only buffers are accepted.

    Returns
    -------
    ndarray[{{c_type}}]
        Positions into ``labels``, ordered by the label value found at
        each position, with the position of label -1 (if any) dropped.
    """
    cdef:
        int ret = 0
        Py_ssize_t i, n = len(labels)
        kh_{{ttype}}_t *table = kh_init_{{ttype}}()
        {{name}}Vector idx = {{name}}Vector()
        ndarray[{{c_type}}, ndim=1] arr
        {{name}}VectorData *ud = idx.data

    kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))

    # The table is a raw C allocation: release it even when idx.resize()
    # raises inside the loop, otherwise it would leak.
    try:
        with nogil:
            for i in range(n):
                kh_put_{{ttype}}(table, labels[i], &ret)
                # ret != 0: the key was not in the table before this put,
                # i.e. position i is the first occurrence of labels[i].
                if ret != 0:
                    if needs_resize(ud):
                        # growing the vector needs the GIL
                        with gil:
                            idx.resize()
                    append_data_{{ttype}}(ud, i)
    finally:
        kh_destroy_{{ttype}}(table)

    arr = idx.to_array()
    # Reorder the first-occurrence positions by their label value.
    arr = arr[np.asarray(labels)[arr].argsort()]

    # After sorting, a -1 label (assumed to be the smallest possible label)
    # sits at the front; skip it.
    return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
519+
520+
{{endfor}}

pandas/tests/libs/test_hashtable.py

+8
Original file line number | Diff line number | Diff line change
@@ -381,6 +381,14 @@ def test_modes_with_nans():
381381
assert np.isnan(modes[0])
382382

383383

384+
def test_unique_label_indices_intp(writable):
    # Exercise the intp-specific entry point: ``unique_label_indices`` is
    # bound to the int64 implementation on every platform, so calling it
    # with intp keys would not cover ``unique_label_indices_intp`` (and
    # would not match the buffer dtype where intp is int32).
    # ``writable`` is presumably a boolean fixture defined elsewhere in the
    # test suite; it toggles whether the input buffer is read-only.
    keys = np.array([1, 2, 2, 2, 1, 3], dtype=np.intp)
    keys.flags.writeable = writable
    result = ht.unique_label_indices_intp(keys)
    # First occurrences of labels 1, 2 and 3 are at positions 0, 1 and 5.
    expected = np.array([0, 1, 5], dtype=np.intp)
    tm.assert_numpy_array_equal(result, expected)
390+
391+
384392
@pytest.mark.parametrize(
385393
"dtype, type_suffix",
386394
[

0 commit comments

Comments
 (0)