Skip to content

Commit 30de418

Browse files
committed
Always calculate inverse
1 parent 6079c26 commit 30de418

File tree

1 file changed

+41
-65
lines changed

1 file changed

+41
-65
lines changed

pandas/_libs/hashtable_class_helper.pxi.in

+41 -65
Original file line number | Diff line number | Diff line change
@@ -358,8 +358,7 @@ cdef class {{name}}HashTable(HashTable):
358358
@cython.wraparound(False)
359359
def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
360360
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
361-
object na_value=None, bint ignore_na=False,
362-
bint return_inverse=False):
361+
object na_value=None, bint ignore_na=False):
363362
"""
364363
Calculate unique values and labels (no sorting!)
365364

@@ -382,15 +381,12 @@ cdef class {{name}}HashTable(HashTable):
382381
Whether NA-values should be ignored for calculating the uniques. If
383382
True, the labels corresponding to missing values will be set to
384383
na_sentinel.
385-
return_inverse : boolean, default False
386-
Whether the mapping of the original array values to their location
387-
in the vector of uniques should be returned.
388384

389385
Returns
390386
-------
391387
uniques : ndarray[{{dtype}}]
392388
Unique values of input, not sorted
393-
labels : ndarray[int64] (if return_inverse=True)
389+
labels : ndarray[int64]
394390
The labels from values to uniques
395391
"""
396392
cdef:
@@ -402,8 +398,7 @@ cdef class {{name}}HashTable(HashTable):
402398
{{name}}VectorData *ud
403399
bint use_na_value
404400

405-
if return_inverse:
406-
labels = np.empty(n, dtype=np.int64)
401+
labels = np.empty(n, dtype=np.int64)
407402
ud = uniques.data
408403
use_na_value = na_value is not None
409404

@@ -440,19 +435,15 @@ cdef class {{name}}HashTable(HashTable):
440435
"Vector.resize() needed")
441436
uniques.resize()
442437
append_data_{{dtype}}(ud, val)
443-
if return_inverse:
444-
self.table.vals[k] = count
445-
labels[i] = count
446-
count += 1
447-
elif return_inverse:
438+
self.table.vals[k] = count
439+
labels[i] = count
440+
count += 1
441+
else:
448442
# k falls into a previous bucket
449-
# only relevant in case we need to construct the inverse
450443
idx = self.table.vals[k]
451444
labels[i] = idx
452445

453-
if return_inverse:
454-
return uniques.to_array(), np.asarray(labels)
455-
return uniques.to_array()
446+
return uniques.to_array(), np.asarray(labels)
456447

457448
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
458449
"""
@@ -474,8 +465,10 @@ cdef class {{name}}HashTable(HashTable):
474465
The labels from values to uniques
475466
"""
476467
uniques = {{name}}Vector()
477-
return self._unique(values, uniques, ignore_na=False,
478-
return_inverse=return_inverse)
468+
uniques, inverse = self._unique(values, uniques, ignore_na=False)
469+
if return_inverse:
470+
return uniques, inverse
471+
return uniques
479472

480473
def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
481474
object na_value=None):
@@ -507,8 +500,7 @@ cdef class {{name}}HashTable(HashTable):
507500
uniques_vector = {{name}}Vector()
508501
uniques, labels = self._unique(values, uniques_vector,
509502
na_sentinel=na_sentinel,
510-
na_value=na_value, ignore_na=True,
511-
return_inverse=True)
503+
na_value=na_value, ignore_na=True)
512504
# factorize has reversed outputs compared to _unique
513505
return labels, uniques
514506

@@ -517,7 +509,7 @@ cdef class {{name}}HashTable(HashTable):
517509
object na_value=None):
518510
_, labels = self._unique(values, uniques, count_prior=count_prior,
519511
na_sentinel=na_sentinel, na_value=na_value,
520-
ignore_na=True, return_inverse=True)
512+
ignore_na=True)
521513
return labels
522514

523515
@cython.boundscheck(False)
@@ -709,8 +701,7 @@ cdef class StringHashTable(HashTable):
709701
@cython.wraparound(False)
710702
def _unique(self, ndarray[object] values, ObjectVector uniques,
711703
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
712-
object na_value=None, bint ignore_na=False,
713-
bint return_inverse=False):
704+
object na_value=None, bint ignore_na=False):
714705
"""
715706
Calculate unique values and labels (no sorting!)
716707

@@ -733,15 +724,12 @@ cdef class StringHashTable(HashTable):
733724
Whether NA-values should be ignored for calculating the uniques. If
734725
True, the labels corresponding to missing values will be set to
735726
na_sentinel.
736-
return_inverse : boolean, default False
737-
Whether the mapping of the original array values to their location
738-
in the vector of uniques should be returned.
739727

740728
Returns
741729
-------
742730
uniques : ndarray[object]
743731
Unique values of input, not sorted
744-
labels : ndarray[int64] (if return_inverse=True)
732+
labels : ndarray[int64]
745733
The labels from values to uniques
746734
"""
747735
cdef:
@@ -755,8 +743,7 @@ cdef class StringHashTable(HashTable):
755743
khiter_t k
756744
bint use_na_value
757745

758-
if return_inverse:
759-
labels = np.zeros(n, dtype=np.int64)
746+
labels = np.zeros(n, dtype=np.int64)
760747
uindexer = np.empty(n, dtype=np.int64)
761748
use_na_value = na_value is not None
762749

@@ -787,13 +774,11 @@ cdef class StringHashTable(HashTable):
787774
# k hasn't been seen yet
788775
k = kh_put_str(self.table, v, &ret)
789776
uindexer[count] = i
790-
if return_inverse:
791-
self.table.vals[k] = count
792-
labels[i] = <int64_t>count
777+
self.table.vals[k] = count
778+
labels[i] = <int64_t>count
793779
count += 1
794-
elif return_inverse:
780+
else:
795781
# k falls into a previous bucket
796-
# only relevant in case we need to construct the inverse
797782
idx = self.table.vals[k]
798783
labels[i] = <int64_t>idx
799784

@@ -803,9 +788,7 @@ cdef class StringHashTable(HashTable):
803788
for i in range(count):
804789
uniques.append(values[uindexer[i]])
805790

806-
if return_inverse:
807-
return uniques.to_array(), np.asarray(labels)
808-
return uniques.to_array()
791+
return uniques.to_array(), np.asarray(labels)
809792

810793
def unique(self, ndarray[object] values, bint return_inverse=False):
811794
"""
@@ -827,8 +810,10 @@ cdef class StringHashTable(HashTable):
827810
The labels from values to uniques
828811
"""
829812
uniques = ObjectVector()
830-
return self._unique(values, uniques, ignore_na=False,
831-
return_inverse=return_inverse)
813+
uniques, inverse = self._unique(values, uniques, ignore_na=False)
814+
if return_inverse:
815+
return uniques, inverse
816+
return uniques
832817

833818
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
834819
object na_value=None):
@@ -860,8 +845,7 @@ cdef class StringHashTable(HashTable):
860845
uniques_vector = ObjectVector()
861846
uniques, labels = self._unique(values, uniques_vector,
862847
na_sentinel=na_sentinel,
863-
na_value=na_value, ignore_na=True,
864-
return_inverse=True)
848+
na_value=na_value, ignore_na=True)
865849
# factorize has reversed outputs compared to _unique
866850
return labels, uniques
867851

@@ -870,7 +854,7 @@ cdef class StringHashTable(HashTable):
870854
object na_value=None):
871855
_, labels = self._unique(values, uniques, count_prior=count_prior,
872856
na_sentinel=na_sentinel, na_value=na_value,
873-
ignore_na=True, return_inverse=True)
857+
ignore_na=True)
874858
return labels
875859

876860

@@ -963,8 +947,7 @@ cdef class PyObjectHashTable(HashTable):
963947
@cython.wraparound(False)
964948
def _unique(self, ndarray[object] values, ObjectVector uniques,
965949
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
966-
object na_value=None, bint ignore_na=False,
967-
bint return_inverse=False):
950+
object na_value=None, bint ignore_na=False):
968951
"""
969952
Calculate unique values and labels (no sorting!)
970953

@@ -987,15 +970,12 @@ cdef class PyObjectHashTable(HashTable):
987970
Whether NA-values should be ignored for calculating the uniques. If
988971
True, the labels corresponding to missing values will be set to
989972
na_sentinel.
990-
return_inverse : boolean, default False
991-
Whether the mapping of the original array values to their location
992-
in the vector of uniques should be returned.
993973

994974
Returns
995975
-------
996976
uniques : ndarray[object]
997977
Unique values of input, not sorted
998-
labels : ndarray[int64] (if return_inverse=True)
978+
labels : ndarray[int64]
999979
The labels from values to uniques
1000980
"""
1001981
cdef:
@@ -1006,8 +986,7 @@ cdef class PyObjectHashTable(HashTable):
1006986
khiter_t k
1007987
bint use_na_value
1008988

1009-
if return_inverse:
1010-
labels = np.empty(n, dtype=np.int64)
989+
labels = np.empty(n, dtype=np.int64)
1011990
use_na_value = na_value is not None
1012991

1013992
for i in range(n):
@@ -1024,19 +1003,15 @@ cdef class PyObjectHashTable(HashTable):
10241003
# k hasn't been seen yet
10251004
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
10261005
uniques.append(val)
1027-
if return_inverse:
1028-
self.table.vals[k] = count
1029-
labels[i] = count
1030-
count += 1
1031-
elif return_inverse:
1006+
self.table.vals[k] = count
1007+
labels[i] = count
1008+
count += 1
1009+
else:
10321010
# k falls into a previous bucket
1033-
# only relevant in case we need to construct the inverse
10341011
idx = self.table.vals[k]
10351012
labels[i] = idx
10361013

1037-
if return_inverse:
1038-
return uniques.to_array(), np.asarray(labels)
1039-
return uniques.to_array()
1014+
return uniques.to_array(), np.asarray(labels)
10401015

10411016
def unique(self, ndarray[object] values, bint return_inverse=False):
10421017
"""
@@ -1058,8 +1033,10 @@ cdef class PyObjectHashTable(HashTable):
10581033
The labels from values to uniques
10591034
"""
10601035
uniques = ObjectVector()
1061-
return self._unique(values, uniques, ignore_na=False,
1062-
return_inverse=return_inverse)
1036+
uniques, inverse = self._unique(values, uniques, ignore_na=False)
1037+
if return_inverse:
1038+
return uniques, inverse
1039+
return uniques
10631040

10641041
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
10651042
object na_value=None):
@@ -1091,8 +1068,7 @@ cdef class PyObjectHashTable(HashTable):
10911068
uniques_vector = ObjectVector()
10921069
uniques, labels = self._unique(values, uniques_vector,
10931070
na_sentinel=na_sentinel,
1094-
na_value=na_value, ignore_na=True,
1095-
return_inverse=True)
1071+
na_value=na_value, ignore_na=True)
10961072
# factorize has reversed outputs compared to _unique
10971073
return labels, uniques
10981074

@@ -1101,5 +1077,5 @@ cdef class PyObjectHashTable(HashTable):
11011077
object na_value=None):
11021078
_, labels = self._unique(values, uniques, count_prior=count_prior,
11031079
na_sentinel=na_sentinel, na_value=na_value,
1104-
ignore_na=True, return_inverse=True)
1080+
ignore_na=True)
11051081
return labels

0 commit comments

Comments
 (0)