Skip to content

Commit 8481e19

Browse files
committed
Finish split into _unique_with_inverse and _unique_no_inverse
1 parent dbe4e0e commit 8481e19

File tree

1 file changed

+53
-63
lines changed

1 file changed

+53
-63
lines changed

pandas/_libs/hashtable_class_helper.pxi.in

+53-63
Original file line numberDiff line numberDiff line change
@@ -356,10 +356,10 @@ cdef class {{name}}HashTable(HashTable):
356356
return np.asarray(locs)
357357

358358
@cython.boundscheck(False)
359-
def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
360-
bint ignore_na=False, bint return_inverse=False,
361-
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
362-
object na_value=None):
359+
def _unique_with_inverse(self, const {{dtype}}_t[:] values,
360+
{{name}}Vector uniques, bint ignore_na=False,
361+
Py_ssize_t count_prior=0,
362+
Py_ssize_t na_sentinel=-1, object na_value=None):
363363
cdef:
364364
Py_ssize_t i, idx, count = count_prior, n = len(values)
365365
int64_t[:] labels
@@ -369,8 +369,7 @@ cdef class {{name}}HashTable(HashTable):
369369
{{name}}VectorData *ud
370370
bint use_na_value
371371

372-
if return_inverse:
373-
labels = np.empty(n, dtype=np.int64)
372+
labels = np.empty(n, dtype=np.int64)
374373
ud = uniques.data
375374
use_na_value = na_value is not None
376375

@@ -394,11 +393,11 @@ cdef class {{name}}HashTable(HashTable):
394393
continue
395394

396395
k = kh_get_{{dtype}}(self.table, val)
397-
if return_inverse and k != self.table.n_buckets:
396+
if k != self.table.n_buckets:
398397
# k falls into a previous bucket
399398
idx = self.table.vals[k]
400399
labels[i] = idx
401-
elif k == self.table.n_buckets:
400+
else:
402401
# k hasn't been seen yet
403402
k = kh_put_{{dtype}}(self.table, val, &ret)
404403
if needs_resize(ud):
@@ -409,14 +408,11 @@ cdef class {{name}}HashTable(HashTable):
409408
"Vector.resize() needed")
410409
uniques.resize()
411410
append_data_{{dtype}}(ud, val)
412-
if return_inverse:
413-
self.table.vals[k] = count
414-
labels[i] = count
411+
self.table.vals[k] = count
412+
labels[i] = count
415413
count += 1
416414

417-
if return_inverse:
418-
return uniques.to_array(), np.asarray(labels)
419-
return uniques.to_array()
415+
return uniques.to_array(), np.asarray(labels)
420416

421417
@cython.boundscheck(False)
422418
def _unique_no_inverse(self, const {{dtype}}_t[:] values):
@@ -443,20 +439,21 @@ cdef class {{name}}HashTable(HashTable):
443439

444440
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
445441
if return_inverse:
446-
return self._unique(values, uniques={{name}}Vector(), ignore_na=False,
447-
return_inverse=True)
442+
return self._unique_with_inverse(values, uniques={{name}}Vector(),
443+
ignore_na=False)
448444
return self._unique_no_inverse(values)
449445

450446
def factorize(self, {{dtype}}_t[:] values):
451-
return self._unique(values, uniques={{name}}Vector(), ignore_na=True,
452-
return_inverse=True)
447+
return self._unique_with_inverse(values, uniques={{name}}Vector(),
448+
ignore_na=True)
453449

454450
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
455451
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
456452
object na_value=None):
457-
_, labels = self._unique(values, uniques, ignore_na=True,
458-
return_inverse=True, count_prior=count_prior,
459-
na_sentinel=na_sentinel, na_value=na_value)
453+
_, labels = self._unique_with_inverse(values, uniques, ignore_na=True,
454+
count_prior=count_prior,
455+
na_sentinel=na_sentinel,
456+
na_value=na_value)
460457
return labels
461458

462459
@cython.boundscheck(False)
@@ -645,10 +642,10 @@ cdef class StringHashTable(HashTable):
645642
free(vecs)
646643

647644
@cython.boundscheck(False)
648-
def _unique(self, ndarray[object] values, ObjectVector uniques,
649-
bint ignore_na=False, bint return_inverse=False,
650-
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
651-
object na_value=None):
645+
def _unique_with_inverse(self, ndarray[object] values,
646+
ObjectVector uniques, bint ignore_na=False,
647+
Py_ssize_t count_prior=0,
648+
Py_ssize_t na_sentinel=-1, object na_value=None):
652649
cdef:
653650
Py_ssize_t i, idx, count = count_prior, n = len(values)
654651
int64_t[:] labels
@@ -660,8 +657,7 @@ cdef class StringHashTable(HashTable):
660657
khiter_t k
661658
bint use_na_value
662659

663-
if return_inverse:
664-
labels = np.zeros(n, dtype=np.int64)
660+
labels = np.zeros(n, dtype=np.int64)
665661
uindexer = np.empty(n, dtype=np.int64)
666662
use_na_value = na_value is not None
667663

@@ -686,17 +682,16 @@ cdef class StringHashTable(HashTable):
686682

687683
v = vecs[i]
688684
k = kh_get_str(self.table, v)
689-
if return_inverse and k != self.table.n_buckets:
685+
if k != self.table.n_buckets:
690686
# k falls into a previous bucket
691687
idx = self.table.vals[k]
692688
labels[i] = <int64_t>idx
693-
elif k == self.table.n_buckets:
689+
else:
694690
# k hasn't been seen yet
695691
k = kh_put_str(self.table, v, &ret)
696692
uindexer[count] = i
697-
if return_inverse:
698-
self.table.vals[k] = count
699-
labels[i] = <int64_t>count
693+
self.table.vals[k] = count
694+
labels[i] = <int64_t>count
700695
count += 1
701696

702697
free(vecs)
@@ -705,9 +700,7 @@ cdef class StringHashTable(HashTable):
705700
for i in range(count):
706701
uniques.append(values[uindexer[i]])
707702

708-
if return_inverse:
709-
return uniques.to_array(), np.asarray(labels)
710-
return uniques.to_array()
703+
return uniques.to_array(), np.asarray(labels)
711704

712705
@cython.boundscheck(False)
713706
def _unique_no_inverse(self, ndarray[object] values):
@@ -745,20 +738,21 @@ cdef class StringHashTable(HashTable):
745738

746739
def unique(self, ndarray[object] values, bint return_inverse=False):
747740
if return_inverse:
748-
return self._unique(values, uniques=ObjectVector(), ignore_na=False,
749-
return_inverse=True)
741+
return self._unique_with_inverse(values, uniques=ObjectVector(),
742+
ignore_na=False)
750743
return self._unique_no_inverse(values)
751744

752745
def factorize(self, ndarray[object] values):
753-
return self._unique(values, uniques=ObjectVector(), ignore_na=True,
754-
return_inverse=True)
746+
return self._unique_with_inverse(values, uniques=ObjectVector(),
747+
ignore_na=True)
755748

756749
def get_labels(self, ndarray[object] values, ObjectVector uniques,
757750
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
758751
object na_value=None):
759-
_, labels = self._unique(values, uniques, ignore_na=True,
760-
return_inverse=True, count_prior=count_prior,
761-
na_sentinel=na_sentinel, na_value=na_value)
752+
_, labels = self._unique_with_inverse(values, uniques, ignore_na=True,
753+
count_prior=count_prior,
754+
na_sentinel=na_sentinel,
755+
na_value=na_value)
762756
return labels
763757

764758

@@ -848,10 +842,10 @@ cdef class PyObjectHashTable(HashTable):
848842
return np.asarray(locs)
849843

850844
@cython.boundscheck(False)
851-
def _unique(self, ndarray[object] values, ObjectVector uniques,
852-
bint ignore_na=False, bint return_inverse=False,
853-
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
854-
object na_value=None):
845+
def _unique_with_inverse(self, ndarray[object] values,
846+
ObjectVector uniques, bint ignore_na=False,
847+
Py_ssize_t count_prior=0,
848+
Py_ssize_t na_sentinel=-1, object na_value=None):
855849
cdef:
856850
Py_ssize_t i, idx, count = count_prior, n = len(values)
857851
int64_t[:] labels
@@ -860,8 +854,7 @@ cdef class PyObjectHashTable(HashTable):
860854
khiter_t k
861855
bint use_na_value
862856

863-
if return_inverse:
864-
labels = np.empty(n, dtype=np.int64)
857+
labels = np.empty(n, dtype=np.int64)
865858
use_na_value = na_value is not None
866859

867860
for i in range(n):
@@ -874,22 +867,19 @@ cdef class PyObjectHashTable(HashTable):
874867
continue
875868

876869
k = kh_get_pymap(self.table, <PyObject*>val)
877-
if return_inverse and k != self.table.n_buckets:
870+
if k != self.table.n_buckets:
878871
# k falls into a previous bucket
879872
idx = self.table.vals[k]
880873
labels[i] = <int64_t>idx
881-
elif k == self.table.n_buckets:
874+
else:
882875
# k hasn't been seen yet
883876
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
884877
uniques.append(val)
885-
if return_inverse:
886-
self.table.vals[k] = count
887-
labels[i] = <int64_t>count
878+
self.table.vals[k] = count
879+
labels[i] = <int64_t>count
888880
count += 1
889881

890-
if return_inverse:
891-
return uniques.to_array(), np.asarray(labels)
892-
return uniques.to_array()
882+
return uniques.to_array(), np.asarray(labels)
893883

894884
def _unique_no_inverse(self, ndarray[object] values):
895885
# define separate functions without inverse for performance
@@ -910,18 +900,18 @@ cdef class PyObjectHashTable(HashTable):
910900

911901
def unique(self, ndarray[object] values, bint return_inverse=False):
912902
if return_inverse:
913-
return self._unique(values, uniques=ObjectVector(), ignore_na=False,
914-
return_inverse=True)
903+
return self._unique_with_inverse(values, uniques=ObjectVector(),
904+
ignore_na=False)
915905
return self._unique_no_inverse(values)
916906

917907
def factorize(self, ndarray[object] values):
918-
return self._unique(values, uniques=ObjectVector(), ignore_na=True,
919-
return_inverse=True)
908+
return self._unique_with_inverse(values, uniques=ObjectVector(), ignore_na=True)
920909

921910
def get_labels(self, ndarray[object] values, ObjectVector uniques,
922911
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
923912
object na_value=None):
924-
_, labels = self._unique(values, uniques, ignore_na=True,
925-
return_inverse=True, count_prior=count_prior,
926-
na_sentinel=na_sentinel, na_value=na_value)
913+
_, labels = self._unique_with_inverse(values, uniques, ignore_na=True,
914+
count_prior=count_prior,
915+
na_sentinel=na_sentinel,
916+
na_value=na_value)
927917
return labels

0 commit comments

Comments
 (0)