Skip to content

Commit f490328

Browse files
h-vetinari authored and tm9k1 committed
CLN: prepare unifying hashtable.factorize and .unique; add doc-strings (pandas-dev#22986)
1 parent a12c28b commit f490328

File tree

3 files changed

+319
-74
lines changed

3 files changed

+319
-74
lines changed

pandas/_libs/hashtable_class_helper.pxi.in

+177 -29
Original file line number | Diff line number | Diff line change
@@ -355,19 +355,38 @@ cdef class {{name}}HashTable(HashTable):
355355

356356
return np.asarray(locs)
357357

358-
def factorize(self, {{dtype}}_t values):
359-
uniques = {{name}}Vector()
360-
labels = self.get_labels(values, uniques, 0, 0)
361-
return uniques.to_array(), labels
362-
363358
@cython.boundscheck(False)
364-
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
365-
Py_ssize_t count_prior, Py_ssize_t na_sentinel,
359+
@cython.wraparound(False)
360+
def _factorize(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
361+
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
366362
object na_value=None):
363+
"""
364+
Calculate unique values and labels (no sorting); ignores all NA-values
365+
366+
Parameters
367+
----------
368+
values : ndarray[{{dtype}}]
369+
Array of values of which unique will be calculated
370+
uniques : {{name}}Vector
371+
Vector into which uniques will be written
372+
count_prior : Py_ssize_t, default 0
373+
Number of existing entries in uniques
374+
na_sentinel : Py_ssize_t, default -1
375+
Sentinel value used for all NA-values in inverse
376+
na_value : object, default None
377+
Value to identify as missing. If na_value is None, then
378+
any value satisfying val!=val are considered missing.
379+
380+
Returns
381+
-------
382+
uniques : ndarray[{{dtype}}]
383+
Unique values of input, not sorted
384+
labels : ndarray[int64]
385+
The labels from values to uniques
386+
"""
367387
cdef:
368-
Py_ssize_t i, n = len(values)
388+
Py_ssize_t i, idx, count = count_prior, n = len(values)
369389
int64_t[:] labels
370-
Py_ssize_t idx, count = count_prior
371390
int ret = 0
372391
{{dtype}}_t val, na_value2
373392
khiter_t k
@@ -399,9 +418,11 @@ cdef class {{name}}HashTable(HashTable):
399418
k = kh_get_{{dtype}}(self.table, val)
400419

401420
if k != self.table.n_buckets:
421+
# k falls into a previous bucket
402422
idx = self.table.vals[k]
403423
labels[i] = idx
404424
else:
425+
# k hasn't been seen yet
405426
k = kh_put_{{dtype}}(self.table, val, &ret)
406427
self.table.vals[k] = count
407428

@@ -418,6 +439,19 @@ cdef class {{name}}HashTable(HashTable):
418439

419440
return np.asarray(labels)
420441

442+
def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
443+
object na_value=None):
444+
uniques = {{name}}Vector()
445+
labels = self._factorize(values, uniques=uniques,
446+
na_sentinel=na_sentinel, na_value=na_value)
447+
return labels, uniques.to_array()
448+
449+
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
450+
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
451+
object na_value=None):
452+
return self._factorize(values, uniques, count_prior=count_prior,
453+
na_sentinel=na_sentinel, na_value=na_value)
454+
421455
@cython.boundscheck(False)
422456
def get_labels_groupby(self, const {{dtype}}_t[:] values):
423457
cdef:
@@ -464,7 +498,21 @@ cdef class {{name}}HashTable(HashTable):
464498
return np.asarray(labels), arr_uniques
465499

466500
@cython.boundscheck(False)
501+
@cython.wraparound(False)
467502
def unique(self, const {{dtype}}_t[:] values):
503+
"""
504+
Calculate unique values without sorting
505+
506+
Parameters
507+
----------
508+
values : ndarray[{{dtype}}]
509+
Array of values of which unique will be calculated
510+
511+
Returns
512+
-------
513+
uniques : ndarray[{{dtype}}]
514+
Unique values of input, not sorted
515+
"""
468516
cdef:
469517
Py_ssize_t i, n = len(values)
470518
int ret = 0
@@ -567,7 +615,21 @@ cdef class StringHashTable(HashTable):
567615
return labels
568616

569617
@cython.boundscheck(False)
618+
@cython.wraparound(False)
570619
def unique(self, ndarray[object] values):
620+
"""
621+
Calculate unique values without sorting
622+
623+
Parameters
624+
----------
625+
values : ndarray[object]
626+
Array of values of which unique will be calculated
627+
628+
Returns
629+
-------
630+
uniques : ndarray[object]
631+
Unique values of input, not sorted
632+
"""
571633
cdef:
572634
Py_ssize_t i, count, n = len(values)
573635
int64_t[:] uindexer
@@ -602,11 +664,6 @@ cdef class StringHashTable(HashTable):
602664
uniques.append(values[uindexer[i]])
603665
return uniques.to_array()
604666

605-
def factorize(self, ndarray[object] values):
606-
uniques = ObjectVector()
607-
labels = self.get_labels(values, uniques, 0, 0)
608-
return uniques.to_array(), labels
609-
610667
@cython.boundscheck(False)
611668
def lookup(self, ndarray[object] values):
612669
cdef:
@@ -669,34 +726,55 @@ cdef class StringHashTable(HashTable):
669726
free(vecs)
670727

671728
@cython.boundscheck(False)
672-
def get_labels(self, ndarray[object] values, ObjectVector uniques,
673-
Py_ssize_t count_prior, int64_t na_sentinel,
729+
@cython.wraparound(False)
730+
def _factorize(self, ndarray[object] values, ObjectVector uniques,
731+
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
674732
object na_value=None):
733+
"""
734+
Calculate unique values and labels (no sorting); ignores all NA-values
735+
736+
Parameters
737+
----------
738+
values : ndarray[object]
739+
Array of values of which unique will be calculated
740+
uniques : ObjectVector
741+
Vector into which uniques will be written
742+
count_prior : Py_ssize_t, default 0
743+
Number of existing entries in uniques
744+
na_sentinel : Py_ssize_t, default -1
745+
Sentinel value used for all NA-values in inverse
746+
na_value : object, default None
747+
Value to identify as missing
748+
749+
Returns
750+
-------
751+
uniques : ndarray[object]
752+
Unique values of input, not sorted
753+
labels : ndarray[int64]
754+
The labels from values to uniques
755+
"""
675756
cdef:
676-
Py_ssize_t i, n = len(values)
757+
Py_ssize_t i, idx, count = count_prior, n = len(values)
677758
int64_t[:] labels
678759
int64_t[:] uindexer
679-
Py_ssize_t idx, count = count_prior
680760
int ret = 0
681761
object val
682762
const char *v
683763
const char **vecs
684764
khiter_t k
685765
bint use_na_value
686766

687-
# these by-definition *must* be strings
688767
labels = np.zeros(n, dtype=np.int64)
689768
uindexer = np.empty(n, dtype=np.int64)
690769
use_na_value = na_value is not None
691770

692-
# pre-filter out missing
693-
# and assign pointers
771+
# assign pointers and pre-filter out missing
694772
vecs = <const char **> malloc(n * sizeof(char *))
695773
for i in range(n):
696774
val = values[i]
697775

698-
if ((PyUnicode_Check(val) or PyString_Check(val)) and
699-
not (use_na_value and val == na_value)):
776+
if ((PyUnicode_Check(val) or PyString_Check(val))
777+
and not (use_na_value and val == na_value)):
700778
v = util.get_c_string(val)
701779
vecs[i] = v
702780
else:
@@ -711,9 +789,11 @@ cdef class StringHashTable(HashTable):
711789
v = vecs[i]
712790
k = kh_get_str(self.table, v)
713791
if k != self.table.n_buckets:
792+
# k falls into a previous bucket
714793
idx = self.table.vals[k]
715794
labels[i] = <int64_t>idx
716795
else:
796+
# k hasn't been seen yet
717797
k = kh_put_str(self.table, v, &ret)
718798
self.table.vals[k] = count
719799
uindexer[count] = i
@@ -728,6 +808,19 @@ cdef class StringHashTable(HashTable):
728808

729809
return np.asarray(labels)
730810

811+
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
812+
object na_value=None):
813+
uniques = ObjectVector()
814+
labels = self._factorize(values, uniques=uniques,
815+
na_sentinel=na_sentinel, na_value=na_value)
816+
return labels, uniques.to_array()
817+
818+
def get_labels(self, ndarray[object] values, ObjectVector uniques,
819+
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
820+
object na_value=None):
821+
return self._factorize(values, uniques, count_prior=count_prior,
822+
na_sentinel=na_sentinel, na_value=na_value)
823+
731824

732825
cdef class PyObjectHashTable(HashTable):
733826

@@ -814,7 +907,22 @@ cdef class PyObjectHashTable(HashTable):
814907

815908
return np.asarray(locs)
816909

910+
@cython.boundscheck(False)
911+
@cython.wraparound(False)
817912
def unique(self, ndarray[object] values):
913+
"""
914+
Calculate unique values without sorting
915+
916+
Parameters
917+
----------
918+
values : ndarray[object]
919+
Array of values of which unique will be calculated
920+
921+
Returns
922+
-------
923+
uniques : ndarray[object]
924+
Unique values of input, not sorted
925+
"""
818926
cdef:
819927
Py_ssize_t i, n = len(values)
820928
int ret = 0
@@ -832,13 +940,38 @@ cdef class PyObjectHashTable(HashTable):
832940

833941
return uniques.to_array()
834942

835-
def get_labels(self, ndarray[object] values, ObjectVector uniques,
836-
Py_ssize_t count_prior, int64_t na_sentinel,
943+
@cython.boundscheck(False)
944+
@cython.wraparound(False)
945+
def _factorize(self, ndarray[object] values, ObjectVector uniques,
946+
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
837947
object na_value=None):
948+
"""
949+
Calculate unique values and labels (no sorting); ignores all NA-values
950+
951+
Parameters
952+
----------
953+
values : ndarray[object]
954+
Array of values of which unique will be calculated
955+
uniques : ObjectVector
956+
Vector into which uniques will be written
957+
count_prior : Py_ssize_t, default 0
958+
Number of existing entries in uniques
959+
na_sentinel : Py_ssize_t, default -1
960+
Sentinel value used for all NA-values in inverse
961+
na_value : object, default None
962+
Value to identify as missing. If na_value is None, then None _plus_
963+
any value satisfying val!=val are considered missing.
964+
965+
Returns
966+
-------
967+
uniques : ndarray[object]
968+
Unique values of input, not sorted
969+
labels : ndarray[int64]
970+
The labels from values to uniques
971+
"""
838972
cdef:
839-
Py_ssize_t i, n = len(values)
973+
Py_ssize_t i, idx, count = count_prior, n = len(values)
840974
int64_t[:] labels
841-
Py_ssize_t idx, count = count_prior
842975
int ret = 0
843976
object val
844977
khiter_t k
@@ -851,20 +984,35 @@ cdef class PyObjectHashTable(HashTable):
851984
val = values[i]
852985
hash(val)
853986

854-
if ((val != val or val is None) or
855-
(use_na_value and val == na_value)):
987+
if ((val != val or val is None)
988+
or (use_na_value and val == na_value)):
856989
labels[i] = na_sentinel
857990
continue
858991

859992
k = kh_get_pymap(self.table, <PyObject*>val)
860993
if k != self.table.n_buckets:
994+
# k falls into a previous bucket
861995
idx = self.table.vals[k]
862996
labels[i] = idx
863997
else:
998+
# k hasn't been seen yet
864999
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
8651000
self.table.vals[k] = count
8661001
uniques.append(val)
8671002
labels[i] = count
8681003
count += 1
8691004

8701005
return np.asarray(labels)
1006+
1007+
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
1008+
object na_value=None):
1009+
uniques = ObjectVector()
1010+
labels = self._factorize(values, uniques=uniques,
1011+
na_sentinel=na_sentinel, na_value=na_value)
1012+
return labels, uniques.to_array()
1013+
1014+
def get_labels(self, ndarray[object] values, ObjectVector uniques,
1015+
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
1016+
object na_value=None):
1017+
return self._factorize(values, uniques, count_prior=count_prior,
1018+
na_sentinel=na_sentinel, na_value=na_value)

pandas/core/algorithms.py

+3 -5
Original file line number | Diff line number | Diff line change
@@ -467,15 +467,13 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
467467
-------
468468
labels, uniques : ndarray
469469
"""
470-
(hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)
470+
(hash_klass, _), values = _get_data_algo(values, _hashtables)
471471

472472
table = hash_klass(size_hint or len(values))
473-
uniques = vec_klass()
474-
labels = table.get_labels(values, uniques, 0, na_sentinel,
475-
na_value=na_value)
473+
labels, uniques = table.factorize(values, na_sentinel=na_sentinel,
474+
na_value=na_value)
476475

477476
labels = ensure_platform_int(labels)
478-
uniques = uniques.to_array()
479477
return labels, uniques
480478

481479

0 commit comments

Comments
 (0)