@@ -355,19 +355,38 @@ cdef class {{name}}HashTable(HashTable):
355
355
356
356
return np.asarray(locs)
357
357
358
- def factorize(self, {{dtype}}_t values):
359
- uniques = {{name}}Vector()
360
- labels = self.get_labels(values, uniques, 0, 0)
361
- return uniques.to_array(), labels
362
-
363
358
@cython.boundscheck(False)
364
- def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
365
- Py_ssize_t count_prior, Py_ssize_t na_sentinel,
359
+ @cython.wraparound(False)
360
+ def _factorize(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
361
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
366
362
object na_value=None):
363
+ """
364
+ Calculate unique values and labels (no sorting); ignores all NA-values
365
+
366
+ Parameters
367
+ ----------
368
+ values : ndarray[{{dtype}}]
369
+ Array of values whose unique entries will be calculated
370
+ uniques : {{name}}Vector
371
+ Vector into which uniques will be written
372
+ count_prior : Py_ssize_t, default 0
373
+ Number of existing entries in uniques
374
+ na_sentinel : Py_ssize_t, default -1
375
+ Sentinel value used for all NA-values in inverse
376
+ na_value : object, default None
377
+ Value to identify as missing. If na_value is None, then
378
+ any value satisfying val != val is considered missing.
379
+
380
+ Returns
381
+ -------
382
+ uniques : ndarray[{{dtype}}]
383
+ Unique values of input, not sorted
384
+ labels : ndarray[int64]
385
+ The labels from values to uniques
386
+ """
367
387
cdef:
368
- Py_ssize_t i, n = len(values)
388
+ Py_ssize_t i, idx, count = count_prior, n = len(values)
369
389
int64_t[:] labels
370
- Py_ssize_t idx, count = count_prior
371
390
int ret = 0
372
391
{{dtype}}_t val, na_value2
373
392
khiter_t k
@@ -399,9 +418,11 @@ cdef class {{name}}HashTable(HashTable):
399
418
k = kh_get_{{dtype}}(self.table, val)
400
419
401
420
if k != self.table.n_buckets:
421
+ # val is already in the table; reuse its stored label
402
422
idx = self.table.vals[k]
403
423
labels[i] = idx
404
424
else:
425
+ # val hasn't been seen yet; insert it with the next label
405
426
k = kh_put_{{dtype}}(self.table, val, &ret)
406
427
self.table.vals[k] = count
407
428
@@ -418,6 +439,19 @@ cdef class {{name}}HashTable(HashTable):
418
439
419
440
return np.asarray(labels)
420
441
442
+ def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
443
+ object na_value=None):
444
+ uniques = {{name}}Vector()
445
+ labels = self._factorize(values, uniques=uniques,
446
+ na_sentinel=na_sentinel, na_value=na_value)
447
+ return labels, uniques.to_array()
448
+
449
+ def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
450
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
451
+ object na_value=None):
452
+ return self._factorize(values, uniques, count_prior=count_prior,
453
+ na_sentinel=na_sentinel, na_value=na_value)
454
+
421
455
@cython.boundscheck(False)
422
456
def get_labels_groupby(self, const {{dtype}}_t[:] values):
423
457
cdef:
@@ -464,7 +498,21 @@ cdef class {{name}}HashTable(HashTable):
464
498
return np.asarray(labels), arr_uniques
465
499
466
500
@cython.boundscheck(False)
501
+ @cython.wraparound(False)
467
502
def unique(self, const {{dtype}}_t[:] values):
503
+ """
504
+ Calculate unique values without sorting
505
+
506
+ Parameters
507
+ ----------
508
+ values : ndarray[{{dtype}}]
509
+ Array of values whose unique entries will be calculated
510
+
511
+ Returns
512
+ -------
513
+ uniques : ndarray[{{dtype}}]
514
+ Unique values of input, not sorted
515
+ """
468
516
cdef:
469
517
Py_ssize_t i, n = len(values)
470
518
int ret = 0
@@ -567,7 +615,21 @@ cdef class StringHashTable(HashTable):
567
615
return labels
568
616
569
617
@cython.boundscheck(False)
618
+ @cython.wraparound(False)
570
619
def unique(self, ndarray[object] values):
620
+ """
621
+ Calculate unique values without sorting
622
+
623
+ Parameters
624
+ ----------
625
+ values : ndarray[object]
626
+ Array of values whose unique entries will be calculated
627
+
628
+ Returns
629
+ -------
630
+ uniques : ndarray[object]
631
+ Unique values of input, not sorted
632
+ """
571
633
cdef:
572
634
Py_ssize_t i, count, n = len(values)
573
635
int64_t[:] uindexer
@@ -602,11 +664,6 @@ cdef class StringHashTable(HashTable):
602
664
uniques.append(values[uindexer[i]])
603
665
return uniques.to_array()
604
666
605
- def factorize(self, ndarray[object] values):
606
- uniques = ObjectVector()
607
- labels = self.get_labels(values, uniques, 0, 0)
608
- return uniques.to_array(), labels
609
-
610
667
@cython.boundscheck(False)
611
668
def lookup(self, ndarray[object] values):
612
669
cdef:
@@ -669,34 +726,55 @@ cdef class StringHashTable(HashTable):
669
726
free(vecs)
670
727
671
728
@cython.boundscheck(False)
672
- def get_labels(self, ndarray[object] values, ObjectVector uniques,
673
- Py_ssize_t count_prior, int64_t na_sentinel,
729
+ @cython.wraparound(False)
730
+ def _factorize(self, ndarray[object] values, ObjectVector uniques,
731
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
674
732
object na_value=None):
733
+ """
734
+ Calculate unique values and labels (no sorting); ignores all NA-values
735
+
736
+ Parameters
737
+ ----------
738
+ values : ndarray[object]
739
+ Array of values whose unique entries will be calculated
740
+ uniques : ObjectVector
741
+ Vector into which uniques will be written
742
+ count_prior : Py_ssize_t, default 0
743
+ Number of existing entries in uniques
744
+ na_sentinel : Py_ssize_t, default -1
745
+ Sentinel value used for all NA-values in inverse
746
+ na_value : object, default None
747
+ Value to identify as missing
748
+
749
+ Returns
750
+ -------
751
+ uniques : ndarray[object]
752
+ Unique values of input, not sorted
753
+ labels : ndarray[int64]
754
+ The labels from values to uniques
755
+ """
675
756
cdef:
676
- Py_ssize_t i, n = len(values)
757
+ Py_ssize_t i, idx, count = count_prior, n = len(values)
677
758
int64_t[:] labels
678
759
int64_t[:] uindexer
679
- Py_ssize_t idx, count = count_prior
680
760
int ret = 0
681
761
object val
682
762
const char *v
683
763
const char **vecs
684
764
khiter_t k
685
765
bint use_na_value
686
766
687
- # these by-definition *must* be strings
688
767
labels = np.zeros(n, dtype=np.int64)
689
768
uindexer = np.empty(n, dtype=np.int64)
690
769
use_na_value = na_value is not None
691
770
692
- # pre-filter out missing
693
- # and assign pointers
771
+ # assign pointers and pre-filter out missing
694
772
vecs = <const char **> malloc(n * sizeof(char *))
695
773
for i in range(n):
696
774
val = values[i]
697
775
698
- if ((PyUnicode_Check(val) or PyString_Check(val)) and
699
- not (use_na_value and val == na_value)):
776
+ if ((PyUnicode_Check(val) or PyString_Check(val))
777
+ and not (use_na_value and val == na_value)):
700
778
v = util.get_c_string(val)
701
779
vecs[i] = v
702
780
else:
@@ -711,9 +789,11 @@ cdef class StringHashTable(HashTable):
711
789
v = vecs[i]
712
790
k = kh_get_str(self.table, v)
713
791
if k != self.table.n_buckets:
792
+ # v is already in the table; reuse its stored label
714
793
idx = self.table.vals[k]
715
794
labels[i] = <int64_t>idx
716
795
else:
796
+ # v hasn't been seen yet; insert it with the next label
717
797
k = kh_put_str(self.table, v, &ret)
718
798
self.table.vals[k] = count
719
799
uindexer[count] = i
@@ -728,6 +808,19 @@ cdef class StringHashTable(HashTable):
728
808
729
809
return np.asarray(labels)
730
810
811
+ def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
812
+ object na_value=None):
813
+ uniques = ObjectVector()
814
+ labels = self._factorize(values, uniques=uniques,
815
+ na_sentinel=na_sentinel, na_value=na_value)
816
+ return labels, uniques.to_array()
817
+
818
+ def get_labels(self, ndarray[object] values, ObjectVector uniques,
819
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
820
+ object na_value=None):
821
+ return self._factorize(values, uniques, count_prior=count_prior,
822
+ na_sentinel=na_sentinel, na_value=na_value)
823
+
731
824
732
825
cdef class PyObjectHashTable(HashTable):
733
826
@@ -814,7 +907,22 @@ cdef class PyObjectHashTable(HashTable):
814
907
815
908
return np.asarray(locs)
816
909
910
+ @cython.boundscheck(False)
911
+ @cython.wraparound(False)
817
912
def unique(self, ndarray[object] values):
913
+ """
914
+ Calculate unique values without sorting
915
+
916
+ Parameters
917
+ ----------
918
+ values : ndarray[object]
919
+ Array of values whose unique entries will be calculated
920
+
921
+ Returns
922
+ -------
923
+ uniques : ndarray[object]
924
+ Unique values of input, not sorted
925
+ """
818
926
cdef:
819
927
Py_ssize_t i, n = len(values)
820
928
int ret = 0
@@ -832,13 +940,38 @@ cdef class PyObjectHashTable(HashTable):
832
940
833
941
return uniques.to_array()
834
942
835
- def get_labels(self, ndarray[object] values, ObjectVector uniques,
836
- Py_ssize_t count_prior, int64_t na_sentinel,
943
+ @cython.boundscheck(False)
944
+ @cython.wraparound(False)
945
+ def _factorize(self, ndarray[object] values, ObjectVector uniques,
946
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
837
947
object na_value=None):
948
+ """
949
+ Calculate unique values and labels (no sorting); ignores all NA-values
950
+
951
+ Parameters
952
+ ----------
953
+ values : ndarray[object]
954
+ Array of values whose unique entries will be calculated
955
+ uniques : ObjectVector
956
+ Vector into which uniques will be written
957
+ count_prior : Py_ssize_t, default 0
958
+ Number of existing entries in uniques
959
+ na_sentinel : Py_ssize_t, default -1
960
+ Sentinel value written to labels for all NA-values
961
+ na_value : object, default None
962
+ Value to identify as missing, in addition to None and any value
963
+ satisfying val != val, which are always treated as missing.
964
+
965
+ Returns
966
+ -------
967
+ uniques : ndarray[object]
968
+ Unique values of input, not sorted
969
+ labels : ndarray[int64]
970
+ The labels from values to uniques
971
+ """
838
972
cdef:
839
- Py_ssize_t i, n = len(values)
973
+ Py_ssize_t i, idx, count = count_prior, n = len(values)
840
974
int64_t[:] labels
841
- Py_ssize_t idx, count = count_prior
842
975
int ret = 0
843
976
object val
844
977
khiter_t k
@@ -851,20 +984,35 @@ cdef class PyObjectHashTable(HashTable):
851
984
val = values[i]
852
985
hash(val)
853
986
854
- if ((val != val or val is None) or
855
- (use_na_value and val == na_value)):
987
+ if ((val != val or val is None)
988
+ or (use_na_value and val == na_value)):
856
989
labels[i] = na_sentinel
857
990
continue
858
991
859
992
k = kh_get_pymap(self.table, <PyObject*>val)
860
993
if k != self.table.n_buckets:
994
+ # val is already in the table; reuse its stored label
861
995
idx = self.table.vals[k]
862
996
labels[i] = idx
863
997
else:
998
+ # val hasn't been seen yet; insert it with the next label
864
999
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
865
1000
self.table.vals[k] = count
866
1001
uniques.append(val)
867
1002
labels[i] = count
868
1003
count += 1
869
1004
870
1005
return np.asarray(labels)
1006
+
1007
+ def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
1008
+ object na_value=None):
1009
+ uniques = ObjectVector()
1010
+ labels = self._factorize(values, uniques=uniques,
1011
+ na_sentinel=na_sentinel, na_value=na_value)
1012
+ return labels, uniques.to_array()
1013
+
1014
+ def get_labels(self, ndarray[object] values, ObjectVector uniques,
1015
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
1016
+ object na_value=None):
1017
+ return self._factorize(values, uniques, count_prior=count_prior,
1018
+ na_sentinel=na_sentinel, na_value=na_value)
0 commit comments