@@ -377,12 +377,13 @@ cdef class Int64HashTable(HashTable):
377
377
378
378
def factorize (self , ndarray[object] values ):
379
379
reverse = {}
380
- labels = self .get_labels(values, reverse, 0 )
380
+ labels = self .get_labels(values, reverse, 0 , 0 )
381
381
return reverse, labels
382
382
383
383
@ cython.boundscheck (False )
384
384
def get_labels (self , int64_t[:] values , Int64Vector uniques ,
385
- Py_ssize_t count_prior , Py_ssize_t na_sentinel ):
385
+ Py_ssize_t count_prior , Py_ssize_t na_sentinel ,
386
+ bint check_null = True ):
386
387
cdef:
387
388
Py_ssize_t i, n = len (values)
388
389
int64_t[:] labels
@@ -399,6 +400,11 @@ cdef class Int64HashTable(HashTable):
399
400
for i in range (n):
400
401
val = values[i]
401
402
k = kh_get_int64(self .table, val)
403
+
404
+ if check_null and val == iNaT:
405
+ labels[i] = na_sentinel
406
+ continue
407
+
402
408
if k != self .table.n_buckets:
403
409
idx = self .table.vals[k]
404
410
labels[i] = idx
@@ -525,13 +531,14 @@ cdef class Float64HashTable(HashTable):
525
531
526
532
def factorize (self , float64_t[:] values ):
527
533
uniques = Float64Vector()
528
- labels = self .get_labels(values, uniques, 0 , - 1 )
534
+ labels = self .get_labels(values, uniques, 0 , - 1 , 1 )
529
535
return uniques.to_array(), labels
530
536
531
537
@ cython.boundscheck (False )
532
538
def get_labels (self , float64_t[:] values ,
533
- Float64Vector uniques ,
534
- Py_ssize_t count_prior , int64_t na_sentinel ):
539
+ Float64Vector uniques ,
540
+ Py_ssize_t count_prior , int64_t na_sentinel ,
541
+ bint check_null = True ):
535
542
cdef:
536
543
Py_ssize_t i, n = len (values)
537
544
int64_t[:] labels
@@ -548,7 +555,7 @@ cdef class Float64HashTable(HashTable):
548
555
for i in range (n):
549
556
val = values[i]
550
557
551
- if val != val:
558
+ if check_null and val != val:
552
559
labels[i] = na_sentinel
553
560
continue
554
561
@@ -762,7 +769,8 @@ cdef class PyObjectHashTable(HashTable):
762
769
return uniques.to_array()
763
770
764
771
def get_labels (self , ndarray[object] values , ObjectVector uniques ,
765
- Py_ssize_t count_prior , int64_t na_sentinel ):
772
+ Py_ssize_t count_prior , int64_t na_sentinel ,
773
+ bint check_null = True ):
766
774
cdef:
767
775
Py_ssize_t i, n = len (values)
768
776
int64_t[:] labels
@@ -777,7 +785,7 @@ cdef class PyObjectHashTable(HashTable):
777
785
val = values[i]
778
786
hash (val)
779
787
780
- if val != val or val is None :
788
+ if check_null and val != val or val is None :
781
789
labels[i] = na_sentinel
782
790
continue
783
791
@@ -808,14 +816,15 @@ cdef class Factorizer:
808
816
def get_count (self ):
809
817
return self .count
810
818
811
- def factorize (self , ndarray[object] values , sort = False , na_sentinel = - 1 ):
819
+ def factorize (self , ndarray[object] values , sort = False , na_sentinel = - 1 ,
820
+ check_null = True ):
812
821
"""
813
822
Factorize values with nans replaced by na_sentinel
814
823
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
815
824
array([ 0, 1, 20])
816
825
"""
817
826
labels = self .table.get_labels(values, self .uniques,
818
- self .count, na_sentinel)
827
+ self .count, na_sentinel, check_null )
819
828
mask = (labels == na_sentinel)
820
829
# sort on
821
830
if sort:
@@ -848,9 +857,10 @@ cdef class Int64Factorizer:
848
857
return self .count
849
858
850
859
def factorize (self , int64_t[:] values , sort = False ,
851
- na_sentinel = - 1 ):
860
+ na_sentinel = - 1 , check_null = True ):
852
861
labels = self .table.get_labels(values, self .uniques,
853
- self .count, na_sentinel)
862
+ self .count, na_sentinel,
863
+ check_null)
854
864
855
865
# sort on
856
866
if sort:
0 commit comments