@@ -521,7 +521,7 @@ cdef class {{name}}HashTable(HashTable):
521
521
def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
522
522
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
523
523
object na_value=None, bint ignore_na=False,
524
- object mask=None, bint return_inverse=False):
524
+ object mask=None, bint return_inverse=False, bint use_result_mask=False ):
525
525
"""
526
526
Calculate unique values and labels (no sorting!)
527
527
@@ -551,13 +551,18 @@ cdef class {{name}}HashTable(HashTable):
551
551
return_inverse : bool, default False
552
552
Whether the mapping of the original array values to their location
553
553
in the vector of uniques should be returned.
554
+ use_result_mask : bool, default False
555
+ Whether to create a result mask for the unique values. Not supported
556
+ with return_inverse=True.
554
557
555
558
Returns
556
559
-------
557
560
uniques : ndarray[{{dtype}}]
558
561
Unique values of input, not sorted
559
562
labels : ndarray[intp_t] (if return_inverse=True)
560
563
The labels from values to uniques
564
+ result_mask : ndarray[bool] (if use_result_mask=True)
565
+ The mask for the result values.
561
566
"""
562
567
cdef:
563
568
Py_ssize_t i, idx, count = count_prior, n = len(values)
@@ -566,14 +571,24 @@ cdef class {{name}}HashTable(HashTable):
566
571
{{c_type}} val, na_value2
567
572
khiter_t k
568
573
{{name}}VectorData *ud
569
- bint use_na_value, use_mask
574
+ UInt8Vector result_mask
575
+ UInt8VectorData *rmd
576
+ bint use_na_value, use_mask, seen_na = False
570
577
uint8_t[:] mask_values
571
578
572
579
if return_inverse:
573
580
labels = np.empty(n, dtype=np.intp)
574
581
ud = uniques.data
575
582
use_na_value = na_value is not None
576
583
use_mask = mask is not None
584
+ if not use_mask and use_result_mask:
585
+ raise NotImplementedError # pragma: no cover
586
+
587
+ if use_result_mask and return_inverse:
588
+ raise NotImplementedError # pragma: no cover
589
+
590
+ result_mask = UInt8Vector()
591
+ rmd = result_mask.data
577
592
578
593
if use_mask:
579
594
mask_values = mask.view("uint8")
@@ -605,6 +620,27 @@ cdef class {{name}}HashTable(HashTable):
605
620
# and replace the corresponding label with na_sentinel
606
621
labels[i] = na_sentinel
607
622
continue
623
+ elif not ignore_na and use_result_mask:
624
+ if mask_values[i]:
625
+ if seen_na:
626
+ continue
627
+
628
+ seen_na = True
629
+ if needs_resize(ud):
630
+ with gil:
631
+ if uniques.external_view_exists:
632
+ raise ValueError("external reference to "
633
+ "uniques held, but "
634
+ "Vector.resize() needed")
635
+ uniques.resize()
636
+ if result_mask.external_view_exists:
637
+ raise ValueError("external reference to "
638
+ "result_mask held, but "
639
+ "Vector.resize() needed")
640
+ result_mask.resize()
641
+ append_data_{{dtype}}(ud, val)
642
+ append_data_uint8(rmd, 1)
643
+ continue
608
644
609
645
k = kh_get_{{dtype}}(self.table, val)
610
646
@@ -619,7 +655,16 @@ cdef class {{name}}HashTable(HashTable):
619
655
"uniques held, but "
620
656
"Vector.resize() needed")
621
657
uniques.resize()
658
+ if use_result_mask:
659
+ if result_mask.external_view_exists:
660
+ raise ValueError("external reference to "
661
+ "result_mask held, but "
662
+ "Vector.resize() needed")
663
+ result_mask.resize()
622
664
append_data_{{dtype}}(ud, val)
665
+ if use_result_mask:
666
+ append_data_uint8(rmd, 0)
667
+
623
668
if return_inverse:
624
669
self.table.vals[k] = count
625
670
labels[i] = count
@@ -632,9 +677,11 @@ cdef class {{name}}HashTable(HashTable):
632
677
633
678
if return_inverse:
634
679
return uniques.to_array(), labels.base # .base -> underlying ndarray
680
+ if use_result_mask:
681
+ return uniques.to_array(), result_mask.to_array()
635
682
return uniques.to_array()
636
683
637
- def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
684
+ def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None ):
638
685
"""
639
686
Calculate unique values and labels (no sorting!)
640
687
@@ -645,17 +692,23 @@ cdef class {{name}}HashTable(HashTable):
645
692
return_inverse : bool, default False
646
693
Whether the mapping of the original array values to their location
647
694
in the vector of uniques should be returned.
695
+ mask : ndarray[bool], optional
696
+ If not None, the mask is used as indicator for missing values
697
+ (True = missing, False = valid) instead of `na_value` or the condition "val != val".
648
698
649
699
Returns
650
700
-------
651
701
uniques : ndarray[{{dtype}}]
652
702
Unique values of input, not sorted
653
703
labels : ndarray[intp_t] (if return_inverse)
654
704
The labels from values to uniques
705
+ result_mask : ndarray[bool] (if mask is given as input)
706
+ The mask for the result values.
655
707
"""
656
708
uniques = {{name}}Vector()
709
+ use_result_mask = True if mask is not None else False
657
710
return self._unique(values, uniques, ignore_na=False,
658
- return_inverse=return_inverse)
711
+ return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask )
659
712
660
713
def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
661
714
object na_value=None, object mask=None):
@@ -1013,7 +1066,7 @@ cdef class StringHashTable(HashTable):
1013
1066
return uniques.to_array(), labels.base # .base -> underlying ndarray
1014
1067
return uniques.to_array()
1015
1068
1016
- def unique(self, ndarray[object] values, bint return_inverse=False):
1069
+ def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None ):
1017
1070
"""
1018
1071
Calculate unique values and labels (no sorting!)
1019
1072
@@ -1024,6 +1077,8 @@ cdef class StringHashTable(HashTable):
1024
1077
return_inverse : bool, default False
1025
1078
Whether the mapping of the original array values to their location
1026
1079
in the vector of uniques should be returned.
1080
+ mask : ndarray[bool], optional
1081
+ Not yet implemented for StringHashTable
1027
1082
1028
1083
Returns
1029
1084
-------
@@ -1266,7 +1321,7 @@ cdef class PyObjectHashTable(HashTable):
1266
1321
return uniques.to_array(), labels.base # .base -> underlying ndarray
1267
1322
return uniques.to_array()
1268
1323
1269
- def unique(self, ndarray[object] values, bint return_inverse=False):
1324
+ def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None ):
1270
1325
"""
1271
1326
Calculate unique values and labels (no sorting!)
1272
1327
@@ -1277,6 +1332,8 @@ cdef class PyObjectHashTable(HashTable):
1277
1332
return_inverse : bool, default False
1278
1333
Whether the mapping of the original array values to their location
1279
1334
in the vector of uniques should be returned.
1335
+ mask : ndarray[bool], optional
1336
+ Not yet implemented for PyObjectHashTable
1280
1337
1281
1338
Returns
1282
1339
-------
0 commit comments