@@ -365,7 +365,7 @@ cdef class {{name}}HashTable(HashTable):
365
365
def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
366
366
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
367
367
object na_value=None, bint ignore_na=False,
368
- bint return_inverse=False):
368
+ object mask=None, bint return_inverse=False):
369
369
"""
370
370
Calculate unique values and labels (no sorting!)
371
371
@@ -388,6 +388,10 @@ cdef class {{name}}HashTable(HashTable):
388
388
Whether NA-values should be ignored for calculating the uniques. If
389
389
True, the labels corresponding to missing values will be set to
390
390
na_sentinel.
391
+ mask : ndarray[bool], optional
392
+ If not None, the mask is used as indicator for missing values
393
+ (True = missing, False = valid) instead of `na_value` or
394
+ condition "val != val".
391
395
return_inverse : boolean, default False
392
396
Whether the mapping of the original array values to their location
393
397
in the vector of uniques should be returned.
@@ -406,12 +410,17 @@ cdef class {{name}}HashTable(HashTable):
406
410
{{dtype}}_t val, na_value2
407
411
khiter_t k
408
412
{{name}}VectorData *ud
409
- bint use_na_value
413
+ bint use_na_value, use_mask
414
+ uint8_t[:] mask_values
410
415
411
416
if return_inverse:
412
417
labels = np.empty(n, dtype=np.int64)
413
418
ud = uniques.data
414
419
use_na_value = na_value is not None
420
+ use_mask = mask is not None
421
+
422
+ if use_mask:
423
+ mask_values = mask.view("uint8")
415
424
416
425
if use_na_value:
417
426
# We need this na_value2 because we want to allow users
@@ -427,7 +436,11 @@ cdef class {{name}}HashTable(HashTable):
427
436
for i in range(n):
428
437
val = values[i]
429
438
430
- if ignore_na and (
439
+ if ignore_na and use_mask:
440
+ if mask_values[i]:
441
+ labels[i] = na_sentinel
442
+ continue
443
+ elif ignore_na and (
431
444
{{if not name.lower().startswith(("uint", "int"))}}
432
445
val != val or
433
446
{{endif}}
@@ -491,7 +504,7 @@ cdef class {{name}}HashTable(HashTable):
491
504
return_inverse=return_inverse)
492
505
493
506
def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
494
- object na_value=None):
507
+ object na_value=None, object mask=None ):
495
508
"""
496
509
Calculate unique values and labels (no sorting!)
497
510
@@ -509,6 +522,10 @@ cdef class {{name}}HashTable(HashTable):
509
522
any value "val" satisfying val != val is considered missing.
510
523
If na_value is not None, then _additionally_, any value "val"
511
524
satisfying val == na_value is considered missing.
525
+ mask : ndarray[bool], optional
526
+ If not None, the mask is used as indicator for missing values
527
+ (True = missing, False = valid) instead of `na_value` or
528
+ condition "val != val".
512
529
513
530
Returns
514
531
-------
@@ -519,7 +536,7 @@ cdef class {{name}}HashTable(HashTable):
519
536
"""
520
537
uniques_vector = {{name}}Vector()
521
538
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
522
- na_value=na_value, ignore_na=True,
539
+ na_value=na_value, ignore_na=True, mask=mask,
523
540
return_inverse=True)
524
541
525
542
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
@@ -852,7 +869,7 @@ cdef class StringHashTable(HashTable):
852
869
return_inverse=return_inverse)
853
870
854
871
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
855
- object na_value=None):
872
+ object na_value=None, object mask=None ):
856
873
"""
857
874
Calculate unique values and labels (no sorting!)
858
875
@@ -870,6 +887,8 @@ cdef class StringHashTable(HashTable):
870
887
that is not a string is considered missing. If na_value is
871
888
not None, then _additionally_ any value "val" satisfying
872
889
val == na_value is considered missing.
890
+ mask : ndarray[bool], optional
891
+ Not yet implemented for StringHashTable.
873
892
874
893
Returns
875
894
-------
@@ -1091,7 +1110,7 @@ cdef class PyObjectHashTable(HashTable):
1091
1110
return_inverse=return_inverse)
1092
1111
1093
1112
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
1094
- object na_value=None):
1113
+ object na_value=None, object mask=None ):
1095
1114
"""
1096
1115
Calculate unique values and labels (no sorting!)
1097
1116
@@ -1109,6 +1128,8 @@ cdef class PyObjectHashTable(HashTable):
1109
1128
any value "val" satisfying val != val is considered missing.
1110
1129
If na_value is not None, then _additionally_, any value "val"
1111
1130
satisfying val == na_value is considered missing.
1131
+ mask : ndarray[bool], optional
1132
+ Not yet implemented for PyObjectHashTable.
1112
1133
1113
1134
Returns
1114
1135
-------
0 commit comments