@@ -1171,11 +1171,13 @@ cdef class StringHashTable(HashTable):
1171
1171
const char **vecs
1172
1172
khiter_t k
1173
1173
bint use_na_value
1174
+ bint non_null_na_value
1174
1175
1175
1176
if return_inverse:
1176
1177
labels = np.zeros(n, dtype=np.intp)
1177
1178
uindexer = np.empty(n, dtype=np.int64)
1178
1179
use_na_value = na_value is not None
1180
+ non_null_na_value = not checknull(na_value)
1179
1181
1180
1182
# assign pointers and pre-filter out missing (if ignore_na)
1181
1183
vecs = <const char **>malloc(n * sizeof(char *))
@@ -1186,7 +1188,12 @@ cdef class StringHashTable(HashTable):
1186
1188
1187
1189
if (ignore_na
1188
1190
and (not isinstance(val, str)
1189
- or (use_na_value and val == na_value))):
1191
+ or (use_na_value and (
1192
+ (non_null_na_value and val == na_value) or
1193
+ (not non_null_na_value and is_matching_na(val, na_value)))
1194
+ )
1195
+ )
1196
+ ):
1190
1197
# if missing values do not count as unique values (i.e. if
1191
1198
# ignore_na is True), we can skip the actual value, and
1192
1199
# replace the label with na_sentinel directly
@@ -1452,18 +1459,23 @@ cdef class PyObjectHashTable(HashTable):
1452
1459
object val
1453
1460
khiter_t k
1454
1461
bint use_na_value
1455
-
1462
+ bint non_null_na_value
1456
1463
if return_inverse:
1457
1464
labels = np.empty(n, dtype=np.intp)
1458
1465
use_na_value = na_value is not None
1466
+ non_null_na_value = not checknull(na_value)
1459
1467
1460
1468
for i in range(n):
1461
1469
val = values[i]
1462
1470
hash(val)
1463
1471
1464
1472
if ignore_na and (
1465
1473
checknull(val)
1466
- or (use_na_value and val == na_value)
1474
+ or (use_na_value and (
1475
+ (non_null_na_value and val == na_value) or
1476
+ (not non_null_na_value and is_matching_na(val, na_value))
1477
+ )
1478
+ )
1467
1479
):
1468
1480
# if missing values do not count as unique values (i.e. if
1469
1481
# ignore_na is True), skip the hashtable entry for them, and
0 commit comments