Skip to content

Commit 0f950de

Browse files
committed
CLN: Remove special handling of nans in the float64-case of isin
It is no longer needed because the hash-table handles the nans correctly out of the box (see GH21866). Not having to scan the values via isna(...).any() will improve the performance.
1 parent 475e391 commit 0f950de

File tree

3 files changed

+26
-13
lines changed

3 files changed

+26
-13
lines changed

pandas/_libs/hashtable_func_helper.pxi.in

+4-11
Original file line numberDiff line numberDiff line change
@@ -210,10 +210,10 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'):
210210
@cython.boundscheck(False)
211211
{{if dtype == 'object'}}
212212

213-
def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values, bint hasnans=0):
213+
def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values):
214214
{{else}}
215215

216-
def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
216+
def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values):
217217
{{endif}}
218218

219219
"""
@@ -224,7 +224,6 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
224224
----------
225225
arr : {{dtype}} ndarray
226226
values : {{dtype}} ndarray
227-
hasnans : bint, optional
228227

229228
Returns
230229
-------
@@ -259,19 +258,13 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
259258
for i in range(n):
260259
val = arr[i]
261260
k = kh_get_{{ttype}}(table, <PyObject*> val)
262-
if k != table.n_buckets:
263-
result[i] = 1
264-
else:
265-
result[i] = hasnans and val != val
261+
result[i] = (k != table.n_buckets)
266262
{{else}}
267263
with nogil:
268264
for i in range(n):
269265
val = arr[i]
270266
k = kh_get_{{ttype}}(table, val)
271-
if k != table.n_buckets:
272-
result[i] = 1
273-
else:
274-
result[i] = hasnans and val != val
267+
result[i] = (k != table.n_buckets)
275268
{{endif}}
276269

277270
kh_destroy_{{ttype}}(table)

pandas/core/algorithms.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -437,8 +437,7 @@ def isin(comps, values):
437437
try:
438438
values = values.astype('float64', copy=False)
439439
comps = comps.astype('float64', copy=False)
440-
checknull = isna(values).any()
441-
f = lambda x, y: htable.ismember_float64(x, y, checknull)
440+
f = lambda x, y: htable.ismember_float64(x, y)
442441
except (TypeError, ValueError):
443442
values = values.astype(object)
444443
comps = comps.astype(object)

pandas/tests/test_algos.py

+21
Original file line numberDiff line numberDiff line change
@@ -632,6 +632,27 @@ def test_different_nan_objects(self):
632632
result = algos.isin(comps, vals)
633633
tm.assert_numpy_array_equal(expected, result)
634634

635+
def test_different_nans_as_float64(self):
636+
# create different nans from bit-patterns,
637+
# these nans will land in different buckets in the hash-table
638+
# if no special care is taken
639+
NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
640+
NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
641+
assert NAN1 != NAN1
642+
assert NAN2 != NAN2
643+
644+
# check that NAN1 and NAN2 are equivalent:
645+
arr = np.array([NAN1, NAN2], dtype=np.float64)
646+
lookup1 = np.array([NAN1], dtype=np.float64)
647+
result = algos.isin(arr, lookup1)
648+
expected = np.array([True, True])
649+
tm.assert_numpy_array_equal(result, expected)
650+
651+
lookup2 = np.array([NAN2], dtype=np.float64)
652+
result = algos.isin(arr, lookup2)
653+
expected = np.array([True, True])
654+
tm.assert_numpy_array_equal(result, expected)
655+
635656

636657
class TestValueCounts(object):
637658

0 commit comments

Comments
 (0)