Skip to content

Commit c393015

Browse files
committed
CLN: Remove special handling of nans in the float64-case of isin
It is no longer needed because the hash table handles NaNs correctly out of the box (see GH21866). Not having to scan the values via isna(...).any() will improve performance.
1 parent d30c4a0 commit c393015

File tree

3 files changed

+26
-13
lines changed

3 files changed

+26
-13
lines changed

pandas/_libs/hashtable_func_helper.pxi.in

+4-11
Original file line numberDiff line numberDiff line change
@@ -210,10 +210,10 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'):
210210
@cython.boundscheck(False)
211211
{{if dtype == 'object'}}
212212

213-
def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values, bint hasnans=0):
213+
def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values):
214214
{{else}}
215215

216-
def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
216+
def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values):
217217
{{endif}}
218218

219219
"""
@@ -224,7 +224,6 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
224224
----------
225225
arr : {{dtype}} ndarray
226226
values : {{dtype}} ndarray
227-
hasnans : bint, optional
228227

229228
Returns
230229
-------
@@ -259,19 +258,13 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
259258
for i in range(n):
260259
val = arr[i]
261260
k = kh_get_{{ttype}}(table, <PyObject*> val)
262-
if k != table.n_buckets:
263-
result[i] = 1
264-
else:
265-
result[i] = hasnans and val != val
261+
result[i] = (k != table.n_buckets)
266262
{{else}}
267263
with nogil:
268264
for i in range(n):
269265
val = arr[i]
270266
k = kh_get_{{ttype}}(table, val)
271-
if k != table.n_buckets:
272-
result[i] = 1
273-
else:
274-
result[i] = hasnans and val != val
267+
result[i] = (k != table.n_buckets)
275268
{{endif}}
276269

277270
kh_destroy_{{ttype}}(table)

pandas/core/algorithms.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -437,8 +437,7 @@ def isin(comps, values):
437437
try:
438438
values = values.astype('float64', copy=False)
439439
comps = comps.astype('float64', copy=False)
440-
checknull = isna(values).any()
441-
f = lambda x, y: htable.ismember_float64(x, y, checknull)
440+
f = lambda x, y: htable.ismember_float64(x, y)
442441
except (TypeError, ValueError):
443442
values = values.astype(object)
444443
comps = comps.astype(object)

pandas/tests/test_algos.py

+21
Original file line numberDiff line numberDiff line change
@@ -623,6 +623,27 @@ def test_empty(self, empty):
623623
result = algos.isin(vals, empty)
624624
tm.assert_numpy_array_equal(expected, result)
625625

626+
def test_different_nans_as_float64(self):
627+
# create different nans from bit-patterns,
628+
# these nans will land in different buckets in the hash-table
629+
# if no special care is taken
630+
NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
631+
NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
632+
assert NAN1 != NAN1
633+
assert NAN2 != NAN2
634+
635+
# check that NAN1 and NAN2 are equivalent:
636+
arr = np.array([NAN1, NAN2], dtype=np.float64)
637+
lookup1 = np.array([NAN1], dtype=np.float64)
638+
result = algos.isin(arr, lookup1)
639+
expected = np.array([True, True])
640+
tm.assert_numpy_array_equal(result, expected)
641+
642+
lookup2 = np.array([NAN2], dtype=np.float64)
643+
result = algos.isin(arr, lookup2)
644+
expected = np.array([True, True])
645+
tm.assert_numpy_array_equal(result, expected)
646+
626647

627648
class TestValueCounts(object):
628649

0 commit comments

Comments
 (0)