Skip to content

Commit 1dcac5c

Browse files
realead authored and victor committed
CLN: Remove special handling of nans in the float64-case of isin (pandas-dev#22117)
1 parent afe0011 commit 1dcac5c

File tree

4 files changed

+48
-13
lines changed

4 files changed

+48
-13
lines changed

asv_bench/benchmarks/series_methods.py

+21
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,27 @@ def time_isin(self, dtypes):
3838
self.s.isin(self.values)
3939

4040

41+
class IsInFloat64(object):
    """Benchmarks for ``Series.isin`` on float64 data.

    A two-element float64 Series is checked against three large float64
    lookup arrays: one with many distinct values, one holding a single
    repeated value (zeros), and one consisting solely of NaNs.
    """

    def setup(self):
        """Build the small probe Series and the three lookup arrays."""
        self.small = Series([1, 2], dtype=np.float64)
        self.many_different_values = np.arange(10**6, dtype=np.float64)
        self.few_different_values = np.zeros(10**7, dtype=np.float64)
        self.only_nans_values = np.full(10**7, np.nan, dtype=np.float64)

    def time_isin_many_different(self):
        """Time ``isin`` against a lookup array of distinct values."""
        # runtime is dominated by creation of the lookup-table
        self.small.isin(self.many_different_values)

    def time_isin_few_different(self):
        """Time ``isin`` against a lookup array of one repeated value."""
        # runtime is dominated by creation of the lookup-table
        self.small.isin(self.few_different_values)

    def time_isin_nan_values(self):
        """Time ``isin`` against a lookup array that contains only NaNs."""
        # runtime is dominated by creation of the lookup-table
        # Use the all-NaN array here; passing ``few_different_values``
        # would merely repeat ``time_isin_few_different``.
        self.small.isin(self.only_nans_values)
60+
61+
4162
class IsInForObjects(object):
4263

4364
def setup(self):

pandas/_libs/hashtable_func_helper.pxi.in

+4-11
Original file line numberDiff line numberDiff line change
@@ -210,10 +210,10 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'):
210210
@cython.boundscheck(False)
211211
{{if dtype == 'object'}}
212212

213-
def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values, bint hasnans=0):
213+
def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values):
214214
{{else}}
215215

216-
def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
216+
def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values):
217217
{{endif}}
218218

219219
"""
@@ -224,7 +224,6 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
224224
----------
225225
arr : {{dtype}} ndarray
226226
values : {{dtype}} ndarray
227-
hasnans : bint, optional
228227

229228
Returns
230229
-------
@@ -259,19 +258,13 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
259258
for i in range(n):
260259
val = arr[i]
261260
k = kh_get_{{ttype}}(table, <PyObject*> val)
262-
if k != table.n_buckets:
263-
result[i] = 1
264-
else:
265-
result[i] = hasnans and val != val
261+
result[i] = (k != table.n_buckets)
266262
{{else}}
267263
with nogil:
268264
for i in range(n):
269265
val = arr[i]
270266
k = kh_get_{{ttype}}(table, val)
271-
if k != table.n_buckets:
272-
result[i] = 1
273-
else:
274-
result[i] = hasnans and val != val
267+
result[i] = (k != table.n_buckets)
275268
{{endif}}
276269

277270
kh_destroy_{{ttype}}(table)

pandas/core/algorithms.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -437,8 +437,7 @@ def isin(comps, values):
437437
try:
438438
values = values.astype('float64', copy=False)
439439
comps = comps.astype('float64', copy=False)
440-
checknull = isna(values).any()
441-
f = lambda x, y: htable.ismember_float64(x, y, checknull)
440+
f = lambda x, y: htable.ismember_float64(x, y)
442441
except (TypeError, ValueError):
443442
values = values.astype(object)
444443
comps = comps.astype(object)

pandas/tests/test_algos.py

+22
Original file line numberDiff line numberDiff line change
@@ -632,6 +632,28 @@ def test_different_nan_objects(self):
632632
result = algos.isin(comps, vals)
633633
tm.assert_numpy_array_equal(expected, result)
634634

635+
def test_different_nans_as_float64(self):
    """Check that ``isin`` treats NaNs with different bit patterns alike."""
    # GH 21866
    # Build two NaNs directly from their bit patterns; without special
    # care they would end up in different buckets of the hash-table.
    quiet_nan, payload_nan = [
        struct.unpack("d", struct.pack("=Q", bits))[0]
        for bits in (0x7ff8000000000000, 0x7ff8000000000001)
    ]
    for nan in (quiet_nan, payload_nan):
        # both values must really be NaN (NaN never equals itself)
        assert nan != nan

    # Whichever NaN is used as the lookup value, both NaNs must match.
    arr = np.array([quiet_nan, payload_nan], dtype=np.float64)
    for needle in (quiet_nan, payload_nan):
        lookup = np.array([needle], dtype=np.float64)
        result = algos.isin(arr, lookup)
        expected = np.array([True, True])
        tm.assert_numpy_array_equal(result, expected)
656+
635657

636658
class TestValueCounts(object):
637659

0 commit comments

Comments
 (0)