diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 34a8d552304d1..a26c5d89bc483 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -38,6 +38,29 @@ def time_isin(self, dtypes):
         self.s.isin(self.values)
 
 
+class IsInFloat64(object):
+    # Series.isin on a two-element float64 Series against large float64
+    # lookup arrays: many distinct values, a single repeated value, all NaN.
+
+    def setup(self):
+        self.small = Series([1, 2], dtype=np.float64)
+        self.many_different_values = np.arange(10**6, dtype=np.float64)
+        self.few_different_values = np.zeros(10**7, dtype=np.float64)
+        self.only_nans_values = np.full(10**7, np.nan, dtype=np.float64)
+
+    def time_isin_many_different(self):
+        # runtime is dominated by creation of the lookup-table
+        self.small.isin(self.many_different_values)
+
+    def time_isin_few_different(self):
+        # runtime is dominated by creation of the lookup-table
+        self.small.isin(self.few_different_values)
+
+    def time_isin_nan_values(self):
+        # runtime is dominated by creation of the lookup-table
+        self.small.isin(self.only_nans_values)
+
+
 class IsInForObjects(object):
 
     def setup(self):
diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
index 521e564447c59..e7f3701ee83c3 100644
--- a/pandas/_libs/hashtable_func_helper.pxi.in
+++ b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -210,10 +210,10 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'):
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
 
-def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values, bint hasnans=0):
+def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values):
 {{else}}
 
-def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
+def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values):
 {{endif}}
 
     """
@@ -224,7 +224,6 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
     ----------
     arr : {{dtype}} ndarray
     values : {{dtype}} ndarray
-    hasnans : bint, optional
 
     Returns
     -------
@@ -259,19 +258,13 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
     for i in range(n):
         val = arr[i]
         k = kh_get_{{ttype}}(table, val)
-        if k != table.n_buckets:
-            result[i] = 1
-        else:
-            result[i] = hasnans and val != val
+        result[i] = (k != table.n_buckets)
     {{else}}
     with nogil:
         for i in range(n):
             val = arr[i]
             k = kh_get_{{ttype}}(table, val)
-            if k != table.n_buckets:
-                result[i] = 1
-            else:
-                result[i] = hasnans and val != val
+            result[i] = (k != table.n_buckets)
     {{endif}}
 
     kh_destroy_{{ttype}}(table)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 4bf62b021cddc..2773e7b230084 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -437,8 +437,7 @@ def isin(comps, values):
         try:
             values = values.astype('float64', copy=False)
             comps = comps.astype('float64', copy=False)
-            checknull = isna(values).any()
-            f = lambda x, y: htable.ismember_float64(x, y, checknull)
+            f = lambda x, y: htable.ismember_float64(x, y)
         except (TypeError, ValueError):
             values = values.astype(object)
             comps = comps.astype(object)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index f89c7545765c9..4f8f61a9884b9 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -632,6 +632,28 @@ def test_different_nan_objects(self):
         result = algos.isin(comps, vals)
         tm.assert_numpy_array_equal(expected, result)
 
+    def test_different_nans_as_float64(self):
+        # GH 21866
+        # create different nans from bit-patterns,
+        # these nans will land in different buckets in the hash-table
+        # if no special care is taken
+        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
+        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
+        assert NAN1 != NAN1
+        assert NAN2 != NAN2
+
+        # check that NAN1 and NAN2 are equivalent:
+        arr = np.array([NAN1, NAN2], dtype=np.float64)
+        lookup1 = np.array([NAN1], dtype=np.float64)
+        result = algos.isin(arr, lookup1)
+        expected = np.array([True, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+        lookup2 = np.array([NAN2], dtype=np.float64)
+        result = algos.isin(arr, lookup2)
+        expected = np.array([True, True])
+        tm.assert_numpy_array_equal(result, expected)
+
 
 class TestValueCounts(object):
 