From 0f950dea1d7d12604b26e13ab85b72c57f89672c Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Sun, 29 Jul 2018 22:30:20 +0200
Subject: [PATCH 1/5] CLN: Remove special handling of nans in the float64-case
 of isin

It is no longer needed because the hash table handles NaNs correctly out of the box (see GH21866).

Not having to scan the values via isna(...).any() will improve performance.
---
 pandas/_libs/hashtable_func_helper.pxi.in | 15 ++++-----------
 pandas/core/algorithms.py                 |  3 +--
 pandas/tests/test_algos.py                | 21 +++++++++++++++++++++
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
index 521e564447c59..e7f3701ee83c3 100644
--- a/pandas/_libs/hashtable_func_helper.pxi.in
+++ b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -210,10 +210,10 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'):
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
 
-def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values, bint hasnans=0):
+def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values):
 {{else}}
 
-def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
+def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values):
 {{endif}}
 
     """
@@ -224,7 +224,6 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
     ----------
     arr : {{dtype}} ndarray
     values : {{dtype}} ndarray
-    hasnans : bint, optional
 
     Returns
     -------
@@ -259,19 +258,13 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
     for i in range(n):
         val = arr[i]
         k = kh_get_{{ttype}}(table, <PyObject*> val)
-        if k != table.n_buckets:
-            result[i] = 1
-        else:
-            result[i] = hasnans and val != val
+        result[i] = (k != table.n_buckets)
     {{else}}
     with nogil:
         for i in range(n):
             val = arr[i]
             k = kh_get_{{ttype}}(table, val)
-            if k != table.n_buckets:
-                result[i] = 1
-            else:
-                result[i] = hasnans and val != val
+            result[i] = (k != table.n_buckets)
     {{endif}}
 
     kh_destroy_{{ttype}}(table)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 4bf62b021cddc..2773e7b230084 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -437,8 +437,7 @@ def isin(comps, values):
         try:
             values = values.astype('float64', copy=False)
             comps = comps.astype('float64', copy=False)
-            checknull = isna(values).any()
-            f = lambda x, y: htable.ismember_float64(x, y, checknull)
+            f = lambda x, y: htable.ismember_float64(x, y)
         except (TypeError, ValueError):
             values = values.astype(object)
             comps = comps.astype(object)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index f89c7545765c9..716b60a13624b 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -632,6 +632,27 @@ def test_different_nan_objects(self):
         result = algos.isin(comps, vals)
         tm.assert_numpy_array_equal(expected, result)
 
+    def test_different_nans_as_float64(self):
+        # create different nans from bit-patterns,
+        # these nans will land in different buckets in the hash-table
+        # if no special care is taken
+        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
+        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
+        assert NAN1 != NAN1
+        assert NAN2 != NAN2
+
+        # check that NAN1 and NAN2 are equivalent:
+        arr = np.array([NAN1, NAN2], dtype=np.float64)
+        lookup1 = np.array([NAN1], dtype=np.float64)
+        result = algos.isin(arr, lookup1)
+        expected = np.array([True, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+        lookup2 = np.array([NAN2], dtype=np.float64)
+        result = algos.isin(arr, lookup2)
+        expected = np.array([True, True])
+        tm.assert_numpy_array_equal(result, expected)
+
 
 class TestValueCounts(object):
 

From 6791a2cb09930bae7d9dec09159bcb0400858629 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Mon, 30 Jul 2018 23:42:10 +0200
Subject: [PATCH 2/5] adding performance tests for isin in combination with
 float64

---
 asv_bench/benchmarks/series_methods.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 34a8d552304d1..7e9a99ca4b2c8 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -27,16 +27,27 @@ def time_constructor(self, data):
 class IsIn(object):
 
     goal_time = 0.2
-    params = ['int64', 'object']
+    params = ['int64', 'object', 'float64']
     param_names = ['dtype']
 
     def setup(self, dtype):
         self.s = Series(np.random.randint(1, 10, 100000)).astype(dtype)
         self.values = [1, 2]
+        self.small = Series(np.random.randint(1, 10, 10)).astype(dtype)
+        self.many_different_values = np.arange(10**6).astype(dtype)
+        self.few_different_values = np.zeros(10**6).astype(dtype)
 
     def time_isin(self, dtypes):
         self.s.isin(self.values)
 
+    def time_isin_many_different(self, dtypes):
+        # runtime is dominated by creation of the lookup-table
+        self.small.isin(self.many_different_values)
+
+    def time_isin_few_different(self, dtypes):
+        # runtime is dominated by creation of the lookup-table
+        self.small.isin(self.few_different_values)
+
 
 class IsInForObjects(object):
 

From f4ab90336b36735c5bdf16fbf7dd2bc463119f17 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Mon, 30 Jul 2018 23:44:29 +0200
Subject: [PATCH 3/5] adding bug-id to unit test

---
 pandas/tests/test_algos.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 716b60a13624b..4f8f61a9884b9 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -633,6 +633,7 @@ def test_different_nan_objects(self):
         tm.assert_numpy_array_equal(expected, result)
 
     def test_different_nans_as_float64(self):
+        # GH 21866
         # create different nans from bit-patterns,
         # these nans will land in different buckets in the hash-table
         # if no special care is taken

From 9f5524d58c412c4fc00dd65f180e413a6038c5cf Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Sat, 4 Aug 2018 07:26:14 +0200
Subject: [PATCH 4/5] tweaking tests

---
 asv_bench/benchmarks/series_methods.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 7e9a99ca4b2c8..983b8f5ff3f1c 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -27,24 +27,34 @@ def time_constructor(self, data):
 class IsIn(object):
 
     goal_time = 0.2
-    params = ['int64', 'object', 'float64']
+    params = ['int64', 'object']
     param_names = ['dtype']
 
     def setup(self, dtype):
         self.s = Series(np.random.randint(1, 10, 100000)).astype(dtype)
         self.values = [1, 2]
-        self.small = Series(np.random.randint(1, 10, 10)).astype(dtype)
-        self.many_different_values = np.arange(10**6).astype(dtype)
-        self.few_different_values = np.zeros(10**6).astype(dtype)
 
     def time_isin(self, dtypes):
         self.s.isin(self.values)
 
-    def time_isin_many_different(self, dtypes):
+
+class IsInFloat64(object):
+
+    def setup(self):
+        self.small = Series(np.random.randint(1, 10, 10)).astype(dtype=np.float64)
+        self.many_different_values = np.arange(10**6, dtype=np.float64)
+        self.few_different_values = np.zeros(10**7, dtype=np.float64)      
+        self.only_nans_values = np.full(10**7, np.nan, dtype=np.float64)
+
+    def time_isin_many_different(self):
         # runtime is dominated by creation of the lookup-table
         self.small.isin(self.many_different_values)
 
-    def time_isin_few_different(self, dtypes):
+    def time_isin_few_different(self):
+        # runtime is dominated by creation of the lookup-table
+        self.small.isin(self.few_different_values)
+
+    def time_isin_nan_values(self):
         # runtime is dominated by creation of the lookup-table
-        self.small.isin(self.few_different_values)
+        self.small.isin(self.only_nans_values)
 

From 567e7bb84a7e172076e759b04d1557e5021be630 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Sat, 4 Aug 2018 11:12:58 +0200
Subject: [PATCH 5/5] pep8 problems

---
 asv_bench/benchmarks/series_methods.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 983b8f5ff3f1c..a26c5d89bc483 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -41,9 +41,9 @@ def time_isin(self, dtypes):
 class IsInFloat64(object):
 
     def setup(self):
-        self.small = Series(np.random.randint(1, 10, 10)).astype(dtype=np.float64)
+        self.small = Series([1, 2], dtype=np.float64)
         self.many_different_values = np.arange(10**6, dtype=np.float64)
-        self.few_different_values = np.zeros(10**7, dtype=np.float64)      
+        self.few_different_values = np.zeros(10**7, dtype=np.float64)
         self.only_nans_values = np.full(10**7, np.nan, dtype=np.float64)
 
     def time_isin_many_different(self):