From 1663bb2a973bc3099eb1a9532ca8c6030879fac3 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Thu, 24 Sep 2020 20:38:48 +0200
Subject: [PATCH 01/14] PERF: always use hash-map for isin rather than
 np.in1d

---
 pandas/core/algorithms.py | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index ec88eb817b3f8..8a00840f3b596 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -433,19 +433,9 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
     comps, dtype = _ensure_data(comps)
     values, _ = _ensure_data(values, dtype=dtype)
 
-    # faster for larger cases to use np.in1d
     f = htable.ismember_object
 
-    # GH16012
-    # Ensure np.in1d doesn't get object types or it *may* throw an exception
-    if len(comps) > 1_000_000 and not is_object_dtype(comps):
-        # If the the values include nan we need to check for nan explicitly
-        # since np.nan it not equal to np.nan
-        if isna(values).any():
-            f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c))
-        else:
-            f = np.in1d
-    elif is_integer_dtype(comps):
+    if is_integer_dtype(comps):
         try:
             values = values.astype("int64", copy=False)
             comps = comps.astype("int64", copy=False)

From f543f1596c44df1d878f0f13f82f44e8f11a6e61 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Thu, 24 Sep 2020 20:42:56 +0200
Subject: [PATCH 02/14] adding benchmark for isin with many elements

---
 asv_bench/benchmarks/series_methods.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 258c29c145721..7bbc8c0918b68 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -90,6 +90,18 @@ def time_isin_long_series_long_values_floats(self):
         self.s_long_floats.isin(self.vals_long_floats)
 
 
+class IsInLongSeries(object):
+    params = [['int64', 'int32', 'float64', 'float32'], [1, 2, 5, 10, 1000, 10**5]]
+    param_names = ['dtype', 'M']
+
+    def setup(self, dtype, M):
+        self.s = Series(np.arange(10**7)).astype(dtype)
+        self.values = np.arange(M).astype(dtype)
+
+    def time_isin(self, dtypes, M):
+        self.s.isin(self.values)
+
+
 class NSort:
 
     params = ["first", "last", "all"]

From 48fe31eba55ebc1a441063b434c802144a586787 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Thu, 24 Sep 2020 23:05:04 +0200
Subject: [PATCH 03/14] use random order of elements rather than sorted: for
 sorted elements the search in the sorted arrays has too few cache misses
 to be representative

---
 asv_bench/benchmarks/series_methods.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 7bbc8c0918b68..1153152b864bd 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -91,11 +91,11 @@ def time_isin_long_series_long_values_floats(self):
 
 
 class IsInLongSeries(object):
-    params = [['int64', 'int32', 'float64', 'float32'], [1, 2, 5, 10, 1000, 10**5]]
-    param_names = ['dtype', 'M']
+    params = [["int64", "int32", "float64", "float32"], [1, 2, 5, 10, 1000, 10 ** 5]]
+    param_names = ["dtype", "M"]
 
     def setup(self, dtype, M):
-        self.s = Series(np.arange(10**7)).astype(dtype)
+        self.s = Series(np.random.randint(0, M, 10 ** 7)).astype(dtype)
         self.values = np.arange(M).astype(dtype)
 
     def time_isin(self, dtypes, M):

From e8a02ae4f031100edd2974fb1fd6b7eacacc7dac Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Sat, 26 Sep 2020 16:18:51 +0200
Subject: [PATCH 04/14] test also PyObject

---
 asv_bench/benchmarks/series_methods.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 1153152b864bd..438a13d53ccb8 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -90,8 +90,11 @@ def time_isin_long_series_long_values_floats(self):
         self.s_long_floats.isin(self.vals_long_floats)
 
 
-class IsInLongSeries(object):
-    params = [["int64", "int32", "float64", "float32"], [1, 2, 5, 10, 1000, 10 ** 5]]
+class IsInLongSeries:
+    params = [
+        ["int64", "int32", "float64", "float32", "object"],
+        [1, 2, 5, 10, 1000, 10 ** 5],
+    ]
     param_names = ["dtype", "M"]
 
     def setup(self, dtype, M):

From 1cd54f0350143a19928e31af7a23d567e5d1376b Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Fri, 13 Nov 2020 22:48:00 +0100
Subject: [PATCH 05/14] comps->comps.dtype

---
 pandas/core/algorithms.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 8a00840f3b596..8ce26537d72ca 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -435,7 +435,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
 
     f = htable.ismember_object
 
-    if is_integer_dtype(comps):
+    if is_integer_dtype(comps.dtype):
         try:
             values = values.astype("int64", copy=False)
             comps = comps.astype("int64", copy=False)
@@ -444,7 +444,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
             values = values.astype(object)
             comps = comps.astype(object)
 
-    elif is_float_dtype(comps):
+    elif is_float_dtype(comps.dtype):
         try:
             values = values.astype("float64", copy=False)
             comps = comps.astype("float64", copy=False)

From 1af1a349b9841da4228d4cc86356b7fcf9d9f87f Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Fri, 13 Nov 2020 23:01:38 +0100
Subject: [PATCH 06/14] using longer names for variables

---
 asv_bench/benchmarks/series_methods.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 438a13d53ccb8..9bc678258cfec 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -95,14 +95,14 @@ class IsInLongSeries:
         ["int64", "int32", "float64", "float32", "object"],
         [1, 2, 5, 10, 1000, 10 ** 5],
     ]
-    param_names = ["dtype", "M"]
+    param_names = ["dtype", "MaxNumber"]
 
-    def setup(self, dtype, M):
-        self.s = Series(np.random.randint(0, M, 10 ** 7)).astype(dtype)
-        self.values = np.arange(M).astype(dtype)
+    def setup(self, dtype, MaxNumber):
+        self.series = Series(np.random.randint(0, MaxNumber, 10 ** 7)).astype(dtype)
+        self.values = np.arange(MaxNumber).astype(dtype)
 
-    def time_isin(self, dtypes, M):
-        self.s.isin(self.values)
+    def time_isin(self, dtypes, MaxNumber):
+        self.series.isin(self.values)
 
 
 class NSort:

From eb858c2f1eaf195ddf775939353b1d839f95f107 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Fri, 13 Nov 2020 23:18:54 +0100
Subject: [PATCH 07/14] adding further types of series for asv-tests

---
 asv_bench/benchmarks/series_methods.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 9bc678258cfec..42d44f3f85b20 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -93,15 +93,25 @@ def time_isin_long_series_long_values_floats(self):
 class IsInLongSeries:
     params = [
         ["int64", "int32", "float64", "float32", "object"],
-        [1, 2, 5, 10, 1000, 10 ** 5],
+        [1, 2, 5, 10, 50, 100, 1000, 10 ** 5],
+        ["random_hits", "random_misses", "monotone"],
     ]
-    param_names = ["dtype", "MaxNumber"]
-
-    def setup(self, dtype, MaxNumber):
-        self.series = Series(np.random.randint(0, MaxNumber, 10 ** 7)).astype(dtype)
+    param_names = ["dtype", "MaxNumber", "series_type"]
+
+    def setup(self, dtype, MaxNumber, series_type):
+        N = 10 ** 7
+        if series_type == "random_hits":
+            np.random.seed(42)
+            array = np.random.randint(0, MaxNumber, N)
+        if series_type == "random_misses":
+            np.random.seed(42)
+            array = np.random.randint(0, MaxNumber, N) + MaxNumber
+        if series_type == "monotone":
+            array = np.repeat(np.arange(MaxNumber), N // MaxNumber)
+        self.series = Series(array).astype(dtype)
         self.values = np.arange(MaxNumber).astype(dtype)
 
-    def time_isin(self, dtypes, MaxNumber):
+    def time_isin(self, dtypes, MaxNumber, series_type):
         self.series.isin(self.values)
 
 

From 3d781bebaead1c79fed56b37f49bbd556cf960d4 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Sat, 14 Nov 2020 12:07:10 +0100
Subject: [PATCH 08/14] adding more test cases

---
 asv_bench/benchmarks/series_methods.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 42d44f3f85b20..54c719282ed1f 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -94,7 +94,7 @@ class IsInLongSeries:
     params = [
         ["int64", "int32", "float64", "float32", "object"],
         [1, 2, 5, 10, 50, 100, 1000, 10 ** 5],
-        ["random_hits", "random_misses", "monotone"],
+        ["random_hits", "random_misses", "monotone_hits", "monotone_misses"],
     ]
     param_names = ["dtype", "MaxNumber", "series_type"]
 
@@ -106,8 +106,10 @@ def setup(self, dtype, MaxNumber, series_type):
         if series_type == "random_misses":
             np.random.seed(42)
             array = np.random.randint(0, MaxNumber, N) + MaxNumber
-        if series_type == "monotone":
+        if series_type == "monotone_hits":
             array = np.repeat(np.arange(MaxNumber), N // MaxNumber)
+        if series_type == "monotone_misses":
+            array = np.arange(N) + MaxNumber
         self.series = Series(array).astype(dtype)
         self.values = np.arange(MaxNumber).astype(dtype)
 

From ef49ca39cb53537762a33dcab434af2b30a0c7f1 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Sun, 15 Nov 2020 12:38:43 +0100
Subject: [PATCH 09/14] adding values-dominate asv-tests

---
 asv_bench/benchmarks/series_methods.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 54c719282ed1f..0882ecc3fe5d0 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -90,10 +90,10 @@ def time_isin_long_series_long_values_floats(self):
         self.s_long_floats.isin(self.vals_long_floats)
 
 
-class IsInLongSeries:
+class IsInLongSeriesLookUpDominates:
     params = [
         ["int64", "int32", "float64", "float32", "object"],
-        [1, 2, 5, 10, 50, 100, 1000, 10 ** 5],
+        [1, 2, 5, 10, 16, 50, 100, 1000, 10 ** 5],
         ["random_hits", "random_misses", "monotone_hits", "monotone_misses"],
     ]
     param_names = ["dtype", "MaxNumber", "series_type"]
@@ -117,6 +117,28 @@ def time_isin(self, dtypes, MaxNumber, series_type):
         self.series.isin(self.values)
 
 
+class IsInLongSeriesValuesDominate:
+    params = [
+        ["int64", "int32", "float64", "float32", "object"],
+        ["random", "monotone"],
+    ]
+    param_names = ["dtype", "series_type"]
+
+    def setup(self, dtype, series_type):
+        N = 10 ** 7
+        if series_type == "random":
+            np.random.seed(42)
+            vals = np.random.randint(0, 10 * N, N)
+        if series_type == "monotone":
+            vals = np.arange(N)
+        self.values = vals.astype(dtype)
+        M = 10 ** 6 + 1
+        self.series = Series(np.arange(M)).astype(dtype)
+
+    def time_isin(self, dtypes, series_type):
+        self.series.isin(self.values)
+
+
 class NSort:
 
     params = ["first", "last", "all"]

From 303e6ac81362b8aa48060971987820b75997509b Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Sun, 15 Nov 2020 14:22:03 +0100
Subject: [PATCH 10/14] add comment

---
 pandas/core/algorithms.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 8ce26537d72ca..b28033c0153db 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -435,6 +435,10 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
 
     f = htable.ismember_object
 
+    # an alternative is to use np.in1d if values has a few
+    # elements (about 10) - it is faster than a hash-table
+    # for these cases. However, one must be cautious with
+    # nans (see GH22205)
     if is_integer_dtype(comps.dtype):
         try:
             values = values.astype("int64", copy=False)

From f6f2dd4beb12975c8fc602f5966e9e146251e088 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Sun, 15 Nov 2020 20:15:23 +0100
Subject: [PATCH 11/14] adding whatsnew note

---
 doc/source/whatsnew/v1.2.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index e690334a36c5b..9a4642d65aeef 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -482,6 +482,7 @@ Performance improvements
 - faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
 - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
 - Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`pd.Index.value_counts`)
+- Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements
 
 .. ---------------------------------------------------------------------------
 

From 7d706daf1f941aa14f9374d648da3d93f20da9c7 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Sun, 15 Nov 2020 20:20:08 +0100
Subject: [PATCH 12/14] reduce the number of asv-tests

---
 asv_bench/benchmarks/series_methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 0882ecc3fe5d0..3b65bccd48aee 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -93,7 +93,7 @@ def time_isin_long_series_long_values_floats(self):
 class IsInLongSeriesLookUpDominates:
     params = [
         ["int64", "int32", "float64", "float32", "object"],
-        [1, 2, 5, 10, 16, 50, 100, 1000, 10 ** 5],
+        [5, 1000],
         ["random_hits", "random_misses", "monotone_hits", "monotone_misses"],
     ]
     param_names = ["dtype", "MaxNumber", "series_type"]

From 0d57fe34cf695b852dc0eea8d120b9fb58b1aa9b Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Tue, 17 Nov 2020 07:10:30 +0100
Subject: [PATCH 13/14] tests show that for small look-up tables in1d is still
 faster

---
 pandas/core/algorithms.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index b28033c0153db..75d46361fa138 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -435,11 +435,18 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
 
     f = htable.ismember_object
 
-    # an alternative is to use np.in1d if values has a few
-    # elements (about 10) - it is faster than a hash-table
-    # for these cases. However, one must be cautious with
-    # nans (see GH22205)
-    if is_integer_dtype(comps.dtype):
+    # GH16012
+    # Ensure np.in1d doesn't get object types or it *may* throw an exception
+    # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array),
+    # in1d is faster for small sizes
+    if len(comps) > 1_000_000 and len(values) <= 16 and not is_object_dtype(comps):
+        # If the the values include nan we need to check for nan explicitly
+        # since np.nan it not equal to np.nan
+        if isna(values).any():
+            f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c))
+        else:
+            f = np.in1d
+    elif is_integer_dtype(comps.dtype):
         try:
             values = values.astype("int64", copy=False)
             comps = comps.astype("int64", copy=False)

From 501190b7654c726365e6a2fee99f826ce8c4c90c Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Tue, 17 Nov 2020 12:23:33 +0100
Subject: [PATCH 14/14] being more conservative taking only improvements

---
 pandas/core/algorithms.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 75d46361fa138..2e6b801db109a 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -439,7 +439,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
     # Ensure np.in1d doesn't get object types or it *may* throw an exception
     # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array),
     # in1d is faster for small sizes
-    if len(comps) > 1_000_000 and len(values) <= 16 and not is_object_dtype(comps):
+    if len(comps) > 1_000_000 and len(values) <= 26 and not is_object_dtype(comps):
         # If the the values include nan we need to check for nan explicitly
         # since np.nan it not equal to np.nan
         if isna(values).any():