pandas-dev · jreback · Nov 17, 2020 · Sep 24, 2020 · Sep 24, 2020 · Sep 24, 2020
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
@@ -90,6 +90,55 @@ def time_isin_long_series_long_values_floats(self):
         self.s_long_floats.isin(self.vals_long_floats)
 
 
+class IsInLongSeriesLookUpDominates:
+    # Long Series (10**7 rows) vs. few values (MaxNumber): look-up cost dominates.
+    params = [
+        ["int64", "int32", "float64", "float32", "object"],
+        [5, 1000],
+        ["random_hits", "random_misses", "monotone_hits", "monotone_misses"],
+    ]
+    param_names = ["dtype", "MaxNumber", "series_type"]
+
+    def setup(self, dtype, MaxNumber, series_type):
+        N = 10 ** 7
+        np.random.seed(42)  # fixed seed; only the "random_*" cases draw from it
+        if series_type == "random_hits":  # all elements in [0, MaxNumber)
+            array = np.random.randint(0, MaxNumber, N)
+        elif series_type == "random_misses":  # shifted out of the values' range
+            array = np.random.randint(0, MaxNumber, N) + MaxNumber
+        elif series_type == "monotone_hits":
+            array = np.repeat(np.arange(MaxNumber), N // MaxNumber)
+        elif series_type == "monotone_misses":
+            array = np.arange(N) + MaxNumber
+        self.series = Series(array).astype(dtype)
+        self.values = np.arange(MaxNumber).astype(dtype)
+
+    def time_isin(self, dtype, MaxNumber, series_type):
+        self.series.isin(self.values)
+
+
+class IsInLongSeriesValuesDominate:
+    # 10**7 values vs. a shorter Series (10**6 + 1 rows): size of `values` dominates.
+    params = [
+        ["int64", "int32", "float64", "float32", "object"],
+        ["random", "monotone"],
+    ]
+    param_names = ["dtype", "series_type"]
+
+    def setup(self, dtype, series_type):
+        N, M = 10 ** 7, 10 ** 6 + 1  # len(values), len(series)
+        if series_type == "random":
+            np.random.seed(42)
+            vals = np.random.randint(0, 10 * N, N)
+        elif series_type == "monotone":
+            vals = np.arange(N)
+        self.values = vals.astype(dtype)
+        self.series = Series(np.arange(M)).astype(dtype)
+
+    def time_isin(self, dtype, series_type):
+        self.series.isin(self.values)
+
+
 class NSort:
 
     params = ["first", "last", "all"]

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -482,6 +482,7 @@ Performance improvements
 - faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
 - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
 - Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`pd.Index.value_counts`)
+- Performance improvement in :meth:`Series.isin` for inputs with more than 1e6 elements
 
 .. ---------------------------------------------------------------------------
 

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -433,19 +433,20 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
     comps, dtype = _ensure_data(comps)
     values, _ = _ensure_data(values, dtype=dtype)
 
-    # faster for larger cases to use np.in1d
     f = htable.ismember_object
 
     # GH16012
     # Ensure np.in1d doesn't get object types or it *may* throw an exception
-    if len(comps) > 1_000_000 and not is_object_dtype(comps):
+    # Although a hash map has O(1) look-up (vs. O(log n) in a sorted array),
+    # np.in1d is faster when `values` is small
+    if len(comps) > 1_000_000 and len(values) <= 26 and not is_object_dtype(comps):
         # If the the values include nan we need to check for nan explicitly
         # since np.nan it not equal to np.nan
         if isna(values).any():
             f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c))
         else:
             f = np.in1d
-    elif is_integer_dtype(comps):
+    elif is_integer_dtype(comps.dtype):
         try:
             values = values.astype("int64", copy=False)
             comps = comps.astype("int64", copy=False)
@@ -454,7 +455,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
             values = values.astype(object)
             comps = comps.astype(object)
 
-    elif is_float_dtype(comps):
+    elif is_float_dtype(comps.dtype):
         try:
             values = values.astype("float64", copy=False)
             comps = comps.astype("float64", copy=False)