diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 258c29c145721..3b65bccd48aee 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -90,6 +90,55 @@ def time_isin_long_series_long_values_floats(self): self.s_long_floats.isin(self.vals_long_floats) +class IsInLongSeriesLookUpDominates: + params = [ + ["int64", "int32", "float64", "float32", "object"], + [5, 1000], + ["random_hits", "random_misses", "monotone_hits", "monotone_misses"], + ] + param_names = ["dtype", "MaxNumber", "series_type"] + + def setup(self, dtype, MaxNumber, series_type): + N = 10 ** 7 + if series_type == "random_hits": + np.random.seed(42) + array = np.random.randint(0, MaxNumber, N) + if series_type == "random_misses": + np.random.seed(42) + array = np.random.randint(0, MaxNumber, N) + MaxNumber + if series_type == "monotone_hits": + array = np.repeat(np.arange(MaxNumber), N // MaxNumber) + if series_type == "monotone_misses": + array = np.arange(N) + MaxNumber + self.series = Series(array).astype(dtype) + self.values = np.arange(MaxNumber).astype(dtype) + + def time_isin(self, dtypes, MaxNumber, series_type): + self.series.isin(self.values) + + +class IsInLongSeriesValuesDominate: + params = [ + ["int64", "int32", "float64", "float32", "object"], + ["random", "monotone"], + ] + param_names = ["dtype", "series_type"] + + def setup(self, dtype, series_type): + N = 10 ** 7 + if series_type == "random": + np.random.seed(42) + vals = np.random.randint(0, 10 * N, N) + if series_type == "monotone": + vals = np.arange(N) + self.values = vals.astype(dtype) + M = 10 ** 6 + 1 + self.series = Series(np.arange(M)).astype(dtype) + + def time_isin(self, dtypes, series_type): + self.series.isin(self.values) + + class NSort: params = ["first", "last", "all"] diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e690334a36c5b..9a4642d65aeef 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -482,6 +482,7 @@ Performance improvements - faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`) - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`) - Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`pd.Index.value_counts`) +- Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements .. --------------------------------------------------------------------------- diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ec88eb817b3f8..2e6b801db109a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -433,19 +433,20 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) - # faster for larger cases to use np.in1d f = htable.ismember_object # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception - if len(comps) > 1_000_000 and not is_object_dtype(comps): + # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array), + # in1d is faster for small sizes + if len(comps) > 1_000_000 and len(values) <= 26 and not is_object_dtype(comps): # If the the values include nan we need to check for nan explicitly # since np.nan it not equal to np.nan if isna(values).any(): f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c)) else: f = np.in1d - elif is_integer_dtype(comps): + elif is_integer_dtype(comps.dtype): try: values = values.astype("int64", copy=False) comps = comps.astype("int64", copy=False) @@ -454,7 +455,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: values = values.astype(object) comps = comps.astype(object) - elif is_float_dtype(comps): + elif is_float_dtype(comps.dtype): try: values = values.astype("float64", copy=False) comps = comps.astype("float64", copy=False)