From 1663bb2a973bc3099eb1a9532ca8c6030879fac3 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Thu, 24 Sep 2020 20:38:48 +0200 Subject: [PATCH 01/14] PERF: always use hash-map for isin rather than np.in1d --- pandas/core/algorithms.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ec88eb817b3f8..8a00840f3b596 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -433,19 +433,9 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) - # faster for larger cases to use np.in1d f = htable.ismember_object - # GH16012 - # Ensure np.in1d doesn't get object types or it *may* throw an exception - if len(comps) > 1_000_000 and not is_object_dtype(comps): - # If the the values include nan we need to check for nan explicitly - # since np.nan it not equal to np.nan - if isna(values).any(): - f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c)) - else: - f = np.in1d - elif is_integer_dtype(comps): + if is_integer_dtype(comps): try: values = values.astype("int64", copy=False) comps = comps.astype("int64", copy=False) From f543f1596c44df1d878f0f13f82f44e8f11a6e61 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Thu, 24 Sep 2020 20:42:56 +0200 Subject: [PATCH 02/14] adding benchmark for isin with many elements --- asv_bench/benchmarks/series_methods.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 258c29c145721..7bbc8c0918b68 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -90,6 +90,18 @@ def time_isin_long_series_long_values_floats(self): self.s_long_floats.isin(self.vals_long_floats) +class IsInLongSeries(object): + params = [['int64', 'int32', 'float64', 'float32'], [1, 2, 5, 10, 1000, 10**5]] + 
param_names = ['dtype', 'M'] + + def setup(self, dtype, M): + self.s = Series(np.arange(10**7)).astype(dtype) + self.values = np.arange(M).astype(dtype) + + def time_isin(self, dtypes, M): + self.s.isin(self.values) + + class NSort: params = ["first", "last", "all"] From 48fe31eba55ebc1a441063b434c802144a586787 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Thu, 24 Sep 2020 23:05:04 +0200 Subject: [PATCH 03/14] use random order of elements rather than sorted, for sorted elements the search in the sorted arrays will have too few cache misses --- asv_bench/benchmarks/series_methods.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 7bbc8c0918b68..1153152b864bd 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -91,11 +91,11 @@ def time_isin_long_series_long_values_floats(self): class IsInLongSeries(object): - params = [['int64', 'int32', 'float64', 'float32'], [1, 2, 5, 10, 1000, 10**5]] - param_names = ['dtype', 'M'] + params = [["int64", "int32", "float64", "float32"], [1, 2, 5, 10, 1000, 10 ** 5]] + param_names = ["dtype", "M"] def setup(self, dtype, M): - self.s = Series(np.arange(10**7)).astype(dtype) + self.s = Series(np.random.randint(0, M, 10 ** 7)).astype(dtype) self.values = np.arange(M).astype(dtype) def time_isin(self, dtypes, M): From e8a02ae4f031100edd2974fb1fd6b7eacacc7dac Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sat, 26 Sep 2020 16:18:51 +0200 Subject: [PATCH 04/14] test also PyObject --- asv_bench/benchmarks/series_methods.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 1153152b864bd..438a13d53ccb8 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -90,8 +90,11 @@ def 
time_isin_long_series_long_values_floats(self): self.s_long_floats.isin(self.vals_long_floats) -class IsInLongSeries(object): - params = [["int64", "int32", "float64", "float32"], [1, 2, 5, 10, 1000, 10 ** 5]] +class IsInLongSeries: + params = [ + ["int64", "int32", "float64", "float32", "object"], + [1, 2, 5, 10, 1000, 10 ** 5], + ] param_names = ["dtype", "M"] def setup(self, dtype, M): From 1cd54f0350143a19928e31af7a23d567e5d1376b Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 13 Nov 2020 22:48:00 +0100 Subject: [PATCH 05/14] comps->comps.dtype --- pandas/core/algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8a00840f3b596..8ce26537d72ca 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -435,7 +435,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: f = htable.ismember_object - if is_integer_dtype(comps): + if is_integer_dtype(comps.dtype): try: values = values.astype("int64", copy=False) comps = comps.astype("int64", copy=False) @@ -444,7 +444,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: values = values.astype(object) comps = comps.astype(object) - elif is_float_dtype(comps): + elif is_float_dtype(comps.dtype): try: values = values.astype("float64", copy=False) comps = comps.astype("float64", copy=False) From 1af1a349b9841da4228d4cc86356b7fcf9d9f87f Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 13 Nov 2020 23:01:38 +0100 Subject: [PATCH 06/14] using longer names for variables --- asv_bench/benchmarks/series_methods.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 438a13d53ccb8..9bc678258cfec 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -95,14 +95,14 @@ class IsInLongSeries: ["int64", "int32", 
"float64", "float32", "object"], [1, 2, 5, 10, 1000, 10 ** 5], ] - param_names = ["dtype", "M"] + param_names = ["dtype", "MaxNumber"] - def setup(self, dtype, M): - self.s = Series(np.random.randint(0, M, 10 ** 7)).astype(dtype) - self.values = np.arange(M).astype(dtype) + def setup(self, dtype, MaxNumber): + self.series = Series(np.random.randint(0, MaxNumber, 10 ** 7)).astype(dtype) + self.values = np.arange(MaxNumber).astype(dtype) - def time_isin(self, dtypes, M): - self.s.isin(self.values) + def time_isin(self, dtypes, MaxNumber): + self.series.isin(self.values) class NSort: From eb858c2f1eaf195ddf775939353b1d839f95f107 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 13 Nov 2020 23:18:54 +0100 Subject: [PATCH 07/14] adding further types of series for asv-tests --- asv_bench/benchmarks/series_methods.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 9bc678258cfec..42d44f3f85b20 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -93,15 +93,25 @@ def time_isin_long_series_long_values_floats(self): class IsInLongSeries: params = [ ["int64", "int32", "float64", "float32", "object"], - [1, 2, 5, 10, 1000, 10 ** 5], + [1, 2, 5, 10, 50, 100, 1000, 10 ** 5], + ["random_hits", "random_misses", "monotone"], ] - param_names = ["dtype", "MaxNumber"] - - def setup(self, dtype, MaxNumber): - self.series = Series(np.random.randint(0, MaxNumber, 10 ** 7)).astype(dtype) + param_names = ["dtype", "MaxNumber", "series_type"] + + def setup(self, dtype, MaxNumber, series_type): + N = 10 ** 7 + if series_type == "random_hits": + np.random.seed(42) + array = np.random.randint(0, MaxNumber, N) + if series_type == "random_misses": + np.random.seed(42) + array = np.random.randint(0, MaxNumber, N) + MaxNumber + if series_type == "monotone": + array = np.repeat(np.arange(MaxNumber), N // MaxNumber) + 
self.series = Series(array).astype(dtype) self.values = np.arange(MaxNumber).astype(dtype) - def time_isin(self, dtypes, MaxNumber): + def time_isin(self, dtypes, MaxNumber, series_type): self.series.isin(self.values) From 3d781bebaead1c79fed56b37f49bbd556cf960d4 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sat, 14 Nov 2020 12:07:10 +0100 Subject: [PATCH 08/14] adding more test cases --- asv_bench/benchmarks/series_methods.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 42d44f3f85b20..54c719282ed1f 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -94,7 +94,7 @@ class IsInLongSeries: params = [ ["int64", "int32", "float64", "float32", "object"], [1, 2, 5, 10, 50, 100, 1000, 10 ** 5], - ["random_hits", "random_misses", "monotone"], + ["random_hits", "random_misses", "monotone_hits", "monotone_misses"], ] param_names = ["dtype", "MaxNumber", "series_type"] @@ -106,8 +106,10 @@ def setup(self, dtype, MaxNumber, series_type): if series_type == "random_misses": np.random.seed(42) array = np.random.randint(0, MaxNumber, N) + MaxNumber - if series_type == "monotone": + if series_type == "monotone_hits": array = np.repeat(np.arange(MaxNumber), N // MaxNumber) + if series_type == "monotone_misses": + array = np.arange(N) + MaxNumber self.series = Series(array).astype(dtype) self.values = np.arange(MaxNumber).astype(dtype) From ef49ca39cb53537762a33dcab434af2b30a0c7f1 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sun, 15 Nov 2020 12:38:43 +0100 Subject: [PATCH 09/14] adding values-dominate asv-tests --- asv_bench/benchmarks/series_methods.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 54c719282ed1f..0882ecc3fe5d0 100644 --- 
a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -90,10 +90,10 @@ def time_isin_long_series_long_values_floats(self): self.s_long_floats.isin(self.vals_long_floats) -class IsInLongSeries: +class IsInLongSeriesLookUpDominates: params = [ ["int64", "int32", "float64", "float32", "object"], - [1, 2, 5, 10, 50, 100, 1000, 10 ** 5], + [1, 2, 5, 10, 16, 50, 100, 1000, 10 ** 5], ["random_hits", "random_misses", "monotone_hits", "monotone_misses"], ] param_names = ["dtype", "MaxNumber", "series_type"] @@ -117,6 +117,28 @@ def time_isin(self, dtypes, MaxNumber, series_type): self.series.isin(self.values) +class IsInLongSeriesValuesDominate: + params = [ + ["int64", "int32", "float64", "float32", "object"], + ["random", "monotone"], + ] + param_names = ["dtype", "series_type"] + + def setup(self, dtype, series_type): + N = 10 ** 7 + if series_type == "random": + np.random.seed(42) + vals = np.random.randint(0, 10 * N, N) + if series_type == "monotone": + vals = np.arange(N) + self.values = vals.astype(dtype) + M = 10 ** 6 + 1 + self.series = Series(np.arange(M)).astype(dtype) + + def time_isin(self, dtypes, series_type): + self.series.isin(self.values) + + class NSort: params = ["first", "last", "all"] From 303e6ac81362b8aa48060971987820b75997509b Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sun, 15 Nov 2020 14:22:03 +0100 Subject: [PATCH 10/14] add comment --- pandas/core/algorithms.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8ce26537d72ca..b28033c0153db 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -435,6 +435,10 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: f = htable.ismember_object + # an alternative is to use np.in1d if values has a few + # elements (about 10) - it is faster than a hash-table + # for these cases. 
However, one must be cautious with + # nans (see GH22205) if is_integer_dtype(comps.dtype): try: values = values.astype("int64", copy=False) From f6f2dd4beb12975c8fc602f5966e9e146251e088 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sun, 15 Nov 2020 20:15:23 +0100 Subject: [PATCH 11/14] adding whatsnew note --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e690334a36c5b..9a4642d65aeef 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -482,6 +482,7 @@ Performance improvements - faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`) - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`) - Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`pd.Index.value_counts`) +- Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements .. 
--------------------------------------------------------------------------- From 7d706daf1f941aa14f9374d648da3d93f20da9c7 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sun, 15 Nov 2020 20:20:08 +0100 Subject: [PATCH 12/14] reduce the number of asv-tests --- asv_bench/benchmarks/series_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 0882ecc3fe5d0..3b65bccd48aee 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -93,7 +93,7 @@ def time_isin_long_series_long_values_floats(self): class IsInLongSeriesLookUpDominates: params = [ ["int64", "int32", "float64", "float32", "object"], - [1, 2, 5, 10, 16, 50, 100, 1000, 10 ** 5], + [5, 1000], ["random_hits", "random_misses", "monotone_hits", "monotone_misses"], ] param_names = ["dtype", "MaxNumber", "series_type"] From 0d57fe34cf695b852dc0eea8d120b9fb58b1aa9b Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 17 Nov 2020 07:10:30 +0100 Subject: [PATCH 13/14] tests show, that for small look-up tables in1d is still faster --- pandas/core/algorithms.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b28033c0153db..75d46361fa138 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -435,11 +435,18 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: f = htable.ismember_object - # an alternative is to use np.in1d if values has a few - # elements (about 10) - it is faster than a hash-table - # for these cases. However, one must be cautious with - # nans (see GH22205) - if is_integer_dtype(comps.dtype): + # GH16012 + # Ensure np.in1d doesn't get object types or it *may* throw an exception + # Albeit hashmap has O(1) look-up (vs. 
O(logn) in sorted array), + # in1d is faster for small sizes + if len(comps) > 1_000_000 and len(values) <= 16 and not is_object_dtype(comps): + # If the values include nan we need to check for nan explicitly + # since np.nan is not equal to np.nan + if isna(values).any(): + f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c)) + else: + f = np.in1d + elif is_integer_dtype(comps.dtype): try: values = values.astype("int64", copy=False) comps = comps.astype("int64", copy=False) From 501190b7654c726365e6a2fee99f826ce8c4c90c Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 17 Nov 2020 12:23:33 +0100 Subject: [PATCH 14/14] being more conservative taking only improvements --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 75d46361fa138..2e6b801db109a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -439,7 +439,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: # Ensure np.in1d doesn't get object types or it *may* throw an exception # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array), # in1d is faster for small sizes - if len(comps) > 1_000_000 and len(values) <= 16 and not is_object_dtype(comps): + if len(comps) > 1_000_000 and len(values) <= 26 and not is_object_dtype(comps): # If the values include nan we need to check for nan explicitly # since np.nan is not equal to np.nan if isna(values).any():