From c94b45e1edd4494eee2a8885c25e041f6100eba6 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 15 May 2020 17:38:04 +0200 Subject: [PATCH 01/20] PERF: Remove unnecessary copies in sorting functions --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 25312b180dba1..da9cbe1023599 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -385,7 +385,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): from pandas.core.indexes.api import Index if not key: - return values.copy() + return values if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) From 4ba7472f2124c4a0d38803421dbef2f4181e18a9 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 15 May 2020 22:18:27 +0200 Subject: [PATCH 02/20] PERF: Create array from list with given dtype=bool --- pandas/core/indexing.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b857a59195695..9b6ccfcb4c86d 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -7,7 +7,9 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import doc +from pandas.core.construction import array as pd_array from pandas.core.dtypes.common import ( + is_array_like, is_hashable, is_integer, is_iterator, @@ -2164,11 +2166,14 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: result = result.astype(bool)._values elif is_object_dtype(key): # key might be object-dtype bool, check_array_indexer needs bool array + result = np.asarray(result, dtype=bool) result = check_array_indexer(index, result) else: + if not is_array_like(result): + # GH 33924 + result = pd_array(result, dtype=bool) result = check_array_indexer(index, result) - return result From 509f74a6a523aee3566f75e41e949b936ad970de Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 15 May 2020 22:20:39 +0200 Subject: [PATCH 03/20] Run black --- pandas/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9b6ccfcb4c86d..d00a25b129b4d 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2166,7 +2166,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: result = result.astype(bool)._values elif is_object_dtype(key): # key might be object-dtype bool, check_array_indexer needs bool array - + result = np.asarray(result, dtype=bool) result = check_array_indexer(index, result) else: From 0ab450b9ea5f38582d09acbcd8f697ac62f37919 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 16 May 2020 19:06:23 +0200 Subject: [PATCH 04/20] Run tests --- pandas/core/sorting.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index da9cbe1023599..2943714a5d015 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -386,7 +386,6 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): if not key: return values - if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) @@ -404,7 +403,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): else: type_of_values = type(values) result = type_of_values(result) # try to revert to original type otherwise - except TypeError: + except TypeError:opy() raise TypeError( f"User-provided `key` function returned an invalid type {type(result)} \ which could not be converted to {type(values)}." From 54c7304d585c60dd148e3e47aa28514100289eb5 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 16 May 2020 19:07:12 +0200 Subject: [PATCH 05/20] Run tests --- pandas/core/sorting.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 2943714a5d015..da9cbe1023599 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -386,6 +386,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): if not key: return values + if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) @@ -403,7 +404,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): else: type_of_values = type(values) result = type_of_values(result) # try to revert to original type otherwise - except TypeError:opy() + except TypeError: raise TypeError( f"User-provided `key` function returned an invalid type {type(result)} \ which could not be converted to {type(values)}." From c7de90b84b97b8ff31b40bc13422384b3f5b9d8c Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 16 May 2020 20:19:22 +0200 Subject: [PATCH 06/20] Run tests --- pandas/core/indexing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index d00a25b129b4d..e72e37e028889 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2166,7 +2166,6 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: result = result.astype(bool)._values elif is_object_dtype(key): # key might be object-dtype bool, check_array_indexer needs bool array - result = np.asarray(result, dtype=bool) result = check_array_indexer(index, result) else: From 0e75426277274222d26193ad6d5a86e4ccec1d62 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 16 May 2020 20:59:01 +0200 Subject: [PATCH 07/20] Fix imports --- pandas/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e72e37e028889..7e5d70a970827 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -7,7 +7,6 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import doc -from pandas.core.construction import array as pd_array from pandas.core.dtypes.common import ( is_array_like, is_hashable, @@ -24,6 +23,7 @@ from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com +from pandas.core.construction import array as pd_array from pandas.core.indexers import ( check_array_indexer, is_list_like_indexer, From 6d72a346770fc93778a83e171daceec52b60e6d4 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 22 May 2020 23:19:58 +0200 Subject: [PATCH 08/20] Add asv --- asv_bench/benchmarks/algorithms.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 65e52e03c43c7..a96d9bc924308 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -174,4 +174,13 @@ def time_argsort(self, N): self.array.argsort() +class SortIndexSeries: + def setup(self): + N = 10 ** 5 + idx = pd.date_range(start="1/1/2000", periods=N, freq="s") + self.s = pd.Series(np.random.randn(N), index=idx) + + def time_sort_index(self): + self.s.sort_index() + from .pandas_vb_common import setup # noqa: F401 isort:skip From 5ba54a6039d3981a4187b38e11b479e53f8dcdd1 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 22 May 2020 23:20:53 +0200 Subject: [PATCH 09/20] Run black --- asv_bench/benchmarks/algorithms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index a96d9bc924308..7afa97f9aa394 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -183,4 +183,5 @@ def setup(self): def time_sort_index(self): self.s.sort_index() + from .pandas_vb_common import setup # noqa: F401 isort:skip From 276627019d8000792473742c0a9036cf59b5f3cb Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 23 May 2020 00:28:24 +0200 Subject: [PATCH 10/20] Remove asv --- asv_bench/benchmarks/algorithms.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 7afa97f9aa394..65e52e03c43c7 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -174,14 +174,4 @@ def time_argsort(self, N): self.array.argsort() -class SortIndexSeries: - def setup(self): - N = 10 ** 5 - idx = pd.date_range(start="1/1/2000", periods=N, freq="s") - self.s = pd.Series(np.random.randn(N), index=idx) - - def time_sort_index(self): - self.s.sort_index() - - from .pandas_vb_common import setup # noqa: F401 isort:skip From cb1312cf236a2b54b585e79e7fb6327104ce831e Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 23 May 2020 01:46:10 +0200 Subject: [PATCH 11/20] Add requested changes --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/indexing.py | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 73892da2cbf71..4ce416a26e584 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -610,6 +610,7 @@ Performance improvements and :meth:`~pandas.core.groupby.groupby.Groupby.last` (:issue:`34178`) - Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`). - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). +- Performance improvement in `indexing.check_bool_indexer` when `key` is a list (:issue:`33924`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 7e5d70a970827..6955ef6e66998 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2168,10 +2168,12 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: # key might be object-dtype bool, check_array_indexer needs bool array result = np.asarray(result, dtype=bool) result = check_array_indexer(index, result) + elif not is_array_like(result): + # GH 33924 + # key may contain nan elements, check_array_indexer needs bool array + result = pd_array(result, dtype=bool) + result = check_array_indexer(index, result) else: - if not is_array_like(result): - # GH 33924 - result = pd_array(result, dtype=bool) result = check_array_indexer(index, result) return result From 74cb4495b5bd316cdd0d8f3fda170feca6e11509 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 23 May 2020 02:49:25 +0200 Subject: [PATCH 12/20] Run black --- pandas/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6955ef6e66998..1984a31c5a474 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2169,7 +2169,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: result = np.asarray(result, dtype=bool) result = check_array_indexer(index, result) elif not is_array_like(result): - # GH 33924 + # GH 33924 # key may contain nan elements, check_array_indexer needs bool array result = pd_array(result, dtype=bool) result = check_array_indexer(index, result) From 915dec8a0dfd1d32ca70fd424e761e074118b565 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 23 May 2020 02:50:47 +0200 Subject: [PATCH 13/20] Delete newline --- doc/source/whatsnew/v1.1.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 94e64c9cbaa06..b8322180f9e5b 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -618,7 +618,6 @@ Performance improvements - Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`) - Performance improvement in `indexing.check_bool_indexer` when `key` is a list (:issue:`33924`) - .. --------------------------------------------------------------------------- .. _whatsnew_110.bug_fixes: From 6925c293ef5b7b2e96a41f8b605bbcade1207926 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 23 May 2020 23:16:55 +0200 Subject: [PATCH 14/20] Fix whatsnew --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b8322180f9e5b..191a1c90d7be4 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -616,7 +616,7 @@ Performance improvements - Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`). - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). - Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`) -- Performance improvement in `indexing.check_bool_indexer` when `key` is a list (:issue:`33924`) +- Performance improvement in `indexing.check_bool_indexer` when `key` is a list (:issue:`33924`) .. --------------------------------------------------------------------------- From 0734a8752c6cfa301bba7333d8150bb03a550403 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Thu, 28 May 2020 17:00:41 +0200 Subject: [PATCH 15/20] Add requested changes --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/indexing.py | 10 +++------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 191a1c90d7be4..e18fba697c449 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -616,7 +616,7 @@ Performance improvements - Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`). - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). - Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`) -- Performance improvement in `indexing.check_bool_indexer` when `key` is a list (:issue:`33924`) +- Performance improvement in `Dataframe[bool_indexer]` when `bool_indexer` is a list (:issue:`33924`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 1984a31c5a474..c2433b2a0a4ee 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2163,19 +2163,15 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: "indexer (index of the boolean Series and of " "the indexed object do not match)." ) - result = result.astype(bool)._values - elif is_object_dtype(key): + return result.astype(bool)._values + if is_object_dtype(key): # key might be object-dtype bool, check_array_indexer needs bool array result = np.asarray(result, dtype=bool) - result = check_array_indexer(index, result) elif not is_array_like(result): # GH 33924 # key may contain nan elements, check_array_indexer needs bool array result = pd_array(result, dtype=bool) - result = check_array_indexer(index, result) - else: - result = check_array_indexer(index, result) - return result + return check_array_indexer(indexer, result) def convert_missing_indexer(indexer): From 70a266c69a2d20bdff4bc975f6f50017e95b7d78 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Thu, 28 May 2020 17:03:06 +0200 Subject: [PATCH 16/20] Fix --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index da9cbe1023599..8f76a13b10a0b 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -385,7 +385,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): from pandas.core.indexes.api import Index if not key: - return values + return value.copy() if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) From 8bed9b5caf42ffb07b66ad30be401cfc918c6a4e Mon Sep 17 00:00:00 2001 From: mproszewska Date: Thu, 28 May 2020 17:03:36 +0200 Subject: [PATCH 17/20] Fix --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 8f76a13b10a0b..25312b180dba1 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -385,7 +385,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): from pandas.core.indexes.api import Index if not key: - return value.copy() + return values.copy() if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) From cfa6b9e6401712bc684d01d45e1e702e8641794d Mon Sep 17 00:00:00 2001 From: mproszewska Date: Thu, 28 May 2020 17:04:42 +0200 Subject: [PATCH 18/20] Fix typo --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index e18fba697c449..db186f2e7ddcd 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -616,7 +616,7 @@ Performance improvements - Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`). - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). - Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`) -- Performance improvement in `Dataframe[bool_indexer]` when `bool_indexer` is a list (:issue:`33924`) +- Performance improvement in `DataFrame[bool_indexer]` when `bool_indexer` is a list (:issue:`33924`) .. --------------------------------------------------------------------------- From 8707be4cb37693a637a9303d35a0998dc433dad3 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Thu, 28 May 2020 17:19:38 +0200 Subject: [PATCH 19/20] Fix --- pandas/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c2433b2a0a4ee..4abf5e2a0bdc3 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2171,7 +2171,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: # GH 33924 # key may contain nan elements, check_array_indexer needs bool array result = pd_array(result, dtype=bool) - return check_array_indexer(indexer, result) + return check_array_indexer(index, result) def convert_missing_indexer(indexer): From 2ce27c8aebf30eeb1679b7cde09c9579d9b3cb3f Mon Sep 17 00:00:00 2001 From: mproszewska Date: Mon, 1 Jun 2020 01:37:51 +0200 Subject: [PATCH 20/20] Update asv --- asv_bench/benchmarks/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 7f3c24d5a2732..836d3ca8602ec 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -158,9 +158,9 @@ def time_boolean_rows_boolean(self): class DataFrameNumericIndexing: def setup(self): self.idx_dupe = np.array(range(30)) * 99 - self.df = DataFrame(np.random.randn(10000, 5)) + self.df = DataFrame(np.random.randn(100000, 5)) self.df_dup = concat([self.df, 2 * self.df, 3 * self.df]) - self.bool_indexer = [True] * 5000 + [False] * 5000 + self.bool_indexer = [True] * 50000 + [False] * 50000 def time_iloc_dups(self): self.df_dup.iloc[self.idx_dupe]