From f7a21c2e33b20ff364b905ca4e697aaa87d3ec59 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 14 Dec 2022 19:29:43 -0500 Subject: [PATCH 1/9] ArrowExtensionArray._rank --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/arrays/arrow/array.py | 65 +++++++++++++++++++++++- pandas/core/arrays/base.py | 2 - pandas/tests/series/methods/test_rank.py | 30 +++++++---- 4 files changed, 86 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 0b29843735189..668a4c8874457 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -734,6 +734,7 @@ Performance improvements - Performance improvement in :meth:`MultiIndex.isin` when ``level=None`` (:issue:`48622`, :issue:`49577`) - Performance improvement in :meth:`MultiIndex.putmask` (:issue:`49830`) - Performance improvement in :meth:`Index.union` and :meth:`MultiIndex.union` when index contains duplicates (:issue:`48900`) +- Performance improvement in :meth:`Series.rank` for pyarrow-backed dtypes (:issue:`#####`) - Performance improvement in :meth:`Series.fillna` for extension array dtypes (:issue:`49722`, :issue:`50078`) - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`) - Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 254ff8894b36c..e60f79fdde035 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -11,6 +11,7 @@ from pandas._typing import ( ArrayLike, + AxisInt, Dtype, FillnaOptions, Iterator, @@ -22,6 +23,7 @@ from pandas.compat import ( pa_version_under6p0, pa_version_under7p0, + pa_version_under9p0, ) from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs @@ -949,7 +951,68 @@ def _indexing_key_to_indices( indices = np.arange(n)[key] return indices - # TODO: redefine _rank using pc.rank with pyarrow 9.0 + def _rank( + self: ArrowExtensionArrayT, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ) -> ArrowExtensionArrayT: + """ + See Series.rank.__doc__. + """ + if axis != 0: + raise NotImplementedError + + if ( + pa_version_under9p0 + # as of version 10, pyarrow does not support an "average" method + or method not in ("min", "max", "first", "dense") + ): + from pandas.core.algorithms import rank + + ranked = rank( + self.to_numpy(), + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + result = pa.array(ranked, type=pa.float64(), from_pandas=True) + return type(self)(result) + + sort_keys = "ascending" if ascending else "descending" + + if na_option == "top": + null_placement = "at_start" + else: + null_placement = "at_end" + + result = pc.rank( + self._data.combine_chunks(), + sort_keys=sort_keys, + null_placement=null_placement, + tiebreaker=method, + ) + if not pa.types.is_floating(result.type): + result = result.cast(pa.float64()) + + if na_option == "keep": + mask = pc.is_null(self._data) + null = pa.scalar(None, type=self._data.type) + result = pc.if_else(mask, null, result) + + if pct: + if method == "dense": + divisor = pc.max(result) + else: + divisor = pc.count(result) + result = pc.divide(result, divisor) + + return type(self)(result) def _quantile( self: ArrowExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c36728391ba21..6954c97007d23 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1576,8 +1576,6 @@ def _rank( if axis != 0: raise NotImplementedError - # TODO: we only have tests that get here with dt64 and td64 - # TODO: all tests that get here use the defaults for all the kwds return rank( self, axis=axis, diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 3a66bf1adf25b..8af0b5ddd4685 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -11,6 +11,7 @@ import pandas.util._test_decorators as td from pandas import ( + NA, NaT, Series, Timestamp, @@ -38,6 +39,21 @@ def results(request): return request.param +@pytest.fixture( + params=[ + "object", + "float64", + "int64", + "Float64", + "Int64", + "float64[pyarrow]", + "int64[pyarrow]", + ] +) +def dtype(request): + return request.param + + class TestSeriesRank: @td.skip_if_no_scipy def test_rank(self, datetime_series): @@ -238,13 +254,16 @@ def test_rank_tie_methods(self, ser, results, dtype): [ ("object", None, Infinity(), NegInfinity()), ("float64", np.nan, np.inf, -np.inf), + ("Float64", NA, np.inf, -np.inf), + ("float64[pyarrow]", NA, np.inf, -np.inf), ], ) def test_rank_tie_methods_on_infs_nans( self, method, na_option, ascending, dtype, na_value, pos_inf, neg_inf ): - chunk = 3 + exp_dtype = dtype if dtype == "float64[pyarrow]" else "float64" + chunk = 3 in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk iseries = Series(in_arr, dtype=dtype) exp_ranks = { @@ -264,7 +283,7 @@ def test_rank_tie_methods_on_infs_nans( expected = order if ascending else order[::-1] expected = list(chain.from_iterable(expected)) result = iseries.rank(method=method, na_option=na_option, ascending=ascending) - tm.assert_series_equal(result, Series(expected, dtype="float64")) + tm.assert_series_equal(result, Series(expected, dtype=exp_dtype)) def test_rank_desc_mix_nans_infs(self): # GH 19538 @@ -299,7 +318,6 @@ def test_rank_methods_series(self, method, op, value): expected = Series(sprank, index=index).astype("float64") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) @pytest.mark.parametrize( "ser, exp", [ @@ -319,7 +337,6 @@ def test_rank_dense_method(self, dtype, ser, exp): expected = Series(exp).astype(result.dtype) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) def test_rank_descending(self, ser, results, dtype): method, _ = results if "i" in dtype: @@ -365,7 +382,6 @@ def test_rank_modify_inplace(self): # GH15630, pct should be on 100% basis when method='dense' -@pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) @pytest.mark.parametrize( "ser, exp", [ @@ -387,7 +403,6 @@ def test_rank_dense_pct(dtype, ser, exp): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) @pytest.mark.parametrize( "ser, exp", [ @@ -409,7 +424,6 @@ def test_rank_min_pct(dtype, ser, exp): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) @pytest.mark.parametrize( "ser, exp", [ @@ -431,7 +445,6 @@ def test_rank_max_pct(dtype, ser, exp): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) @pytest.mark.parametrize( "ser, exp", [ @@ -453,7 +466,6 @@ def test_rank_average_pct(dtype, ser, exp): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("dtype", ["f8", "i8"]) @pytest.mark.parametrize( "ser, exp", [ From 743d90a097f8b5b49f8d73bc09d49acc7b9ecbd9 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 14 Dec 2022 19:56:09 -0500 Subject: [PATCH 2/9] gh ref --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 668a4c8874457..cd61e893ea84b 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -734,7 +734,7 @@ Performance improvements - Performance improvement in :meth:`MultiIndex.isin` when ``level=None`` (:issue:`48622`, :issue:`49577`) - Performance improvement in :meth:`MultiIndex.putmask` (:issue:`49830`) - Performance improvement in :meth:`Index.union` and :meth:`MultiIndex.union` when index contains duplicates (:issue:`48900`) -- Performance improvement in :meth:`Series.rank` for pyarrow-backed dtypes (:issue:`#####`) +- Performance improvement in :meth:`Series.rank` for pyarrow-backed dtypes (:issue:`50264`) - Performance improvement in :meth:`Series.fillna` for extension array dtypes (:issue:`49722`, :issue:`50078`) - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`) - Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`) From 708878e9d461cd9c145060dbf8f8ad4d96d5c310 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 14 Dec 2022 21:10:57 -0500 Subject: [PATCH 3/9] skip pyarrow tests if not installed --- pandas/tests/series/methods/test_rank.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 8af0b5ddd4685..baa86dd14fbfb 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -46,8 +46,8 @@ def results(request): "int64", "Float64", "Int64", - "float64[pyarrow]", - "int64[pyarrow]", + pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), ] ) def dtype(request): @@ -255,7 +255,9 @@ def test_rank_tie_methods(self, ser, results, dtype): ("object", None, Infinity(), NegInfinity()), ("float64", np.nan, np.inf, -np.inf), ("Float64", NA, np.inf, -np.inf), - ("float64[pyarrow]", NA, np.inf, -np.inf), + pytest.param( + "float64[pyarrow]", NA, np.inf, -np.inf, marks=td.skip_if_no("pyarrow") + ), ], ) def test_rank_tie_methods_on_infs_nans( From 85226f5f1799d9e8d602da1d9c2980d03b2f14cd Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 15 Dec 2022 20:55:09 -0500 Subject: [PATCH 4/9] defer to pc.rank output types --- pandas/core/arrays/arrow/array.py | 6 +++--- pandas/tests/series/methods/test_rank.py | 8 +++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e60f79fdde035..7d674efacde9d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -997,15 +997,15 @@ def _rank( null_placement=null_placement, tiebreaker=method, ) - if not pa.types.is_floating(result.type): - result = result.cast(pa.float64()) if na_option == "keep": mask = pc.is_null(self._data) - null = pa.scalar(None, type=self._data.type) + null = pa.scalar(None, type=result.type) result = pc.if_else(mask, null, result) if pct: + if not pa.types.is_floating(result.type): + result = result.cast(pa.float64()) if method == "dense": divisor = pc.max(result) else: diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index baa86dd14fbfb..6af835215a837 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -263,7 +263,13 @@ def test_rank_tie_methods(self, ser, results, dtype): def test_rank_tie_methods_on_infs_nans( self, method, na_option, ascending, dtype, na_value, pos_inf, neg_inf ): - exp_dtype = dtype if dtype == "float64[pyarrow]" else "float64" + if dtype == "float64[pyarrow]": + if method == "average": + exp_dtype = "float64[pyarrow]" + else: + exp_dtype = "uint64[pyarrow]" + else: + exp_dtype = "float64" chunk = 3 in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk From 5d1a533f7c8dc24831a50534602025adb260bd1d Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 15 Dec 2022 22:22:45 -0500 Subject: [PATCH 5/9] fix test --- pandas/tests/series/methods/test_rank.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 6af835215a837..be57ba348760f 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -256,7 +256,11 @@ def test_rank_tie_methods(self, ser, results, dtype): ("float64", np.nan, np.inf, -np.inf), ("Float64", NA, np.inf, -np.inf), pytest.param( - "float64[pyarrow]", NA, np.inf, -np.inf, marks=td.skip_if_no("pyarrow") + "float64[pyarrow]", + NA, + np.inf, + -np.inf, + marks=td.skip_if_no("pyarrow", min_version="9.0"), ), ], ) From dee82ee6a67c5ced6f86d1ec4387a00e18587da5 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 15 Dec 2022 22:36:51 -0500 Subject: [PATCH 6/9] more consistency --- pandas/core/arrays/arrow/array.py | 6 +++++- pandas/tests/series/methods/test_rank.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7d674efacde9d..30f4782e3dff7 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -981,7 +981,11 @@ def _rank( ascending=ascending, pct=pct, ) - result = pa.array(ranked, type=pa.float64(), from_pandas=True) + if method != "average" and not pct: + pa_type = pa.uint64() + else: + pa_type = pa.float64() + result = pa.array(ranked, type=pa_type, from_pandas=True) return type(self)(result) sort_keys = "ascending" if ascending else "descending" diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index be57ba348760f..3183ba24bb88d 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -260,7 +260,7 @@ def test_rank_tie_methods(self, ser, results, dtype): NA, np.inf, -np.inf, - marks=td.skip_if_no("pyarrow", min_version="9.0"), + marks=td.skip_if_no("pyarrow"), ), ], ) From 6ba998ee8e0ebf1893d45c145f9e694e126dada6 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 16 Dec 2022 05:47:07 -0500 Subject: [PATCH 7/9] use pyarrow for method="average" --- pandas/core/arrays/arrow/array.py | 51 ++++++++++++++++--------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 30f4782e3dff7..44b24dc93ae11 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -952,54 +952,46 @@ def _indexing_key_to_indices( return indices def _rank( - self: ArrowExtensionArrayT, + self, *, axis: AxisInt = 0, method: str = "average", na_option: str = "keep", ascending: bool = True, pct: bool = False, - ) -> ArrowExtensionArrayT: + ): """ See Series.rank.__doc__. """ - if axis != 0: - raise NotImplementedError - - if ( - pa_version_under9p0 - # as of version 10, pyarrow does not support an "average" method - or method not in ("min", "max", "first", "dense") - ): - from pandas.core.algorithms import rank - - ranked = rank( - self.to_numpy(), + if pa_version_under9p0: + ranked = super().rank( axis=axis, method=method, na_option=na_option, ascending=ascending, pct=pct, ) - if method != "average" and not pct: - pa_type = pa.uint64() - else: + # keep dtypes consistent with the implementation below + if method == "average" or pct: pa_type = pa.float64() + else: + pa_type = pa.uint64() result = pa.array(ranked, type=pa_type, from_pandas=True) return type(self)(result) - sort_keys = "ascending" if ascending else "descending" + if axis != 0: + raise NotImplementedError - if na_option == "top": - null_placement = "at_start" - else: - null_placement = "at_end" + data = self._data.combine_chunks() + sort_keys = "ascending" if ascending else "descending" + null_placement = "at_start" if na_option == "top" else "at_end" + tiebreaker = "min" if method == "average" else method result = pc.rank( - self._data.combine_chunks(), + data, sort_keys=sort_keys, null_placement=null_placement, - tiebreaker=method, + tiebreaker=tiebreaker, ) if na_option == "keep": @@ -1007,6 +999,17 @@ def _rank( null = pa.scalar(None, type=result.type) result = pc.if_else(mask, null, result) + if method == "average": + result_max = pc.rank( + data, + sort_keys=sort_keys, + null_placement=null_placement, + tiebreaker="max", + ) + result_max = result_max.cast(pa.float64()) + result_min = result.cast(pa.float64()) + result = pc.divide(pc.add(result_min, result_max), 2) + if pct: if not pa.types.is_floating(result.type): result = result.cast(pa.float64()) From a648a90cfa239cdd2c290abbcd64ad5fad376e1e Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 16 Dec 2022 06:52:23 -0500 Subject: [PATCH 8/9] fix call to super --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 44b24dc93ae11..6bf7dc876a092 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -964,7 +964,7 @@ def _rank( See Series.rank.__doc__. """ if pa_version_under9p0: - ranked = super().rank( + ranked = super()._rank( axis=axis, method=method, na_option=na_option, From bd90163a55fbf5638a89222599286545cfda2841 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 16 Dec 2022 18:48:16 -0500 Subject: [PATCH 9/9] call super with axis != 0 --- pandas/core/arrays/arrow/array.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 6bf7dc876a092..1d3c31a129647 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -963,7 +963,7 @@ def _rank( """ See Series.rank.__doc__. """ - if pa_version_under9p0: + if pa_version_under9p0 or axis != 0: ranked = super()._rank( axis=axis, method=method, @@ -979,9 +979,6 @@ def _rank( result = pa.array(ranked, type=pa_type, from_pandas=True) return type(self)(result) - if axis != 0: - raise NotImplementedError - data = self._data.combine_chunks() sort_keys = "ascending" if ascending else "descending" null_placement = "at_start" if na_option == "top" else "at_end"