From 9dddb80915f24f3c915ae0203af90e5a8f41c84f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 15:08:47 +0100 Subject: [PATCH 1/6] BUG: rank raising for arrow string dtypes --- doc/source/whatsnew/v2.1.2.rst | 2 +- pandas/core/arrays/arrow/array.py | 31 ++++++++++++++++++++----- pandas/core/arrays/string_arrow.py | 23 ++++++++++++++++++ pandas/tests/frame/methods/test_rank.py | 11 +++++++++ 4 files changed, 60 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 1a25b848e0f84..a2fa7cf32746b 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -24,7 +24,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) -- +- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`TODO`) .. --------------------------------------------------------------------------- .. _whatsnew_212.other: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4b79d0dbb683e..37ebb69f1d73c 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1708,7 +1708,7 @@ def __setitem__(self, key, value) -> None: data = pa.chunked_array([data]) self._pa_array = data - def _rank( + def _rank_calc( self, *, axis: AxisInt = 0, @@ -1717,9 +1717,6 @@ def _rank( ascending: bool = True, pct: bool = False, ): - """ - See Series.rank.__doc__. - """ if pa_version_under9p0 or axis != 0: ranked = super()._rank( axis=axis, @@ -1734,7 +1731,7 @@ def _rank( else: pa_type = pa.uint64() result = pa.array(ranked, type=pa_type, from_pandas=True) - return type(self)(result) + return result data = self._pa_array.combine_chunks() sort_keys = "ascending" if ascending else "descending" @@ -1773,7 +1770,29 @@ def _rank( divisor = pc.count(result) result = pc.divide(result, divisor) - return type(self)(result) + return result + + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. + """ + return type(self)( + self._rank_calc( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + ) def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str) -> Self: """ diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6262055827428..b0ddd082e9c8e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -53,6 +53,7 @@ from collections.abc import Sequence from pandas._typing import ( + AxisInt, Dtype, Scalar, npt, @@ -501,6 +502,28 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): def _convert_int_dtype(self, result): return Int64Dtype().__from_arrow__(result) + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. + """ + return self._convert_int_dtype( + self._rank_calc( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + ) + class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow_numpy" diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 8b451c84dc5da..b037169a8241d 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -488,3 +488,14 @@ def test_rank_mixed_axis_zero(self, data, expected): df.rank() result = df.rank(numeric_only=True) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype, exp_dtype", + [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")], + ) + def test_rank_string_dtype(self, dtype, exp_dtype): + # GH# + obj = Series(["foo", "foo", None, "foo"], dtype=dtype) + result = obj.rank(method="first") + expected = Series([1, 2, None, 3], dtype=exp_dtype) + tm.assert_series_equal(result, expected) From 31a3d305735f2b4bdabf0d654cdc7c9575f5e486 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 3 Oct 2023 11:40:25 +0200 Subject: [PATCH 2/6] Update v2.1.2.rst --- doc/source/whatsnew/v2.1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index a2fa7cf32746b..8954f52ecdbac 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -24,7 +24,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) -- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`TODO`) +- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) .. --------------------------------------------------------------------------- .. _whatsnew_212.other: From 06d70e07aefb014d5ca48434d10ebeeff24580fe Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 3 Oct 2023 11:40:43 +0200 Subject: [PATCH 3/6] Update test_rank.py --- pandas/tests/frame/methods/test_rank.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index b037169a8241d..2d27d725e9d3f 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -494,7 +494,7 @@ def test_rank_mixed_axis_zero(self, data, expected): [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")], ) def test_rank_string_dtype(self, dtype, exp_dtype): - # GH# + # GH#55362 obj = Series(["foo", "foo", None, "foo"], dtype=dtype) result = obj.rank(method="first") expected = Series([1, 2, None, 3], dtype=exp_dtype) From 63e9d9526d680d1e95c0fe0c33d0c9e13d069d5b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 3 Oct 2023 17:59:49 +0200 Subject: [PATCH 4/6] Update test_rank.py --- pandas/tests/frame/methods/test_rank.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 2d27d725e9d3f..b5b5e42691e59 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -495,6 +495,7 @@ def test_rank_mixed_axis_zero(self, data, expected): ) def test_rank_string_dtype(self, dtype, exp_dtype): # GH#55362 + pytest.importorskip("pyarrow") obj = Series(["foo", "foo", None, "foo"], dtype=dtype) result = obj.rank(method="first") expected = Series([1, 2, None, 3], dtype=exp_dtype) From d447e6b5eeedbebdc447228756b01105b4fc2f1f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 3 Oct 2023 19:58:29 +0200 Subject: [PATCH 5/6] Fix --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b0ddd082e9c8e..77b3c6cfca9fb 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -607,7 +607,7 @@ def _str_map( return lib.map_infer_mask(arr, f, mask.view("uint8")) def _convert_int_dtype(self, result): - result = result.to_numpy() + result = result.to_numpy(zero_copy_only=False) if result.dtype == np.int32: result = result.astype(np.int64) return result From fced7ef96f2367d7ad9b4a26079b09ef113d9ca6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 3 Oct 2023 22:05:02 +0200 Subject: [PATCH 6/6] Fix --- pandas/core/arrays/string_arrow.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 77b3c6cfca9fb..434e6d3c5480c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -607,7 +607,10 @@ def _str_map( return lib.map_infer_mask(arr, f, mask.view("uint8")) def _convert_int_dtype(self, result): - result = result.to_numpy(zero_copy_only=False) + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() if result.dtype == np.int32: result = result.astype(np.int64) return result