diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 03b3a6b55dff6..01c2ed3821d7a 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -102,6 +102,7 @@ Conversion Strings ^^^^^^^ +- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) - Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index bd94447f0cd80..39cae5b8e2683 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1999,7 +1999,7 @@ def _rank( """ See Series.rank.__doc__. """ - return self._convert_int_result( + return self._convert_rank_result( self._rank_calc( axis=axis, method=method, @@ -2318,6 +2318,9 @@ def _convert_bool_result(self, result): def _convert_int_result(self, result): return type(self)(result) + def _convert_rank_result(self, result): + return type(self)(result) + def _str_count(self, pat: str, flags: int = 0) -> Self: if flags: raise NotImplementedError(f"count not implemented with {flags=}") diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f446cc5bde147..75bb1f8fb1a65 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -29,6 +29,7 @@ from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.arrow import ArrowExtensionArray from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.floating import Float64Dtype from pandas.core.arrays.integer import Int64Dtype from pandas.core.arrays.numeric import NumericDtype from pandas.core.arrays.string_ import ( @@ -395,6 +396,16 @@ def _convert_int_result(self, result): return Int64Dtype().__from_arrow__(result) + def _convert_rank_result(self, result): + if self.dtype.na_value is np.nan: + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() + return result.astype("float64", copy=False) + + return Float64Dtype().__from_arrow__(result) + def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index c1cdeaa6c10dd..6c6c208ee0c78 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -6,15 +6,11 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.algos import ( Infinity, NegInfinity, ) -from pandas.compat import HAS_PYARROW -import pandas as pd from pandas import ( DataFrame, Index, @@ -467,23 +463,10 @@ def test_rank_inf_nans_na_option( ("top", False, [2.0, 3.0, 1.0, 4.0]), ], ) - def test_rank_object_first( - self, - request, - frame_or_series, - na_option, - ascending, - expected, - using_infer_string, - ): + def test_rank_object_first(self, frame_or_series, na_option, ascending, expected): obj = frame_or_series(["foo", "foo", None, "foo"]) - if using_string_dtype() and not HAS_PYARROW and isinstance(obj, Series): - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) - result = obj.rank(method="first", na_option=na_option, ascending=ascending) expected = frame_or_series(expected) - if using_infer_string and isinstance(obj, Series): - expected = expected.astype("uint64") tm.assert_equal(result, expected) @pytest.mark.parametrize( @@ -507,7 +490,9 @@ def test_rank_string_dtype(self, string_dtype_no_object): # GH#55362 obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object) result = obj.rank(method="first") - exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64" + exp_dtype = ( + "Float64" if string_dtype_no_object == "string[pyarrow]" else "float64" + ) if string_dtype_no_object.storage == "python": # TODO nullable string[python] should also return nullable Int64 exp_dtype = "float64" diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 2d7fde130ce70..7c6a7893ba3a0 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -33,7 +33,8 @@ def ser(): ["max", np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6])], ["first", np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6])], ["dense", np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])], - ] + ], + ids=lambda x: x[0], ) def results(request): return request.param @@ -48,12 +49,29 @@ def results(request): "Int64", pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + "string[python]", + "str", ] ) def dtype(request): return request.param +def expected_dtype(dtype, method, pct=False): + exp_dtype = "float64" + # elif dtype in ["Int64", "Float64", "string[pyarrow]", "string[python]"]: + if dtype in ["string[pyarrow]"]: + exp_dtype = "Float64" + elif dtype in ["float64[pyarrow]", "int64[pyarrow]"]: + if method == "average" or pct: + exp_dtype = "double[pyarrow]" + else: + exp_dtype = "uint64[pyarrow]" + + return exp_dtype + + class TestSeriesRank: def test_rank(self, datetime_series): sp_stats = pytest.importorskip("scipy.stats") @@ -251,12 +269,14 @@ def test_rank_signature(self): with pytest.raises(ValueError, match=msg): s.rank("average") - @pytest.mark.parametrize("dtype", [None, object]) - def test_rank_tie_methods(self, ser, results, dtype): + def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): method, exp = results + if dtype == "int64" or (not using_infer_string and dtype == "str"): + pytest.skip("int64/str does not support NaN") + ser = ser if dtype is None else ser.astype(dtype) result = ser.rank(method=method) - tm.assert_series_equal(result, Series(exp)) + tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method))) @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"]) @pytest.mark.parametrize( @@ -357,25 +377,35 @@ def test_rank_methods_series(self, rank_method, op, value): ], ) def test_rank_dense_method(self, dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="dense") - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "dense")) tm.assert_series_equal(result, expected) - def test_rank_descending(self, ser, results, dtype): + def test_rank_descending(self, ser, results, dtype, using_infer_string): method, _ = results - if "i" in dtype: + if dtype == "int64" or (not using_infer_string and dtype == "str"): s = ser.dropna() else: s = ser.astype(dtype) res = s.rank(ascending=False) - expected = (s.max() - s).rank() - tm.assert_series_equal(res, expected) + if dtype.startswith("str"): + expected = (s.astype("float64").max() - s.astype("float64")).rank() + else: + expected = (s.max() - s).rank() + tm.assert_series_equal(res, expected.astype(expected_dtype(dtype, "average"))) - expected = (s.max() - s).rank(method=method) + if dtype.startswith("str"): + expected = (s.astype("float64").max() - s.astype("float64")).rank( + method=method + ) + else: + expected = (s.max() - s).rank(method=method) res2 = s.rank(method=method, ascending=False) - tm.assert_series_equal(res2, expected) + tm.assert_series_equal(res2, expected.astype(expected_dtype(dtype, method))) def test_rank_int(self, ser, results): method, exp = results @@ -432,9 +462,11 @@ def test_rank_ea_small_values(self): ], ) def test_rank_dense_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="dense", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "dense", pct=True)) tm.assert_series_equal(result, expected) @@ -453,9 +485,11 @@ def test_rank_dense_pct(dtype, ser, exp): ], ) def test_rank_min_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="min", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "min", pct=True)) tm.assert_series_equal(result, expected) @@ -474,9 +508,11 @@ def test_rank_min_pct(dtype, ser, exp): ], ) def test_rank_max_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="max", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "max", pct=True)) tm.assert_series_equal(result, expected) @@ -495,9 +531,11 @@ def test_rank_max_pct(dtype, ser, exp): ], ) def test_rank_average_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="average", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "average", pct=True)) tm.assert_series_equal(result, expected) @@ -516,9 +554,11 @@ def test_rank_average_pct(dtype, ser, exp): ], ) def test_rank_first_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="first", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "first", pct=True)) tm.assert_series_equal(result, expected)