BUG/API (string dtype): return float dtype for series[str].rank() (#59768)

jorisvandenbossche · jorisvandenbossche · commit ea1e11800d3e · 2024-10-10T15:04:52.000+02:00
* BUG/API (string dtype): return float dtype for series[str].rank()

* update frame tests

* add whatsnew

* correct whatsnew note
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
@@ -102,6 +102,7 @@ Conversion
 
 Strings
 ^^^^^^^
+- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
 - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
 - Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
 - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -1989,7 +1989,7 @@ def _rank(
         """
         See Series.rank.__doc__.
         """
-        return self._convert_int_result(
+        return self._convert_rank_result(
             self._rank_calc(
                 axis=axis,
                 method=method,
@@ -2291,6 +2291,9 @@ def _convert_bool_result(self, result):
     def _convert_int_result(self, result):
         return type(self)(result)
 
+    def _convert_rank_result(self, result):
+        return type(self)(result)
+
     def _str_count(self, pat: str, flags: int = 0):
         if flags:
             raise NotImplementedError(f"count not implemented with {flags=}")
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -30,6 +30,7 @@
 from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
 from pandas.core.arrays.arrow import ArrowExtensionArray
 from pandas.core.arrays.boolean import BooleanDtype
+from pandas.core.arrays.floating import Float64Dtype
 from pandas.core.arrays.integer import Int64Dtype
 from pandas.core.arrays.numeric import NumericDtype
 from pandas.core.arrays.string_ import (
@@ -388,6 +389,16 @@ def _convert_int_result(self, result):
 
         return Int64Dtype().__from_arrow__(result)
 
+    def _convert_rank_result(self, result):
+        if self.dtype.na_value is np.nan:
+            if isinstance(result, pa.Array):
+                result = result.to_numpy(zero_copy_only=False)
+            else:
+                result = result.to_numpy()
+            return result.astype("float64", copy=False)
+
+        return Float64Dtype().__from_arrow__(result)
+
     def _reduce(
         self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
     ):
diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py
@@ -6,15 +6,11 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.algos import (
     Infinity,
     NegInfinity,
 )
-from pandas.compat import HAS_PYARROW
 
-import pandas as pd
 from pandas import (
     DataFrame,
     Index,
@@ -474,23 +470,10 @@ def test_rank_inf_nans_na_option(
             ("top", False, [2.0, 3.0, 1.0, 4.0]),
         ],
     )
-    def test_rank_object_first(
-        self,
-        request,
-        frame_or_series,
-        na_option,
-        ascending,
-        expected,
-        using_infer_string,
-    ):
+    def test_rank_object_first(self, frame_or_series, na_option, ascending, expected):
         obj = frame_or_series(["foo", "foo", None, "foo"])
-        if using_string_dtype() and not HAS_PYARROW and isinstance(obj, Series):
-            request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
-
         result = obj.rank(method="first", na_option=na_option, ascending=ascending)
         expected = frame_or_series(expected)
-        if using_infer_string and isinstance(obj, Series):
-            expected = expected.astype("uint64")
         tm.assert_equal(result, expected)
 
     @pytest.mark.parametrize(
@@ -514,7 +497,9 @@ def test_rank_string_dtype(self, string_dtype_no_object):
         # GH#55362
         obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object)
         result = obj.rank(method="first")
-        exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64"
+        exp_dtype = (
+            "Float64" if string_dtype_no_object == "string[pyarrow]" else "float64"
+        )
         if string_dtype_no_object.storage == "python":
             # TODO nullable string[python] should also return nullable Int64
             exp_dtype = "float64"
diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py
@@ -33,7 +33,8 @@ def ser():
         ["max", np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6])],
         ["first", np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6])],
         ["dense", np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])],
-    ]
+    ],
+    ids=lambda x: x[0],
 )
 def results(request):
     return request.param
@@ -48,12 +49,29 @@ def results(request):
         "Int64",
         pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")),
         pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")),
+        pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
+        "string[python]",
+        "str",
     ]
 )
 def dtype(request):
     return request.param
 
 
+def expected_dtype(dtype, method, pct=False):
+    exp_dtype = "float64"
+    # elif dtype in ["Int64", "Float64", "string[pyarrow]", "string[python]"]:
+    if dtype in ["string[pyarrow]"]:
+        exp_dtype = "Float64"
+    elif dtype in ["float64[pyarrow]", "int64[pyarrow]"]:
+        if method == "average" or pct:
+            exp_dtype = "double[pyarrow]"
+        else:
+            exp_dtype = "uint64[pyarrow]"
+
+    return exp_dtype
+
+
 class TestSeriesRank:
     def test_rank(self, datetime_series):
         sp_stats = pytest.importorskip("scipy.stats")
@@ -241,12 +259,14 @@ def test_rank_signature(self):
         with pytest.raises(ValueError, match=msg):
             s.rank("average")
 
-    @pytest.mark.parametrize("dtype", [None, object])
-    def test_rank_tie_methods(self, ser, results, dtype):
+    def test_rank_tie_methods(self, ser, results, dtype, using_infer_string):
         method, exp = results
+        if dtype == "int64" or (not using_infer_string and dtype == "str"):
+            pytest.skip("int64/str does not support NaN")
+
         ser = ser if dtype is None else ser.astype(dtype)
         result = ser.rank(method=method)
-        tm.assert_series_equal(result, Series(exp))
+        tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method)))
 
     @pytest.mark.parametrize("ascending", [True, False])
     @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"])
@@ -346,25 +366,35 @@ def test_rank_methods_series(self, method, op, value):
         ],
     )
     def test_rank_dense_method(self, dtype, ser, exp):
+        if ser[0] < 0 and dtype.startswith("str"):
+            exp = exp[::-1]
         s = Series(ser).astype(dtype)
         result = s.rank(method="dense")
-        expected = Series(exp).astype(result.dtype)
+        expected = Series(exp).astype(expected_dtype(dtype, "dense"))
         tm.assert_series_equal(result, expected)
 
-    def test_rank_descending(self, ser, results, dtype):
+    def test_rank_descending(self, ser, results, dtype, using_infer_string):
         method, _ = results
-        if "i" in dtype:
+        if dtype == "int64" or (not using_infer_string and dtype == "str"):
             s = ser.dropna()
         else:
             s = ser.astype(dtype)
 
         res = s.rank(ascending=False)
-        expected = (s.max() - s).rank()
-        tm.assert_series_equal(res, expected)
+        if dtype.startswith("str"):
+            expected = (s.astype("float64").max() - s.astype("float64")).rank()
+        else:
+            expected = (s.max() - s).rank()
+        tm.assert_series_equal(res, expected.astype(expected_dtype(dtype, "average")))
 
-        expected = (s.max() - s).rank(method=method)
+        if dtype.startswith("str"):
+            expected = (s.astype("float64").max() - s.astype("float64")).rank(
+                method=method
+            )
+        else:
+            expected = (s.max() - s).rank(method=method)
         res2 = s.rank(method=method, ascending=False)
-        tm.assert_series_equal(res2, expected)
+        tm.assert_series_equal(res2, expected.astype(expected_dtype(dtype, method)))
 
     def test_rank_int(self, ser, results):
         method, exp = results
@@ -421,9 +451,11 @@ def test_rank_ea_small_values(self):
     ],
 )
 def test_rank_dense_pct(dtype, ser, exp):
+    if ser[0] < 0 and dtype.startswith("str"):
+        exp = exp[::-1]
     s = Series(ser).astype(dtype)
     result = s.rank(method="dense", pct=True)
-    expected = Series(exp).astype(result.dtype)
+    expected = Series(exp).astype(expected_dtype(dtype, "dense", pct=True))
     tm.assert_series_equal(result, expected)
 
 
@@ -442,9 +474,11 @@ def test_rank_dense_pct(dtype, ser, exp):
     ],
 )
 def test_rank_min_pct(dtype, ser, exp):
+    if ser[0] < 0 and dtype.startswith("str"):
+        exp = exp[::-1]
     s = Series(ser).astype(dtype)
     result = s.rank(method="min", pct=True)
-    expected = Series(exp).astype(result.dtype)
+    expected = Series(exp).astype(expected_dtype(dtype, "min", pct=True))
     tm.assert_series_equal(result, expected)
 
 
@@ -463,9 +497,11 @@ def test_rank_min_pct(dtype, ser, exp):
     ],
 )
 def test_rank_max_pct(dtype, ser, exp):
+    if ser[0] < 0 and dtype.startswith("str"):
+        exp = exp[::-1]
     s = Series(ser).astype(dtype)
     result = s.rank(method="max", pct=True)
-    expected = Series(exp).astype(result.dtype)
+    expected = Series(exp).astype(expected_dtype(dtype, "max", pct=True))
     tm.assert_series_equal(result, expected)
 
 
@@ -484,9 +520,11 @@ def test_rank_max_pct(dtype, ser, exp):
     ],
 )
 def test_rank_average_pct(dtype, ser, exp):
+    if ser[0] < 0 and dtype.startswith("str"):
+        exp = exp[::-1]
     s = Series(ser).astype(dtype)
     result = s.rank(method="average", pct=True)
-    expected = Series(exp).astype(result.dtype)
+    expected = Series(exp).astype(expected_dtype(dtype, "average", pct=True))
     tm.assert_series_equal(result, expected)
 
 
@@ -505,9 +543,11 @@ def test_rank_average_pct(dtype, ser, exp):
     ],
 )
 def test_rank_first_pct(dtype, ser, exp):
+    if ser[0] < 0 and dtype.startswith("str"):
+        exp = exp[::-1]
     s = Series(ser).astype(dtype)
     result = s.rank(method="first", pct=True)
-    expected = Series(exp).astype(result.dtype)
+    expected = Series(exp).astype(expected_dtype(dtype, "first", pct=True))
     tm.assert_series_equal(result, expected)