Skip to content

BUG/API (string dtype): return float dtype for series[str].rank() #59768

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ Conversion

Strings
^^^^^^^
- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1999,7 +1999,7 @@ def _rank(
"""
See Series.rank.__doc__.
"""
return self._convert_int_result(
return self._convert_rank_result(
self._rank_calc(
axis=axis,
method=method,
Expand Down Expand Up @@ -2318,6 +2318,9 @@ def _convert_bool_result(self, result):
def _convert_int_result(self, result):
return type(self)(result)

def _convert_rank_result(self, result):
return type(self)(result)

def _str_count(self, pat: str, flags: int = 0) -> Self:
if flags:
raise NotImplementedError(f"count not implemented with {flags=}")
Expand Down
11 changes: 11 additions & 0 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
from pandas.core.arrays.arrow import ArrowExtensionArray
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.floating import Float64Dtype
from pandas.core.arrays.integer import Int64Dtype
from pandas.core.arrays.numeric import NumericDtype
from pandas.core.arrays.string_ import (
Expand Down Expand Up @@ -395,6 +396,16 @@ def _convert_int_result(self, result):

return Int64Dtype().__from_arrow__(result)

def _convert_rank_result(self, result):
if self.dtype.na_value is np.nan:
if isinstance(result, pa.Array):
result = result.to_numpy(zero_copy_only=False)
else:
result = result.to_numpy()
return result.astype("float64", copy=False)

return Float64Dtype().__from_arrow__(result)

def _reduce(
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
):
Expand Down
23 changes: 4 additions & 19 deletions pandas/tests/frame/methods/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,11 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs.algos import (
Infinity,
NegInfinity,
)
from pandas.compat import HAS_PYARROW

import pandas as pd
from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -467,23 +463,10 @@ def test_rank_inf_nans_na_option(
("top", False, [2.0, 3.0, 1.0, 4.0]),
],
)
def test_rank_object_first(
self,
request,
frame_or_series,
na_option,
ascending,
expected,
using_infer_string,
):
def test_rank_object_first(self, frame_or_series, na_option, ascending, expected):
obj = frame_or_series(["foo", "foo", None, "foo"])
if using_string_dtype() and not HAS_PYARROW and isinstance(obj, Series):
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))

result = obj.rank(method="first", na_option=na_option, ascending=ascending)
expected = frame_or_series(expected)
if using_infer_string and isinstance(obj, Series):
expected = expected.astype("uint64")
tm.assert_equal(result, expected)

@pytest.mark.parametrize(
Expand All @@ -507,7 +490,9 @@ def test_rank_string_dtype(self, string_dtype_no_object):
# GH#55362
obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object)
result = obj.rank(method="first")
exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64"
exp_dtype = (
"Float64" if string_dtype_no_object == "string[pyarrow]" else "float64"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does this change the string[python] case to float64 from Float64? I guess yet another discussion for PDEP-13, but shouldn't the if string_dtype_no_object.na_value invariant still remain?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be clear this PR doesn't "change" anything regarding the flavor of dtypes (just changing int to float) and is only testing what we have right now, but you are certainly right this is inconsistent.

The technical reason is (AFAIK) that we just never implemented (yet) rank for our nullable dtypes in general (so also the nullable int returns numpy floats instead of nullable float). This can be considered as a missing part of the implementation.
And at some point we added a custom rank implementation on ArrowExtensionArray using pyarrow compute, and because of the subclass structure, the ArrowStringArray (i.e. "string[pyarrow]") inherits that but converts the pyarrow result to a nullable dtype (and "string[python]" array class does not inherit from this, so uses the base class rank implementation with always returns numpy types).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah OK...very confusing. Thanks for clarifying

)
if string_dtype_no_object.storage == "python":
# TODO nullable string[python] should also return nullable Int64
exp_dtype = "float64"
Expand Down
72 changes: 56 additions & 16 deletions pandas/tests/series/methods/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ def ser():
["max", np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6])],
["first", np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6])],
["dense", np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])],
]
],
ids=lambda x: x[0],
)
def results(request):
return request.param
Expand All @@ -48,12 +49,29 @@ def results(request):
"Int64",
pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")),
pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")),
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
"string[python]",
"str",
]
)
def dtype(request):
return request.param


def expected_dtype(dtype, method, pct=False):
exp_dtype = "float64"
# elif dtype in ["Int64", "Float64", "string[pyarrow]", "string[python]"]:
if dtype in ["string[pyarrow]"]:
exp_dtype = "Float64"
elif dtype in ["float64[pyarrow]", "int64[pyarrow]"]:
if method == "average" or pct:
exp_dtype = "double[pyarrow]"
else:
exp_dtype = "uint64[pyarrow]"

return exp_dtype


class TestSeriesRank:
def test_rank(self, datetime_series):
sp_stats = pytest.importorskip("scipy.stats")
Expand Down Expand Up @@ -251,12 +269,14 @@ def test_rank_signature(self):
with pytest.raises(ValueError, match=msg):
s.rank("average")

@pytest.mark.parametrize("dtype", [None, object])
def test_rank_tie_methods(self, ser, results, dtype):
def test_rank_tie_methods(self, ser, results, dtype, using_infer_string):
method, exp = results
if dtype == "int64" or (not using_infer_string and dtype == "str"):
pytest.skip("int64/str does not support NaN")

ser = ser if dtype is None else ser.astype(dtype)
result = ser.rank(method=method)
tm.assert_series_equal(result, Series(exp))
tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method)))

@pytest.mark.parametrize("na_option", ["top", "bottom", "keep"])
@pytest.mark.parametrize(
Expand Down Expand Up @@ -357,25 +377,35 @@ def test_rank_methods_series(self, rank_method, op, value):
],
)
def test_rank_dense_method(self, dtype, ser, exp):
if ser[0] < 0 and dtype.startswith("str"):
exp = exp[::-1]
s = Series(ser).astype(dtype)
result = s.rank(method="dense")
expected = Series(exp).astype(result.dtype)
expected = Series(exp).astype(expected_dtype(dtype, "dense"))
tm.assert_series_equal(result, expected)

def test_rank_descending(self, ser, results, dtype):
def test_rank_descending(self, ser, results, dtype, using_infer_string):
method, _ = results
if "i" in dtype:
if dtype == "int64" or (not using_infer_string and dtype == "str"):
s = ser.dropna()
else:
s = ser.astype(dtype)

res = s.rank(ascending=False)
expected = (s.max() - s).rank()
tm.assert_series_equal(res, expected)
if dtype.startswith("str"):
expected = (s.astype("float64").max() - s.astype("float64")).rank()
else:
expected = (s.max() - s).rank()
tm.assert_series_equal(res, expected.astype(expected_dtype(dtype, "average")))

expected = (s.max() - s).rank(method=method)
if dtype.startswith("str"):
expected = (s.astype("float64").max() - s.astype("float64")).rank(
method=method
)
else:
expected = (s.max() - s).rank(method=method)
res2 = s.rank(method=method, ascending=False)
tm.assert_series_equal(res2, expected)
tm.assert_series_equal(res2, expected.astype(expected_dtype(dtype, method)))

def test_rank_int(self, ser, results):
method, exp = results
Expand Down Expand Up @@ -432,9 +462,11 @@ def test_rank_ea_small_values(self):
],
)
def test_rank_dense_pct(dtype, ser, exp):
if ser[0] < 0 and dtype.startswith("str"):
exp = exp[::-1]
s = Series(ser).astype(dtype)
result = s.rank(method="dense", pct=True)
expected = Series(exp).astype(result.dtype)
expected = Series(exp).astype(expected_dtype(dtype, "dense", pct=True))
tm.assert_series_equal(result, expected)


Expand All @@ -453,9 +485,11 @@ def test_rank_dense_pct(dtype, ser, exp):
],
)
def test_rank_min_pct(dtype, ser, exp):
if ser[0] < 0 and dtype.startswith("str"):
exp = exp[::-1]
s = Series(ser).astype(dtype)
result = s.rank(method="min", pct=True)
expected = Series(exp).astype(result.dtype)
expected = Series(exp).astype(expected_dtype(dtype, "min", pct=True))
tm.assert_series_equal(result, expected)


Expand All @@ -474,9 +508,11 @@ def test_rank_min_pct(dtype, ser, exp):
],
)
def test_rank_max_pct(dtype, ser, exp):
if ser[0] < 0 and dtype.startswith("str"):
exp = exp[::-1]
s = Series(ser).astype(dtype)
result = s.rank(method="max", pct=True)
expected = Series(exp).astype(result.dtype)
expected = Series(exp).astype(expected_dtype(dtype, "max", pct=True))
tm.assert_series_equal(result, expected)


Expand All @@ -495,9 +531,11 @@ def test_rank_max_pct(dtype, ser, exp):
],
)
def test_rank_average_pct(dtype, ser, exp):
if ser[0] < 0 and dtype.startswith("str"):
exp = exp[::-1]
s = Series(ser).astype(dtype)
result = s.rank(method="average", pct=True)
expected = Series(exp).astype(result.dtype)
expected = Series(exp).astype(expected_dtype(dtype, "average", pct=True))
tm.assert_series_equal(result, expected)


Expand All @@ -516,9 +554,11 @@ def test_rank_average_pct(dtype, ser, exp):
],
)
def test_rank_first_pct(dtype, ser, exp):
if ser[0] < 0 and dtype.startswith("str"):
exp = exp[::-1]
s = Series(ser).astype(dtype)
result = s.rank(method="first", pct=True)
expected = Series(exp).astype(result.dtype)
expected = Series(exp).astype(expected_dtype(dtype, "first", pct=True))
tm.assert_series_equal(result, expected)


Expand Down
Loading