diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index b6d5493aefaa9..607a14c696578 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -212,6 +212,7 @@ Numeric - Bug in :meth:`DataFrame.quantile`, :meth:`DataFrame.sort_values` causing incorrect subsequent indexing behavior (:issue:`38351`) - Bug in :meth:`DataFrame.select_dtypes` with ``include=np.number`` now retains numeric ``ExtensionDtype`` columns (:issue:`35340`) - Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`) +- Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`) Conversion ^^^^^^^^^^ diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 4cddd49381a83..d97957eea0543 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1022,16 +1022,19 @@ def rank_2d( ndarray[float64_t, ndim=2] ranks ndarray[rank_t, ndim=2] values ndarray[int64_t, ndim=2] argsorted + ndarray[uint8_t, ndim=2] mask rank_t val, nan_value float64_t sum_ranks = 0 int tiebreak = 0 + int64_t idx bint keep_na = False float64_t count = 0.0 - bint condition, skip_condition + bint condition, check_mask tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' + check_mask = rank_t is not uint64_t if axis == 0: values = np.asarray(in_arr).T.copy() @@ -1067,6 +1070,8 @@ def rank_2d( mask = values == NPY_NAT np.putmask(values, mask, nan_value) + else: + mask = np.zeros_like(values, dtype=bool) n, k = (values).shape ranks = np.empty((n, k), dtype='f8') @@ -1099,43 +1104,35 @@ def rank_2d( argsorted = _as.astype('i8') for i in range(n): - if rank_t is object: - dups = sum_ranks = infs = 0 - else: - dups = sum_ranks = 0 + dups = sum_ranks = infs = 0 total_tie_count = 0 count = 0.0 for j in range(k): - if rank_t is not object: - sum_ranks += j + 1 - dups += 1 - val = values[i, j] - - if rank_t is not uint64_t: - if rank_t is 
object: - skip_condition = (val is nan_value) and keep_na - else: - skip_condition = (val == nan_value) and keep_na - if skip_condition: - ranks[i, argsorted[i, j]] = NaN - - if rank_t is object: - infs += 1 - - continue + idx = argsorted[i, j] + if keep_na and check_mask and mask[i, idx]: + ranks[i, idx] = NaN + infs += 1 + continue count += 1.0 - if rank_t is object: - sum_ranks += (j - infs) + 1 - dups += 1 + sum_ranks += (j - infs) + 1 + dups += 1 if rank_t is object: - condition = j == k - 1 or are_diff(values[i, j + 1], val) + condition = ( + j == k - 1 or + are_diff(values[i, j + 1], val) or + (keep_na and check_mask and mask[i, argsorted[i, j + 1]]) + ) else: - condition = j == k - 1 or values[i, j + 1] != val + condition = ( + j == k - 1 or + values[i, j + 1] != val or + (keep_na and check_mask and mask[i, argsorted[i, j + 1]]) + ) if condition: if tiebreak == TIEBREAK_AVERAGE: diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index bab2db3192b4a..991a91275ae1d 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._libs import iNaT +from pandas._libs.algos import Infinity, NegInfinity import pandas.util._test_decorators as td from pandas import DataFrame, Series @@ -329,3 +331,116 @@ def test_pct_max_many_rows(self): ) result = df.rank(pct=True).max() assert (result == 1).all() + + @pytest.mark.parametrize( + "contents,dtype", + [ + ( + [ + -np.inf, + -50, + -1, + -1e-20, + -1e-25, + -1e-50, + 0, + 1e-40, + 1e-20, + 1e-10, + 2, + 40, + np.inf, + ], + "float64", + ), + ( + [ + -np.inf, + -50, + -1, + -1e-20, + -1e-25, + -1e-45, + 0, + 1e-40, + 1e-20, + 1e-10, + 2, + 40, + np.inf, + ], + "float32", + ), + ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"), + pytest.param( + [ + np.iinfo(np.int64).min, + -100, + 0, + 1, + 9999, + 100000, + 1e10, + np.iinfo(np.int64).max, + ], + "int64", + 
marks=pytest.mark.xfail( + reason="iNaT is equivalent to minimum value of dtype" + "int64 pending issue GH#16674" + ), + ), + ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"), + ], + ) + def test_rank_inf_and_nan(self, contents, dtype): + dtype_na_map = { + "float64": np.nan, + "float32": np.nan, + "int64": iNaT, + "object": None, + } + # Insert nans at random positions if underlying dtype has missing + # value. Then adjust the expected order by adding nans accordingly. + # This is for testing whether rank calculation is affected + # when values are intertwined with nan values. + values = np.array(contents, dtype=dtype) + exp_order = np.array(range(len(values)), dtype="float64") + 1.0 + if dtype in dtype_na_map: + na_value = dtype_na_map[dtype] + nan_indices = np.random.choice(range(len(values)), 5) + values = np.insert(values, nan_indices, na_value) + exp_order = np.insert(exp_order, nan_indices, np.nan) + # shuffle the testing array and expected results in the same way + random_order = np.random.permutation(len(values)) + df = DataFrame({"a": values[random_order]}) + expected = DataFrame({"a": exp_order[random_order]}, dtype="float64") + result = df.rank() + tm.assert_frame_equal(result, expected) + + def test_df_series_inf_nan_consistency(self): + # GH#32593 + index = [5, 4, 3, 2, 1, 6, 7, 8, 9, 10] + col1 = [5, 4, 3, 5, 8, 5, 2, 1, 6, 6] + col2 = [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf] + df = DataFrame( + data={ + "col1": col1, + "col2": col2, + }, + index=index, + dtype="f8", + ) + df_result = df.rank() + + series_result = df.copy() + series_result["col1"] = df["col1"].rank() + series_result["col2"] = df["col2"].rank() + + tm.assert_frame_equal(df_result, series_result) + + def test_rank_both_inf(self): + # GH#32593 + df = DataFrame({"a": [-np.inf, 0, np.inf]}) + expected = DataFrame({"a": [1.0, 2.0, 3.0]}) + result = df.rank() + tm.assert_frame_equal(result, expected)