diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index f992d6aa09ead..8cb3f7a2e6032 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -137,7 +137,8 @@ Timezones Numeric ^^^^^^^ -- +- Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`) +- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top"`` is used (:issue:`41931`) - Conversion diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 4efc30e40654c..a026cbe447c19 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1372,26 +1372,29 @@ def rank_2d( Fast NaN-friendly version of ``scipy.stats.rankdata``. """ cdef: - Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - Py_ssize_t infs - ndarray[float64_t, ndim=2] ranks + Py_ssize_t k, n, col + float64_t[::1, :] out # Column-major so columns are contiguous + int64_t[::1, :] grp_sizes + const intp_t[:] labels ndarray[rank_t, ndim=2] values - ndarray[intp_t, ndim=2] argsort_indexer - ndarray[uint8_t, ndim=2] mask - rank_t val, nan_fill_val - float64_t count, sum_ranks = 0.0 - int tiebreak = 0 - int64_t idx - bint check_mask, condition, keep_na, nans_rank_highest + rank_t[:, :] masked_vals + intp_t[:, :] sort_indexer + uint8_t[:, :] mask + TiebreakEnumType tiebreak + bint check_mask, keep_na, nans_rank_highest + rank_t nan_fill_val tiebreak = tiebreakers[ties_method] + if tiebreak == TIEBREAK_FIRST: + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING keep_na = na_option == 'keep' # For cases where a mask is not possible, we can avoid mask checks check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike)) - if axis == 0: + if axis == 1: values = np.asarray(in_arr).T.copy() else: values = np.asarray(in_arr).copy() @@ -1403,99 +1406,62 @@ def rank_2d( 
nans_rank_highest = ascending ^ (na_option == 'top') if check_mask: nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) + if rank_t is object: - mask = missing.isnaobj2d(values) + mask = missing.isnaobj2d(values).view(np.uint8) elif rank_t is float64_t: - mask = np.isnan(values) + mask = np.isnan(values).view(np.uint8) # int64 and datetimelike else: - mask = values == NPY_NAT - + mask = (values == NPY_NAT).view(np.uint8) np.putmask(values, mask, nan_fill_val) else: - mask = np.zeros_like(values, dtype=bool) + mask = np.zeros_like(values, dtype=np.uint8) + + if nans_rank_highest: + order = (values, mask) + else: + order = (values, ~np.asarray(mask)) n, k = (values).shape - ranks = np.empty((n, k), dtype='f8') + out = np.empty((n, k), dtype='f8', order='F') + grp_sizes = np.ones((n, k), dtype='i8', order='F') + labels = np.zeros(n, dtype=np.intp) - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - argsort_indexer = values.argsort(axis=1, kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING + # lexsort is slower, so only use if we need to worry about the mask + if check_mask: + sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) else: - argsort_indexer = values.argsort(1) + kind = "stable" if ties_method == "first" else None + sort_indexer = values.argsort(axis=0, kind=kind).astype(np.intp, copy=False) if not ascending: - argsort_indexer = argsort_indexer[:, ::-1] - - values = _take_2d(values, argsort_indexer) + sort_indexer = sort_indexer[::-1, :] - for i in range(n): - dups = sum_ranks = infs = 0 - - total_tie_count = 0 - count = 0.0 - for j in range(k): - val = values[i, j] - idx = argsort_indexer[i, j] - if keep_na and check_mask and mask[i, idx]: - ranks[i, idx] = NaN - infs += 1 - continue - - count += 1.0 - - sum_ranks += (j - infs) + 1 - dups += 1 - - if rank_t is object: - condition = ( - j == k - 1 or - are_diff(values[i, j + 1], val) or - (keep_na and check_mask and mask[i, 
argsort_indexer[i, j + 1]]) - ) - else: - condition = ( - j == k - 1 or - values[i, j + 1] != val or - (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]]) - ) - - if condition: - if tiebreak == TIEBREAK_AVERAGE: - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = j - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = j + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported for non-numeric data') - else: - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = z + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = 2 * j - z - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = total_tie_count - sum_ranks = dups = 0 - if pct: - if tiebreak == TIEBREAK_DENSE: - ranks[i, :] /= total_tie_count - else: - ranks[i, :] /= count - if axis == 0: - return ranks.T + # putmask doesn't accept a memoryview, so we assign in a separate step + masked_vals = values + with nogil: + for col in range(k): + rank_sorted_1d( + out[:, col], + grp_sizes[:, col], + labels, + sort_indexer[:, col], + masked_vals[:, col], + mask[:, col], + tiebreak, + check_mask, + False, + keep_na, + pct, + n, + ) + + if axis == 1: + return np.asarray(out.T) else: - return ranks + return np.asarray(out) ctypedef fused diff_t: diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 96605fd2009fb..90a9a7b2349e1 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -219,33 +219,3 @@ def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{endif}} {{endfor}} - -# 
---------------------------------------------------------------------- -# take_2d internal function -# ---------------------------------------------------------------------- - -ctypedef fused take_t: - float64_t - uint64_t - int64_t - object - - -cdef _take_2d(ndarray[take_t, ndim=2] values, ndarray[intp_t, ndim=2] idx): - cdef: - Py_ssize_t i, j, N, K - ndarray[intp_t, ndim=2, cast=True] indexer = idx - ndarray[take_t, ndim=2] result - - N, K = (values).shape - - if take_t is object: - # evaluated at compile-time - result = values.copy() - else: - result = np.empty_like(values) - - for i in range(N): - for j in range(K): - result[i, j] = values[i, indexer[i, j]] - return result diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 5ba4ab4408f11..6c5831ad897d1 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -246,13 +246,11 @@ def test_rank_methods_frame(self): expected = DataFrame(sprank, columns=cols).astype("float64") tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning") def test_rank_descending(self, method, dtype): - if "i" in dtype: - df = self.df.dropna() + df = self.df.dropna().astype(dtype) else: df = self.df.astype(dtype) @@ -260,9 +258,6 @@ def test_rank_descending(self, method, dtype): expected = (df.max() - df).rank() tm.assert_frame_equal(res, expected) - if method == "first" and dtype == "O": - return - expected = (df.max() - df).rank(method=method) if dtype != "O": @@ -287,9 +282,6 @@ def _check2d(df, expected, method="average", axis=0): result = df.rank(method=method, axis=axis) tm.assert_frame_equal(result, exp_df) - disabled = {(object, "first")} - if (dtype, method) in disabled: - return frame = df if dtype is None else df.astype(dtype) _check2d(frame, self.results[method], method=method, 
axis=axis) @@ -456,6 +448,38 @@ def test_rank_both_inf(self): result = df.rank() tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "na_option,ascending,expected", + [ + ("top", True, [3.0, 1.0, 2.0]), + ("top", False, [2.0, 1.0, 3.0]), + ("bottom", True, [2.0, 3.0, 1.0]), + ("bottom", False, [1.0, 3.0, 2.0]), + ], + ) + def test_rank_inf_nans_na_option( + self, frame_or_series, method, na_option, ascending, expected + ): + obj = frame_or_series([np.inf, np.nan, -np.inf]) + result = obj.rank(method=method, na_option=na_option, ascending=ascending) + expected = frame_or_series(expected) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "na_option,ascending,expected", + [ + ("bottom", True, [1.0, 2.0, 4.0, 3.0]), + ("bottom", False, [1.0, 2.0, 4.0, 3.0]), + ("top", True, [2.0, 3.0, 1.0, 4.0]), + ("top", False, [2.0, 3.0, 1.0, 4.0]), + ], + ) + def test_rank_object_first(self, frame_or_series, na_option, ascending, expected): + obj = frame_or_series(["foo", "foo", None, "foo"]) + result = obj.rank(method="first", na_option=na_option, ascending=ascending) + expected = frame_or_series(expected) + tm.assert_equal(result, expected) + @pytest.mark.parametrize( "data,expected", [