diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 92efb225682b7..6bba650dc80a1 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -501,6 +501,7 @@ Numeric
 - Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`)
 - Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`)
 - Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`)
+- Bug in ``rank`` method for :class:`Series`, :class:`DataFrame`, :class:`DataFrameGroupBy`, and :class:`SeriesGroupBy` treating the most negative ``int64`` value as missing (:issue:`32859`)
 - Bug in :func:`select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`)
 - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`)
 - Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`)
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 495160e65eec3..a4bc2443e0eeb 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -962,6 +962,7 @@ ctypedef fused rank_t:
 def rank_1d(
     ndarray[rank_t, ndim=1] values,
     const intp_t[:] labels,
+    bint is_datetimelike=False,
     ties_method="average",
     bint ascending=True,
     bint pct=False,
@@ -977,6 +978,8 @@ def rank_1d(
         Array containing unique label for each group, with its ordering
         matching up to the corresponding record in `values`. If not called
         from a groupby operation, will be an array of 0's
+    is_datetimelike : bool, default False
+        True if `values` contains datetime-like entries.
     ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
         'average'
         * average: average rank of group
@@ -1032,7 +1035,7 @@ def rank_1d(
 
     if rank_t is object:
         mask = missing.isnaobj(masked_vals)
-    elif rank_t is int64_t:
+    elif rank_t is int64_t and is_datetimelike:
         mask = (masked_vals == NPY_NAT).astype(np.uint8)
     elif rank_t is float64_t:
         mask = np.isnan(masked_vals).astype(np.uint8)
@@ -1059,7 +1062,7 @@ def rank_1d(
         if rank_t is object:
             nan_fill_val = NegInfinity()
         elif rank_t is int64_t:
-            nan_fill_val = np.iinfo(np.int64).min
+            nan_fill_val = NPY_NAT
         elif rank_t is uint64_t:
             nan_fill_val = 0
         else:
@@ -1275,6 +1278,7 @@ def rank_1d(
 def rank_2d(
     ndarray[rank_t, ndim=2] in_arr,
     int axis=0,
+    bint is_datetimelike=False,
     ties_method="average",
     bint ascending=True,
     na_option="keep",
@@ -1299,7 +1303,9 @@ def rank_2d(
     tiebreak = tiebreakers[ties_method]
 
     keep_na = na_option == 'keep'
-    check_mask = rank_t is not uint64_t
+
+    # For cases where a mask is not possible, we can avoid mask checks
+    check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
 
     if axis == 0:
         values = np.asarray(in_arr).T.copy()
@@ -1310,13 +1316,15 @@ def rank_2d(
         if values.dtype != np.object_:
             values = values.astype('O')
 
-    if rank_t is not uint64_t:
+    if check_mask:
         if ascending ^ (na_option == 'top'):
             if rank_t is object:
                 nan_value = Infinity()
             elif rank_t is float64_t:
                 nan_value = np.inf
-            elif rank_t is int64_t:
+
+            # int64 and datetimelike
+            else:
                 nan_value = np.iinfo(np.int64).max
 
         else:
@@ -1324,14 +1332,18 @@ def rank_2d(
                 nan_value = NegInfinity()
             elif rank_t is float64_t:
                 nan_value = -np.inf
-            elif rank_t is int64_t:
+
+            # int64 and datetimelike
+            else:
                 nan_value = NPY_NAT
 
         if rank_t is object:
             mask = missing.isnaobj2d(values)
         elif rank_t is float64_t:
             mask = np.isnan(values)
-        elif rank_t is int64_t:
+
+        # int64 and datetimelike
+        else:
             mask = values == NPY_NAT
 
         np.putmask(values, mask, nan_value)
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 7ddc087df9b11..64373adf0217f 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1079,9 +1079,8 @@ def group_rank(float64_t[:, ::1] out,
     ngroups : int
         This parameter is not used, is needed to match signatures of other
        groupby functions.
-    is_datetimelike : bool, default False
-        unused in this method but provided for call compatibility with other
-        Cython transformations
+    is_datetimelike : bool
+        True if `values` contains datetime-like entries.
     ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
         'average'
         * average: average rank of group
@@ -1109,6 +1108,7 @@ def group_rank(float64_t[:, ::1] out,
     result = rank_1d(
         values=values[:, 0],
         labels=labels,
+        is_datetimelike=is_datetimelike,
         ties_method=ties_method,
         ascending=ascending,
         pct=pct,
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 77b5a0148905e..f52aff424eb0b 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1031,21 +1031,23 @@ def rank(
         Whether or not to the display the returned rankings in integer form
         (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
     """
+    is_datetimelike = needs_i8_conversion(values.dtype)
+    values = _get_values_for_rank(values)
     if values.ndim == 1:
-        values = _get_values_for_rank(values)
         ranks = algos.rank_1d(
             values,
             labels=np.zeros(len(values), dtype=np.intp),
+            is_datetimelike=is_datetimelike,
             ties_method=method,
             ascending=ascending,
             na_option=na_option,
             pct=pct,
         )
     elif values.ndim == 2:
-        values = _get_values_for_rank(values)
         ranks = algos.rank_2d(
             values,
             axis=axis,
+            is_datetimelike=is_datetimelike,
             ties_method=method,
             ascending=ascending,
             na_option=na_option,
diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py
index ce46d1d8b1869..6538eda8cdeff 100644
--- a/pandas/tests/frame/methods/test_rank.py
+++ b/pandas/tests/frame/methods/test_rank.py
@@ -6,7 +6,6 @@
 import numpy as np
 import pytest
 
-from pandas._libs import iNaT
 from pandas._libs.algos import (
     Infinity,
     NegInfinity,
@@ -382,7 +381,7 @@ def test_pct_max_many_rows(self):
                 "float32",
             ),
             ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"),
-            pytest.param(
+            (
                 [
                     np.iinfo(np.int64).min,
                     -100,
@@ -394,20 +393,20 @@ def test_pct_max_many_rows(self):
                     np.iinfo(np.int64).max,
                 ],
                 "int64",
-                marks=pytest.mark.xfail(
-                    reason="iNaT is equivalent to minimum value of dtype"
-                    "int64 pending issue GH#16674"
-                ),
             ),
             ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"),
+            (
+                [datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 5)],
+                "datetime64",
+            ),
         ],
     )
     def test_rank_inf_and_nan(self, contents, dtype, frame_or_series):
         dtype_na_map = {
             "float64": np.nan,
             "float32": np.nan,
-            "int64": iNaT,
             "object": None,
+            "datetime64": np.datetime64("nat"),
         }
         # Insert nans at random positions if underlying dtype has missing
         # value. Then adjust the expected order by adding nans accordingly
diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py
index 6116703ebd174..00641effac08d 100644
--- a/pandas/tests/groupby/test_rank.py
+++ b/pandas/tests/groupby/test_rank.py
@@ -1,9 +1,12 @@
+from datetime import datetime
+
 import numpy as np
 import pytest
 
 import pandas as pd
 from pandas import (
     DataFrame,
+    NaT,
     Series,
     concat,
 )
@@ -517,3 +520,25 @@ def test_rank_zero_div(input_key, input_value, output_value):
     result = df.groupby("A").rank(method="dense", pct=True)
     expected = DataFrame({"B": output_value})
     tm.assert_frame_equal(result, expected)
+
+
+def test_rank_min_int():
+    # GH-32859
+    df = DataFrame(
+        {
+            "grp": [1, 1, 2],
+            "int_col": [
+                np.iinfo(np.int64).min,
+                np.iinfo(np.int64).max,
+                np.iinfo(np.int64).min,
+            ],
+            "datetimelike": [NaT, datetime(2001, 1, 1), NaT],
+        }
+    )
+
+    result = df.groupby("grp").rank()
+    expected = DataFrame(
+        {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.NaN, 1.0, np.NaN]}
+    )
+
+    tm.assert_frame_equal(result, expected)