BUG: rank treating min int as NaN (#40659)

mzeitlin11 · web-flow · commit a59987b4049a · 2021-03-29T10:21:48.000-04:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -533,6 +533,7 @@ Numeric
 - Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`)
 - Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`)
 - Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`)
+- Bug in ``rank`` method for :class:`Series`, :class:`DataFrame`, :class:`DataFrameGroupBy`, and :class:`SeriesGroupBy` treating the most negative ``int64`` value as missing (:issue:`32859`)
 - Bug in :func:`select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`)
 - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`)
 - Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`)
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -962,6 +962,7 @@ ctypedef fused rank_t:
 def rank_1d(
     ndarray[rank_t, ndim=1] values,
     const intp_t[:] labels,
+    bint is_datetimelike=False,
     ties_method="average",
     bint ascending=True,
     bint pct=False,
@@ -977,6 +978,8 @@ def rank_1d(
         Array containing unique label for each group, with its ordering
         matching up to the corresponding record in `values`. If not called
         from a groupby operation, will be an array of 0's
+    is_datetimelike : bool, default False
+        True if `values` contains datetime-like entries.
     ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
         'average'
         * average: average rank of group
@@ -1032,7 +1035,7 @@ def rank_1d(
 
     if rank_t is object:
         mask = missing.isnaobj(masked_vals)
-    elif rank_t is int64_t:
+    elif rank_t is int64_t and is_datetimelike:
         mask = (masked_vals == NPY_NAT).astype(np.uint8)
     elif rank_t is float64_t:
         mask = np.isnan(masked_vals).astype(np.uint8)
@@ -1059,7 +1062,7 @@ def rank_1d(
         if rank_t is object:
             nan_fill_val = NegInfinity()
         elif rank_t is int64_t:
-            nan_fill_val = np.iinfo(np.int64).min
+            nan_fill_val = NPY_NAT
         elif rank_t is uint64_t:
             nan_fill_val = 0
         else:
@@ -1275,6 +1278,7 @@ def rank_1d(
 def rank_2d(
     ndarray[rank_t, ndim=2] in_arr,
     int axis=0,
+    bint is_datetimelike=False,
     ties_method="average",
     bint ascending=True,
     na_option="keep",
@@ -1299,7 +1303,9 @@ def rank_2d(
     tiebreak = tiebreakers[ties_method]
 
     keep_na = na_option == 'keep'
-    check_mask = rank_t is not uint64_t
+
+    # For cases where a mask is not possible, we can avoid mask checks
+    check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
 
     if axis == 0:
         values = np.asarray(in_arr).T.copy()
@@ -1310,28 +1316,34 @@ def rank_2d(
         if values.dtype != np.object_:
             values = values.astype('O')
 
-    if rank_t is not uint64_t:
+    if check_mask:
         if ascending ^ (na_option == 'top'):
             if rank_t is object:
                 nan_value = Infinity()
             elif rank_t is float64_t:
                 nan_value = np.inf
-            elif rank_t is int64_t:
+
+            # int64 and datetimelike
+            else:
                 nan_value = np.iinfo(np.int64).max
 
         else:
             if rank_t is object:
                 nan_value = NegInfinity()
             elif rank_t is float64_t:
                 nan_value = -np.inf
-            elif rank_t is int64_t:
+
+            # int64 and datetimelike
+            else:
                 nan_value = NPY_NAT
 
         if rank_t is object:
             mask = missing.isnaobj2d(values)
         elif rank_t is float64_t:
             mask = np.isnan(values)
-        elif rank_t is int64_t:
+
+        # int64 and datetimelike
+        else:
             mask = values == NPY_NAT
 
         np.putmask(values, mask, nan_value)
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -1074,9 +1074,8 @@ def group_rank(float64_t[:, ::1] out,
     ngroups : int
         This parameter is not used, is needed to match signatures of other
         groupby functions.
-    is_datetimelike : bool, default False
-        unused in this method but provided for call compatibility with other
-        Cython transformations
+    is_datetimelike : bool
+        True if `values` contains datetime-like entries.
     ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
         'average'
         * average: average rank of group
@@ -1104,6 +1103,7 @@ def group_rank(float64_t[:, ::1] out,
     result = rank_1d(
         values=values[:, 0],
         labels=labels,
+        is_datetimelike=is_datetimelike,
         ties_method=ties_method,
         ascending=ascending,
         pct=pct,
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -1031,21 +1031,23 @@ def rank(
         Whether or not to the display the returned rankings in integer form
         (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
     """
+    is_datetimelike = needs_i8_conversion(values.dtype)
+    values = _get_values_for_rank(values)
     if values.ndim == 1:
-        values = _get_values_for_rank(values)
         ranks = algos.rank_1d(
             values,
             labels=np.zeros(len(values), dtype=np.intp),
+            is_datetimelike=is_datetimelike,
             ties_method=method,
             ascending=ascending,
             na_option=na_option,
             pct=pct,
         )
     elif values.ndim == 2:
-        values = _get_values_for_rank(values)
         ranks = algos.rank_2d(
             values,
             axis=axis,
+            is_datetimelike=is_datetimelike,
             ties_method=method,
             ascending=ascending,
             na_option=na_option,
diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py
@@ -6,7 +6,6 @@
 import numpy as np
 import pytest
 
-from pandas._libs import iNaT
 from pandas._libs.algos import (
     Infinity,
     NegInfinity,
@@ -382,7 +381,7 @@ def test_pct_max_many_rows(self):
                 "float32",
             ),
             ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"),
-            pytest.param(
+            (
                 [
                     np.iinfo(np.int64).min,
                     -100,
@@ -394,20 +393,20 @@ def test_pct_max_many_rows(self):
                     np.iinfo(np.int64).max,
                 ],
                 "int64",
-                marks=pytest.mark.xfail(
-                    reason="iNaT is equivalent to minimum value of dtype"
-                    "int64 pending issue GH#16674"
-                ),
             ),
             ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"),
+            (
+                [datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 5)],
+                "datetime64",
+            ),
         ],
     )
     def test_rank_inf_and_nan(self, contents, dtype, frame_or_series):
         dtype_na_map = {
             "float64": np.nan,
             "float32": np.nan,
-            "int64": iNaT,
             "object": None,
+            "datetime64": np.datetime64("nat"),
         }
         # Insert nans at random positions if underlying dtype has missing
         # value. Then adjust the expected order by adding nans accordingly
diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py
@@ -1,9 +1,12 @@
+from datetime import datetime
+
 import numpy as np
 import pytest
 
 import pandas as pd
 from pandas import (
     DataFrame,
+    NaT,
     Series,
     concat,
 )
@@ -517,3 +520,25 @@ def test_rank_zero_div(input_key, input_value, output_value):
     result = df.groupby("A").rank(method="dense", pct=True)
     expected = DataFrame({"B": output_value})
     tm.assert_frame_equal(result, expected)
+
+
+def test_rank_min_int():
+    # GH-32859
+    df = DataFrame(
+        {
+            "grp": [1, 1, 2],
+            "int_col": [
+                np.iinfo(np.int64).min,
+                np.iinfo(np.int64).max,
+                np.iinfo(np.int64).min,
+            ],
+            "datetimelike": [NaT, datetime(2001, 1, 1), NaT],
+        }
+    )
+
+    result = df.groupby("grp").rank()
+    expected = DataFrame(
+        {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.NaN, 1.0, np.NaN]}
+    )
+
+    tm.assert_frame_equal(result, expected)