BUG: DataFrame.rank with np.inf and np.nan (#38681)

mzeitlin11 · web-flow · commit 387d485c963c · 2020-12-30T16:11:27.000-05:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -212,6 +212,7 @@ Numeric
 - Bug in :meth:`DataFrame.quantile`, :meth:`DataFrame.sort_values` causing incorrect subsequent indexing behavior (:issue:`38351`)
 - Bug in :meth:`DataFrame.select_dtypes` with ``include=np.number`` now retains numeric ``ExtensionDtype`` columns (:issue:`35340`)
 - Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`)
+- Bug in :meth:`DataFrame.rank` returning incorrect ranks for data containing ``np.inf`` or a mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`)
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -1022,16 +1022,19 @@ def rank_2d(
         ndarray[float64_t, ndim=2] ranks
         ndarray[rank_t, ndim=2] values
         ndarray[int64_t, ndim=2] argsorted
+        ndarray[uint8_t, ndim=2] mask
         rank_t val, nan_value
         float64_t sum_ranks = 0
         int tiebreak = 0
+        int64_t idx
         bint keep_na = False
         float64_t count = 0.0
-        bint condition, skip_condition
+        bint condition, check_mask
 
     tiebreak = tiebreakers[ties_method]
 
     keep_na = na_option == 'keep'
+    check_mask = rank_t is not uint64_t
 
     if axis == 0:
         values = np.asarray(in_arr).T.copy()
@@ -1067,6 +1070,8 @@ def rank_2d(
             mask = values == NPY_NAT
 
         np.putmask(values, mask, nan_value)
+    else:
+        mask = np.zeros_like(values, dtype=bool)
 
     n, k = (<object>values).shape
     ranks = np.empty((n, k), dtype='f8')
@@ -1099,43 +1104,35 @@ def rank_2d(
     argsorted = _as.astype('i8')
 
     for i in range(n):
-        if rank_t is object:
-            dups = sum_ranks = infs = 0
-        else:
-            dups = sum_ranks = 0
+        dups = sum_ranks = infs = 0
 
         total_tie_count = 0
         count = 0.0
         for j in range(k):
-            if rank_t is not object:
-                sum_ranks += j + 1
-                dups += 1
-
             val = values[i, j]
-
-            if rank_t is not uint64_t:
-                if rank_t is object:
-                    skip_condition = (val is nan_value) and keep_na
-                else:
-                    skip_condition = (val == nan_value) and keep_na
-                if skip_condition:
-                    ranks[i, argsorted[i, j]] = NaN
-
-                    if rank_t is object:
-                        infs += 1
-
-                    continue
+            idx = argsorted[i, j]
+            if keep_na and check_mask and mask[i, idx]:
+                ranks[i, idx] = NaN
+                infs += 1
+                continue
 
             count += 1.0
 
-            if rank_t is object:
-                sum_ranks += (j - infs) + 1
-                dups += 1
+            sum_ranks += (j - infs) + 1
+            dups += 1
 
             if rank_t is object:
-                condition = j == k - 1 or are_diff(values[i, j + 1], val)
+                condition = (
+                    j == k - 1 or
+                    are_diff(values[i, j + 1], val) or
+                    (keep_na and check_mask and mask[i, argsorted[i, j + 1]])
+                )
             else:
-                condition = j == k - 1 or values[i, j + 1] != val
+                condition = (
+                    j == k - 1 or
+                    values[i, j + 1] != val or
+                    (keep_na and check_mask and mask[i, argsorted[i, j + 1]])
+                )
 
             if condition:
                 if tiebreak == TIEBREAK_AVERAGE:
diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._libs import iNaT
+from pandas._libs.algos import Infinity, NegInfinity
 import pandas.util._test_decorators as td
 
 from pandas import DataFrame, Series
@@ -329,3 +331,116 @@ def test_pct_max_many_rows(self):
         )
         result = df.rank(pct=True).max()
         assert (result == 1).all()
+
+    @pytest.mark.parametrize(
+        "contents,dtype",
+        [
+            (
+                [
+                    -np.inf,
+                    -50,
+                    -1,
+                    -1e-20,
+                    -1e-25,
+                    -1e-50,
+                    0,
+                    1e-40,
+                    1e-20,
+                    1e-10,
+                    2,
+                    40,
+                    np.inf,
+                ],
+                "float64",
+            ),
+            (
+                [
+                    -np.inf,
+                    -50,
+                    -1,
+                    -1e-20,
+                    -1e-25,
+                    -1e-45,
+                    0,
+                    1e-40,
+                    1e-20,
+                    1e-10,
+                    2,
+                    40,
+                    np.inf,
+                ],
+                "float32",
+            ),
+            ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"),
+            pytest.param(
+                [
+                    np.iinfo(np.int64).min,
+                    -100,
+                    0,
+                    1,
+                    9999,
+                    100000,
+                    1e10,
+                    np.iinfo(np.int64).max,
+                ],
+                "int64",
+                marks=pytest.mark.xfail(
+                    reason="iNaT is equivalent to minimum value of dtype"
+                    "int64 pending issue GH#16674"
+                ),
+            ),
+            ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"),
+        ],
+    )
+    def test_rank_inf_and_nan(self, contents, dtype):
+        dtype_na_map = {
+            "float64": np.nan,
+            "float32": np.nan,
+            "int64": iNaT,
+            "object": None,
+        }
+        # Insert nans at random positions if the underlying dtype has a
+        # missing value, then adjust the expected order by adding nans
+        # accordingly. This tests whether the rank calculation is affected
+        # when values are intertwined with nan values.
+        values = np.array(contents, dtype=dtype)
+        exp_order = np.array(range(len(values)), dtype="float64") + 1.0
+        if dtype in dtype_na_map:
+            na_value = dtype_na_map[dtype]
+            nan_indices = np.random.choice(range(len(values)), 5)
+            values = np.insert(values, nan_indices, na_value)
+            exp_order = np.insert(exp_order, nan_indices, np.nan)
+        # shuffle the testing array and expected results in the same way
+        random_order = np.random.permutation(len(values))
+        df = DataFrame({"a": values[random_order]})
+        expected = DataFrame({"a": exp_order[random_order]}, dtype="float64")
+        result = df.rank()
+        tm.assert_frame_equal(result, expected)
+
+    def test_df_series_inf_nan_consistency(self):
+        # GH#32593
+        index = [5, 4, 3, 2, 1, 6, 7, 8, 9, 10]
+        col1 = [5, 4, 3, 5, 8, 5, 2, 1, 6, 6]
+        col2 = [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf]
+        df = DataFrame(
+            data={
+                "col1": col1,
+                "col2": col2,
+            },
+            index=index,
+            dtype="f8",
+        )
+        df_result = df.rank()
+
+        series_result = df.copy()
+        series_result["col1"] = df["col1"].rank()
+        series_result["col2"] = df["col2"].rank()
+
+        tm.assert_frame_equal(df_result, series_result)
+
+    def test_rank_both_inf(self):
+        # GH#32593
+        df = DataFrame({"a": [-np.inf, 0, np.inf]})
+        expected = DataFrame({"a": [1.0, 2.0, 3.0]})
+        result = df.rank()
+        tm.assert_frame_equal(result, expected)