BUG: fix issues on rank calculations when +/- inf values are present with NaNs (pandas-dev#6945)

peterpanmj · peterpanmj · commit c3c92af555ec · 2017-12-03T21:35:50.000+08:00
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -76,6 +76,67 @@ Other Enhancements
 - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`)
 - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`)
 - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`)
+- :func:`Series.rank` and :func:`DataFrame.rank` now can handle ``inf`` values properly when ``NaN`` are present (:issue:`6945`)
+
+handle ``inf`` values properly when ``NaN`` are present
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+In previous versions, ``inf`` elements were assigned ``NaN`` as their ranks. Now ranks are calculated properly.
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+    In [6]: pd.Series([-np.inf, 0, 1, np.nan, np.inf]).rank()
+    Out[6]:
+    0    1.0
+    1    2.0
+    2    3.0
+    3    NaN
+    4    NaN
+    dtype: float64
+
+Current Behavior
+
+.. ipython:: python
+
+    In [2]: import numpy as np
+
+    In [3]: pd.Series([-np.inf, 0, 1, np.nan, np.inf]).rank()
+    Out[3]:
+    0    1.0
+    1    2.0
+    2    3.0
+    3    NaN
+    4    4.0
+    dtype: float64
+
+Furthermore, previously if you rank ``inf`` or ``-inf`` values together with ``NaN`` values, the calculation won't distinguish ``NaN`` from infinity when using 'top' or 'bottom' argument.
+
+Previously Behavior:
+
+.. code-block:: ipython
+
+    In [15]: pd.Series([np.nan, np.nan, -np.inf, -np.inf]).rank(na_option='top')
+    Out[15]:
+    0    2.5
+    1    2.5
+    2    2.5
+    3    2.5
+    dtype: float64
+
+Current Behavior
+
+.. ipython:: python
+
+    In [4]:  pd.Series([np.nan, np.nan, -np.inf, -np.inf]).rank(na_option='top')
+    Out[4]:
+    0    1.5
+    1    1.5
+    2    3.5
+    3    3.5
+    dtype: float64
+
 
 .. _whatsnew_0220.api_breaking:
 
@@ -227,7 +288,7 @@ Reshaping
 Numeric
 ^^^^^^^
 
--
+- Bug in :func:`Series.rank` and :func:`DataFrame.rank` could not properly rank infinity values when ``NaN`` values are present (:issue:`6945`)
 -
 -
 
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -64,22 +64,24 @@ class Infinity(object):
     """ provide a positive Infinity comparision method for ranking """
 
     __lt__ = lambda self, other: False
-    __le__ = lambda self, other: self is other
-    __eq__ = lambda self, other: self is other
-    __ne__ = lambda self, other: self is not other
-    __gt__ = lambda self, other: self is not other
-    __ge__ = lambda self, other: True
+    __le__ = lambda self, other: isinstance(other, Infinity)
+    __eq__ = lambda self, other: isinstance(other, Infinity)
+    __ne__ = lambda self, other: not isinstance(other, Infinity)
+    __gt__ = lambda self, other: (not isinstance(other, Infinity) and
+                                  not missing.checknull(other))
+    __ge__ = lambda self, other: not missing.checknull(other)
 
 
 class NegInfinity(object):
     """ provide a negative Infinity comparision method for ranking """
 
-    __lt__ = lambda self, other: self is not other
-    __le__ = lambda self, other: True
-    __eq__ = lambda self, other: self is other
-    __ne__ = lambda self, other: self is not other
+    __lt__ = lambda self, other: (not isinstance(other, NegInfinity) and
+                                  not missing.checknull(other))
+    __le__ = lambda self, other: not missing.checknull(other)
+    __eq__ = lambda self, other: isinstance(other, NegInfinity)
+    __ne__ = lambda self, other: not isinstance(other, NegInfinity)
     __gt__ = lambda self, other: False
-    __ge__ = lambda self, other: self is other
+    __ge__ = lambda self, other: isinstance(other, NegInfinity)
 
 
 @cython.wraparound(False)
diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in
@@ -27,7 +27,7 @@ dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'),
 {{if dtype == 'object'}}
 
 
-def rank_1d_{{dtype}}(object in_arr, bint retry=1, ties_method='average',
+def rank_1d_{{dtype}}(object in_arr, ties_method='average',
                       ascending=True, na_option='keep', pct=False):
 {{else}}
 
@@ -40,7 +40,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
     """
 
     cdef:
-        Py_ssize_t i, j, n, dups = 0, total_tie_count = 0
+        Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0
 
         {{if dtype == 'object'}}
         ndarray sorted_data, values
@@ -50,6 +50,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
 
         ndarray[float64_t] ranks
         ndarray[int64_t] argsorted
+        ndarray[np.uint8_t, cast=True] sorted_mask
 
         {{if dtype == 'uint64'}}
         {{ctype}} val
@@ -60,6 +61,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
         float64_t sum_ranks = 0
         int tiebreak = 0
         bint keep_na = 0
+        bint isnan
         float count = 0.0
     tiebreak = tiebreakers[ties_method]
 
@@ -76,12 +78,6 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
 
     keep_na = na_option == 'keep'
 
-    {{if dtype != 'uint64'}}
-    if ascending ^ (na_option == 'top'):
-        nan_value = {{pos_nan_value}}
-    else:
-        nan_value = {{neg_nan_value}}
-
     {{if dtype == 'object'}}
     mask = missing.isnaobj(values)
     {{elif dtype == 'float64'}}
@@ -90,56 +86,69 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
     mask = values == iNaT
     {{endif}}
 
+    # double sort first by mask and then by values to ensure nan values are
+    # either at the beginning or the end. mask/(~mask) controls padding at
+    # tail or the head
+    {{if dtype != 'uint64'}}
+    if ascending ^ (na_option == 'top'):
+        nan_value = {{pos_nan_value}}
+        order = (values, mask)
+    else:
+        nan_value = {{neg_nan_value}}
+        order = (values, ~mask)
     np.putmask(values, mask, nan_value)
+    {{else}}
+    mask = np.zeros(shape=len(values), dtype=bool)
+    order = (values, mask)
     {{endif}}
 
     n = len(values)
     ranks = np.empty(n, dtype='f8')
 
     {{if dtype == 'object'}}
+
     try:
-        _as = values.argsort()
+        _as = np.lexsort(keys=order)
     except TypeError:
-        if not retry:
-            raise
-
-        valid_locs = (~mask).nonzero()[0]
-        ranks.put(valid_locs, rank_1d_object(values.take(valid_locs), 0,
-                                             ties_method=ties_method,
-                                             ascending=ascending))
-        np.putmask(ranks, mask, np.nan)
-        return ranks
+        # lexsort on object array will raise TypeError for numpy version
+        # earlier than 1.11.0. Use argsort with order argument instead.
+        _dt = [('values', 'O'), ('mask', '?')]
+        _values = np.asarray(list(zip(order[0], order[1])), dtype=_dt)
+        _as = np.argsort(_values, kind='mergesort', order=('mask', 'values'))
     {{else}}
     if tiebreak == TIEBREAK_FIRST:
         # need to use a stable sort here
-        _as = values.argsort(kind='mergesort')
+        _as = np.lexsort(keys=order)
         if not ascending:
             tiebreak = TIEBREAK_FIRST_DESCENDING
     else:
-        _as = values.argsort()
+        _as = np.lexsort(keys=order)
     {{endif}}
 
     if not ascending:
         _as = _as[::-1]
 
     sorted_data = values.take(_as)
+    sorted_mask = mask.take(_as)
+    _indices = order[1].take(_as).nonzero()[0]
+    non_na_idx = _indices[0] if len(_indices) > 0 else -1
     argsorted = _as.astype('i8')
 
     {{if dtype == 'object'}}
     for i in range(n):
         sum_ranks += i + 1
         dups += 1
-
+        isnan = sorted_mask[i]
         val = util.get_value_at(sorted_data, i)
 
-        if (val is nan_value) and keep_na:
+        if isnan and keep_na:
             ranks[argsorted[i]] = nan
             continue
-
         count += 1.0
 
         if (i == n - 1 or
-            are_diff(util.get_value_at(sorted_data, i + 1), val)):
+                are_diff(util.get_value_at(sorted_data, i + 1), val) or
+                i == non_na_idx - 1):
             if tiebreak == TIEBREAK_AVERAGE:
                 for j in range(i - dups + 1, i + 1):
                     ranks[argsorted[j]] = sum_ranks / dups
@@ -164,18 +173,19 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
         for i in range(n):
             sum_ranks += i + 1
             dups += 1
-
             val = sorted_data[i]
 
             {{if dtype != 'uint64'}}
-            if (val == nan_value) and keep_na:
+            isnan = sorted_mask[i]
+            if isnan and keep_na:
                 ranks[argsorted[i]] = nan
                 continue
             {{endif}}
 
             count += 1.0
 
-            if i == n - 1 or sorted_data[i + 1] != val:
+            if (i == n - 1 or sorted_data[i + 1] != val or
+                i == non_na_idx - 1):
                 if tiebreak == TIEBREAK_AVERAGE:
                     for j in range(i - dups + 1, i + 1):
                         ranks[argsorted[j]] = sum_ranks / dups
diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py
@@ -14,6 +14,8 @@
 from pandas.util.testing import assert_series_equal
 import pandas.util.testing as tm
 from pandas.tests.series.common import TestData
+from pandas._libs.tslib import iNaT
+from pandas._libs.algos import Infinity, NegInfinity
 
 
 class TestSeriesRank(TestData):
@@ -195,16 +197,48 @@ def test_rank_signature(self):
         s.rank(method='average')
         pytest.raises(ValueError, s.rank, 'average')
 
-    def test_rank_inf(self):
-        pytest.skip('DataFrame.rank does not currently rank '
-                    'np.inf and -np.inf properly')
-
-        values = np.array(
-            [-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10,
-             2, 40, np.inf], dtype='float64')
+    @pytest.mark.parametrize('contents,dtype', [
+        ([-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10,
+          2, 40, np.inf],
+         'float64'),
+        ([-np.inf, -50, -1, -1e-20, -1e-25, -1e-45, 0, 1e-40, 1e-20, 1e-10,
+          2, 40, np.inf],
+         'float32'),
+        ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max],
+         'uint8'),
+        pytest.param([np.iinfo(np.int64).min, -100, 0, 1, 9999, 100000,
+                      1e10, np.iinfo(np.int64).max],
+                     'int64',
+                     marks=pytest.mark.xfail(reason='''iNaT is equivalent to
+                                                      minimum value of dtype
+                                                      int64 pending issue
+                                                      #16674'''),
+                     ),
+        ([NegInfinity(), '1', 'A', 'BA', 'Ba', 'C', Infinity()],
+         'object')
+    ])
+    def test_rank_inf(self, contents, dtype):
+        dtype_na_map = {
+            'float64': np.nan,
+            'float32': np.nan,
+            'int64': iNaT,
+            'object': None
+        }
+        # Insert nans at random positions if underlying dtype has missing
+        # value. Then adjust the expected order by adding nans accordingly
+        # This is for testing whether rank calculation is affected
+        # when values are interwined with nan values.
+        values = np.array(contents, dtype=dtype)
+        exp_order = np.array(range(len(values)), dtype='float64') + 1.0
+        if dtype in dtype_na_map:
+            na_value = dtype_na_map[dtype]
+            nan_indices = np.random.choice(range(len(values)), 5)
+            values = np.insert(values, nan_indices, na_value)
+            exp_order = np.insert(exp_order, nan_indices, np.nan)
+        # shuffle the testing array and expected results in the same way
         random_order = np.random.permutation(len(values))
         iseries = Series(values[random_order])
-        exp = Series(random_order + 1.0, dtype='float64')
+        exp = Series(exp_order[random_order], dtype='float64')
         iranks = iseries.rank()
         assert_series_equal(iranks, exp)
 
@@ -225,6 +259,39 @@ def _check(s, expected, method='average'):
             series = s if dtype is None else s.astype(dtype)
             _check(series, results[method], method=method)
 
+    def test_rank_tie_methods_on_infs_nans(self):
+        dtypes = [('object', None, Infinity(), NegInfinity()),
+                  ('float64', np.nan, np.inf, -np.inf)]
+        chunk = 3
+        disabled = set([('object', 'first')])
+
+        def _check(s, expected, method='average', na_option='keep'):
+            result = s.rank(method=method, na_option=na_option)
+            tm.assert_series_equal(result, Series(expected, dtype='float64'))
+
+        exp_ranks = {
+            'average': ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
+            'min': ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
+            'max': ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
+            'first': ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
+            'dense': ([1, 1, 1], [2, 2, 2], [3, 3, 3])
+        }
+        na_options = ('top', 'bottom', 'keep')
+        for dtype, na_value, pos_inf, neg_inf in dtypes:
+            in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk
+            iseries = Series(in_arr, dtype=dtype)
+            for method, na_opt in product(exp_ranks.keys(), na_options):
+                ranks = exp_ranks[method]
+                if (dtype, method) in disabled:
+                    continue
+                if na_opt == 'top':
+                    order = ranks[1] + ranks[0] + ranks[2]
+                elif na_opt == 'bottom':
+                    order = ranks[0] + ranks[2] + ranks[1]
+                else:
+                    order = ranks[0] + [np.nan] * chunk + ranks[1]
+                _check(iseries, order, method, na_opt)
+
     def test_rank_methods_series(self):
         pytest.importorskip('scipy.stats.special')
         rankdata = pytest.importorskip('scipy.stats.rankdata')
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py