ENH: rank na_options top and bottom #1508

changhiskhan · wesm · commit 72f0758fb860 · 2012-11-02T12:48:25.000-04:00
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -191,11 +191,12 @@ def rank(values, axis=0, method='average', na_option='keep',
     """
     if values.ndim == 1:
         f, values = _get_data_algo(values, _rank1d_functions)
-        ranks = f(values, ties_method=method, ascending=ascending)
+        ranks = f(values, ties_method=method, ascending=ascending,
+                  na_option=na_option)
     elif values.ndim == 2:
         f, values = _get_data_algo(values, _rank2d_functions)
         ranks = f(values, axis=axis, ties_method=method,
-                  ascending=ascending)
+                  ascending=ascending, na_option=na_option)
     return ranks
 
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4704,8 +4704,10 @@ def rank(self, axis=0, numeric_only=None, method='average',
             min: lowest rank in group
             max: highest rank in group
             first: ranks assigned in order they appear in the array
-        na_option : {'keep'}
+        na_option : {'keep', 'top', 'bottom'}
             keep: leave NA values where they are
+            top: NA values get the smallest ranks (ascending or descending)
+            bottom: NA values get the largest ranks (ascending or descending)
         ascending : boolean, default True
             False for ranks by high (1) to low (N)
 
@@ -4716,7 +4718,7 @@ def rank(self, axis=0, numeric_only=None, method='average',
         if numeric_only is None:
             try:
                 ranks = algos.rank(self.values, axis=axis, method=method,
-                                   ascending=ascending)
+                                   ascending=ascending, na_option=na_option)
                 return DataFrame(ranks, index=self.index, columns=self.columns)
             except TypeError:
                 numeric_only = True
@@ -4726,7 +4728,7 @@ def rank(self, axis=0, numeric_only=None, method='average',
         else:
             data = self
         ranks = algos.rank(data.values, axis=axis, method=method,
-                           ascending=ascending)
+                           ascending=ascending, na_option=na_option)
         return DataFrame(ranks, index=data.index, columns=data.columns)
 
     def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
diff --git a/pandas/src/stats.pyx b/pandas/src/stats.pyx
@@ -70,7 +70,8 @@ cdef _take_2d_object(ndarray[object, ndim=2] values,
     return result
 
 
-def rank_1d_float64(object in_arr, ties_method='average', ascending=True):
+def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
+                    na_option='keep'):
     """
     Fast NaN-friendly version of scipy.stats.rankdata
     """
@@ -86,7 +87,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True):
 
     values = np.asarray(in_arr).copy()
 
-    if ascending:
+    if ascending ^ (na_option == 'top'):
         nan_value = np.inf
     else:
         nan_value = -np.inf
@@ -115,7 +116,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True):
         sum_ranks += i + 1
         dups += 1
         val = sorted_data[i]
-        if val == nan_value:
+        if (val == nan_value) and (na_option == 'keep'):
             ranks[argsorted[i]] = nan
             continue
         if i == n - 1 or fabs(sorted_data[i + 1] - val) > FP_ERR:
@@ -138,7 +139,8 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True):
     return ranks
 
 
-def rank_1d_int64(object in_arr, ties_method='average', ascending=True):
+def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
+                  na_option='keep'):
     """
     Fast NaN-friendly version of scipy.stats.rankdata
     """
@@ -198,7 +200,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True):
 
 
 def rank_2d_float64(object in_arr, axis=0, ties_method='average',
-                    ascending=True):
+                    ascending=True, na_option='keep'):
     """
     Fast NaN-friendly version of scipy.stats.rankdata
     """
@@ -219,7 +221,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
     else:
         values = in_arr.copy()
 
-    if ascending:
+    if ascending ^ (na_option == 'top'):
         nan_value = np.inf
     else:
         nan_value = -np.inf
@@ -249,7 +251,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
             sum_ranks += j + 1
             dups += 1
             val = values[i, j]
-            if val == nan_value:
+            if val == nan_value and na_option == 'keep':
                 ranks[i, argsorted[i, j]] = nan
                 continue
             if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
@@ -277,7 +279,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
 
 
 def rank_2d_int64(object in_arr, axis=0, ties_method='average',
-                    ascending=True):
+                    ascending=True, na_option='keep'):
     """
     Fast NaN-friendly version of scipy.stats.rankdata
     """
@@ -345,7 +347,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
 
 
 def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
-                    ascending=True):
+                    ascending=True, na_option='keep'):
     """
     Fast NaN-friendly version of scipy.stats.rankdata
     """
@@ -365,7 +367,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
     if values.dtype != np.object_:
         values = values.astype('O')
 
-    if ascending:
+    if ascending ^ (na_option == 'top'):
         # always greater than everything
         nan_value = Infinity()
     else:
@@ -401,7 +403,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
         sum_ranks += i + 1
         dups += 1
         val = util.get_value_at(sorted_data, i)
-        if val is nan_value:
+        if val is nan_value and na_option=='keep':
             ranks[argsorted[i]] = nan
             continue
         if (i == n - 1 or
@@ -450,7 +452,7 @@ class NegInfinity(object):
     __cmp__ = _return_true
 
 def rank_2d_generic(object in_arr, axis=0, ties_method='average',
-                    ascending=True):
+                    ascending=True, na_option='keep'):
     """
     Fast NaN-friendly version of scipy.stats.rankdata
     """
@@ -475,7 +477,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
     if values.dtype != np.object_:
         values = values.astype('O')
 
-    if ascending:
+    if ascending ^ (na_option == 'top'):
         # always greater than everything
         nan_value = Infinity()
     else:
@@ -510,7 +512,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
         dups = sum_ranks = infs = 0
         for j in range(k):
             val = values[i, j]
-            if val is nan_value:
+            if val is nan_value and na_option == 'keep':
                 ranks[i, argsorted[i, j]] = nan
                 infs += 1
                 continue
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -6459,6 +6459,73 @@ def test_rank2(self):
         expected = self.mixed_frame.rank(1, numeric_only=True)
         assert_frame_equal(result, expected)
 
+    def test_rank_na_option(self):
+        from pandas.compat.scipy import rankdata
+        # Expected ranks: rankdata applied along each axis of NaN-filled values.
+        self.frame['A'][::2] = np.nan
+        self.frame['B'][::3] = np.nan
+        self.frame['C'][::4] = np.nan
+        self.frame['D'][::5] = np.nan
+
+        # ascending + 'bottom': NaNs rank last; reference fills them with +inf
+        ranks0 = self.frame.rank(na_option='bottom')
+        ranks1 = self.frame.rank(1, na_option='bottom')
+
+        fvals = self.frame.fillna(np.inf).values
+
+        exp0 = np.apply_along_axis(rankdata, 0, fvals)
+        exp1 = np.apply_along_axis(rankdata, 1, fvals)
+
+        assert_almost_equal(ranks0.values, exp0)
+        assert_almost_equal(ranks1.values, exp1)
+
+        # ascending + 'top': NaNs rank first; reference fills them with min - 1
+        ranks0 = self.frame.rank(na_option='top')
+        ranks1 = self.frame.rank(1, na_option='top')
+        # NOTE(review): the inf fill presumably covers rows that are all NaN
+        fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
+        fval1 = self.frame.T
+        fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
+        fval1 = fval1.fillna(np.inf).values
+
+        exp0 = np.apply_along_axis(rankdata, 0, fval0)
+        exp1 = np.apply_along_axis(rankdata, 1, fval1)
+
+        assert_almost_equal(ranks0.values, exp0)
+        assert_almost_equal(ranks1.values, exp1)
+
+        # descending + 'top': NaNs still rank first.
+        # The reference reuses the +inf fill and negates the values, so the
+        # filled NaNs become -inf and rankdata places them lowest.
+        ranks0 = self.frame.rank(na_option='top', ascending=False)
+        ranks1 = self.frame.rank(1, na_option='top', ascending=False)
+
+        fvals = self.frame.fillna(np.inf).values
+
+        exp0 = np.apply_along_axis(rankdata, 0, -fvals)
+        exp1 = np.apply_along_axis(rankdata, 1, -fvals)
+
+        assert_almost_equal(ranks0.values, exp0)
+        assert_almost_equal(ranks1.values, exp1)
+
+        # descending + 'bottom': NaNs still rank last.
+        # The reference reuses the min - 1 fill and negates the values, so
+        # the filled NaNs become the largest entries and rank highest.
+        ranks0 = self.frame.rank(na_option='bottom', ascending=False)
+        ranks1 = self.frame.rank(1, na_option='bottom', ascending=False)
+
+        fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
+        fval1 = self.frame.T
+        fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
+        fval1 = fval1.fillna(np.inf).values
+
+        exp0 = np.apply_along_axis(rankdata, 0, -fval0)
+        exp1 = np.apply_along_axis(rankdata, 1, -fval1)
+
+        assert_almost_equal(ranks0.values, exp0)
+        assert_almost_equal(ranks1.values, exp1)
+
+
     def test_describe(self):
         desc = self.tsframe.describe()
         desc = self.mixed_frame.describe()