Skip to content

Commit d3dd67c

Browse files
committed
Merge pull request #5978 from MichaelWS/master
ENH: series rank has a percentage rank option
2 parents 5e64e88 + 7b37858 commit d3dd67c

File tree

8 files changed

+92
-12
lines changed

8 files changed

+92
-12
lines changed

doc/source/release.rst

+3
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ API Changes
6666
- ``df['col'] = value`` and ``df.loc[:,'col'] = value`` are now completely equivalent;
6767
previously the ``.loc`` would not necessarily coerce the dtype of the resultant series (:issue:`6149`)
6868

69+
6970
Experimental Features
7071
~~~~~~~~~~~~~~~~~~~~~
7172

@@ -83,6 +84,8 @@ Improvements to existing features
8384
- implement joining a single-level indexed DataFrame on a matching column of a multi-indexed DataFrame (:issue:`3662`)
8485
- Performance improvement in indexing into a multi-indexed Series (:issue:`5567`)
8586
- Testing statements updated to use specialized asserts (:issue:`6175`)
87+
- ``Series.rank()`` now has a percentage rank option (:issue:`5971`)
88+
8689

8790
.. _release.bug_fixes-0.14.0:
8891

pandas/algos.pyx

+22-7
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ cdef _take_2d_object(ndarray[object, ndim=2] values,
131131

132132

133133
def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
134-
na_option='keep'):
134+
na_option='keep', pct=False):
135135
"""
136136
Fast NaN-friendly version of scipy.stats.rankdata
137137
"""
@@ -144,6 +144,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
144144
float64_t sum_ranks = 0
145145
int tiebreak = 0
146146
bint keep_na = 0
147+
float count = 0.0
147148
tiebreak = tiebreakers[ties_method]
148149

149150
values = np.asarray(in_arr).copy()
@@ -182,6 +183,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
182183
if (val == nan_value) and keep_na:
183184
ranks[argsorted[i]] = nan
184185
continue
186+
count += 1.0
185187
if i == n - 1 or fabs(sorted_data[i + 1] - val) > FP_ERR:
186188
if tiebreak == TIEBREAK_AVERAGE:
187189
for j in range(i - dups + 1, i + 1):
@@ -199,11 +201,14 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
199201
for j in range(i - dups + 1, i + 1):
200202
ranks[argsorted[j]] = 2 * i - j - dups + 2
201203
sum_ranks = dups = 0
202-
return ranks
204+
if pct:
205+
return ranks / count
206+
else:
207+
return ranks
203208

204209

205210
def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
206-
na_option='keep'):
211+
na_option='keep', pct=False):
207212
"""
208213
Fast NaN-friendly version of scipy.stats.rankdata
209214
"""
@@ -216,6 +221,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
216221
int64_t val
217222
float64_t sum_ranks = 0
218223
int tiebreak = 0
224+
float count = 0.0
219225
tiebreak = tiebreakers[ties_method]
220226

221227
values = np.asarray(in_arr)
@@ -242,6 +248,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
242248
sum_ranks += i + 1
243249
dups += 1
244250
val = sorted_data[i]
251+
count += 1.0
245252
if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0:
246253
if tiebreak == TIEBREAK_AVERAGE:
247254
for j in range(i - dups + 1, i + 1):
@@ -259,7 +266,10 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
259266
for j in range(i - dups + 1, i + 1):
260267
ranks[argsorted[j]] = 2 * i - j - dups + 2
261268
sum_ranks = dups = 0
262-
return ranks
269+
if pct:
270+
return ranks / count
271+
else:
272+
return ranks
263273

264274

265275
def rank_2d_float64(object in_arr, axis=0, ties_method='average',
@@ -414,7 +424,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
414424

415425

416426
def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
417-
ascending=True, na_option='keep'):
427+
ascending=True, na_option='keep', pct=False):
418428
"""
419429
Fast NaN-friendly version of scipy.stats.rankdata
420430
"""
@@ -428,6 +438,8 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
428438
float64_t sum_ranks = 0
429439
int tiebreak = 0
430440
bint keep_na = 0
441+
float count = 0.0
442+
431443

432444
tiebreak = tiebreakers[ties_method]
433445

@@ -469,7 +481,6 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
469481

470482
sorted_data = values.take(_as)
471483
argsorted = _as.astype('i8')
472-
473484
for i in range(n):
474485
sum_ranks += i + 1
475486
dups += 1
@@ -479,6 +490,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
479490
continue
480491
if (i == n - 1 or
481492
are_diff(util.get_value_at(sorted_data, i + 1), val)):
493+
count += 1.0
482494
if tiebreak == TIEBREAK_AVERAGE:
483495
for j in range(i - dups + 1, i + 1):
484496
ranks[argsorted[j]] = sum_ranks / dups
@@ -491,7 +503,10 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
491503
elif tiebreak == TIEBREAK_FIRST:
492504
raise ValueError('first not supported for non-numeric data')
493505
sum_ranks = dups = 0
494-
return ranks
506+
if pct:
507+
return ranks / count
508+
else:
509+
return ranks
495510

496511
cdef inline are_diff(object left, object right):
497512
try:

pandas/core/algorithms.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -285,14 +285,14 @@ def mode(values):
285285

286286

287287
def rank(values, axis=0, method='average', na_option='keep',
288-
ascending=True):
288+
ascending=True, pct=False):
289289
"""
290290
291291
"""
292292
if values.ndim == 1:
293293
f, values = _get_data_algo(values, _rank1d_functions)
294294
ranks = f(values, ties_method=method, ascending=ascending,
295-
na_option=na_option)
295+
na_option=na_option, pct=pct)
296296
elif values.ndim == 2:
297297
f, values = _get_data_algo(values, _rank2d_functions)
298298
ranks = f(values, axis=axis, ties_method=method,

pandas/core/series.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -1707,7 +1707,8 @@ def argsort(self, axis=0, kind='quicksort', order=None):
17071707
np.argsort(values, kind=kind), index=self.index,
17081708
dtype='int64').__finalize__(self)
17091709

1710-
def rank(self, method='average', na_option='keep', ascending=True):
1710+
def rank(self, method='average', na_option='keep', ascending=True,
1711+
pct=False):
17111712
"""
17121713
Compute data ranks (1 through n). Equal values are assigned a rank that
17131714
is the average of the ranks of those values
@@ -1723,14 +1724,16 @@ def rank(self, method='average', na_option='keep', ascending=True):
17231724
keep: leave NA values where they are
17241725
ascending : boolean, default True
17251726
False for ranks by high (1) to low (N)
1726-
1727+
pct : boolean, default False
1728+
Computes percentage rank of data
1729+
17271730
Returns
17281731
-------
17291732
ranks : Series
17301733
"""
17311734
from pandas.core.algorithms import rank
17321735
ranks = rank(self.values, method=method, na_option=na_option,
1733-
ascending=ascending)
1736+
ascending=ascending, pct=pct)
17341737
return self._constructor(ranks, index=self.index).__finalize__(self)
17351738

17361739
def order(self, na_last=True, ascending=True, kind='mergesort'):

pandas/tests/test_groupby.py

+8
Original file line numberDiff line numberDiff line change
@@ -2348,7 +2348,15 @@ def test_rank_apply(self):
23482348
expected.append(piece.value.rank())
23492349
expected = concat(expected, axis=0)
23502350
expected = expected.reindex(result.index)
2351+
assert_series_equal(result, expected)
2352+
2353+
result = df.groupby(['key1', 'key2']).value.rank(pct=True)
23512354

2355+
expected = []
2356+
for key, piece in df.groupby(['key1', 'key2']):
2357+
expected.append(piece.value.rank(pct=True))
2358+
expected = concat(expected, axis=0)
2359+
expected = expected.reindex(result.index)
23522360
assert_series_equal(result, expected)
23532361

23542362
def test_dont_clobber_name_column(self):

pandas/tests/test_series.py

+42
Original file line numberDiff line numberDiff line change
@@ -3955,6 +3955,48 @@ def test_rank(self):
39553955
iranks = iseries.rank()
39563956
exp = iseries.astype(float).rank()
39573957
assert_series_equal(iranks, exp)
3958+
iseries = Series(np.arange(5)) + 1.0
3959+
exp = iseries / 5.0
3960+
iranks = iseries.rank(pct=True)
3961+
3962+
assert_series_equal(iranks, exp)
3963+
3964+
iseries = Series(np.repeat(1, 100))
3965+
exp = Series(np.repeat(0.505, 100))
3966+
iranks = iseries.rank(pct=True)
3967+
assert_series_equal(iranks, exp)
3968+
3969+
iseries[1] = np.nan
3970+
exp = Series(np.repeat(50.0 / 99.0, 100))
3971+
exp[1] = np.nan
3972+
iranks = iseries.rank(pct=True)
3973+
assert_series_equal(iranks, exp)
3974+
3975+
iseries = Series(np.arange(5)) + 1.0
3976+
iseries[4] = np.nan
3977+
exp = iseries / 4.0
3978+
iranks = iseries.rank(pct=True)
3979+
assert_series_equal(iranks, exp)
3980+
3981+
iseries = Series(np.repeat(np.nan, 100))
3982+
exp = iseries.copy()
3983+
iranks = iseries.rank(pct=True)
3984+
assert_series_equal(iranks, exp)
3985+
3986+
iseries = Series(np.arange(5)) + 1
3987+
iseries[4] = np.nan
3988+
exp = iseries / 4.0
3989+
iranks = iseries.rank(pct=True)
3990+
assert_series_equal(iranks, exp)
3991+
rng = date_range('1/1/1990', periods=5)
3992+
3993+
iseries = Series(np.arange(5), rng) + 1
3994+
iseries.ix[4] = np.nan
3995+
exp = iseries / 4.0
3996+
iranks = iseries.rank(pct=True)
3997+
assert_series_equal(iranks, exp)
3998+
3999+
39584000

39594001
def test_from_csv(self):
39604002

vb_suite/groupby.py

+5
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ def f():
4747
Benchmark('simple_series.groupby(key1).sum()', setup,
4848
start_date=datetime(2011, 3, 1))
4949

50+
51+
stmt4 = "df.groupby('key1').rank(pct=True)"
52+
groupby_series_simple_cython = Benchmark(stmt4, setup,
53+
start_date=datetime(2014, 1, 16))
54+
5055
#----------------------------------------------------------------------
5156
# 2d grouping, aggregate many columns
5257

vb_suite/stat_ops.py

+4
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,10 @@
8585
stats_rank_average = Benchmark('s.rank()', setup,
8686
start_date=datetime(2011, 12, 12))
8787

88+
stats_rank_pct_average = Benchmark('s.rank(pct=True)', setup,
89+
start_date=datetime(2014, 1, 16))
90+
stats_rank_pct_average_old = Benchmark('s.rank() / s.size', setup,
91+
start_date=datetime(2014, 1, 16))
8892
setup = common_setup + """
8993
values = np.random.randint(0, 100000, size=200000)
9094
s = Series(values)

0 commit comments

Comments
 (0)