From 7b3785801221f5e25648f634f5ca171bc861d852 Mon Sep 17 00:00:00 2001 From: michaelws Date: Sun, 26 Jan 2014 10:35:03 -0500 Subject: [PATCH] checkin of percentage rank --- doc/source/release.rst | 3 +++ pandas/algos.pyx | 29 +++++++++++++++++++------ pandas/core/algorithms.py | 4 ++-- pandas/core/series.py | 9 +++++--- pandas/tests/test_groupby.py | 8 +++++++ pandas/tests/test_series.py | 42 ++++++++++++++++++++++++++++++++++++ vb_suite/groupby.py | 5 +++++ vb_suite/stat_ops.py | 4 ++++ 8 files changed, 92 insertions(+), 12 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 341450410f1e5..9988ffc2f1bdd 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -66,6 +66,7 @@ API Changes - ``df['col'] = value`` and ``df.loc[:,'col'] = value`` are now completely equivalent; previously the ``.loc`` would not necessarily coerce the dtype of the resultant series (:issue:`6149`) + Experimental Features ~~~~~~~~~~~~~~~~~~~~~ @@ -83,6 +84,8 @@ Improvements to existing features - implement joining a single-level indexed DataFrame on a matching column of a multi-indexed DataFrame (:issue:`3662`) - Performance improvement in indexing into a multi-indexed Series (:issue:`5567`) - Testing statements updated to use specialized asserts (:issue: `6175`) +- ``Series.rank()`` now has a percentage rank option (:issue: `5971`) + .. 
_release.bug_fixes-0.14.0: diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 0be238117fe4e..7f406611c82f7 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -131,7 +131,7 @@ cdef _take_2d_object(ndarray[object, ndim=2] values, def rank_1d_float64(object in_arr, ties_method='average', ascending=True, - na_option='keep'): + na_option='keep', pct=False): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -144,6 +144,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True, float64_t sum_ranks = 0 int tiebreak = 0 bint keep_na = 0 + float count = 0.0 tiebreak = tiebreakers[ties_method] values = np.asarray(in_arr).copy() @@ -182,6 +183,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True, if (val == nan_value) and keep_na: ranks[argsorted[i]] = nan continue + count += 1.0 if i == n - 1 or fabs(sorted_data[i + 1] - val) > FP_ERR: if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): @@ -199,11 +201,14 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True, for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 sum_ranks = dups = 0 - return ranks + if pct: + return ranks / count + else: + return ranks def rank_1d_int64(object in_arr, ties_method='average', ascending=True, - na_option='keep'): + na_option='keep', pct=False): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -216,6 +221,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True, int64_t val float64_t sum_ranks = 0 int tiebreak = 0 + float count = 0.0 tiebreak = tiebreakers[ties_method] values = np.asarray(in_arr) @@ -242,6 +248,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True, sum_ranks += i + 1 dups += 1 val = sorted_data[i] + count += 1.0 if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0: if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): @@ -259,7 +266,10 @@ def rank_1d_int64(object in_arr, 
ties_method='average', ascending=True, for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 sum_ranks = dups = 0 - return ranks + if pct: + return ranks / count + else: + return ranks def rank_2d_float64(object in_arr, axis=0, ties_method='average', @@ -414,7 +424,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average', def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', - ascending=True, na_option='keep'): + ascending=True, na_option='keep', pct=False): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -428,6 +438,8 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', float64_t sum_ranks = 0 int tiebreak = 0 bint keep_na = 0 + float count = 0.0 + tiebreak = tiebreakers[ties_method] @@ -469,7 +481,6 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', sorted_data = values.take(_as) argsorted = _as.astype('i8') - for i in range(n): sum_ranks += i + 1 dups += 1 @@ -479,6 +490,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', continue + count += 1.0 if (i == n - 1 or are_diff(util.get_value_at(sorted_data, i + 1), val)): if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = sum_ranks / dups @@ -491,7 +503,10 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', elif tiebreak == TIEBREAK_FIRST: raise ValueError('first not supported for non-numeric data') sum_ranks = dups = 0 - return ranks + if pct: + return ranks / count + else: + return ranks cdef inline are_diff(object left, object right): try: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 44cd2d8906a5b..9c972c9795c47 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -285,14 +285,14 @@ def mode(values): def rank(values, axis=0, method='average', na_option='keep', - ascending=True): + ascending=True, pct=False): """ """ if values.ndim == 1: f, values = _get_data_algo(values, 
_rank1d_functions) ranks = f(values, ties_method=method, ascending=ascending, - na_option=na_option) + na_option=na_option, pct=pct) elif values.ndim == 2: f, values = _get_data_algo(values, _rank2d_functions) ranks = f(values, axis=axis, ties_method=method, diff --git a/pandas/core/series.py b/pandas/core/series.py index 2ede2dfc130da..35acfffe5b598 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1707,7 +1707,8 @@ def argsort(self, axis=0, kind='quicksort', order=None): np.argsort(values, kind=kind), index=self.index, dtype='int64').__finalize__(self) - def rank(self, method='average', na_option='keep', ascending=True): + def rank(self, method='average', na_option='keep', ascending=True, + pct=False): """ Compute data ranks (1 through n). Equal values are assigned a rank that is the average of the ranks of those values @@ -1723,14 +1724,16 @@ def rank(self, method='average', na_option='keep', ascending=True): keep: leave NA values where they are ascending : boolean, default True False for ranks by high (1) to low (N) - + pct : boolean, default False + Computes percentage rank of data + Returns ------- ranks : Series """ from pandas.core.algorithms import rank ranks = rank(self.values, method=method, na_option=na_option, - ascending=ascending) + ascending=ascending, pct=pct) return self._constructor(ranks, index=self.index).__finalize__(self) def order(self, na_last=True, ascending=True, kind='mergesort'): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index fa7a2b2d24636..2c8b60ea25a6e 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2348,7 +2348,15 @@ def test_rank_apply(self): expected.append(piece.value.rank()) expected = concat(expected, axis=0) expected = expected.reindex(result.index) + assert_series_equal(result, expected) + + result = df.groupby(['key1', 'key2']).value.rank(pct=True) + expected = [] + for key, piece in df.groupby(['key1', 'key2']): + 
expected.append(piece.value.rank(pct=True)) + expected = concat(expected, axis=0) + expected = expected.reindex(result.index) assert_series_equal(result, expected) def test_dont_clobber_name_column(self): diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 79ef285d7c5c2..74cbb956663ce 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -3955,6 +3955,48 @@ def test_rank(self): iranks = iseries.rank() exp = iseries.astype(float).rank() assert_series_equal(iranks, exp) + iseries = Series(np.arange(5)) + 1.0 + exp = iseries / 5.0 + iranks = iseries.rank(pct=True) + + assert_series_equal(iranks, exp) + + iseries = Series(np.repeat(1, 100)) + exp = Series(np.repeat(0.505, 100)) + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + iseries[1] = np.nan + exp = Series(np.repeat(50.0 / 99.0, 100)) + exp[1] = np.nan + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + iseries = Series(np.arange(5)) + 1.0 + iseries[4] = np.nan + exp = iseries / 4.0 + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + iseries = Series(np.repeat(np.nan, 100)) + exp = iseries.copy() + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + iseries = Series(np.arange(5)) + 1 + iseries[4] = np.nan + exp = iseries / 4.0 + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + rng = date_range('1/1/1990', periods=5) + + iseries = Series(np.arange(5), rng) + 1 + iseries.ix[4] = np.nan + exp = iseries / 4.0 + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + def test_from_csv(self): diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py index 4b2f097c212f8..01b44cbd5351c 100644 --- a/vb_suite/groupby.py +++ b/vb_suite/groupby.py @@ -47,6 +47,11 @@ def f(): Benchmark('simple_series.groupby(key1).sum()', setup, start_date=datetime(2011, 3, 1)) + +stmt4 = "df.groupby('key1').rank(pct=True)" +groupby_series_simple_cython = Benchmark(stmt4, setup, + 
start_date=datetime(2014, 1, 16)) + #---------------------------------------------------------------------- # 2d grouping, aggregate many columns diff --git a/vb_suite/stat_ops.py b/vb_suite/stat_ops.py index 91741eb3c3759..a3a1df70dc248 100644 --- a/vb_suite/stat_ops.py +++ b/vb_suite/stat_ops.py @@ -85,6 +85,10 @@ stats_rank_average = Benchmark('s.rank()', setup, start_date=datetime(2011, 12, 12)) +stats_rank_pct_average = Benchmark('s.rank(pct=True)', setup, + start_date=datetime(2014, 1, 16)) +stats_rank_pct_average_old = Benchmark('s.rank() / len(s)', setup, + start_date=datetime(2014, 1, 16)) setup = common_setup + """ values = np.random.randint(0, 100000, size=200000) s = Series(values)