Skip to content

Commit d3dd67c

Browse files
committed
Merge pull request #5978 from MichaelWS/master
ENH: series rank has a percentage rank option
2 parents 5e64e88 + 7b37858 commit d3dd67c

File tree

8 files changed

+92
-12
lines changed

8 files changed

+92
-12
lines changed

doc/source/release.rst

+3
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ API Changes
6666
- ``df['col'] = value`` and ``df.loc[:,'col'] = value`` are now completely equivalent;
6767
previously the ``.loc`` would not necessarily coerce the dtype of the resultant series (:issue:`6149`)
6868

69+
6970
Experimental Features
7071
~~~~~~~~~~~~~~~~~~~~~
7172

@@ -83,6 +84,8 @@ Improvements to existing features
8384
- implement joining a single-level indexed DataFrame on a matching column of a multi-indexed DataFrame (:issue:`3662`)
8485
- Performance improvement in indexing into a multi-indexed Series (:issue:`5567`)
8586
- Testing statements updated to use specialized asserts (:issue:`6175`)
87+
- ``Series.rank()`` now has a percentage rank option (:issue:`5971`)
88+
8689

8790
.. _release.bug_fixes-0.14.0:
8891

pandas/algos.pyx

+22-7
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ cdef _take_2d_object(ndarray[object, ndim=2] values,
131131

132132

133133
def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
134-
na_option='keep'):
134+
na_option='keep', pct=False):
135135
"""
136136
Fast NaN-friendly version of scipy.stats.rankdata
137137
"""
@@ -144,6 +144,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
144144
float64_t sum_ranks = 0
145145
int tiebreak = 0
146146
bint keep_na = 0
147+
float count = 0.0
147148
tiebreak = tiebreakers[ties_method]
148149

149150
values = np.asarray(in_arr).copy()
@@ -182,6 +183,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
182183
if (val == nan_value) and keep_na:
183184
ranks[argsorted[i]] = nan
184185
continue
186+
count += 1.0
185187
if i == n - 1 or fabs(sorted_data[i + 1] - val) > FP_ERR:
186188
if tiebreak == TIEBREAK_AVERAGE:
187189
for j in range(i - dups + 1, i + 1):
@@ -199,11 +201,14 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
199201
for j in range(i - dups + 1, i + 1):
200202
ranks[argsorted[j]] = 2 * i - j - dups + 2
201203
sum_ranks = dups = 0
202-
return ranks
204+
if pct:
205+
return ranks / count
206+
else:
207+
return ranks
203208

204209

205210
def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
206-
na_option='keep'):
211+
na_option='keep', pct=False):
207212
"""
208213
Fast NaN-friendly version of scipy.stats.rankdata
209214
"""
@@ -216,6 +221,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
216221
int64_t val
217222
float64_t sum_ranks = 0
218223
int tiebreak = 0
224+
float count = 0.0
219225
tiebreak = tiebreakers[ties_method]
220226

221227
values = np.asarray(in_arr)
@@ -242,6 +248,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
242248
sum_ranks += i + 1
243249
dups += 1
244250
val = sorted_data[i]
251+
count += 1.0
245252
if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0:
246253
if tiebreak == TIEBREAK_AVERAGE:
247254
for j in range(i - dups + 1, i + 1):
@@ -259,7 +266,10 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
259266
for j in range(i - dups + 1, i + 1):
260267
ranks[argsorted[j]] = 2 * i - j - dups + 2
261268
sum_ranks = dups = 0
262-
return ranks
269+
if pct:
270+
return ranks / count
271+
else:
272+
return ranks
263273

264274

265275
def rank_2d_float64(object in_arr, axis=0, ties_method='average',
@@ -414,7 +424,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
414424

415425

416426
def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
417-
ascending=True, na_option='keep'):
427+
ascending=True, na_option='keep', pct=False):
418428
"""
419429
Fast NaN-friendly version of scipy.stats.rankdata
420430
"""
@@ -428,6 +438,8 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
428438
float64_t sum_ranks = 0
429439
int tiebreak = 0
430440
bint keep_na = 0
441+
float count = 0.0
442+
431443

432444
tiebreak = tiebreakers[ties_method]
433445

@@ -469,7 +481,6 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
469481

470482
sorted_data = values.take(_as)
471483
argsorted = _as.astype('i8')
472-
473484
for i in range(n):
474485
sum_ranks += i + 1
475486
dups += 1
@@ -479,6 +490,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
479490
continue
480491
if (i == n - 1 or
481492
are_diff(util.get_value_at(sorted_data, i + 1), val)):
493+
count += 1.0
482494
if tiebreak == TIEBREAK_AVERAGE:
483495
for j in range(i - dups + 1, i + 1):
484496
ranks[argsorted[j]] = sum_ranks / dups
@@ -491,7 +503,10 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
491503
elif tiebreak == TIEBREAK_FIRST:
492504
raise ValueError('first not supported for non-numeric data')
493505
sum_ranks = dups = 0
494-
return ranks
506+
if pct:
507+
return ranks / count
508+
else:
509+
return ranks
495510

496511
cdef inline are_diff(object left, object right):
497512
try:

pandas/core/algorithms.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -285,14 +285,14 @@ def mode(values):
285285

286286

287287
def rank(values, axis=0, method='average', na_option='keep',
288-
ascending=True):
288+
ascending=True, pct=False):
289289
"""
290290
291291
"""
292292
if values.ndim == 1:
293293
f, values = _get_data_algo(values, _rank1d_functions)
294294
ranks = f(values, ties_method=method, ascending=ascending,
295-
na_option=na_option)
295+
na_option=na_option, pct=pct)
296296
elif values.ndim == 2:
297297
f, values = _get_data_algo(values, _rank2d_functions)
298298
ranks = f(values, axis=axis, ties_method=method,

pandas/core/series.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -1707,7 +1707,8 @@ def argsort(self, axis=0, kind='quicksort', order=None):
17071707
np.argsort(values, kind=kind), index=self.index,
17081708
dtype='int64').__finalize__(self)
17091709

1710-
def rank(self, method='average', na_option='keep', ascending=True):
1710+
def rank(self, method='average', na_option='keep', ascending=True,
1711+
pct=False):
17111712
"""
17121713
Compute data ranks (1 through n). Equal values are assigned a rank that
17131714
is the average of the ranks of those values
@@ -1723,14 +1724,16 @@ def rank(self, method='average', na_option='keep', ascending=True):
17231724
keep: leave NA values where they are
17241725
ascending : boolean, default True
17251726
False for ranks by high (1) to low (N)
1726-
1727+
pct : boolean, default False
1728+
Computes percentage rank of data
1729+
17271730
Returns
17281731
-------
17291732
ranks : Series
17301733
"""
17311734
from pandas.core.algorithms import rank
17321735
ranks = rank(self.values, method=method, na_option=na_option,
1733-
ascending=ascending)
1736+
ascending=ascending, pct=pct)
17341737
return self._constructor(ranks, index=self.index).__finalize__(self)
17351738

17361739
def order(self, na_last=True, ascending=True, kind='mergesort'):

pandas/tests/test_groupby.py

+8
Original file line numberDiff line numberDiff line change
@@ -2348,7 +2348,15 @@ def test_rank_apply(self):
23482348
expected.append(piece.value.rank())
23492349
expected = concat(expected, axis=0)
23502350
expected = expected.reindex(result.index)
2351+
assert_series_equal(result, expected)
2352+
2353+
result = df.groupby(['key1', 'key2']).value.rank(pct=True)
23512354

2355+
expected = []
2356+
for key, piece in df.groupby(['key1', 'key2']):
2357+
expected.append(piece.value.rank(pct=True))
2358+
expected = concat(expected, axis=0)
2359+
expected = expected.reindex(result.index)
23522360
assert_series_equal(result, expected)
23532361

23542362
def test_dont_clobber_name_column(self):

pandas/tests/test_series.py

+42
Original file line numberDiff line numberDiff line change
@@ -3955,6 +3955,48 @@ def test_rank(self):
39553955
iranks = iseries.rank()
39563956
exp = iseries.astype(float).rank()
39573957
assert_series_equal(iranks, exp)
3958+
iseries = Series(np.arange(5)) + 1.0
3959+
exp = iseries / 5.0
3960+
iranks = iseries.rank(pct=True)
3961+
3962+
assert_series_equal(iranks, exp)
3963+
3964+
iseries = Series(np.repeat(1, 100))
3965+
exp = Series(np.repeat(0.505, 100))
3966+
iranks = iseries.rank(pct=True)
3967+
assert_series_equal(iranks, exp)
3968+
3969+
iseries[1] = np.nan
3970+
exp = Series(np.repeat(50.0 / 99.0, 100))
3971+
exp[1] = np.nan
3972+
iranks = iseries.rank(pct=True)
3973+
assert_series_equal(iranks, exp)
3974+
3975+
iseries = Series(np.arange(5)) + 1.0
3976+
iseries[4] = np.nan
3977+
exp = iseries / 4.0
3978+
iranks = iseries.rank(pct=True)
3979+
assert_series_equal(iranks, exp)
3980+
3981+
iseries = Series(np.repeat(np.nan, 100))
3982+
exp = iseries.copy()
3983+
iranks = iseries.rank(pct=True)
3984+
assert_series_equal(iranks, exp)
3985+
3986+
iseries = Series(np.arange(5)) + 1
3987+
iseries[4] = np.nan
3988+
exp = iseries / 4.0
3989+
iranks = iseries.rank(pct=True)
3990+
assert_series_equal(iranks, exp)
3991+
rng = date_range('1/1/1990', periods=5)
3992+
3993+
iseries = Series(np.arange(5), rng) + 1
3994+
iseries.ix[4] = np.nan
3995+
exp = iseries / 4.0
3996+
iranks = iseries.rank(pct=True)
3997+
assert_series_equal(iranks, exp)
3998+
3999+
39584000

39594001
def test_from_csv(self):
39604002

vb_suite/groupby.py

+5
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ def f():
4747
Benchmark('simple_series.groupby(key1).sum()', setup,
4848
start_date=datetime(2011, 3, 1))
4949

50+
51+
stmt4 = "df.groupby('key1').rank(pct=True)"
52+
groupby_series_simple_cython = Benchmark(stmt4, setup,
53+
start_date=datetime(2014, 1, 16))
54+
5055
#----------------------------------------------------------------------
5156
# 2d grouping, aggregate many columns
5257

vb_suite/stat_ops.py

+4
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,10 @@
8585
stats_rank_average = Benchmark('s.rank()', setup,
8686
start_date=datetime(2011, 12, 12))
8787

88+
stats_rank_pct_average = Benchmark('s.rank(pct=True)', setup,
89+
start_date=datetime(2014, 1, 16))
90+
stats_rank_pct_average_old = Benchmark('s.rank() / s.size', setup,
91+
start_date=datetime(2014, 1, 16))
8892
setup = common_setup + """
8993
values = np.random.randint(0, 100000, size=200000)
9094
s = Series(values)

0 commit comments

Comments
 (0)