
ENH: series rank has a percentage rank option #5978


Merged: 1 commit, Feb 16, 2014
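
This PR adds a `pct` keyword to `Series.rank`: when True, each rank is divided by the number of valid (non-NaN) values, yielding percentage ranks. A minimal sketch of the behavior, grounded in the tests below (default `method='average'`):

    import pandas as pd

    s = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
    s.rank()          # [1.0, 2.0, 3.0, 4.0, 5.0]
    s.rank(pct=True)  # ranks / 5 valid values -> [0.2, 0.4, 0.6, 0.8, 1.0]
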
3 changes: 3 additions & 0 deletions doc/source/release.rst
@@ -66,6 +66,7 @@ API Changes
- ``df['col'] = value`` and ``df.loc[:,'col'] = value`` are now completely equivalent;
previously the ``.loc`` would not necessarily coerce the dtype of the resultant series (:issue:`6149`)


Experimental Features
~~~~~~~~~~~~~~~~~~~~~

@@ -83,6 +84,8 @@ Improvements to existing features
- implement joining a single-level indexed DataFrame on a matching column of a multi-indexed DataFrame (:issue:`3662`)
- Performance improvement in indexing into a multi-indexed Series (:issue:`5567`)
- Testing statements updated to use specialized asserts (:issue:`6175`)
- ``Series.rank()`` now has a percentage rank option (:issue:`5971`)


.. _release.bug_fixes-0.14.0:

29 changes: 22 additions & 7 deletions pandas/algos.pyx
@@ -131,7 +131,7 @@ cdef _take_2d_object(ndarray[object, ndim=2] values,


def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
na_option='keep'):
na_option='keep', pct=False):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
@@ -144,6 +144,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
float64_t sum_ranks = 0
int tiebreak = 0
bint keep_na = 0
float count = 0.0
tiebreak = tiebreakers[ties_method]

values = np.asarray(in_arr).copy()
@@ -182,6 +183,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
if (val == nan_value) and keep_na:
ranks[argsorted[i]] = nan
continue
count += 1.0
if i == n - 1 or fabs(sorted_data[i + 1] - val) > FP_ERR:
if tiebreak == TIEBREAK_AVERAGE:
for j in range(i - dups + 1, i + 1):
@@ -199,11 +201,14 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = 2 * i - j - dups + 2
sum_ranks = dups = 0
return ranks
if pct:
return ranks / count
else:
return ranks


def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
na_option='keep'):
na_option='keep', pct=False):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
@@ -216,6 +221,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
int64_t val
float64_t sum_ranks = 0
int tiebreak = 0
float count = 0.0
tiebreak = tiebreakers[ties_method]

values = np.asarray(in_arr)
@@ -242,6 +248,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
sum_ranks += i + 1
dups += 1
val = sorted_data[i]
count += 1.0
if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0:
if tiebreak == TIEBREAK_AVERAGE:
for j in range(i - dups + 1, i + 1):
@@ -259,7 +266,10 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = 2 * i - j - dups + 2
sum_ranks = dups = 0
return ranks
if pct:
return ranks / count
else:
return ranks


def rank_2d_float64(object in_arr, axis=0, ties_method='average',
@@ -414,7 +424,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',


def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
ascending=True, na_option='keep'):
ascending=True, na_option='keep', pct=False):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
@@ -428,6 +438,8 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
float64_t sum_ranks = 0
int tiebreak = 0
bint keep_na = 0
float count = 0.0


tiebreak = tiebreakers[ties_method]

@@ -469,7 +481,6 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',

sorted_data = values.take(_as)
argsorted = _as.astype('i8')

for i in range(n):
sum_ranks += i + 1
dups += 1
Expand All @@ -479,6 +490,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
continue
count += 1.0
if (i == n - 1 or
are_diff(util.get_value_at(sorted_data, i + 1), val)):
if tiebreak == TIEBREAK_AVERAGE:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = sum_ranks / dups
@@ -491,7 +503,10 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
elif tiebreak == TIEBREAK_FIRST:
raise ValueError('first not supported for non-numeric data')
sum_ranks = dups = 0
return ranks
if pct:
return ranks / count
else:
return ranks

cdef inline are_diff(object left, object right):
try:
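
All three 1-D kernels follow the same pattern: tally the valid (non-NaN) observations in `count` while ranking, then divide the finished rank array by that count when `pct=True`. A rough pure-Python sketch of that logic, assuming `ties_method='average'` and `na_option='keep'` (the helper name is hypothetical, not part of pandas):

    import numpy as np

    def rank_pct_sketch(values):
        # average ranks of the non-NaN values, divided by the number of
        # non-NaN values -- the pct=True behavior; NaNs stay NaN ('keep')
        arr = np.asarray(values, dtype=np.float64)
        valid = ~np.isnan(arr)
        out = np.full(arr.shape, np.nan)
        vals = arr[valid]
        order = vals.argsort(kind='mergesort')  # stable, like the Cython path
        sorted_vals = vals[order]
        ranks = np.empty(len(vals), dtype=np.float64)
        i = 0
        while i < len(vals):
            j = i
            while j + 1 < len(vals) and sorted_vals[j + 1] == sorted_vals[i]:
                j += 1                          # extend the tie group
            # assign the average of the 1-based ranks i+1 .. j+1 to the group
            ranks[order[i:j + 1]] = (i + j + 2) / 2.0
            i = j + 1
        out[valid] = ranks / len(vals)          # the pct division by `count`
        return out

As a quick check, rank_pct_sketch([1.0, 2.0, 2.0, 4.0]) gives [0.25, 0.625, 0.625, 1.0], matching Series([1.0, 2.0, 2.0, 4.0]).rank(pct=True).
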
4 changes: 2 additions & 2 deletions pandas/core/algorithms.py
@@ -285,14 +285,14 @@ def mode(values):


def rank(values, axis=0, method='average', na_option='keep',
ascending=True):
ascending=True, pct=False):
"""

"""
if values.ndim == 1:
f, values = _get_data_algo(values, _rank1d_functions)
ranks = f(values, ties_method=method, ascending=ascending,
na_option=na_option)
na_option=na_option, pct=pct)
elif values.ndim == 2:
f, values = _get_data_algo(values, _rank2d_functions)
ranks = f(values, axis=axis, ties_method=method,
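
Note that only the 1-D path threads `pct` through here; the `ndim == 2` branch is left unchanged, so in this PR the percentage-rank option is exposed for Series (and groupby on Series) but not for the 2-D DataFrame path.
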
9 changes: 6 additions & 3 deletions pandas/core/series.py
@@ -1707,7 +1707,8 @@ def argsort(self, axis=0, kind='quicksort', order=None):
np.argsort(values, kind=kind), index=self.index,
dtype='int64').__finalize__(self)

def rank(self, method='average', na_option='keep', ascending=True):
def rank(self, method='average', na_option='keep', ascending=True,
pct=False):
"""
Compute data ranks (1 through n). Equal values are assigned a rank that
is the average of the ranks of those values
@@ -1723,14 +1724,16 @@ def rank(self, method='average', na_option='keep', ascending=True,
keep: leave NA values where they are
ascending : boolean, default True
False for ranks by high (1) to low (N)

pct : boolean, default False
If True, return percentage ranks: each rank is divided by the
number of valid (non-NaN) values

Returns
-------
ranks : Series
"""
from pandas.core.algorithms import rank
ranks = rank(self.values, method=method, na_option=na_option,
ascending=ascending)
ascending=ascending, pct=pct)
return self._constructor(ranks, index=self.index).__finalize__(self)

def order(self, na_last=True, ascending=True, kind='mergesort'):
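
As the docstring addition suggests, ties and NaNs behave as follows (a small illustration, assuming the default method='average'):

    import numpy as np
    import pandas as pd

    s = pd.Series([7.0, 1.0, 3.0, 3.0, np.nan])
    s.rank()          # [4.0, 1.0, 2.5, 2.5, NaN]
    s.rank(pct=True)  # divided by the 4 valid values: [1.0, 0.25, 0.625, 0.625, NaN]
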
8 changes: 8 additions & 0 deletions pandas/tests/test_groupby.py
@@ -2348,7 +2348,15 @@ def test_rank_apply(self):
expected.append(piece.value.rank())
expected = concat(expected, axis=0)
expected = expected.reindex(result.index)
assert_series_equal(result, expected)

result = df.groupby(['key1', 'key2']).value.rank(pct=True)

expected = []
for key, piece in df.groupby(['key1', 'key2']):
expected.append(piece.value.rank(pct=True))
expected = concat(expected, axis=0)
expected = expected.reindex(result.index)
assert_series_equal(result, expected)

def test_dont_clobber_name_column(self):
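
The test above relies on groupby forwarding keyword arguments to Series.rank, so pct=True ranks within each group. A small illustration with made-up data:

    import pandas as pd

    df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b'],
                       'value': [10.0, 20.0, 5.0, 5.0]})
    df.groupby('key1').value.rank(pct=True)
    # group a: [0.5, 1.0]; group b (tied): [0.75, 0.75]
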
42 changes: 42 additions & 0 deletions pandas/tests/test_series.py
@@ -3955,6 +3955,48 @@ def test_rank(self):
iranks = iseries.rank()
exp = iseries.astype(float).rank()
assert_series_equal(iranks, exp)
iseries = Series(np.arange(5)) + 1.0
Contributor: can you add a couple more tests, maybe an all-NaN series, and for groupby, a group that has one element?

Contributor (author): added more tests with partial NaNs and duplicate values. NaNs will always be NaNs, so I'm not sure we would ever catch a bug if all were NaN.

Contributor: but that is the check; make sure you propagate NaNs. The edge cases are always important to test (and usually the hardest to get right).

Contributor (author): easy enough. I added that as well.

exp = iseries / 5.0
iranks = iseries.rank(pct=True)
Contributor: I think you have it here, but can you add a test with 'int64' and datetimes (e.g. a date series)? The result, of course, should be float64.

Contributor (author): done

assert_series_equal(iranks, exp)

iseries = Series(np.repeat(1, 100))
exp = Series(np.repeat(0.505, 100))
iranks = iseries.rank(pct=True)
assert_series_equal(iranks, exp)

iseries[1] = np.nan
exp = Series(np.repeat(50.0 / 99.0, 100))
exp[1] = np.nan
iranks = iseries.rank(pct=True)
assert_series_equal(iranks, exp)

iseries = Series(np.arange(5)) + 1.0
iseries[4] = np.nan
exp = iseries / 4.0
iranks = iseries.rank(pct=True)
assert_series_equal(iranks, exp)

iseries = Series(np.repeat(np.nan, 100))
exp = iseries.copy()
iranks = iseries.rank(pct=True)
assert_series_equal(iranks, exp)

iseries = Series(np.arange(5)) + 1
iseries[4] = np.nan
exp = iseries / 4.0
iranks = iseries.rank(pct=True)
assert_series_equal(iranks, exp)
rng = date_range('1/1/1990', periods=5)

iseries = Series(np.arange(5), rng) + 1
iseries.ix[4] = np.nan
exp = iseries / 4.0
iranks = iseries.rank(pct=True)
assert_series_equal(iranks, exp)



def test_from_csv(self):

5 changes: 5 additions & 0 deletions vb_suite/groupby.py
@@ -47,6 +47,11 @@ def f():
Benchmark('simple_series.groupby(key1).sum()', setup,
start_date=datetime(2011, 3, 1))


stmt4 = "df.groupby('key1').rank(pct=True)"
groupby_series_simple_cython = Benchmark(stmt4, setup,
start_date=datetime(2014, 1, 16))

#----------------------------------------------------------------------
# 2d grouping, aggregate many columns

4 changes: 4 additions & 0 deletions vb_suite/stat_ops.py
@@ -85,6 +85,10 @@
stats_rank_average = Benchmark('s.rank()', setup,
start_date=datetime(2011, 12, 12))

stats_rank_pct_average = Benchmark('s.rank(pct=True)', setup,
start_date=datetime(2014, 1, 16))
stats_rank_pct_average_old = Benchmark('s.rank() / len(s)', setup,
start_date=datetime(2014, 1, 16))
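# note: the manual division above only matches pct=True when the series has
# no NaNs, since pct ranks are divided by the count of valid values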
setup = common_setup + """
values = np.random.randint(0, 100000, size=200000)
s = Series(values)