Skip to content

Commit 72f0758

Browse files
changhiskhan authored and wesm committed
ENH: rank na_options top and bottom #1508
1 parent 6484b9d commit 72f0758

File tree

4 files changed

+91
-19
lines changed

4 files changed

+91
-19
lines changed

pandas/core/algorithms.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,11 +191,12 @@ def rank(values, axis=0, method='average', na_option='keep',
191191
"""
192192
if values.ndim == 1:
193193
f, values = _get_data_algo(values, _rank1d_functions)
194-
ranks = f(values, ties_method=method, ascending=ascending)
194+
ranks = f(values, ties_method=method, ascending=ascending,
195+
na_option=na_option)
195196
elif values.ndim == 2:
196197
f, values = _get_data_algo(values, _rank2d_functions)
197198
ranks = f(values, axis=axis, ties_method=method,
198-
ascending=ascending)
199+
ascending=ascending, na_option=na_option)
199200
return ranks
200201

201202

pandas/core/frame.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4704,8 +4704,10 @@ def rank(self, axis=0, numeric_only=None, method='average',
47044704
min: lowest rank in group
47054705
max: highest rank in group
47064706
first: ranks assigned in order they appear in the array
4707-
na_option : {'keep'}
4707+
na_option : {'keep', 'top', 'bottom'}
47084708
keep: leave NA values where they are
4709+
top: NA values receive the smallest ranks (ascending or descending)
4710+
bottom: NA values receive the largest ranks (ascending or descending)
47094711
ascending : boolean, default True
47104712
False for ranks by high (1) to low (N)
47114713
@@ -4716,7 +4718,7 @@ def rank(self, axis=0, numeric_only=None, method='average',
47164718
if numeric_only is None:
47174719
try:
47184720
ranks = algos.rank(self.values, axis=axis, method=method,
4719-
ascending=ascending)
4721+
ascending=ascending, na_option=na_option)
47204722
return DataFrame(ranks, index=self.index, columns=self.columns)
47214723
except TypeError:
47224724
numeric_only = True
@@ -4726,7 +4728,7 @@ def rank(self, axis=0, numeric_only=None, method='average',
47264728
else:
47274729
data = self
47284730
ranks = algos.rank(data.values, axis=axis, method=method,
4729-
ascending=ascending)
4731+
ascending=ascending, na_option=na_option)
47304732
return DataFrame(ranks, index=data.index, columns=data.columns)
47314733

47324734
def to_timestamp(self, freq=None, how='start', axis=0, copy=True):

pandas/src/stats.pyx

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,8 @@ cdef _take_2d_object(ndarray[object, ndim=2] values,
7070
return result
7171

7272

73-
def rank_1d_float64(object in_arr, ties_method='average', ascending=True):
73+
def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
74+
na_option='keep'):
7475
"""
7576
Fast NaN-friendly version of scipy.stats.rankdata
7677
"""
@@ -86,7 +87,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True):
8687

8788
values = np.asarray(in_arr).copy()
8889

89-
if ascending:
90+
if ascending ^ (na_option == 'top'):
9091
nan_value = np.inf
9192
else:
9293
nan_value = -np.inf
@@ -115,7 +116,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True):
115116
sum_ranks += i + 1
116117
dups += 1
117118
val = sorted_data[i]
118-
if val == nan_value:
119+
if (val == nan_value) and (na_option == 'keep'):
119120
ranks[argsorted[i]] = nan
120121
continue
121122
if i == n - 1 or fabs(sorted_data[i + 1] - val) > FP_ERR:
@@ -138,7 +139,8 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True):
138139
return ranks
139140

140141

141-
def rank_1d_int64(object in_arr, ties_method='average', ascending=True):
142+
def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
143+
na_option='keep'):
142144
"""
143145
Fast NaN-friendly version of scipy.stats.rankdata
144146
"""
@@ -198,7 +200,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True):
198200

199201

200202
def rank_2d_float64(object in_arr, axis=0, ties_method='average',
201-
ascending=True):
203+
ascending=True, na_option='keep'):
202204
"""
203205
Fast NaN-friendly version of scipy.stats.rankdata
204206
"""
@@ -219,7 +221,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
219221
else:
220222
values = in_arr.copy()
221223

222-
if ascending:
224+
if ascending ^ (na_option == 'top'):
223225
nan_value = np.inf
224226
else:
225227
nan_value = -np.inf
@@ -249,7 +251,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
249251
sum_ranks += j + 1
250252
dups += 1
251253
val = values[i, j]
252-
if val == nan_value:
254+
if val == nan_value and na_option == 'keep':
253255
ranks[i, argsorted[i, j]] = nan
254256
continue
255257
if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
@@ -277,7 +279,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
277279

278280

279281
def rank_2d_int64(object in_arr, axis=0, ties_method='average',
280-
ascending=True):
282+
ascending=True, na_option='keep'):
281283
"""
282284
Fast NaN-friendly version of scipy.stats.rankdata
283285
"""
@@ -345,7 +347,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
345347

346348

347349
def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
348-
ascending=True):
350+
ascending=True, na_option='keep'):
349351
"""
350352
Fast NaN-friendly version of scipy.stats.rankdata
351353
"""
@@ -365,7 +367,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
365367
if values.dtype != np.object_:
366368
values = values.astype('O')
367369

368-
if ascending:
370+
if ascending ^ (na_option == 'top'):
369371
# always greater than everything
370372
nan_value = Infinity()
371373
else:
@@ -401,7 +403,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
401403
sum_ranks += i + 1
402404
dups += 1
403405
val = util.get_value_at(sorted_data, i)
404-
if val is nan_value:
406+
if val is nan_value and na_option=='keep':
405407
ranks[argsorted[i]] = nan
406408
continue
407409
if (i == n - 1 or
@@ -450,7 +452,7 @@ class NegInfinity(object):
450452
__cmp__ = _return_true
451453

452454
def rank_2d_generic(object in_arr, axis=0, ties_method='average',
453-
ascending=True):
455+
ascending=True, na_option='keep'):
454456
"""
455457
Fast NaN-friendly version of scipy.stats.rankdata
456458
"""
@@ -475,7 +477,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
475477
if values.dtype != np.object_:
476478
values = values.astype('O')
477479

478-
if ascending:
480+
if ascending ^ (na_option == 'top'):
479481
# always greater than everything
480482
nan_value = Infinity()
481483
else:
@@ -510,7 +512,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
510512
dups = sum_ranks = infs = 0
511513
for j in range(k):
512514
val = values[i, j]
513-
if val is nan_value:
515+
if val is nan_value and na_option == 'keep':
514516
ranks[i, argsorted[i, j]] = nan
515517
infs += 1
516518
continue

pandas/tests/test_frame.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6459,6 +6459,73 @@ def test_rank2(self):
64596459
expected = self.mixed_frame.rank(1, numeric_only=True)
64606460
assert_frame_equal(result, expected)
64616461

6462+
def test_rank_na_option(self):
6463+
from pandas.compat.scipy import rankdata
6464+
6465+
self.frame['A'][::2] = np.nan
6466+
self.frame['B'][::3] = np.nan
6467+
self.frame['C'][::4] = np.nan
6468+
self.frame['D'][::5] = np.nan
6469+
6470+
#bottom
6471+
ranks0 = self.frame.rank(na_option='bottom')
6472+
ranks1 = self.frame.rank(1, na_option='bottom')
6473+
6474+
fvals = self.frame.fillna(np.inf).values
6475+
6476+
exp0 = np.apply_along_axis(rankdata, 0, fvals)
6477+
exp1 = np.apply_along_axis(rankdata, 1, fvals)
6478+
6479+
assert_almost_equal(ranks0.values, exp0)
6480+
assert_almost_equal(ranks1.values, exp1)
6481+
6482+
#top
6483+
ranks0 = self.frame.rank(na_option='top')
6484+
ranks1 = self.frame.rank(1, na_option='top')
6485+
6486+
fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
6487+
fval1 = self.frame.T
6488+
fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
6489+
fval1 = fval1.fillna(np.inf).values
6490+
6491+
exp0 = np.apply_along_axis(rankdata, 0, fval0)
6492+
exp1 = np.apply_along_axis(rankdata, 1, fval1)
6493+
6494+
assert_almost_equal(ranks0.values, exp0)
6495+
assert_almost_equal(ranks1.values, exp1)
6496+
6497+
#descending
6498+
6499+
#top
6500+
ranks0 = self.frame.rank(na_option='top', ascending=False)
6501+
ranks1 = self.frame.rank(1, na_option='top', ascending=False)
6502+
6503+
fvals = self.frame.fillna(np.inf).values
6504+
6505+
exp0 = np.apply_along_axis(rankdata, 0, -fvals)
6506+
exp1 = np.apply_along_axis(rankdata, 1, -fvals)
6507+
6508+
assert_almost_equal(ranks0.values, exp0)
6509+
assert_almost_equal(ranks1.values, exp1)
6510+
6511+
#descending
6512+
6513+
#bottom
6514+
ranks0 = self.frame.rank(na_option='bottom', ascending=False)
6515+
ranks1 = self.frame.rank(1, na_option='bottom', ascending=False)
6516+
6517+
fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
6518+
fval1 = self.frame.T
6519+
fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
6520+
fval1 = fval1.fillna(np.inf).values
6521+
6522+
exp0 = np.apply_along_axis(rankdata, 0, -fval0)
6523+
exp1 = np.apply_along_axis(rankdata, 1, -fval1)
6524+
6525+
assert_almost_equal(ranks0.values, exp0)
6526+
assert_almost_equal(ranks1.values, exp1)
6527+
6528+
64626529
def test_describe(self):
64636530
desc = self.tsframe.describe()
64646531
desc = self.mixed_frame.describe()

0 commit comments

Comments
 (0)