Skip to content

FIX: add support for desc order when ranking infs with nans #19538 #20091

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Mar 30, 2018
6 changes: 3 additions & 3 deletions pandas/_libs/algos_rank_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,

sorted_data = values.take(_as)
sorted_mask = mask.take(_as)
_indices = order[1].take(_as).nonzero()[0]
_indices = np.diff(sorted_mask).nonzero()[0]
non_na_idx = _indices[0] if len(_indices) > 0 else -1
argsorted = _as.astype('i8')

Expand All @@ -153,7 +153,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,

if (i == n - 1 or
are_diff(util.get_value_at(sorted_data, i + 1), val) or
i == non_na_idx - 1):
i == non_na_idx):
if tiebreak == TIEBREAK_AVERAGE:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = sum_ranks / dups
Expand Down Expand Up @@ -190,7 +190,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
count += 1.0

if (i == n - 1 or sorted_data[i + 1] != val or
i == non_na_idx - 1):
i == non_na_idx):
if tiebreak == TIEBREAK_AVERAGE:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = sum_ranks / dups
Expand Down
60 changes: 38 additions & 22 deletions pandas/tests/series/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
from pandas.tests.series.common import TestData
from pandas._libs.tslib import iNaT
from pandas._libs.algos import Infinity, NegInfinity
from itertools import chain
import pandas.util._test_decorators as td


class TestSeriesRank(TestData):
Expand Down Expand Up @@ -257,38 +259,52 @@ def _check(s, expected, method='average'):
series = s if dtype is None else s.astype(dtype)
_check(series, results[method], method=method)

def test_rank_tie_methods_on_infs_nans(self):
@td.skip_if_no_scipy
@pytest.mark.parametrize('ascending', [True, False])
@pytest.mark.parametrize('method', ['average', 'min', 'max', 'first',
'dense'])
@pytest.mark.parametrize('na_option', ['top', 'bottom', 'keep'])
def test_rank_tie_methods_on_infs_nans(self, method, na_option, ascending):
dtypes = [('object', None, Infinity(), NegInfinity()),
('float64', np.nan, np.inf, -np.inf)]
chunk = 3
disabled = set([('object', 'first')])

def _check(s, expected, method='average', na_option='keep'):
result = s.rank(method=method, na_option=na_option)
def _check(s, method, na_option, ascending):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you inline these below

exp_ranks = {
'average': ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These should start four spaces to the right of the e in exp_ranks.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Likewise with each one below it.

'min': ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
'max': ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
'first': ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
'dense': ([1, 1, 1], [2, 2, 2], [3, 3, 3])
}
ranks = exp_ranks[method]
if na_option == 'top':
order = [ranks[1], ranks[0], ranks[2]]
elif na_option == 'bottom':
order = [ranks[0], ranks[2], ranks[1]]
else:
order = [ranks[0], [np.nan] * chunk, ranks[1]]
expected = order if ascending else order[::-1]
expected = list(chain.from_iterable(expected))
result = s.rank(method=method, na_option=na_option,
ascending=ascending)
tm.assert_series_equal(result, Series(expected, dtype='float64'))

exp_ranks = {
'average': ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
'min': ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
'max': ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
'first': ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
'dense': ([1, 1, 1], [2, 2, 2], [3, 3, 3])
}
na_options = ('top', 'bottom', 'keep')
for dtype, na_value, pos_inf, neg_inf in dtypes:
in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk
iseries = Series(in_arr, dtype=dtype)
for method, na_opt in product(exp_ranks.keys(), na_options):
ranks = exp_ranks[method]
if (dtype, method) in disabled:
continue
if na_opt == 'top':
order = ranks[1] + ranks[0] + ranks[2]
elif na_opt == 'bottom':
order = ranks[0] + ranks[2] + ranks[1]
else:
order = ranks[0] + [np.nan] * chunk + ranks[1]
_check(iseries, order, method, na_opt)
if (dtype, method) in disabled:
continue
_check(iseries, method, na_option, ascending)

def test_rank_desc_mix_nans_infs(self):
#GH 19538
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Space after #

#check descending ranking when mix nans and infs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Space after #

iseries = Series([1, np.nan, np.inf, -np.inf, 25])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add the issue number

result = iseries.rank(ascending=False)
exp = Series([3, np.nan, 1, 4, 2], dtype='float64')
tm.assert_series_equal(result, exp)

def test_rank_methods_series(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you change this to use the @td.skip_if_no_scipy decorator instead?

pytest.importorskip('scipy.stats.special')
Expand Down