Skip to content

Commit c3c92af

Browse files
committed
BUG: fix issues on rank calculations when +/- inf values are present with NaNs (pandas-dev#6945)
1 parent 0e16818 commit c3c92af

File tree

5 files changed

+209
-46
lines changed

5 files changed

+209
-46
lines changed

doc/source/whatsnew/v0.22.0.txt

+62-1
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,67 @@ Other Enhancements
7676
- Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`)
7777
- :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`)
7878
- :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`)
79+
- :func:`Series.rank` and :func:`DataFrame.rank` can now handle ``inf`` values properly when ``NaN`` values are present (:issue:`6945`)
80+
81+
handle ``inf`` values properly when ``NaN`` are present
82+
"""""""""""""""""""""""""""""""""""""""""""""""""""""""
83+
84+
In previous versions, ``inf`` elements were assigned ``NaN`` as their ranks. Now ranks are calculated properly.
85+
86+
Previous Behavior:
87+
88+
.. code-block:: ipython
89+
90+
In [6]: pd.Series([-np.inf, 0, 1, np.nan, np.inf]).rank()
91+
Out[6]:
92+
0 1.0
93+
1 2.0
94+
2 3.0
95+
3 NaN
96+
4 NaN
97+
dtype: float64
98+
99+
Current Behavior:
100+
101+
.. ipython:: python
102+
103+
In [2]: import numpy as np
104+
105+
In [3]: pd.Series([-np.inf, 0, 1, np.nan, np.inf]).rank()
106+
Out[3]:
107+
0 1.0
108+
1 2.0
109+
2 3.0
110+
3 NaN
111+
4 4.0
112+
dtype: float64
113+
114+
Furthermore, in previous versions, when ``inf`` or ``-inf`` values were ranked together with ``NaN`` values, the calculation did not distinguish ``NaN`` from infinity when ``na_option`` was set to ``'top'`` or ``'bottom'``.
115+
116+
Previous Behavior:
117+
118+
.. code-block:: ipython
119+
120+
In [15]: pd.Series([np.nan, np.nan, -np.inf, -np.inf]).rank(na_option='top')
121+
Out[15]:
122+
0 2.5
123+
1 2.5
124+
2 2.5
125+
3 2.5
126+
dtype: float64
127+
128+
Current Behavior:
129+
130+
.. ipython:: python
131+
132+
In [4]: pd.Series([np.nan, np.nan, -np.inf, -np.inf]).rank(na_option='top')
133+
Out[4]:
134+
0 1.5
135+
1 1.5
136+
2 3.5
137+
3 3.5
138+
dtype: float64
139+
79140

80141
.. _whatsnew_0220.api_breaking:
81142

@@ -227,7 +288,7 @@ Reshaping
227288
Numeric
228289
^^^^^^^
229290

230-
-
291+
- Bug in :func:`Series.rank` and :func:`DataFrame.rank` where infinity values were not ranked properly when ``NaN`` values were present (:issue:`6945`)
231292
-
232293
-
233294

pandas/_libs/algos.pyx

+12-10
Original file line numberDiff line numberDiff line change
@@ -64,22 +64,24 @@ class Infinity(object):
6464
""" provide a positive Infinity comparison method for ranking """
6565

6666
__lt__ = lambda self, other: False
67-
__le__ = lambda self, other: self is other
68-
__eq__ = lambda self, other: self is other
69-
__ne__ = lambda self, other: self is not other
70-
__gt__ = lambda self, other: self is not other
71-
__ge__ = lambda self, other: True
67+
__le__ = lambda self, other: isinstance(other, Infinity)
68+
__eq__ = lambda self, other: isinstance(other, Infinity)
69+
__ne__ = lambda self, other: not isinstance(other, Infinity)
70+
__gt__ = lambda self, other: (not isinstance(other, Infinity) and
71+
not missing.checknull(other))
72+
__ge__ = lambda self, other: not missing.checknull(other)
7273

7374

7475
class NegInfinity(object):
7576
""" provide a negative Infinity comparison method for ranking """
7677

77-
__lt__ = lambda self, other: self is not other
78-
__le__ = lambda self, other: True
79-
__eq__ = lambda self, other: self is other
80-
__ne__ = lambda self, other: self is not other
78+
__lt__ = lambda self, other: (not isinstance(other, NegInfinity) and
79+
not missing.checknull(other))
80+
__le__ = lambda self, other: not missing.checknull(other)
81+
__eq__ = lambda self, other: isinstance(other, NegInfinity)
82+
__ne__ = lambda self, other: not isinstance(other, NegInfinity)
8183
__gt__ = lambda self, other: False
82-
__ge__ = lambda self, other: self is other
84+
__ge__ = lambda self, other: isinstance(other, NegInfinity)
8385

8486

8587
@cython.wraparound(False)

pandas/_libs/algos_rank_helper.pxi.in

+37-27
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'),
2727
{{if dtype == 'object'}}
2828

2929

30-
def rank_1d_{{dtype}}(object in_arr, bint retry=1, ties_method='average',
30+
def rank_1d_{{dtype}}(object in_arr, ties_method='average',
3131
ascending=True, na_option='keep', pct=False):
3232
{{else}}
3333

@@ -40,7 +40,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
4040
"""
4141

4242
cdef:
43-
Py_ssize_t i, j, n, dups = 0, total_tie_count = 0
43+
Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0
4444

4545
{{if dtype == 'object'}}
4646
ndarray sorted_data, values
@@ -50,6 +50,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
5050

5151
ndarray[float64_t] ranks
5252
ndarray[int64_t] argsorted
53+
ndarray[np.uint8_t, cast=True] sorted_mask
5354

5455
{{if dtype == 'uint64'}}
5556
{{ctype}} val
@@ -60,6 +61,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
6061
float64_t sum_ranks = 0
6162
int tiebreak = 0
6263
bint keep_na = 0
64+
bint isnan
6365
float count = 0.0
6466
tiebreak = tiebreakers[ties_method]
6567

@@ -76,12 +78,6 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
7678

7779
keep_na = na_option == 'keep'
7880

79-
{{if dtype != 'uint64'}}
80-
if ascending ^ (na_option == 'top'):
81-
nan_value = {{pos_nan_value}}
82-
else:
83-
nan_value = {{neg_nan_value}}
84-
8581
{{if dtype == 'object'}}
8682
mask = missing.isnaobj(values)
8783
{{elif dtype == 'float64'}}
@@ -90,56 +86,69 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
9086
mask = values == iNaT
9187
{{endif}}
9288

89+
# double sort first by mask and then by values to ensure nan values are
90+
# either at the beginning or the end. mask/(~mask) controls padding at
91+
# tail or the head
92+
{{if dtype != 'uint64'}}
93+
if ascending ^ (na_option == 'top'):
94+
nan_value = {{pos_nan_value}}
95+
order = (values, mask)
96+
else:
97+
nan_value = {{neg_nan_value}}
98+
order = (values, ~mask)
9399
np.putmask(values, mask, nan_value)
100+
{{else}}
101+
mask = np.zeros(shape=len(values), dtype=bool)
102+
order = (values, mask)
94103
{{endif}}
95104

96105
n = len(values)
97106
ranks = np.empty(n, dtype='f8')
98107

99108
{{if dtype == 'object'}}
109+
100110
try:
101-
_as = values.argsort()
111+
_as = np.lexsort(keys=order)
102112
except TypeError:
103-
if not retry:
104-
raise
105-
106-
valid_locs = (~mask).nonzero()[0]
107-
ranks.put(valid_locs, rank_1d_object(values.take(valid_locs), 0,
108-
ties_method=ties_method,
109-
ascending=ascending))
110-
np.putmask(ranks, mask, np.nan)
111-
return ranks
113+
# lexsort on object array will raise TypeError for numpy version
114+
# earlier than 1.11.0. Use argsort with order argument instead.
115+
_dt = [('values', 'O'), ('mask', '?')]
116+
_values = np.asarray(list(zip(order[0], order[1])), dtype=_dt)
117+
_as = np.argsort(_values, kind='mergesort', order=('mask', 'values'))
112118
{{else}}
113119
if tiebreak == TIEBREAK_FIRST:
114120
# need to use a stable sort here
115-
_as = values.argsort(kind='mergesort')
121+
_as = np.lexsort(keys=order)
116122
if not ascending:
117123
tiebreak = TIEBREAK_FIRST_DESCENDING
118124
else:
119-
_as = values.argsort()
125+
_as = np.lexsort(keys=order)
120126
{{endif}}
121127

122128
if not ascending:
123129
_as = _as[::-1]
124130

125131
sorted_data = values.take(_as)
132+
sorted_mask = mask.take(_as)
133+
_indices = order[1].take(_as).nonzero()[0]
134+
non_na_idx = _indices[0] if len(_indices) > 0 else -1
126135
argsorted = _as.astype('i8')
127136

128137
{{if dtype == 'object'}}
129138
for i in range(n):
130139
sum_ranks += i + 1
131140
dups += 1
132-
141+
isnan = sorted_mask[i]
133142
val = util.get_value_at(sorted_data, i)
134143

135-
if (val is nan_value) and keep_na:
144+
if isnan and keep_na:
136145
ranks[argsorted[i]] = nan
137146
continue
138-
139147
count += 1.0
140148

141149
if (i == n - 1 or
142-
are_diff(util.get_value_at(sorted_data, i + 1), val)):
150+
are_diff(util.get_value_at(sorted_data, i + 1), val) or
151+
i == non_na_idx - 1):
143152
if tiebreak == TIEBREAK_AVERAGE:
144153
for j in range(i - dups + 1, i + 1):
145154
ranks[argsorted[j]] = sum_ranks / dups
@@ -164,18 +173,19 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
164173
for i in range(n):
165174
sum_ranks += i + 1
166175
dups += 1
167-
168176
val = sorted_data[i]
169177

170178
{{if dtype != 'uint64'}}
171-
if (val == nan_value) and keep_na:
179+
isnan = sorted_mask[i]
180+
if isnan and keep_na:
172181
ranks[argsorted[i]] = nan
173182
continue
174183
{{endif}}
175184

176185
count += 1.0
177186

178-
if i == n - 1 or sorted_data[i + 1] != val:
187+
if (i == n - 1 or sorted_data[i + 1] != val or
188+
i == non_na_idx - 1):
179189
if tiebreak == TIEBREAK_AVERAGE:
180190
for j in range(i - dups + 1, i + 1):
181191
ranks[argsorted[j]] = sum_ranks / dups

pandas/tests/series/test_rank.py

+75-8
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
from pandas.util.testing import assert_series_equal
1515
import pandas.util.testing as tm
1616
from pandas.tests.series.common import TestData
17+
from pandas._libs.tslib import iNaT
18+
from pandas._libs.algos import Infinity, NegInfinity
1719

1820

1921
class TestSeriesRank(TestData):
@@ -195,16 +197,48 @@ def test_rank_signature(self):
195197
s.rank(method='average')
196198
pytest.raises(ValueError, s.rank, 'average')
197199

198-
def test_rank_inf(self):
199-
pytest.skip('DataFrame.rank does not currently rank '
200-
'np.inf and -np.inf properly')
201-
202-
values = np.array(
203-
[-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10,
204-
2, 40, np.inf], dtype='float64')
200+
@pytest.mark.parametrize('contents,dtype', [
201+
([-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10,
202+
2, 40, np.inf],
203+
'float64'),
204+
([-np.inf, -50, -1, -1e-20, -1e-25, -1e-45, 0, 1e-40, 1e-20, 1e-10,
205+
2, 40, np.inf],
206+
'float32'),
207+
([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max],
208+
'uint8'),
209+
pytest.param([np.iinfo(np.int64).min, -100, 0, 1, 9999, 100000,
210+
1e10, np.iinfo(np.int64).max],
211+
'int64',
212+
marks=pytest.mark.xfail(reason='''iNaT is equivalent to
213+
minimum value of dtype
214+
int64 pending issue
215+
#16674'''),
216+
),
217+
([NegInfinity(), '1', 'A', 'BA', 'Ba', 'C', Infinity()],
218+
'object')
219+
])
220+
def test_rank_inf(self, contents, dtype):
221+
dtype_na_map = {
222+
'float64': np.nan,
223+
'float32': np.nan,
224+
'int64': iNaT,
225+
'object': None
226+
}
227+
# Insert nans at random positions if underlying dtype has missing
228+
# value. Then adjust the expected order by adding nans accordingly
229+
# This is for testing whether rank calculation is affected
230+
# when values are intertwined with nan values.
231+
values = np.array(contents, dtype=dtype)
232+
exp_order = np.array(range(len(values)), dtype='float64') + 1.0
233+
if dtype in dtype_na_map:
234+
na_value = dtype_na_map[dtype]
235+
nan_indices = np.random.choice(range(len(values)), 5)
236+
values = np.insert(values, nan_indices, na_value)
237+
exp_order = np.insert(exp_order, nan_indices, np.nan)
238+
# shuffle the testing array and expected results in the same way
205239
random_order = np.random.permutation(len(values))
206240
iseries = Series(values[random_order])
207-
exp = Series(random_order + 1.0, dtype='float64')
241+
exp = Series(exp_order[random_order], dtype='float64')
208242
iranks = iseries.rank()
209243
assert_series_equal(iranks, exp)
210244

@@ -225,6 +259,39 @@ def _check(s, expected, method='average'):
225259
series = s if dtype is None else s.astype(dtype)
226260
_check(series, results[method], method=method)
227261

262+
def test_rank_tie_methods_on_infs_nans(self):
263+
dtypes = [('object', None, Infinity(), NegInfinity()),
264+
('float64', np.nan, np.inf, -np.inf)]
265+
chunk = 3
266+
disabled = set([('object', 'first')])
267+
268+
def _check(s, expected, method='average', na_option='keep'):
269+
result = s.rank(method=method, na_option=na_option)
270+
tm.assert_series_equal(result, Series(expected, dtype='float64'))
271+
272+
exp_ranks = {
273+
'average': ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
274+
'min': ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
275+
'max': ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
276+
'first': ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
277+
'dense': ([1, 1, 1], [2, 2, 2], [3, 3, 3])
278+
}
279+
na_options = ('top', 'bottom', 'keep')
280+
for dtype, na_value, pos_inf, neg_inf in dtypes:
281+
in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk
282+
iseries = Series(in_arr, dtype=dtype)
283+
for method, na_opt in product(exp_ranks.keys(), na_options):
284+
ranks = exp_ranks[method]
285+
if (dtype, method) in disabled:
286+
continue
287+
if na_opt == 'top':
288+
order = ranks[1] + ranks[0] + ranks[2]
289+
elif na_opt == 'bottom':
290+
order = ranks[0] + ranks[2] + ranks[1]
291+
else:
292+
order = ranks[0] + [np.nan] * chunk + ranks[1]
293+
_check(iseries, order, method, na_opt)
294+
228295
def test_rank_methods_series(self):
229296
pytest.importorskip('scipy.stats.special')
230297
rankdata = pytest.importorskip('scipy.stats.rankdata')

0 commit comments

Comments
 (0)