Skip to content

Commit 330fb57

Browse files
peterpanmjjreback
authored andcommitted
BUG: rank with +-inf, #6945 (#17903)
1 parent 19ce05e commit 330fb57

File tree

5 files changed

+233
-46
lines changed

5 files changed

+233
-46
lines changed

doc/source/whatsnew/v0.22.0.txt

+86-1
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,91 @@ levels <merging.merge_on_columns_and_levels>` documentation section.
6363
left.merge(right, on=['key1', 'key2'])
6464

6565

66+
.. _whatsnew_0220.enhancements.ran_inf:
67+
68+
``rank`` handles ``inf`` values when ``NaN`` is present
69+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
70+
71+
In previous versions, ``.rank()`` would assign ``inf`` elements ``NaN`` as their ranks. Now ranks are calculated properly. (:issue:`6945`)
72+
73+
.. ipython:: python
74+
75+
In [9]: s = pd.Series([-np.inf, 0, 1, np.nan, np.inf])
76+
77+
In [10]: s
78+
Out[10]:
79+
0 -inf
80+
1 0.000000
81+
2 1.000000
82+
3 NaN
83+
4 inf
84+
dtype: float64
85+
86+
Previous Behavior:
87+
88+
.. code-block:: ipython
89+
90+
In [11]: s.rank()
91+
Out[11]:
92+
0 1.0
93+
1 2.0
94+
2 3.0
95+
3 NaN
96+
4 NaN
97+
dtype: float64
98+
99+
Current Behavior:
100+
101+
.. ipython:: python
102+
103+
In [4]: s.rank()
104+
Out[4]:
105+
0 1.0
106+
1 2.0
107+
2 3.0
108+
3 NaN
109+
4 4.0
110+
dtype: float64
111+
112+
Furthermore, previously if you ranked ``inf`` or ``-inf`` values together with ``NaN`` values, the calculation would not distinguish ``NaN`` from infinity when using the 'top' or 'bottom' value of the ``na_option`` argument.
113+
114+
.. ipython:: python
115+
116+
In [14]: s = pd.Series([np.nan, np.nan, -np.inf, -np.inf])
117+
118+
In [15]: s
119+
Out[15]:
120+
0 NaN
121+
1 NaN
122+
2 -inf
123+
3 -inf
124+
dtype: float64
125+
126+
Previous Behavior:
127+
128+
.. code-block:: ipython
129+
130+
In [15]: s.rank(na_option='top')
131+
Out[15]:
132+
0 2.5
133+
1 2.5
134+
2 2.5
135+
3 2.5
136+
dtype: float64
137+
138+
Current Behavior:
139+
140+
.. ipython:: python
141+
142+
In [4]: s.rank(na_option='top')
143+
Out[4]:
144+
0 1.5
145+
1 1.5
146+
2 3.5
147+
3 3.5
148+
dtype: float64
149+
150+
66151
.. _whatsnew_0220.enhancements.other:
67152

68153
Other Enhancements
@@ -79,6 +164,7 @@ Other Enhancements
79164
- Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`)
80165
- :func:`DataFrame.corrwith` now silently drops non-numeric columns when passed a Series. Before, an exception was raised (:issue:`18570`).
81166

167+
82168
.. _whatsnew_0220.api_breaking:
83169

84170
Backwards incompatible API changes
@@ -241,7 +327,6 @@ Reshaping
241327
^^^^^^^^^
242328

243329
- Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`)
244-
245330
-
246331
-
247332

pandas/_libs/algos.pyx

+12-10
Original file line numberDiff line numberDiff line change
@@ -64,22 +64,24 @@ class Infinity(object):
6464
""" provide a positive Infinity comparison method for ranking """
6565

6666
__lt__ = lambda self, other: False
67-
__le__ = lambda self, other: self is other
68-
__eq__ = lambda self, other: self is other
69-
__ne__ = lambda self, other: self is not other
70-
__gt__ = lambda self, other: self is not other
71-
__ge__ = lambda self, other: True
67+
__le__ = lambda self, other: isinstance(other, Infinity)
68+
__eq__ = lambda self, other: isinstance(other, Infinity)
69+
__ne__ = lambda self, other: not isinstance(other, Infinity)
70+
__gt__ = lambda self, other: (not isinstance(other, Infinity) and
71+
not missing.checknull(other))
72+
__ge__ = lambda self, other: not missing.checknull(other)
7273

7374

7475
class NegInfinity(object):
7576
""" provide a negative Infinity comparison method for ranking """
7677

77-
__lt__ = lambda self, other: self is not other
78-
__le__ = lambda self, other: True
79-
__eq__ = lambda self, other: self is other
80-
__ne__ = lambda self, other: self is not other
78+
__lt__ = lambda self, other: (not isinstance(other, NegInfinity) and
79+
not missing.checknull(other))
80+
__le__ = lambda self, other: not missing.checknull(other)
81+
__eq__ = lambda self, other: isinstance(other, NegInfinity)
82+
__ne__ = lambda self, other: not isinstance(other, NegInfinity)
8183
__gt__ = lambda self, other: False
82-
__ge__ = lambda self, other: self is other
84+
__ge__ = lambda self, other: isinstance(other, NegInfinity)
8385

8486

8587
@cython.wraparound(False)

pandas/_libs/algos_rank_helper.pxi.in

+37-27
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'),
2727
{{if dtype == 'object'}}
2828

2929

30-
def rank_1d_{{dtype}}(object in_arr, bint retry=1, ties_method='average',
30+
def rank_1d_{{dtype}}(object in_arr, ties_method='average',
3131
ascending=True, na_option='keep', pct=False):
3232
{{else}}
3333

@@ -40,7 +40,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
4040
"""
4141

4242
cdef:
43-
Py_ssize_t i, j, n, dups = 0, total_tie_count = 0
43+
Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0
4444

4545
{{if dtype == 'object'}}
4646
ndarray sorted_data, values
@@ -50,6 +50,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
5050

5151
ndarray[float64_t] ranks
5252
ndarray[int64_t] argsorted
53+
ndarray[np.uint8_t, cast=True] sorted_mask
5354

5455
{{if dtype == 'uint64'}}
5556
{{ctype}} val
@@ -60,6 +61,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
6061
float64_t sum_ranks = 0
6162
int tiebreak = 0
6263
bint keep_na = 0
64+
bint isnan
6365
float count = 0.0
6466
tiebreak = tiebreakers[ties_method]
6567

@@ -76,12 +78,6 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
7678

7779
keep_na = na_option == 'keep'
7880

79-
{{if dtype != 'uint64'}}
80-
if ascending ^ (na_option == 'top'):
81-
nan_value = {{pos_nan_value}}
82-
else:
83-
nan_value = {{neg_nan_value}}
84-
8581
{{if dtype == 'object'}}
8682
mask = missing.isnaobj(values)
8783
{{elif dtype == 'float64'}}
@@ -90,56 +86,69 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
9086
mask = values == iNaT
9187
{{endif}}
9288

89+
# double sort first by mask and then by values to ensure nan values are
90+
# either at the beginning or the end. mask/(~mask) controls padding at
91+
# tail or the head
92+
{{if dtype != 'uint64'}}
93+
if ascending ^ (na_option == 'top'):
94+
nan_value = {{pos_nan_value}}
95+
order = (values, mask)
96+
else:
97+
nan_value = {{neg_nan_value}}
98+
order = (values, ~mask)
9399
np.putmask(values, mask, nan_value)
100+
{{else}}
101+
mask = np.zeros(shape=len(values), dtype=bool)
102+
order = (values, mask)
94103
{{endif}}
95104

96105
n = len(values)
97106
ranks = np.empty(n, dtype='f8')
98107

99108
{{if dtype == 'object'}}
109+
100110
try:
101-
_as = values.argsort()
111+
_as = np.lexsort(keys=order)
102112
except TypeError:
103-
if not retry:
104-
raise
105-
106-
valid_locs = (~mask).nonzero()[0]
107-
ranks.put(valid_locs, rank_1d_object(values.take(valid_locs), 0,
108-
ties_method=ties_method,
109-
ascending=ascending))
110-
np.putmask(ranks, mask, np.nan)
111-
return ranks
113+
# lexsort on object array will raise TypeError for numpy version
114+
# earlier than 1.11.0. Use argsort with order argument instead.
115+
_dt = [('values', 'O'), ('mask', '?')]
116+
_values = np.asarray(list(zip(order[0], order[1])), dtype=_dt)
117+
_as = np.argsort(_values, kind='mergesort', order=('mask', 'values'))
112118
{{else}}
113119
if tiebreak == TIEBREAK_FIRST:
114120
# need to use a stable sort here
115-
_as = values.argsort(kind='mergesort')
121+
_as = np.lexsort(keys=order)
116122
if not ascending:
117123
tiebreak = TIEBREAK_FIRST_DESCENDING
118124
else:
119-
_as = values.argsort()
125+
_as = np.lexsort(keys=order)
120126
{{endif}}
121127

122128
if not ascending:
123129
_as = _as[::-1]
124130

125131
sorted_data = values.take(_as)
132+
sorted_mask = mask.take(_as)
133+
_indices = order[1].take(_as).nonzero()[0]
134+
non_na_idx = _indices[0] if len(_indices) > 0 else -1
126135
argsorted = _as.astype('i8')
127136

128137
{{if dtype == 'object'}}
129138
for i in range(n):
130139
sum_ranks += i + 1
131140
dups += 1
132-
141+
isnan = sorted_mask[i]
133142
val = util.get_value_at(sorted_data, i)
134143

135-
if (val is nan_value) and keep_na:
144+
if isnan and keep_na:
136145
ranks[argsorted[i]] = nan
137146
continue
138-
139147
count += 1.0
140148

141149
if (i == n - 1 or
142-
are_diff(util.get_value_at(sorted_data, i + 1), val)):
150+
are_diff(util.get_value_at(sorted_data, i + 1), val) or
151+
i == non_na_idx - 1):
143152
if tiebreak == TIEBREAK_AVERAGE:
144153
for j in range(i - dups + 1, i + 1):
145154
ranks[argsorted[j]] = sum_ranks / dups
@@ -164,18 +173,19 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
164173
for i in range(n):
165174
sum_ranks += i + 1
166175
dups += 1
167-
168176
val = sorted_data[i]
169177

170178
{{if dtype != 'uint64'}}
171-
if (val == nan_value) and keep_na:
179+
isnan = sorted_mask[i]
180+
if isnan and keep_na:
172181
ranks[argsorted[i]] = nan
173182
continue
174183
{{endif}}
175184

176185
count += 1.0
177186

178-
if i == n - 1 or sorted_data[i + 1] != val:
187+
if (i == n - 1 or sorted_data[i + 1] != val or
188+
i == non_na_idx - 1):
179189
if tiebreak == TIEBREAK_AVERAGE:
180190
for j in range(i - dups + 1, i + 1):
181191
ranks[argsorted[j]] = sum_ranks / dups

0 commit comments

Comments
 (0)