Skip to content

Commit 387d485

Browse files
authored
BUG: DataFrame.rank with np.inf and np.nan (#38681)
1 parent ee77500 commit 387d485

File tree

3 files changed

+140
-27
lines changed

3 files changed

+140
-27
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@ Numeric
212212
- Bug in :meth:`DataFrame.quantile`, :meth:`DataFrame.sort_values` causing incorrect subsequent indexing behavior (:issue:`38351`)
213213
- Bug in :meth:`DataFrame.select_dtypes` with ``include=np.number`` now retains numeric ``ExtensionDtype`` columns (:issue:`35340`)
214214
- Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`)
215+
- Bug in :meth:`DataFrame.rank` returning incorrect ranks for data containing ``np.inf`` or a mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`)
215216

216217
Conversion
217218
^^^^^^^^^^

pandas/_libs/algos.pyx

+24-27
Original file line numberDiff line numberDiff line change
@@ -1022,16 +1022,19 @@ def rank_2d(
10221022
ndarray[float64_t, ndim=2] ranks
10231023
ndarray[rank_t, ndim=2] values
10241024
ndarray[int64_t, ndim=2] argsorted
1025+
ndarray[uint8_t, ndim=2] mask
10251026
rank_t val, nan_value
10261027
float64_t sum_ranks = 0
10271028
int tiebreak = 0
1029+
int64_t idx
10281030
bint keep_na = False
10291031
float64_t count = 0.0
1030-
bint condition, skip_condition
1032+
bint condition, check_mask
10311033

10321034
tiebreak = tiebreakers[ties_method]
10331035

10341036
keep_na = na_option == 'keep'
1037+
check_mask = rank_t is not uint64_t
10351038

10361039
if axis == 0:
10371040
values = np.asarray(in_arr).T.copy()
@@ -1067,6 +1070,8 @@ def rank_2d(
10671070
mask = values == NPY_NAT
10681071

10691072
np.putmask(values, mask, nan_value)
1073+
else:
1074+
mask = np.zeros_like(values, dtype=bool)
10701075

10711076
n, k = (<object>values).shape
10721077
ranks = np.empty((n, k), dtype='f8')
@@ -1099,43 +1104,35 @@ def rank_2d(
10991104
argsorted = _as.astype('i8')
11001105

11011106
for i in range(n):
1102-
if rank_t is object:
1103-
dups = sum_ranks = infs = 0
1104-
else:
1105-
dups = sum_ranks = 0
1107+
dups = sum_ranks = infs = 0
11061108

11071109
total_tie_count = 0
11081110
count = 0.0
11091111
for j in range(k):
1110-
if rank_t is not object:
1111-
sum_ranks += j + 1
1112-
dups += 1
1113-
11141112
val = values[i, j]
1115-
1116-
if rank_t is not uint64_t:
1117-
if rank_t is object:
1118-
skip_condition = (val is nan_value) and keep_na
1119-
else:
1120-
skip_condition = (val == nan_value) and keep_na
1121-
if skip_condition:
1122-
ranks[i, argsorted[i, j]] = NaN
1123-
1124-
if rank_t is object:
1125-
infs += 1
1126-
1127-
continue
1113+
idx = argsorted[i, j]
1114+
if keep_na and check_mask and mask[i, idx]:
1115+
ranks[i, idx] = NaN
1116+
infs += 1
1117+
continue
11281118

11291119
count += 1.0
11301120

1131-
if rank_t is object:
1132-
sum_ranks += (j - infs) + 1
1133-
dups += 1
1121+
sum_ranks += (j - infs) + 1
1122+
dups += 1
11341123

11351124
if rank_t is object:
1136-
condition = j == k - 1 or are_diff(values[i, j + 1], val)
1125+
condition = (
1126+
j == k - 1 or
1127+
are_diff(values[i, j + 1], val) or
1128+
(keep_na and check_mask and mask[i, argsorted[i, j + 1]])
1129+
)
11371130
else:
1138-
condition = j == k - 1 or values[i, j + 1] != val
1131+
condition = (
1132+
j == k - 1 or
1133+
values[i, j + 1] != val or
1134+
(keep_na and check_mask and mask[i, argsorted[i, j + 1]])
1135+
)
11391136

11401137
if condition:
11411138
if tiebreak == TIEBREAK_AVERAGE:

pandas/tests/frame/methods/test_rank.py

+115
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import numpy as np
44
import pytest
55

6+
from pandas._libs import iNaT
7+
from pandas._libs.algos import Infinity, NegInfinity
68
import pandas.util._test_decorators as td
79

810
from pandas import DataFrame, Series
@@ -329,3 +331,116 @@ def test_pct_max_many_rows(self):
329331
)
330332
result = df.rank(pct=True).max()
331333
assert (result == 1).all()
334+
335+
@pytest.mark.parametrize(
336+
"contents,dtype",
337+
[
338+
(
339+
[
340+
-np.inf,
341+
-50,
342+
-1,
343+
-1e-20,
344+
-1e-25,
345+
-1e-50,
346+
0,
347+
1e-40,
348+
1e-20,
349+
1e-10,
350+
2,
351+
40,
352+
np.inf,
353+
],
354+
"float64",
355+
),
356+
(
357+
[
358+
-np.inf,
359+
-50,
360+
-1,
361+
-1e-20,
362+
-1e-25,
363+
-1e-45,
364+
0,
365+
1e-40,
366+
1e-20,
367+
1e-10,
368+
2,
369+
40,
370+
np.inf,
371+
],
372+
"float32",
373+
),
374+
([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"),
375+
pytest.param(
376+
[
377+
np.iinfo(np.int64).min,
378+
-100,
379+
0,
380+
1,
381+
9999,
382+
100000,
383+
1e10,
384+
np.iinfo(np.int64).max,
385+
],
386+
"int64",
387+
marks=pytest.mark.xfail(
388+
reason="iNaT is equivalent to minimum value of dtype"
389+
"int64 pending issue GH#16674"
390+
),
391+
),
392+
([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"),
393+
],
394+
)
395+
def test_rank_inf_and_nan(self, contents, dtype):
396+
dtype_na_map = {
397+
"float64": np.nan,
398+
"float32": np.nan,
399+
"int64": iNaT,
400+
"object": None,
401+
}
402+
# Insert nans at random positions if underlying dtype has missing
403+
# value. Then adjust the expected order by adding nans accordingly.
404+
# This is for testing whether rank calculation is affected
405+
# when values are intertwined with nan values.
406+
values = np.array(contents, dtype=dtype)
407+
exp_order = np.array(range(len(values)), dtype="float64") + 1.0
408+
if dtype in dtype_na_map:
409+
na_value = dtype_na_map[dtype]
410+
nan_indices = np.random.choice(range(len(values)), 5)
411+
values = np.insert(values, nan_indices, na_value)
412+
exp_order = np.insert(exp_order, nan_indices, np.nan)
413+
# shuffle the testing array and expected results in the same way
414+
random_order = np.random.permutation(len(values))
415+
df = DataFrame({"a": values[random_order]})
416+
expected = DataFrame({"a": exp_order[random_order]}, dtype="float64")
417+
result = df.rank()
418+
tm.assert_frame_equal(result, expected)
419+
420+
def test_df_series_inf_nan_consistency(self):
421+
# GH#32593
422+
index = [5, 4, 3, 2, 1, 6, 7, 8, 9, 10]
423+
col1 = [5, 4, 3, 5, 8, 5, 2, 1, 6, 6]
424+
col2 = [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf]
425+
df = DataFrame(
426+
data={
427+
"col1": col1,
428+
"col2": col2,
429+
},
430+
index=index,
431+
dtype="f8",
432+
)
433+
df_result = df.rank()
434+
435+
series_result = df.copy()
436+
series_result["col1"] = df["col1"].rank()
437+
series_result["col2"] = df["col2"].rank()
438+
439+
tm.assert_frame_equal(df_result, series_result)
440+
441+
def test_rank_both_inf(self):
442+
# GH#32593
443+
df = DataFrame({"a": [-np.inf, 0, np.inf]})
444+
expected = DataFrame({"a": [1.0, 2.0, 3.0]})
445+
result = df.rank()
446+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)