-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: DataFrame.rank with np.inf and np.nan #38681
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
975de84
fd358b1
256ff84
f7bc058
39df107
7abd460
ee8b95d
34bbcb9
b722d19
acbc2bd
9d6ee7f
36884e6
e60aaf0
732c482
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1067,6 +1067,8 @@ def rank_2d( | |
mask = values == NPY_NAT | ||
|
||
np.putmask(values, mask, nan_value) | ||
else: | ||
mask = np.zeros_like(values, dtype=bool) | ||
|
||
n, k = (<object>values).shape | ||
ranks = np.empty((n, k), dtype='f8') | ||
|
@@ -1099,43 +1101,35 @@ def rank_2d( | |
argsorted = _as.astype('i8') | ||
|
||
for i in range(n): | ||
if rank_t is object: | ||
dups = sum_ranks = infs = 0 | ||
else: | ||
dups = sum_ranks = 0 | ||
dups = sum_ranks = infs = 0 | ||
|
||
total_tie_count = 0 | ||
count = 0.0 | ||
for j in range(k): | ||
if rank_t is not object: | ||
sum_ranks += j + 1 | ||
dups += 1 | ||
|
||
val = values[i, j] | ||
|
||
if rank_t is not uint64_t: | ||
if rank_t is object: | ||
skip_condition = (val is nan_value) and keep_na | ||
else: | ||
skip_condition = (val == nan_value) and keep_na | ||
if skip_condition: | ||
ranks[i, argsorted[i, j]] = NaN | ||
|
||
if rank_t is object: | ||
infs += 1 | ||
|
||
continue | ||
if mask[i, argsorted[i, j]] and keep_na: | ||
ranks[i, argsorted[i, j]] = NaN | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could define There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks |
||
infs += 1 | ||
continue | ||
|
||
count += 1.0 | ||
|
||
if rank_t is object: | ||
sum_ranks += (j - infs) + 1 | ||
dups += 1 | ||
sum_ranks += (j - infs) + 1 | ||
dups += 1 | ||
|
||
if rank_t is object: | ||
condition = j == k - 1 or are_diff(values[i, j + 1], val) | ||
condition = ( | ||
j == k - 1 or | ||
are_diff(values[i, j + 1], val) or | ||
(mask[i, argsorted[i, j + 1]] and keep_na) | ||
) | ||
else: | ||
condition = j == k - 1 or values[i, j + 1] != val | ||
condition = ( | ||
j == k - 1 or | ||
values[i, j + 1] != val or | ||
(mask[i, argsorted[i, j + 1]] and keep_na) | ||
) | ||
|
||
if condition: | ||
if tiebreak == TIEBREAK_AVERAGE: | ||
|
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -3,6 +3,8 @@ | |||
import numpy as np | ||||
import pytest | ||||
|
||||
from pandas._libs import iNaT | ||||
from pandas._libs.algos import Infinity, NegInfinity | ||||
import pandas.util._test_decorators as td | ||||
|
||||
from pandas import DataFrame, Series | ||||
|
@@ -329,3 +331,116 @@ def test_pct_max_many_rows(self): | |||
) | ||||
result = df.rank(pct=True).max() | ||||
assert (result == 1).all() | ||||
|
||||
@pytest.mark.parametrize( | ||||
"contents,dtype", | ||||
[ | ||||
( | ||||
[ | ||||
-np.inf, | ||||
-50, | ||||
-1, | ||||
-1e-20, | ||||
-1e-25, | ||||
-1e-50, | ||||
0, | ||||
1e-40, | ||||
1e-20, | ||||
1e-10, | ||||
2, | ||||
40, | ||||
np.inf, | ||||
], | ||||
"float64", | ||||
), | ||||
( | ||||
[ | ||||
-np.inf, | ||||
-50, | ||||
-1, | ||||
-1e-20, | ||||
-1e-25, | ||||
-1e-45, | ||||
0, | ||||
1e-40, | ||||
1e-20, | ||||
1e-10, | ||||
2, | ||||
40, | ||||
np.inf, | ||||
], | ||||
"float32", | ||||
), | ||||
([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"), | ||||
pytest.param( | ||||
[ | ||||
np.iinfo(np.int64).min, | ||||
-100, | ||||
0, | ||||
1, | ||||
9999, | ||||
100000, | ||||
1e10, | ||||
np.iinfo(np.int64).max, | ||||
], | ||||
"int64", | ||||
marks=pytest.mark.xfail( | ||||
reason="iNaT is equivalent to minimum value of dtype" | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. may need to pass a datetimelike keyword to rank; pretty sure we do this for other functions in that file There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep makes sense, saw your comment on that issue. Handling this consistently should be easier after #38744 |
||||
"int64 pending issue GH#16674" | ||||
), | ||||
), | ||||
([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"), | ||||
], | ||||
) | ||||
def test_rank_inf_and_nan(self, contents, dtype): | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mostly duplicated from
Could deduplicate some of this into a fixture in a followup if that makes sense There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that would make sense - besides, the |
||||
dtype_na_map = { | ||||
"float64": np.nan, | ||||
"float32": np.nan, | ||||
"int64": iNaT, | ||||
"object": None, | ||||
} | ||||
# Insert nans at random positions if underlying dtype has missing | ||||
# value. Then adjust the expected order by adding nans accordingly | ||||
# This is for testing whether rank calculation is affected | ||||
# when values are interwined with nan values. | ||||
values = np.array(contents, dtype=dtype) | ||||
exp_order = np.array(range(len(values)), dtype="float64") + 1.0 | ||||
if dtype in dtype_na_map: | ||||
na_value = dtype_na_map[dtype] | ||||
nan_indices = np.random.choice(range(len(values)), 5) | ||||
values = np.insert(values, nan_indices, na_value) | ||||
exp_order = np.insert(exp_order, nan_indices, np.nan) | ||||
# shuffle the testing array and expected results in the same way | ||||
random_order = np.random.permutation(len(values)) | ||||
df = DataFrame({"a": values[random_order]}) | ||||
expected = DataFrame({"a": exp_order[random_order]}, dtype="float64") | ||||
result = df.rank() | ||||
tm.assert_frame_equal(result, expected) | ||||
|
||||
def test_df_series_inf_nan_consistency(self): | ||||
# GH#32593 | ||||
index = [5, 4, 3, 2, 1, 6, 7, 8, 9, 10] | ||||
col1 = [5, 4, 3, 5, 8, 5, 2, 1, 6, 6] | ||||
col2 = [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf] | ||||
df = DataFrame( | ||||
index=index, | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nitpick, can index go after data There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||||
data={ | ||||
"col1": col1, | ||||
"col2": col2, | ||||
}, | ||||
dtype="f8", | ||||
) | ||||
df_result = df.rank() | ||||
|
||||
series_result = df.copy() | ||||
series_result["col1"] = df["col1"].rank() | ||||
series_result["col2"] = df["col2"].rank() | ||||
|
||||
tm.assert_frame_equal(df_result, series_result) | ||||
|
||||
def test_rank_both_inf(self): | ||||
# GH#32593 | ||||
df = DataFrame({"a": [-np.inf, 0, np.inf]}) | ||||
expected = DataFrame({"a": [1.0, 2.0, 3.0]}) | ||||
result = df.rank() | ||||
tm.assert_frame_equal(result, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i expect we'll see a perf hit on non-float dtypes because of mask[...] lookups. is that effect small?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for bringing this up - good lesson for me to always check perf. Turns out
mask
wasn't typed, so perf got way worse. With mask typed, looks like a small effect (but not negligible, though hard to tell with variation between successive runs). Using the added benchmark from #38744:On a larger (probably more stable example):
(with a second run looking better):
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also changed if conditions to short-circuit to avoid some mask lookups