-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: Fix problems in group rank when both nans and infinity are present #20561 #20681
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
aa63df3
feaccd4
89341d8
6eb1d8f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -417,25 +417,33 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, | |||||||||||||||||||||
ndarray[int64_t] labels, | ||||||||||||||||||||||
bint is_datetimelike, object ties_method, | ||||||||||||||||||||||
bint ascending, bint pct, object na_option): | ||||||||||||||||||||||
"""Provides the rank of values within each group | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
Provides the rank of values within each group. | ||||||||||||||||||||||
|
||||||||||||||||||||||
Parameters | ||||||||||||||||||||||
---------- | ||||||||||||||||||||||
out : array of float64_t values which this method will write its results to | ||||||||||||||||||||||
values : array of {{c_type}} values to be ranked | ||||||||||||||||||||||
labels : array containing unique label for each group, with its ordering | ||||||||||||||||||||||
matching up to the corresponding record in `values` | ||||||||||||||||||||||
is_datetimelike : bool | ||||||||||||||||||||||
is_datetimelike : bool, default False | ||||||||||||||||||||||
unused in this method but provided for call compatibility with other | ||||||||||||||||||||||
Cython transformations | ||||||||||||||||||||||
ties_method : {'keep', 'top', 'bottom'} | ||||||||||||||||||||||
ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' | ||||||||||||||||||||||
* average: average rank of group | ||||||||||||||||||||||
* min: lowest rank in group | ||||||||||||||||||||||
* max: highest rank in group | ||||||||||||||||||||||
* first: ranks assigned in order they appear in the array | ||||||||||||||||||||||
* dense: like 'min', but rank always increases by 1 between groups | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is tough to describe in one line so I'm not sure of the best way but I think it can be improved by simply changing "groups" to "values" There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. pandas/pandas/core/groupby/groupby.py Lines 1848 to 1857 in 5edc5c4
Yes, I agree. It is hard to describe those methods. So I copied from there to save some time. Btw, there are some typos there too. I've raise another issue #20694. I think we should come up with something consistent for both places. |
||||||||||||||||||||||
ascending : boolean, default True | ||||||||||||||||||||||
False for ranks by high (1) to low (N) | ||||||||||||||||||||||
na_option : {'keep', 'top', 'bottom'}, default 'keep' | ||||||||||||||||||||||
pct : boolean, default False | ||||||||||||||||||||||
Compute percentage rank of data within each group | ||||||||||||||||||||||
na_option : {'keep', 'top', 'bottom'}, default 'keep' | ||||||||||||||||||||||
* keep: leave NA values where they are | ||||||||||||||||||||||
* top: smallest rank if ascending | ||||||||||||||||||||||
* bottom: smallest rank if descending | ||||||||||||||||||||||
ascending : boolean | ||||||||||||||||||||||
False for ranks by high (1) to low (N) | ||||||||||||||||||||||
pct : boolean | ||||||||||||||||||||||
Compute percentage rank of data within each group | ||||||||||||||||||||||
|
||||||||||||||||||||||
Notes | ||||||||||||||||||||||
----- | ||||||||||||||||||||||
|
@@ -508,7 +516,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, | |||||||||||||||||||||
|
||||||||||||||||||||||
# if keep_na, check for missing values and assign back | ||||||||||||||||||||||
# to the result where appropriate | ||||||||||||||||||||||
if keep_na and masked_vals[_as[i]] == nan_fill_val: | ||||||||||||||||||||||
|
||||||||||||||||||||||
if keep_na and mask[_as[i]]: | ||||||||||||||||||||||
grp_na_count += 1 | ||||||||||||||||||||||
out[_as[i], 0] = nan | ||||||||||||||||||||||
else: | ||||||||||||||||||||||
|
@@ -548,9 +557,9 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, | |||||||||||||||||||||
# reset the dups and sum_ranks, knowing that a new value is coming | ||||||||||||||||||||||
# up. the conditional also needs to handle nan equality and the | ||||||||||||||||||||||
# end of iteration | ||||||||||||||||||||||
if (i == N - 1 or ( | ||||||||||||||||||||||
(masked_vals[_as[i]] != masked_vals[_as[i+1]]) and not | ||||||||||||||||||||||
(mask[_as[i]] and mask[_as[i+1]]))): | ||||||||||||||||||||||
if (i == N - 1 or | ||||||||||||||||||||||
(masked_vals[_as[i]] != masked_vals[_as[i+1]]) or | ||||||||||||||||||||||
(mask[_as[i]] ^ mask[_as[i+1]])): | ||||||||||||||||||||||
dups = sum_ranks = 0 | ||||||||||||||||||||||
val_start = i | ||||||||||||||||||||||
grp_vals_seen += 1 | ||||||||||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1965,6 +1965,55 @@ def test_rank_args(self, grps, vals, ties_method, ascending, pct, exp): | |
exp_df = DataFrame(exp * len(grps), columns=['val']) | ||
assert_frame_equal(result, exp_df) | ||
|
||
@pytest.mark.parametrize("grps", [ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also happy to have in another PR, move all of the rank tests to test_functional (you can do it here as well). We may want to move other things too, so maybe new PR. test_groupby is getting large. |
||
['qux'], ['qux', 'quux']]) | ||
@pytest.mark.parametrize("vals", [ | ||
[-np.inf, -np.inf, np.nan, 1., np.nan, np.inf, np.inf], | ||
]) | ||
@pytest.mark.parametrize("ties_method,ascending,na_option,exp", [ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To err on the side of caution is it possible to test percentage display here as well? The other tests appear to do so There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You are right. I just found out when pct is true, and ties_method is "dense". The ranks are not calculated as expected. ( with and without inf/nan)
The expected output should be
Maybe another PR ? Or fix it here? It is similar to #15639 @jreback There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If it's broken even without There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
('average', True, 'keep', [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]), | ||
('average', True, 'top', [3.5, 3.5, 1.5, 5., 1.5, 6.5, 6.5]), | ||
('average', True, 'bottom', [1.5, 1.5, 6.5, 3., 6.5, 4.5, 4.5]), | ||
('average', False, 'keep', [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]), | ||
('average', False, 'top', [6.5, 6.5, 1.5, 5., 1.5, 3.5, 3.5]), | ||
('average', False, 'bottom', [4.5, 4.5, 6.5, 3., 6.5, 1.5, 1.5]), | ||
('min', True, 'keep', [1., 1., np.nan, 3., np.nan, 4., 4.]), | ||
('min', True, 'top', [3., 3., 1., 5., 1., 6., 6.]), | ||
('min', True, 'bottom', [1., 1., 6., 3., 6., 4., 4.]), | ||
('min', False, 'keep', [4., 4., np.nan, 3., np.nan, 1., 1.]), | ||
('min', False, 'top', [6., 6., 1., 5., 1., 3., 3.]), | ||
('min', False, 'bottom', [4., 4., 6., 3., 6., 1., 1.]), | ||
('max', True, 'keep', [2., 2., np.nan, 3., np.nan, 5., 5.]), | ||
('max', True, 'top', [4., 4., 2., 5., 2., 7., 7.]), | ||
('max', True, 'bottom', [2., 2., 7., 3., 7., 5., 5.]), | ||
('max', False, 'keep', [5., 5., np.nan, 3., np.nan, 2., 2.]), | ||
('max', False, 'top', [7., 7., 2., 5., 2., 4., 4.]), | ||
('max', False, 'bottom', [5., 5., 7., 3., 7., 2., 2.]), | ||
('first', True, 'keep', [1., 2., np.nan, 3., np.nan, 4., 5.]), | ||
('first', True, 'top', [3., 4., 1., 5., 2., 6., 7.]), | ||
('first', True, 'bottom', [1., 2., 6., 3., 7., 4., 5.]), | ||
('first', False, 'keep', [4., 5., np.nan, 3., np.nan, 1., 2.]), | ||
('first', False, 'top', [6., 7., 1., 5., 2., 3., 4.]), | ||
('first', False, 'bottom', [4., 5., 6., 3., 7., 1., 2.]), | ||
('dense', True, 'keep', [1., 1., np.nan, 2., np.nan, 3., 3.]), | ||
('dense', True, 'top', [2., 2., 1., 3., 1., 4., 4.]), | ||
('dense', True, 'bottom', [1., 1., 4., 2., 4., 3., 3.]), | ||
('dense', False, 'keep', [3., 3., np.nan, 2., np.nan, 1., 1.]), | ||
('dense', False, 'top', [4., 4., 1., 3., 1., 2., 2.]), | ||
('dense', False, 'bottom', [3., 3., 4., 2., 4., 1., 1.]) | ||
]) | ||
def test_infs_n_nans(self, grps, vals, ties_method, ascending, na_option, | ||
exp): | ||
# GH 20561 | ||
key = np.repeat(grps, len(vals)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add the issue number |
||
vals = vals * len(grps) | ||
df = DataFrame({'key': key, 'val': vals}) | ||
result = df.groupby('key').rank(method=ties_method, | ||
ascending=ascending, | ||
na_option=na_option) | ||
exp_df = DataFrame(exp * len(grps), columns=['val']) | ||
assert_frame_equal(result, exp_df) | ||
|
||
@pytest.mark.parametrize("grps", [ | ||
['qux'], ['qux', 'quux']]) | ||
@pytest.mark.parametrize("vals", [ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps clearer to say
average rank of tied values
- saying "group" can be confused with the larger Group object. Similar change needed for below points