-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: groupby rank computing incorrect percentiles #40575
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 20 commits
2364330
999d880
d360871
0aaeee7
fe6495a
8fae616
86e736b
f9479e3
a2bea3d
ca45958
a6fd0a5
8a7b535
0d55593
7108e05
ba5dc7c
680a888
49127f3
f6a04b7
e486ca7
268abd0
aecab5f
09860e3
8eb56cf
57e6697
f925124
8c59323
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -946,22 +946,28 @@ def rank_1d( | |
cdef: | ||
TiebreakEnumType tiebreak | ||
Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0 | ||
Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 | ||
Py_ssize_t grp_vals_seen=1, grp_na_count=0 | ||
ndarray[int64_t, ndim=1] lexsort_indexer | ||
ndarray[float64_t, ndim=1] grp_sizes, out | ||
ndarray[rank_t, ndim=1] masked_vals | ||
ndarray[uint8_t, ndim=1] mask | ||
bint keep_na, at_end, next_val_diff, check_labels | ||
bint keep_na, at_end, next_val_diff, check_labels, group_changed | ||
rank_t nan_fill_val | ||
float64_t grp_size | ||
|
||
tiebreak = tiebreakers[ties_method] | ||
if tiebreak == TIEBREAK_FIRST: | ||
if not ascending: | ||
tiebreak = TIEBREAK_FIRST_DESCENDING | ||
|
||
keep_na = na_option == 'keep' | ||
|
||
N = len(values) | ||
# TODO Cython 3.0: cast won't be necessary (#2992) | ||
assert <Py_ssize_t>len(labels) == N | ||
out = np.empty(N) | ||
grp_sizes = np.ones(N) | ||
|
||
# If all 0 labels, can short-circuit later label | ||
# comparisons | ||
check_labels = np.any(labels) | ||
|
@@ -983,6 +989,9 @@ def rank_1d( | |
else: | ||
mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) | ||
|
||
# If ascending and na_option == 'bottom' or descending and | ||
# na_option == 'top' -> we want to rank NaN as the highest | ||
# so fill with the maximum value for the type | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just want to highlight #40016 in case relevant here for consistency There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for linking that, I was confused by the documentation as well haha. Will adjust this comment in #40546 |
||
if ascending ^ (na_option == 'top'): | ||
if rank_t is object: | ||
nan_fill_val = Infinity() | ||
|
@@ -993,6 +1002,8 @@ def rank_1d( | |
else: | ||
nan_fill_val = np.inf | ||
order = (masked_vals, mask, labels) | ||
|
||
# Otherwise, fill with the lowest value of the type | ||
else: | ||
if rank_t is object: | ||
nan_fill_val = NegInfinity() | ||
|
@@ -1025,36 +1036,36 @@ def rank_1d( | |
if rank_t is object: | ||
for i in range(N): | ||
at_end = i == N - 1 | ||
|
||
# dups and sum_ranks will be incremented each loop where | ||
# the value / group remains the same, and should be reset | ||
# when either of those change | ||
# Used to calculate tiebreakers | ||
# when either of those change. Used to calculate tiebreakers | ||
dups += 1 | ||
sum_ranks += i - grp_start + 1 | ||
|
||
next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] != | ||
masked_vals[lexsort_indexer[i+1]]) | ||
|
||
# We'll need this check later anyway to determine group size, so just | ||
# compute it here since shortcircuiting won't help | ||
group_changed = at_end or (check_labels and | ||
(labels[lexsort_indexer[i]] | ||
!= labels[lexsort_indexer[i+1]])) | ||
|
||
# Update out only when there is a transition of values or labels. | ||
# When a new value or group is encountered, go back #dups steps( | ||
# the number of occurrence of current value) and assign the ranks | ||
# based on the starting index of the current group (grp_start) | ||
# and the current index | ||
if not at_end: | ||
next_val_diff = are_diff(masked_vals[lexsort_indexer[i]], | ||
masked_vals[lexsort_indexer[i+1]]) | ||
else: | ||
next_val_diff = True | ||
|
||
if (next_val_diff | ||
or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) | ||
or (check_labels | ||
and (labels[lexsort_indexer[i]] | ||
!= labels[lexsort_indexer[i+1]])) | ||
): | ||
# if keep_na, check for missing values and assign back | ||
if (next_val_diff or group_changed | ||
or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): | ||
|
||
# If keep_na, check for missing values and assign back | ||
# to the result where appropriate | ||
if keep_na and mask[lexsort_indexer[i]]: | ||
grp_na_count = dups | ||
for j in range(i - dups + 1, i + 1): | ||
out[lexsort_indexer[j]] = NaN | ||
grp_na_count = dups | ||
elif tiebreak == TIEBREAK_AVERAGE: | ||
for j in range(i - dups + 1, i + 1): | ||
out[lexsort_indexer[j]] = sum_ranks / <float64_t>dups | ||
|
@@ -1066,82 +1077,86 @@ def rank_1d( | |
out[lexsort_indexer[j]] = i - grp_start + 1 | ||
elif tiebreak == TIEBREAK_FIRST: | ||
for j in range(i - dups + 1, i + 1): | ||
if ascending: | ||
out[lexsort_indexer[j]] = j + 1 - grp_start | ||
else: | ||
out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start | ||
out[lexsort_indexer[j]] = j + 1 - grp_start | ||
elif tiebreak == TIEBREAK_FIRST_DESCENDING: | ||
for j in range(i - dups + 1, i + 1): | ||
out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start | ||
elif tiebreak == TIEBREAK_DENSE: | ||
for j in range(i - dups + 1, i + 1): | ||
out[lexsort_indexer[j]] = grp_vals_seen | ||
|
||
# look forward to the next value (using the sorting in _as) | ||
# if the value does not equal the current value then we need to | ||
# reset the dups and sum_ranks, knowing that a new value is | ||
# coming up. the conditional also needs to handle nan equality | ||
# and the end of iteration | ||
if next_val_diff or (mask[lexsort_indexer[i]] | ||
^ mask[lexsort_indexer[i+1]]): | ||
# Look forward to the next value (using the sorting in | ||
# lexsort_indexer). If the value does not equal the current | ||
# value then we need to reset the dups and sum_ranks, knowing | ||
# that a new value is coming up. The conditional also needs | ||
# to handle nan equality and the end of iteration. If group | ||
# changes we do not record seeing a new value in the group | ||
if not group_changed and (next_val_diff or | ||
(mask[lexsort_indexer[i]] | ||
^ mask[lexsort_indexer[i+1]])): | ||
dups = sum_ranks = 0 | ||
grp_vals_seen += 1 | ||
grp_tie_count += 1 | ||
|
||
# Similar to the previous conditional, check now if we are | ||
# moving to a new group. If so, keep track of the index where | ||
# the new group occurs, so the tiebreaker calculations can | ||
# decrement that from their position. fill in the size of each | ||
# group encountered (used by pct calculations later). also be | ||
# decrement that from their position. Fill in the size of each | ||
# group encountered (used by pct calculations later). Also be | ||
# sure to reset any of the items helping to calculate dups | ||
if (at_end or | ||
(check_labels | ||
and (labels[lexsort_indexer[i]] | ||
!= labels[lexsort_indexer[i+1]]))): | ||
if group_changed: | ||
|
||
# If not dense tiebreak, group size used to compute | ||
# percentile will be # of non-null elements in group | ||
if tiebreak != TIEBREAK_DENSE: | ||
for j in range(grp_start, i + 1): | ||
grp_sizes[lexsort_indexer[j]] = \ | ||
(i - grp_start + 1 - grp_na_count) | ||
grp_size = i - grp_start + 1 - grp_na_count | ||
|
||
# Otherwise, it will be the number of distinct values | ||
# in the group, subtracting 1 if NaNs are present | ||
# since that is a distinct value we shouldn't count | ||
else: | ||
for j in range(grp_start, i + 1): | ||
grp_sizes[lexsort_indexer[j]] = \ | ||
(grp_tie_count - (grp_na_count > 0)) | ||
grp_size = grp_vals_seen - (grp_na_count > 0) | ||
|
||
for j in range(grp_start, i + 1): | ||
grp_sizes[lexsort_indexer[j]] = grp_size | ||
|
||
dups = sum_ranks = 0 | ||
grp_na_count = 0 | ||
grp_tie_count = 0 | ||
grp_start = i + 1 | ||
grp_vals_seen = 1 | ||
else: | ||
with nogil: | ||
for i in range(N): | ||
at_end = i == N - 1 | ||
|
||
# dups and sum_ranks will be incremented each loop where | ||
# the value / group remains the same, and should be reset | ||
# when either of those change | ||
# Used to calculate tiebreakers | ||
# when either of those change. Used to calculate tiebreakers | ||
dups += 1 | ||
sum_ranks += i - grp_start + 1 | ||
|
||
next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] != | ||
masked_vals[lexsort_indexer[i+1]]) | ||
|
||
# We'll need this check later anyway to determine group size, so just | ||
# compute it here since shortcircuiting won't help | ||
group_changed = at_end or (check_labels and | ||
(labels[lexsort_indexer[i]] | ||
!= labels[lexsort_indexer[i+1]])) | ||
|
||
# Update out only when there is a transition of values or labels. | ||
# When a new value or group is encountered, go back #dups steps( | ||
# the number of occurrence of current value) and assign the ranks | ||
# based on the starting index of the current group (grp_start) | ||
# and the current index | ||
if not at_end: | ||
next_val_diff = (masked_vals[lexsort_indexer[i]] | ||
!= masked_vals[lexsort_indexer[i+1]]) | ||
else: | ||
next_val_diff = True | ||
|
||
if (next_val_diff | ||
or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) | ||
or (check_labels | ||
and (labels[lexsort_indexer[i]] | ||
!= labels[lexsort_indexer[i+1]])) | ||
): | ||
# if keep_na, check for missing values and assign back | ||
if (next_val_diff or group_changed | ||
or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): | ||
|
||
# If keep_na, check for missing values and assign back | ||
# to the result where appropriate | ||
if keep_na and mask[lexsort_indexer[i]]: | ||
grp_na_count = dups | ||
for j in range(i - dups + 1, i + 1): | ||
out[lexsort_indexer[j]] = NaN | ||
grp_na_count = dups | ||
elif tiebreak == TIEBREAK_AVERAGE: | ||
for j in range(i - dups + 1, i + 1): | ||
out[lexsort_indexer[j]] = sum_ranks / <float64_t>dups | ||
|
@@ -1153,46 +1168,51 @@ def rank_1d( | |
out[lexsort_indexer[j]] = i - grp_start + 1 | ||
elif tiebreak == TIEBREAK_FIRST: | ||
for j in range(i - dups + 1, i + 1): | ||
if ascending: | ||
out[lexsort_indexer[j]] = j + 1 - grp_start | ||
else: | ||
out[lexsort_indexer[j]] = \ | ||
(2 * i - j - dups + 2 - grp_start) | ||
out[lexsort_indexer[j]] = j + 1 - grp_start | ||
elif tiebreak == TIEBREAK_FIRST_DESCENDING: | ||
for j in range(i - dups + 1, i + 1): | ||
out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start | ||
elif tiebreak == TIEBREAK_DENSE: | ||
for j in range(i - dups + 1, i + 1): | ||
out[lexsort_indexer[j]] = grp_vals_seen | ||
|
||
# look forward to the next value (using the sorting in | ||
# lexsort_indexer) if the value does not equal the current | ||
# value then we need to reset the dups and sum_ranks, | ||
# knowing that a new value is coming up. the conditional | ||
# also needs to handle nan equality and the end of iteration | ||
if next_val_diff or (mask[lexsort_indexer[i]] | ||
^ mask[lexsort_indexer[i+1]]): | ||
# Look forward to the next value (using the sorting in | ||
# lexsort_indexer). If the value does not equal the current | ||
# value then we need to reset the dups and sum_ranks, knowing | ||
# that a new value is coming up. The conditional also needs | ||
# to handle nan equality and the end of iteration. If group | ||
# changes we do not record seeing a new value in the group | ||
if not group_changed and (next_val_diff or | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
(mask[lexsort_indexer[i]] | ||
^ mask[lexsort_indexer[i+1]])): | ||
|
||
dups = sum_ranks = 0 | ||
grp_vals_seen += 1 | ||
grp_tie_count += 1 | ||
|
||
# Similar to the previous conditional, check now if we are | ||
# moving to a new group. If so, keep track of the index where | ||
# the new group occurs, so the tiebreaker calculations can | ||
# decrement that from their position. fill in the size of each | ||
# group encountered (used by pct calculations later). also be | ||
# decrement that from their position. Fill in the size of each | ||
# group encountered (used by pct calculations later). Also be | ||
# sure to reset any of the items helping to calculate dups | ||
if at_end or (check_labels and | ||
(labels[lexsort_indexer[i]] | ||
!= labels[lexsort_indexer[i+1]])): | ||
if group_changed: | ||
|
||
# If not dense tiebreak, group size used to compute | ||
# percentile will be # of non-null elements in group | ||
if tiebreak != TIEBREAK_DENSE: | ||
for j in range(grp_start, i + 1): | ||
grp_sizes[lexsort_indexer[j]] = \ | ||
(i - grp_start + 1 - grp_na_count) | ||
grp_size = i - grp_start + 1 - grp_na_count | ||
|
||
# Otherwise, it will be the number of distinct values | ||
# in the group, subtracting 1 if NaNs are present | ||
# since that is a distinct value we shouldn't count | ||
else: | ||
for j in range(grp_start, i + 1): | ||
grp_sizes[lexsort_indexer[j]] = \ | ||
(grp_tie_count - (grp_na_count > 0)) | ||
grp_size = grp_vals_seen - (grp_na_count > 0) | ||
|
||
for j in range(grp_start, i + 1): | ||
grp_sizes[lexsort_indexer[j]] = grp_size | ||
|
||
dups = sum_ranks = 0 | ||
grp_na_count = 0 | ||
grp_tie_count = 0 | ||
grp_start = i + 1 | ||
grp_vals_seen = 1 | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this not be an integer?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Did this because
grp_sizes
was defined as an array offloat64_t
(I'd guess to avoid worrying about casting to float when dividing bygrp_sizes
at the end). Can change both to int if you think that would make more sense.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah makes sense. That feels like a mistake so yea if you can change the array and nothing breaks lets do that. If it causes an issue can do in a follow up
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Have changed in latest commit