Skip to content

BUG: groupby rank computing incorrect percentiles #40575

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 26 commits were merged on Apr 1, 2021.
Show file tree
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
2364330
CLN: rank_1d followup
mzeitlin11 Mar 20, 2021
999d880
WIP
mzeitlin11 Mar 20, 2021
d360871
WIP
mzeitlin11 Mar 20, 2021
0aaeee7
WIP
mzeitlin11 Mar 20, 2021
fe6495a
Add comments, whitespace
mzeitlin11 Mar 20, 2021
8fae616
Simplify conditional
mzeitlin11 Mar 21, 2021
86e736b
Merge remote-tracking branch 'origin/master' into cln/rank_1d
mzeitlin11 Mar 21, 2021
f9479e3
Remove unused var
mzeitlin11 Mar 21, 2021
a2bea3d
Avoid compiler warning
mzeitlin11 Mar 21, 2021
ca45958
Merge remote-tracking branch 'origin/master' into cln/rank_1d
mzeitlin11 Mar 21, 2021
a6fd0a5
BUG: groupby rank percentile
mzeitlin11 Mar 22, 2021
8a7b535
Add tests
mzeitlin11 Mar 22, 2021
0d55593
WIP
mzeitlin11 Mar 22, 2021
7108e05
Merge remote-tracking branch 'origin/master' into cln/rank_1d
mzeitlin11 Mar 22, 2021
ba5dc7c
Simplify changes
mzeitlin11 Mar 22, 2021
680a888
Merge remote-tracking branch 'origin/master' into bug/rank_pct_nans_g…
mzeitlin11 Mar 22, 2021
49127f3
Merge branch 'cln/rank_1d' into bug/rank_pct_nans_groupby
mzeitlin11 Mar 22, 2021
f6a04b7
precommit fixup
mzeitlin11 Mar 22, 2021
e486ca7
Merge branch 'cln/rank_1d' into bug/rank_pct_nans_groupby
mzeitlin11 Mar 22, 2021
268abd0
Add whatsnew
mzeitlin11 Mar 22, 2021
aecab5f
Merge remote-tracking branch 'origin/master' into bug/rank_pct_nans_g…
mzeitlin11 Mar 23, 2021
09860e3
Merge remote-tracking branch 'origin/master' into bug/rank_pct_nans_g…
mzeitlin11 Mar 27, 2021
8eb56cf
Use float instead of int for grp_size
mzeitlin11 Mar 27, 2021
57e6697
Use intp for lexsort indexer
mzeitlin11 Mar 27, 2021
f925124
Merge remote-tracking branch 'origin/master' into bug/rank_pct_nans_g…
mzeitlin11 Mar 29, 2021
8c59323
Merge remote-tracking branch 'origin/master' into bug/rank_pct_nans_g…
mzeitlin11 Mar 31, 2021
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,7 @@ Numeric
- Bug in :func:`select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`)
- Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`)
- Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`)
- Bug in :meth:`DataFrameGroupBy.rank` giving incorrect results with ``pct=True`` and equal values between consecutive groups (:issue:`40518`)
-

Conversion
Expand Down
74 changes: 47 additions & 27 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -947,12 +947,14 @@ def rank_1d(
TiebreakEnumType tiebreak
Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0
Py_ssize_t grp_vals_seen=1, grp_na_count=0
ndarray[int64_t, ndim=1] lexsort_indexer
ndarray[float64_t, ndim=1] grp_sizes, out
ndarray[int64_t, ndim=1] grp_sizes
ndarray[intp_t, ndim=1] lexsort_indexer
ndarray[float64_t, ndim=1] out
ndarray[rank_t, ndim=1] masked_vals
ndarray[uint8_t, ndim=1] mask
bint keep_na, at_end, next_val_diff, check_labels, group_changed
rank_t nan_fill_val
int64_t grp_size

tiebreak = tiebreakers[ties_method]
if tiebreak == TIEBREAK_FIRST:
Expand All @@ -965,7 +967,7 @@ def rank_1d(
# TODO Cython 3.0: cast won't be necessary (#2992)
assert <Py_ssize_t>len(labels) == N
out = np.empty(N)
grp_sizes = np.ones(N)
grp_sizes = np.ones(N, dtype=np.int64)

# If all 0 labels, can short-circuit later label
# comparisons
Expand Down Expand Up @@ -1022,7 +1024,7 @@ def rank_1d(
# each label corresponds to a different group value,
# the mask helps you differentiate missing values before
# performing sort on the actual values
lexsort_indexer = np.lexsort(order).astype(np.int64, copy=False)
lexsort_indexer = np.lexsort(order).astype(np.intp, copy=False)

if not ascending:
lexsort_indexer = lexsort_indexer[::-1]
Expand Down Expand Up @@ -1093,13 +1095,15 @@ def rank_1d(
for j in range(i - dups + 1, i + 1):
out[lexsort_indexer[j]] = grp_vals_seen

# Look forward to the next value (using the sorting in lexsort_indexer)
# if the value does not equal the current value then we need to
# reset the dups and sum_ranks, knowing that a new value is
# coming up. The conditional also needs to handle nan equality
# and the end of iteration
if next_val_diff or (mask[lexsort_indexer[i]]
^ mask[lexsort_indexer[i+1]]):
# Look forward to the next value (using the sorting in
# lexsort_indexer). If the value does not equal the current
# value then we need to reset the dups and sum_ranks, knowing
# that a new value is coming up. The conditional also needs
# to handle nan equality and the end of iteration. If group
# changes we do not record seeing a new value in the group
if not group_changed and (next_val_diff or
(mask[lexsort_indexer[i]]
^ mask[lexsort_indexer[i+1]])):
dups = sum_ranks = 0
grp_vals_seen += 1

Expand All @@ -1110,14 +1114,21 @@ def rank_1d(
# group encountered (used by pct calculations later). Also be
# sure to reset any of the items helping to calculate dups
if group_changed:

# If not dense tiebreak, group size used to compute
# percentile will be # of non-null elements in group
if tiebreak != TIEBREAK_DENSE:
for j in range(grp_start, i + 1):
grp_sizes[lexsort_indexer[j]] = \
(i - grp_start + 1 - grp_na_count)
grp_size = i - grp_start + 1 - grp_na_count

# Otherwise, it will be the number of distinct values
# in the group, subtracting 1 if NaNs are present
# since that is a distinct value we shouldn't count
else:
for j in range(grp_start, i + 1):
grp_sizes[lexsort_indexer[j]] = \
(grp_vals_seen - 1 - (grp_na_count > 0))
grp_size = grp_vals_seen - (grp_na_count > 0)

for j in range(grp_start, i + 1):
grp_sizes[lexsort_indexer[j]] = grp_size

dups = sum_ranks = 0
grp_na_count = 0
grp_start = i + 1
Expand Down Expand Up @@ -1184,12 +1195,14 @@ def rank_1d(
out[lexsort_indexer[j]] = grp_vals_seen

# Look forward to the next value (using the sorting in
# lexsort_indexer) if the value does not equal the current
# lexsort_indexer). If the value does not equal the current
# value then we need to reset the dups and sum_ranks, knowing
# that a new value is coming up. The conditional also needs
# to handle nan equality and the end of iteration
if next_val_diff or (mask[lexsort_indexer[i]]
^ mask[lexsort_indexer[i+1]]):
# to handle nan equality and the end of iteration. If group
# changes we do not record seeing a new value in the group
if not group_changed and (next_val_diff or
(mask[lexsort_indexer[i]]
^ mask[lexsort_indexer[i+1]])):
dups = sum_ranks = 0
grp_vals_seen += 1

Expand All @@ -1200,14 +1213,21 @@ def rank_1d(
# group encountered (used by pct calculations later). Also be
# sure to reset any of the items helping to calculate dups
if group_changed:

# If not dense tiebreak, group size used to compute
# percentile will be # of non-null elements in group
if tiebreak != TIEBREAK_DENSE:
for j in range(grp_start, i + 1):
grp_sizes[lexsort_indexer[j]] = \
(i - grp_start + 1 - grp_na_count)
grp_size = i - grp_start + 1 - grp_na_count

# Otherwise, it will be the number of distinct values
# in the group, subtracting 1 if NaNs are present
# since that is a distinct value we shouldn't count
else:
for j in range(grp_start, i + 1):
grp_sizes[lexsort_indexer[j]] = \
(grp_vals_seen - 1 - (grp_na_count > 0))
grp_size = grp_vals_seen - (grp_na_count > 0)

for j in range(grp_start, i + 1):
grp_sizes[lexsort_indexer[j]] = grp_size

dups = sum_ranks = 0
grp_na_count = 0
grp_start = i + 1
Expand Down
25 changes: 25 additions & 0 deletions pandas/tests/groupby/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,3 +542,28 @@ def test_rank_min_int():
)

tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("use_nan", [True, False])
def test_rank_pct_equal_values_on_group_transition(use_nan):
    # GH#40518: with pct=True, equal values sitting on either side of a
    # group boundary produced incorrect percentiles. Exercise both the
    # NaN and the non-NaN flavor of the duplicated value.
    shared = np.nan if use_nan else 3
    df = DataFrame(
        {
            "group": [-1, -1, 1, -1],
            "val": [1, 2, shared, shared],
        }
    )
    result = df.groupby(["group"])["val"].rank(
        method="dense",
        pct=True,
    )
    # NaNs are excluded from dense ranks entirely; otherwise the third
    # distinct value makes the denominators 3 within the -1 group.
    expected_values = [0.5, 1, np.nan, np.nan] if use_nan else [1 / 3, 2 / 3, 1, 1]
    expected = Series(expected_values, name="val")

    tm.assert_series_equal(result, expected)