Skip to content

Commit 128d005

Browse files
committed
BUG: make dense ranks results scale to 100 percent (pandas-dev#20731)
1 parent f91e28c commit 128d005

File tree

2 files changed

+25
-16
lines changed

2 files changed

+25
-16
lines changed

pandas/_libs/groupby_helper.pxi.in

+18-9
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
418418
bint is_datetimelike, object ties_method,
419419
bint ascending, bint pct, object na_option):
420420
"""
421-
Provides the rank of values within each group.
421+
Provides the rank of values within each group.
422422

423423
Parameters
424424
----------
@@ -452,16 +452,18 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
452452
cdef:
453453
TiebreakEnumType tiebreak
454454
Py_ssize_t i, j, N, K, val_start=0, grp_start=0, dups=0, sum_ranks=0
455-
Py_ssize_t grp_vals_seen=1, grp_na_count=0
455+
Py_ssize_t grp_vals_seen=1, grp_na_count=0, lab_start=0, tie_count=0
456+
Py_ssize_t grp_size=1
456457
ndarray[int64_t] _as
457-
ndarray[float64_t, ndim=2] grp_sizes
458458
ndarray[{{c_type}}] masked_vals
459459
ndarray[uint8_t] mask
460460
bint keep_na
461+
bint has_na
461462
{{c_type}} nan_fill_val
462463

463464
tiebreak = tiebreakers[ties_method]
464465
keep_na = na_option == 'keep'
466+
has_na = False
465467
N, K = (<object> values).shape
466468
grp_sizes = np.ones_like(out)
467469

@@ -518,6 +520,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
518520
# to the result where appropriate
519521

520522
if keep_na and mask[_as[i]]:
523+
if has_na == 0: has_na = 1
521524
grp_na_count += 1
522525
out[_as[i], 0] = nan
523526
else:
@@ -560,6 +563,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
560563
if (i == N - 1 or
561564
(masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
562565
(mask[_as[i]] ^ mask[_as[i+1]])):
566+
tie_count += 1
563567
dups = sum_ranks = 0
564568
val_start = i
565569
grp_vals_seen += 1
@@ -571,17 +575,22 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
571575
# (used by pct calculations later). also be sure to reset any of
572576
# the items helping to calculate dups
573577
if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
574-
for j in range(grp_start, i + 1):
575-
grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count
576-
dups = sum_ranks = 0
578+
if pct:
579+
if tiebreak != TIEBREAK_DENSE:
580+
for j in range(grp_start, i + 1):
581+
grp_size = i - grp_start + 1 - grp_na_count
582+
out[_as[j], 0] = out[_as[j], 0] / grp_size
583+
else:
584+
for j in range(lab_start, i + 1):
585+
out[_as[j], 0] = (out[_as[j], 0]
586+
/ (tie_count - has_na))
587+
dups = sum_ranks = has_na = tie_count = 0
577588
grp_na_count = 0
578589
val_start = i + 1
579590
grp_start = i + 1
591+
lab_start = i + 1
580592
grp_vals_seen = 1
581593

582-
if pct:
583-
for i in range(N):
584-
out[i, 0] = out[i, 0] / grp_sizes[i, 0]
585594
{{endif}}
586595
{{endfor}}
587596

pandas/tests/groupby/test_rank.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,9 @@ def test_rank_apply():
5959
('first', False, False, [3., 4., 1., 5., 2.]),
6060
('first', False, True, [.6, .8, .2, 1., .4]),
6161
('dense', True, False, [1., 1., 3., 1., 2.]),
62-
('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]),
62+
('dense', True, True, [1. / 3., 1. / 3., 3. / 3., 1. / 3., 2. / 3.]),
6363
('dense', False, False, [3., 3., 1., 3., 2.]),
64-
('dense', False, True, [.6, .6, .2, .6, .4]),
64+
('dense', False, True, [3. / 3., 3. / 3., 1. / 3., 3. / 3., 2. / 3.]),
6565
])
6666
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
6767
key = np.repeat(grps, len(vals))
@@ -126,7 +126,7 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
126126
@pytest.mark.parametrize("grps", [
127127
['qux'], ['qux', 'quux']])
128128
@pytest.mark.parametrize("vals", [
129-
[2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats
129+
[2, 2, np.nan, 8, 2, 6, np.nan, np.nan],
130130
[pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan,
131131
pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
132132
pd.Timestamp('2018-01-06'), np.nan, np.nan]
@@ -167,11 +167,11 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
167167
('dense', True, 'keep', False,
168168
[1., 1., np.nan, 3., 1., 2., np.nan, np.nan]),
169169
('dense', True, 'keep', True,
170-
[0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]),
170+
[1. / 3., 1. / 3., np.nan, 3. / 3., 1. / 3., 2. / 3., np.nan, np.nan]),
171171
('dense', False, 'keep', False,
172172
[3., 3., np.nan, 1., 3., 2., np.nan, np.nan]),
173173
('dense', False, 'keep', True,
174-
[.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
174+
[3. / 3., 3. / 3., np.nan, 1. / 3., 3. / 3., 2. / 3., np.nan, np.nan]),
175175
('average', True, 'no_na', False, [2., 2., 7., 5., 2., 4., 7., 7.]),
176176
('average', True, 'no_na', True,
177177
[0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]),
@@ -198,10 +198,10 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
198198
[0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]),
199199
('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]),
200200
('dense', True, 'no_na', True,
201-
[0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]),
201+
[1. / 4., 1. / 4., 4. / 4., 3. / 4., 1. / 4., 2. / 4., 4. / 4., 4. / 4.]),
202202
('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]),
203203
('dense', False, 'no_na', True,
204-
[0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5])
204+
[3. / 4., 3. / 4., 4. / 4., 1. / 4., 3. / 4., 2. / 4., 4. / 4., 4. / 4.])
205205
])
206206
def test_rank_args_missing(grps, vals, ties_method, ascending,
207207
na_option, pct, exp):

0 commit comments

Comments
 (0)