Skip to content

Commit 6c04e5a

Browse files
committed
BUG: make dense ranks results scale to 100 percent (#20731)
1 parent f91e28c commit 6c04e5a

File tree

4 files changed

+43
-22
lines changed

4 files changed

+43
-22
lines changed

doc/source/whatsnew/v0.23.1.txt

+14-5
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,12 @@ Documentation Changes
4646
Bug Fixes
4747
~~~~~~~~~
4848

49-
- tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`)
50-
5149
Groupby/Resample/Rolling
5250
^^^^^^^^^^^^^^^^^^^^^^^^
5351

5452
- Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`)
53+
- Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`)
54+
- Bug in :func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True`` (:issue:`20731`)
5555

5656
Strings
5757
^^^^^^^
@@ -66,6 +66,12 @@ Categorical
6666
^^^^^^^^^^^
6767

6868
- Bug in :func:`pandas.util.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`)
69+
- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when the individual categories are iterable and ``value`` is an iterable (:issue:`21097`, :issue:`19788`)
70+
71+
Sparse
72+
^^^^^^
73+
74+
- Bug in :attr:`SparseArray.shape` which previously only returned the shape of :attr:`SparseArray.sp_values` (:issue:`21126`)
6975

7076
Conversion
7177
^^^^^^^^^^
@@ -78,11 +84,13 @@ Indexing
7884

7985
- Bug in :meth:`Series.reset_index` where the appropriate error was not raised with an invalid level name (:issue:`20925`)
8086
- Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`)
87+
- Bug in :meth:`MultiIndex.set_names` where an error was raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`)
8188
-
8289

8390
I/O
8491
^^^
8592

93+
- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
8694
- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
8795
-
8896

@@ -98,7 +106,8 @@ Reshaping
98106
- Bug in :func:`concat` where an error was raised when concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`)
99107
-
100108

101-
Categorical
102-
^^^^^^^^^^^
109+
Other
110+
^^^^^
103111

104-
-
112+
- Tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`)
113+
- Bug preventing pandas from being importable with -OO optimization (:issue:`21071`)

doc/source/whatsnew/v0.24.0.txt

+10-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
.. _whatsnew_0240:
22

33
v0.24.0
4-
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4+
-------
55

66
.. _whatsnew_0240.enhancements:
77

@@ -12,8 +12,8 @@ New features
1212

1313
Other Enhancements
1414
^^^^^^^^^^^^^^^^^^
15-
-
16-
-
15+
- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directives when passed into ``format`` (:issue:`13486`)
16+
- :func:`to_csv` now supports the ``compression`` keyword when a file handle is passed. (:issue:`21227`)
1717
-
1818

1919
.. _whatsnew_0240.api_breaking:
@@ -22,6 +22,13 @@ Other Enhancements
2222
Backwards incompatible API changes
2323
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2424

25+
.. _whatsnew_0240.api.datetimelike:
26+
27+
Datetimelike API Changes
28+
^^^^^^^^^^^^^^^^^^^^^^^^
29+
30+
- For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with non-``None`` ``freq`` attribute, addition or subtraction of an integer-dtyped array or ``Index`` will return an object of the same class (:issue:`19959`)
31+
2532
.. _whatsnew_0240.api.other:
2633

2734
Other API Changes
@@ -177,4 +184,3 @@ Other
177184
-
178185
-
179186
-
180-

pandas/_libs/groupby_helper.pxi.in

+12-6
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
418418
bint is_datetimelike, object ties_method,
419419
bint ascending, bint pct, object na_option):
420420
"""
421-
Provides the rank of values within each group.
421+
Provides the rank of values within each group.
422422

423423
Parameters
424424
----------
@@ -451,8 +451,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
451451
"""
452452
cdef:
453453
TiebreakEnumType tiebreak
454-
Py_ssize_t i, j, N, K, val_start=0, grp_start=0, dups=0, sum_ranks=0
455-
Py_ssize_t grp_vals_seen=1, grp_na_count=0
454+
Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0
455+
Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0
456456
ndarray[int64_t] _as
457457
ndarray[float64_t, ndim=2] grp_sizes
458458
ndarray[{{c_type}}] masked_vals
@@ -563,6 +563,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
563563
dups = sum_ranks = 0
564564
val_start = i
565565
grp_vals_seen += 1
566+
grp_tie_count +=1
566567

567568
# Similar to the previous conditional, check now if we are moving
568569
# to a new group. If so, keep track of the index where the new
@@ -571,11 +572,16 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
571572
# (used by pct calculations later). also be sure to reset any of
572573
# the items helping to calculate dups
573574
if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
574-
for j in range(grp_start, i + 1):
575-
grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count
575+
if tiebreak != TIEBREAK_DENSE:
576+
for j in range(grp_start, i + 1):
577+
grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count
578+
else:
579+
for j in range(grp_start, i + 1):
580+
grp_sizes[_as[j], 0] = (grp_tie_count -
581+
(grp_na_count > 0))
576582
dups = sum_ranks = 0
577583
grp_na_count = 0
578-
val_start = i + 1
584+
grp_tie_count = 0
579585
grp_start = i + 1
580586
grp_vals_seen = 1
581587

pandas/tests/groupby/test_rank.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,9 @@ def test_rank_apply():
5959
('first', False, False, [3., 4., 1., 5., 2.]),
6060
('first', False, True, [.6, .8, .2, 1., .4]),
6161
('dense', True, False, [1., 1., 3., 1., 2.]),
62-
('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]),
62+
('dense', True, True, [1. / 3., 1. / 3., 3. / 3., 1. / 3., 2. / 3.]),
6363
('dense', False, False, [3., 3., 1., 3., 2.]),
64-
('dense', False, True, [.6, .6, .2, .6, .4]),
64+
('dense', False, True, [3. / 3., 3. / 3., 1. / 3., 3. / 3., 2. / 3.]),
6565
])
6666
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
6767
key = np.repeat(grps, len(vals))
@@ -126,7 +126,7 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
126126
@pytest.mark.parametrize("grps", [
127127
['qux'], ['qux', 'quux']])
128128
@pytest.mark.parametrize("vals", [
129-
[2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats
129+
[2, 2, np.nan, 8, 2, 6, np.nan, np.nan],
130130
[pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan,
131131
pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
132132
pd.Timestamp('2018-01-06'), np.nan, np.nan]
@@ -167,11 +167,11 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
167167
('dense', True, 'keep', False,
168168
[1., 1., np.nan, 3., 1., 2., np.nan, np.nan]),
169169
('dense', True, 'keep', True,
170-
[0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]),
170+
[1. / 3., 1. / 3., np.nan, 3. / 3., 1. / 3., 2. / 3., np.nan, np.nan]),
171171
('dense', False, 'keep', False,
172172
[3., 3., np.nan, 1., 3., 2., np.nan, np.nan]),
173173
('dense', False, 'keep', True,
174-
[.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
174+
[3. / 3., 3. / 3., np.nan, 1. / 3., 3. / 3., 2. / 3., np.nan, np.nan]),
175175
('average', True, 'no_na', False, [2., 2., 7., 5., 2., 4., 7., 7.]),
176176
('average', True, 'no_na', True,
177177
[0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]),
@@ -198,10 +198,10 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
198198
[0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]),
199199
('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]),
200200
('dense', True, 'no_na', True,
201-
[0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]),
201+
[0.25, 0.25, 1., 0.75, 0.25, 0.5, 1., 1.]),
202202
('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]),
203203
('dense', False, 'no_na', True,
204-
[0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5])
204+
[0.75, 0.75, 1., 0.25, 0.75, 0.5, 1., 1.])
205205
])
206206
def test_rank_args_missing(grps, vals, ties_method, ascending,
207207
na_option, pct, exp):

0 commit comments

Comments
 (0)