Skip to content

Commit 993557b

Browse files
Backport PR pandas-dev#38057: PERF: fix regression in creation of resulting index in RollingGroupby (pandas-dev#38211)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent b376fb9 commit 993557b

File tree

4 files changed

+107
-20
lines changed

4 files changed

+107
-20
lines changed

asv_bench/benchmarks/rolling.py

+14
Original file line numberDiff line numberDiff line change
@@ -216,4 +216,18 @@ def time_rolling_offset(self, method):
216216
getattr(self.groupby_roll_offset, method)()
217217

218218

219+
class GroupbyLargeGroups:
220+
# https://github.com/pandas-dev/pandas/issues/38038
221+
# specific example where the rolling operation on a larger dataframe
222+
# is relatively cheap (few but large groups), but creation of
223+
# MultiIndex of result can be expensive
224+
225+
def setup(self):
226+
N = 100000
227+
self.df = pd.DataFrame({"A": [1, 2] * int(N / 2), "B": np.random.randn(N)})
228+
229+
def time_rolling_multiindex_creation(self):
230+
self.df.groupby("A").rolling(3).mean()
231+
232+
219233
from .pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v1.1.5.rst

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ Fixed regressions
2424
- Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`)
2525
- Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`)
2626
- Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`).
27+
- Fixed performance regression in ``df.groupby(..).rolling(..)`` (:issue:`38038`)
2728
- Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`)
2829

2930
.. ---------------------------------------------------------------------------

pandas/core/window/rolling.py

+22-15
Original file line numberDiff line numberDiff line change
@@ -2216,22 +2216,29 @@ def _apply(
22162216
# Our result will still have kept the column in the result
22172217
result = result.drop(columns=column_keys, errors="ignore")
22182218

2219-
result_index_data = []
2220-
for key, values in self._groupby.grouper.indices.items():
2221-
for value in values:
2222-
data = [
2223-
*com.maybe_make_list(key),
2224-
*com.maybe_make_list(
2225-
grouped_object_index[value]
2226-
if grouped_object_index is not None
2227-
else []
2228-
),
2229-
]
2230-
result_index_data.append(tuple(data))
2231-
2232-
result_index = MultiIndex.from_tuples(
2233-
result_index_data, names=result_index_names
2219+
codes = self._groupby.grouper.codes
2220+
levels = self._groupby.grouper.levels
2221+
2222+
group_indices = self._groupby.grouper.indices.values()
2223+
if group_indices:
2224+
indexer = np.concatenate(list(group_indices))
2225+
else:
2226+
indexer = np.array([], dtype=np.intp)
2227+
codes = [c.take(indexer) for c in codes]
2228+
2229+
# if the index of the original dataframe needs to be preserved, append
2230+
# this index (but reordered) to the codes/levels from the groupby
2231+
if grouped_object_index is not None:
2232+
idx = grouped_object_index.take(indexer)
2233+
if not isinstance(idx, MultiIndex):
2234+
idx = MultiIndex.from_arrays([idx])
2235+
codes.extend(list(idx.codes))
2236+
levels.extend(list(idx.levels))
2237+
2238+
result_index = MultiIndex(
2239+
levels, codes, names=result_index_names, verify_integrity=False
22342240
)
2241+
22352242
result.index = result_index
22362243
return result
22372244

pandas/tests/window/test_grouper.py

+70-5
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import pytest
33

44
import pandas as pd
5-
from pandas import DataFrame, MultiIndex, Series
5+
from pandas import DataFrame, Index, MultiIndex, Series
66
import pandas._testing as tm
77
from pandas.core.groupby.groupby import get_groupby
88

@@ -396,14 +396,25 @@ def test_groupby_rolling_index_changed(self, func):
396396

397397
def test_groupby_rolling_empty_frame(self):
398398
# GH 36197
399-
expected = pd.DataFrame({"s1": []})
399+
expected = DataFrame({"s1": []})
400400
result = expected.groupby("s1").rolling(window=1).sum()
401-
expected.index = pd.MultiIndex.from_tuples([], names=["s1", None])
401+
# GH-38057: from_tuples gives empty object dtype; we now get float/int levels
402+
# expected.index = MultiIndex.from_tuples([], names=["s1", None])
403+
expected.index = MultiIndex.from_product(
404+
[Index([], dtype="float64"), Index([], dtype="int64")], names=["s1", None]
405+
)
402406
tm.assert_frame_equal(result, expected)
403407

404-
expected = pd.DataFrame({"s1": [], "s2": []})
408+
expected = DataFrame({"s1": [], "s2": []})
405409
result = expected.groupby(["s1", "s2"]).rolling(window=1).sum()
406-
expected.index = pd.MultiIndex.from_tuples([], names=["s1", "s2", None])
410+
expected.index = MultiIndex.from_product(
411+
[
412+
Index([], dtype="float64"),
413+
Index([], dtype="float64"),
414+
Index([], dtype="int64"),
415+
],
416+
names=["s1", "s2", None],
417+
)
407418
tm.assert_frame_equal(result, expected)
408419

409420
def test_groupby_rolling_string_index(self):
@@ -479,3 +490,57 @@ def test_groupby_rolling_index_level_and_column_label(self):
479490
),
480491
)
481492
tm.assert_frame_equal(result, expected)
493+
494+
def test_groupby_rolling_resulting_multiindex(self):
495+
# a few different cases checking the created MultiIndex of the result
496+
# https://github.com/pandas-dev/pandas/pull/38057
497+
498+
# grouping by 1 column -> 2-level MI as result
499+
df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4})
500+
result = df.groupby("b").rolling(3).mean()
501+
expected_index = MultiIndex.from_tuples(
502+
[(1, 0), (1, 2), (1, 4), (1, 6), (2, 1), (2, 3), (2, 5), (2, 7)],
503+
names=["b", None],
504+
)
505+
tm.assert_index_equal(result.index, expected_index)
506+
507+
# grouping by 2 columns -> 3-level MI as result
508+
df = DataFrame({"a": np.arange(12.0), "b": [1, 2] * 6, "c": [1, 2, 3, 4] * 3})
509+
result = df.groupby(["b", "c"]).rolling(2).sum()
510+
expected_index = MultiIndex.from_tuples(
511+
[
512+
(1, 1, 0),
513+
(1, 1, 4),
514+
(1, 1, 8),
515+
(1, 3, 2),
516+
(1, 3, 6),
517+
(1, 3, 10),
518+
(2, 2, 1),
519+
(2, 2, 5),
520+
(2, 2, 9),
521+
(2, 4, 3),
522+
(2, 4, 7),
523+
(2, 4, 11),
524+
],
525+
names=["b", "c", None],
526+
)
527+
tm.assert_index_equal(result.index, expected_index)
528+
529+
# grouping with 1 level on dataframe with 2-level MI -> 3-level MI as result
530+
df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4, "c": [1, 2, 3, 4] * 2})
531+
df = df.set_index("c", append=True)
532+
result = df.groupby("b").rolling(3).mean()
533+
expected_index = MultiIndex.from_tuples(
534+
[
535+
(1, 0, 1),
536+
(1, 2, 3),
537+
(1, 4, 1),
538+
(1, 6, 3),
539+
(2, 1, 2),
540+
(2, 3, 4),
541+
(2, 5, 2),
542+
(2, 7, 4),
543+
],
544+
names=["b", None, "c"],
545+
)
546+
tm.assert_index_equal(result.index, expected_index)

0 commit comments

Comments
 (0)