Skip to content

Commit 6d0dab4

Browse files
PERF: fix regression in creation of resulting index in RollingGroupby (#38057)
1 parent a5eb94d commit 6d0dab4

File tree

5 files changed

+122
-21
lines changed

5 files changed

+122
-21
lines changed

asv_bench/benchmarks/rolling.py

+14
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,20 @@ def time_rolling_offset(self, method):
225225
getattr(self.groupby_roll_offset, method)()
226226

227227

228+
class GroupbyLargeGroups:
229+
# https://github.com/pandas-dev/pandas/issues/38038
230+
# specific example where the rolling operation on a larger dataframe
231+
# is relatively cheap (few but large groups), but creation of
232+
# the resulting MultiIndex can be expensive
233+
234+
def setup(self):
235+
N = 100000
236+
self.df = pd.DataFrame({"A": [1, 2] * int(N / 2), "B": np.random.randn(N)})
237+
238+
def time_rolling_multiindex_creation(self):
239+
self.df.groupby("A").rolling(3).mean()
240+
241+
228242
class GroupbyEWM:
229243

230244
params = ["cython", "numba"]

doc/source/whatsnew/v1.1.5.rst

+1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ Fixed regressions
2525
- Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`)
2626
- Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`).
2727
- Fixed performance regression for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`)
28+
- Fixed performance regression in ``df.groupby(..).rolling(..)`` (:issue:`38038`)
2829
- Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`)
2930

3031
.. ---------------------------------------------------------------------------

pandas/core/window/rolling.py

+22-16
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@
5050

5151
from pandas.core.aggregation import aggregate
5252
from pandas.core.base import DataError, SelectionMixin
53-
import pandas.core.common as com
5453
from pandas.core.construction import extract_array
5554
from pandas.core.groupby.base import GotItemMixin, ShallowMixin
5655
from pandas.core.indexes.api import Index, MultiIndex
@@ -791,22 +790,29 @@ def _apply(
791790
# Our result will have still kept the column in the result
792791
result = result.drop(columns=column_keys, errors="ignore")
793792

794-
result_index_data = []
795-
for key, values in self._groupby.grouper.indices.items():
796-
for value in values:
797-
data = [
798-
*com.maybe_make_list(key),
799-
*com.maybe_make_list(
800-
grouped_object_index[value]
801-
if grouped_object_index is not None
802-
else []
803-
),
804-
]
805-
result_index_data.append(tuple(data))
806-
807-
result_index = MultiIndex.from_tuples(
808-
result_index_data, names=result_index_names
793+
codes = self._groupby.grouper.codes
794+
levels = self._groupby.grouper.levels
795+
796+
group_indices = self._groupby.grouper.indices.values()
797+
if group_indices:
798+
indexer = np.concatenate(list(group_indices))
799+
else:
800+
indexer = np.array([], dtype=np.intp)
801+
codes = [c.take(indexer) for c in codes]
802+
803+
# if the index of the original dataframe needs to be preserved, append
804+
# this index (but reordered) to the codes/levels from the groupby
805+
if grouped_object_index is not None:
806+
idx = grouped_object_index.take(indexer)
807+
if not isinstance(idx, MultiIndex):
808+
idx = MultiIndex.from_arrays([idx])
809+
codes.extend(list(idx.codes))
810+
levels.extend(list(idx.levels))
811+
812+
result_index = MultiIndex(
813+
levels, codes, names=result_index_names, verify_integrity=False
809814
)
815+
810816
result.index = result_index
811817
return result
812818

pandas/tests/window/test_groupby.py

+76-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,15 @@
11
import numpy as np
22
import pytest
33

4-
from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, to_datetime
4+
from pandas import (
5+
DataFrame,
6+
Index,
7+
MultiIndex,
8+
Series,
9+
Timestamp,
10+
date_range,
11+
to_datetime,
12+
)
513
import pandas._testing as tm
614
from pandas.api.indexers import BaseIndexer
715
from pandas.core.groupby.groupby import get_groupby
@@ -418,12 +426,23 @@ def test_groupby_rolling_empty_frame(self):
418426
# GH 36197
419427
expected = DataFrame({"s1": []})
420428
result = expected.groupby("s1").rolling(window=1).sum()
421-
expected.index = MultiIndex.from_tuples([], names=["s1", None])
429+
# GH-38057: from_tuples gives empty object-dtype levels; we now get float/int levels
430+
# expected.index = MultiIndex.from_tuples([], names=["s1", None])
431+
expected.index = MultiIndex.from_product(
432+
[Index([], dtype="float64"), Index([], dtype="int64")], names=["s1", None]
433+
)
422434
tm.assert_frame_equal(result, expected)
423435

424436
expected = DataFrame({"s1": [], "s2": []})
425437
result = expected.groupby(["s1", "s2"]).rolling(window=1).sum()
426-
expected.index = MultiIndex.from_tuples([], names=["s1", "s2", None])
438+
expected.index = MultiIndex.from_product(
439+
[
440+
Index([], dtype="float64"),
441+
Index([], dtype="float64"),
442+
Index([], dtype="int64"),
443+
],
444+
names=["s1", "s2", None],
445+
)
427446
tm.assert_frame_equal(result, expected)
428447

429448
def test_groupby_rolling_string_index(self):
@@ -567,6 +586,60 @@ def test_groupby_rolling_index_level_and_column_label(self):
567586
)
568587
tm.assert_frame_equal(result, expected)
569588

589+
def test_groupby_rolling_resulting_multiindex(self):
590+
# a few different cases checking the created MultiIndex of the result
591+
# https://github.com/pandas-dev/pandas/pull/38057
592+
593+
# grouping by 1 column -> 2-level MI as result
594+
df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4})
595+
result = df.groupby("b").rolling(3).mean()
596+
expected_index = MultiIndex.from_tuples(
597+
[(1, 0), (1, 2), (1, 4), (1, 6), (2, 1), (2, 3), (2, 5), (2, 7)],
598+
names=["b", None],
599+
)
600+
tm.assert_index_equal(result.index, expected_index)
601+
602+
# grouping by 2 columns -> 3-level MI as result
603+
df = DataFrame({"a": np.arange(12.0), "b": [1, 2] * 6, "c": [1, 2, 3, 4] * 3})
604+
result = df.groupby(["b", "c"]).rolling(2).sum()
605+
expected_index = MultiIndex.from_tuples(
606+
[
607+
(1, 1, 0),
608+
(1, 1, 4),
609+
(1, 1, 8),
610+
(1, 3, 2),
611+
(1, 3, 6),
612+
(1, 3, 10),
613+
(2, 2, 1),
614+
(2, 2, 5),
615+
(2, 2, 9),
616+
(2, 4, 3),
617+
(2, 4, 7),
618+
(2, 4, 11),
619+
],
620+
names=["b", "c", None],
621+
)
622+
tm.assert_index_equal(result.index, expected_index)
623+
624+
# grouping with 1 level on dataframe with 2-level MI -> 3-level MI as result
625+
df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4, "c": [1, 2, 3, 4] * 2})
626+
df = df.set_index("c", append=True)
627+
result = df.groupby("b").rolling(3).mean()
628+
expected_index = MultiIndex.from_tuples(
629+
[
630+
(1, 0, 1),
631+
(1, 2, 3),
632+
(1, 4, 1),
633+
(1, 6, 3),
634+
(2, 1, 2),
635+
(2, 3, 4),
636+
(2, 5, 2),
637+
(2, 7, 4),
638+
],
639+
names=["b", None, "c"],
640+
)
641+
tm.assert_index_equal(result.index, expected_index)
642+
570643

571644
class TestExpanding:
572645
def setup_method(self):

pandas/tests/window/test_rolling.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -1085,8 +1085,15 @@ def test_groupby_rolling_nan_included():
10851085
result = df.groupby("group", dropna=False).rolling(1, min_periods=1).mean()
10861086
expected = DataFrame(
10871087
{"B": [0.0, 2.0, 3.0, 1.0, 4.0]},
1088-
index=MultiIndex.from_tuples(
1089-
[("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)],
1088+
# GH-38057 from_tuples puts the NaNs in the codes, result expects them
1089+
# to be in the levels, at the moment
1090+
# index=MultiIndex.from_tuples(
1091+
# [("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)],
1092+
# names=["group", None],
1093+
# ),
1094+
index=MultiIndex(
1095+
[["g1", "g2", np.nan], [0, 1, 2, 3, 4]],
1096+
[[0, 0, 1, 2, 2], [0, 2, 3, 1, 4]],
10901097
names=["group", None],
10911098
),
10921099
)

0 commit comments

Comments
 (0)