Skip to content

BUG: RollingGroupby MultiIndex levels dropped #40701

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 5, 2021
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,7 @@ Groupby/resample/rolling
- Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes (:issue:`40164`)
- :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`)
- Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
- Bug in :class:`core.window.rolling.RollingGroupby` where a level of a :class:`MultiIndex` would be dropped if it overlapped with a groupby label (:issue:`38787`, :issue:`38523`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's make this a full sub-section and mention this goes back to 1.1.x behavior

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done


Reshaping
^^^^^^^^^
Expand Down
23 changes: 10 additions & 13 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,26 +577,23 @@ def _apply(
numba_cache_key,
**kwargs,
)
# Reconstruct the resulting MultiIndex from tuples
# Reconstruct the resulting MultiIndex
# 1st set of levels = group by labels
# 2nd set of levels = original index
# Ignore 2nd set of levels if a group by label includes an index level
result_index_names = copy.copy(self._grouper.names)
grouped_object_index = None
# 2nd set of levels = original DataFrame/Series index
grouped_object_index = self.obj.index
grouped_index_name = [*grouped_object_index.names]
groupby_keys = copy.copy(self._grouper.names)
result_index_names = groupby_keys + grouped_index_name

column_keys = [
drop_columns = [
key
for key in result_index_names
for key in self._grouper.names
if key not in self.obj.index.names or key is None
]

if len(column_keys) == len(result_index_names):
grouped_object_index = self.obj.index
grouped_index_name = [*grouped_object_index.names]
result_index_names += grouped_index_name
else:
if len(drop_columns) != len(groupby_keys):
# Our result will still have kept the column, so drop it below
result = result.drop(columns=column_keys, errors="ignore")
result = result.drop(columns=drop_columns, errors="ignore")

codes = self._grouper.codes
levels = copy.copy(self._grouper.levels)
Expand Down
47 changes: 42 additions & 5 deletions pandas/tests/window/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,23 +588,31 @@ def test_groupby_rolling_nans_in_index(self, rollings, key):
with pytest.raises(ValueError, match=f"{key} must be monotonic"):
df.groupby("c").rolling("60min", **rollings)

def test_groupby_rolling_group_keys(self):
@pytest.mark.parametrize("group_keys", [True, False])
def test_groupby_rolling_group_keys(self, group_keys):
# GH 37641
# GH 38523: GH 37641 actually was not a bug.
# group_keys only applies to groupby.apply directly
arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]]
index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2"))

s = Series([1, 2, 3], index=index)
result = s.groupby(["idx1", "idx2"], group_keys=False).rolling(1).mean()
result = s.groupby(["idx1", "idx2"], group_keys=group_keys).rolling(1).mean()
expected = Series(
[1.0, 2.0, 3.0],
index=MultiIndex.from_tuples(
[("val1", "val1"), ("val1", "val1"), ("val2", "val2")],
names=["idx1", "idx2"],
[
("val1", "val1", "val1", "val1"),
("val1", "val1", "val1", "val1"),
("val2", "val2", "val2", "val2"),
],
names=["idx1", "idx2", "idx1", "idx2"],
),
)
tm.assert_series_equal(result, expected)

def test_groupby_rolling_index_level_and_column_label(self):
# The groupby keys should not appear as a resulting column
arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]]
index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2"))

Expand All @@ -613,7 +621,12 @@ def test_groupby_rolling_index_level_and_column_label(self):
expected = DataFrame(
{"B": [0.0, 1.0, 2.0]},
index=MultiIndex.from_tuples(
[("val1", 1), ("val1", 1), ("val2", 2)], names=["idx1", "A"]
[
("val1", 1, "val1", "val1"),
("val1", 1, "val1", "val1"),
("val2", 2, "val2", "val2"),
],
names=["idx1", "A", "idx1", "idx2"],
),
)
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -695,6 +708,30 @@ def test_by_column_not_in_values(self, columns):
assert "A" not in result.columns
tm.assert_frame_equal(g.obj, original_obj)

def test_groupby_level(self):
# GH 38523, 38787
arrays = [
["Falcon", "Falcon", "Parrot", "Parrot"],
["Captive", "Wild", "Captive", "Wild"],
]
index = MultiIndex.from_arrays(arrays, names=("Animal", "Type"))
df = DataFrame({"Max Speed": [390.0, 350.0, 30.0, 20.0]}, index=index)
result = df.groupby(level=0)["Max Speed"].rolling(2).sum()
expected = Series(
[np.nan, 740.0, np.nan, 50.0],
index=MultiIndex.from_tuples(
[
("Falcon", "Falcon", "Captive"),
("Falcon", "Falcon", "Wild"),
("Parrot", "Parrot", "Captive"),
("Parrot", "Parrot", "Wild"),
],
names=["Animal", "Animal", "Type"],
),
name="Max Speed",
)
tm.assert_series_equal(result, expected)


class TestExpanding:
def setup_method(self):
Expand Down