BUG/REG: RollingGroupby MultiIndex levels dropped (pandas-dev#38737)

mroeschke · simonjayhawkins · web-flow · commit a37f1a45e83d · 2020-12-29T13:56:07.000-05:00
Co-authored-by: Simon Hawkins &lt;simonjayhawkins@gmail.com&gt;
diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst
@@ -16,6 +16,7 @@ Fixed regressions
 ~~~~~~~~~~~~~~~~~
 - The deprecated attributes ``_AXIS_NAMES`` and ``_AXIS_NUMBERS`` of :class:`DataFrame` and :class:`Series` will no longer show up in ``dir`` or ``inspect.getmembers`` calls (:issue:`38740`)
 - :meth:`to_csv` created corrupted zip files when there were more rows than ``chunksize`` (issue:`38714`)
+- Fixed a regression in ``groupby().rolling()`` where :class:`MultiIndex` levels were dropped (:issue:`38523`)
 - Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`)
 -
 
diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py
@@ -108,7 +108,7 @@
     Note this does not influence the order of observations within each
     group. Groupby preserves the order of rows within each group.
 group_keys : bool, default True
-    When calling apply, add group keys to index to identify pieces.
+    When calling ``groupby().apply()``, add group keys to index to identify pieces.
 squeeze : bool, default False
     Reduce the dimensionality of the return type if possible,
     otherwise return a consistent type.
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
@@ -769,28 +769,22 @@ def _apply(
             numba_cache_key,
             **kwargs,
         )
-        # Reconstruct the resulting MultiIndex from tuples
+        # Reconstruct the resulting MultiIndex
         # 1st set of levels = group by labels
-        # 2nd set of levels = original index
-        # Ignore 2nd set of levels if a group by label include an index level
-        result_index_names = [
-            grouping.name for grouping in self._groupby.grouper._groupings
-        ]
-        grouped_object_index = None
+        # 2nd set of levels = original DataFrame/Series index
+        grouped_object_index = self.obj.index
+        grouped_index_name = [*grouped_object_index.names]
+        groupby_keys = [grouping.name for grouping in self._groupby.grouper._groupings]
+        result_index_names = groupby_keys + grouped_index_name
 
-        column_keys = [
+        drop_columns = [
             key
-            for key in result_index_names
+            for key in groupby_keys
             if key not in self.obj.index.names or key is None
         ]
-
-        if len(column_keys) == len(result_index_names):
-            grouped_object_index = self.obj.index
-            grouped_index_name = [*grouped_object_index.names]
-            result_index_names += grouped_index_name
-        else:
-            # Our result will have still kept the column in the result
-            result = result.drop(columns=column_keys, errors="ignore")
+        if len(drop_columns) != len(groupby_keys):
+            # Our result will have kept groupby columns which should be dropped
+            result = result.drop(columns=drop_columns, errors="ignore")
 
         codes = self._groupby.grouper.codes
         levels = self._groupby.grouper.levels
diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py
@@ -556,23 +556,31 @@ def test_groupby_rolling_nans_in_index(self, rollings, key):
         with pytest.raises(ValueError, match=f"{key} must be monotonic"):
             df.groupby("c").rolling("60min", **rollings)
 
-    def test_groupby_rolling_group_keys(self):
+    @pytest.mark.parametrize("group_keys", [True, False])
+    def test_groupby_rolling_group_keys(self, group_keys):
         # GH 37641
+        # GH 38523: GH 37641 actually was not a bug.
+        # group_keys only applies to groupby.apply directly
         arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]]
         index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2"))
 
         s = Series([1, 2, 3], index=index)
-        result = s.groupby(["idx1", "idx2"], group_keys=False).rolling(1).mean()
+        result = s.groupby(["idx1", "idx2"], group_keys=group_keys).rolling(1).mean()
         expected = Series(
             [1.0, 2.0, 3.0],
             index=MultiIndex.from_tuples(
-                [("val1", "val1"), ("val1", "val1"), ("val2", "val2")],
-                names=["idx1", "idx2"],
+                [
+                    ("val1", "val1", "val1", "val1"),
+                    ("val1", "val1", "val1", "val1"),
+                    ("val2", "val2", "val2", "val2"),
+                ],
+                names=["idx1", "idx2", "idx1", "idx2"],
             ),
         )
         tm.assert_series_equal(result, expected)
 
     def test_groupby_rolling_index_level_and_column_label(self):
+        # The groupby keys should not appear as a resulting column
         arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]]
         index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2"))
 
@@ -581,7 +589,12 @@ def test_groupby_rolling_index_level_and_column_label(self):
         expected = DataFrame(
             {"B": [0.0, 1.0, 2.0]},
             index=MultiIndex.from_tuples(
-                [("val1", 1), ("val1", 1), ("val2", 2)], names=["idx1", "A"]
+                [
+                    ("val1", 1, "val1", "val1"),
+                    ("val1", 1, "val1", "val1"),
+                    ("val2", 2, "val2", "val2"),
+                ],
+                names=["idx1", "A", "idx1", "idx2"],
             ),
         )
         tm.assert_frame_equal(result, expected)
@@ -640,6 +653,30 @@ def test_groupby_rolling_resulting_multiindex(self):
         )
         tm.assert_index_equal(result.index, expected_index)
 
+    def test_groupby_level(self):
+        # GH 38523
+        arrays = [
+            ["Falcon", "Falcon", "Parrot", "Parrot"],
+            ["Captive", "Wild", "Captive", "Wild"],
+        ]
+        index = MultiIndex.from_arrays(arrays, names=("Animal", "Type"))
+        df = DataFrame({"Max Speed": [390.0, 350.0, 30.0, 20.0]}, index=index)
+        result = df.groupby(level=0)["Max Speed"].rolling(2).sum()
+        expected = Series(
+            [np.nan, 740.0, np.nan, 50.0],
+            index=MultiIndex.from_tuples(
+                [
+                    ("Falcon", "Falcon", "Captive"),
+                    ("Falcon", "Falcon", "Wild"),
+                    ("Parrot", "Parrot", "Captive"),
+                    ("Parrot", "Parrot", "Wild"),
+                ],
+                names=["Animal", "Animal", "Type"],
+            ),
+            name="Max Speed",
+        )
+        tm.assert_series_equal(result, expected)
+
 
 class TestExpanding:
     def setup_method(self):

Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@ Fixed regressions`
`16`	`16`	`~~~~~~~~~~~~~~~~~`
`17`	`17`	- The deprecated attributes ``_AXIS_NAMES`` and ``_AXIS_NUMBERS`` of :class:`DataFrame` and :class:`Series` will no longer show up in ``dir`` or ``inspect.getmembers`` calls (:issue:`38740`)
`18`	`18`	- :meth:`to_csv` created corrupted zip files when there were more rows than ``chunksize`` (issue:`38714`)
	`19`	+- Fixed a regression in ``groupby().rolling()`` where :class:`MultiIndex` levels were dropped (:issue:`38523`)
`19`	`20`	- Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`)
`20`	`21`	`-`
`21`	`22`