BUG: rolling.groupby with on and __getitem__ doesn't mutate underlying object (pandas-dev#43374)

mroeschke · feefladder · commit 69beb8cc8a01 · 2021-09-07T07:54:12.000+02:00
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -408,6 +408,7 @@ Groupby/resample/rolling
 - Bug in :meth:`pandas.DataFrame.rolling` operation along rows (``axis=1``) incorrectly omits columns containing ``float16`` and ``float32`` (:issue:`41779`)
 - Bug in :meth:`Resampler.aggregate` did not allow the use of Named Aggregation (:issue:`32803`)
 - Bug in :meth:`Series.rolling` when the :class:`Series` ``dtype`` was ``Int64`` (:issue:`43016`)
+- Bug in :meth:`DataFrame.groupby.rolling` where specifying ``on`` and calling ``__getitem__`` would cause subsequent operations to return incorrect results (:issue:`43355`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
@@ -766,7 +766,8 @@ def _gotitem(self, key, ndim, subset=None):
         # here so our index is carried through to the selected obj
         # when we do the splitting for the groupby
         if self.on is not None:
-            self.obj = self.obj.set_index(self._on)
+            # GH 43355: pass re-indexed frame as subset; don't mutate self.obj
+            subset = self.obj.set_index(self._on)
         return super()._gotitem(key, ndim, subset=subset)
 
     def _validate_monotonic(self):
diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py
@@ -1019,3 +1019,42 @@ def test_times_array(self, times_frame):
             result = gb.ewm(halflife=halflife, times="C").mean()
             expected = gb.ewm(halflife=halflife, times=times_frame["C"].values).mean()
         tm.assert_frame_equal(result, expected)
+
+    def test_dont_mutate_obj_after_slicing(self):
+        # GH 43355: slicing a groupby-rolling object must not mutate it
+        df = DataFrame(
+            {
+                "id": ["a", "a", "b", "b", "b"],
+                "timestamp": date_range("2021-9-1", periods=5, freq="H"),
+                "y": range(5),
+            }
+        )
+        grp = df.groupby("id").rolling("1H", on="timestamp")
+        result = grp.count()
+        expected_df = DataFrame(
+            {
+                "timestamp": date_range("2021-9-1", periods=5, freq="H"),
+                "y": [1.0] * 5,
+            },
+            index=MultiIndex.from_arrays(
+                [["a", "a", "b", "b", "b"], list(range(5))], names=["id", None]
+            ),
+        )
+        tm.assert_frame_equal(result, expected_df)
+
+        result = grp["y"].count()
+        expected_series = Series(
+            [1.0] * 5,
+            index=MultiIndex.from_arrays(
+                [
+                    ["a", "a", "b", "b", "b"],
+                    date_range("2021-9-1", periods=5, freq="H"),
+                ],
+                names=["id", "timestamp"],
+            ),
+            name="y",
+        )
+        tm.assert_series_equal(result, expected_series)
+        # Key check: grp["y"] above must not have changed what grp.count() returns
+        result = grp.count()
+        tm.assert_frame_equal(result, expected_df)