From b8ac6ac5901b9cd4465094c0c7031ec91228764d Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 2 Oct 2020 21:27:31 +0200 Subject: [PATCH] Rolling on DataFrameGroupBy duplicated index column when part of the grouping cols is from index --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/window/rolling.py | 15 ++++++++++++ pandas/tests/window/test_grouper.py | 38 +++++++++++++++++++++++++++++ pandas/tests/window/test_rolling.py | 4 +-- 4 files changed, 55 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 016e8d90e7d21..b5fca1ec1dc98 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -404,6 +404,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) - Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) - Bug in :meth:`Rolling.sum()` returned wrong values when dtypes where mixed between float and integer and axis was equal to one (:issue:`20649`, :issue:`35596`) +- Bug in :meth:`DataFrameGroupBy.rolling` duplicating the index columns when part of the grouping columns came from the index (:issue:`36794`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 6ab42dda865e7..eebb3a2c81f86 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2209,6 +2209,20 @@ def _apply( grouped_index_name = [*grouped_object_index.names] groupby_keys = [grouping.name for grouping in self._groupby.grouper._groupings] result_index_names = groupby_keys + grouped_index_name + drop_levels = [] + obj = self._groupby._selected_obj + # We have to handle an unnamed Series differently, because we cannot tell + # whether the grouping column was the index or the Series column. 
+ if ( + isinstance(obj, ABCDataFrame) + or obj.name is not None + or obj.index.names != [None] + ): + drop_levels = [ + i for i, name in enumerate(groupby_keys) if name in grouped_index_name + ] + elif self._groupby.level is not None: + drop_levels = com.maybe_make_list(self._groupby.level) result_index_data = [] for key, values in self._groupby.grouper.indices.items(): @@ -2222,6 +2236,7 @@ def _apply( result_index = MultiIndex.from_tuples( result_index_data, names=result_index_names ) + result_index = result_index.droplevel(drop_levels) result.index = result_index return result diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 0eebd657e97b7..e7d9d81da75bf 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -416,3 +416,41 @@ def test_groupby_rolling_empty_frame(self): result = expected.groupby(["s1", "s2"]).rolling(window=1).sum() expected.index = pd.MultiIndex.from_tuples([], names=["s1", "s2", None]) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("groupings", [{"level": 0}, {"by": "groupby_col"}]) + def test_groupby_rolling_index_in_by_columns(self, groupings): + # GH: 36794 + df = pd.DataFrame( + {"groupby_col": [1, 1, 1, 2, 2, 2], "agg_col": [1, 1, 0, 0, 1, 0]} + ).set_index("groupby_col") + result = df.groupby(**groupings).rolling(2).sum() + expected = pd.DataFrame( + { + "groupby_col": [1, 1, 1, 2, 2, 2], + "agg_col": [np.nan, 2.0, 1.0, np.nan, 1.0, 1.0], + } + ).set_index("groupby_col") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("index_name", ["index", None]) + def test_groupby_rolling_unnamed_series(self, index_name): + # GH: 36794 + ds = pd.Series([1, 2, 3, 4], index=pd.Index([0, 1, 2, 3], name=index_name)) + result = ds.groupby(ds).rolling(2).sum() + expected = pd.Series( + [np.nan] * 4, + index=pd.MultiIndex.from_tuples( + [(1, 0), (2, 1), (3, 2), (4, 3)], names=[None, index_name] + ), + ) + 
tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("groupings", [{"level": 0}, {"by": "index"}]) + def test_groupby_rolling_index_of_unnamed_series(self, groupings): + # GH: 36794 + ds = pd.Series([1, 2, 3, 4], index=pd.Index([0, 0, 1, 1], name="index")) + result = ds.groupby(**groupings).rolling(2).sum() + expected = pd.Series( + [np.nan, 3.0, np.nan, 7.0], index=pd.Index([0, 0, 1, 1], name="index") + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 9ac4871ad24a1..a9ada97227205 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -675,9 +675,7 @@ def test_iter_rolling_datetime(expected, expected_index, window): [ ( {"level": 0}, - pd.MultiIndex.from_tuples( - [(0, 0), (0, 0), (1, 1), (1, 1), (1, 1)], names=[None, None] - ), + pd.Index([0, 0, 1, 1, 1]), ), ( {"by": "X"},