From 2bcc559388f590ab03afe39b971bb407d0f16dcb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 30 Mar 2021 23:24:11 -0700 Subject: [PATCH 1/2] BUG: RollingGroupby MultiIndex levels dropped --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/core/window/rolling.py | 23 ++++++-------- pandas/tests/window/test_groupby.py | 47 ++++++++++++++++++++++++++--- 3 files changed, 53 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 63902b53ea36d..196cf2ffffc37 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -692,6 +692,7 @@ Groupby/resample/rolling - Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes (:issue:`40164`) - :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`) - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`) +- Bug in :class:`core.window.rolling.RollingGroupby` where a level of a :class:`MultiIndex` would be dropped if it overlapped with a groupby label (:issue:`38787`, :issue:`38523`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index b482934dd25d2..6a0cb117ebca5 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -577,26 +577,23 @@ def _apply( numba_cache_key, **kwargs, ) - # Reconstruct the resulting MultiIndex from tuples + # Reconstruct the resulting MultiIndex # 1st set of levels = group by labels - # 2nd set of levels = original index - # Ignore 2nd set of levels if a group by label include an index level - result_index_names = copy.copy(self._grouper.names) - grouped_object_index = None + # 2nd set of levels = original DataFrame/Series index + grouped_object_index = 
self.obj.index + grouped_index_name = [*grouped_object_index.names] + groupby_keys = copy.copy(self._grouper.names) + result_index_names = groupby_keys + grouped_index_name - column_keys = [ + drop_columns = [ key - for key in result_index_names + for key in self._grouper.names if key not in self.obj.index.names or key is None ] - if len(column_keys) == len(result_index_names): - grouped_object_index = self.obj.index - grouped_index_name = [*grouped_object_index.names] - result_index_names += grouped_index_name - else: + if len(drop_columns) != len(groupby_keys): # Our result will have still kept the column in the result - result = result.drop(columns=column_keys, errors="ignore") + result = result.drop(columns=drop_columns, errors="ignore") codes = self._grouper.codes levels = copy.copy(self._grouper.levels) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 5c2f69a9247e9..dd988a4abd9e1 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -588,23 +588,31 @@ def test_groupby_rolling_nans_in_index(self, rollings, key): with pytest.raises(ValueError, match=f"{key} must be monotonic"): df.groupby("c").rolling("60min", **rollings) - def test_groupby_rolling_group_keys(self): + @pytest.mark.parametrize("group_keys", [True, False]) + def test_groupby_rolling_group_keys(self, group_keys): # GH 37641 + # GH 38523: GH 37641 actually was not a bug. 
+ # group_keys only applies to groupby.apply directly arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]] index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2")) s = Series([1, 2, 3], index=index) - result = s.groupby(["idx1", "idx2"], group_keys=False).rolling(1).mean() + result = s.groupby(["idx1", "idx2"], group_keys=group_keys).rolling(1).mean() expected = Series( [1.0, 2.0, 3.0], index=MultiIndex.from_tuples( - [("val1", "val1"), ("val1", "val1"), ("val2", "val2")], - names=["idx1", "idx2"], + [ + ("val1", "val1", "val1", "val1"), + ("val1", "val1", "val1", "val1"), + ("val2", "val2", "val2", "val2"), + ], + names=["idx1", "idx2", "idx1", "idx2"], ), ) tm.assert_series_equal(result, expected) def test_groupby_rolling_index_level_and_column_label(self): + # The groupby keys should not appear as a resulting column arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]] index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2")) @@ -613,7 +621,12 @@ def test_groupby_rolling_index_level_and_column_label(self): expected = DataFrame( {"B": [0.0, 1.0, 2.0]}, index=MultiIndex.from_tuples( - [("val1", 1), ("val1", 1), ("val2", 2)], names=["idx1", "A"] + [ + ("val1", 1, "val1", "val1"), + ("val1", 1, "val1", "val1"), + ("val2", 2, "val2", "val2"), + ], + names=["idx1", "A", "idx1", "idx2"], ), ) tm.assert_frame_equal(result, expected) @@ -695,6 +708,30 @@ def test_by_column_not_in_values(self, columns): assert "A" not in result.columns tm.assert_frame_equal(g.obj, original_obj) + def test_groupby_level(self): + # GH 38523, 38787 + arrays = [ + ["Falcon", "Falcon", "Parrot", "Parrot"], + ["Captive", "Wild", "Captive", "Wild"], + ] + index = MultiIndex.from_arrays(arrays, names=("Animal", "Type")) + df = DataFrame({"Max Speed": [390.0, 350.0, 30.0, 20.0]}, index=index) + result = df.groupby(level=0)["Max Speed"].rolling(2).sum() + expected = Series( + [np.nan, 740.0, np.nan, 50.0], + index=MultiIndex.from_tuples( + [ + ("Falcon", "Falcon", 
"Captive"), + ("Falcon", "Falcon", "Wild"), + ("Parrot", "Parrot", "Captive"), + ("Parrot", "Parrot", "Wild"), + ], + names=["Animal", "Animal", "Type"], + ), + name="Max Speed", + ) + tm.assert_series_equal(result, expected) + class TestExpanding: def setup_method(self): From 6974f9c094475dcefe6a754f45d1be03e8dd57f1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 2 Apr 2021 21:51:01 -0700 Subject: [PATCH 2/2] promote whatsnew to section --- doc/source/whatsnew/v1.3.0.rst | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ff5967907dabb..e1358b433a0b9 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -400,6 +400,38 @@ However, floating point artifacts may now exist in the results when rolling over s = pd.Series([7, 5, 5, 5]) s.rolling(3).var() +.. _whatsnew_130.notable_bug_fixes.rolling_groupby_multiindex: + +GroupBy.rolling with MultiIndex no longer drops levels in the result +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`core.window.rolling.RollingGroupby` will no longer drop levels of a :class:`DataFrame` +with a :class:`MultiIndex` in the result. This can lead to a perceived duplication of levels in the resulting +:class:`MultiIndex`, but this change restores the behavior that was present in version 1.1.3 (:issue:`38787`, :issue:`38523`). + + +.. ipython:: python + + index = pd.MultiIndex.from_tuples([('idx1', 'idx2')], names=['label1', 'label2']) + df = pd.DataFrame({'a': [1], 'b': [2]}, index=index) + df + +*Previous behavior*: + +.. code-block:: ipython + + In [1]: df.groupby('label1').rolling(1).sum() + Out[1]: + a b + label1 + idx1 1.0 2.0 + +*New behavior*: + +.. ipython:: python + + df.groupby('label1').rolling(1).sum() + .. 
_whatsnew_130.api_breaking.deps: @@ -720,7 +752,6 @@ Groupby/resample/rolling - Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes (:issue:`40164`) - :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`) - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`) -- Bug in :class:`core.window.rolling.RollingGroupby` where a level of a :class:`MultiIndex` would be dropped if it overlapped with a groupby label (:issue:`38787`, :issue:`38523`) Reshaping ^^^^^^^^^