diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 720ce7af47a18..60b1805abbdb4 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -167,6 +167,35 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss ... KeyError: Timestamp('1970-01-01 00:00:00') +GroupBy.rolling no longer returns grouped-by column in values +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Suppose we start with + +.. ipython:: python + + df = pd.DataFrame({"A": [1, 1, 2, 3], "B": [0, 1, 2, 3]}) + df + +*Previous behavior*: + +.. code-block:: ipython + + In [1]: df.groupby("A").rolling(2).sum() + Out[1]: + A B + A + 1 0 NaN NaN + 1 2.0 1.0 + 2 2 NaN NaN + 3 3 NaN NaN + +*New behavior*: + +.. ipython:: python + + df.groupby("A").rolling(2).sum() + .. --------------------------------------------------------------------------- .. _whatsnew_110.api_breaking.assignment_to_multiple_columns: diff --git a/pandas/core/base.py b/pandas/core/base.py index e1c6bef66239d..a7aa6b39ea96d 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -209,11 +209,18 @@ def _obj_with_exclusions(self): if self._selection is not None and isinstance(self.obj, ABCDataFrame): return self.obj.reindex(columns=self._selection_list) - if len(self.exclusions) > 0: - return self.obj.drop(self.exclusions, axis=1) - else: + elif not self.exclusions or not isinstance(self.obj, ABCDataFrame): return self.obj + # there may be elements in self.exclusions that are no longer + # in self.obj, see GH 32468 + nlevels = self.obj.columns.nlevels + unique_column_names = { + j for i in range(nlevels) for j in self.obj.columns.get_level_values(i) + } + exclusions = self.exclusions.intersection(unique_column_names) + return self.obj.drop(exclusions, axis=1) + def __getitem__(self, key): if self._selection is not None: raise IndexError(f"Column(s) {self._selection} already selected") diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 19e51d05feb92..7cf6e9106ec0d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1577,6 +1577,7 @@ def rolling(self, *args, **kwargs): """ from pandas.core.window import RollingGroupby + kwargs["exclusions"] = self.exclusions return RollingGroupby(self, *args, **kwargs) @Substitution(name="groupby") diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index ed0b816f64800..bbaa345c85970 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -35,6 +35,8 @@ def _dispatch(name: str, *args, **kwargs): def outer(self, *args, **kwargs): def f(x): x = self._shallow_copy(x, groupby=self._groupby) + # patch for GH 32332 + x.obj = x._obj_with_exclusions return getattr(x, name)(*args, **kwargs) return self._groupby.apply(f) @@ -82,6 +84,8 @@ def _apply( # TODO: can we de-duplicate with _dispatch? def f(x, name=name, *args): x = self._shallow_copy(x) + # patch for GH 32332 + x.obj = x._obj_with_exclusions if isinstance(name, str): return getattr(x, name)(*args, **kwargs) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 0ec876583dcde..2da3f4008a7f5 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -142,6 +142,7 @@ def __init__( adjust=True, ignore_na=False, axis=0, + **kwargs, ): self.obj = obj self.com = _get_center_of_mass(com, span, halflife, alpha) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 3784989de10ab..de20e61c304e3 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -93,6 +93,10 @@ def __init__( self.axis = obj._get_axis_number(axis) if axis is not None else None self.validate() self._numba_func_cache: Dict[Optional[str], Callable] = dict() + self.exclusions = kwargs.get("exclusions", set()) + + def _shallow_copy(self, obj: FrameOrSeries, **kwargs) -> ShallowMixin: + return super()._shallow_copy(obj, exclusions=self.exclusions, **kwargs) @property def _constructor(self): @@ -1187,8 +1191,7 @@ def count(self): closed=self.closed, ).sum() results.append(result) - - return self._wrap_results(results, blocks, obj) + return self._wrap_results(results, blocks, obj, exclude=self.exclusions) _shared_docs["apply"] = dedent( r""" @@ -1632,6 +1635,8 @@ def cov(self, other=None, pairwise=None, ddof=1, **kwargs): # only default unset pairwise = True if pairwise is None else pairwise other = self._shallow_copy(other) + # patch for GH 32332 + other.obj = other._obj_with_exclusions # GH 16058: offset window if self.is_freq_type: @@ -1775,6 +1780,8 @@ def corr(self, other=None, pairwise=None, **kwargs): # only default unset pairwise = True if pairwise is None else pairwise other = self._shallow_copy(other) + # patch for GH 32332 + other.obj = other._obj_with_exclusions window = self._get_window(other) if not self.is_freq_type else self.win_freq def _get_corr(a, b): diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 5b2687271f9d6..f979099e21cb6 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -62,11 +62,15 @@ def test_rolling(self): for f in ["sum", "mean", "min", "max", "count", "kurt", "skew"]: result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.rolling(4), f)()) + # groupby.apply doesn't drop the grouped-by column + expected = expected.drop("A", axis=1) tm.assert_frame_equal(result, expected) for f in ["std", "var"]: result = getattr(r, f)(ddof=1) expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) + # groupby.apply doesn't drop the grouped-by column + expected = expected.drop("A", axis=1) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -79,6 +83,8 @@ def test_rolling_quantile(self, interpolation): expected = g.apply( lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) ) + # groupby.apply doesn't drop the grouped-by column + expected = expected.drop("A", axis=1) tm.assert_frame_equal(result, expected) def test_rolling_corr_cov(self): @@ -92,6 +98,8 @@ def func(x): return getattr(x.rolling(4), f)(self.frame) expected = g.apply(func) + # groupby.apply doesn't drop the grouped-by column + expected = expected.drop("A", axis=1) tm.assert_frame_equal(result, expected) result = getattr(r.B, f)(pairwise=True) @@ -109,6 +117,8 @@ def test_rolling_apply(self, raw): # reduction result = r.apply(lambda x: x.sum(), raw=raw) expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) + # groupby.apply doesn't drop the grouped-by column + expected = expected.drop("A", axis=1) tm.assert_frame_equal(result, expected) def test_rolling_apply_mutability(self): diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index ab2c7fcb7a0dc..2e8d5a6772e08 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -465,3 +465,18 @@ def test_rolling_count_default_min_periods_with_null_values(constructor): result = constructor(values).rolling(3).count() expected = constructor(expected_counts) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "columns", [pd.MultiIndex.from_tuples([("A", ""), ("B", "C")]), ["A", "B"]] +) +def test_by_column_not_in_values(columns): + # GH 32262 + df = pd.DataFrame([[1, 0]] * 20 + [[2, 0]] * 12 + [[3, 0]] * 8, columns=columns) + + g = df.groupby("A") + original_obj = g.obj.copy(deep=True) + r = g.rolling(4) + result = r.sum() + assert "A" not in result.columns + tm.assert_frame_equal(g.obj, original_obj) # check for side-effects