
DEPR: DataFrameGroupBy.apply operating on the group keys #52477


Merged 10 commits on Apr 12, 2023
4 changes: 2 additions & 2 deletions doc/source/user_guide/cookbook.rst
@@ -459,7 +459,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
df

# List the size of the animals with the highest weight.
df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()])
df.groupby("animal")[["size", "weight"]].apply(lambda subf: subf["size"][subf["weight"].idxmax()])

`Using get_group
<https://stackoverflow.com/questions/14734533/how-to-access-pandas-groupby-dataframe-by-key>`__
@@ -482,7 +482,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
return pd.Series(["L", avg_weight, True], index=["size", "weight", "adult"])


expected_df = gb.apply(GrowUp)
expected_df = gb[["size", "weight"]].apply(GrowUp)
expected_df

`Expanding apply
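The cookbook change above can be exercised directly. A minimal sketch with hypothetical data: selecting the non-grouping columns after ``groupby`` keeps the ``animal`` key out of the sub-frame passed to ``apply``, so the deprecated behavior is never triggered on any pandas version.

```python
import pandas as pd

# Hypothetical data for the cookbook recipe above.
df = pd.DataFrame(
    {
        "animal": ["cat", "dog", "cat", "dog"],
        "size": ["S", "S", "M", "L"],
        "weight": [8, 10, 11, 3],
    }
)

# Selecting columns after groupby excludes the grouping column from
# the sub-DataFrame passed to apply, avoiding the FutureWarning.
result = df.groupby("animal")[["size", "weight"]].apply(
    lambda subf: subf["size"][subf["weight"].idxmax()]
)
print(result)
```

The result is a Series indexed by ``animal`` holding the size of each group's heaviest row.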
14 changes: 10 additions & 4 deletions doc/source/user_guide/groupby.rst
@@ -430,6 +430,12 @@ This is mainly syntactic sugar for the alternative, which is much more verbose:
Additionally, this method avoids recomputing the internal grouping information
derived from the passed key.

You can also include the grouping columns if you want to operate on them.

.. ipython:: python

grouped[["A", "B"]].sum()

.. _groupby.iterating-label:

Iterating through groups
@@ -1067,7 +1073,7 @@ missing values with the ``ffill()`` method.
).set_index("date")
df_re

df_re.groupby("group").resample("1D").ffill()
df_re.groupby("group")[["val"]].resample("1D").ffill()

.. _groupby.filter:

@@ -1233,13 +1239,13 @@ the argument ``group_keys`` which defaults to ``True``. Compare

.. ipython:: python

df.groupby("A", group_keys=True).apply(lambda x: x)
df.groupby("A", group_keys=True)[["B", "C", "D"]].apply(lambda x: x)

with

.. ipython:: python

df.groupby("A", group_keys=False).apply(lambda x: x)
df.groupby("A", group_keys=False)[["B", "C", "D"]].apply(lambda x: x)


Numba Accelerated Routines
@@ -1722,7 +1728,7 @@ column index name will be used as the name of the inserted column:
result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()}
return pd.Series(result, name="metrics")

result = df.groupby("a").apply(compute_metrics)
result = df.groupby("a")[["b", "c"]].apply(compute_metrics)

result

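The ``group_keys`` comparison edited above is easy to check concretely. A small sketch (hypothetical frame): with ``group_keys=True`` the group labels are prepended as an extra index level, while ``group_keys=False`` leaves the original index untouched.

```python
import pandas as pd

df = pd.DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3]})

# group_keys=True prepends the group labels as an index level;
# group_keys=False keeps the original row index as-is.
kept = df.groupby("A", group_keys=True)[["B"]].apply(lambda x: x)
dropped = df.groupby("A", group_keys=False)[["B"]].apply(lambda x: x)

print(kept.index.nlevels, dropped.index.nlevels)
```

Selecting ``[["B"]]`` first, as the diff does, keeps the grouping column out of the applied frame in both cases.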
22 changes: 17 additions & 5 deletions doc/source/whatsnew/v0.14.0.rst
@@ -328,13 +328,25 @@ More consistent behavior for some groupby methods:

- groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation:

.. ipython:: python
.. code-block:: ipython

df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
g = df.groupby('A')
g.head(1) # filters DataFrame
In [1]: df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])

In [2]: g = df.groupby('A')

In [3]: g.head(1) # filters DataFrame
Out[3]:
A B
0 1 2
2 5 6

In [4]: g.apply(lambda x: x.head(1)) # used to simply fall-through
Out[4]:
A B
A
1 0 1 2
5 2 5 6

g.apply(lambda x: x.head(1)) # used to simply fall-through

- groupby head and tail respect column selection:

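The head/tail-as-filter behavior frozen into the ``code-block`` above still holds today and can be sketched with the same data: ``head(1)`` keeps the original index and adds no group level, unlike ``apply(lambda x: x.head(1))``.

```python
import pandas as pd

df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
g = df.groupby("A")

# head(1) acts as a filter: rows keep their original positions 0 and 2,
# and no group key is inserted into the index.
filtered = g.head(1)
print(filtered)
```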
93 changes: 87 additions & 6 deletions doc/source/whatsnew/v0.18.1.rst
@@ -77,9 +77,52 @@ Previously you would have to do this to get a rolling window mean per-group:
df = pd.DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)})
df

.. ipython:: python
.. code-block:: ipython

df.groupby("A").apply(lambda x: x.rolling(4).B.mean())
In [1]: df.groupby("A").apply(lambda x: x.rolling(4).B.mean())
Out[1]:
A
1 0 NaN
1 NaN
2 NaN
3 1.5
4 2.5
5 3.5
6 4.5
7 5.5
8 6.5
9 7.5
10 8.5
11 9.5
12 10.5
13 11.5
14 12.5
15 13.5
16 14.5
17 15.5
18 16.5
19 17.5
2 20 NaN
21 NaN
22 NaN
23 21.5
24 22.5
25 23.5
26 24.5
27 25.5
28 26.5
29 27.5
30 28.5
31 29.5
3 32 NaN
33 NaN
34 NaN
35 33.5
36 34.5
37 35.5
38 36.5
39 37.5
Name: B, dtype: float64

Now you can do:

@@ -101,15 +144,53 @@ For ``.resample(..)`` type of operations, previously you would have to:

df

.. ipython:: python
.. code-block:: ipython

df.groupby("group").apply(lambda x: x.resample("1D").ffill())
In [1]: df.groupby("group").apply(lambda x: x.resample("1D").ffill())
Out[1]:
group val
group date
1 2016-01-03 1 5
2016-01-04 1 5
2016-01-05 1 5
2016-01-06 1 5
2016-01-07 1 5
2016-01-08 1 5
2016-01-09 1 5
2016-01-10 1 6
2 2016-01-17 2 7
2016-01-18 2 7
2016-01-19 2 7
2016-01-20 2 7
2016-01-21 2 7
2016-01-22 2 7
2016-01-23 2 7
2016-01-24 2 8

Now you can do:

.. ipython:: python
.. code-block:: ipython

df.groupby("group").resample("1D").ffill()
In [1]: df.groupby("group").resample("1D").ffill()
Out[1]:
group val
group date
1 2016-01-03 1 5
2016-01-04 1 5
2016-01-05 1 5
2016-01-06 1 5
2016-01-07 1 5
2016-01-08 1 5
2016-01-09 1 5
2016-01-10 1 6
2 2016-01-17 2 7
2016-01-18 2 7
2016-01-19 2 7
2016-01-20 2 7
2016-01-21 2 7
2016-01-22 2 7
2016-01-23 2 7
2016-01-24 2 8

.. _whatsnew_0181.enhancements.method_chain:

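The frozen whatsnew output above corresponds to the pattern this PR now recommends: select the value column(s) after ``groupby`` so the grouping column stays out of the resampled frame. A minimal sketch with hypothetical data:

```python
import pandas as pd

# Hypothetical frame mirroring the whatsnew example: one group with
# two observations a week apart.
df = pd.DataFrame(
    {
        "date": pd.to_datetime(["2016-01-03", "2016-01-10"]),
        "group": [1, 1],
        "val": [5, 6],
    }
).set_index("date")

# Selecting ["val"] after groupby keeps "group" out of the resampled
# frame, so the new deprecation never fires.
out = df.groupby("group")[["val"]].resample("1D").ffill()
print(out)
```

The result has a (group, date) MultiIndex with one row per day from Jan 3 through Jan 10.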
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
@@ -149,6 +149,7 @@ Other API changes

Deprecations
~~~~~~~~~~~~
- Deprecated :meth:`.DataFrameGroupBy.apply` and methods on the objects returned by :meth:`.DataFrameGroupBy.resample` operating on the grouping column(s); select the columns to operate on after groupby to either explicitly include or exclude the groupings and avoid the ``FutureWarning`` (:issue:`7155`)
- Deprecated silently dropping unrecognized timezones when parsing strings to datetimes (:issue:`18702`)
- Deprecated :meth:`DataFrame._data` and :meth:`Series._data`, use public APIs instead (:issue:`33333`)
- Deprecated :meth:`.Groupby.all` and :meth:`.GroupBy.any` with datetime64 or :class:`PeriodDtype` values, matching the :class:`Series` and :class:`DataFrame` deprecations (:issue:`34479`)
20 changes: 20 additions & 0 deletions pandas/core/groupby/groupby.py
@@ -1487,6 +1487,16 @@ def f(g):
with option_context("mode.chained_assignment", None):
try:
result = self._python_apply_general(f, self._selected_obj)
if (
not isinstance(self.obj, Series)
and self._selection is None
and self._selected_obj.shape != self._obj_with_exclusions.shape
):
warnings.warn(
message=_apply_groupings_depr.format(type(self).__name__),
category=FutureWarning,
stacklevel=find_stack_level(),
)
except TypeError:
# gh-20949
# try again, with .apply acting as a filtering
@@ -4313,3 +4323,13 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde
else:
mi = MultiIndex.from_product([idx, qs])
return mi


# GH#7155
_apply_groupings_depr = (
"{}.apply operated on the grouping columns. This behavior is deprecated, "
"and in a future version of pandas the grouping columns will be excluded "
    "from the operation. Select the columns to operate on after groupby to "
"either explicitly include or exclude the groupings and silence "
"this warning."
)
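From the user's side, the remedy the new message describes is plain column selection. A minimal sketch (hypothetical data): with the columns selected explicitly, no ``FutureWarning`` is emitted even under warnings-as-errors.

```python
import warnings

import pandas as pd

df = pd.DataFrame({"A": [1, 1, 2], "B": [3, 4, 5]})

# Explicit column selection after groupby never touches the grouping
# column, so escalating FutureWarning to an error is safe here.
with warnings.catch_warnings():
    warnings.simplefilter("error", FutureWarning)
    res = df.groupby("A")[["B"]].apply(lambda x: x.sum())
print(res)
```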
36 changes: 32 additions & 4 deletions pandas/core/resample.py
@@ -33,7 +33,10 @@
Substitution,
doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._exceptions import (
find_stack_level,
rewrite_warning,
)

from pandas.core.dtypes.generic import (
ABCDataFrame,
@@ -52,6 +55,7 @@
from pandas.core.groupby.groupby import (
BaseGroupBy,
GroupBy,
_apply_groupings_depr,
_pipe_template,
get_groupby,
)
@@ -420,6 +424,9 @@ def _groupby_and_aggregate(self, how, *args, **kwargs):
obj, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys
)

target_message = "DataFrameGroupBy.apply operated on the grouping columns"
new_message = _apply_groupings_depr.format(type(self).__name__)

try:
if callable(how):
# TODO: test_resample_apply_with_additional_args fails if we go
@@ -436,7 +443,12 @@
# a DataFrame column, but aggregate_item_by_item operates column-wise
# on Series, raising AttributeError or KeyError
# (depending on whether the column lookup uses getattr/__getitem__)
result = grouped.apply(how, *args, **kwargs)
with rewrite_warning(
target_message=target_message,
target_category=FutureWarning,
new_message=new_message,
):
result = grouped.apply(how, *args, **kwargs)

except ValueError as err:
if "Must produce aggregated value" in str(err):
@@ -448,7 +460,12 @@

# we have a non-reducing function
# try to evaluate
result = grouped.apply(how, *args, **kwargs)
with rewrite_warning(
target_message=target_message,
target_category=FutureWarning,
new_message=new_message,
):
result = grouped.apply(how, *args, **kwargs)

return self._wrap_result(result)

@@ -1344,7 +1361,18 @@ def func(x):

return x.apply(f, *args, **kwargs)

result = self._groupby.apply(func)
msg = (
"DataFrameGroupBy.resample operated on the grouping columns. "
"This behavior is deprecated, and in a future version of "
"pandas the grouping columns will be excluded from the operation. "
"Subset the data to exclude the groupings and silence this warning."
)
with rewrite_warning(
target_message="DataFrameGroupBy.apply operated on the grouping columns",
target_category=FutureWarning,
new_message=msg,
):
result = self._groupby.apply(func)
return self._wrap_result(result)

_upsample = _apply
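``rewrite_warning`` is a pandas-internal context manager, so its exact signature shouldn't be relied on outside pandas. The idea it implements can be sketched independently: record warnings raised by a callable, then re-emit the one matching a target message under a clearer message. Everything here (the ``rewritten`` helper, ``noisy``) is hypothetical illustration, not pandas API.

```python
import warnings


def rewritten(target_message, new_message, func):
    """Run func; re-emit a matching FutureWarning with new_message."""
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        result = func()
    for w in caught:
        if issubclass(w.category, FutureWarning) and target_message in str(w.message):
            # Swap in the context-appropriate message.
            warnings.warn(new_message, FutureWarning, stacklevel=2)
        else:
            # Pass every other warning through unchanged.
            warnings.warn_explicit(w.message, w.category, w.filename, w.lineno)
    return result


def noisy():
    warnings.warn("Old.apply operated on the grouping columns", FutureWarning)
    return 42
```

This mirrors why the resample paths above wrap ``grouped.apply``: the warning originates in groupby code, but the user called a resample method, so the message is rewritten to name the API they actually used.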
4 changes: 2 additions & 2 deletions pandas/core/reshape/pivot.py
@@ -457,7 +457,7 @@ def _all_key():
return (margins_name,) + ("",) * (len(cols) - 1)

if len(rows) > 0:
margin = data[rows].groupby(rows, observed=observed).apply(aggfunc)
margin = data.groupby(rows, observed=observed)[rows].apply(aggfunc)
all_key = _all_key()
table[all_key] = margin
result = table
@@ -475,7 +475,7 @@ def _all_key():
margin_keys = table.columns

if len(cols):
row_margin = data[cols].groupby(cols, observed=observed).apply(aggfunc)
row_margin = data.groupby(cols, observed=observed)[cols].apply(aggfunc)
else:
row_margin = Series(np.nan, index=result.columns)

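Note the direction of the pivot fix above: it selects the grouping columns *after* ``groupby`` (``data.groupby(rows)[rows]``), explicitly opting in to operating on them so the old margins behavior is preserved without a warning. A minimal sketch of that opt-in pattern with hypothetical data:

```python
import pandas as pd

df = pd.DataFrame({"a": ["x", "x", "y"], "b": [1, 2, 3]})

# Selecting the grouping column after groupby explicitly includes it
# in the frame passed to apply, which is the warning-free way to keep
# the old "operate on the keys" behavior.
margin = df.groupby("a")[["a"]].apply(len)
print(margin)
```

Here ``apply(len)`` returns the group sizes, as the margins code does with its ``aggfunc``.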
8 changes: 6 additions & 2 deletions pandas/tests/extension/base/groupby.py
@@ -99,9 +99,13 @@ def test_groupby_extension_transform(self, data_for_grouping):

def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
df.groupby("B", group_keys=False).apply(groupby_apply_op)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(FutureWarning, match=msg):
df.groupby("B", group_keys=False).apply(groupby_apply_op)
df.groupby("B", group_keys=False).A.apply(groupby_apply_op)
df.groupby("A", group_keys=False).apply(groupby_apply_op)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(FutureWarning, match=msg):
df.groupby("A", group_keys=False).apply(groupby_apply_op)
df.groupby("A", group_keys=False).B.apply(groupby_apply_op)

def test_groupby_apply_identity(self, data_for_grouping):
8 changes: 6 additions & 2 deletions pandas/tests/extension/test_boolean.py
@@ -298,9 +298,13 @@ def test_groupby_extension_transform(self, data_for_grouping):

def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
df.groupby("B", group_keys=False).apply(groupby_apply_op)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(FutureWarning, match=msg):
df.groupby("B", group_keys=False).apply(groupby_apply_op)
df.groupby("B", group_keys=False).A.apply(groupby_apply_op)
df.groupby("A", group_keys=False).apply(groupby_apply_op)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(FutureWarning, match=msg):
df.groupby("A", group_keys=False).apply(groupby_apply_op)
df.groupby("A", group_keys=False).B.apply(groupby_apply_op)

def test_groupby_apply_identity(self, data_for_grouping):
4 changes: 3 additions & 1 deletion pandas/tests/frame/test_stack_unstack.py
@@ -1577,7 +1577,9 @@ def test_unstack_bug(self):
}
)

result = df.groupby(["state", "exp", "barcode", "v"]).apply(len)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.groupby(["state", "exp", "barcode", "v"]).apply(len)

unstacked = result.unstack()
restacked = unstacked.stack()
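The test changes all follow one shape: wrap the call in ``tm.assert_produces_warning`` with a ``match`` regex. A self-contained sketch of that pattern, using a hypothetical ``deprecated_op`` stand-in rather than a real groupby call (``pandas._testing`` is semi-public, so this mirrors the tests' usage rather than documented API):

```python
import warnings

import pandas._testing as tm


def deprecated_op():
    # Hypothetical stand-in for a call hitting the new deprecation.
    warnings.warn(
        "DataFrameGroupBy.apply operated on the grouping columns",
        FutureWarning,
    )
    return 1


# assert_produces_warning checks both the warning category and that
# the message matches the given regex, as in the updated tests above.
with tm.assert_produces_warning(FutureWarning, match="operated on the grouping"):
    val = deprecated_op()
```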