diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 3a0aad41933bc..8a01be6d55aa2 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -459,7 +459,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df # List the size of the animals with the highest weight. - df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()]) + df.groupby("animal")[["size", "weight"]].apply(lambda subf: subf["size"][subf["weight"].idxmax()]) `Using get_group `__ @@ -482,7 +482,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to return pd.Series(["L", avg_weight, True], index=["size", "weight", "adult"]) - expected_df = gb.apply(GrowUp) + expected_df = gb[["size", "weight"]].apply(GrowUp) expected_df `Expanding apply diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index d301373bb6eea..6585ea158e9e9 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -430,6 +430,12 @@ This is mainly syntactic sugar for the alternative, which is much more verbose: Additionally, this method avoids recomputing the internal grouping information derived from the passed key. +You can also include the grouping columns if you want to operate on them. + +.. ipython:: python + + grouped[["A", "B"]].sum() + .. _groupby.iterating-label: Iterating through groups @@ -1067,7 +1073,7 @@ missing values with the ``ffill()`` method. ).set_index("date") df_re - df_re.groupby("group").resample("1D").ffill() + df_re.groupby("group")[["val"]].resample("1D").ffill() .. _groupby.filter: @@ -1233,13 +1239,13 @@ the argument ``group_keys`` which defaults to ``True``. Compare .. ipython:: python - df.groupby("A", group_keys=True).apply(lambda x: x) + df.groupby("A", group_keys=True)[["B", "C", "D"]].apply(lambda x: x) with .. ipython:: python - df.groupby("A", group_keys=False).apply(lambda x: x) + df.groupby("A", group_keys=False)[["B", "C", "D"]].apply(lambda x: x) Numba Accelerated Routines @@ -1722,7 +1728,7 @@ column index name will be used as the name of the inserted column: result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()} return pd.Series(result, name="metrics") - result = df.groupby("a").apply(compute_metrics) + result = df.groupby("a")[["b", "c"]].apply(compute_metrics) result diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index b59938a9b9c9b..b8e7eae8de841 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -328,13 +328,25 @@ More consistent behavior for some groupby methods: - groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation: - .. ipython:: python + .. code-block:: ipython - df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - g.head(1) # filters DataFrame + In [1]: df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + + In [2]: g = df.groupby('A') + + In [3]: g.head(1) # filters DataFrame + Out[3]: + A B + 0 1 2 + 2 5 6 + + In [4]: g.apply(lambda x: x.head(1)) # used to simply fall-through + Out[4]: + A B + A + 1 0 1 2 + 5 2 5 6 - g.apply(lambda x: x.head(1)) # used to simply fall-through - groupby head and tail respect column selection: diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index 7d9008fdbdecd..ee6a60144bc35 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -77,9 +77,52 @@ Previously you would have to do this to get a rolling window mean per-group: df = pd.DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) df -.. ipython:: python +.. code-block:: ipython - df.groupby("A").apply(lambda x: x.rolling(4).B.mean()) + In [1]: df.groupby("A").apply(lambda x: x.rolling(4).B.mean()) + Out[1]: + A + 1 0 NaN + 1 NaN + 2 NaN + 3 1.5 + 4 2.5 + 5 3.5 + 6 4.5 + 7 5.5 + 8 6.5 + 9 7.5 + 10 8.5 + 11 9.5 + 12 10.5 + 13 11.5 + 14 12.5 + 15 13.5 + 16 14.5 + 17 15.5 + 18 16.5 + 19 17.5 + 2 20 NaN + 21 NaN + 22 NaN + 23 21.5 + 24 22.5 + 25 23.5 + 26 24.5 + 27 25.5 + 28 26.5 + 29 27.5 + 30 28.5 + 31 29.5 + 3 32 NaN + 33 NaN + 34 NaN + 35 33.5 + 36 34.5 + 37 35.5 + 38 36.5 + 39 37.5 + Name: B, dtype: float64 Now you can do: @@ -101,15 +144,53 @@ For ``.resample(..)`` type of operations, previously you would have to: df -.. ipython:: python +.. code-block:: ipython - df.groupby("group").apply(lambda x: x.resample("1D").ffill()) + In[1]: df.groupby("group").apply(lambda x: x.resample("1D").ffill()) + Out[1]: + group val + group date + 1 2016-01-03 1 5 + 2016-01-04 1 5 + 2016-01-05 1 5 + 2016-01-06 1 5 + 2016-01-07 1 5 + 2016-01-08 1 5 + 2016-01-09 1 5 + 2016-01-10 1 6 + 2 2016-01-17 2 7 + 2016-01-18 2 7 + 2016-01-19 2 7 + 2016-01-20 2 7 + 2016-01-21 2 7 + 2016-01-22 2 7 + 2016-01-23 2 7 + 2016-01-24 2 8 Now you can do: -.. ipython:: python +.. code-block:: ipython - df.groupby("group").resample("1D").ffill() + In[1]: df.groupby("group").resample("1D").ffill() + Out[1]: + group val + group date + 1 2016-01-03 1 5 + 2016-01-04 1 5 + 2016-01-05 1 5 + 2016-01-06 1 5 + 2016-01-07 1 5 + 2016-01-08 1 5 + 2016-01-09 1 5 + 2016-01-10 1 6 + 2 2016-01-17 2 7 + 2016-01-18 2 7 + 2016-01-19 2 7 + 2016-01-20 2 7 + 2016-01-21 2 7 + 2016-01-22 2 7 + 2016-01-23 2 7 + 2016-01-24 2 8 .. _whatsnew_0181.enhancements.method_chain: diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index fca355069ae74..f5f90fc322d1c 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -199,6 +199,7 @@ Other API changes Deprecations ~~~~~~~~~~~~ +- Deprecated :meth:`.DataFrameGroupBy.apply` and methods on the objects returned by :meth:`.DataFrameGroupBy.resample` operating on the grouping column(s); select the columns to operate on after groupby to either explicitly include or exclude the groupings and avoid the ``FutureWarning`` (:issue:`7155`) - Deprecated silently dropping unrecognized timezones when parsing strings to datetimes (:issue:`18702`) - Deprecated :meth:`DataFrame._data` and :meth:`Series._data`, use public APIs instead (:issue:`33333`) - Deprecated :meth:`.Groupby.all` and :meth:`.GroupBy.any` with datetime64 or :class:`PeriodDtype` values, matching the :class:`Series` and :class:`DataFrame` deprecations (:issue:`34479`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 01830961b06f9..dfee04a784630 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8595,20 +8595,20 @@ def update( >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', ... 'Parrot', 'Parrot'], ... 'Max Speed': [380., 370., 24., 26.]}) - >>> df.groupby("Animal", group_keys=True).apply(lambda x: x) - Animal Max Speed + >>> df.groupby("Animal", group_keys=True)[['Max Speed']].apply(lambda x: x) + Max Speed Animal - Falcon 0 Falcon 380.0 - 1 Falcon 370.0 - Parrot 2 Parrot 24.0 - 3 Parrot 26.0 - - >>> df.groupby("Animal", group_keys=False).apply(lambda x: x) - Animal Max Speed - 0 Falcon 380.0 - 1 Falcon 370.0 - 2 Parrot 24.0 - 3 Parrot 26.0 + Falcon 0 380.0 + 1 370.0 + Parrot 2 24.0 + 3 26.0 + + >>> df.groupby("Animal", group_keys=False)[['Max Speed']].apply(lambda x: x) + Max Speed + 0 380.0 + 1 370.0 + 2 24.0 + 3 26.0 """ ) ) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6c105b6a51e3e..a12d312d148b9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -260,7 +260,7 @@ class providing the base-class of operations. each group together into a Series, including setting the index as appropriate: - >>> g1.apply(lambda x: x.C.max() - x.B.min()) + >>> g1[['B', 'C']].apply(lambda x: x.C.max() - x.B.min()) A a 5 b 2 @@ -1487,6 +1487,16 @@ def f(g): with option_context("mode.chained_assignment", None): try: result = self._python_apply_general(f, self._selected_obj) + if ( + not isinstance(self.obj, Series) + and self._selection is None + and self._selected_obj.shape != self._obj_with_exclusions.shape + ): + warnings.warn( + message=_apply_groupings_depr.format(type(self).__name__), + category=FutureWarning, + stacklevel=find_stack_level(), + ) except TypeError: # gh-20949 # try again, with .apply acting as a filtering @@ -2645,55 +2655,55 @@ def resample(self, rule, *args, **kwargs): Downsample the DataFrame into 3 minute bins and sum the values of the timestamps falling into a bin. - >>> df.groupby('a').resample('3T').sum() - a b + >>> df.groupby('a')[['b']].resample('3T').sum() + b a - 0 2000-01-01 00:00:00 0 2 - 2000-01-01 00:03:00 0 1 - 5 2000-01-01 00:00:00 5 1 + 0 2000-01-01 00:00:00 2 + 2000-01-01 00:03:00 1 + 5 2000-01-01 00:00:00 1 Upsample the series into 30 second bins. - >>> df.groupby('a').resample('30S').sum() - a b + >>> df.groupby('a')[['b']].resample('30S').sum() + b a - 0 2000-01-01 00:00:00 0 1 - 2000-01-01 00:00:30 0 0 - 2000-01-01 00:01:00 0 1 - 2000-01-01 00:01:30 0 0 - 2000-01-01 00:02:00 0 0 - 2000-01-01 00:02:30 0 0 - 2000-01-01 00:03:00 0 1 - 5 2000-01-01 00:02:00 5 1 + 0 2000-01-01 00:00:00 1 + 2000-01-01 00:00:30 0 + 2000-01-01 00:01:00 1 + 2000-01-01 00:01:30 0 + 2000-01-01 00:02:00 0 + 2000-01-01 00:02:30 0 + 2000-01-01 00:03:00 1 + 5 2000-01-01 00:02:00 1 Resample by month. Values are assigned to the month of the period. - >>> df.groupby('a').resample('M').sum() - a b + >>> df.groupby('a')[['b']].resample('M').sum() + b a - 0 2000-01-31 0 3 - 5 2000-01-31 5 1 + 0 2000-01-31 3 + 5 2000-01-31 1 Downsample the series into 3 minute bins as above, but close the right side of the bin interval. - >>> df.groupby('a').resample('3T', closed='right').sum() - a b + >>> df.groupby('a')[['b']].resample('3T', closed='right').sum() + b a - 0 1999-12-31 23:57:00 0 1 - 2000-01-01 00:00:00 0 2 - 5 2000-01-01 00:00:00 5 1 + 0 1999-12-31 23:57:00 1 + 2000-01-01 00:00:00 2 + 5 2000-01-01 00:00:00 1 Downsample the series into 3 minute bins and close the right side of the bin interval, but label each bin using the right edge instead of the left. - >>> df.groupby('a').resample('3T', closed='right', label='right').sum() - a b + >>> df.groupby('a')[['b']].resample('3T', closed='right', label='right').sum() + b a - 0 2000-01-01 00:00:00 0 1 - 2000-01-01 00:03:00 0 2 - 5 2000-01-01 00:03:00 5 1 + 0 2000-01-01 00:00:00 1 + 2000-01-01 00:03:00 2 + 5 2000-01-01 00:03:00 1 """ from pandas.core.resample import get_resampler_for_grouping @@ -4309,3 +4319,13 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde else: mi = MultiIndex.from_product([idx, qs]) return mi + + +# GH#7155 +_apply_groupings_depr = ( + "{}.apply operated on the grouping columns. This behavior is deprecated, " + "and in a future version of pandas the grouping columns will be excluded " + "from the operation. Select the columns to operate on after groupby to" + "either explicitly include or exclude the groupings and silence " + "this warning." +) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 8cc578b7fd0b6..0b9ebb1117821 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -33,7 +33,10 @@ Substitution, doc, ) -from pandas.util._exceptions import find_stack_level +from pandas.util._exceptions import ( + find_stack_level, + rewrite_warning, +) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -52,6 +55,7 @@ from pandas.core.groupby.groupby import ( BaseGroupBy, GroupBy, + _apply_groupings_depr, _pipe_template, get_groupby, ) @@ -420,6 +424,9 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): obj, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys ) + target_message = "DataFrameGroupBy.apply operated on the grouping columns" + new_message = _apply_groupings_depr.format(type(self).__name__) + try: if callable(how): # TODO: test_resample_apply_with_additional_args fails if we go @@ -436,7 +443,12 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # a DataFrame column, but aggregate_item_by_item operates column-wise # on Series, raising AttributeError or KeyError # (depending on whether the column lookup uses getattr/__getitem__) - result = grouped.apply(how, *args, **kwargs) + with rewrite_warning( + target_message=target_message, + target_category=FutureWarning, + new_message=new_message, + ): + result = grouped.apply(how, *args, **kwargs) except ValueError as err: if "Must produce aggregated value" in str(err): @@ -448,7 +460,12 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # we have a non-reducing function # try to evaluate - result = grouped.apply(how, *args, **kwargs) + with rewrite_warning( + target_message=target_message, + target_category=FutureWarning, + new_message=new_message, + ): + result = grouped.apply(how, *args, **kwargs) return self._wrap_result(result) @@ -1344,7 +1361,18 @@ def func(x): return x.apply(f, *args, **kwargs) - result = self._groupby.apply(func) + msg = ( + "DataFrameGroupBy.resample operated on the grouping columns. " + "This behavior is deprecated, and in a future version of " + "pandas the grouping columns will be excluded from the operation. " + "Subset the data to exclude the groupings and silence this warning." + ) + with rewrite_warning( + target_message="DataFrameGroupBy.apply operated on the grouping columns", + target_category=FutureWarning, + new_message=msg, + ): + result = self._groupby.apply(func) return self._wrap_result(result) _upsample = _apply diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index ebba3ae14f797..3ed3f1953f089 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -466,7 +466,7 @@ def _all_key(): return (margins_name,) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows].groupby(rows, observed=observed).apply(aggfunc) + margin = data.groupby(rows, observed=observed)[rows].apply(aggfunc) all_key = _all_key() table[all_key] = margin result = table @@ -484,7 +484,7 @@ def _all_key(): margin_keys = table.columns if len(cols): - row_margin = data[cols].groupby(cols, observed=observed).apply(aggfunc) + row_margin = data.groupby(cols, observed=observed)[cols].apply(aggfunc) else: row_margin = Series(np.nan, index=result.columns) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 9b8f3a43fbe64..c697f670bfa65 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -99,9 +99,13 @@ def test_groupby_extension_transform(self, data_for_grouping): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) - df.groupby("B", group_keys=False).apply(groupby_apply_op) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("B", group_keys=False).apply(groupby_apply_op) df.groupby("B", group_keys=False).A.apply(groupby_apply_op) - df.groupby("A", group_keys=False).apply(groupby_apply_op) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("A", group_keys=False).apply(groupby_apply_op) df.groupby("A", group_keys=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 8c8cbfa5200b1..9b2bcb34f53c4 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -298,9 +298,13 @@ def test_groupby_extension_transform(self, data_for_grouping): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) - df.groupby("B", group_keys=False).apply(groupby_apply_op) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("B", group_keys=False).apply(groupby_apply_op) df.groupby("B", group_keys=False).A.apply(groupby_apply_op) - df.groupby("A", group_keys=False).apply(groupby_apply_op) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("A", group_keys=False).apply(groupby_apply_op) df.groupby("A", group_keys=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 889c44522f7bb..081af0a6df5c7 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1577,7 +1577,9 @@ def test_unstack_bug(self): } ) - result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() restacked = unstacked.stack() diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index aad1218190a84..f62a23bcca384 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -496,13 +496,17 @@ def test_agg_timezone_round_trip(): assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] ts = df["B"].iloc[2] assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] def test_sum_uint64_overflow(): diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index b56ce326d1962..e5599d60b4f0d 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -54,9 +54,11 @@ def test_apply_issues(): ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date" ) expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) - result = df.groupby("date", group_keys=False).apply( - lambda x: x["time"][x["value"].idxmax()] - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("date", group_keys=False).apply( + lambda x: x["time"][x["value"].idxmax()] + ) tm.assert_series_equal(result, expected) @@ -162,7 +164,9 @@ def f_constant_df(group): for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]: del names[:] - df.groupby("a", group_keys=False).apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("a", group_keys=False).apply(func) assert names == group_names @@ -180,9 +184,11 @@ def test_group_apply_once_per_group2(capsys): index=["0", "2", "4", "6", "8", "10", "12", "14"], ) - df.groupby("group_by_column", group_keys=False).apply( - lambda df: print("function_called") - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("group_by_column", group_keys=False).apply( + lambda df: print("function_called") + ) result = capsys.readouterr().out.count("function_called") # If `groupby` behaves unexpectedly, this test will break @@ -202,8 +208,11 @@ def slow(group): def fast(group): return group.copy() - fast_df = df.groupby("A", group_keys=False).apply(fast) - slow_df = df.groupby("A", group_keys=False).apply(slow) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + fast_df = df.groupby("A", group_keys=False).apply(fast) + with tm.assert_produces_warning(FutureWarning, match=msg): + slow_df = df.groupby("A", group_keys=False).apply(slow) tm.assert_frame_equal(fast_df, slow_df) @@ -225,7 +234,9 @@ def test_groupby_apply_identity_maybecopy_index_identical(func): df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - result = df.groupby("g", group_keys=False).apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("g", group_keys=False).apply(func) tm.assert_frame_equal(result, df) @@ -268,8 +279,11 @@ def test_groupby_as_index_apply(): tm.assert_index_equal(res_as, exp) tm.assert_index_equal(res_not_as, exp) - res_as_apply = g_as.apply(lambda x: x.head(2)).index - res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + res_as_apply = g_as.apply(lambda x: x.head(2)).index + with tm.assert_produces_warning(FutureWarning, match=msg): + res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here @@ -282,7 +296,9 @@ def test_groupby_as_index_apply(): ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) - res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index tm.assert_index_equal(res, ind) @@ -311,13 +327,19 @@ def desc3(group): # weirdo return result - result = grouped.apply(desc) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(desc) assert result.index.names == ("A", "B", "stat") - result2 = grouped.apply(desc2) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = grouped.apply(desc2) assert result2.index.names == ("A", "B", "stat") - result3 = grouped.apply(desc3) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result3 = grouped.apply(desc3) assert result3.index.names == ("A", "B", None) @@ -347,7 +369,9 @@ def test_apply_series_yield_constant(df): def test_apply_frame_yield_constant(df): # GH13568 - result = df.groupby(["A", "B"]).apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["A", "B"]).apply(len) assert isinstance(result, Series) assert result.name is None @@ -358,7 +382,9 @@ def test_apply_frame_yield_constant(df): def test_apply_frame_to_series(df): grouped = df.groupby(["A", "B"]) - result = grouped.apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(len) expected = grouped.count()["C"] tm.assert_index_equal(result.index, expected.index) tm.assert_numpy_array_equal(result.values, expected.values) @@ -367,7 +393,9 @@ def test_apply_frame_to_series(df): def test_apply_frame_not_as_index_column_name(df): # GH 35964 - path within _wrap_applied_output not hit by a test grouped = df.groupby(["A", "B"], as_index=False) - result = grouped.apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(len) expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D") # TODO(GH#34306): Use assert_frame_equal when column name is not np.nan tm.assert_index_equal(result.index, expected.index) @@ -390,7 +418,9 @@ def trans2(group): } ) - result = df.groupby("A").apply(trans) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(trans) exp = df.groupby("A")["C"].apply(trans2) tm.assert_series_equal(result, exp, check_names=False) assert result.name == "C" @@ -419,7 +449,9 @@ def test_apply_chunk_view(group_keys): # Low level tinkering could be unsafe, make sure not df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) expected = df.take([0, 1, 3, 4, 6, 7]) if group_keys: expected.index = MultiIndex.from_arrays( @@ -440,7 +472,9 @@ def test_apply_no_name_column_conflict(): # it works! #2605 grouped = df.groupby(["name", "name2"]) - grouped.apply(lambda x: x.sort_values("value", inplace=True)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped.apply(lambda x: x.sort_values("value", inplace=True)) def test_apply_typecast_fail(): @@ -457,7 +491,9 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby("d", group_keys=False).apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() expected["v2"] = np.tile([0.0, 0.5, 1], 2) @@ -481,7 +517,9 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby("d", group_keys=False).apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() expected["v2"] = np.tile([0.0, 0.5, 1], 2) @@ -519,8 +557,11 @@ def filt2(x): else: return x[x.category == "c"] - expected = data.groupby("id_field").apply(filt1) - result = data.groupby("id_field").apply(filt2) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = data.groupby("id_field").apply(filt1) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = data.groupby("id_field").apply(filt2) tm.assert_frame_equal(result, expected) @@ -539,7 +580,9 @@ def test_apply_with_duplicated_non_sorted_axis(test_series): expected = ser.sort_index() tm.assert_series_equal(result, expected) else: - result = df.groupby("Y", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("Y", group_keys=False).apply(lambda x: x) # not expecting the order to remain the same for duplicated axis result = result.sort_values("Y") @@ -584,7 +627,9 @@ def f(g): g["value3"] = g["value1"] * 2 return g - result = grouped.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(f) assert "value3" in result @@ -598,9 +643,13 @@ def test_apply_numeric_coercion_when_datetime(): df = DataFrame( {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} ) - expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) df.Date = pd.to_datetime(df.Date) - result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) tm.assert_series_equal(result["Str"], expected["Str"]) # GH 15421 @@ -611,7 +660,9 @@ def test_apply_numeric_coercion_when_datetime(): def get_B(g): return g.iloc[0][["B"]] - result = df.groupby("A").apply(get_B)["B"] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(get_B)["B"] expected = df.B expected.index = df.A tm.assert_series_equal(result, expected) @@ -636,8 +687,11 @@ def predictions(tool): ) df2 = df1.copy() df2.oTime = pd.to_datetime(df2.oTime) - expected = df1.groupby("Key").apply(predictions).p1 - result = df2.groupby("Key").apply(predictions).p1 + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df1.groupby("Key").apply(predictions).p1 + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df2.groupby("Key").apply(predictions).p1 tm.assert_series_equal(expected, result) @@ -652,11 +706,13 @@ def test_apply_aggregating_timedelta_and_datetime(): } ) df["time_delta_zero"] = df.datetime - df.datetime - result = df.groupby("clientid").apply( - lambda ddf: Series( - {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("clientid").apply( + lambda ddf: Series( + {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} + ) ) - ) expected = DataFrame( { "clientid": ["A", "B", "C"], @@ -699,11 +755,15 @@ def func_with_no_date(batch): def func_with_date(batch): return Series({"b": datetime(2015, 1, 1), "c": 2}) - dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1]) dfg_no_conversion_expected.index.name = "a" - dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = DataFrame({"b": datetime(2015, 1, 1), "c": 2}, index=[1]) dfg_conversion_expected.index.name = "a" @@ -745,7 +805,9 @@ def test_groupby_apply_all_none(): def test_func(x): pass - result = test_df.groupby("groups").apply(test_func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = test_df.groupby("groups").apply(test_func) expected = DataFrame() tm.assert_frame_equal(result, expected) @@ -760,8 +822,11 @@ def test_func(x): return None return x.iloc[[0, -1]] - result1 = test_df1.groupby("groups").apply(test_func) - result2 = test_df2.groupby("groups").apply(test_func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = test_df1.groupby("groups").apply(test_func) + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = test_df2.groupby("groups").apply(test_func) index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None]) index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None]) expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1) @@ -774,7 +839,9 @@ def test_groupby_apply_return_empty_chunk(): # GH 22221: apply filter which returns some empty groups df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]}) groups = df.groupby("group") - result = groups.apply(lambda group: group[group.value != 1]["value"]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = groups.apply(lambda group: group[group.value != 1]["value"]) expected = Series( [0], name="value", @@ -801,7 +868,9 @@ def test_apply_with_mixed_types(): def test_func_returns_object(): # GH 28652 df = DataFrame({"a": [1, 2]}, index=Index([1, 2])) - result = df.groupby("a").apply(lambda g: g.index) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("a").apply(lambda g: g.index) expected = Series([Index([1]), Index([2])], index=Index([1, 2], name="a")) tm.assert_series_equal(result, expected) @@ -818,7 +887,9 @@ def test_apply_datetime_issue(group_column_dtlike): # standard int values in range(len(num_columns)) df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) - result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) expected = DataFrame( ["spam"], Index(["foo"], dtype="object", name="a"), columns=[42] @@ -857,7 +928,9 @@ def test_apply_series_return_dataframe_groups(): def most_common_values(df): return Series({c: s.value_counts().index[0] for c, s in df.items()}) - result = tdf.groupby("day").apply(most_common_values)["userId"] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = tdf.groupby("day").apply(most_common_values)["userId"] expected = Series( ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" ) @@ -899,7 +972,9 @@ def test_groupby_apply_datetime_result_dtypes(): ], columns=["observation", "color", "mood", "intensity", "score"], ) - result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes expected = Series( [np.dtype("datetime64[ns]"), object, object, np.int64, object], index=["observation", "color", "mood", "intensity", "score"], @@ -919,7 +994,9 @@ def test_groupby_apply_datetime_result_dtypes(): def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) - result = df.groupby("group", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) @@ -942,7 +1019,9 @@ def test_apply_index_has_complex_internals(index): def test_apply_function_returns_non_pandas_non_scalar(function, expected_values): # GH 31441 df = DataFrame(["A", "A", "B", "B"], columns=["groups"]) - result = df.groupby("groups").apply(function) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("groups").apply(function) expected = Series(expected_values, index=Index(["A", "B"], name="groups")) tm.assert_series_equal(result, expected) @@ -954,7 +1033,9 @@ def fct(group): df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]}) - result = df.groupby("A").apply(fct) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(fct) expected = Series( [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A") ) @@ -965,7 +1046,9 @@ def fct(group): def test_apply_function_index_return(function): # GH: 22541 df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) - result = df.groupby("id").apply(function) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("id").apply(function) expected = Series( [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])], index=Index([1, 2, 3], name="id"), @@ -1001,7 +1084,9 @@ def test_apply_result_type(group_keys, udf): # We'd like to control whether the group keys end up in the index # regardless of whether the UDF happens to be a transform. df = DataFrame({"A": ["a", "b"], "B": [1, 2]}) - df_result = df.groupby("A", group_keys=group_keys).apply(udf) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df_result = df.groupby("A", group_keys=group_keys).apply(udf) series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf) if group_keys: @@ -1016,8 +1101,11 @@ def test_result_order_group_keys_false(): # GH 34998 # apply result order should not depend on whether index is the same or just equal df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]}) - result = df.groupby("A", group_keys=False).apply(lambda x: x) - expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A", group_keys=False).apply(lambda x: x) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) tm.assert_frame_equal(result, expected) @@ -1029,8 +1117,15 @@ def test_apply_with_timezones_aware(): df1 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz}) df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz}) - result1 = df1.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) - result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df1.groupby("x", group_keys=False).apply( + lambda df: df[["x", "y"]].copy() + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = df2.groupby("x", group_keys=False).apply( + lambda df: df[["x", "y"]].copy() + ) tm.assert_frame_equal(result1, result2) @@ -1053,14 +1148,18 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): # Check output when no other methods are called before .apply() grp = df.groupby(by="a") - result = grp.apply(sum) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grp.apply(sum) tm.assert_frame_equal(result, expected) # Check output when another method is called before .apply() grp = df.groupby(by="a") args = get_groupby_method_args(reduction_func, df) _ = getattr(grp, reduction_func)(*args) - result = grp.apply(sum) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grp.apply(sum) tm.assert_frame_equal(result, expected) @@ -1082,7 +1181,9 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): ) grp = df.groupby(["A", "B"]) - result = grp.apply(lambda x: x.head(1)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grp.apply(lambda x: x.head(1)) expected = df.iloc[[0, 2, 3]] expected = expected.reset_index() @@ -1130,7 +1231,9 @@ def test_apply_dropna_with_indexed_same(dropna): }, index=list("xxyxz"), ) - result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) expected = df.dropna() if dropna else df.iloc[[0, 3, 1, 2, 4]] tm.assert_frame_equal(result, expected) @@ -1155,7 +1258,9 @@ def test_apply_dropna_with_indexed_same(dropna): def test_apply_as_index_constant_lambda(as_index, expected): # GH 13217 df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]}) - result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) tm.assert_equal(result, expected) @@ -1165,7 +1270,9 @@ def test_sort_index_groups(): {"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 0], "C": [1, 1, 1, 2, 2]}, index=range(5), ) - result = df.groupby("C").apply(lambda x: x.A.sort_index()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("C").apply(lambda x: x.A.sort_index()) expected = Series( range(1, 6), index=MultiIndex.from_tuples( @@ -1185,9 +1292,11 @@ def test_positional_slice_groups_datetimelike(): "let": list("abcde"), } ) - result = expected.groupby( - [expected.let, expected.date.dt.date], group_keys=False - ).apply(lambda x: x.iloc[0:]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = expected.groupby( + [expected.let, expected.date.dt.date], group_keys=False + ).apply(lambda x: x.iloc[0:]) tm.assert_frame_equal(result, expected) @@ -1230,24 +1339,29 @@ def test_apply_na(dropna): {"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]} ) dfgrp = df.groupby("grp", dropna=dropna) - result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) - expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) tm.assert_frame_equal(result, expected) def test_apply_empty_string_nan_coerce_bug(): # GH#24903 - result = ( - DataFrame( - { - "a": [1, 1, 2, 2], - "b": ["", "", "", ""], - "c": pd.to_datetime([1, 2, 3, 4], unit="s"), - } + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ( + DataFrame( + { + "a": [1, 1, 2, 2], + "b": ["", "", "", ""], + "c": pd.to_datetime([1, 2, 3, 4], unit="s"), + } + ) + .groupby(["a", "b"]) + .apply(lambda df: df.iloc[-1]) ) - .groupby(["a", "b"]) - .apply(lambda df: df.iloc[-1]) - ) expected = DataFrame( [[1, "", pd.to_datetime(2, unit="s")], [2, "", pd.to_datetime(4, unit="s")]], columns=["a", "b", "c"], @@ -1272,9 +1386,11 @@ def test_apply_index_key_error_bug(index_values): }, index=Index(["a2", "a3", "aa"], name="a"), ) - result = result.groupby("a").apply( - lambda df: Series([df["b"].mean()], index=["b_mean"]) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = result.groupby("a").apply( + lambda df: Series([df["b"].mean()], index=["b_mean"]) + ) tm.assert_frame_equal(result, expected) @@ -1322,7 +1438,9 @@ def test_apply_index_key_error_bug(index_values): def test_apply_nonmonotonic_float_index(arg, idx): # GH 34455 expected = DataFrame({"col": arg}, index=idx) - result = expected.groupby("col", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = expected.groupby("col", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py index 1df55abdc038d..6a7764e8c005a 100644 --- a/pandas/tests/groupby/test_apply_mutate.py +++ b/pandas/tests/groupby/test_apply_mutate.py @@ -13,10 +13,16 @@ def test_group_by_copy(): } ).set_index("name") - grp_by_same_value = df.groupby(["age"], group_keys=False).apply(lambda group: group) - grp_by_copy = df.groupby(["age"], group_keys=False).apply( - lambda group: group.copy() - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grp_by_same_value = df.groupby(["age"], group_keys=False).apply( + lambda group: group + ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grp_by_copy = df.groupby(["age"], group_keys=False).apply( + lambda group: group.copy() + ) tm.assert_frame_equal(grp_by_same_value, grp_by_copy) @@ -47,8 +53,11 @@ def f_no_copy(x): x["rank"] = x.val.rank(method="min") return x.groupby("cat2")["rank"].min() - grpby_copy = df.groupby("cat1").apply(f_copy) - grpby_no_copy = df.groupby("cat1").apply(f_no_copy) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grpby_copy = df.groupby("cat1").apply(f_copy) + with tm.assert_produces_warning(FutureWarning, match=msg): + grpby_no_copy = df.groupby("cat1").apply(f_no_copy) tm.assert_series_equal(grpby_copy, grpby_no_copy) @@ -58,8 +67,11 @@ def test_no_mutate_but_looks_like(): # second does not, but should yield the same results df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) - result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) tm.assert_series_equal(result1, result2) @@ -73,7 +85,9 @@ def fn(x): x.loc[x.index[-1], "col2"] = 0 return x.col2 - result = df.groupby(["col1"], as_index=False).apply(fn) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["col1"], as_index=False).apply(fn) expected = pd.Series( [1, 2, 0, 4, 5, 0], index=pd.MultiIndex.from_tuples( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 5ecb765e5861e..fa8873c22b6b0 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -123,7 +123,9 @@ def test_basic(): # TODO: split this test def f(x): return x.drop_duplicates("person_name").iloc[0] - result = g.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") expected["person_name"] = expected["person_name"].astype("object") @@ -299,7 +301,9 @@ def test_apply(ordered): # but for transform we should still get back the original index idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = Series(1, index=idx) - result = grouped.apply(lambda x: 1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(lambda x: 1) tm.assert_series_equal(result, expected) @@ -1962,7 +1966,10 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde df["a2"] = df["a"] df = df.set_index(keys) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) + warn = FutureWarning if method == "apply" and index_kind == "range" else None + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) if (method == "transform" or not as_index) and index_kind == "range": result = op_result["a"].cat.categories else: diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 7e7f1a628da6e..921b42efb2bb3 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -289,7 +289,9 @@ def test_count(): for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() - right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) tm.assert_frame_equal(left, right) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 5f6f99370080f..ac192f190962d 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -73,17 +73,20 @@ def test_builtins_apply(keys, f): gb = df.groupby(keys) fname = f.__name__ - result = gb.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.apply(f) ngroups = len(df.drop_duplicates(subset=keys)) assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))" assert result.shape == (ngroups, 3), assert_msg npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function - expected = gb.apply(npfunc) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = gb.apply(npfunc) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(None): + with tm.assert_produces_warning(FutureWarning, match=msg): expected2 = gb.apply(lambda x: npfunc(x)) tm.assert_frame_equal(result, expected2) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4388913511be2..7de1e08b319a6 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -135,7 +135,9 @@ def test_groupby_nonobject_dtype(mframe, df_mixed_floats): def max_value(group): return group.loc[group["value"].idxmax()] - applied = df.groupby("A").apply(max_value) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + applied = df.groupby("A").apply(max_value) result = applied.dtypes expected = df.dtypes tm.assert_series_equal(result, expected) @@ -156,7 +158,9 @@ def f_0(grp): return grp.iloc[0] expected = df.groupby("A").first()[["B"]] - result = df.groupby("A").apply(f_0)[["B"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_0)[["B"]] tm.assert_frame_equal(result, expected) def f_1(grp): @@ -164,7 +168,9 @@ def f_1(grp): return None return grp.iloc[0] - result = df.groupby("A").apply(f_1)[["B"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_1)[["B"]] e = expected.copy() e.loc["Tiger"] = np.nan tm.assert_frame_equal(result, e) @@ -174,7 +180,9 @@ def f_2(grp): return None return grp.iloc[0] - result = df.groupby("A").apply(f_2)[["B"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_2)[["B"]] e = expected.copy() e.loc["Pony"] = np.nan tm.assert_frame_equal(result, e) @@ -185,7 +193,9 @@ def f_3(grp): return None return grp.iloc[0] - result = df.groupby("A").apply(f_3)[["C"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_3)[["C"]] e = df.groupby("A").first()[["C"]] e.loc["Pony"] = pd.NaT tm.assert_frame_equal(result, e) @@ -196,7 +206,9 @@ def f_4(grp): return None return grp.iloc[0].loc["C"] - result = df.groupby("A").apply(f_4) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_4) e = df.groupby("A").first()["C"].copy() e.loc["Pony"] = np.nan e.name = None @@ -362,8 +374,11 @@ def f3(x): df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)}) # correct result - result1 = df.groupby("a").apply(f1) - result2 = df2.groupby("a").apply(f1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df.groupby("a").apply(f1) + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = df2.groupby("a").apply(f1) tm.assert_frame_equal(result1, result2) # should fail (not the same number of levels) @@ -1260,11 +1275,15 @@ def summarize_random_name(df): # inconsistent. return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) - metrics = df.groupby("A").apply(summarize) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + metrics = df.groupby("A").apply(summarize) assert metrics.columns.name is None - metrics = df.groupby("A").apply(summarize, "metrics") + with tm.assert_produces_warning(FutureWarning, match=msg): + metrics = df.groupby("A").apply(summarize, "metrics") assert metrics.columns.name == "metrics" - metrics = df.groupby("A").apply(summarize_random_name) + with tm.assert_produces_warning(FutureWarning, match=msg): + metrics = df.groupby("A").apply(summarize_random_name) assert metrics.columns.name is None @@ -1535,7 +1554,9 @@ def test_dont_clobber_name_column(): {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} ) - result = df.groupby("key", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("key", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) @@ -1609,7 +1630,9 @@ def freducex(x): grouped = df.groupby(grouper, group_keys=False) # make sure all these work - grouped.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped.apply(f) grouped.aggregate(freduce) grouped.aggregate({"C": freduce, "D": freduce}) grouped.transform(f) @@ -1630,7 +1653,9 @@ def f(group): names.append(group.name) return group.copy() - df.groupby("a", sort=False, group_keys=False).apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("a", sort=False, group_keys=False).apply(f) expected_names = [0, 1, 2] assert names == expected_names @@ -1839,7 +1864,9 @@ def test_groupby_preserves_sort(sort_column, group_column): def test_sort(x): tm.assert_frame_equal(x, x.sort_values(by=sort_column)) - g.apply(test_sort) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + g.apply(test_sort) def test_pivot_table_values_key_error(): @@ -2021,7 +2048,9 @@ def test_empty_groupby_apply_nonunique_columns(): df[3] = df[3].astype(np.int64) df.columns = [0, 1, 2, 0] gb = df.groupby(df[1], group_keys=False) - res = gb.apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = gb.apply(lambda x: x) assert (res.dtypes == df.dtypes).all() diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index a051b30307a28..b132494dafd91 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -324,7 +324,9 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, df = pd.DataFrame(data) gb = df.groupby("groups", dropna=dropna) - result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) mi_tuples = tuple(zip(data["groups"], selected_data["values"])) mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None]) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 773c1e60e97af..601e67bbca5e3 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -63,7 +63,9 @@ def func(group): assert hasattr(group, "testattr") return group.testattr - result = custom_df.groupby("c").apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = custom_df.groupby("c").apply(func) expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c")) tm.assert_series_equal(result, expected) @@ -101,5 +103,7 @@ def test_groupby_resample_preserves_subclass(obj): df = df.set_index("Date") # Confirm groupby.resample() preserves dataframe type - result = df.groupby("Buyer").resample("5D").sum() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("Buyer").resample("5D").sum() assert isinstance(result, obj) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 4928d893f2cb2..84df18d46aa92 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -217,7 +217,9 @@ def test_grouper_creation_bug(self): result = g.sum() tm.assert_frame_equal(result, expected) - result = g.apply(lambda x: x.sum()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.apply(lambda x: x.sum()) expected["A"] = [0, 2, 4] expected = expected.loc[:, ["A", "B"]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index f9a1349081529..21ab41b47fea1 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -472,8 +472,12 @@ def test_timegrouper_apply_return_type_series(self): def sumfunc_series(x): return Series([x["value"].sum()], ("sum",)) - expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) - result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_series) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_series) tm.assert_frame_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -489,8 +493,11 @@ def test_timegrouper_apply_return_type_value(self): def sumfunc_value(x): return x.value.sum() - expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) - result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value) tm.assert_series_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -896,7 +903,9 @@ def test_groupby_apply_timegrouper_with_nat_apply_squeeze( assert gb._selected_obj._get_axis(gb.axis).nlevels == 1 # function that returns a Series - res = gb.apply(lambda x: x["Quantity"] * 2) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = gb.apply(lambda x: x["Quantity"] * 2) expected = DataFrame( [[36, 6, 6, 10, 2]], diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 5477ad75a56f7..febb1c8abb5f5 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -307,9 +307,12 @@ def test_against_frame_and_seriesgroupby( ) if frame: # compare against apply with DataFrame value_counts - expected = gp.apply( - _frame_value_counts, ["gender", "education"], normalize, sort, ascending - ) + warn = FutureWarning if groupby == "column" else None + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + expected = gp.apply( + _frame_value_counts, ["gender", "education"], normalize, sort, ascending + ) if as_index: tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index c1201c33123ab..a93336b298c81 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -587,7 +587,9 @@ def f(group): return group[:1] grouped = df.groupby("c") - result = grouped.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(f) assert result["d"].dtype == np.float64 @@ -742,7 +744,13 @@ def test_cython_transform_frame(op, args, targop): f = gb[["float", "float_missing"]].apply(targop) expected = concat([f, i], axis=1) else: - expected = gb.apply(targop) + if op != "shift" or not isinstance(gb_target.get("by"), str): + warn = None + else: + warn = FutureWarning + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + expected = gb.apply(targop) expected = expected.sort_index(axis=1) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index cbf530e995c43..87b7999d88da3 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1039,8 +1039,12 @@ def test_resample_segfault(unit): all_wins_and_wagers, columns=("ID", "timestamp", "A", "B") ).set_index("timestamp") df.index = df.index.as_unit(unit) - result = df.groupby("ID").resample("5min").sum() - expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("ID").resample("5min").sum() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) tm.assert_frame_equal(result, expected) @@ -1059,7 +1063,9 @@ def test_resample_dtype_preservation(unit): result = df.resample("1D").ffill() assert result.val.dtype == np.int32 - result = df.groupby("group").resample("1D").ffill() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group").resample("1D").ffill() assert result.val.dtype == np.int32 @@ -1778,8 +1784,12 @@ def f(data, add_arg): # Testing dataframe df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10)) - result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) - expected = df.groupby("A").resample("D").mean().multiply(multiplier) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("A").resample("D").mean().multiply(multiplier) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index c97fa93cbd660..b1d4207b04250 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -71,7 +71,9 @@ def test_groupby_resample_api(): ) index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"]) expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index) - result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index fdc09246b479a..609d58096abca 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -64,8 +64,12 @@ def test_deferred_with_groupby(): def f_0(x): return x.set_index("date").resample("D").asfreq() - expected = df.groupby("id").apply(f_0) - result = df.set_index("date").groupby("id").resample("D").asfreq() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("id").apply(f_0) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.set_index("date").groupby("id").resample("D").asfreq() tm.assert_frame_equal(result, expected) df = DataFrame( @@ -79,8 +83,12 @@ def f_0(x): def f_1(x): return x.resample("1D").ffill() - expected = df.groupby("group").apply(f_1) - result = df.groupby("group").resample("1D").ffill() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("group").apply(f_1) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group").resample("1D").ffill() tm.assert_frame_equal(result, expected) @@ -95,7 +103,9 @@ def test_getitem(): result = g.B.resample("2s").mean() tm.assert_series_equal(result, expected) - result = g.resample("2s").mean().B + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.resample("2s").mean().B tm.assert_series_equal(result, expected) @@ -220,8 +230,12 @@ def test_methods(f): g = test_frame.groupby("A") r = g.resample("2s") - result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = getattr(r, f)() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) tm.assert_equal(result, expected) @@ -238,8 +252,12 @@ def test_methods_nunique(): def test_methods_std_var(f): g = test_frame.groupby("A") r = g.resample("2s") - result = getattr(r, f)(ddof=1) - expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = getattr(r, f)(ddof=1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) tm.assert_frame_equal(result, expected) @@ -248,18 +266,24 @@ def test_apply(): r = g.resample("2s") # reduction - expected = g.resample("2s").sum() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.resample("2s").sum() def f_0(x): return x.resample("2s").sum() - result = r.apply(f_0) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = r.apply(f_0) tm.assert_frame_equal(result, expected) def f_1(x): return x.resample("2s").apply(lambda y: y.sum()) - result = g.apply(f_1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.apply(f_1) # y.sum() results in int64 instead of int32 on 32-bit architectures expected = expected.astype("int64") tm.assert_frame_equal(result, expected) @@ -306,7 +330,9 @@ def test_resample_groupby_with_label(): # GH 13235 index = date_range("2000-01-01", freq="2D", periods=5) df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) - result = df.groupby("col0").resample("1W", label="left").sum() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("col0").resample("1W", label="left").sum() mi = [ np.array([0, 0, 1, 2], dtype=np.int64), @@ -326,7 +352,9 @@ def test_consistency_with_window(): # consistent return values with window df = test_frame expected = Index([1, 2, 3], name="A") - result = df.groupby("A").resample("2s").mean() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").resample("2s").mean() assert result.index.nlevels == 2 tm.assert_index_equal(result.index.levels[0], expected) @@ -424,7 +452,9 @@ def test_resample_groupby_agg_listlike(): def test_empty(keys): # GH 26411 df = DataFrame([], columns=["a", "b"], index=TimedeltaIndex([])) - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() expected = ( DataFrame(columns=["a", "b"]) .set_index(keys, drop=False) @@ -447,7 +477,8 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): if consolidate: df = df._consolidate() - result = df.groupby(["key"]).resample("W", on="date").min() + with tm.assert_produces_warning(FutureWarning): + result = df.groupby(["key"]).resample("W", on="date").min() idx = pd.MultiIndex.from_arrays( [ ["A"] * 3 + ["B"] * 3, @@ -499,7 +530,9 @@ def test_resample_empty_Dataframe(keys): df = DataFrame([], columns=["a", "b", "date"]) df["date"] = pd.to_datetime(df["date"]) df = df.set_index("date") - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() expected = DataFrame(columns=["a", "b", "date"]).set_index(keys, drop=False) expected["date"] = pd.to_datetime(expected["date"]) expected = expected.set_index("date", append=True, drop=True) @@ -515,7 +548,9 @@ def test_groupby_resample_size_all_index_same(): {"A": [1] * 3 + [2] * 3 + [1] * 3 + [2] * 3, "B": np.arange(12)}, index=date_range("31/12/2000 18:00", freq="H", periods=12), ) - result = df.groupby("A").resample("D").size() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").resample("D").size() expected = Series( 3, index=pd.MultiIndex.from_tuples( diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index debfb48c2b39c..59c291263c1c5 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -315,12 +315,14 @@ def test_groupby_resample_interpolate(): df["week_starting"] = date_range("01/01/2018", periods=3, freq="W") - result = ( - df.set_index("week_starting") - .groupby("volume") - .resample("1D") - .interpolate(method="linear") - ) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ( + df.set_index("week_starting") + .groupby("volume") + .resample("1D") + .interpolate(method="linear") + ) expected_ind = pd.MultiIndex.from_tuples( [ diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index eb4fc06aa523e..dd2836b60bf64 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -99,7 +99,9 @@ def test_rolling(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.rolling(4), f)()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.rolling(4), f)()) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -113,7 +115,9 @@ def test_rolling_ddof(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)(ddof=1) - expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -129,9 +133,11 @@ def test_rolling_quantile(self, interpolation, roll_frame): r = g.rolling(window=4) result = r.quantile(0.4, interpolation=interpolation) - expected = g.apply( - lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply( + lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) + ) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -174,7 +180,9 @@ def test_rolling_corr_cov_other_diff_size_as_groups(self, f, roll_frame): def func(x): return getattr(x.rolling(4), f)(roll_frame) - expected = g.apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(func) # GH 39591: The grouped column should be all np.nan # (groupby.apply inserts 0s for cov) expected["A"] = np.nan @@ -190,7 +198,9 @@ def test_rolling_corr_cov_pairwise(self, f, roll_frame): def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) - expected = g.apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(func) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -235,7 +245,9 @@ def test_rolling_apply(self, raw, roll_frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -778,9 +790,13 @@ def test_groupby_rolling_resulting_multiindex3(self): def test_groupby_rolling_object_doesnt_affect_groupby_apply(self, roll_frame): # GH 39732 g = roll_frame.groupby("A", group_keys=False) - expected = g.apply(lambda x: x.rolling(4).sum()).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: x.rolling(4).sum()).index _ = g.rolling(window=4) - result = g.apply(lambda x: x.rolling(4).sum()).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.apply(lambda x: x.rolling(4).sum()).index tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -954,11 +970,13 @@ def test_groupby_monotonic(self): df["date"] = to_datetime(df["date"]) df = df.sort_values("date") - expected = ( - df.set_index("date") - .groupby("name") - .apply(lambda x: x.rolling("180D")["amount"].sum()) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = ( + df.set_index("date") + .groupby("name") + .apply(lambda x: x.rolling("180D")["amount"].sum()) + ) result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) @@ -977,9 +995,13 @@ def test_datelike_on_monotonic_within_each_group(self): } ) - expected = ( - df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = ( + df.set_index("B") + .groupby("A") + .apply(lambda x: x.rolling("4s")["C"].mean()) + ) result = df.groupby("A").rolling("4s", on="B").C.mean() tm.assert_series_equal(result, expected) @@ -1009,7 +1031,9 @@ def test_expanding(self, f, frame): r = g.expanding() result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.expanding(), f)()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.expanding(), f)()) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -1023,7 +1047,9 @@ def test_expanding_ddof(self, f, frame): r = g.expanding() result = getattr(r, f)(ddof=0) - expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -1039,9 +1065,11 @@ def test_expanding_quantile(self, interpolation, frame): r = g.expanding() result = r.quantile(0.4, interpolation=interpolation) - expected = g.apply( - lambda x: x.expanding().quantile(0.4, interpolation=interpolation) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply( + lambda x: x.expanding().quantile(0.4, interpolation=interpolation) + ) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -1059,7 +1087,9 @@ def test_expanding_corr_cov(self, f, frame): def func_0(x): return getattr(x.expanding(), f)(frame) - expected = g.apply(func_0) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(func_0) # GH 39591: groupby.apply returns 1 instead of nan for windows # with all nan values null_idx = list(range(20, 61)) + list(range(72, 113)) @@ -1074,7 +1104,9 @@ def func_0(x): def func_1(x): return getattr(x.B.expanding(), f)(pairwise=True) - expected = g.apply(func_1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(func_1) tm.assert_series_equal(result, expected) def test_expanding_apply(self, raw, frame): @@ -1083,7 +1115,11 @@ def test_expanding_apply(self, raw, frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply( + lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw) + ) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 168846cd04c59..3752c4b5f6917 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -36,6 +36,7 @@ "_agg_template_series", "_agg_template_frame", "_pipe_template", + "_apply_groupings_depr", "__main__", "_transform_template", "_use_inf_as_na",