Skip to content

Commit aa84c19

Browse files
jbrockmendel and JulianWgs
authored and committed
DEPR: dropping nuisance columns in DataFrameGroupby apply, agg, transform (pandas-dev#41475)
1 parent f007bd9 commit aa84c19

File tree

10 files changed

+172
-23
lines changed

10 files changed

+172
-23
lines changed

doc/source/user_guide/groupby.rst

+2
Original file line numberDiff line numberDiff line change
@@ -1000,6 +1000,7 @@ instance method on each data group. This is pretty easy to do by passing lambda
10001000
functions:
10011001

10021002
.. ipython:: python
1003+
:okwarning:
10031004
10041005
grouped = df.groupby("A")
10051006
grouped.agg(lambda x: x.std())
@@ -1009,6 +1010,7 @@ arguments. Using a bit of metaprogramming cleverness, GroupBy now has the
10091010
ability to "dispatch" method calls to the groups:
10101011

10111012
.. ipython:: python
1013+
:okwarning:
10121014
10131015
grouped.std()
10141016

doc/source/whatsnew/v1.3.0.rst

+38
Original file line numberDiff line numberDiff line change
@@ -726,6 +726,44 @@ For example:
726726
A 24
727727
dtype: int64
728728
729+
730+
Similarly, when applying a function to :class:`DataFrameGroupBy`, columns on which
731+
the function raises ``TypeError`` are currently silently ignored and dropped
732+
from the result.
733+
734+
This behavior is deprecated. In a future version, the ``TypeError``
735+
will be raised, and users will need to select only valid columns before calling
736+
the function.
737+
738+
For example:
739+
740+
.. ipython:: python
741+
742+
df = pd.DataFrame({"A": [1, 2, 3, 4], "B": pd.date_range("2016-01-01", periods=4)})
743+
gb = df.groupby([1, 1, 2, 2])
744+
745+
*Old behavior*:
746+
747+
.. code-block:: ipython
748+
749+
In [4]: gb.prod(numeric_only=False)
750+
Out[4]:
751+
A
752+
1 2
753+
2 12
754+
755+
*New behavior*:

.. code-block:: ipython
756+
757+
In [5]: gb.prod(numeric_only=False)
758+
...
759+
TypeError: datetime64 type does not support prod operations
760+
761+
In [6]: gb[["A"]].prod(numeric_only=False)
762+
Out[6]:
763+
A
764+
1 2
765+
2 12
766+
729767
.. ---------------------------------------------------------------------------
730768
731769

pandas/core/groupby/generic.py

+27-1
Original file line numberDiff line numberDiff line change
@@ -1087,6 +1087,15 @@ def array_func(values: ArrayLike) -> ArrayLike:
10871087
if not len(new_mgr) and len(orig):
10881088
# If the original Manager was already empty, no need to raise
10891089
raise DataError("No numeric types to aggregate")
1090+
if len(new_mgr) < len(data):
1091+
warnings.warn(
1092+
f"Dropping invalid columns in {type(self).__name__}.{how} "
1093+
"is deprecated. In a future version, a TypeError will be raised. "
1094+
f"Before calling .{how}, select only columns which should be "
1095+
"valid for the function.",
1096+
FutureWarning,
1097+
stacklevel=4,
1098+
)
10901099

10911100
return self._wrap_agged_manager(new_mgr)
10921101

@@ -1283,6 +1292,16 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:
12831292
res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
12841293
res_mgr.set_axis(1, mgr.axes[1])
12851294

1295+
if len(res_mgr) < len(mgr):
1296+
warnings.warn(
1297+
f"Dropping invalid columns in {type(self).__name__}.{how} "
1298+
"is deprecated. In a future version, a TypeError will be raised. "
1299+
f"Before calling .{how}, select only columns which should be "
1300+
"valid for the transforming function.",
1301+
FutureWarning,
1302+
stacklevel=4,
1303+
)
1304+
12861305
res_df = self.obj._constructor(res_mgr)
12871306
if self.axis == 1:
12881307
res_df = res_df.T
@@ -1420,7 +1439,14 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
14201439
output[i] = sgb.transform(wrapper)
14211440
except TypeError:
14221441
# e.g. trying to call nanmean with string values
1423-
pass
1442+
warnings.warn(
1443+
f"Dropping invalid columns in {type(self).__name__}.transform "
1444+
"is deprecated. In a future version, a TypeError will be raised. "
1445+
"Before calling .transform, select only columns which should be "
1446+
"valid for the transforming function.",
1447+
FutureWarning,
1448+
stacklevel=5,
1449+
)
14241450
else:
14251451
inds.append(i)
14261452

pandas/core/groupby/groupby.py

+19
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ class providing the base-class of operations.
3030
Union,
3131
cast,
3232
)
33+
import warnings
3334

3435
import numpy as np
3536

@@ -1285,6 +1286,14 @@ def _python_agg_general(self, func, *args, **kwargs):
12851286
# if this function is invalid for this dtype, we will ignore it.
12861287
result = self.grouper.agg_series(obj, f)
12871288
except TypeError:
1289+
warnings.warn(
1290+
f"Dropping invalid columns in {type(self).__name__}.agg "
1291+
"is deprecated. In a future version, a TypeError will be raised. "
1292+
"Before calling .agg, select only columns which should be "
1293+
"valid for the aggregating function.",
1294+
FutureWarning,
1295+
stacklevel=3,
1296+
)
12881297
continue
12891298

12901299
key = base.OutputKey(label=name, position=idx)
@@ -2844,6 +2853,16 @@ def _get_cythonized_result(
28442853
vals, inferences = pre_processing(vals)
28452854
except TypeError as err:
28462855
error_msg = str(err)
2856+
howstr = how.replace("group_", "")
2857+
warnings.warn(
2858+
"Dropping invalid columns in "
2859+
f"{type(self).__name__}.{howstr} is deprecated. "
2860+
"In a future version, a TypeError will be raised. "
2861+
f"Before calling .{howstr}, select only columns which "
2862+
"should be valid for the function.",
2863+
FutureWarning,
2864+
stacklevel=3,
2865+
)
28472866
continue
28482867
vals = vals.astype(cython_dtype, copy=False)
28492868
if needs_2d:

pandas/tests/groupby/aggregate/test_aggregate.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,8 @@ def func(ser):
257257
else:
258258
return ser.sum()
259259

260-
result = grouped.aggregate(func)
260+
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
261+
result = grouped.aggregate(func)
261262
exp_grouped = three_group.loc[:, three_group.columns != "C"]
262263
expected = exp_grouped.groupby(["A", "B"]).aggregate(func)
263264
tm.assert_frame_equal(result, expected)
@@ -1020,6 +1021,7 @@ def test_mangle_series_groupby(self):
10201021
tm.assert_frame_equal(result, expected)
10211022

10221023
@pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.")
1024+
@pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning")
10231025
def test_with_kwargs(self):
10241026
f1 = lambda x, y, b=1: x.sum() + y + b
10251027
f2 = lambda x, y, b=2: x.sum() + y * b

pandas/tests/groupby/aggregate/test_other.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,16 @@ def test_agg_api():
4444
def peak_to_peak(arr):
4545
return arr.max() - arr.min()
4646

47-
expected = grouped.agg([peak_to_peak])
47+
with tm.assert_produces_warning(
48+
FutureWarning, match="Dropping invalid", check_stacklevel=False
49+
):
50+
expected = grouped.agg([peak_to_peak])
4851
expected.columns = ["data1", "data2"]
49-
result = grouped.agg(peak_to_peak)
52+
53+
with tm.assert_produces_warning(
54+
FutureWarning, match="Dropping invalid", check_stacklevel=False
55+
):
56+
result = grouped.agg(peak_to_peak)
5057
tm.assert_frame_equal(result, expected)
5158

5259

@@ -294,7 +301,8 @@ def raiseException(df):
294301
raise TypeError("test")
295302

296303
with pytest.raises(TypeError, match="test"):
297-
df.groupby(0).agg(raiseException)
304+
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
305+
df.groupby(0).agg(raiseException)
298306

299307

300308
def test_series_agg_multikey():

pandas/tests/groupby/test_function.py

+27-5
Original file line numberDiff line numberDiff line change
@@ -87,13 +87,15 @@ def test_max_min_object_multiple_columns(using_array_manager):
8787

8888
gb = df.groupby("A")
8989

90-
result = gb.max(numeric_only=False)
90+
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
91+
result = gb.max(numeric_only=False)
9192
# "max" is valid for column "C" but not for "B"
9293
ei = Index([1, 2, 3], name="A")
9394
expected = DataFrame({"C": ["b", "d", "e"]}, index=ei)
9495
tm.assert_frame_equal(result, expected)
9596

96-
result = gb.min(numeric_only=False)
97+
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
98+
result = gb.min(numeric_only=False)
9799
# "min" is valid for column "C" but not for "B"
98100
ei = Index([1, 2, 3], name="A")
99101
expected = DataFrame({"C": ["a", "c", "e"]}, index=ei)
@@ -221,7 +223,10 @@ def test_averages(self, df, method):
221223
],
222224
)
223225

224-
result = getattr(gb, method)(numeric_only=False)
226+
with tm.assert_produces_warning(
227+
FutureWarning, match="Dropping invalid", check_stacklevel=False
228+
):
229+
result = getattr(gb, method)(numeric_only=False)
225230
tm.assert_frame_equal(result.reindex_like(expected), expected)
226231

227232
expected_columns = expected.columns
@@ -303,10 +308,27 @@ def test_cummin_cummax(self, df, method):
303308
def _check(self, df, method, expected_columns, expected_columns_numeric):
304309
gb = df.groupby("group")
305310

306-
result = getattr(gb, method)()
311+
# cummin, cummax dont have numeric_only kwarg, always use False
312+
warn = None
313+
if method in ["cummin", "cummax"]:
314+
# these dont have numeric_only kwarg, always use False
315+
warn = FutureWarning
316+
elif method in ["min", "max"]:
317+
# these have numeric_only kwarg, but default to False
318+
warn = FutureWarning
319+
320+
with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
321+
result = getattr(gb, method)()
322+
307323
tm.assert_index_equal(result.columns, expected_columns_numeric)
308324

309-
result = getattr(gb, method)(numeric_only=False)
325+
# GH#41475 deprecated silently ignoring nuisance columns
326+
warn = None
327+
if len(expected_columns) < len(gb._obj_with_exclusions.columns):
328+
warn = FutureWarning
329+
with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
330+
result = getattr(gb, method)(numeric_only=False)
331+
310332
tm.assert_index_equal(result.columns, expected_columns)
311333

312334

pandas/tests/groupby/test_groupby.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -923,7 +923,8 @@ def aggfun(ser):
923923
else:
924924
return ser.sum()
925925

926-
agged2 = df.groupby(keys).aggregate(aggfun)
926+
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
927+
agged2 = df.groupby(keys).aggregate(aggfun)
927928
assert len(agged2.columns) + 1 == len(df.columns)
928929

929930

@@ -1757,6 +1758,7 @@ def test_pivot_table_values_key_error():
17571758
@pytest.mark.parametrize(
17581759
"op", ["idxmax", "idxmin", "mad", "min", "max", "sum", "prod", "skew"]
17591760
)
1761+
@pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning")
17601762
@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
17611763
def test_empty_groupby(columns, keys, values, method, op, request):
17621764
# GH8093 & GH26411

pandas/tests/groupby/test_quantile.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,10 @@ def test_quantile_raises():
155155
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])
156156

157157
with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
158-
df.groupby("key").quantile()
158+
with tm.assert_produces_warning(
159+
FutureWarning, match="Dropping invalid columns"
160+
):
161+
df.groupby("key").quantile()
159162

160163

161164
def test_quantile_out_of_bounds_q_raises():
@@ -236,7 +239,11 @@ def test_groupby_quantile_nullable_array(values, q):
236239
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
237240
def test_groupby_quantile_skips_invalid_dtype(q):
238241
df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
239-
result = df.groupby("a").quantile(q)
242+
243+
warn = None if isinstance(q, list) else FutureWarning
244+
with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
245+
result = df.groupby("a").quantile(q)
246+
240247
expected = df.groupby("a")[["b"]].quantile(q)
241248
tm.assert_frame_equal(result, expected)
242249

0 commit comments

Comments
 (0)