DEPR: Silent dropping of nuisance columns in agg_list_like (#43741)

rhshadrach · web-flow · commit 4b54c5358f44 · 2021-09-29T09:10:45.000-04:00
diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst
@@ -1045,6 +1045,9 @@ not noted for a particular column will be ``NaN``:
 Mixed dtypes
 ++++++++++++
 
+.. deprecated:: 1.4.0
+   Attempting to determine which columns cannot be aggregated and silently dropping them from the results is deprecated and will be removed in a future version. If any portion of the columns or operations provided fails, the call to ``.agg`` will raise.
+
 When presented with mixed dtypes that cannot aggregate, ``.agg`` will only take the valid
 aggregations. This is similar to how ``.groupby.agg`` works.
 
@@ -1061,6 +1064,7 @@ aggregations. This is similar to how ``.groupby.agg`` works.
    mdf.dtypes
 
 .. ipython:: python
+   :okwarning:
 
    mdf.agg(["min", "sum"])
 
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
@@ -578,7 +578,7 @@ column, which produces an aggregated result with a hierarchical index:
 
 .. ipython:: python
 
-   grouped.agg([np.sum, np.mean, np.std])
+   grouped[["C", "D"]].agg([np.sum, np.mean, np.std])
 
 
 The resulting aggregations are named for the functions themselves. If you
@@ -597,7 +597,7 @@ For a grouped ``DataFrame``, you can rename in a similar manner:
 .. ipython:: python
 
    (
-       grouped.agg([np.sum, np.mean, np.std]).rename(
+       grouped[["C", "D"]].agg([np.sum, np.mean, np.std]).rename(
            columns={"sum": "foo", "mean": "bar", "std": "baz"}
        )
    )
diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst
@@ -105,6 +105,7 @@ aggregations. This is similar to how groupby ``.agg()`` works. (:issue:`15015`)
    df.dtypes
 
 .. ipython:: python
+   :okwarning:
 
    df.agg(['min', 'sum'])
 
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -338,6 +338,7 @@ Other Deprecations
 - Deprecated the ``index`` argument to :class:`SparseArray` construction (:issue:`23089`)
 - Deprecated :meth:`.Rolling.validate`, :meth:`.Expanding.validate`, and :meth:`.ExponentialMovingWindow.validate` (:issue:`43665`)
 - Deprecated silent dropping of columns that raised a ``TypeError`` in :class:`Series.transform` and :class:`DataFrame.transform` when used with a dictionary (:issue:`43740`)
+- Deprecated silent dropping of columns that raised a ``TypeError``, a ``DataError``, or (in some cases) a ``ValueError`` in :meth:`Series.aggregate`, :meth:`DataFrame.aggregate`, :meth:`Series.groupby.aggregate`, and :meth:`DataFrame.groupby.aggregate` when used with a list (:issue:`43740`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -4,6 +4,7 @@
 from collections import defaultdict
 from functools import partial
 import inspect
+import re
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -336,6 +337,13 @@ def agg_list_like(self) -> DataFrame | Series:
 
         results = []
         keys = []
+        failed_names = []
+
+        depr_nuisance_columns_msg = (
+            "{} did not aggregate successfully. If any error is "
+            "raised this will raise in a future version of pandas. "
+            "Drop these columns/ops to avoid this warning."
+        )
 
         # degenerate case
         if selected_obj.ndim == 1:
@@ -345,7 +353,7 @@ def agg_list_like(self) -> DataFrame | Series:
                     new_res = colg.aggregate(a)
 
                 except TypeError:
-                    pass
+                    failed_names.append(com.get_callable_name(a) or a)
                 else:
                     results.append(new_res)
 
@@ -359,20 +367,37 @@ def agg_list_like(self) -> DataFrame | Series:
             for index, col in enumerate(selected_obj):
                 colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
                 try:
-                    new_res = colg.aggregate(arg)
+                    # Capture and suppress any warnings emitted by us in the call
+                    # to agg below, but pass through any warnings that were
+                    # generated otherwise.
+                    with warnings.catch_warnings(record=True) as record:
+                        new_res = colg.aggregate(arg)
+                    if len(record) > 0:
+                        match = re.compile(depr_nuisance_columns_msg.format(".*"))
+                        for warning in record:
+                            if re.match(match, str(warning.message)):
+                                failed_names.append(col)
+                            else:
+                                warnings.warn_explicit(
+                                    message=warning.message,
+                                    category=warning.category,
+                                    filename=warning.filename,
+                                    lineno=warning.lineno,
+                                )
+
                 except (TypeError, DataError):
-                    pass
+                    failed_names.append(col)
                 except ValueError as err:
                     # cannot aggregate
                     if "Must produce aggregated value" in str(err):
                         # raised directly in _aggregate_named
-                        pass
+                        failed_names.append(col)
                     elif "no results" in str(err):
                         # reached in test_frame_apply.test_nuiscance_columns
                         #  where the colg.aggregate(arg) ends up going through
                         #  the selected_obj.ndim == 1 branch above with arg == ["sum"]
                         #  on a datetime64[ns] column
-                        pass
+                        failed_names.append(col)
                     else:
                         raise
                 else:
@@ -385,6 +410,13 @@ def agg_list_like(self) -> DataFrame | Series:
         if not len(results):
             raise ValueError("no results")
 
+        if len(failed_names) > 0:
+            warnings.warn(
+                depr_nuisance_columns_msg.format(failed_names),
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
+
         try:
             concatenated = concat(results, keys=keys, axis=1, sort=False)
         except TypeError as err:
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
@@ -1087,12 +1087,16 @@ def test_agg_multiple_mixed_no_warning():
         index=["min", "sum"],
     )
     # sorted index
-    with tm.assert_produces_warning(None):
+    with tm.assert_produces_warning(
+        FutureWarning, match=r"\['D'\] did not aggregate successfully"
+    ):
         result = mdf.agg(["min", "sum"])
 
     tm.assert_frame_equal(result, expected)
 
-    with tm.assert_produces_warning(None):
+    with tm.assert_produces_warning(
+        FutureWarning, match=r"\['D'\] did not aggregate successfully"
+    ):
         result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"])
 
     # GH40420: the result of .agg should have an index that is sorted
@@ -1201,7 +1205,10 @@ def test_nuiscance_columns():
     expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"])
     tm.assert_series_equal(result, expected)
 
-    result = df.agg(["sum"])
+    with tm.assert_produces_warning(
+        FutureWarning, match=r"\['D'\] did not aggregate successfully"
+    ):
+        result = df.agg(["sum"])
     expected = DataFrame(
         [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"]
     )
@@ -1433,7 +1440,10 @@ def foo(s):
         return s.sum() / 2
 
     aggs = ["sum", foo, "count", "min"]
-    result = df.agg(aggs)
+    with tm.assert_produces_warning(
+        FutureWarning, match=r"\['item'\] did not aggregate successfully"
+    ):
+        result = df.agg(aggs)
     expected = DataFrame(
         {
             "item": ["123456", np.nan, 6, "1"],
@@ -1452,3 +1462,20 @@ def test_apply_getitem_axis_1():
     result = df[["a", "a"]].apply(lambda x: x[0] + x[1], axis=1)
     expected = Series([0, 2, 4])
     tm.assert_series_equal(result, expected)
+
+
+def test_nuisance_depr_passes_through_warnings():
+    # GH 43740
+    # DataFrame.agg with list-likes may emit warnings for both individual
+    # args and for entire columns, but we only want to emit once. We
+    # catch and suppress the warnings for individual args, but need to make
+    # sure if some other warnings were raised, they get passed through to
+    # the user.
+
+    def foo(x):
+        warnings.warn("Hello, World!")
+        return x.sum()
+
+    df = DataFrame({"a": [1, 2, 3]})
+    with tm.assert_produces_warning(UserWarning, match="Hello, World!"):
+        df.agg([foo])
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -339,8 +339,14 @@ def test_multiple_functions_tuples_and_non_tuples(df):
     expected = df.groupby("A")["C"].agg(ex_funcs)
     tm.assert_frame_equal(result, expected)
 
-    result = df.groupby("A").agg(funcs)
-    expected = df.groupby("A").agg(ex_funcs)
+    with tm.assert_produces_warning(
+        FutureWarning, match=r"\['B'\] did not aggregate successfully"
+    ):
+        result = df.groupby("A").agg(funcs)
+    with tm.assert_produces_warning(
+        FutureWarning, match=r"\['B'\] did not aggregate successfully"
+    ):
+        expected = df.groupby("A").agg(ex_funcs)
     tm.assert_frame_equal(result, expected)
 
 
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
@@ -45,13 +45,15 @@ def peak_to_peak(arr):
         return arr.max() - arr.min()
 
     with tm.assert_produces_warning(
-        FutureWarning, match="Dropping invalid", check_stacklevel=False
+        FutureWarning,
+        match=r"\['key2'\] did not aggregate successfully",
     ):
         expected = grouped.agg([peak_to_peak])
     expected.columns = ["data1", "data2"]
 
     with tm.assert_produces_warning(
-        FutureWarning, match="Dropping invalid", check_stacklevel=False
+        FutureWarning,
+        match=r"\['key2'\] did not aggregate successfully",
     ):
         result = grouped.agg(peak_to_peak)
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -583,7 +583,10 @@ def test_frame_multi_key_function_list():
 
     grouped = data.groupby(["A", "B"])
     funcs = [np.mean, np.std]
-    agged = grouped.agg(funcs)
+    with tm.assert_produces_warning(
+        FutureWarning, match=r"\['C'\] did not aggregate successfully"
+    ):
+        agged = grouped.agg(funcs)
     expected = pd.concat(
         [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)],
         keys=["D", "E", "F"],
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
@@ -350,10 +350,12 @@ def test_agg():
     expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
     expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]])
     for t in cases:
-        with tm.assert_produces_warning(None):
-            # .var on dt64 column raises and is dropped, but the path in core.apply
-            #  that it goes through will still suppress a TypeError even
-            #  once the deprecations in the groupby code are enforced
+        warn = FutureWarning if t in cases[1:3] else None
+        with tm.assert_produces_warning(
+            warn,
+            match=r"\['date'\] did not aggregate successfully",
+        ):
+            # .var on dt64 column raises and is dropped
             result = t.aggregate([np.mean, np.std])
         tm.assert_frame_equal(result, expected)
 

Original file line number	Diff line number	Diff line change
`@@ -578,7 +578,7 @@ column, which produces an aggregated result with a hierarchical index:`
`578`	`578`
`579`	`579`	`.. ipython:: python`
`580`	`580`
`581`		`- grouped.agg([np.sum, np.mean, np.std])`
	`581`	`+ grouped[["C", "D"]].agg([np.sum, np.mean, np.std])`
`582`	`582`
`583`	`583`
`584`	`584`	`The resulting aggregations are named for the functions themselves. If you`
`@@ -597,7 +597,7 @@ For a grouped ``DataFrame``, you can rename in a similar manner:`
`597`	`597`	`.. ipython:: python`
`598`	`598`
`599`	`599`	`(`
`600`		`- grouped.agg([np.sum, np.mean, np.std]).rename(`
	`600`	`+ grouped[["C", "D"]].agg([np.sum, np.mean, np.std]).rename(`
`601`	`601`	`columns={"sum": "foo", "mean": "bar", "std": "baz"}`
`602`	`602`	`)`
`603`	`603`	`)`