From bc67672031ff1b1b51719ea5af65388a3eb09799 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 24 Sep 2021 22:31:31 -0400 Subject: [PATCH 1/8] DEPR: Dropping of silent columns in NDFrame.agg with list-like func --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/apply.py | 21 ++++++++++++++++--- pandas/tests/apply/test_frame_apply.py | 18 ++++++++++++---- .../tests/groupby/aggregate/test_aggregate.py | 10 +++++++-- pandas/tests/groupby/aggregate/test_other.py | 8 +++++-- pandas/tests/groupby/test_groupby.py | 5 ++++- pandas/tests/resample/test_resample_api.py | 4 +++- 7 files changed, 54 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index f6e90a3341424..b59bc3006c5d7 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -337,6 +337,7 @@ Other Deprecations - Deprecated the ``squeeze`` argument to :meth:`read_csv`, :meth:`read_table`, and :meth:`read_excel`. Users should squeeze the DataFrame afterwards with ``.squeeze("columns")`` instead. (:issue:`43242`) - Deprecated the ``index`` argument to :class:`SparseArray` construction (:issue:`23089`) - Deprecated :meth:`.Rolling.validate`, :meth:`.Expanding.validate`, and :meth:`.ExponentialMovingWindow.validate` (:issue:`43665`) +- Deprecated silent dropping of columns that raised a ``TypeError`` or ``DataError`` in :class:`Series.aggregate` and :class:`DataFrame.aggregate` when used with a list (:issue:``) .. --------------------------------------------------------------------------- diff --git a/pandas/core/apply.py b/pandas/core/apply.py index dafcc71ae0b5a..40962b1e4009d 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -33,6 +33,7 @@ FrameOrSeries, ) from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import is_nested_object from pandas.core.dtypes.common import ( @@ -335,6 +336,7 @@ def agg_list_like(self) -> DataFrame | Series: results = [] keys = [] + failed_names = [] # degenerate case if selected_obj.ndim == 1: @@ -344,7 +346,7 @@ def agg_list_like(self) -> DataFrame | Series: new_res = colg.aggregate(a) except TypeError: - pass + failed_names.append(com.get_callable_name(a) or a) else: results.append(new_res) @@ -358,10 +360,14 @@ def agg_list_like(self) -> DataFrame | Series: for index, col in enumerate(selected_obj): colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) try: - new_res = colg.aggregate(arg) + with warnings.catch_warnings(record=True) as w: + new_res = colg.aggregate(arg) + if len(w) > 0: + failed_names.append(col) except (TypeError, DataError): - pass + failed_names.append(col) except ValueError as err: + failed_names.append(col) # cannot aggregate if "Must produce aggregated value" in str(err): # raised directly in _aggregate_named @@ -384,6 +390,15 @@ def agg_list_like(self) -> DataFrame | Series: if not len(results): raise ValueError("no results") + if len(failed_names) > 0: + warnings.warn( + f"{failed_names} did not aggregate successfully. If any error is " + f"raised this will raise in a future version of pandas. " + f"Drop these columns/ops to avoid this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + try: concatenated = concat(results, keys=keys, axis=1, sort=False) except TypeError as err: diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 62983b5327a26..1ebded43396e1 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1087,12 +1087,16 @@ def test_agg_multiple_mixed_no_warning(): index=["min", "sum"], ) # sorted index - with tm.assert_produces_warning(None): + with tm.assert_produces_warning( + FutureWarning, match=r"\['D'\] did not aggregate successfully" + ): result = mdf.agg(["min", "sum"]) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(None): + with tm.assert_produces_warning( + FutureWarning, match=r"\['D'\] did not aggregate successfully" + ): result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"]) # GH40420: the result of .agg should have an index that is sorted @@ -1201,7 +1205,10 @@ def test_nuiscance_columns(): expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"]) tm.assert_series_equal(result, expected) - result = df.agg(["sum"]) + with tm.assert_produces_warning( + FutureWarning, match=r"\['D'\] did not aggregate successfully" + ): + result = df.agg(["sum"]) expected = DataFrame( [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"] ) @@ -1433,7 +1440,10 @@ def foo(s): return s.sum() / 2 aggs = ["sum", foo, "count", "min"] - result = df.agg(aggs) + with tm.assert_produces_warning( + FutureWarning, match=r"\['item'\] did not aggregate successfully" + ): + result = df.agg(aggs) expected = DataFrame( { "item": ["123456", np.nan, 6, "1"], diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 4bda0e6ef9872..4338980a16c43 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -339,8 +339,14 @@ def test_multiple_functions_tuples_and_non_tuples(df): expected = df.groupby("A")["C"].agg(ex_funcs) tm.assert_frame_equal(result, expected) - result = df.groupby("A").agg(funcs) - expected = df.groupby("A").agg(ex_funcs) + with tm.assert_produces_warning( + FutureWarning, match=r"\['B'\] did not aggregate successfully" + ): + result = df.groupby("A").agg(funcs) + with tm.assert_produces_warning( + FutureWarning, match=r"\['B'\] did not aggregate successfully" + ): + expected = df.groupby("A").agg(ex_funcs) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 79990deed261d..c4e8d9e3a2269 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -45,13 +45,17 @@ def peak_to_peak(arr): return arr.max() - arr.min() with tm.assert_produces_warning( - FutureWarning, match="Dropping invalid", check_stacklevel=False + FutureWarning, + match=r"\['key2'\] did not aggregate successfully", + check_stacklevel=False, ): expected = grouped.agg([peak_to_peak]) expected.columns = ["data1", "data2"] with tm.assert_produces_warning( - FutureWarning, match="Dropping invalid", check_stacklevel=False + FutureWarning, + match=r"\['key2'\] did not aggregate successfully", + check_stacklevel=False, ): result = grouped.agg(peak_to_peak) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b9a6730996a02..3de6af8eef694 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -583,7 +583,10 @@ def test_frame_multi_key_function_list(): grouped = data.groupby(["A", "B"]) funcs = [np.mean, np.std] - agged = grouped.agg(funcs) + with tm.assert_produces_warning( + FutureWarning, match=r"\['C'\] did not aggregate successfully" + ): + agged = grouped.agg(funcs) expected = pd.concat( [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)], keys=["D", "E", "F"], diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 3b3bd402e4cc7..5ce89c7de9283 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -352,7 +352,9 @@ def test_agg(): for t in cases: warn = FutureWarning if t in cases[1:3] else None with tm.assert_produces_warning( - warn, match="Dropping invalid columns", check_stacklevel=False + warn, + match=r"\['date'\] did not aggregate successfully", + check_stacklevel=False, ): # .var on dt64 column raises and is dropped result = t.aggregate([np.mean, np.std]) From a1f72776bd8f767e0c71a8b601c6f257d66a37ef Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 25 Sep 2021 12:09:37 -0400 Subject: [PATCH 2/8] DEPR: Dropping of silent columns in NDFrame.agg with list-like func --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index b59bc3006c5d7..0a6f88daff0e1 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -337,7 +337,7 @@ Other Deprecations - Deprecated the ``squeeze`` argument to :meth:`read_csv`, :meth:`read_table`, and :meth:`read_excel`. Users should squeeze the DataFrame afterwards with ``.squeeze("columns")`` instead. (:issue:`43242`) - Deprecated the ``index`` argument to :class:`SparseArray` construction (:issue:`23089`) - Deprecated :meth:`.Rolling.validate`, :meth:`.Expanding.validate`, and :meth:`.ExponentialMovingWindow.validate` (:issue:`43665`) -- Deprecated silent dropping of columns that raised a ``TypeError`` or ``DataError`` in :class:`Series.aggregate` and :class:`DataFrame.aggregate` when used with a list (:issue:``) +- Deprecated silent dropping of columns that raised a ``TypeError`` or ``DataError`` in :class:`Series.aggregate` and :class:`DataFrame.aggregate` when used with a list (:issue:`43740`) .. --------------------------------------------------------------------------- From bbece5e0b7ac2ce81e0fa62391c001eaafe830e2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 25 Sep 2021 12:14:15 -0400 Subject: [PATCH 3/8] Remove check_stacklevel=False --- pandas/tests/groupby/aggregate/test_other.py | 2 -- pandas/tests/resample/test_resample_api.py | 1 - 2 files changed, 3 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index c4e8d9e3a2269..66b968e01eef1 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -47,7 +47,6 @@ def peak_to_peak(arr): with tm.assert_produces_warning( FutureWarning, match=r"\['key2'\] did not aggregate successfully", - check_stacklevel=False, ): expected = grouped.agg([peak_to_peak]) expected.columns = ["data1", "data2"] @@ -55,7 +54,6 @@ def peak_to_peak(arr): with tm.assert_produces_warning( FutureWarning, match=r"\['key2'\] did not aggregate successfully", - check_stacklevel=False, ): result = grouped.agg(peak_to_peak) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 5ce89c7de9283..10fabe234d218 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -354,7 +354,6 @@ def test_agg(): with tm.assert_produces_warning( warn, match=r"\['date'\] did not aggregate successfully", - check_stacklevel=False, ): # .var on dt64 column raises and is dropped result = t.aggregate([np.mean, np.std]) From f08c782197aca473ec1d20d5a7cbb84dc4482414 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 25 Sep 2021 12:23:55 -0400 Subject: [PATCH 4/8] Minor refactor --- pandas/core/apply.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 40962b1e4009d..e4fdc87eb06e9 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -367,17 +367,16 @@ def agg_list_like(self) -> DataFrame | Series: except (TypeError, DataError): failed_names.append(col) except ValueError as err: - failed_names.append(col) # cannot aggregate if "Must produce aggregated value" in str(err): # raised directly in _aggregate_named - pass + failed_names.append(col) elif "no results" in str(err): # reached in test_frame_apply.test_nuiscance_columns # where the colg.aggregate(arg) ends up going through # the selected_obj.ndim == 1 branch above with arg == ["sum"] # on a datetime64[ns] column - pass + failed_names.append(col) else: raise else: From 5e9de2d39dec32348b82fc62bd56fce8d58b12aa Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 25 Sep 2021 12:26:07 -0400 Subject: [PATCH 5/8] Fixup for whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 138a86d2c0b6e..077a25a500dea 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -337,7 +337,7 @@ Other Deprecations - Deprecated the ``squeeze`` argument to :meth:`read_csv`, :meth:`read_table`, and :meth:`read_excel`. Users should squeeze the DataFrame afterwards with ``.squeeze("columns")`` instead. (:issue:`43242`) - Deprecated the ``index`` argument to :class:`SparseArray` construction (:issue:`23089`) - Deprecated :meth:`.Rolling.validate`, :meth:`.Expanding.validate`, and :meth:`.ExponentialMovingWindow.validate` (:issue:`43665`) -- Deprecated silent dropping of columns that raised a ``TypeError`` or ``DataError`` in :class:`Series.aggregate` and :class:`DataFrame.aggregate` when used with a list (:issue:`43740`) +- Deprecated silent dropping of columns that raised a ``TypeError``, ``DataError``, or some cases of ``ValueError`` in :class:`Series.aggregate` and :class:`DataFrame.aggregate` when used with a list (:issue:`43740`) .. --------------------------------------------------------------------------- From 816db574fff3de8aaff1aa40adf9a5c024bb7586 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 25 Sep 2021 13:49:56 -0400 Subject: [PATCH 6/8] Doc fixups --- doc/source/user_guide/basics.rst | 3 +++ doc/source/user_guide/groupby.rst | 4 ++-- doc/source/whatsnew/v0.20.0.rst | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 82c8a27bec3a5..4dfad2eb803cd 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1045,6 +1045,9 @@ not noted for a particular column will be ``NaN``: Mixed dtypes ++++++++++++ +.. deprecated:: 1.4.0 + Attempting to determine which columns cannot be aggregated and silently dropping them from the results is deprecated and will be removed in a future version. If any porition of the columns or operations provided fail, the call to ``.agg`` will raise. + When presented with mixed dtypes that cannot aggregate, ``.agg`` will only take the valid aggregations. This is similar to how ``.groupby.agg`` works. diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 2cd6efe592277..0fb59c50efa74 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -578,7 +578,7 @@ column, which produces an aggregated result with a hierarchical index: .. ipython:: python - grouped.agg([np.sum, np.mean, np.std]) + grouped[["C", "D"]].agg([np.sum, np.mean, np.std]) The resulting aggregations are named for the functions themselves. If you @@ -597,7 +597,7 @@ For a grouped ``DataFrame``, you can rename in a similar manner: .. ipython:: python ( - grouped.agg([np.sum, np.mean, np.std]).rename( + grouped[["C", "D"]].agg([np.sum, np.mean, np.std]).rename( columns={"sum": "foo", "mean": "bar", "std": "baz"} ) ) diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 8d9821e53e30c..239431b7621c6 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -105,6 +105,7 @@ aggregations. This is similar to how groupby ``.agg()`` works. (:issue:`15015`) df.dtypes .. ipython:: python + :okwarning: df.agg(['min', 'sum']) From 9f47591f711c7fde824ace09941523d5e332ce2d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 25 Sep 2021 19:10:59 -0400 Subject: [PATCH 7/8] Pass through user warnings, update whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/core/apply.py | 31 +++++++++++++++++++++----- pandas/tests/apply/test_frame_apply.py | 17 ++++++++++++++ 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 077a25a500dea..6d5108db12ca9 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -337,7 +337,7 @@ Other Deprecations - Deprecated the ``squeeze`` argument to :meth:`read_csv`, :meth:`read_table`, and :meth:`read_excel`. Users should squeeze the DataFrame afterwards with ``.squeeze("columns")`` instead. (:issue:`43242`) - Deprecated the ``index`` argument to :class:`SparseArray` construction (:issue:`23089`) - Deprecated :meth:`.Rolling.validate`, :meth:`.Expanding.validate`, and :meth:`.ExponentialMovingWindow.validate` (:issue:`43665`) -- Deprecated silent dropping of columns that raised a ``TypeError``, ``DataError``, or some cases of ``ValueError`` in :class:`Series.aggregate` and :class:`DataFrame.aggregate` when used with a list (:issue:`43740`) +- Deprecated silent dropping of columns that raised a ``TypeError``, ``DataError``, and some cases of ``ValueError`` in :meth:`Series.aggregate`, :meth:`DataFrame.aggregate`, :meth:`Series.groupby.aggregate`, and :meth:`DataFrame.groupby.aggregate` when used with a list (:issue:`43740`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/apply.py b/pandas/core/apply.py index e4fdc87eb06e9..6b54cea5f8c28 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -4,6 +4,7 @@ from collections import defaultdict from functools import partial import inspect +import re from typing import ( TYPE_CHECKING, Any, @@ -338,6 +339,12 @@ def agg_list_like(self) -> DataFrame | Series: keys = [] failed_names = [] + depr_nuisance_columns_msg = ( + "{} did not aggregate successfully. If any error is " + "raised this will raise in a future version of pandas. " + "Drop these columns/ops to avoid this warning." + ) + # degenerate case if selected_obj.ndim == 1: for a in arg: @@ -360,10 +367,24 @@ def agg_list_like(self) -> DataFrame | Series: for index, col in enumerate(selected_obj): colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) try: - with warnings.catch_warnings(record=True) as w: + # Capture and suppress any warnings emitted by us in the call + # to agg below, but pass through any warnings that were + # generated otherwise. + with warnings.catch_warnings(record=True) as record: new_res = colg.aggregate(arg) - if len(w) > 0: - failed_names.append(col) + if len(record) > 0: + match = re.compile(depr_nuisance_columns_msg.format(".*")) + for warning in record: + if re.match(match, str(warning.message)): + failed_names.append(col) + else: + warnings.warn_explicit( + message=warning.message, + category=warning.category, + filename=warning.filename, + lineno=warning.lineno, + ) + except (TypeError, DataError): failed_names.append(col) except ValueError as err: @@ -391,9 +412,7 @@ def agg_list_like(self) -> DataFrame | Series: if len(failed_names) > 0: warnings.warn( - f"{failed_names} did not aggregate successfully. If any error is " - f"raised this will raise in a future version of pandas. " - f"Drop these columns/ops to avoid this warning.", + depr_nuisance_columns_msg.format(failed_names), FutureWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 1ebded43396e1..f8c945bb496a8 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1462,3 +1462,20 @@ def test_apply_getitem_axis_1(): result = df[["a", "a"]].apply(lambda x: x[0] + x[1], axis=1) expected = Series([0, 2, 4]) tm.assert_series_equal(result, expected) + + +def test_nuisance_depr_passes_through_warnings(): + # GH 43740 + # DataFrame.agg with list-likes may emit warnings for both individual + # args and for entire columns, but we only want to emit once. We + # catch and suppress the warnings for individual args, but need to make + # sure if some other warnings were raised, they get passed through to + # the user. + + def foo(x): + warnings.warn("Hello, World!") + return x.sum() + + df = DataFrame({"a": [1, 2, 3]}) + with tm.assert_produces_warning(UserWarning, match="Hello, World!"): + df.agg([foo]) From 78e013382867447a82936d49312cd25c81131973 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 28 Sep 2021 21:16:39 -0400 Subject: [PATCH 8/8] Supress warning in docs --- doc/source/user_guide/basics.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 4dfad2eb803cd..a589ad96ca7d9 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1064,6 +1064,7 @@ aggregations. This is similar to how ``.groupby.agg`` works. mdf.dtypes .. ipython:: python + :okwarning: mdf.agg(["min", "sum"])