diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 137be168985d2..f1f32ae8f3740 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -109,6 +109,7 @@ Other enhancements - Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`) - Added a new parameter ``by_row`` to :meth:`Series.apply`. When set to ``False`` the supplied callables will always operate on the whole Series (:issue:`53400`). - Groupby aggregations (such as :meth:`DataFrameGroupby.sum`) now can preserve the dtype of the input instead of casting to ``float64`` (:issue:`44952`) +- Improved error message when :meth:`DataFrameGroupBy.agg` failed (:issue:`52930`) - Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`) - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5b6d28ac9ab4a..958f033a0dae9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1711,7 +1711,7 @@ def _agg_general( return result.__finalize__(self.obj, method="groupby") def _agg_py_fallback( - self, values: ArrayLike, ndim: int, alt: Callable + self, how: str, values: ArrayLike, ndim: int, alt: Callable ) -> ArrayLike: """ Fallback to pure-python aggregation if _cython_operation raises @@ -1737,7 +1737,12 @@ def _agg_py_fallback( # We do not get here with UDFs, so we know that our dtype # should always be preserved by the implemented aggregations # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype? - res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True) + try: + res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True) + except Exception as err: + msg = f"agg function failed [how->{how},dtype->{ser.dtype}]" + # preserve the kind of exception that raised + raise type(err)(msg) from err if ser.dtype == object: res_values = res_values.astype(object, copy=False) @@ -1779,8 +1784,10 @@ def array_func(values: ArrayLike) -> ArrayLike: # TODO: shouldn't min_count matter? if how in ["any", "all", "std", "sem"]: raise # TODO: re-raise as TypeError? should not be reached - result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt) + else: + return result + result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt) return result new_mgr = data.grouped_reduce(array_func) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 9b8f3a43fbe64..bc8781eacfe06 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -1,3 +1,5 @@ +import re + import pytest from pandas.core.dtypes.common import ( @@ -141,7 +143,16 @@ def test_in_numeric_groupby(self, data_for_grouping): result = df.groupby("A").sum().columns else: expected = pd.Index(["C"]) - with pytest.raises(TypeError, match="does not support"): - df.groupby("A").sum().columns + + msg = "|".join( + [ + # period/datetime + "does not support sum operations", + # all others + re.escape(f"agg function failed [how->sum,dtype->{dtype}"), + ] + ) + with pytest.raises(TypeError, match=msg): + df.groupby("A").sum() result = df.groupby("A").sum(numeric_only=True).columns tm.assert_index_equal(result, expected) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 05d6e0e8722c5..452ccf24f1582 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -585,7 +585,8 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping, request): super().test_groupby_extension_agg(as_index, data_for_grouping) def test_in_numeric_groupby(self, data_for_grouping): - if is_string_dtype(data_for_grouping.dtype): + dtype = data_for_grouping.dtype + if is_string_dtype(dtype): df = pd.DataFrame( { "A": [1, 1, 2, 2, 3, 3, 1, 4], @@ -595,8 +596,9 @@ def test_in_numeric_groupby(self, data_for_grouping): ) expected = pd.Index(["C"]) - with pytest.raises(TypeError, match="does not support"): - df.groupby("A").sum().columns + msg = re.escape(f"agg function failed [how->sum,dtype->{dtype}") + with pytest.raises(TypeError, match=msg): + df.groupby("A").sum() result = df.groupby("A").sum(numeric_only=True).columns tm.assert_index_equal(result, expected) else: diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 9900db7910c46..13d93bb0a490d 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1,6 +1,7 @@ from datetime import datetime from io import StringIO import itertools +import re import numpy as np import pytest @@ -1897,7 +1898,8 @@ def test_stack_multiple_bug(self): multi = df.set_index(["DATE", "ID"]) multi.columns.name = "Params" unst = multi.unstack("ID") - with pytest.raises(TypeError, match="Could not convert"): + msg = re.escape("agg function failed [how->mean,dtype->object]") + with pytest.raises(TypeError, match=msg): unst.resample("W-THU").mean() down = unst.resample("W-THU").mean(numeric_only=True) rs = down.stack("ID") diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 98fce9d668e44..0535bafc2a907 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1,5 +1,6 @@ import builtins from io import StringIO +import re import numpy as np import pytest @@ -249,8 +250,10 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): msg = "|".join( [ "Categorical is not ordered", - "function is not implemented for this dtype", f"Cannot perform {method} with non-ordered Categorical", + re.escape(f"agg function failed [how->{method},dtype->object]"), + # cumsum/cummin/cummax/cumprod + "function is not implemented for this dtype", ] ) with pytest.raises(exception, match=msg): @@ -259,12 +262,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): msg = "|".join( [ "category type does not support sum operations", - "[Cc]ould not convert", - "can't multiply sequence by non-int of type 'str'", + re.escape(f"agg function failed [how->{method},dtype->object]"), ] ) - if method == "median": - msg = r"Cannot convert \['a' 'b'\] to numeric" with pytest.raises(exception, match=msg): getattr(gb, method)() else: @@ -274,16 +274,13 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): if method not in ("first", "last"): msg = "|".join( [ - "[Cc]ould not convert", "Categorical is not ordered", "category type does not support", - "can't multiply sequence", "function is not implemented for this dtype", f"Cannot perform {method} with non-ordered Categorical", + re.escape(f"agg function failed [how->{method},dtype->object]"), ] ) - if method == "median": - msg = r"Cannot convert \['a' 'b'\] to numeric" with pytest.raises(exception, match=msg): getattr(gb, method)(numeric_only=False) else: @@ -1464,16 +1461,14 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): msg = "|".join( [ "not allowed for this dtype", - "must be a string or a number", "cannot be performed against 'object' dtypes", - "must be a string or a real number", + # On PY39 message is "a number"; on PY310 and after is "a real number" + "must be a string or a.* number", "unsupported operand type", - "not supported between instances of", "function is not implemented for this dtype", + re.escape(f"agg function failed [how->{kernel},dtype->object]"), ] ) - if kernel == "median": - msg = r"Cannot convert \[ \] to numeric" with pytest.raises(exception, match=msg): method(*args, **kwargs) elif not has_arg and numeric_only is not lib.no_default: diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index bf0b646847ed6..0af22f4cb822b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1,5 +1,6 @@ from datetime import datetime from decimal import Decimal +import re import numpy as np import pytest @@ -641,7 +642,7 @@ def test_frame_multi_key_function_list_partial_failure(): grouped = data.groupby(["A", "B"]) funcs = [np.mean, np.std] - msg = "Could not convert string 'dullshinyshiny' to numeric" + msg = re.escape("agg function failed [how->mean,dtype->object]") with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -915,9 +916,10 @@ def test_groupby_multi_corner(df): def test_raises_on_nuisance(df): grouped = df.groupby("A") - with pytest.raises(TypeError, match="Could not convert"): + msg = re.escape("agg function failed [how->mean,dtype->object]") + with pytest.raises(TypeError, match=msg): grouped.agg(np.mean) - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match=msg): grouped.mean() df = df.loc[:, ["A", "C", "D"]] @@ -965,10 +967,12 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): if agg_function in no_drop_nuisance and not numeric_only: # Added numeric_only as part of GH#46560; these do not drop nuisance # columns when numeric_only is False - klass = ValueError if agg_function in ("std", "sem") else TypeError - msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"]) - if agg_function == "median": - msg = r"Cannot convert \['one' 'three' 'two'\] to numeric" + if agg_function in ("std", "sem"): + klass = ValueError + msg = "could not convert string to float: 'one'" + else: + klass = TypeError + msg = re.escape(f"agg function failed [how->{agg_function},dtype->object]") with pytest.raises(klass, match=msg): getattr(grouped, agg_function)(numeric_only=numeric_only) else: @@ -993,9 +997,10 @@ def test_raise_on_nuisance_python_single(df): def test_raise_on_nuisance_python_multiple(three_group): grouped = three_group.groupby(["A", "B"]) - with pytest.raises(TypeError, match="Could not convert"): + msg = re.escape("agg function failed [how->mean,dtype->object]") + with pytest.raises(TypeError, match=msg): grouped.agg(np.mean) - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match=msg): grouped.mean() @@ -1035,7 +1040,8 @@ def test_wrap_aggregated_output_multindex(mframe): df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] - with pytest.raises(TypeError, match="Could not convert"): + msg = re.escape("agg function failed [how->mean,dtype->object]") + with pytest.raises(TypeError, match=msg): df.groupby(keys).agg(np.mean) agged = df.drop(columns=("baz", "two")).groupby(keys).agg(np.mean) assert isinstance(agged.columns, MultiIndex) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 6fb903b02b62f..180755c1dca12 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -3,6 +3,7 @@ # test file. import datetime +import re import numpy as np import pytest @@ -162,24 +163,20 @@ def test_groupby_raises_string( "max": (None, ""), "mean": ( TypeError, - "Could not convert string '(xy|xyzwt|xyz|xztuo)' to numeric", + re.escape("agg function failed [how->mean,dtype->object]"), ), "median": ( TypeError, - "|".join( - [ - r"Cannot convert \['x' 'y' 'z'\] to numeric", - r"Cannot convert \['x' 'y'\] to numeric", - r"Cannot convert \['x' 'y' 'z' 'w' 't'\] to numeric", - r"Cannot convert \['x' 'z' 't' 'u' 'o'\] to numeric", - ] - ), + re.escape("agg function failed [how->median,dtype->object]"), ), "min": (None, ""), "ngroup": (None, ""), "nunique": (None, ""), "pct_change": (TypeError, "unsupported operand type"), - "prod": (TypeError, "can't multiply sequence by non-int of type 'str'"), + "prod": ( + TypeError, + re.escape("agg function failed [how->prod,dtype->object]"), + ), "quantile": (TypeError, "cannot be performed against 'object' dtypes!"), "rank": (None, ""), "sem": (ValueError, "could not convert string to float"), @@ -188,7 +185,10 @@ def test_groupby_raises_string( "skew": (ValueError, "could not convert string to float"), "std": (ValueError, "could not convert string to float"), "sum": (None, ""), - "var": (TypeError, "could not convert string to float"), + "var": ( + TypeError, + re.escape("agg function failed [how->var,dtype->object]"), + ), }[groupby_func] _call_and_check(klass, msg, how, gb, groupby_func, args) @@ -225,7 +225,7 @@ def test_groupby_raises_string_np( np.sum: (None, ""), np.mean: ( TypeError, - "Could not convert string '(xyzwt|xy|xyz|xztuo)' to numeric", + re.escape("agg function failed [how->mean,dtype->object]"), ), }[groupby_func_np] diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index d7b8f0c8053da..ba0b93d0b06f4 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -1,4 +1,5 @@ from datetime import datetime +import re import numpy as np import pytest @@ -186,7 +187,8 @@ def tests_raises_on_nuisance(test_frame): tm.assert_frame_equal(result, expected) expected = r[["A", "B", "C"]].mean() - with pytest.raises(TypeError, match="Could not convert"): + msg = re.escape("agg function failed [how->mean,dtype->object]") + with pytest.raises(TypeError, match=msg): r.mean() result = r.mean(numeric_only=True) tm.assert_frame_equal(result, expected) @@ -886,8 +888,13 @@ def test_frame_downsample_method(method, numeric_only, expected_data): func = getattr(resampled, method) if isinstance(expected_data, str): - klass = TypeError if method in ("var", "mean", "median", "prod") else ValueError - with pytest.raises(klass, match=expected_data): + if method in ("var", "mean", "median", "prod"): + klass = TypeError + msg = re.escape(f"agg function failed [how->{method},dtype->object]") + else: + klass = ValueError + msg = expected_data + with pytest.raises(klass, match=msg): _ = func(**kwargs) else: result = func(**kwargs) @@ -933,7 +940,8 @@ def test_series_downsample_method(method, numeric_only, expected_data): with pytest.raises(TypeError, match=msg): func(**kwargs) elif method == "prod": - with pytest.raises(TypeError, match="can't multiply sequence by non-int"): + msg = re.escape("agg function failed [how->prod,dtype->object]") + with pytest.raises(TypeError, match=msg): func(**kwargs) else: result = func(**kwargs) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index d5b0ad6b2d56d..ffdff75e53cf7 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pytest @@ -567,7 +569,8 @@ def test_mixed_type_join_with_suffix(self): df.insert(5, "dt", "foo") grouped = df.groupby("id") - with pytest.raises(TypeError, match="Could not convert"): + msg = re.escape("agg function failed [how->mean,dtype->object]") + with pytest.raises(TypeError, match=msg): grouped.mean() mn = grouped.mean(numeric_only=True) cn = grouped.count() diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index b40f0f7a45263..b6fcb27faf146 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -4,6 +4,7 @@ timedelta, ) from itertools import product +import re import numpy as np import pytest @@ -923,7 +924,8 @@ def test_no_col(self, data): # to help with a buglet data.columns = [k * 2 for k in data.columns] - with pytest.raises(TypeError, match="Could not convert"): + msg = re.escape("agg function failed [how->mean,dtype->object]") + with pytest.raises(TypeError, match=msg): data.pivot_table(index=["AA", "BB"], margins=True, aggfunc=np.mean) table = data.drop(columns="CC").pivot_table( index=["AA", "BB"], margins=True, aggfunc=np.mean @@ -932,7 +934,7 @@ def test_no_col(self, data): totals = table.loc[("All", ""), value_col] assert totals == data[value_col].mean() - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match=msg): data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") table = data.drop(columns="CC").pivot_table( index=["AA", "BB"], margins=True, aggfunc="mean" @@ -995,7 +997,8 @@ def test_margin_with_only_columns_defined( } ) if aggfunc != "sum": - with pytest.raises(TypeError, match="Could not convert"): + msg = re.escape("agg function failed [how->mean,dtype->object]") + with pytest.raises(TypeError, match=msg): df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) if "B" not in columns: df = df.drop(columns="B")