Skip to content

Commit d06f2d3

Browse files
authored
ERR: Shorten traceback in groupby _cython_agg_general (#52992)
1 parent 25c579a commit d06f2d3

File tree

11 files changed

+91
-53
lines changed

11 files changed

+91
-53
lines changed

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ Other enhancements
109109
- Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`)
110110
- Added a new parameter ``by_row`` to :meth:`Series.apply`. When set to ``False`` the supplied callables will always operate on the whole Series (:issue:`53400`).
111111
- Groupby aggregations (such as :meth:`DataFrameGroupBy.sum`) can now preserve the dtype of the input instead of casting to ``float64`` (:issue:`44952`)
112+
- Improved error message when :meth:`DataFrameGroupBy.agg` fails (:issue:`52930`)
112113
- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`)
113114
- Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
114115
- Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`)

pandas/core/groupby/groupby.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -1723,7 +1723,7 @@ def _agg_general(
17231723
return result.__finalize__(self.obj, method="groupby")
17241724

17251725
def _agg_py_fallback(
1726-
self, values: ArrayLike, ndim: int, alt: Callable
1726+
self, how: str, values: ArrayLike, ndim: int, alt: Callable
17271727
) -> ArrayLike:
17281728
"""
17291729
Fallback to pure-python aggregation if _cython_operation raises
@@ -1749,7 +1749,12 @@ def _agg_py_fallback(
17491749
# We do not get here with UDFs, so we know that our dtype
17501750
# should always be preserved by the implemented aggregations
17511751
# TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
1752-
res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)
1752+
try:
1753+
res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)
1754+
except Exception as err:
1755+
msg = f"agg function failed [how->{how},dtype->{ser.dtype}]"
1756+
# preserve the type of the exception that was raised
1757+
raise type(err)(msg) from err
17531758

17541759
if ser.dtype == object:
17551760
res_values = res_values.astype(object, copy=False)
@@ -1791,8 +1796,10 @@ def array_func(values: ArrayLike) -> ArrayLike:
17911796
# TODO: shouldn't min_count matter?
17921797
if how in ["any", "all", "std", "sem"]:
17931798
raise # TODO: re-raise as TypeError? should not be reached
1794-
result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)
1799+
else:
1800+
return result
17951801

1802+
result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt)
17961803
return result
17971804

17981805
new_mgr = data.grouped_reduce(array_func)

pandas/tests/extension/base/groupby.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import re
2+
13
import pytest
24

35
from pandas.core.dtypes.common import (
@@ -141,7 +143,16 @@ def test_in_numeric_groupby(self, data_for_grouping):
141143
result = df.groupby("A").sum().columns
142144
else:
143145
expected = pd.Index(["C"])
144-
with pytest.raises(TypeError, match="does not support"):
145-
df.groupby("A").sum().columns
146+
147+
msg = "|".join(
148+
[
149+
# period/datetime
150+
"does not support sum operations",
151+
# all others
152+
re.escape(f"agg function failed [how->sum,dtype->{dtype}"),
153+
]
154+
)
155+
with pytest.raises(TypeError, match=msg):
156+
df.groupby("A").sum()
146157
result = df.groupby("A").sum(numeric_only=True).columns
147158
tm.assert_index_equal(result, expected)

pandas/tests/extension/test_arrow.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -585,7 +585,8 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping, request):
585585
super().test_groupby_extension_agg(as_index, data_for_grouping)
586586

587587
def test_in_numeric_groupby(self, data_for_grouping):
588-
if is_string_dtype(data_for_grouping.dtype):
588+
dtype = data_for_grouping.dtype
589+
if is_string_dtype(dtype):
589590
df = pd.DataFrame(
590591
{
591592
"A": [1, 1, 2, 2, 3, 3, 1, 4],
@@ -595,8 +596,9 @@ def test_in_numeric_groupby(self, data_for_grouping):
595596
)
596597

597598
expected = pd.Index(["C"])
598-
with pytest.raises(TypeError, match="does not support"):
599-
df.groupby("A").sum().columns
599+
msg = re.escape(f"agg function failed [how->sum,dtype->{dtype}")
600+
with pytest.raises(TypeError, match=msg):
601+
df.groupby("A").sum()
600602
result = df.groupby("A").sum(numeric_only=True).columns
601603
tm.assert_index_equal(result, expected)
602604
else:

pandas/tests/frame/test_stack_unstack.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from datetime import datetime
22
from io import StringIO
33
import itertools
4+
import re
45

56
import numpy as np
67
import pytest
@@ -1897,7 +1898,8 @@ def test_stack_multiple_bug(self):
18971898
multi = df.set_index(["DATE", "ID"])
18981899
multi.columns.name = "Params"
18991900
unst = multi.unstack("ID")
1900-
with pytest.raises(TypeError, match="Could not convert"):
1901+
msg = re.escape("agg function failed [how->mean,dtype->object]")
1902+
with pytest.raises(TypeError, match=msg):
19011903
unst.resample("W-THU").mean()
19021904
down = unst.resample("W-THU").mean(numeric_only=True)
19031905
rs = down.stack("ID")

pandas/tests/groupby/test_function.py

+9-14
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import builtins
22
from io import StringIO
3+
import re
34

45
import numpy as np
56
import pytest
@@ -249,8 +250,10 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
249250
msg = "|".join(
250251
[
251252
"Categorical is not ordered",
252-
"function is not implemented for this dtype",
253253
f"Cannot perform {method} with non-ordered Categorical",
254+
re.escape(f"agg function failed [how->{method},dtype->object]"),
255+
# cumsum/cummin/cummax/cumprod
256+
"function is not implemented for this dtype",
254257
]
255258
)
256259
with pytest.raises(exception, match=msg):
@@ -259,12 +262,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
259262
msg = "|".join(
260263
[
261264
"category type does not support sum operations",
262-
"[Cc]ould not convert",
263-
"can't multiply sequence by non-int of type 'str'",
265+
re.escape(f"agg function failed [how->{method},dtype->object]"),
264266
]
265267
)
266-
if method == "median":
267-
msg = r"Cannot convert \['a' 'b'\] to numeric"
268268
with pytest.raises(exception, match=msg):
269269
getattr(gb, method)()
270270
else:
@@ -274,16 +274,13 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
274274
if method not in ("first", "last"):
275275
msg = "|".join(
276276
[
277-
"[Cc]ould not convert",
278277
"Categorical is not ordered",
279278
"category type does not support",
280-
"can't multiply sequence",
281279
"function is not implemented for this dtype",
282280
f"Cannot perform {method} with non-ordered Categorical",
281+
re.escape(f"agg function failed [how->{method},dtype->object]"),
283282
]
284283
)
285-
if method == "median":
286-
msg = r"Cannot convert \['a' 'b'\] to numeric"
287284
with pytest.raises(exception, match=msg):
288285
getattr(gb, method)(numeric_only=False)
289286
else:
@@ -1464,16 +1461,14 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys):
14641461
msg = "|".join(
14651462
[
14661463
"not allowed for this dtype",
1467-
"must be a string or a number",
14681464
"cannot be performed against 'object' dtypes",
1469-
"must be a string or a real number",
1465+
# On PY39 message is "a number"; on PY310 and after is "a real number"
1466+
"must be a string or a.* number",
14701467
"unsupported operand type",
1471-
"not supported between instances of",
14721468
"function is not implemented for this dtype",
1469+
re.escape(f"agg function failed [how->{kernel},dtype->object]"),
14731470
]
14741471
)
1475-
if kernel == "median":
1476-
msg = r"Cannot convert \[<class 'object'> <class 'object'>\] to numeric"
14771472
with pytest.raises(exception, match=msg):
14781473
method(*args, **kwargs)
14791474
elif not has_arg and numeric_only is not lib.no_default:

pandas/tests/groupby/test_groupby.py

+16-10
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from datetime import datetime
22
from decimal import Decimal
3+
import re
34

45
import numpy as np
56
import pytest
@@ -641,7 +642,7 @@ def test_frame_multi_key_function_list_partial_failure():
641642

642643
grouped = data.groupby(["A", "B"])
643644
funcs = [np.mean, np.std]
644-
msg = "Could not convert string 'dullshinyshiny' to numeric"
645+
msg = re.escape("agg function failed [how->mean,dtype->object]")
645646
with pytest.raises(TypeError, match=msg):
646647
grouped.agg(funcs)
647648

@@ -925,9 +926,10 @@ def test_groupby_multi_corner(df):
925926

926927
def test_raises_on_nuisance(df):
927928
grouped = df.groupby("A")
928-
with pytest.raises(TypeError, match="Could not convert"):
929+
msg = re.escape("agg function failed [how->mean,dtype->object]")
930+
with pytest.raises(TypeError, match=msg):
929931
grouped.agg(np.mean)
930-
with pytest.raises(TypeError, match="Could not convert"):
932+
with pytest.raises(TypeError, match=msg):
931933
grouped.mean()
932934

933935
df = df.loc[:, ["A", "C", "D"]]
@@ -975,10 +977,12 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
975977
if agg_function in no_drop_nuisance and not numeric_only:
976978
# Added numeric_only as part of GH#46560; these do not drop nuisance
977979
# columns when numeric_only is False
978-
klass = ValueError if agg_function in ("std", "sem") else TypeError
979-
msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"])
980-
if agg_function == "median":
981-
msg = r"Cannot convert \['one' 'three' 'two'\] to numeric"
980+
if agg_function in ("std", "sem"):
981+
klass = ValueError
982+
msg = "could not convert string to float: 'one'"
983+
else:
984+
klass = TypeError
985+
msg = re.escape(f"agg function failed [how->{agg_function},dtype->object]")
982986
with pytest.raises(klass, match=msg):
983987
getattr(grouped, agg_function)(numeric_only=numeric_only)
984988
else:
@@ -1003,9 +1007,10 @@ def test_raise_on_nuisance_python_single(df):
10031007

10041008
def test_raise_on_nuisance_python_multiple(three_group):
10051009
grouped = three_group.groupby(["A", "B"])
1006-
with pytest.raises(TypeError, match="Could not convert"):
1010+
msg = re.escape("agg function failed [how->mean,dtype->object]")
1011+
with pytest.raises(TypeError, match=msg):
10071012
grouped.agg(np.mean)
1008-
with pytest.raises(TypeError, match="Could not convert"):
1013+
with pytest.raises(TypeError, match=msg):
10091014
grouped.mean()
10101015

10111016

@@ -1045,7 +1050,8 @@ def test_wrap_aggregated_output_multindex(mframe):
10451050
df["baz", "two"] = "peekaboo"
10461051

10471052
keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
1048-
with pytest.raises(TypeError, match="Could not convert"):
1053+
msg = re.escape("agg function failed [how->mean,dtype->object]")
1054+
with pytest.raises(TypeError, match=msg):
10491055
df.groupby(keys).agg(np.mean)
10501056
agged = df.drop(columns=("baz", "two")).groupby(keys).agg(np.mean)
10511057
assert isinstance(agged.columns, MultiIndex)

pandas/tests/groupby/test_raises.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# test file.
44

55
import datetime
6+
import re
67

78
import numpy as np
89
import pytest
@@ -162,24 +163,20 @@ def test_groupby_raises_string(
162163
"max": (None, ""),
163164
"mean": (
164165
TypeError,
165-
"Could not convert string '(xy|xyzwt|xyz|xztuo)' to numeric",
166+
re.escape("agg function failed [how->mean,dtype->object]"),
166167
),
167168
"median": (
168169
TypeError,
169-
"|".join(
170-
[
171-
r"Cannot convert \['x' 'y' 'z'\] to numeric",
172-
r"Cannot convert \['x' 'y'\] to numeric",
173-
r"Cannot convert \['x' 'y' 'z' 'w' 't'\] to numeric",
174-
r"Cannot convert \['x' 'z' 't' 'u' 'o'\] to numeric",
175-
]
176-
),
170+
re.escape("agg function failed [how->median,dtype->object]"),
177171
),
178172
"min": (None, ""),
179173
"ngroup": (None, ""),
180174
"nunique": (None, ""),
181175
"pct_change": (TypeError, "unsupported operand type"),
182-
"prod": (TypeError, "can't multiply sequence by non-int of type 'str'"),
176+
"prod": (
177+
TypeError,
178+
re.escape("agg function failed [how->prod,dtype->object]"),
179+
),
183180
"quantile": (TypeError, "cannot be performed against 'object' dtypes!"),
184181
"rank": (None, ""),
185182
"sem": (ValueError, "could not convert string to float"),
@@ -188,7 +185,10 @@ def test_groupby_raises_string(
188185
"skew": (ValueError, "could not convert string to float"),
189186
"std": (ValueError, "could not convert string to float"),
190187
"sum": (None, ""),
191-
"var": (TypeError, "could not convert string to float"),
188+
"var": (
189+
TypeError,
190+
re.escape("agg function failed [how->var,dtype->object]"),
191+
),
192192
}[groupby_func]
193193

194194
_call_and_check(klass, msg, how, gb, groupby_func, args)
@@ -225,7 +225,7 @@ def test_groupby_raises_string_np(
225225
np.sum: (None, ""),
226226
np.mean: (
227227
TypeError,
228-
"Could not convert string '(xyzwt|xy|xyz|xztuo)' to numeric",
228+
re.escape("agg function failed [how->mean,dtype->object]"),
229229
),
230230
}[groupby_func_np]
231231

pandas/tests/resample/test_resample_api.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from datetime import datetime
2+
import re
23

34
import numpy as np
45
import pytest
@@ -186,7 +187,8 @@ def tests_raises_on_nuisance(test_frame):
186187
tm.assert_frame_equal(result, expected)
187188

188189
expected = r[["A", "B", "C"]].mean()
189-
with pytest.raises(TypeError, match="Could not convert"):
190+
msg = re.escape("agg function failed [how->mean,dtype->object]")
191+
with pytest.raises(TypeError, match=msg):
190192
r.mean()
191193
result = r.mean(numeric_only=True)
192194
tm.assert_frame_equal(result, expected)
@@ -886,8 +888,13 @@ def test_frame_downsample_method(method, numeric_only, expected_data):
886888

887889
func = getattr(resampled, method)
888890
if isinstance(expected_data, str):
889-
klass = TypeError if method in ("var", "mean", "median", "prod") else ValueError
890-
with pytest.raises(klass, match=expected_data):
891+
if method in ("var", "mean", "median", "prod"):
892+
klass = TypeError
893+
msg = re.escape(f"agg function failed [how->{method},dtype->object]")
894+
else:
895+
klass = ValueError
896+
msg = expected_data
897+
with pytest.raises(klass, match=msg):
891898
_ = func(**kwargs)
892899
else:
893900
result = func(**kwargs)
@@ -933,7 +940,8 @@ def test_series_downsample_method(method, numeric_only, expected_data):
933940
with pytest.raises(TypeError, match=msg):
934941
func(**kwargs)
935942
elif method == "prod":
936-
with pytest.raises(TypeError, match="can't multiply sequence by non-int"):
943+
msg = re.escape("agg function failed [how->prod,dtype->object]")
944+
with pytest.raises(TypeError, match=msg):
937945
func(**kwargs)
938946
else:
939947
result = func(**kwargs)

pandas/tests/reshape/merge/test_join.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import re
2+
13
import numpy as np
24
import pytest
35

@@ -567,7 +569,8 @@ def test_mixed_type_join_with_suffix(self):
567569
df.insert(5, "dt", "foo")
568570

569571
grouped = df.groupby("id")
570-
with pytest.raises(TypeError, match="Could not convert"):
572+
msg = re.escape("agg function failed [how->mean,dtype->object]")
573+
with pytest.raises(TypeError, match=msg):
571574
grouped.mean()
572575
mn = grouped.mean(numeric_only=True)
573576
cn = grouped.count()

0 commit comments

Comments
 (0)