Skip to content

ERR: Shorten traceback in groupby _cython_agg_general #52992

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jun 19, 2023
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ Other enhancements
- Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`)
- Added a new parameter ``by_row`` to :meth:`Series.apply`. When set to ``False`` the supplied callables will always operate on the whole Series (:issue:`53400`).
- Groupby aggregations (such as :meth:`DataFrameGroupby.sum`) now can preserve the dtype of the input instead of casting to ``float64`` (:issue:`44952`)
- Improved error message when :meth:`DataFrameGroupBy.agg` failed (:issue:`52930`)
- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`)
- Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
- Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`)
Expand Down
13 changes: 10 additions & 3 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1711,7 +1711,7 @@ def _agg_general(
return result.__finalize__(self.obj, method="groupby")

def _agg_py_fallback(
self, values: ArrayLike, ndim: int, alt: Callable
self, how: str, values: ArrayLike, ndim: int, alt: Callable
) -> ArrayLike:
"""
Fallback to pure-python aggregation if _cython_operation raises
Expand All @@ -1737,7 +1737,12 @@ def _agg_py_fallback(
# We do not get here with UDFs, so we know that our dtype
# should always be preserved by the implemented aggregations
# TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)
try:
res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)
except Exception as err:
msg = f"agg function failed [how->{how},dtype->{ser.dtype}]"
# preserve the kind of exception that raised
raise type(err)(msg) from err

if ser.dtype == object:
res_values = res_values.astype(object, copy=False)
Expand Down Expand Up @@ -1779,8 +1784,10 @@ def array_func(values: ArrayLike) -> ArrayLike:
# TODO: shouldn't min_count matter?
if how in ["any", "all", "std", "sem"]:
raise # TODO: re-raise as TypeError? should not be reached
result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)
else:
return result

result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt)
return result

new_mgr = data.grouped_reduce(array_func)
Expand Down
15 changes: 13 additions & 2 deletions pandas/tests/extension/base/groupby.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

import pytest

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -141,7 +143,16 @@ def test_in_numeric_groupby(self, data_for_grouping):
result = df.groupby("A").sum().columns
else:
expected = pd.Index(["C"])
with pytest.raises(TypeError, match="does not support"):
df.groupby("A").sum().columns

msg = "|".join(
[
# period/datetime
"does not support sum operations",
# all others
re.escape(f"agg function failed [how->sum,dtype->{dtype}"),
]
)
with pytest.raises(TypeError, match=msg):
df.groupby("A").sum()
result = df.groupby("A").sum(numeric_only=True).columns
tm.assert_index_equal(result, expected)
8 changes: 5 additions & 3 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,7 +585,8 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping, request):
super().test_groupby_extension_agg(as_index, data_for_grouping)

def test_in_numeric_groupby(self, data_for_grouping):
if is_string_dtype(data_for_grouping.dtype):
dtype = data_for_grouping.dtype
if is_string_dtype(dtype):
df = pd.DataFrame(
{
"A": [1, 1, 2, 2, 3, 3, 1, 4],
Expand All @@ -595,8 +596,9 @@ def test_in_numeric_groupby(self, data_for_grouping):
)

expected = pd.Index(["C"])
with pytest.raises(TypeError, match="does not support"):
df.groupby("A").sum().columns
msg = re.escape(f"agg function failed [how->sum,dtype->{dtype}")
with pytest.raises(TypeError, match=msg):
df.groupby("A").sum()
result = df.groupby("A").sum(numeric_only=True).columns
tm.assert_index_equal(result, expected)
else:
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from datetime import datetime
from io import StringIO
import itertools
import re

import numpy as np
import pytest
Expand Down Expand Up @@ -1897,7 +1898,8 @@ def test_stack_multiple_bug(self):
multi = df.set_index(["DATE", "ID"])
multi.columns.name = "Params"
unst = multi.unstack("ID")
with pytest.raises(TypeError, match="Could not convert"):
msg = re.escape("agg function failed [how->mean,dtype->object]")
with pytest.raises(TypeError, match=msg):
unst.resample("W-THU").mean()
down = unst.resample("W-THU").mean(numeric_only=True)
rs = down.stack("ID")
Expand Down
23 changes: 9 additions & 14 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import builtins
from io import StringIO
import re

import numpy as np
import pytest
Expand Down Expand Up @@ -249,8 +250,10 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
msg = "|".join(
[
"Categorical is not ordered",
"function is not implemented for this dtype",
f"Cannot perform {method} with non-ordered Categorical",
re.escape(f"agg function failed [how->{method},dtype->object]"),
# cumsum/cummin/cummax/cumprod
"function is not implemented for this dtype",
]
)
with pytest.raises(exception, match=msg):
Expand All @@ -259,12 +262,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
msg = "|".join(
[
"category type does not support sum operations",
"[Cc]ould not convert",
"can't multiply sequence by non-int of type 'str'",
re.escape(f"agg function failed [how->{method},dtype->object]"),
]
)
if method == "median":
msg = r"Cannot convert \['a' 'b'\] to numeric"
with pytest.raises(exception, match=msg):
getattr(gb, method)()
else:
Expand All @@ -274,16 +274,13 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
if method not in ("first", "last"):
msg = "|".join(
[
"[Cc]ould not convert",
"Categorical is not ordered",
"category type does not support",
"can't multiply sequence",
"function is not implemented for this dtype",
f"Cannot perform {method} with non-ordered Categorical",
re.escape(f"agg function failed [how->{method},dtype->object]"),
]
)
if method == "median":
msg = r"Cannot convert \['a' 'b'\] to numeric"
with pytest.raises(exception, match=msg):
getattr(gb, method)(numeric_only=False)
else:
Expand Down Expand Up @@ -1464,16 +1461,14 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys):
msg = "|".join(
[
"not allowed for this dtype",
"must be a string or a number",
"cannot be performed against 'object' dtypes",
"must be a string or a real number",
# On PY39 message is "a number"; on PY310 and after is "a real number"
"must be a string or a.* number",
"unsupported operand type",
"not supported between instances of",
"function is not implemented for this dtype",
re.escape(f"agg function failed [how->{kernel},dtype->object]"),
]
)
if kernel == "median":
msg = r"Cannot convert \[<class 'object'> <class 'object'>\] to numeric"
with pytest.raises(exception, match=msg):
method(*args, **kwargs)
elif not has_arg and numeric_only is not lib.no_default:
Expand Down
26 changes: 16 additions & 10 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from datetime import datetime
from decimal import Decimal
import re

import numpy as np
import pytest
Expand Down Expand Up @@ -641,7 +642,7 @@ def test_frame_multi_key_function_list_partial_failure():

grouped = data.groupby(["A", "B"])
funcs = [np.mean, np.std]
msg = "Could not convert string 'dullshinyshiny' to numeric"
msg = re.escape("agg function failed [how->mean,dtype->object]")
with pytest.raises(TypeError, match=msg):
grouped.agg(funcs)

Expand Down Expand Up @@ -915,9 +916,10 @@ def test_groupby_multi_corner(df):

def test_raises_on_nuisance(df):
grouped = df.groupby("A")
with pytest.raises(TypeError, match="Could not convert"):
msg = re.escape("agg function failed [how->mean,dtype->object]")
with pytest.raises(TypeError, match=msg):
grouped.agg(np.mean)
with pytest.raises(TypeError, match="Could not convert"):
with pytest.raises(TypeError, match=msg):
grouped.mean()

df = df.loc[:, ["A", "C", "D"]]
Expand Down Expand Up @@ -965,10 +967,12 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
if agg_function in no_drop_nuisance and not numeric_only:
# Added numeric_only as part of GH#46560; these do not drop nuisance
# columns when numeric_only is False
klass = ValueError if agg_function in ("std", "sem") else TypeError
msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"])
if agg_function == "median":
msg = r"Cannot convert \['one' 'three' 'two'\] to numeric"
if agg_function in ("std", "sem"):
klass = ValueError
msg = "could not convert string to float: 'one'"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the difference intended here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previously we checked multiple possibilities for the message, of which this was one. By separating out the ValueError from the TypeError cases, we only have to check the message associated with the ValuError.

At least, I think that's what you're asking about.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I meant that std and see have different errors

Copy link
Member Author

@rhshadrach rhshadrach Jun 19, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see! I do think there is something to improve here. If the algorithm actually tries to convert the value to a numeric one and fails, that to me is a ValueError. If, on the other hand, it sees that the object is a string and then doesn't even make an attempt to convert it, that would be a TypeError.

It would be good if we made this consistent by either (a) always trying to convert or (b) never trying to convert. However, even without this consistency, currently mean et al don't try to make the conversion and still raise "Could not convert":

df = DataFrame({'a': [1, 1, 2], 'b': list('456')})
gb = df.groupby('a')
gb.mean()
# TypeError: Could not convert string '45' to numeric

This should be improved, but separate from this PR.

else:
klass = TypeError
msg = re.escape(f"agg function failed [how->{agg_function},dtype->object]")
with pytest.raises(klass, match=msg):
getattr(grouped, agg_function)(numeric_only=numeric_only)
else:
Expand All @@ -993,9 +997,10 @@ def test_raise_on_nuisance_python_single(df):

def test_raise_on_nuisance_python_multiple(three_group):
grouped = three_group.groupby(["A", "B"])
with pytest.raises(TypeError, match="Could not convert"):
msg = re.escape("agg function failed [how->mean,dtype->object]")
with pytest.raises(TypeError, match=msg):
grouped.agg(np.mean)
with pytest.raises(TypeError, match="Could not convert"):
with pytest.raises(TypeError, match=msg):
grouped.mean()


Expand Down Expand Up @@ -1035,7 +1040,8 @@ def test_wrap_aggregated_output_multindex(mframe):
df["baz", "two"] = "peekaboo"

keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
with pytest.raises(TypeError, match="Could not convert"):
msg = re.escape("agg function failed [how->mean,dtype->object]")
with pytest.raises(TypeError, match=msg):
df.groupby(keys).agg(np.mean)
agged = df.drop(columns=("baz", "two")).groupby(keys).agg(np.mean)
assert isinstance(agged.columns, MultiIndex)
Expand Down
24 changes: 12 additions & 12 deletions pandas/tests/groupby/test_raises.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# test file.

import datetime
import re

import numpy as np
import pytest
Expand Down Expand Up @@ -162,24 +163,20 @@ def test_groupby_raises_string(
"max": (None, ""),
"mean": (
TypeError,
"Could not convert string '(xy|xyzwt|xyz|xztuo)' to numeric",
re.escape("agg function failed [how->mean,dtype->object]"),
),
"median": (
TypeError,
"|".join(
[
r"Cannot convert \['x' 'y' 'z'\] to numeric",
r"Cannot convert \['x' 'y'\] to numeric",
r"Cannot convert \['x' 'y' 'z' 'w' 't'\] to numeric",
r"Cannot convert \['x' 'z' 't' 'u' 'o'\] to numeric",
]
),
re.escape("agg function failed [how->median,dtype->object]"),
),
"min": (None, ""),
"ngroup": (None, ""),
"nunique": (None, ""),
"pct_change": (TypeError, "unsupported operand type"),
"prod": (TypeError, "can't multiply sequence by non-int of type 'str'"),
"prod": (
TypeError,
re.escape("agg function failed [how->prod,dtype->object]"),
),
"quantile": (TypeError, "cannot be performed against 'object' dtypes!"),
"rank": (None, ""),
"sem": (ValueError, "could not convert string to float"),
Expand All @@ -188,7 +185,10 @@ def test_groupby_raises_string(
"skew": (ValueError, "could not convert string to float"),
"std": (ValueError, "could not convert string to float"),
"sum": (None, ""),
"var": (TypeError, "could not convert string to float"),
"var": (
TypeError,
re.escape("agg function failed [how->var,dtype->object]"),
),
}[groupby_func]

_call_and_check(klass, msg, how, gb, groupby_func, args)
Expand Down Expand Up @@ -225,7 +225,7 @@ def test_groupby_raises_string_np(
np.sum: (None, ""),
np.mean: (
TypeError,
"Could not convert string '(xyzwt|xy|xyz|xztuo)' to numeric",
re.escape("agg function failed [how->mean,dtype->object]"),
),
}[groupby_func_np]

Expand Down
16 changes: 12 additions & 4 deletions pandas/tests/resample/test_resample_api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from datetime import datetime
import re

import numpy as np
import pytest
Expand Down Expand Up @@ -186,7 +187,8 @@ def tests_raises_on_nuisance(test_frame):
tm.assert_frame_equal(result, expected)

expected = r[["A", "B", "C"]].mean()
with pytest.raises(TypeError, match="Could not convert"):
msg = re.escape("agg function failed [how->mean,dtype->object]")
with pytest.raises(TypeError, match=msg):
r.mean()
result = r.mean(numeric_only=True)
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -886,8 +888,13 @@ def test_frame_downsample_method(method, numeric_only, expected_data):

func = getattr(resampled, method)
if isinstance(expected_data, str):
klass = TypeError if method in ("var", "mean", "median", "prod") else ValueError
with pytest.raises(klass, match=expected_data):
if method in ("var", "mean", "median", "prod"):
klass = TypeError
msg = re.escape(f"agg function failed [how->{method},dtype->object]")
else:
klass = ValueError
msg = expected_data
with pytest.raises(klass, match=msg):
_ = func(**kwargs)
else:
result = func(**kwargs)
Expand Down Expand Up @@ -933,7 +940,8 @@ def test_series_downsample_method(method, numeric_only, expected_data):
with pytest.raises(TypeError, match=msg):
func(**kwargs)
elif method == "prod":
with pytest.raises(TypeError, match="can't multiply sequence by non-int"):
msg = re.escape("agg function failed [how->prod,dtype->object]")
with pytest.raises(TypeError, match=msg):
func(**kwargs)
else:
result = func(**kwargs)
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/reshape/merge/test_join.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

import numpy as np
import pytest

Expand Down Expand Up @@ -567,7 +569,8 @@ def test_mixed_type_join_with_suffix(self):
df.insert(5, "dt", "foo")

grouped = df.groupby("id")
with pytest.raises(TypeError, match="Could not convert"):
msg = re.escape("agg function failed [how->mean,dtype->object]")
with pytest.raises(TypeError, match=msg):
grouped.mean()
mn = grouped.mean(numeric_only=True)
cn = grouped.count()
Expand Down
Loading