diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 8a893db95dc22..3b54918ae99c1 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1536,6 +1536,7 @@ def func(df):
             result = [index[i] if i >= 0 else np.nan for i in indices]
             return df._constructor_sliced(result, index=res.index)
 
+        func.__name__ = "idxmax"
         return self._python_apply_general(func, self._obj_with_exclusions)
 
     @Appender(DataFrame.idxmin.__doc__)
@@ -1557,6 +1558,7 @@ def func(df):
             result = [index[i] if i >= 0 else np.nan for i in indices]
             return df._constructor_sliced(result, index=res.index)
 
+        func.__name__ = "idxmin"
         return self._python_apply_general(func, self._obj_with_exclusions)
 
     boxplot = boxplot_frame_groupby
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 46e4465667e7e..60c8851f059fe 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -753,6 +753,18 @@ def apply(
                 mutated = True
             result_values.append(res)
 
+        # getattr pattern for __name__ is needed for functools.partial objects
+        if len(group_keys) == 0 and getattr(f, "__name__", None) not in [
+            "idxmin",
+            "idxmax",
+            "nanargmin",
+            "nanargmax",
+        ]:
+            # If group_keys is empty, then no function calls have been made,
+            # so we will not have raised even if this is an invalid dtype.
+            # So do one dummy call here to raise appropriate TypeError.
+            f(data.iloc[:0])
+
         return result_values, mutated
 
     @cache_readonly
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 83b096cfc2d05..203d8abb465d0 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -24,6 +24,11 @@
     to_datetime,
 )
 import pandas._testing as tm
+from pandas.core.arrays import (
+    BooleanArray,
+    FloatingArray,
+    IntegerArray,
+)
 from pandas.core.base import SpecificationError
 import pandas.core.common as com
 
@@ -1822,17 +1827,23 @@ def test_pivot_table_values_key_error():
 )
 @pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning")
 @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
-def test_empty_groupby(columns, keys, values, method, op, request):
+def test_empty_groupby(columns, keys, values, method, op, request, using_array_manager):
     # GH8093 & GH26411
     override_dtype = None
 
     if (
         isinstance(values, Categorical)
         and not isinstance(columns, list)
-        and op in ["sum", "prod"]
+        and op in ["sum", "prod", "skew", "mad"]
     ):
         # handled below GH#41291
-        pass
+
+        if using_array_manager and op == "mad":
+            right_msg = "Cannot interpret 'CategoricalDtype.* as a data type"
+            msg = "Regex pattern \"'Categorical' does not implement.*" + right_msg
+            mark = pytest.mark.xfail(raises=AssertionError, match=msg)
+            request.node.add_marker(mark)
+
     elif (
         isinstance(values, Categorical)
         and len(keys) == 1
@@ -1851,11 +1862,7 @@ def test_empty_groupby(columns, keys, values, method, op, request):
             raises=TypeError, match="'Categorical' does not implement"
         )
         request.node.add_marker(mark)
-    elif (
-        isinstance(values, Categorical)
-        and len(keys) == 1
-        and op in ["mad", "min", "max", "sum", "prod", "skew"]
-    ):
+    elif isinstance(values, Categorical) and len(keys) == 1 and op in ["sum", "prod"]:
         mark = pytest.mark.xfail(
             raises=AssertionError, match="(DataFrame|Series) are different"
         )
@@ -1869,7 +1876,30 @@ def test_empty_groupby(columns, keys, values, method, op, request):
             raises=AssertionError, match="(DataFrame|Series) are different"
         )
         request.node.add_marker(mark)
-    elif isinstance(values, pd.core.arrays.BooleanArray) and op in ["sum", "prod"]:
+    elif (
+        isinstance(values, (IntegerArray, FloatingArray))
+        and op == "mad"
+        and isinstance(columns, list)
+    ):
+        mark = pytest.mark.xfail(
+            raises=TypeError, match="can only perform ops with numeric values"
+        )
+        request.node.add_marker(mark)
+
+    elif (
+        op == "mad"
+        and not isinstance(columns, list)
+        and isinstance(values, pd.DatetimeIndex)
+        and values.tz is not None
+        and using_array_manager
+    ):
+        mark = pytest.mark.xfail(
+            raises=TypeError,
+            match=r"Cannot interpret 'datetime64\[ns, US/Eastern\]' as a data type",
+        )
+        request.node.add_marker(mark)
+
+    elif isinstance(values, BooleanArray) and op in ["sum", "prod"]:
         # We expect to get Int64 back for these
         override_dtype = "Int64"
 
@@ -1895,19 +1925,29 @@ def get_result():
 
     if columns == "C":
         # i.e. SeriesGroupBy
-        if op in ["prod", "sum"]:
+        if op in ["prod", "sum", "skew"]:
             # ops that require more than just ordered-ness
             if df.dtypes[0].kind == "M":
                 # GH#41291
                 # datetime64 -> prod and sum are invalid
-                msg = "datetime64 type does not support"
+                if op == "skew":
+                    msg = "'DatetimeArray' does not implement reduction 'skew'"
+                else:
+                    msg = "datetime64 type does not support"
                 with pytest.raises(TypeError, match=msg):
                     get_result()
 
                 return
-            elif isinstance(values, Categorical):
+        if op in ["prod", "sum", "skew", "mad"]:
+            if isinstance(values, Categorical):
                 # GH#41291
-                msg = "category type does not support"
+                if op == "mad":
+                    # mad calls mean, which Categorical doesn't implement
+                    msg = "'Categorical' does not implement reduction 'mean'"
+                elif op == "skew":
+                    msg = f"'Categorical' does not implement reduction '{op}'"
+                else:
+                    msg = "category type does not support"
                 with pytest.raises(TypeError, match=msg):
                     get_result()
 
@@ -1954,6 +1994,34 @@ def get_result():
         tm.assert_equal(result, expected)
         return
 
+    if (
+        op in ["mad", "min", "max", "skew"]
+        and isinstance(values, Categorical)
+        and len(keys) == 1
+    ):
+        # Categorical doesn't implement, so with numeric_only=True
+        # these are dropped and we get an empty DataFrame back
+        result = get_result()
+        expected = df.set_index(keys)[[]]
+
+        # with numeric_only=True, these are dropped, and we get
+        # an empty DataFrame back
+        if len(keys) != 1:
+            # Categorical is special without 'observed=True'
+            lev = Categorical([0], dtype=values.dtype)
+            mi = MultiIndex.from_product([lev, lev], names=keys)
+            expected = DataFrame([], columns=[], index=mi)
+        else:
+            # all columns are dropped, but we end up with one row
+            # Categorical is special without 'observed=True'
+            lev = Categorical([0], dtype=values.dtype)
+            ci = Index(lev, name=keys[0])
+            expected = DataFrame([], columns=[], index=ci)
+        # expected = df.set_index(keys)[columns]
+
+        tm.assert_equal(result, expected)
+        return
+
     result = get_result()
     expected = df.set_index(keys)[columns]
     if override_dtype is not None:
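
For reviewers, a self-contained sketch (illustration only, not part of the patch) of the two mechanics the ops.py hunk relies on: functools.partial objects carry no __name__, which appears to be why the exemption check uses getattr(f, "__name__", None) rather than f.__name__, and .iloc[:0] yields a zero-row frame that keeps the original dtypes, which is what lets the dummy call surface the appropriate TypeError even though no group was ever formed. The names nanargmax_like and empty below are hypothetical stand-ins.

from functools import partial

import numpy as np
import pandas as pd


def nanargmax_like(values, axis=0):
    # stand-in for the nanargmin/nanargmax-style callables named in the
    # exemption list; a plain function has a __name__, a partial does not
    return np.nanargmax(values, axis=axis)


wrapped = partial(nanargmax_like, axis=0)

print(getattr(nanargmax_like, "__name__", None))  # 'nanargmax_like'
print(getattr(wrapped, "__name__", None))         # None, partial has no __name__

# data.iloc[:0] keeps column dtypes while dropping all rows, so applying the
# grouped function to it exercises the same dtype checks the per-group loop
# would have hit if any groups existed.
empty = pd.DataFrame({"key": [np.nan], "val": pd.Categorical(["x"])}).iloc[:0]
print(empty.dtypes)  # key: float64, val: category; dtypes survive the empty slice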