diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 68f1c78688b1d..8d96d49daba4f 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -256,6 +256,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- Fixed bug in :meth:`SeriesGroupBy.apply` where passing an unrecognized string argument failed to raise ``TypeError`` when the underlying ``Series`` is empty (:issue:`42021`) - Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`42287`) - diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d3a86fa5950ed..aad43e4f96b81 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1242,7 +1242,17 @@ def f(g): raise ValueError( "func must be a callable if args or kwargs are supplied" ) + elif isinstance(func, str): + if hasattr(self, func): + res = getattr(self, func) + if callable(res): + return res() + return res + + else: + raise TypeError(f"apply func should be callable, not '{func}'") else: + f = func # ignore SettingWithCopy here in case the user mutates diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0181481b29c44..bcdb6817c0321 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1770,13 +1770,9 @@ def test_empty_groupby(columns, keys, values, method, op, request): isinstance(values, Categorical) and not isinstance(columns, list) and op in ["sum", "prod"] - and method != "apply" ): # handled below GH#41291 pass - elif isinstance(values, Categorical) and len(keys) == 1 and method == "apply": - mark = pytest.mark.xfail(raises=TypeError, match="'str' object is not callable") - request.node.add_marker(mark) elif ( isinstance(values, Categorical) and len(keys) == 1 @@ -1808,21 +1804,16 @@ def test_empty_groupby(columns, keys, values, method, op, request): isinstance(values, Categorical) and len(keys) == 2 and op in ["min", "max", "sum"] - and method != "apply" ): mark = pytest.mark.xfail( raises=AssertionError, match="(DataFrame|Series) are different" ) request.node.add_marker(mark) - elif ( - isinstance(values, pd.core.arrays.BooleanArray) - and op in ["sum", "prod"] - and method != "apply" - ): + elif isinstance(values, pd.core.arrays.BooleanArray) and op in ["sum", "prod"]: # We expect to get Int64 back for these override_dtype = "Int64" - if isinstance(values[0], bool) and op in ("prod", "sum") and method != "apply": + if isinstance(values[0], bool) and op in ("prod", "sum"): # sum/product of bools is an integer override_dtype = "int64" @@ -1846,66 +1837,62 @@ def get_result(): # i.e. SeriesGroupBy if op in ["prod", "sum"]: # ops that require more than just ordered-ness - if method != "apply": - # FIXME: apply goes through different code path - if df.dtypes[0].kind == "M": - # GH#41291 - # datetime64 -> prod and sum are invalid - msg = "datetime64 type does not support" - with pytest.raises(TypeError, match=msg): - get_result() - - return - elif isinstance(values, Categorical): - # GH#41291 - msg = "category type does not support" - with pytest.raises(TypeError, match=msg): - get_result() - - return + if df.dtypes[0].kind == "M": + # GH#41291 + # datetime64 -> prod and sum are invalid + msg = "datetime64 type does not support" + with pytest.raises(TypeError, match=msg): + get_result() + + return + elif isinstance(values, Categorical): + # GH#41291 + msg = "category type does not support" + with pytest.raises(TypeError, match=msg): + get_result() + + return else: # ie. DataFrameGroupBy if op in ["prod", "sum"]: # ops that require more than just ordered-ness - if method != "apply": - # FIXME: apply goes through different code path - if df.dtypes[0].kind == "M": - # GH#41291 - # datetime64 -> prod and sum are invalid - result = get_result() - - # with numeric_only=True, these are dropped, and we get - # an empty DataFrame back - expected = df.set_index(keys)[[]] - tm.assert_equal(result, expected) - return - - elif isinstance(values, Categorical): - # GH#41291 - # Categorical doesn't implement sum or prod - result = get_result() - - # with numeric_only=True, these are dropped, and we get - # an empty DataFrame back - expected = df.set_index(keys)[[]] - if len(keys) != 1 and op == "prod": - # TODO: why just prod and not sum? - # Categorical is special without 'observed=True' - lev = Categorical([0], dtype=values.dtype) - mi = MultiIndex.from_product([lev, lev], names=["A", "B"]) - expected = DataFrame([], columns=[], index=mi) - - tm.assert_equal(result, expected) - return - - elif df.dtypes[0] == object: - # FIXME: the test is actually wrong here, xref #41341 - result = get_result() - # In this case we have list-of-list, will raise TypeError, - # and subsequently be dropped as nuisance columns - expected = df.set_index(keys)[[]] - tm.assert_equal(result, expected) - return + if df.dtypes[0].kind == "M": + # GH#41291 + # datetime64 -> prod and sum are invalid + result = get_result() + + # with numeric_only=True, these are dropped, and we get + # an empty DataFrame back + expected = df.set_index(keys)[[]] + tm.assert_equal(result, expected) + return + + elif isinstance(values, Categorical): + # GH#41291 + # Categorical doesn't implement sum or prod + result = get_result() + + # with numeric_only=True, these are dropped, and we get + # an empty DataFrame back + expected = df.set_index(keys)[[]] + if len(keys) != 1 and op == "prod": + # TODO: why just prod and not sum? + # Categorical is special without 'observed=True' + lev = Categorical([0], dtype=values.dtype) + mi = MultiIndex.from_product([lev, lev], names=["A", "B"]) + expected = DataFrame([], columns=[], index=mi) + + tm.assert_equal(result, expected) + return + + elif df.dtypes[0] == object: + # FIXME: the test is actually wrong here, xref #41341 + result = get_result() + # In this case we have list-of-list, will raise TypeError, + # and subsequently be dropped as nuisance columns + expected = df.set_index(keys)[[]] + tm.assert_equal(result, expected) + return result = get_result() expected = df.set_index(keys)[columns]