diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 725043616eaa7..0279d5418fc9c 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -14,6 +14,7 @@
     Iterable,
     Iterator,
     List,
+    Literal,
     Sequence,
     cast,
 )
@@ -158,19 +159,54 @@ def agg(self) -> DataFrame | Series | None:
             return self.apply_str()
 
         if is_dict_like(arg):
-            return self.agg_dict_like()
+            return self.dict_like("agg")
         elif is_list_like(arg):
             # we require a list, but not a 'str'
-            return self.agg_list_like()
+            return self.list_like("agg")
 
         if callable(arg):
             f = com.get_cython_func(arg)
             if f and not args and not kwargs:
                 return getattr(obj, f)()
+            elif not isinstance(obj, SelectionMixin):
+                # i.e. obj is Series or DataFrame
+                return self.agg_udf()
 
         # caller can react
         return None
 
+    def agg_udf(self):
+        obj = self.obj
+        arg = cast(Callable, self.f)
+
+        if not isinstance(obj, SelectionMixin):
+            # i.e. obj is Series or DataFrame
+            selected_obj = obj
+        elif obj._selected_obj.ndim == 1:
+            # For SeriesGroupBy this matches _obj_with_exclusions
+            selected_obj = obj._selected_obj
+        else:
+            selected_obj = obj._obj_with_exclusions
+
+        results = []
+
+        if selected_obj.ndim == 1:
+            colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
+            return arg(colg)
+
+        indices = []
+        for index, col in enumerate(selected_obj):
+            colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
+            new_res = arg(colg)
+            results.append(new_res)
+            indices.append(index)
+        keys = selected_obj.columns.take(indices)
+
+        from pandas import Series
+
+        result = Series(results, index=keys)
+        return result
+
     def transform(self) -> DataFrame | Series:
         """
         Transform a DataFrame or Series.
@@ -284,7 +320,7 @@ def transform_str_or_callable(self, func) -> DataFrame | Series:
         except Exception:
             return func(obj, *args, **kwargs)
 
-    def agg_list_like(self) -> DataFrame | Series:
+    def list_like(self, method: Literal["agg", "apply"]) -> DataFrame | Series:
         """
         Compute aggregation in the case of a list-like argument.
 
@@ -316,7 +352,7 @@ def agg_list_like(self) -> DataFrame | Series:
         if selected_obj.ndim == 1:
             for a in arg:
                 colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
-                new_res = colg.aggregate(a)
+                new_res = getattr(colg, method)(a)
                 results.append(new_res)
 
                 # make sure we find a good name
@@ -328,7 +364,7 @@ def agg_list_like(self) -> DataFrame | Series:
             indices = []
             for index, col in enumerate(selected_obj):
                 colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
-                new_res = colg.aggregate(arg)
+                new_res = getattr(colg, method)(arg)
                 results.append(new_res)
                 indices.append(index)
             keys = selected_obj.columns.take(indices)
@@ -357,7 +393,7 @@ def agg_list_like(self) -> DataFrame | Series:
             )
             return concatenated.reindex(full_ordered_index, copy=False)
 
-    def agg_dict_like(self) -> DataFrame | Series:
+    def dict_like(self, method: Literal["agg", "apply"]) -> DataFrame | Series:
         """
         Compute aggregation in the case of a dict-like argument.
 
@@ -382,16 +418,17 @@ def agg_dict_like(self) -> DataFrame | Series:
         selected_obj = obj._selected_obj
         selection = obj._selection
 
-        arg = self.normalize_dictlike_arg("agg", selected_obj, arg)
+        arg = self.normalize_dictlike_arg(method, selected_obj, arg)
 
         if selected_obj.ndim == 1:
             # key only used for output
             colg = obj._gotitem(selection, ndim=1)
-            results = {key: colg.agg(how) for key, how in arg.items()}
+            results = {key: getattr(colg, method)(how) for key, how in arg.items()}
         else:
             # key used for column selection and output
             results = {
-                key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()
+                key: getattr(obj._gotitem(key, ndim=1), method)(how)
+                for key, how in arg.items()
             }
 
         # set the final keys
@@ -412,7 +449,7 @@ def agg_dict_like(self) -> DataFrame | Series:
             ktu._set_names(selected_obj.columns.names)
             keys_to_use = ktu
 
-        axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1
+        axis: AxisInt = 0 if isinstance(obj, ABCSeries) and method == "agg" else 1
         result = concat(
             {k: results[k] for k in keys_to_use},  # type: ignore[misc]
             axis=axis,
@@ -477,7 +514,10 @@ def apply_multiple(self) -> DataFrame | Series:
         result: Series, DataFrame, or None
             Result when self.f is a list-like or dict-like, None otherwise.
         """
-        return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs)
+        if is_dict_like(self.f):
+            return self.dict_like("apply")
+        else:
+            return self.list_like("apply")
 
     def normalize_dictlike_arg(
         self, how: str, obj: DataFrame | Series, func: AggFuncTypeDict
@@ -676,9 +716,6 @@ def agg(self):
         if axis == 1:
             result = result.T if result is not None else result
 
-        if result is None:
-            result = self.obj.apply(self.orig_f, axis, args=self.args, **self.kwargs)
-
         return result
 
     def apply_empty_result(self):
@@ -1009,34 +1046,6 @@ def apply(self) -> DataFrame | Series:
         # self.f is Callable
         return self.apply_standard()
 
-    def agg(self):
-        result = super().agg()
-        if result is None:
-            f = self.f
-            kwargs = self.kwargs
-
-            # string, list-like, and dict-like are entirely handled in super
-            assert callable(f)
-
-            # we can be called from an inner function which
-            # passes this meta-data
-            kwargs.pop("_level", None)
-
-            # try a regular apply, this evaluates lambdas
-            # row-by-row; however if the lambda is expected a Series
-            # expression, e.g.: lambda x: x-x.quantile(0.25)
-            # this will fail, so we can try a vectorized evaluation
-
-            # we cannot FIRST try the vectorized evaluation, because
-            # then .agg and .apply would have different semantics if the
-            # operation is actually defined on the Series, e.g. str
-            try:
-                result = self.obj.apply(f)
-            except (ValueError, AttributeError, TypeError):
-                result = f(self.obj)
-
-        return result
-
     def apply_empty_result(self) -> Series:
         obj = self.obj
         return obj._constructor(dtype=obj.dtype, index=obj.index).__finalize__(
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index 28c776d0a6d35..6a71617f3b702 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -58,10 +58,10 @@ def test_apply_axis1_with_ea():
     "data, dtype",
     [(1, None), (1, CategoricalDtype([1])), (Timestamp("2013-01-01", tz="UTC"), None)],
 )
-def test_agg_axis1_duplicate_index(data, dtype):
+def test_apply_axis1_duplicate_index(data, dtype):
     # GH 42380
     expected = DataFrame([[data], [data]], index=["a", "a"], dtype=dtype)
-    result = expected.agg(lambda x: x, axis=1)
+    result = expected.apply(lambda x: x, axis=1)
     tm.assert_frame_equal(result, expected)
 
 
@@ -1065,8 +1065,6 @@ def test_consistency_for_boxed(box, int_frame_const_col):
 
 
 def test_agg_transform(axis, float_frame):
-    other_axis = 1 if axis in {0, "index"} else 0
-
     with np.errstate(all="ignore"):
         f_abs = np.abs(float_frame)
 
@@ -1080,25 +1078,17 @@ def test_agg_transform(axis, float_frame):
     # list-like
     result = float_frame.apply([np.sqrt], axis=axis)
     expected = f_sqrt.copy()
-    if axis in {0, "index"}:
-        expected.columns = MultiIndex.from_product([float_frame.columns, ["sqrt"]])
-    else:
-        expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]])
+    expected.columns = MultiIndex.from_product([float_frame.columns, ["sqrt"]])
     tm.assert_frame_equal(result, expected)
 
     # multiple items in list
     # these are in the order as if we are applying both
     # functions per series and then concatting
     result = float_frame.apply([np.abs, np.sqrt], axis=axis)
-    expected = zip_frames([f_abs, f_sqrt], axis=other_axis)
-    if axis in {0, "index"}:
-        expected.columns = MultiIndex.from_product(
-            [float_frame.columns, ["absolute", "sqrt"]]
-        )
-    else:
-        expected.index = MultiIndex.from_product(
-            [float_frame.index, ["absolute", "sqrt"]]
-        )
+    expected = zip_frames([f_abs, f_sqrt], axis=1)
+    expected.columns = MultiIndex.from_product(
+        [float_frame.columns, ["absolute", "sqrt"]]
+    )
     tm.assert_frame_equal(result, expected)
 
 
@@ -1486,10 +1476,10 @@ def test_apply_empty_list_reduce():
     tm.assert_series_equal(result, expected)
 
 
-def test_apply_no_suffix_index():
+def test_agg_no_suffix_index():
     # GH36189
     pdf = DataFrame([[4, 9]] * 3, columns=["A", "B"])
-    result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()])
+    result = pdf.agg(["sum", lambda x: x.sum(), lambda x: x.sum()])
     expected = DataFrame(
         {"A": [12, 12, 12], "B": [27, 27, 27]}, index=["sum", "", ""]
     )
@@ -1624,3 +1614,10 @@ def test_any_apply_keyword_non_zero_axis_regression():
     result = df.apply("any", 1)
 
     tm.assert_series_equal(result, expected)
+
+
+def test_agg_list_aggregated():
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    result = df.agg(list)
+    expected = Series({"a": [1, 2, 3], "b": [4, 5, 6]})
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py
index 5986f1f6cf51d..c0ceadfda6a62 100644
--- a/pandas/tests/apply/test_series_apply.py
+++ b/pandas/tests/apply/test_series_apply.py
@@ -269,10 +269,9 @@ def test_transform(string_series):
         # dict, provide renaming
         expected = concat([f_sqrt, f_abs], axis=1)
         expected.columns = ["foo", "bar"]
-        expected = expected.unstack().rename("series")
 
         result = string_series.apply({"foo": np.sqrt, "bar": np.abs})
-        tm.assert_series_equal(result.reindex_like(expected), expected)
+        tm.assert_frame_equal(result, expected)
 
 
 @pytest.mark.parametrize("op", series_transform_kernels)
@@ -348,18 +347,26 @@ def test_demo():
     tm.assert_series_equal(result, expected)
 
 
-def test_agg_apply_evaluate_lambdas_the_same(string_series):
+def test_apply_evaluate_lambdas_the_same(string_series):
     # test that we are evaluating row-by-row first
     # before vectorized evaluation
     result = string_series.apply(lambda x: str(x))
-    expected = string_series.agg(lambda x: str(x))
+    expected = string_series.astype(str)
    tm.assert_series_equal(result, expected)
 
     result = string_series.apply(str)
-    expected = string_series.agg(str)
     tm.assert_series_equal(result, expected)
 
 
+def test_agg_evaluate_lambdas_the_same(string_series):
+    result = string_series.agg(lambda x: str(x))
+    expected = str(string_series)
+    assert result == expected
+
+    result = string_series.agg(str)
+    assert result == expected
+
+
 def test_with_nested_series(datetime_series):
     # GH 2316
     # .agg with a reducer and a transform, what to do
@@ -368,13 +375,14 @@ def test_with_nested_series(datetime_series):
     tm.assert_frame_equal(result, expected)
 
     result = datetime_series.agg(lambda x: Series([x, x**2], index=["x", "x^2"]))
-    tm.assert_frame_equal(result, expected)
+    expected = Series([datetime_series, datetime_series**2], index=["x", "x^2"])
+    tm.assert_series_equal(result, expected)
 
 
 def test_replicate_describe(string_series):
     # this also tests a result set that is all scalars
     expected = string_series.describe()
-    result = string_series.apply(
+    result = string_series.agg(
         {
             "count": "count",
             "mean": "mean",
@@ -417,10 +425,10 @@ def test_non_callable_aggregates(how):
     tm.assert_series_equal(result, expected)
 
 
-def test_series_apply_no_suffix_index():
+def test_series_agg_no_suffix_index():
     # GH36189
     s = Series([4] * 3)
-    result = s.apply(["sum", lambda x: x.sum(), lambda x: x.sum()])
+    result = s.agg(["sum", lambda x: x.sum(), lambda x: x.sum()])
     expected = Series([12, 12, 12], index=["sum", "", ""])
 
     tm.assert_series_equal(result, expected)
@@ -860,12 +868,27 @@ def test_apply_to_timedelta():
         (np.array([np.sum, np.mean]), ["sum", "mean"]),
     ],
 )
-@pytest.mark.parametrize("how", ["agg", "apply"])
-def test_apply_listlike_reducer(string_series, ops, names, how):
+def test_apply_listlike_reducer(string_series, ops, names):
+    # GH 39140
+    expected = DataFrame({name: string_series for name, op in zip(names, ops)})
+    result = string_series.apply(ops)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "ops, names",
+    [
+        ([np.sum], ["sum"]),
+        ([np.sum, np.mean], ["sum", "mean"]),
+        (np.array([np.sum]), ["sum"]),
+        (np.array([np.sum, np.mean]), ["sum", "mean"]),
+    ],
+)
+def test_agg_listlike_reducer(string_series, ops, names):
     # GH 39140
     expected = Series({name: op(string_series) for name, op in zip(names, ops)})
     expected.name = "series"
-    result = getattr(string_series, how)(ops)
+    result = string_series.agg(ops)
     tm.assert_series_equal(result, expected)
 
 
@@ -878,12 +901,27 @@ def test_apply_listlike_reducer(string_series, ops, names, how):
         Series({"A": np.sum, "B": np.mean}),
     ],
 )
-@pytest.mark.parametrize("how", ["agg", "apply"])
-def test_apply_dictlike_reducer(string_series, ops, how):
+def test_apply_dictlike_reducer(string_series, ops):
+    # GH 39140
+    expected = DataFrame({name: string_series for name, op in ops.items()})
+    result = string_series.apply(ops)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "ops",
+    [
+        {"A": np.sum},
+        {"A": np.sum, "B": np.mean},
+        Series({"A": np.sum}),
+        Series({"A": np.sum, "B": np.mean}),
+    ],
+)
+def test_agg_dictlike_reducer(string_series, ops):
     # GH 39140
     expected = Series({name: op(string_series) for name, op in ops.items()})
     expected.name = string_series.name
-    result = getattr(string_series, how)(ops)
+    result = string_series.agg(ops)
     tm.assert_series_equal(result, expected)
 
 
@@ -917,10 +955,9 @@ def test_apply_listlike_transformer(string_series, ops, names):
 def test_apply_dictlike_transformer(string_series, ops):
     # GH 39140
     with np.errstate(all="ignore"):
-        expected = concat({name: op(string_series) for name, op in ops.items()})
-        expected.name = string_series.name
+        expected = DataFrame({name: op(string_series) for name, op in ops.items()})
         result = string_series.apply(ops)
-        tm.assert_series_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
 
 def test_apply_retains_column_name():
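For context, a minimal sketch of the apply/agg split that the modified tests above assert. It reflects the branch in this patch (before the change, list-like and dict-like arguments to apply were routed through aggregate, as the removed apply_multiple line shows); the variable names below are illustrative only and not part of the patch.

import numpy as np
import pandas as pd

ser = pd.Series([1.0, 4.0, 9.0])
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# agg keeps its reducing semantics: one labelled result per function
# (see test_agg_listlike_reducer).
ser.agg([np.sum, np.mean])                  # Series indexed by ["sum", "mean"]

# apply with a dict-like now calls apply per entry instead of agg, so the
# full-length outputs are collected column-wise (see test_transform and
# test_apply_dictlike_transformer).
ser.apply({"foo": np.sqrt, "bar": np.abs})  # DataFrame with columns ["foo", "bar"]

# the new agg_udf path lets a plain callable reduce each column to an
# arbitrary object (see test_agg_list_aggregated).
df.agg(list)                                # Series({"a": [1, 2, 3], "b": [4, 5, 6]})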