diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 838722f60b380..16c4a9f862d79 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -5,12 +5,99 @@ from collections import defaultdict from functools import partial -from typing import Any, Callable, DefaultDict, List, Sequence, Tuple, Union +from typing import ( + Any, + Callable, + DefaultDict, + Dict, + List, + Optional, + Sequence, + Tuple, + Union, +) + +from pandas._typing import Label from pandas.core.dtypes.common import is_dict_like, is_list_like +from pandas.core.base import SpecificationError import pandas.core.common as com from pandas.core.indexes.api import Index +from pandas.core.series import FrameOrSeriesUnion, Series + +# types of `func` kwarg for DataFrame.aggregate and Series.aggregate +AggFuncTypeBase = Union[Callable, str] +AggFuncType = Union[ + AggFuncTypeBase, + List[AggFuncTypeBase], + Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], +] + + +def reconstruct_func( + func: Optional[AggFuncType], **kwargs, +) -> Tuple[ + bool, Optional[AggFuncType], Optional[List[str]], Optional[List[int]], +]: + """ + Internal function to reconstruct `func` depending on whether relabeling is + applied, and to normalize the keyword arguments to get the new order of columns. + + If named aggregation is applied, `func` will be None, and kwargs contains the + column and aggregation function information to be parsed; + If named aggregation is not applied, `func` is either string (e.g. 'min') or + Callable, or list of them (e.g. ['min', np.max]), or the dictionary of column name + and str/Callable/list of them (e.g. {'A': 'min'}, or {'A': [np.min, lambda x: x]}) + + If relabeling is True, will return relabeling, reconstructed func, column + names, and the reconstructed order of columns. + If relabeling is False, the columns and order will be None. + + Parameters + ---------- + func: agg function (e.g. 'min' or Callable) or list of agg functions + (e.g.
['min', np.max]) or dictionary (e.g. {'A': ['min', np.max]}). + **kwargs: dict, kwargs used in is_multi_agg_with_relabel and + normalize_keyword_aggregation function for relabelling + + Returns + ------- + relabelling: bool, if there is relabelling or not + func: normalized and mangled func + columns: list of column names + order: list of columns indices + + Examples + -------- + >>> reconstruct_func(None, **{"foo": ("col", "min")}) + (True, defaultdict(None, {'col': ['min']}), ('foo',), array([0])) + + >>> reconstruct_func("min") + (False, 'min', None, None) + """ + relabeling = func is None and is_multi_agg_with_relabel(**kwargs) + columns: Optional[List[str]] = None + order: Optional[List[int]] = None + + if not relabeling: + if isinstance(func, list) and len(func) > len(set(func)): + + # GH 28426 will raise error if duplicated function names are used and + # there is no reassigned name + raise SpecificationError( + "Function names must be unique if there is no new column names " + "assigned" + ) + elif func is None: + # nicer error message + raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") + + if relabeling: + func, columns, order = normalize_keyword_aggregation(kwargs) + func = maybe_mangle_lambdas(func) + + return relabeling, func, columns, order def is_multi_agg_with_relabel(**kwargs) -> bool: @@ -198,6 +285,79 @@ def maybe_mangle_lambdas(agg_spec: Any) -> Any: return mangled_aggspec +def relabel_result( + result: FrameOrSeriesUnion, + func: Dict[str, List[Union[Callable, str]]], + columns: Tuple, + order: List[int], +) -> Dict[Label, Series]: + """Internal function to reorder result if relabelling is True for + dataframe.agg, and return the reordered result in dict. + + Parameters + ---------- + result: Result from aggregation + func: Dict of (column name, funcs) + columns: New columns name for relabelling + order: New order for relabelling + + Examples + -------- + >>> result = DataFrame({"A": [np.nan, 2, np.nan], + ...
"C": [6, np.nan, np.nan], "B": [np.nan, 4, 2.5]}) # doctest: +SKIP + >>> funcs = {"A": ["max"], "C": ["max"], "B": ["mean", "min"]} + >>> columns = ("foo", "aab", "bar", "dat") + >>> order = [0, 1, 2, 3] + >>> relabel_result(result, funcs, columns, order) # doctest: +SKIP + dict(A=Series([2.0, NaN, NaN, NaN], index=["foo", "aab", "bar", "dat"]), + C=Series([NaN, 6.0, NaN, NaN], index=["foo", "aab", "bar", "dat"]), + B=Series([NaN, NaN, 2.5, 4.0], index=["foo", "aab", "bar", "dat"])) + """ + reordered_indexes = [ + pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1]) + ] + reordered_result_in_dict: Dict[Label, Series] = {} + idx = 0 + + reorder_mask = not isinstance(result, Series) and len(result.columns) > 1 + for col, fun in func.items(): + s = result[col].dropna() + + # In the `_aggregate`, the callable names are obtained and used in `result`, and + # these names are ordered alphabetically. e.g. + # C2 C1 + # <lambda> 1 NaN + # amax NaN 4.0 + # max NaN 4.0 + # sum 18.0 6.0 + # Therefore, the order of functions for each column could be shuffled + # accordingly, so we need to get the callable name if it is not already a string, and + # reorder the aggregated result for each column. + # e.g. if df.agg(c1=("C2", sum), c2=("C2", lambda x: min(x))), correct order is + # [sum, <lambda>], but in `result`, it will be [<lambda>, sum], and we need to + # reorder so that aggregated values map to their functions regarding the order. + + # However, if there is only one column being used for aggregation, there is no need to + # reorder since the index is not sorted, and the order in `func` is kept as is, e.g. + # A + # min 1.0 + # mean 1.5 + # mean 1.5 + if reorder_mask: + fun = [ + com.get_callable_name(f) if not isinstance(f, str) else f for f in fun + ] + col_idx_order = Index(s.index).get_indexer(fun) + s = s[col_idx_order] + + # assign the new user-provided "named aggregation" as index names, and reindex + # it based on the whole user-provided names.
+ s.index = reordered_indexes[idx : idx + len(fun)] + reordered_result_in_dict[col] = s.reindex(columns, copy=False) + idx = idx + len(fun) + return reordered_result_in_dict + + def validate_func_kwargs( kwargs: dict, ) -> Tuple[List[str], List[Union[str, Callable[..., Any]]]]: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3d2200cb45c6e..10539ab74b4aa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -114,6 +114,7 @@ from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor +from pandas.core.aggregation import reconstruct_func, relabel_result from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor @@ -7301,9 +7302,11 @@ def _gotitem( examples=_agg_examples_doc, versionadded="\n.. versionadded:: 0.20.0\n", ) - def aggregate(self, func, axis=0, *args, **kwargs): + def aggregate(self, func=None, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) + relabeling, func, columns, order = reconstruct_func(func, **kwargs) + result = None try: result, how = self._aggregate(func, axis=axis, *args, **kwargs) @@ -7315,6 +7318,13 @@ def aggregate(self, func, axis=0, *args, **kwargs): raise exc from err if result is None: return self.apply(func, axis=axis, args=args, **kwargs) + + if relabeling: + # This is to keep the order of columns occurrence unchanged, and also + # keep the order of new columns occurrence unchanged + result_in_dict = relabel_result(result, func, columns, order) + result = DataFrame(result_in_dict, index=columns) + return result def _aggregate(self, arg, axis=0, *args, **kwargs): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ebb9d82766c1b..7f2eac520264d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -55,9 +55,8 @@ from pandas.core.dtypes.missing
import isna, notna from pandas.core.aggregation import ( - is_multi_agg_with_relabel, maybe_mangle_lambdas, - normalize_keyword_aggregation, + reconstruct_func, validate_func_kwargs, ) import pandas.core.algorithms as algorithms @@ -937,24 +936,7 @@ def aggregate( self, func=None, *args, engine="cython", engine_kwargs=None, **kwargs ): - relabeling = func is None and is_multi_agg_with_relabel(**kwargs) - if relabeling: - func, columns, order = normalize_keyword_aggregation(kwargs) - - kwargs = {} - elif isinstance(func, list) and len(func) > len(set(func)): - - # GH 28426 will raise error if duplicated function names are used and - # there is no reassigned name - raise SpecificationError( - "Function names must be unique if there is no new column " - "names assigned" - ) - elif func is None: - # nicer error message - raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") - - func = maybe_mangle_lambdas(func) + relabeling, func, columns, order = reconstruct_func(func, **kwargs) if engine == "numba": return self._python_agg_general( diff --git a/pandas/core/series.py b/pandas/core/series.py index 6c1d21e4526cf..9a633079b8c1d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4016,9 +4016,14 @@ def _gotitem(self, key, ndim, subset=None) -> "Series": examples=_agg_examples_doc, versionadded="\n.. 
versionadded:: 0.20.0\n", ) - def aggregate(self, func, axis=0, *args, **kwargs): + def aggregate(self, func=None, axis=0, *args, **kwargs): # Validate the axis parameter self._get_axis_number(axis) + + # if func is None, will switch to user-provided "named aggregation" kwargs + if func is None: + func = dict(kwargs.items()) + result, how = self._aggregate(func, *args, **kwargs) if result is None: diff --git a/pandas/tests/frame/apply/__init__.py b/pandas/tests/frame/apply/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/frame/apply/test_apply_relabeling.py b/pandas/tests/frame/apply/test_apply_relabeling.py new file mode 100644 index 0000000000000..965f69753bdc7 --- /dev/null +++ b/pandas/tests/frame/apply/test_apply_relabeling.py @@ -0,0 +1,104 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +class TestDataFrameNamedAggregate: + def test_agg_relabel(self): + # GH 26513 + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + + # simplest case with one column, one func + result = df.agg(foo=("B", "sum")) + expected = pd.DataFrame({"B": [10]}, index=pd.Index(["foo"])) + tm.assert_frame_equal(result, expected) + + # test on same column with different methods + result = df.agg(foo=("B", "sum"), bar=("B", "min")) + expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"])) + + tm.assert_frame_equal(result, expected) + + def test_agg_relabel_multi_columns_multi_methods(self): + # GH 26513, test on multiple columns with multiple methods + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + result = df.agg( + foo=("A", "sum"), + bar=("B", "mean"), + cat=("A", "min"), + dat=("B", "max"), + f=("A", "max"), + g=("C", "min"), + ) + expected = pd.DataFrame( + { + "A": [6.0, np.nan, 1.0, np.nan, 2.0, np.nan], + "B": [np.nan, 2.5, np.nan, 4.0, np.nan, np.nan], + "C": [np.nan, np.nan, np.nan, np.nan, np.nan, 3.0], + }, + 
index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]), + ) + tm.assert_frame_equal(result, expected) + + def test_agg_relabel_partial_functions(self): + # GH 26513, test on partial, functools or more complex cases + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) + expected = pd.DataFrame( + {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"]) + ) + tm.assert_frame_equal(result, expected) + + result = df.agg( + foo=("A", min), + bar=("A", np.min), + cat=("B", max), + dat=("C", "min"), + f=("B", np.sum), + kk=("B", lambda x: min(x)), + ) + expected = pd.DataFrame( + { + "A": [1.0, 1.0, np.nan, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, 4.0, np.nan, 10.0, 1.0], + "C": [np.nan, np.nan, np.nan, 3.0, np.nan, np.nan], + }, + index=pd.Index(["foo", "bar", "cat", "dat", "f", "kk"]), + ) + tm.assert_frame_equal(result, expected) + + def test_agg_namedtuple(self): + # GH 26513 + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + result = df.agg( + foo=pd.NamedAgg("B", "sum"), + bar=pd.NamedAgg("B", min), + cat=pd.NamedAgg(column="B", aggfunc="count"), + fft=pd.NamedAgg("B", aggfunc="max"), + ) + + expected = pd.DataFrame( + {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) + ) + tm.assert_frame_equal(result, expected) + + result = df.agg( + foo=pd.NamedAgg("A", "min"), + bar=pd.NamedAgg(column="B", aggfunc="max"), + cat=pd.NamedAgg(column="A", aggfunc="max"), + ) + expected = pd.DataFrame( + {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]}, + index=pd.Index(["foo", "bar", "cat"]), + ) + tm.assert_frame_equal(result, expected) + + def test_agg_raises(self): + # GH 26513 + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + msg = "Must provide" + + with pytest.raises(TypeError, match=msg): + df.agg() diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/apply/test_frame_apply.py similarity index 100% rename from 
pandas/tests/frame/test_apply.py rename to pandas/tests/frame/apply/test_frame_apply.py diff --git a/pandas/tests/series/apply/__init__.py b/pandas/tests/series/apply/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/series/apply/test_apply_relabeling.py b/pandas/tests/series/apply/test_apply_relabeling.py new file mode 100644 index 0000000000000..0b8d2c4e1f26d --- /dev/null +++ b/pandas/tests/series/apply/test_apply_relabeling.py @@ -0,0 +1,33 @@ +import pandas as pd +import pandas._testing as tm + + +class TestNamedAggregation: + def test_relabel_no_duplicated_method(self): + # this is to test there is no duplicated method used in agg + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]}) + + result = df["A"].agg(foo="sum") + expected = df["A"].agg({"foo": "sum"}) + tm.assert_series_equal(result, expected) + + result = df["B"].agg(foo="min", bar="max") + expected = df["B"].agg({"foo": "min", "bar": "max"}) + tm.assert_series_equal(result, expected) + + result = df["B"].agg(foo=sum, bar=min, cat="max") + expected = df["B"].agg({"foo": sum, "bar": min, "cat": "max"}) + tm.assert_series_equal(result, expected) + + def test_relabel_duplicated_method(self): + # this is to test with nested renaming, duplicated method can be used + # if they are assigned with different new names + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]}) + + result = df["A"].agg(foo="sum", bar="sum") + expected = pd.Series([6, 6], index=["foo", "bar"], name="A") + tm.assert_series_equal(result, expected) + + result = df["B"].agg(foo=min, bar="min") + expected = pd.Series([1, 1], index=["foo", "bar"], name="B") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/apply/test_series_apply.py similarity index 100% rename from pandas/tests/series/test_apply.py rename to pandas/tests/series/apply/test_series_apply.py