diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py new file mode 100644 index 0000000000000..79b87f146b9a7 --- /dev/null +++ b/pandas/core/aggregation.py @@ -0,0 +1,198 @@ +""" +aggregation.py contains utility functions to handle multiple named and lambda +kwarg aggregations in groupby and DataFrame/Series aggregation +""" + +from collections import defaultdict +from functools import partial +from typing import Any, DefaultDict, List, Sequence, Tuple + +from pandas.core.dtypes.common import is_dict_like, is_list_like + +import pandas.core.common as com +from pandas.core.indexes.api import Index + + +def is_multi_agg_with_relabel(**kwargs) -> bool: + """ + Check whether kwargs passed to .agg look like multi-agg with relabeling. + + Parameters + ---------- + **kwargs : dict + + Returns + ------- + bool + + Examples + -------- + >>> is_multi_agg_with_relabel(a='max') + False + >>> is_multi_agg_with_relabel(a_max=('a', 'max'), + ... a_min=('a', 'min')) + True + >>> is_multi_agg_with_relabel() + False + """ + return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and ( + len(kwargs) > 0 + ) + + +def normalize_keyword_aggregation(kwargs: dict) -> Tuple[dict, List[str], List[int]]: + """ + Normalize user-provided "named aggregation" kwargs. + Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs + to the old Dict[str, List[scalar]]]. + + Parameters + ---------- + kwargs : dict + + Returns + ------- + aggspec : dict + The transformed kwargs. + columns : List[str] + The user-provided keys. + col_idx_order : List[int] + List of columns indices. + + Examples + -------- + >>> normalize_keyword_aggregation({'output': ('input', 'sum')}) + ({'input': ['sum']}, ('output',), [('input', 'sum')]) + """ + # Normalize the aggregation functions as Mapping[column, List[func]], + # process normally, then fixup the names. + # TODO: aggspec type: typing.Dict[str, List[AggScalar]] + # May be hitting https://github.com/python/mypy/issues/5958 + # saying it doesn't have an attribute __name__ + aggspec: DefaultDict = defaultdict(list) + order = [] + columns, pairs = list(zip(*kwargs.items())) + + for name, (column, aggfunc) in zip(columns, pairs): + aggspec[column].append(aggfunc) + order.append((column, com.get_callable_name(aggfunc) or aggfunc)) + + # uniquify aggfunc name if duplicated in order list + uniquified_order = _make_unique_kwarg_list(order) + + # GH 25719, due to aggspec will change the order of assigned columns in aggregation + # uniquified_aggspec will store uniquified order list and will compare it with order + # based on index + aggspec_order = [ + (column, com.get_callable_name(aggfunc) or aggfunc) + for column, aggfuncs in aggspec.items() + for aggfunc in aggfuncs + ] + uniquified_aggspec = _make_unique_kwarg_list(aggspec_order) + + # get the new indice of columns by comparison + col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) + return aggspec, columns, col_idx_order + + +def _make_unique_kwarg_list( + seq: Sequence[Tuple[Any, Any]] +) -> Sequence[Tuple[Any, Any]]: + """Uniquify aggfunc name of the pairs in the order list + + Examples: + -------- + >>> kwarg_list = [('a', ''), ('a', ''), ('b', '')] + >>> _make_unique_kwarg_list(kwarg_list) + [('a', '_0'), ('a', '_1'), ('b', '')] + """ + return [ + (pair[0], "_".join([pair[1], str(seq[:i].count(pair))])) + if seq.count(pair) > 1 + else pair + for i, pair in enumerate(seq) + ] + + +# TODO: Can't use, because mypy doesn't like us setting __name__ +# error: "partial[Any]" has no attribute "__name__" +# the type is: +# typing.Sequence[Callable[..., ScalarResult]] +# -> typing.Sequence[Callable[..., ScalarResult]]: + + +def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: + """ + Possibly mangle a list of aggfuncs. + + Parameters + ---------- + aggfuncs : Sequence + + Returns + ------- + mangled: list-like + A new AggSpec sequence, where lambdas have been converted + to have unique names. + + Notes + ----- + If just one aggfunc is passed, the name will not be mangled. + """ + if len(aggfuncs) <= 1: + # don't mangle for .agg([lambda x: .]) + return aggfuncs + i = 0 + mangled_aggfuncs = [] + for aggfunc in aggfuncs: + if com.get_callable_name(aggfunc) == "": + aggfunc = partial(aggfunc) + aggfunc.__name__ = f"" + i += 1 + mangled_aggfuncs.append(aggfunc) + + return mangled_aggfuncs + + +def maybe_mangle_lambdas(agg_spec: Any) -> Any: + """ + Make new lambdas with unique names. + + Parameters + ---------- + agg_spec : Any + An argument to GroupBy.agg. + Non-dict-like `agg_spec` are pass through as is. + For dict-like `agg_spec` a new spec is returned + with name-mangled lambdas. + + Returns + ------- + mangled : Any + Same type as the input. + + Examples + -------- + >>> maybe_mangle_lambdas('sum') + 'sum' + >>> maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP + [, + .f(*args, **kwargs)>] + """ + is_dict = is_dict_like(agg_spec) + if not (is_dict or is_list_like(agg_spec)): + return agg_spec + mangled_aggspec = type(agg_spec)() # dict or OrderdDict + + if is_dict: + for key, aggfuncs in agg_spec.items(): + if is_list_like(aggfuncs) and not is_dict_like(aggfuncs): + mangled_aggfuncs = _managle_lambda_list(aggfuncs) + else: + mangled_aggfuncs = aggfuncs + + mangled_aggspec[key] = mangled_aggfuncs + else: + mangled_aggspec = _managle_lambda_list(agg_spec) + + return mangled_aggspec diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c49677fa27a31..98cdcd0f2b6ee 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -5,7 +5,7 @@ These are user facing as the result of the ``df.groupby(...)`` operations, which here returns a DataFrameGroupBy object. """ -from collections import abc, defaultdict, namedtuple +from collections import abc, namedtuple import copy from functools import partial from textwrap import dedent @@ -42,10 +42,8 @@ ensure_int64, ensure_platform_int, is_bool, - is_dict_like, is_integer_dtype, is_interval_dtype, - is_list_like, is_numeric_dtype, is_object_dtype, is_scalar, @@ -53,6 +51,11 @@ ) from pandas.core.dtypes.missing import _isna_ndarraylike, isna, notna +from pandas.core.aggregation import ( + is_multi_agg_with_relabel, + maybe_mangle_lambdas, + normalize_keyword_aggregation, +) import pandas.core.algorithms as algorithms from pandas.core.base import DataError, SpecificationError import pandas.core.common as com @@ -249,7 +252,7 @@ def aggregate(self, func=None, *args, **kwargs): elif isinstance(func, abc.Iterable): # Catch instances of lists / tuples # but not the class list / tuple itself. - func = _maybe_mangle_lambdas(func) + func = maybe_mangle_lambdas(func) ret = self._aggregate_multiple_funcs(func) if relabeling: ret.columns = columns @@ -918,9 +921,9 @@ class DataFrameGroupBy(GroupBy): @Appender(_shared_docs["aggregate"]) def aggregate(self, func=None, *args, **kwargs): - relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) + relabeling = func is None and is_multi_agg_with_relabel(**kwargs) if relabeling: - func, columns, order = _normalize_keyword_aggregation(kwargs) + func, columns, order = normalize_keyword_aggregation(kwargs) kwargs = {} elif isinstance(func, list) and len(func) > len(set(func)): @@ -935,7 +938,7 @@ def aggregate(self, func=None, *args, **kwargs): # nicer error message raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") - func = _maybe_mangle_lambdas(func) + func = maybe_mangle_lambdas(func) result, how = self._aggregate(func, *args, **kwargs) if how is None: @@ -1860,190 +1863,6 @@ def groupby_series(obj, col=None): boxplot = boxplot_frame_groupby -def _is_multi_agg_with_relabel(**kwargs) -> bool: - """ - Check whether kwargs passed to .agg look like multi-agg with relabeling. - - Parameters - ---------- - **kwargs : dict - - Returns - ------- - bool - - Examples - -------- - >>> _is_multi_agg_with_relabel(a='max') - False - >>> _is_multi_agg_with_relabel(a_max=('a', 'max'), - ... a_min=('a', 'min')) - True - >>> _is_multi_agg_with_relabel() - False - """ - return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and ( - len(kwargs) > 0 - ) - - -def _normalize_keyword_aggregation(kwargs): - """ - Normalize user-provided "named aggregation" kwargs. - - Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs - to the old Dict[str, List[scalar]]]. - - Parameters - ---------- - kwargs : dict - - Returns - ------- - aggspec : dict - The transformed kwargs. - columns : List[str] - The user-provided keys. - col_idx_order : List[int] - List of columns indices. - - Examples - -------- - >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) - ({'input': ['sum']}, ('output',), [('input', 'sum')]) - """ - # Normalize the aggregation functions as Mapping[column, List[func]], - # process normally, then fixup the names. - # TODO: aggspec type: typing.Dict[str, List[AggScalar]] - # May be hitting https://github.com/python/mypy/issues/5958 - # saying it doesn't have an attribute __name__ - aggspec = defaultdict(list) - order = [] - columns, pairs = list(zip(*kwargs.items())) - - for name, (column, aggfunc) in zip(columns, pairs): - aggspec[column].append(aggfunc) - order.append((column, com.get_callable_name(aggfunc) or aggfunc)) - - # uniquify aggfunc name if duplicated in order list - uniquified_order = _make_unique(order) - - # GH 25719, due to aggspec will change the order of assigned columns in aggregation - # uniquified_aggspec will store uniquified order list and will compare it with order - # based on index - aggspec_order = [ - (column, com.get_callable_name(aggfunc) or aggfunc) - for column, aggfuncs in aggspec.items() - for aggfunc in aggfuncs - ] - uniquified_aggspec = _make_unique(aggspec_order) - - # get the new indice of columns by comparison - col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) - return aggspec, columns, col_idx_order - - -def _make_unique(seq): - """Uniquify aggfunc name of the pairs in the order list - - Examples: - -------- - >>> _make_unique([('a', ''), ('a', ''), ('b', '')]) - [('a', '_0'), ('a', '_1'), ('b', '')] - """ - return [ - (pair[0], "_".join([pair[1], str(seq[:i].count(pair))])) - if seq.count(pair) > 1 - else pair - for i, pair in enumerate(seq) - ] - - -# TODO: Can't use, because mypy doesn't like us setting __name__ -# error: "partial[Any]" has no attribute "__name__" -# the type is: -# typing.Sequence[Callable[..., ScalarResult]] -# -> typing.Sequence[Callable[..., ScalarResult]]: - - -def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: - """ - Possibly mangle a list of aggfuncs. - - Parameters - ---------- - aggfuncs : Sequence - - Returns - ------- - mangled: list-like - A new AggSpec sequence, where lambdas have been converted - to have unique names. - - Notes - ----- - If just one aggfunc is passed, the name will not be mangled. - """ - if len(aggfuncs) <= 1: - # don't mangle for .agg([lambda x: .]) - return aggfuncs - i = 0 - mangled_aggfuncs = [] - for aggfunc in aggfuncs: - if com.get_callable_name(aggfunc) == "": - aggfunc = partial(aggfunc) - aggfunc.__name__ = f"" - i += 1 - mangled_aggfuncs.append(aggfunc) - - return mangled_aggfuncs - - -def _maybe_mangle_lambdas(agg_spec: Any) -> Any: - """ - Make new lambdas with unique names. - - Parameters - ---------- - agg_spec : Any - An argument to GroupBy.agg. - Non-dict-like `agg_spec` are pass through as is. - For dict-like `agg_spec` a new spec is returned - with name-mangled lambdas. - - Returns - ------- - mangled : Any - Same type as the input. - - Examples - -------- - >>> _maybe_mangle_lambdas('sum') - 'sum' - - >>> _maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP - [, - .f(*args, **kwargs)>] - """ - is_dict = is_dict_like(agg_spec) - if not (is_dict or is_list_like(agg_spec)): - return agg_spec - mangled_aggspec = type(agg_spec)() # dict or OrderdDict - - if is_dict: - for key, aggfuncs in agg_spec.items(): - if is_list_like(aggfuncs) and not is_dict_like(aggfuncs): - mangled_aggfuncs = _managle_lambda_list(aggfuncs) - else: - mangled_aggfuncs = aggfuncs - - mangled_aggspec[key] = mangled_aggfuncs - else: - mangled_aggspec = _managle_lambda_list(agg_spec) - - return mangled_aggspec - - def _recast_datetimelike_result(result: DataFrame) -> DataFrame: """ If we have date/time like in the original, then coerce dates diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 0b72a61ed84de..3d842aca210ed 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -10,7 +10,6 @@ from pandas import DataFrame, Index, MultiIndex, Series, concat import pandas._testing as tm from pandas.core.base import SpecificationError -from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas from pandas.core.groupby.grouper import Grouping @@ -632,41 +631,6 @@ def test_lambda_named_agg(func): class TestLambdaMangling: - def test_maybe_mangle_lambdas_passthrough(self): - assert _maybe_mangle_lambdas("mean") == "mean" - assert _maybe_mangle_lambdas(lambda x: x).__name__ == "" - # don't mangel single lambda. - assert _maybe_mangle_lambdas([lambda x: x])[0].__name__ == "" - - def test_maybe_mangle_lambdas_listlike(self): - aggfuncs = [lambda x: 1, lambda x: 2] - result = _maybe_mangle_lambdas(aggfuncs) - assert result[0].__name__ == "" - assert result[1].__name__ == "" - assert aggfuncs[0](None) == result[0](None) - assert aggfuncs[1](None) == result[1](None) - - def test_maybe_mangle_lambdas(self): - func = {"A": [lambda x: 0, lambda x: 1]} - result = _maybe_mangle_lambdas(func) - assert result["A"][0].__name__ == "" - assert result["A"][1].__name__ == "" - - def test_maybe_mangle_lambdas_args(self): - func = {"A": [lambda x, a, b=1: (0, a, b), lambda x: 1]} - result = _maybe_mangle_lambdas(func) - assert result["A"][0].__name__ == "" - assert result["A"][1].__name__ == "" - - assert func["A"][0](0, 1) == (0, 1, 1) - assert func["A"][0](0, 1, 2) == (0, 1, 2) - assert func["A"][0](0, 2, b=3) == (0, 2, 3) - - def test_maybe_mangle_lambdas_named(self): - func = {"C": np.mean, "D": {"foo": np.mean, "bar": np.mean}} - result = _maybe_mangle_lambdas(func) - assert result == func - def test_basic(self): df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]}) @@ -784,48 +748,3 @@ def test_agg_multiple_lambda(self): weight_min=pd.NamedAgg(column="weight", aggfunc=lambda x: np.min(x)), ) tm.assert_frame_equal(result2, expected) - - @pytest.mark.parametrize( - "order, expected_reorder", - [ - ( - [ - ("height", ""), - ("height", "max"), - ("weight", "max"), - ("height", ""), - ("weight", ""), - ], - [ - ("height", "_0"), - ("height", "max"), - ("weight", "max"), - ("height", "_1"), - ("weight", ""), - ], - ), - ( - [ - ("col2", "min"), - ("col1", ""), - ("col1", ""), - ("col1", ""), - ], - [ - ("col2", "min"), - ("col1", "_0"), - ("col1", "_1"), - ("col1", "_2"), - ], - ), - ( - [("col", ""), ("col", ""), ("col", "")], - [("col", "_0"), ("col", "_1"), ("col", "_2")], - ), - ], - ) - def test_make_unique(self, order, expected_reorder): - # GH 27519, test if make_unique function reorders correctly - result = _make_unique(order) - - assert result == expected_reorder diff --git a/pandas/tests/test_aggregation.py b/pandas/tests/test_aggregation.py new file mode 100644 index 0000000000000..74ccebc8e2275 --- /dev/null +++ b/pandas/tests/test_aggregation.py @@ -0,0 +1,90 @@ +import numpy as np +import pytest + +from pandas.core.aggregation import _make_unique_kwarg_list, maybe_mangle_lambdas + + +def test_maybe_mangle_lambdas_passthrough(): + assert maybe_mangle_lambdas("mean") == "mean" + assert maybe_mangle_lambdas(lambda x: x).__name__ == "" + # don't mangel single lambda. + assert maybe_mangle_lambdas([lambda x: x])[0].__name__ == "" + + +def test_maybe_mangle_lambdas_listlike(): + aggfuncs = [lambda x: 1, lambda x: 2] + result = maybe_mangle_lambdas(aggfuncs) + assert result[0].__name__ == "" + assert result[1].__name__ == "" + assert aggfuncs[0](None) == result[0](None) + assert aggfuncs[1](None) == result[1](None) + + +def test_maybe_mangle_lambdas(): + func = {"A": [lambda x: 0, lambda x: 1]} + result = maybe_mangle_lambdas(func) + assert result["A"][0].__name__ == "" + assert result["A"][1].__name__ == "" + + +def test_maybe_mangle_lambdas_args(): + func = {"A": [lambda x, a, b=1: (0, a, b), lambda x: 1]} + result = maybe_mangle_lambdas(func) + assert result["A"][0].__name__ == "" + assert result["A"][1].__name__ == "" + + assert func["A"][0](0, 1) == (0, 1, 1) + assert func["A"][0](0, 1, 2) == (0, 1, 2) + assert func["A"][0](0, 2, b=3) == (0, 2, 3) + + +def test_maybe_mangle_lambdas_named(): + func = {"C": np.mean, "D": {"foo": np.mean, "bar": np.mean}} + result = maybe_mangle_lambdas(func) + assert result == func + + +@pytest.mark.parametrize( + "order, expected_reorder", + [ + ( + [ + ("height", ""), + ("height", "max"), + ("weight", "max"), + ("height", ""), + ("weight", ""), + ], + [ + ("height", "_0"), + ("height", "max"), + ("weight", "max"), + ("height", "_1"), + ("weight", ""), + ], + ), + ( + [ + ("col2", "min"), + ("col1", ""), + ("col1", ""), + ("col1", ""), + ], + [ + ("col2", "min"), + ("col1", "_0"), + ("col1", "_1"), + ("col1", "_2"), + ], + ), + ( + [("col", ""), ("col", ""), ("col", "")], + [("col", "_0"), ("col", "_1"), ("col", "_2")], + ), + ], +) +def test_make_unique(order, expected_reorder): + # GH 27519, test if make_unique function reorders correctly + result = _make_unique_kwarg_list(order) + + assert result == expected_reorder