Skip to content

CLN: Breakup agg #37452

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
261 changes: 153 additions & 108 deletions pandas/core/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,133 +528,44 @@ def transform_str_or_callable(
return func(obj, *args, **kwargs)


def aggregate(obj, arg: AggFuncType, *args, **kwargs):
def aggregate(
obj,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you can type this as some point (obj), FrameOrSeries i think.

arg: AggFuncType,
*args,
**kwargs,
):
"""
provide an implementation for the aggregators
Provide an implementation for the aggregators.

Parameters
----------
arg : string, dict, function
*args : args to pass on to the function
**kwargs : kwargs to pass on to the function
obj : Pandas object to compute aggregation on.
arg : string, dict, function.
*args : args to pass on to the function.
**kwargs : kwargs to pass on to the function.

Returns
-------
tuple of result, how
tuple of result, how.

Notes
-----
how can be a string describe the required post-processing, or
None if not required
None if not required.
"""
is_aggregator = lambda x: isinstance(x, (list, tuple, dict))

_axis = kwargs.pop("_axis", None)
if _axis is None:
_axis = getattr(obj, "axis", 0)

if isinstance(arg, str):
return obj._try_aggregate_string_function(arg, *args, **kwargs), None

if isinstance(arg, dict):
# aggregate based on the passed dict
if _axis != 0: # pragma: no cover
raise ValueError("Can only pass dict with axis=0")

selected_obj = obj._selected_obj

# if we have a dict of any non-scalars
# eg. {'A' : ['mean']}, normalize all to
# be list-likes
if any(is_aggregator(x) for x in arg.values()):
new_arg: Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]] = {}
for k, v in arg.items():
if not isinstance(v, (tuple, list, dict)):
new_arg[k] = [v]
else:
new_arg[k] = v

# the keys must be in the columns
# for ndim=2, or renamers for ndim=1

# ok for now, but deprecated
# {'A': { 'ra': 'mean' }}
# {'A': { 'ra': ['mean'] }}
# {'ra': ['mean']}

# not ok
# {'ra' : { 'A' : 'mean' }}
if isinstance(v, dict):
raise SpecificationError("nested renamer is not supported")
elif isinstance(selected_obj, ABCSeries):
raise SpecificationError("nested renamer is not supported")
elif (
isinstance(selected_obj, ABCDataFrame)
and k not in selected_obj.columns
):
raise KeyError(f"Column '{k}' does not exist!")

arg = new_arg

else:
# deprecation of renaming keys
# GH 15931
keys = list(arg.keys())
if isinstance(selected_obj, ABCDataFrame) and len(
selected_obj.columns.intersection(keys)
) != len(keys):
cols = sorted(set(keys) - set(selected_obj.columns.intersection(keys)))
raise SpecificationError(f"Column(s) {cols} do not exist")

from pandas.core.reshape.concat import concat

if selected_obj.ndim == 1:
# key only used for output
colg = obj._gotitem(obj._selection, ndim=1)
results = {key: colg.agg(how) for key, how in arg.items()}
else:
# key used for column selection and output
results = {
key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()
}

# set the final keys
keys = list(arg.keys())

# Avoid making two isinstance calls in all and any below
is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()]

# combine results
if all(is_ndframe):
keys_to_use = [k for k in keys if not results[k].empty]
# Have to check, if at least one DataFrame is not empty.
keys_to_use = keys_to_use if keys_to_use != [] else keys
axis = 0 if isinstance(obj, ABCSeries) else 1
result = concat({k: results[k] for k in keys_to_use}, axis=axis)
elif any(is_ndframe):
# There is a mix of NDFrames and scalars
raise ValueError(
"cannot perform both aggregation "
"and transformation operations "
"simultaneously"
)
else:
from pandas import Series

# we have a dict of scalars
# GH 36212 use name only if obj is a series
if obj.ndim == 1:
obj = cast("Series", obj)
name = obj.name
else:
name = None

result = Series(results, name=name)

return result, True
elif is_dict_like(arg):
arg = cast(Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], arg)
return agg_dict_like(obj, arg, _axis), True
elif is_list_like(arg):
# we require a list, but not an 'str'
return aggregate_multiple_funcs(obj, arg, _axis=_axis), None
arg = cast(List[AggFuncTypeBase], arg)
return agg_list_like(obj, arg, _axis=_axis), None
else:
result = None

Expand All @@ -667,7 +578,26 @@ def aggregate(obj, arg: AggFuncType, *args, **kwargs):
return result, True


def aggregate_multiple_funcs(obj, arg, _axis):
def agg_list_like(
obj,
arg: List[AggFuncTypeBase],
_axis: int,
) -> FrameOrSeriesUnion:
"""
Compute aggregation in the case of a list-like argument.

Parameters
----------
obj : Pandas object to compute aggregation on.
arg : list
Aggregations to compute.
_axis : int, 0 or 1
Axis to compute aggregation on.

Returns
-------
Result of aggregation.
"""
from pandas.core.reshape.concat import concat

if _axis != 0:
Expand Down Expand Up @@ -738,3 +668,118 @@ def aggregate_multiple_funcs(obj, arg, _axis):
"cannot combine transform and aggregation operations"
) from err
return result


def agg_dict_like(
obj,
arg: Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe alias this

_axis: int,
) -> FrameOrSeriesUnion:
"""
Compute aggregation in the case of a dict-like argument.

Parameters
----------
obj : Pandas object to compute aggregation on.
arg : dict
label-aggregation pairs to compute.
_axis : int, 0 or 1
Axis to compute aggregation on.

Returns
-------
Result of aggregation.
"""
is_aggregator = lambda x: isinstance(x, (list, tuple, dict))

if _axis != 0: # pragma: no cover
raise ValueError("Can only pass dict with axis=0")

selected_obj = obj._selected_obj

# if we have a dict of any non-scalars
# eg. {'A' : ['mean']}, normalize all to
# be list-likes
if any(is_aggregator(x) for x in arg.values()):
new_arg: Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]] = {}
for k, v in arg.items():
if not isinstance(v, (tuple, list, dict)):
new_arg[k] = [v]
else:
new_arg[k] = v

# the keys must be in the columns
# for ndim=2, or renamers for ndim=1

# ok for now, but deprecated
# {'A': { 'ra': 'mean' }}
# {'A': { 'ra': ['mean'] }}
# {'ra': ['mean']}

# not ok
# {'ra' : { 'A' : 'mean' }}
if isinstance(v, dict):
raise SpecificationError("nested renamer is not supported")
elif isinstance(selected_obj, ABCSeries):
raise SpecificationError("nested renamer is not supported")
elif (
isinstance(selected_obj, ABCDataFrame) and k not in selected_obj.columns
):
raise KeyError(f"Column '{k}' does not exist!")

arg = new_arg

else:
# deprecation of renaming keys
# GH 15931
keys = list(arg.keys())
if isinstance(selected_obj, ABCDataFrame) and len(
selected_obj.columns.intersection(keys)
) != len(keys):
cols = sorted(set(keys) - set(selected_obj.columns.intersection(keys)))
raise SpecificationError(f"Column(s) {cols} do not exist")

from pandas.core.reshape.concat import concat

if selected_obj.ndim == 1:
# key only used for output
colg = obj._gotitem(obj._selection, ndim=1)
results = {key: colg.agg(how) for key, how in arg.items()}
else:
# key used for column selection and output
results = {key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()}

# set the final keys
keys = list(arg.keys())

# Avoid making two isinstance calls in all and any below
is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()]

# combine results
if all(is_ndframe):
Comment on lines +758 to +759
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my opinion, would be even better to have this if/elif/else statement in a separate function.
I would not suggest the best name, but I can infer from the comment that something containing combine could be the way to go.
Although, not a big deal.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah this would be nice, this pass or next ok

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, and am working in that direction.

keys_to_use = [k for k in keys if not results[k].empty]
# Have to check, if at least one DataFrame is not empty.
keys_to_use = keys_to_use if keys_to_use != [] else keys
axis = 0 if isinstance(obj, ABCSeries) else 1
result = concat({k: results[k] for k in keys_to_use}, axis=axis)
elif any(is_ndframe):
# There is a mix of NDFrames and scalars
raise ValueError(
"cannot perform both aggregation "
"and transformation operations "
"simultaneously"
)
else:
from pandas import Series

# we have a dict of scalars
# GH 36212 use name only if obj is a series
if obj.ndim == 1:
obj = cast("Series", obj)
Comment on lines +777 to +778
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it the same as this?

if isinstance(obj, Series):
   ...

No need for casting.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #36677 for a discussion on this section; note that Series is no longer imported in this module.

name = obj.name
else:
name = None

result = Series(results, name=name)

return result
4 changes: 2 additions & 2 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@
from pandas.core.dtypes.missing import isna, notna

from pandas.core.aggregation import (
agg_list_like,
aggregate,
aggregate_multiple_funcs,
maybe_mangle_lambdas,
reconstruct_func,
validate_func_kwargs,
Expand Down Expand Up @@ -968,7 +968,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)

# try to treat as if we are passing a list
try:
result = aggregate_multiple_funcs(self, [func], _axis=self.axis)
result = agg_list_like(self, [func], _axis=self.axis)

# select everything except for the last level, which is the one
# containing the name of the function(s), see GH 32040
Expand Down