-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
CLN: Breakup agg #37452
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CLN: Breakup agg #37452
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -528,133 +528,44 @@ def transform_str_or_callable( | |
return func(obj, *args, **kwargs) | ||
|
||
|
||
def aggregate(obj, arg: AggFuncType, *args, **kwargs): | ||
def aggregate( | ||
obj, | ||
arg: AggFuncType, | ||
*args, | ||
**kwargs, | ||
): | ||
""" | ||
provide an implementation for the aggregators | ||
Provide an implementation for the aggregators. | ||
|
||
Parameters | ||
---------- | ||
arg : string, dict, function | ||
*args : args to pass on to the function | ||
**kwargs : kwargs to pass on to the function | ||
obj : Pandas object to compute aggregation on. | ||
arg : string, dict, function. | ||
*args : args to pass on to the function. | ||
**kwargs : kwargs to pass on to the function. | ||
|
||
Returns | ||
------- | ||
tuple of result, how | ||
tuple of result, how. | ||
|
||
Notes | ||
----- | ||
how can be a string describe the required post-processing, or | ||
None if not required | ||
None if not required. | ||
""" | ||
is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) | ||
|
||
_axis = kwargs.pop("_axis", None) | ||
if _axis is None: | ||
_axis = getattr(obj, "axis", 0) | ||
|
||
if isinstance(arg, str): | ||
return obj._try_aggregate_string_function(arg, *args, **kwargs), None | ||
|
||
if isinstance(arg, dict): | ||
# aggregate based on the passed dict | ||
if _axis != 0: # pragma: no cover | ||
raise ValueError("Can only pass dict with axis=0") | ||
|
||
selected_obj = obj._selected_obj | ||
|
||
# if we have a dict of any non-scalars | ||
# eg. {'A' : ['mean']}, normalize all to | ||
# be list-likes | ||
if any(is_aggregator(x) for x in arg.values()): | ||
new_arg: Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]] = {} | ||
for k, v in arg.items(): | ||
if not isinstance(v, (tuple, list, dict)): | ||
new_arg[k] = [v] | ||
else: | ||
new_arg[k] = v | ||
|
||
# the keys must be in the columns | ||
# for ndim=2, or renamers for ndim=1 | ||
|
||
# ok for now, but deprecated | ||
# {'A': { 'ra': 'mean' }} | ||
# {'A': { 'ra': ['mean'] }} | ||
# {'ra': ['mean']} | ||
|
||
# not ok | ||
# {'ra' : { 'A' : 'mean' }} | ||
if isinstance(v, dict): | ||
raise SpecificationError("nested renamer is not supported") | ||
elif isinstance(selected_obj, ABCSeries): | ||
raise SpecificationError("nested renamer is not supported") | ||
elif ( | ||
isinstance(selected_obj, ABCDataFrame) | ||
and k not in selected_obj.columns | ||
): | ||
raise KeyError(f"Column '{k}' does not exist!") | ||
|
||
arg = new_arg | ||
|
||
else: | ||
# deprecation of renaming keys | ||
# GH 15931 | ||
keys = list(arg.keys()) | ||
if isinstance(selected_obj, ABCDataFrame) and len( | ||
selected_obj.columns.intersection(keys) | ||
) != len(keys): | ||
cols = sorted(set(keys) - set(selected_obj.columns.intersection(keys))) | ||
raise SpecificationError(f"Column(s) {cols} do not exist") | ||
|
||
from pandas.core.reshape.concat import concat | ||
|
||
if selected_obj.ndim == 1: | ||
# key only used for output | ||
colg = obj._gotitem(obj._selection, ndim=1) | ||
results = {key: colg.agg(how) for key, how in arg.items()} | ||
else: | ||
# key used for column selection and output | ||
results = { | ||
key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items() | ||
} | ||
|
||
# set the final keys | ||
keys = list(arg.keys()) | ||
|
||
# Avoid making two isinstance calls in all and any below | ||
is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] | ||
|
||
# combine results | ||
if all(is_ndframe): | ||
keys_to_use = [k for k in keys if not results[k].empty] | ||
# Have to check, if at least one DataFrame is not empty. | ||
keys_to_use = keys_to_use if keys_to_use != [] else keys | ||
axis = 0 if isinstance(obj, ABCSeries) else 1 | ||
result = concat({k: results[k] for k in keys_to_use}, axis=axis) | ||
elif any(is_ndframe): | ||
# There is a mix of NDFrames and scalars | ||
raise ValueError( | ||
"cannot perform both aggregation " | ||
"and transformation operations " | ||
"simultaneously" | ||
) | ||
else: | ||
from pandas import Series | ||
|
||
# we have a dict of scalars | ||
# GH 36212 use name only if obj is a series | ||
if obj.ndim == 1: | ||
obj = cast("Series", obj) | ||
name = obj.name | ||
else: | ||
name = None | ||
|
||
result = Series(results, name=name) | ||
|
||
return result, True | ||
elif is_dict_like(arg): | ||
arg = cast(Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], arg) | ||
return agg_dict_like(obj, arg, _axis), True | ||
elif is_list_like(arg): | ||
# we require a list, but not an 'str' | ||
return aggregate_multiple_funcs(obj, arg, _axis=_axis), None | ||
arg = cast(List[AggFuncTypeBase], arg) | ||
return agg_list_like(obj, arg, _axis=_axis), None | ||
else: | ||
result = None | ||
|
||
|
@@ -667,7 +578,26 @@ def aggregate(obj, arg: AggFuncType, *args, **kwargs): | |
return result, True | ||
|
||
|
||
def aggregate_multiple_funcs(obj, arg, _axis): | ||
def agg_list_like( | ||
obj, | ||
arg: List[AggFuncTypeBase], | ||
_axis: int, | ||
) -> FrameOrSeriesUnion: | ||
""" | ||
Compute aggregation in the case of a list-like argument. | ||
|
||
Parameters | ||
---------- | ||
obj : Pandas object to compute aggregation on. | ||
arg : list | ||
Aggregations to compute. | ||
_axis : int, 0 or 1 | ||
Axis to compute aggregation on. | ||
|
||
Returns | ||
------- | ||
Result of aggregation. | ||
""" | ||
from pandas.core.reshape.concat import concat | ||
|
||
if _axis != 0: | ||
|
@@ -738,3 +668,118 @@ def aggregate_multiple_funcs(obj, arg, _axis): | |
"cannot combine transform and aggregation operations" | ||
) from err | ||
return result | ||
|
||
|
||
def agg_dict_like( | ||
obj, | ||
arg: Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe alias this |
||
_axis: int, | ||
) -> FrameOrSeriesUnion: | ||
""" | ||
Compute aggregation in the case of a dict-like argument. | ||
|
||
Parameters | ||
---------- | ||
obj : Pandas object to compute aggregation on. | ||
arg : dict | ||
label-aggregation pairs to compute. | ||
_axis : int, 0 or 1 | ||
Axis to compute aggregation on. | ||
|
||
Returns | ||
------- | ||
Result of aggregation. | ||
""" | ||
is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) | ||
|
||
if _axis != 0: # pragma: no cover | ||
raise ValueError("Can only pass dict with axis=0") | ||
|
||
selected_obj = obj._selected_obj | ||
|
||
# if we have a dict of any non-scalars | ||
# eg. {'A' : ['mean']}, normalize all to | ||
# be list-likes | ||
if any(is_aggregator(x) for x in arg.values()): | ||
new_arg: Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]] = {} | ||
for k, v in arg.items(): | ||
if not isinstance(v, (tuple, list, dict)): | ||
new_arg[k] = [v] | ||
else: | ||
new_arg[k] = v | ||
|
||
# the keys must be in the columns | ||
# for ndim=2, or renamers for ndim=1 | ||
|
||
# ok for now, but deprecated | ||
# {'A': { 'ra': 'mean' }} | ||
# {'A': { 'ra': ['mean'] }} | ||
# {'ra': ['mean']} | ||
|
||
# not ok | ||
# {'ra' : { 'A' : 'mean' }} | ||
if isinstance(v, dict): | ||
raise SpecificationError("nested renamer is not supported") | ||
elif isinstance(selected_obj, ABCSeries): | ||
raise SpecificationError("nested renamer is not supported") | ||
elif ( | ||
isinstance(selected_obj, ABCDataFrame) and k not in selected_obj.columns | ||
): | ||
raise KeyError(f"Column '{k}' does not exist!") | ||
|
||
arg = new_arg | ||
|
||
else: | ||
# deprecation of renaming keys | ||
# GH 15931 | ||
keys = list(arg.keys()) | ||
if isinstance(selected_obj, ABCDataFrame) and len( | ||
selected_obj.columns.intersection(keys) | ||
) != len(keys): | ||
cols = sorted(set(keys) - set(selected_obj.columns.intersection(keys))) | ||
raise SpecificationError(f"Column(s) {cols} do not exist") | ||
|
||
from pandas.core.reshape.concat import concat | ||
|
||
if selected_obj.ndim == 1: | ||
# key only used for output | ||
colg = obj._gotitem(obj._selection, ndim=1) | ||
results = {key: colg.agg(how) for key, how in arg.items()} | ||
else: | ||
# key used for column selection and output | ||
results = {key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()} | ||
|
||
# set the final keys | ||
keys = list(arg.keys()) | ||
|
||
# Avoid making two isinstance calls in all and any below | ||
is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] | ||
|
||
# combine results | ||
if all(is_ndframe): | ||
Comment on lines
+758
to
+759
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In my opinion, would be even better to have this if/elif/else statement in a separate function. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah this would be nice, this pass or next ok There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed, and am working in that direction. |
||
keys_to_use = [k for k in keys if not results[k].empty] | ||
# Have to check, if at least one DataFrame is not empty. | ||
keys_to_use = keys_to_use if keys_to_use != [] else keys | ||
axis = 0 if isinstance(obj, ABCSeries) else 1 | ||
result = concat({k: results[k] for k in keys_to_use}, axis=axis) | ||
elif any(is_ndframe): | ||
# There is a mix of NDFrames and scalars | ||
raise ValueError( | ||
"cannot perform both aggregation " | ||
"and transformation operations " | ||
"simultaneously" | ||
) | ||
else: | ||
from pandas import Series | ||
|
||
# we have a dict of scalars | ||
# GH 36212 use name only if obj is a series | ||
if obj.ndim == 1: | ||
obj = cast("Series", obj) | ||
Comment on lines
+777
to
+778
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it the same as this?
No need for casting. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See #36677 for a discussion on this section; note that |
||
name = obj.name | ||
else: | ||
name = None | ||
|
||
result = Series(results, name=name) | ||
|
||
return result |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if you can type this as some point (obj), FrameOrSeries i think.