Skip to content

Commit f03db6f

Browse files
rhshadrachKevin D Smith
authored and
Kevin D Smith
committed
CLN: Move _aggregate and _aggregate_multiple_funcs to core.aggregation (pandas-dev#36999)
1 parent 4e51287 commit f03db6f

File tree

7 files changed

+318
-310
lines changed

7 files changed

+318
-310
lines changed

pandas/core/aggregation.py

+296-1
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,11 @@
2929
Label,
3030
)
3131

32+
from pandas.core.dtypes.cast import is_nested_object
3233
from pandas.core.dtypes.common import is_dict_like, is_list_like
3334
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
3435

35-
from pandas.core.base import SpecificationError
36+
from pandas.core.base import DataError, SpecificationError
3637
import pandas.core.common as com
3738
from pandas.core.indexes.api import Index
3839

@@ -525,3 +526,297 @@ def transform_str_or_callable(
525526
return obj.apply(func, args=args, **kwargs)
526527
except Exception:
527528
return func(obj, *args, **kwargs)
529+
530+
531+
def aggregate(obj, arg: AggFuncType, *args, **kwargs):
    """
    Provide an implementation for the aggregators.

    Parameters
    ----------
    obj : object
        Object being aggregated. Depending on ``arg`` it must supply the
        private hooks used below: ``_try_aggregate_string_function``,
        ``_selected_obj``, ``_selection``, ``_selection_list``,
        ``_gotitem`` and ``_get_cython_func``.
    arg : string, dict, function
    *args : args to pass on to the function
    **kwargs : kwargs to pass on to the function
        A ``_axis`` entry is consumed here and never forwarded. When it is
        missing or None, ``obj.axis`` is used (0 if ``obj`` has no ``axis``).

    Returns
    -------
    tuple of result, how

    Raises
    ------
    ValueError
        If a dict is passed with an axis other than 0, or if a dict of
        Series results cannot be concatenated.
    SpecificationError
        For nested renamers, or for dict keys that are not columns of the
        selected DataFrame.
    KeyError
        If a dict with list-like values names a column that does not exist.

    Notes
    -----
    ``how`` is None when ``arg`` was a string, a non-dict list-like, or a
    callable that maps to a cython function and was given no extra
    arguments. It is True in every other case, including the fall-through
    where ``result`` is None so that the caller can react.
    """
    _axis = kwargs.pop("_axis", None)
    if _axis is None:
        _axis = getattr(obj, "axis", 0)

    if isinstance(arg, str):
        return obj._try_aggregate_string_function(arg, *args, **kwargs), None

    # dict must be tested before is_list_like, because a dict is list-like too
    if not isinstance(arg, dict):
        if is_list_like(arg):
            # we require a list, but not an 'str'
            return aggregate_multiple_funcs(obj, arg, _axis=_axis), None

        if callable(arg):
            f = obj._get_cython_func(arg)
            if f and not args and not kwargs:
                return getattr(obj, f)(), None

        # caller can react
        return None, True

    # aggregate based on the passed dict
    if _axis != 0:  # pragma: no cover
        raise ValueError("Can only pass dict with axis=0")

    selected_obj = obj._selected_obj

    # the final keys; normalization below keeps both the keys and their order
    keys = list(arg.keys())

    # if we have a dict of any non-scalars
    # eg. {'A' : ['mean']}, normalize all to
    # be list-likes
    if any(isinstance(x, (list, tuple, dict)) for x in arg.values()):
        new_arg: Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]] = {}
        for k, v in arg.items():
            if not isinstance(v, (tuple, list, dict)):
                new_arg[k] = [v]
            else:
                new_arg[k] = v

            # the keys must be in the columns
            # for ndim=2, or renamers for ndim=1

            # ok for now, but deprecated
            # {'A': { 'ra': 'mean' }}
            # {'A': { 'ra': ['mean'] }}
            # {'ra': ['mean']}

            # not ok
            # {'ra' : { 'A' : 'mean' }}
            if isinstance(v, dict):
                raise SpecificationError("nested renamer is not supported")
            elif isinstance(selected_obj, ABCSeries):
                raise SpecificationError("nested renamer is not supported")
            elif (
                isinstance(selected_obj, ABCDataFrame)
                and k not in selected_obj.columns
            ):
                raise KeyError(f"Column '{k}' does not exist!")

        arg = new_arg

    elif isinstance(selected_obj, ABCDataFrame):
        # deprecation of renaming keys
        # GH 15931
        present = selected_obj.columns.intersection(keys)
        if len(present) != len(keys):
            cols = sorted(set(keys) - set(present))
            raise SpecificationError(f"Column(s) {cols} do not exist")

    from pandas.core.reshape.concat import concat

    def _agg_1dim(name, how, subset=None):
        """
        aggregate a 1-dim with how
        """
        colg = obj._gotitem(name, ndim=1, subset=subset)
        if colg.ndim != 1:
            raise SpecificationError(
                "nested dictionary is ambiguous in aggregation"
            )
        return colg.aggregate(how)

    def _agg_2dim(name, how):
        """
        aggregate a 2-dim with how
        """
        # ``name`` is unused: the whole selection is aggregated. It is
        # accepted because _agg always calls ``func(fname, agg_how)``;
        # without it every 2-dim path failed with a TypeError.
        colg = obj._gotitem(obj._selection, ndim=2, subset=selected_obj)
        return colg.aggregate(how)

    def _agg(arg, func):
        """
        run the aggregations over the arg with func
        return a dict
        """
        return {fname: func(fname, agg_how) for fname, agg_how in arg.items()}

    if obj._selection is not None:
        sl = set(obj._selection_list)

        if len(sl) == 1:
            # we are a Series like object,
            # but may have multiple aggregations
            result = _agg(
                arg, lambda fname, agg_how: _agg_1dim(obj._selection, agg_how)
            )
        elif sl.issubset(keys):
            # we are selecting the same set as we are aggregating
            result = _agg(arg, _agg_1dim)
        else:
            # we are a DataFrame, with possibly multiple aggregations
            result = _agg(arg, _agg_2dim)
    else:
        # no selection
        try:
            result = _agg(arg, _agg_1dim)
        except SpecificationError:
            # we are aggregating expecting all 1d-returns
            # but we have 2d
            result = _agg(arg, _agg_2dim)

    # combine results; ``result`` is always the dict built by _agg

    if any(isinstance(r, ABCDataFrame) for r in result.values()):
        # we have a dict of DataFrames
        # return a MI DataFrame

        # Skip empty frames, unless every frame is empty.
        keys_to_use = [k for k in keys if not result[k].empty] or keys
        return (
            concat([result[k] for k in keys_to_use], keys=keys_to_use, axis=1),
            True,
        )

    if isinstance(obj, ABCSeries) and any(
        isinstance(r, ABCSeries) for r in result.values()
    ):
        # we have a dict of Series
        # return a MI Series
        try:
            result = concat(result)
        except TypeError as err:
            # we want to give a nice error here if
            # we have non-same sized objects, so
            # we don't automatically broadcast
            raise ValueError(
                "cannot perform both aggregation "
                "and transformation operations "
                "simultaneously"
            ) from err

        return result, True

    # fall thru
    from pandas import DataFrame, Series

    try:
        result = DataFrame(result)
    except ValueError:
        # we have a dict of scalars

        # GH 36212 use name only if obj is a series
        if obj.ndim == 1:
            obj = cast("Series", obj)
            name = obj.name
        else:
            name = None

        result = Series(result, name=name)

    return result, True
750+
751+
752+
def aggregate_multiple_funcs(obj, arg, _axis):
    """
    Apply several aggregations to ``obj`` and combine the results.

    Parameters
    ----------
    obj : object
        Object being aggregated. It must supply ``_selected_obj``,
        ``_obj_with_exclusions`` and ``_gotitem``. ``name`` is read as well
        when the results turn out to be scalars.
    arg : list-like
        The aggregations. For a 1-dim selection each element is applied on
        its own; otherwise the whole of ``arg`` is applied to each column.
    _axis : int
        Only 0 is supported.

    Returns
    -------
    The column-wise concatenation of the individual results, or a Series
    of the results when they cannot be concatenated (e.g. scalars).

    Raises
    ------
    NotImplementedError
        If ``_axis`` is not 0.
    ValueError
        If no aggregation produced a result, or if the fallback Series
        holds nested objects.
    """
    from pandas.core.reshape.concat import concat

    if _axis != 0:
        raise NotImplementedError("axis other than 0 is not supported")

    if obj._selected_obj.ndim == 1:
        selected_obj = obj._selected_obj
    else:
        selected_obj = obj._obj_with_exclusions

    results = []
    keys = []

    if selected_obj.ndim == 1:
        # degenerate case: one result per function
        for a in arg:
            colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
            try:
                new_res = colg.aggregate(a)
            except TypeError:
                # best effort: skip a function that cannot be applied
                continue

            results.append(new_res)

            # make sure we find a good name
            keys.append(com.get_callable_name(a) or a)
    else:
        # multiples: one result per column
        for index, col in enumerate(selected_obj):
            colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
            try:
                new_res = colg.aggregate(arg)
            except (TypeError, DataError):
                # best effort: skip a column that cannot be aggregated
                continue
            except ValueError as err:
                # cannot aggregate
                msg = str(err)
                # "Must produce aggregated value" is raised directly in
                # _aggregate_named, "no results" directly in
                # _aggregate_multiple_funcs; anything else is unexpected
                if "Must produce aggregated value" in msg or "no results" in msg:
                    continue
                raise

            results.append(new_res)
            keys.append(col)

    # if we are empty
    if not results:
        raise ValueError("no results")

    try:
        return concat(results, keys=keys, axis=1, sort=False)
    except TypeError as err:
        # we are concatting non-NDFrame objects,
        # e.g. a list of scalars

        from pandas import Series

        result = Series(results, index=keys, name=obj.name)
        if is_nested_object(result):
            raise ValueError(
                "cannot combine transform and aggregation operations"
            ) from err
        return result

0 commit comments

Comments
 (0)