-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
CLN/TYP: aggregation methods in core.base #36677
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,11 +4,12 @@ | |
|
||
import builtins | ||
import textwrap | ||
from typing import Any, Callable, Dict, FrozenSet, Optional, Union | ||
from typing import Any, Callable, Dict, FrozenSet, List, Optional, Union, cast | ||
|
||
import numpy as np | ||
|
||
import pandas._libs.lib as lib | ||
from pandas._typing import AggFuncType, AggFuncTypeBase, Label | ||
from pandas.compat import PYPY | ||
from pandas.compat.numpy import function as nv | ||
from pandas.errors import AbstractMethodError | ||
|
@@ -278,7 +279,7 @@ def _try_aggregate_string_function(self, arg: str, *args, **kwargs): | |
f"'{arg}' is not a valid function for '{type(self).__name__}' object" | ||
) | ||
|
||
def _aggregate(self, arg, *args, **kwargs): | ||
def _aggregate(self, arg: AggFuncType, *args, **kwargs): | ||
""" | ||
provide an implementation for the aggregators | ||
|
||
|
@@ -311,13 +312,13 @@ def _aggregate(self, arg, *args, **kwargs): | |
if _axis != 0: # pragma: no cover | ||
raise ValueError("Can only pass dict with axis=0") | ||
|
||
obj = self._selected_obj | ||
selected_obj = self._selected_obj | ||
|
||
# if we have a dict of any non-scalars | ||
# eg. {'A' : ['mean']}, normalize all to | ||
# be list-likes | ||
if any(is_aggregator(x) for x in arg.values()): | ||
new_arg = {} | ||
new_arg: Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]] = {} | ||
for k, v in arg.items(): | ||
if not isinstance(v, (tuple, list, dict)): | ||
new_arg[k] = [v] | ||
|
@@ -336,9 +337,12 @@ def _aggregate(self, arg, *args, **kwargs): | |
# {'ra' : { 'A' : 'mean' }} | ||
if isinstance(v, dict): | ||
raise SpecificationError("nested renamer is not supported") | ||
elif isinstance(obj, ABCSeries): | ||
elif isinstance(selected_obj, ABCSeries): | ||
raise SpecificationError("nested renamer is not supported") | ||
elif isinstance(obj, ABCDataFrame) and k not in obj.columns: | ||
elif ( | ||
isinstance(selected_obj, ABCDataFrame) | ||
and k not in selected_obj.columns | ||
): | ||
raise KeyError(f"Column '{k}' does not exist!") | ||
|
||
arg = new_arg | ||
|
@@ -347,10 +351,12 @@ def _aggregate(self, arg, *args, **kwargs): | |
# deprecation of renaming keys | ||
# GH 15931 | ||
keys = list(arg.keys()) | ||
if isinstance(obj, ABCDataFrame) and len( | ||
obj.columns.intersection(keys) | ||
if isinstance(selected_obj, ABCDataFrame) and len( | ||
selected_obj.columns.intersection(keys) | ||
) != len(keys): | ||
cols = sorted(set(keys) - set(obj.columns.intersection(keys))) | ||
cols = sorted( | ||
set(keys) - set(selected_obj.columns.intersection(keys)) | ||
) | ||
raise SpecificationError(f"Column(s) {cols} do not exist") | ||
|
||
from pandas.core.reshape.concat import concat | ||
|
@@ -370,7 +376,7 @@ def _agg_2dim(how): | |
""" | ||
aggregate a 2-dim with how | ||
""" | ||
colg = self._gotitem(self._selection, ndim=2, subset=obj) | ||
colg = self._gotitem(self._selection, ndim=2, subset=selected_obj) | ||
return colg.aggregate(how) | ||
|
||
def _agg(arg, func): | ||
|
@@ -385,7 +391,6 @@ def _agg(arg, func): | |
|
||
# set the final keys | ||
keys = list(arg.keys()) | ||
result = {} | ||
|
||
if self._selection is not None: | ||
|
||
|
@@ -473,7 +478,11 @@ def is_any_frame() -> bool: | |
# we have a dict of scalars | ||
|
||
# GH 36212 use name only if self is a series | ||
name = self.name if (self.ndim == 1) else None | ||
if self.ndim == 1: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can just rewrite this to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed, at the expense of being a little slower. On my machine, I'm seeing 127 ns for ndim and cast together and 156 ns for isinstance. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm, is that because ABCSeries resolves to isinstance(.., ABC..) are generally slow and should normally be avoided if possible, but here it seems the ndim check is not that much more performant. but this is OK for now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A 28 ns timing difference is not worth worrying about. If we wanted to instance check to be stricter we should probably just add a stub for the ABCs (or see if there's a mypy hook to strongly type them) but that is out of scope for this PR There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. stubs for ABCs would be good. but I see three issues with the current change.
all three together and i'm now firmly -1 on this approach. |
||
self = cast(Series, self) | ||
name = self.name | ||
else: | ||
name = None | ||
|
||
result = Series(result, name=name) | ||
|
||
|
@@ -484,9 +493,10 @@ def is_any_frame() -> bool: | |
else: | ||
result = None | ||
|
||
f = self._get_cython_func(arg) | ||
if f and not args and not kwargs: | ||
return getattr(self, f)(), None | ||
if callable(arg): | ||
f = self._get_cython_func(arg) | ||
if f and not args and not kwargs: | ||
return getattr(self, f)(), None | ||
|
||
# caller can react | ||
return result, True | ||
|
@@ -498,17 +508,17 @@ def _aggregate_multiple_funcs(self, arg, _axis): | |
raise NotImplementedError("axis other than 0 is not supported") | ||
|
||
if self._selected_obj.ndim == 1: | ||
obj = self._selected_obj | ||
selected_obj = self._selected_obj | ||
else: | ||
obj = self._obj_with_exclusions | ||
selected_obj = self._obj_with_exclusions | ||
|
||
results = [] | ||
keys = [] | ||
|
||
# degenerate case | ||
if obj.ndim == 1: | ||
if selected_obj.ndim == 1: | ||
for a in arg: | ||
colg = self._gotitem(obj.name, ndim=1, subset=obj) | ||
colg = self._gotitem(selected_obj.name, ndim=1, subset=selected_obj) | ||
try: | ||
new_res = colg.aggregate(a) | ||
|
||
|
@@ -523,8 +533,8 @@ def _aggregate_multiple_funcs(self, arg, _axis): | |
|
||
# multiples | ||
else: | ||
for index, col in enumerate(obj): | ||
colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index]) | ||
for index, col in enumerate(selected_obj): | ||
colg = self._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) | ||
try: | ||
new_res = colg.aggregate(arg) | ||
except (TypeError, DataError): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Orthogonal to this PR but seems strange that this is really a subset of
AggFuncType
- any chance we have test coverage for the other parts of that Union coming through this method?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe you're asking why this can't be typed
Dict[Label, AggFuncType]
. The last part of AggFuncType not included here are where the values are a Dict. This has been removed and will raise on 338 below. That it raises is tested inframe.apply.test_agg_dict_nested_renaming_depr
.