Skip to content

CLN/TYP: aggregation methods in core.base #36677

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Oct 2, 2020
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 31 additions & 21 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@

import builtins
import textwrap
from typing import Any, Callable, Dict, FrozenSet, Optional, Union
from typing import Any, Callable, Dict, FrozenSet, List, Optional, Union, cast

import numpy as np

import pandas._libs.lib as lib
from pandas._typing import AggFuncType, AggFuncTypeBase, Label
from pandas.compat import PYPY
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
Expand Down Expand Up @@ -278,7 +279,7 @@ def _try_aggregate_string_function(self, arg: str, *args, **kwargs):
f"'{arg}' is not a valid function for '{type(self).__name__}' object"
)

def _aggregate(self, arg, *args, **kwargs):
def _aggregate(self, arg: AggFuncType, *args, **kwargs):
"""
provide an implementation for the aggregators

Expand Down Expand Up @@ -311,13 +312,13 @@ def _aggregate(self, arg, *args, **kwargs):
if _axis != 0: # pragma: no cover
raise ValueError("Can only pass dict with axis=0")

obj = self._selected_obj
selected_obj = self._selected_obj

# if we have a dict of any non-scalars
# eg. {'A' : ['mean']}, normalize all to
# be list-likes
if any(is_aggregator(x) for x in arg.values()):
new_arg = {}
new_arg: Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]] = {}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Orthogonal to this PR but seems strange that this is really a subset of AggFuncType - any chance we have test coverage for the other parts of that Union coming through this method?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe you're asking why this can't be typed Dict[Label, AggFuncType]. The last part of AggFuncType not included here are where the values are a Dict. This has been removed and will raise on 338 below. That it raises is tested in frame.apply.test_agg_dict_nested_renaming_depr.

for k, v in arg.items():
if not isinstance(v, (tuple, list, dict)):
new_arg[k] = [v]
Expand All @@ -336,9 +337,12 @@ def _aggregate(self, arg, *args, **kwargs):
# {'ra' : { 'A' : 'mean' }}
if isinstance(v, dict):
raise SpecificationError("nested renamer is not supported")
elif isinstance(obj, ABCSeries):
elif isinstance(selected_obj, ABCSeries):
raise SpecificationError("nested renamer is not supported")
elif isinstance(obj, ABCDataFrame) and k not in obj.columns:
elif (
isinstance(selected_obj, ABCDataFrame)
and k not in selected_obj.columns
):
raise KeyError(f"Column '{k}' does not exist!")

arg = new_arg
Expand All @@ -347,10 +351,12 @@ def _aggregate(self, arg, *args, **kwargs):
# deprecation of renaming keys
# GH 15931
keys = list(arg.keys())
if isinstance(obj, ABCDataFrame) and len(
obj.columns.intersection(keys)
if isinstance(selected_obj, ABCDataFrame) and len(
selected_obj.columns.intersection(keys)
) != len(keys):
cols = sorted(set(keys) - set(obj.columns.intersection(keys)))
cols = sorted(
set(keys) - set(selected_obj.columns.intersection(keys))
)
raise SpecificationError(f"Column(s) {cols} do not exist")

from pandas.core.reshape.concat import concat
Expand All @@ -370,7 +376,7 @@ def _agg_2dim(how):
"""
aggregate a 2-dim with how
"""
colg = self._gotitem(self._selection, ndim=2, subset=obj)
colg = self._gotitem(self._selection, ndim=2, subset=selected_obj)
return colg.aggregate(how)

def _agg(arg, func):
Expand All @@ -385,7 +391,6 @@ def _agg(arg, func):

# set the final keys
keys = list(arg.keys())
result = {}

if self._selection is not None:

Expand Down Expand Up @@ -473,7 +478,11 @@ def is_any_frame() -> bool:
# we have a dict of scalars

# GH 36212 use name only if self is a series
name = self.name if (self.ndim == 1) else None
if self.ndim == 1:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can just rewrite this to if isinstance(self, ABCSeries): instead of checking ndim. It would be more consistent with other checks in the code base and get rid of the need for a cast

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, at the expense of being a little slower. On my machine, I'm seeing 127 ns for ndim and cast together and 156 ns for isinstance.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, is that because ABCSeries resolves to Any and hence mypy is no longer checking this. So I guess a cast will be needed in the future when we turn on more strict checking options for Any.

isinstance(.., ABC..) are generally slow and should normally be avoided if possible, but here it seems the ndim check is not that much more performant.

but this is OK for now.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A 28 ns timing difference is not worth worrying about. If we wanted to instance check to be stricter we should probably just add a stub for the ABCs (or see if there's a mypy hook to strongly type them) but that is out of scope for this PR

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

stubs for ABCs would be good.

but I see three issues with the current change.

  1. impacted performance just to pass mypy. no matter how small
  2. changing the typing to resolve to Any just to pass mypy. bad practice
  3. code churn. no matter how trivial.

all three together and i'm now firmly -1 on this approach.

self = cast(Series, self)
name = self.name
else:
name = None

result = Series(result, name=name)

Expand All @@ -484,9 +493,10 @@ def is_any_frame() -> bool:
else:
result = None

f = self._get_cython_func(arg)
if f and not args and not kwargs:
return getattr(self, f)(), None
if callable(arg):
f = self._get_cython_func(arg)
if f and not args and not kwargs:
return getattr(self, f)(), None

# caller can react
return result, True
Expand All @@ -498,17 +508,17 @@ def _aggregate_multiple_funcs(self, arg, _axis):
raise NotImplementedError("axis other than 0 is not supported")

if self._selected_obj.ndim == 1:
obj = self._selected_obj
selected_obj = self._selected_obj
else:
obj = self._obj_with_exclusions
selected_obj = self._obj_with_exclusions

results = []
keys = []

# degenerate case
if obj.ndim == 1:
if selected_obj.ndim == 1:
for a in arg:
colg = self._gotitem(obj.name, ndim=1, subset=obj)
colg = self._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
try:
new_res = colg.aggregate(a)

Expand All @@ -523,8 +533,8 @@ def _aggregate_multiple_funcs(self, arg, _axis):

# multiples
else:
for index, col in enumerate(obj):
colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index])
for index, col in enumerate(selected_obj):
colg = self._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
try:
new_res = colg.aggregate(arg)
except (TypeError, DataError):
Expand Down