-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: add np.nan funcs to _cython_table #21123
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
aa1b457
39e2e59
5ec7e18
f91b716
396b327
580edcf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,7 +22,8 @@ | |
from pandas.core import common as com, algorithms | ||
import pandas.core.nanops as nanops | ||
import pandas._libs.lib as lib | ||
from pandas.compat.numpy import function as nv | ||
from pandas.compat.numpy import (function as nv, _np_version_under1p10, | ||
_np_version_under1p12) | ||
from pandas.compat import PYPY | ||
from pandas.util._decorators import (Appender, cache_readonly, | ||
deprecate_kwarg, Substitution) | ||
|
@@ -191,17 +192,31 @@ class SelectionMixin(object): | |
np.all: 'all', | ||
np.any: 'any', | ||
np.sum: 'sum', | ||
np.nansum: 'sum', | ||
np.mean: 'mean', | ||
np.nanmean: 'mean', | ||
np.prod: 'prod', | ||
np.std: 'std', | ||
np.nanstd: 'std', | ||
np.var: 'var', | ||
np.nanvar: 'var', | ||
np.median: 'median', | ||
np.nanmedian: 'median', | ||
np.max: 'max', | ||
np.nanmax: 'max', | ||
np.min: 'min', | ||
np.nanmin: 'min', | ||
np.cumprod: 'cumprod', | ||
np.cumsum: 'cumsum' | ||
np.cumsum: 'cumsum', | ||
} | ||
|
||
if not _np_version_under1p10: | ||
_cython_table[np.nanprod] = 'prod' | ||
|
||
if not _np_version_under1p12: | ||
_cython_table[np.nancumprod] = 'cumprod' | ||
_cython_table[np.nancumsum] = 'cumsum' | ||
|
||
@property | ||
def _selection_name(self): | ||
""" | ||
|
@@ -316,13 +331,14 @@ def _try_aggregate_string_function(self, arg, *args, **kwargs): | |
|
||
raise ValueError("{arg} is an unknown string function".format(arg=arg)) | ||
|
||
def _aggregate(self, arg, *args, **kwargs): | ||
def _aggregate(self, arg, axis=0, *args, **kwargs): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I still think this should be a separate PR. I know you mentioned there was some interweaving of test dependency, but I feel like we are injecting this keyword in here without any regard to existing test coverage for axis=1. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can look into making separate PR. That will have to be pulled in before this one, so the tests of this PR won't break. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If timing is a concern you can also xfail the axis=1 tests. Rebase thereafter would be minor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This has now been created as #21224 |
||
""" | ||
provide an implementation for the aggregators | ||
|
||
Parameters | ||
---------- | ||
arg : string, dict, function | ||
axis : int | ||
*args : args to pass on to the function | ||
**kwargs : kwargs to pass on to the function | ||
|
||
|
@@ -335,25 +351,26 @@ def _aggregate(self, arg, *args, **kwargs): | |
how can be a string describe the required post-processing, or | ||
None if not required | ||
""" | ||
obj = self if axis == 0 else self.T | ||
is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) | ||
is_nested_renamer = False | ||
|
||
_axis = kwargs.pop('_axis', None) | ||
if _axis is None: | ||
_axis = getattr(self, 'axis', 0) | ||
_axis = getattr(obj, 'axis', 0) | ||
_level = kwargs.pop('_level', None) | ||
|
||
if isinstance(arg, compat.string_types): | ||
return self._try_aggregate_string_function(arg, *args, | ||
**kwargs), None | ||
return obj._try_aggregate_string_function(arg, *args, | ||
**kwargs), None | ||
|
||
if isinstance(arg, dict): | ||
|
||
# aggregate based on the passed dict | ||
if _axis != 0: # pragma: no cover | ||
raise ValueError('Can only pass dict with axis=0') | ||
|
||
obj = self._selected_obj | ||
selected_obj = obj._selected_obj | ||
|
||
def nested_renaming_depr(level=4): | ||
# deprecation of nested renaming | ||
|
@@ -388,16 +405,16 @@ def nested_renaming_depr(level=4): | |
if isinstance(v, dict): | ||
is_nested_renamer = True | ||
|
||
if k not in obj.columns: | ||
if k not in selected_obj.columns: | ||
msg = ('cannot perform renaming for {key} with a ' | ||
'nested dictionary').format(key=k) | ||
raise SpecificationError(msg) | ||
nested_renaming_depr(4 + (_level or 0)) | ||
|
||
elif isinstance(obj, ABCSeries): | ||
elif isinstance(selected_obj, ABCSeries): | ||
nested_renaming_depr() | ||
elif isinstance(obj, ABCDataFrame) and \ | ||
k not in obj.columns: | ||
elif isinstance(selected_obj, ABCDataFrame) and \ | ||
k not in selected_obj.columns: | ||
raise KeyError( | ||
"Column '{col}' does not exist!".format(col=k)) | ||
|
||
|
@@ -407,8 +424,8 @@ def nested_renaming_depr(level=4): | |
# deprecation of renaming keys | ||
# GH 15931 | ||
keys = list(compat.iterkeys(arg)) | ||
if (isinstance(obj, ABCDataFrame) and | ||
len(obj.columns.intersection(keys)) != len(keys)): | ||
if (isinstance(selected_obj, ABCDataFrame) and len( | ||
selected_obj.columns.intersection(keys)) != len(keys)): | ||
nested_renaming_depr() | ||
|
||
from pandas.core.reshape.concat import concat | ||
|
@@ -417,7 +434,7 @@ def _agg_1dim(name, how, subset=None): | |
""" | ||
aggregate a 1-dim with how | ||
""" | ||
colg = self._gotitem(name, ndim=1, subset=subset) | ||
colg = obj._gotitem(name, ndim=1, subset=subset) | ||
if colg.ndim != 1: | ||
raise SpecificationError("nested dictionary is ambiguous " | ||
"in aggregation") | ||
|
@@ -427,8 +444,8 @@ def _agg_2dim(name, how): | |
""" | ||
aggregate a 2-dim with how | ||
""" | ||
colg = self._gotitem(self._selection, ndim=2, | ||
subset=obj) | ||
colg = obj._gotitem(obj._selection, ndim=2, | ||
subset=selected_obj) | ||
return colg.aggregate(how, _level=None) | ||
|
||
def _agg(arg, func): | ||
|
@@ -458,20 +475,22 @@ def _agg(arg, func): | |
|
||
else: | ||
|
||
if self._selection is not None: | ||
if obj._selection is not None: | ||
keys = None | ||
|
||
# some selection on the object | ||
elif self._selection is not None: | ||
elif obj._selection is not None: | ||
|
||
sl = set(self._selection_list) | ||
sl = set(obj._selection_list) | ||
|
||
# we are a Series like object, | ||
# but may have multiple aggregations | ||
if len(sl) == 1: | ||
|
||
result = _agg(arg, lambda fname, | ||
agg_how: _agg_1dim(self._selection, agg_how)) | ||
result = _agg( | ||
arg, | ||
lambda fname, agg_how: _agg_1dim( | ||
obj._selection, agg_how)) | ||
|
||
# we are selecting the same set as we are aggregating | ||
elif not len(sl - set(keys)): | ||
|
@@ -516,7 +535,7 @@ def is_any_frame(): | |
return concat([result[k] for k in keys], | ||
keys=keys, axis=1), True | ||
|
||
elif isinstance(self, ABCSeries) and is_any_series(): | ||
elif isinstance(obj, ABCSeries) and is_any_series(): | ||
|
||
# we have a dict of Series | ||
# return a MI Series | ||
|
@@ -541,20 +560,20 @@ def is_any_frame(): | |
|
||
# we have a dict of scalars | ||
result = Series(result, | ||
name=getattr(self, 'name', None)) | ||
name=getattr(obj, 'name', None)) | ||
|
||
return result, True | ||
elif is_list_like(arg) and arg not in compat.string_types: | ||
# we require a list, but not an 'str' | ||
return self._aggregate_multiple_funcs(arg, | ||
_level=_level, | ||
_axis=_axis), None | ||
return obj._aggregate_multiple_funcs(arg, | ||
_level=_level, | ||
_axis=_axis), None | ||
else: | ||
result = None | ||
|
||
f = self._is_cython_func(arg) | ||
if f and not args and not kwargs: | ||
return getattr(self, f)(), None | ||
f = obj._is_cython_func(arg) | ||
if f is not None: | ||
return getattr(obj, f)(*args, **kwargs), None | ||
|
||
# caller can react | ||
return result, True | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5818,13 +5818,11 @@ def _gotitem(self, | |
def aggregate(self, func, axis=0, *args, **kwargs): | ||
axis = self._get_axis_number(axis) | ||
|
||
# TODO: flipped axis | ||
result = None | ||
if axis == 0: | ||
try: | ||
result, how = self._aggregate(func, axis=0, *args, **kwargs) | ||
except TypeError: | ||
pass | ||
try: | ||
result, how = self._aggregate(func, axis=axis, *args, **kwargs) | ||
except TypeError: | ||
pass | ||
if result is None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Related to the axis change, do we still hit this condition? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The axis change is related to #21134. So if I move that to a separate PR, this will move too. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep that is expected just wanted to see if we still needed it (regardless of the PR it appears in) |
||
return self.apply(func, axis=axis, args=args, **kwargs) | ||
return result | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4086,7 +4086,10 @@ def _post_process_cython_aggregate(self, obj): | |
def aggregate(self, arg, *args, **kwargs): | ||
|
||
_level = kwargs.pop('_level', None) | ||
result, how = self._aggregate(arg, _level=_level, *args, **kwargs) | ||
_agg_kwargs = kwargs.copy() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can just list axis as a kwarg There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That breaks a test. The issue is >>> _level, args, _agg_kwargs = None, (80,), {'axis': 0}
>>> self._aggregate(arg, _level=_level, *args, **_agg_kwargs)
TypeError: _aggregate() got multiple values for argument 'axis' There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure I understand this - what test is breaking? Perhaps the test is configured incorrectly? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Both To avoid this we'd prefer the signature to be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where is that test located? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
axis = _agg_kwargs.pop('axis', 0) | ||
result, how = self._aggregate(arg, axis, _level=_level, | ||
*args, **_agg_kwargs) | ||
if how is None: | ||
return result | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1056,3 +1056,71 @@ def test_non_callable_aggregates(self): | |
expected = df.size | ||
|
||
assert result == expected | ||
|
||
@pytest.mark.parametrize("inputs", [ | ||
[DataFrame(), { | ||
'sum': Series(), | ||
'max': Series(), | ||
'min': Series(), | ||
'all': Series(dtype=bool), | ||
'any': Series(dtype=bool), | ||
'mean': Series(), | ||
'prod': Series(), | ||
'std': Series(), | ||
'var': Series(), | ||
'median': Series(), | ||
'cumprod': DataFrame(), | ||
'cumsum': DataFrame(), | ||
}], | ||
[DataFrame([[np.nan, 1], [1, 2]]), { | ||
'sum': Series([1., 3]), | ||
'max': Series([1., 2]), | ||
'min': Series([1., 1]), | ||
'all': Series([True, True]), | ||
'any': Series([True, True]), | ||
'mean': Series([1, 1.5]), | ||
'prod': Series([1., 2]), | ||
'std': Series([np.nan, 0.707107]), | ||
'var': Series([np.nan, 0.5]), | ||
'median': Series([1, 1.5]), | ||
'cumprod': DataFrame([[np.nan, 1], [1., 2.]]), | ||
'cumsum': DataFrame([[np.nan, 1], [1., 3.]]), | ||
}], | ||
[DataFrame([['a', 'b'], ['b', 'a']]), { | ||
'sum': Series(['ab', 'ba']), | ||
'max': Series(['b', 'b']), | ||
'min': Series(['a', 'a']), | ||
'all': Series([True, True]), | ||
'any': Series([True, True]), | ||
'mean': Series([], index=pd.Index([], dtype='int64')), | ||
'prod': Series([], index=pd.Index([], dtype='int64')), | ||
'std': Series([], index=pd.Index([], dtype='int64')), | ||
'var': Series([], index=pd.Index([], dtype='int64')), | ||
'median': Series([], index=pd.Index([], dtype='int64')), | ||
'cumprod': TypeError, | ||
'cumsum': DataFrame([['a', 'b'], ['ab', 'ba']]), | ||
}], | ||
]) | ||
@pytest.mark.parametrize("axis", [0, 1], ids=lambda x: "axis {}".format(x)) | ||
def test_agg_function_input(self, cython_table_items, inputs, axis): | ||
# GH21123 | ||
np_func, str_func = cython_table_items | ||
df = inputs[0] | ||
expected = inputs[1][str_func] | ||
|
||
if isinstance(expected, type) and issubclass(expected, Exception): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Anything that raises should be done in a separate test, i.e. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree in principle, but this test iterates over all items in So I'd have to construct the tests quite a bit differently and probably the fixture in conftest.py couldn't be used (because it returns all combinations and I now have to select the relevant ones for each test method). So something like:
which will be very inelegant and repetitive IMO. Is it not possible to bend this rule on this one (or give hint on how to do it elegantly)?... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm OK understood. May make sense as an exception then - don't have anything off the top of my head to improve but will think more about it |
||
with pytest.raises(expected): | ||
# e.g. DataFrame(['a b'.split()]).cumprod() will raise | ||
df.agg(np_func, axis=axis) | ||
with pytest.raises(expected): | ||
df.agg(str_func, axis=axis) | ||
return | ||
|
||
result = df.agg(np_func, axis=axis) | ||
result_str_func = df.agg(str_func, axis=axis) | ||
if str_func in ('cumprod', 'cumsum'): | ||
tm.assert_frame_equal(result, expected) | ||
tm.assert_frame_equal(result_str_func, expected) | ||
else: | ||
tm.assert_series_equal(result, expected) | ||
tm.assert_series_equal(result_str_func, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
so would like to have this fixture in your other PR