-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: add np.nan funcs to _cython_table #21123
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
aa1b457
39e2e59
5ec7e18
f91b716
396b327
580edcf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -149,3 +149,20 @@ def tz_aware_fixture(request): | |
Fixture for trying explicit timezones: {0} | ||
""" | ||
return request.param | ||
|
||
|
||
@pytest.fixture( | ||
# params: Python 3.5 randomizes dict access and xdist doesn't like that | ||
# in fixtures. In order to get predetermined values we need to sort | ||
# the list deterministically | ||
# GH 21123 | ||
params=list(sorted(pd.core.base.SelectionMixin._cython_table.items(), | ||
key=lambda x: x[0].__name__)), | ||
ids=lambda x: "({}-{!r})".format(x[0].__name__, x[1]), | ||
) | ||
def cython_table_items(request): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add _fixture to the end of the name There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok |
||
""" | ||
Fixture for returning the items in | ||
pandas.core.base.SelectionMixin._cython_table | ||
""" | ||
return request.param |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,7 +22,8 @@ | |
from pandas.core import common as com, algorithms | ||
import pandas.core.nanops as nanops | ||
import pandas._libs.lib as lib | ||
from pandas.compat.numpy import function as nv | ||
from pandas.compat.numpy import (function as nv, _np_version_under1p10, | ||
_np_version_under1p12) | ||
from pandas.compat import PYPY | ||
from pandas.util._decorators import (Appender, cache_readonly, | ||
deprecate_kwarg, Substitution) | ||
|
@@ -191,17 +192,31 @@ class SelectionMixin(object): | |
np.all: 'all', | ||
np.any: 'any', | ||
np.sum: 'sum', | ||
np.nansum: 'sum', | ||
np.mean: 'mean', | ||
np.nanmean: 'mean', | ||
np.prod: 'prod', | ||
np.std: 'std', | ||
np.nanstd: 'std', | ||
np.var: 'var', | ||
np.nanvar: 'var', | ||
np.median: 'median', | ||
np.nanmedian: 'median', | ||
np.max: 'max', | ||
np.nanmax: 'max', | ||
np.min: 'min', | ||
np.nanmin: 'min', | ||
np.cumprod: 'cumprod', | ||
np.cumsum: 'cumsum' | ||
np.cumsum: 'cumsum', | ||
} | ||
|
||
if not _np_version_under1p10: | ||
_cython_table[np.nanprod] = 'prod' | ||
|
||
if not _np_version_under1p12: | ||
_cython_table[np.nancumprod] = 'cumprod' | ||
_cython_table[np.nancumsum] = 'cumsum' | ||
|
||
@property | ||
def _selection_name(self): | ||
""" | ||
|
@@ -316,13 +331,14 @@ def _try_aggregate_string_function(self, arg, *args, **kwargs): | |
|
||
raise ValueError("{arg} is an unknown string function".format(arg=arg)) | ||
|
||
def _aggregate(self, arg, *args, **kwargs): | ||
def _aggregate(self, arg, axis=0, *args, **kwargs): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I still think this should be a separate PR. I know you mentioned there was some interweaving of test dependency, but I feel like we are injecting this keyword in here without any regard to existing test coverage for axis=1. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can look into making separate PR. That will have to be pulled in before this one, so the tests of this PR won't break. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If timing is a concern you can also xfail the axis=1 tests. Rebase thereafter would be minor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This has now been created as #21224 |
||
""" | ||
provide an implementation for the aggregators | ||
|
||
Parameters | ||
---------- | ||
arg : string, dict, function | ||
axis : int | ||
*args : args to pass on to the function | ||
**kwargs : kwargs to pass on to the function | ||
|
||
|
@@ -335,25 +351,26 @@ def _aggregate(self, arg, *args, **kwargs): | |
how can be a string describe the required post-processing, or | ||
None if not required | ||
""" | ||
obj = self if axis == 0 else self.T | ||
is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) | ||
is_nested_renamer = False | ||
|
||
_axis = kwargs.pop('_axis', None) | ||
if _axis is None: | ||
_axis = getattr(self, 'axis', 0) | ||
_axis = getattr(obj, 'axis', 0) | ||
_level = kwargs.pop('_level', None) | ||
|
||
if isinstance(arg, compat.string_types): | ||
return self._try_aggregate_string_function(arg, *args, | ||
**kwargs), None | ||
return obj._try_aggregate_string_function(arg, *args, | ||
**kwargs), None | ||
|
||
if isinstance(arg, dict): | ||
|
||
# aggregate based on the passed dict | ||
if _axis != 0: # pragma: no cover | ||
raise ValueError('Can only pass dict with axis=0') | ||
|
||
obj = self._selected_obj | ||
selected_obj = obj._selected_obj | ||
|
||
def nested_renaming_depr(level=4): | ||
# deprecation of nested renaming | ||
|
@@ -388,16 +405,16 @@ def nested_renaming_depr(level=4): | |
if isinstance(v, dict): | ||
is_nested_renamer = True | ||
|
||
if k not in obj.columns: | ||
if k not in selected_obj.columns: | ||
msg = ('cannot perform renaming for {key} with a ' | ||
'nested dictionary').format(key=k) | ||
raise SpecificationError(msg) | ||
nested_renaming_depr(4 + (_level or 0)) | ||
|
||
elif isinstance(obj, ABCSeries): | ||
elif isinstance(selected_obj, ABCSeries): | ||
nested_renaming_depr() | ||
elif isinstance(obj, ABCDataFrame) and \ | ||
k not in obj.columns: | ||
elif isinstance(selected_obj, ABCDataFrame) and \ | ||
k not in selected_obj.columns: | ||
raise KeyError( | ||
"Column '{col}' does not exist!".format(col=k)) | ||
|
||
|
@@ -407,8 +424,8 @@ def nested_renaming_depr(level=4): | |
# deprecation of renaming keys | ||
# GH 15931 | ||
keys = list(compat.iterkeys(arg)) | ||
if (isinstance(obj, ABCDataFrame) and | ||
len(obj.columns.intersection(keys)) != len(keys)): | ||
if (isinstance(selected_obj, ABCDataFrame) and len( | ||
selected_obj.columns.intersection(keys)) != len(keys)): | ||
nested_renaming_depr() | ||
|
||
from pandas.core.reshape.concat import concat | ||
|
@@ -417,7 +434,7 @@ def _agg_1dim(name, how, subset=None): | |
""" | ||
aggregate a 1-dim with how | ||
""" | ||
colg = self._gotitem(name, ndim=1, subset=subset) | ||
colg = obj._gotitem(name, ndim=1, subset=subset) | ||
if colg.ndim != 1: | ||
raise SpecificationError("nested dictionary is ambiguous " | ||
"in aggregation") | ||
|
@@ -427,8 +444,8 @@ def _agg_2dim(name, how): | |
""" | ||
aggregate a 2-dim with how | ||
""" | ||
colg = self._gotitem(self._selection, ndim=2, | ||
subset=obj) | ||
colg = obj._gotitem(obj._selection, ndim=2, | ||
subset=selected_obj) | ||
return colg.aggregate(how, _level=None) | ||
|
||
def _agg(arg, func): | ||
|
@@ -458,20 +475,22 @@ def _agg(arg, func): | |
|
||
else: | ||
|
||
if self._selection is not None: | ||
if obj._selection is not None: | ||
keys = None | ||
|
||
# some selection on the object | ||
elif self._selection is not None: | ||
elif obj._selection is not None: | ||
|
||
sl = set(self._selection_list) | ||
sl = set(obj._selection_list) | ||
|
||
# we are a Series like object, | ||
# but may have multiple aggregations | ||
if len(sl) == 1: | ||
|
||
result = _agg(arg, lambda fname, | ||
agg_how: _agg_1dim(self._selection, agg_how)) | ||
result = _agg( | ||
arg, | ||
lambda fname, agg_how: _agg_1dim( | ||
obj._selection, agg_how)) | ||
|
||
# we are selecting the same set as we are aggregating | ||
elif not len(sl - set(keys)): | ||
|
@@ -516,7 +535,7 @@ def is_any_frame(): | |
return concat([result[k] for k in keys], | ||
keys=keys, axis=1), True | ||
|
||
elif isinstance(self, ABCSeries) and is_any_series(): | ||
elif isinstance(obj, ABCSeries) and is_any_series(): | ||
|
||
# we have a dict of Series | ||
# return a MI Series | ||
|
@@ -541,20 +560,20 @@ def is_any_frame(): | |
|
||
# we have a dict of scalars | ||
result = Series(result, | ||
name=getattr(self, 'name', None)) | ||
name=getattr(obj, 'name', None)) | ||
|
||
return result, True | ||
elif is_list_like(arg) and arg not in compat.string_types: | ||
# we require a list, but not an 'str' | ||
return self._aggregate_multiple_funcs(arg, | ||
_level=_level, | ||
_axis=_axis), None | ||
return obj._aggregate_multiple_funcs(arg, | ||
_level=_level, | ||
_axis=_axis), None | ||
else: | ||
result = None | ||
|
||
f = self._is_cython_func(arg) | ||
if f and not args and not kwargs: | ||
return getattr(self, f)(), None | ||
f = obj._is_cython_func(arg) | ||
if f is not None: | ||
return getattr(obj, f)(*args, **kwargs), None | ||
|
||
# caller can react | ||
return result, True | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5818,13 +5818,11 @@ def _gotitem(self, | |
def aggregate(self, func, axis=0, *args, **kwargs): | ||
axis = self._get_axis_number(axis) | ||
|
||
# TODO: flipped axis | ||
result = None | ||
if axis == 0: | ||
try: | ||
result, how = self._aggregate(func, axis=0, *args, **kwargs) | ||
except TypeError: | ||
pass | ||
try: | ||
result, how = self._aggregate(func, axis=axis, *args, **kwargs) | ||
except TypeError: | ||
pass | ||
if result is None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Related to the axis change, do we still hit this condition? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The axis change is related to #21134. So if I move that to a separate PR, this will move too. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep that is expected just wanted to see if we still needed it (regardless of the PR it appears in) |
||
return self.apply(func, axis=axis, args=args, **kwargs) | ||
return result | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4086,7 +4086,10 @@ def _post_process_cython_aggregate(self, obj): | |
def aggregate(self, arg, *args, **kwargs): | ||
|
||
_level = kwargs.pop('_level', None) | ||
result, how = self._aggregate(arg, _level=_level, *args, **kwargs) | ||
_agg_kwargs = kwargs.copy() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can just list axis as a kwarg There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That breaks a test. The issue is >>> _level, args, _agg_kwargs = None, (80,), {'axis': 0}
>>> self._aggregate(arg, _level=_level, *args, **_agg_kwargs)
TypeError: _aggregate() got multiple values for argument 'axis' There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure I understand this - what test is breaking? Perhaps the test is configured incorrectly? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Both To avoid this we'd prefer the signature to be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where is that test located? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
axis = _agg_kwargs.pop('axis', 0) | ||
result, how = self._aggregate(arg, axis, _level=_level, | ||
*args, **_agg_kwargs) | ||
if how is None: | ||
return result | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1056,3 +1056,26 @@ def test_non_callable_aggregates(self): | |
expected = df.size | ||
|
||
assert result == expected | ||
|
||
@pytest.mark.parametrize("df", [ | ||
pd.DataFrame([[1, 2], [3, 4]]), | ||
pd.DataFrame([[np.nan, 2], [3, 4]]), | ||
pd.DataFrame(), | ||
]) | ||
def test_agg_function_input(self, df, cython_table_items): | ||
# test whether the functions (keys) in | ||
# pd.core.base.SelectionMixin._cython_table give the same result | ||
# as the related strings (values) when used in df.agg. Examples: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add an example which actually tests (say for sum, nansum) axis=1, IOW contruct the resultant frame this test doesn't actually tests that axis=1 works, just that it matches with a string (which doesn't have tests itself) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't understand, could you expand. This tests if the result are the same if e.g. same result when |
||
# - ``df.agg(np.nansum)`` should give the same result as | ||
# ``df.agg('sum')`` | ||
# - ``df.agg(sum)`` should give the same result as ``df.agg('sum')`` | ||
# etc. | ||
# GH21123 | ||
np_func, str_func = cython_table_items | ||
|
||
tm.assert_almost_equal(df.agg(np_func), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use assert_frame_equal it provides stronger guarantees There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Most of these aggregate to series (so I could add a conditional, so the more correct assert is used each time. |
||
df.agg(str_func), | ||
) | ||
tm.assert_almost_equal(df.agg(np_func, axis=1), | ||
df.agg(str_func, axis=1), | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -587,3 +587,23 @@ def test_map_missing_mixed(self, vals, mapping, exp): | |
result = s.map(mapping) | ||
|
||
tm.assert_series_equal(result, pd.Series(exp)) | ||
|
||
@pytest.mark.parametrize("series", [ | ||
pd.Series([1, 2, 3, 4]), | ||
pd.Series([np.nan, 2, 3, 4]), | ||
pd.Series(), | ||
]) | ||
def test_agg_function_input(self, series, cython_table_items): | ||
# test whether the functions (keys) in | ||
# pd.core.base.SelectionMixin._cython_table give the same result | ||
# as the related strings (values), when used in ser.agg. Examples: | ||
# - ``ser.agg(np.nansum)`` should give the same result as | ||
# ``ser.agg('sum')`` | ||
# - ``ser.agg(sum)`` should give the same result as ``ser.agg('sum')`` | ||
# etc. | ||
# GH21123 | ||
np_func, str_func = cython_table_items | ||
|
||
tm.assert_almost_equal(series.agg(np_func), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use assert_series_equal |
||
series.agg(str_func), | ||
) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
so would like to have this fixture in your other PR