diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 5a553264e828b..e08f9809c8f92 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -94,3 +94,9 @@ Categorical ^^^^^^^^^^^ - + +Numeric +^^^^^^^ + +- :meth:`~DataFrame.agg` now correctly handles numpy NaN-aware methods like :meth:`numpy.nansum` (:issue:`19629`) +- :meth:`~DataFrame.agg` now correctly handles built-in methods like ``sum`` when axis=1 (:issue:`21134`) diff --git a/pandas/conftest.py b/pandas/conftest.py index b09cb872a12fb..3eda078a802f4 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -149,3 +149,20 @@ def tz_aware_fixture(request): Fixture for trying explicit timezones: {0} """ return request.param + + +@pytest.fixture( + # params: Python 3.5 randomizes dict access and xdist doesn't like that + # in fixtures. In order to get predetermined values we need to sort + # the list deterministically + # GH 21123 + params=list(sorted(pd.core.base.SelectionMixin._cython_table.items(), + key=lambda x: x[0].__name__)), + ids=lambda x: "({}-{!r})_fixture".format(x[0].__name__, x[1]), +) +def cython_table_items(request): + """ + Fixture for returning the items in + pandas.core.base.SelectionMixin._cython_table + """ + return request.param diff --git a/pandas/core/base.py b/pandas/core/base.py index c331ead8d2fef..874168f5a49c7 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -22,7 +22,8 @@ from pandas.core import common as com, algorithms import pandas.core.nanops as nanops import pandas._libs.lib as lib -from pandas.compat.numpy import function as nv +from pandas.compat.numpy import (function as nv, _np_version_under1p10, + _np_version_under1p12) from pandas.compat import PYPY from pandas.util._decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) @@ -191,17 +192,31 @@ class SelectionMixin(object): np.all: 'all', np.any: 'any', np.sum: 'sum', + np.nansum: 'sum', np.mean: 'mean', + np.nanmean: 'mean', np.prod: 'prod', np.std: 'std', + np.nanstd: 'std', np.var: 'var', + np.nanvar: 'var', np.median: 'median', + np.nanmedian: 'median', np.max: 'max', + np.nanmax: 'max', np.min: 'min', + np.nanmin: 'min', np.cumprod: 'cumprod', - np.cumsum: 'cumsum' + np.cumsum: 'cumsum', } + if not _np_version_under1p10: + _cython_table[np.nanprod] = 'prod' + + if not _np_version_under1p12: + _cython_table[np.nancumprod] = 'cumprod' + _cython_table[np.nancumsum] = 'cumsum' + @property def _selection_name(self): """ @@ -316,13 +331,14 @@ def _try_aggregate_string_function(self, arg, *args, **kwargs): raise ValueError("{arg} is an unknown string function".format(arg=arg)) - def _aggregate(self, arg, *args, **kwargs): + def _aggregate(self, arg, axis=0, *args, **kwargs): """ provide an implementation for the aggregators Parameters ---------- arg : string, dict, function + axis : int *args : args to pass on to the function **kwargs : kwargs to pass on to the function @@ -335,17 +351,18 @@ def _aggregate(self, arg, *args, **kwargs): how can be a string describe the required post-processing, or None if not required """ + obj = self if axis == 0 else self.T is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False _axis = kwargs.pop('_axis', None) if _axis is None: - _axis = getattr(self, 'axis', 0) + _axis = getattr(obj, 'axis', 0) _level = kwargs.pop('_level', None) if isinstance(arg, compat.string_types): - return self._try_aggregate_string_function(arg, *args, - **kwargs), None + return obj._try_aggregate_string_function(arg, *args, + **kwargs), None if isinstance(arg, dict): @@ -353,7 +370,7 @@ def _aggregate(self, arg, *args, **kwargs): if _axis != 0: # pragma: no cover raise ValueError('Can only pass dict with axis=0') - obj = self._selected_obj + selected_obj = obj._selected_obj def nested_renaming_depr(level=4): # deprecation of nested renaming @@ -388,16 +405,16 @@ def nested_renaming_depr(level=4): if isinstance(v, dict): is_nested_renamer = True - if k not in obj.columns: + if k not in selected_obj.columns: msg = ('cannot perform renaming for {key} with a ' 'nested dictionary').format(key=k) raise SpecificationError(msg) nested_renaming_depr(4 + (_level or 0)) - elif isinstance(obj, ABCSeries): + elif isinstance(selected_obj, ABCSeries): nested_renaming_depr() - elif isinstance(obj, ABCDataFrame) and \ - k not in obj.columns: + elif isinstance(selected_obj, ABCDataFrame) and \ + k not in selected_obj.columns: raise KeyError( "Column '{col}' does not exist!".format(col=k)) @@ -407,8 +424,8 @@ def nested_renaming_depr(level=4): # deprecation of renaming keys # GH 15931 keys = list(compat.iterkeys(arg)) - if (isinstance(obj, ABCDataFrame) and - len(obj.columns.intersection(keys)) != len(keys)): + if (isinstance(selected_obj, ABCDataFrame) and len( + selected_obj.columns.intersection(keys)) != len(keys)): nested_renaming_depr() from pandas.core.reshape.concat import concat @@ -417,7 +434,7 @@ def _agg_1dim(name, how, subset=None): """ aggregate a 1-dim with how """ - colg = self._gotitem(name, ndim=1, subset=subset) + colg = obj._gotitem(name, ndim=1, subset=subset) if colg.ndim != 1: raise SpecificationError("nested dictionary is ambiguous " "in aggregation") @@ -427,8 +444,8 @@ def _agg_2dim(name, how): """ aggregate a 2-dim with how """ - colg = self._gotitem(self._selection, ndim=2, - subset=obj) + colg = obj._gotitem(obj._selection, ndim=2, + subset=selected_obj) return colg.aggregate(how, _level=None) def _agg(arg, func): @@ -458,20 +475,22 @@ def _agg(arg, func): else: - if self._selection is not None: + if obj._selection is not None: keys = None # some selection on the object - elif self._selection is not None: + elif obj._selection is not None: - sl = set(self._selection_list) + sl = set(obj._selection_list) # we are a Series like object, # but may have multiple aggregations if len(sl) == 1: - result = _agg(arg, lambda fname, - agg_how: _agg_1dim(self._selection, agg_how)) + result = _agg( + arg, + lambda fname, agg_how: _agg_1dim( + obj._selection, agg_how)) # we are selecting the same set as we are aggregating elif not len(sl - set(keys)): @@ -516,7 +535,7 @@ def is_any_frame(): return concat([result[k] for k in keys], keys=keys, axis=1), True - elif isinstance(self, ABCSeries) and is_any_series(): + elif isinstance(obj, ABCSeries) and is_any_series(): # we have a dict of Series # return a MI Series @@ -541,20 +560,20 @@ def is_any_frame(): # we have a dict of scalars result = Series(result, - name=getattr(self, 'name', None)) + name=getattr(obj, 'name', None)) return result, True elif is_list_like(arg) and arg not in compat.string_types: # we require a list, but not an 'str' - return self._aggregate_multiple_funcs(arg, - _level=_level, - _axis=_axis), None + return obj._aggregate_multiple_funcs(arg, + _level=_level, + _axis=_axis), None else: result = None - f = self._is_cython_func(arg) - if f and not args and not kwargs: - return getattr(self, f)(), None + f = obj._is_cython_func(arg) + if f is not None: + return getattr(obj, f)(*args, **kwargs), None # caller can react return result, True diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b6c33b4f79478..c515b13aaac82 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5818,13 +5818,11 @@ def _gotitem(self, def aggregate(self, func, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) - # TODO: flipped axis result = None - if axis == 0: - try: - result, how = self._aggregate(func, axis=0, *args, **kwargs) - except TypeError: - pass + try: + result, how = self._aggregate(func, axis=axis, *args, **kwargs) + except TypeError: + pass if result is None: return self.apply(func, axis=axis, args=args, **kwargs) return result diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index df7a5dc9dc173..616345dde2d2f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4086,7 +4086,10 @@ def _post_process_cython_aggregate(self, obj): def aggregate(self, arg, *args, **kwargs): _level = kwargs.pop('_level', None) - result, how = self._aggregate(arg, _level=_level, *args, **kwargs) + _agg_kwargs = kwargs.copy() + axis = _agg_kwargs.pop('axis', 0) + result, how = self._aggregate(arg, axis, _level=_level, + *args, **_agg_kwargs) if how is None: return result diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index dfb2961befe35..e3c87917ec89f 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1056,3 +1056,71 @@ def test_non_callable_aggregates(self): expected = df.size assert result == expected + + @pytest.mark.parametrize("inputs", [ + [DataFrame(), { + 'sum': Series(), + 'max': Series(), + 'min': Series(), + 'all': Series(dtype=bool), + 'any': Series(dtype=bool), + 'mean': Series(), + 'prod': Series(), + 'std': Series(), + 'var': Series(), + 'median': Series(), + 'cumprod': DataFrame(), + 'cumsum': DataFrame(), + }], + [DataFrame([[np.nan, 1], [1, 2]]), { + 'sum': Series([1., 3]), + 'max': Series([1., 2]), + 'min': Series([1., 1]), + 'all': Series([True, True]), + 'any': Series([True, True]), + 'mean': Series([1, 1.5]), + 'prod': Series([1., 2]), + 'std': Series([np.nan, 0.707107]), + 'var': Series([np.nan, 0.5]), + 'median': Series([1, 1.5]), + 'cumprod': DataFrame([[np.nan, 1], [1., 2.]]), + 'cumsum': DataFrame([[np.nan, 1], [1., 3.]]), + }], + [DataFrame([['a', 'b'], ['b', 'a']]), { + 'sum': Series(['ab', 'ba']), + 'max': Series(['b', 'b']), + 'min': Series(['a', 'a']), + 'all': Series([True, True]), + 'any': Series([True, True]), + 'mean': Series([], index=pd.Index([], dtype='int64')), + 'prod': Series([], index=pd.Index([], dtype='int64')), + 'std': Series([], index=pd.Index([], dtype='int64')), + 'var': Series([], index=pd.Index([], dtype='int64')), + 'median': Series([], index=pd.Index([], dtype='int64')), + 'cumprod': TypeError, + 'cumsum': DataFrame([['a', 'b'], ['ab', 'ba']]), + }], + ]) + @pytest.mark.parametrize("axis", [0, 1], ids=lambda x: "axis {}".format(x)) + def test_agg_function_input(self, cython_table_items, inputs, axis): + # GH21123 + np_func, str_func = cython_table_items + df = inputs[0] + expected = inputs[1][str_func] + + if isinstance(expected, type) and issubclass(expected, Exception): + with pytest.raises(expected): + # e.g. DataFrame(['a b'.split()]).cumprod() will raise + df.agg(np_func, axis=axis) + with pytest.raises(expected): + df.agg(str_func, axis=axis) + return + + result = df.agg(np_func, axis=axis) + result_str_func = df.agg(str_func, axis=axis) + if str_func in ('cumprod', 'cumsum'): + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result_str_func, expected) + else: + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result_str_func, expected) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index b28b9f342695f..662a411c6fbd3 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -331,6 +331,75 @@ def test_non_callable_aggregates(self): ('mean', 1.5)])) assert_series_equal(result[expected.index], expected) + @pytest.mark.parametrize("inputs", [ + [Series(), { + 'sum': 0, + 'max': np.nan, + 'min': np.nan, + 'all': True, + 'any': False, + 'mean': np.nan, + 'prod': 1, + 'std': np.nan, + 'var': np.nan, + 'median': np.nan, + 'cumprod': Series([], Index([])), + 'cumsum': Series([], Index([])), + }], + [Series([np.nan, 1, 2, 3]), { + 'sum': 6, + 'max': 3, + 'min': 1, + 'all': True, + 'any': True, + 'mean': 2, + 'prod': 6, + 'std': 1, + 'var': 1, + 'median': 2, + 'cumprod': Series([np.nan, 1, 2, 6]), + 'cumsum': Series([np.nan, 1, 3, 6]), + }], + [Series('a b c'.split()), { + 'sum': 'abc', + 'max': 'c', + 'min': 'a', + 'all': 'c', # see GH12863 + 'any': 'a', + 'mean': TypeError, # mean raises TypeError + 'prod': TypeError, + 'std': TypeError, + 'var': TypeError, + 'median': TypeError, + 'cumprod': TypeError, + 'cumsum': Series(['a', 'ab', 'abc']), + }], + ]) + def test_agg_function_input(self, inputs, cython_table_items): + # GH21123 + np_func, str_func = cython_table_items + series = inputs[0] + expected = inputs[1][str_func] + + if isinstance(expected, type) and issubclass(expected, Exception): + with pytest.raises(expected): + series.agg(np_func) + with pytest.raises(expected): + series.agg(str_func) + return + + result = series.agg(np_func) + result_str_func = series.agg(str_func) + if str_func in ('cumprod', 'cumsum'): + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result_str_func, expected) + elif tm.is_number(expected): + assert np.isclose(result, expected, equal_nan=True) + assert np.isclose(result_str_func, expected, equal_nan=True) + else: + assert result == expected + assert result_str_func == expected + class TestSeriesMap(TestData):