Fix bug where df.agg(..., axis=1) gives wrong result

tp · tp · commit 0caac6b69878 · 2018-05-27T11:02:15.000+01:00
diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
@@ -94,3 +94,8 @@ Categorical
 ^^^^^^^^^^^
 
 -
+
+Numeric
+^^^^^^^
+
+- :meth:`~DataFrame.agg` now correctly handles built-in methods like ``sum`` when ``axis=1`` (:issue:`21134`)
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -149,3 +149,20 @@ def tz_aware_fixture(request):
     Fixture for trying explicit timezones: {0}
     """
     return request.param
+
+
+@pytest.fixture(
+    # params: dict iteration order is not stable on Python 3.5, and xdist
+    # needs every worker to collect identical fixture params. In order to
+    # get predetermined values we sort the items by function name.
+    # GH 21123
+    params=list(sorted(pd.core.base.SelectionMixin._cython_table.items(),
+                       key=lambda x: x[0].__name__)),
+    ids=lambda x: "({}-{!r})_fixture".format(x[0].__name__, x[1]),
+)
+def cython_table_items(request):
+    """
+    Fixture for returning the items in
+    pandas.core.base.SelectionMixin._cython_table
+    """
+    return request.param
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -316,13 +316,14 @@ def _try_aggregate_string_function(self, arg, *args, **kwargs):
 
         raise ValueError("{arg} is an unknown string function".format(arg=arg))
 
-    def _aggregate(self, arg, *args, **kwargs):
+    def _aggregate(self, arg, axis=0, *args, **kwargs):
         """
         provide an implementation for the aggregators
 
         Parameters
         ----------
         arg : string, dict, function
+        axis : int
         *args : args to pass on to the function
         **kwargs : kwargs to pass on to the function
 
@@ -335,25 +336,26 @@ def _aggregate(self, arg, *args, **kwargs):
         how can be a string describe the required post-processing, or
         None if not required
         """
+        obj = self if axis == 0 else self.T
         is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
         is_nested_renamer = False
 
         _axis = kwargs.pop('_axis', None)
         if _axis is None:
-            _axis = getattr(self, 'axis', 0)
+            _axis = getattr(obj, 'axis', 0)
         _level = kwargs.pop('_level', None)
 
         if isinstance(arg, compat.string_types):
-            return self._try_aggregate_string_function(arg, *args,
-                                                       **kwargs), None
+            return obj._try_aggregate_string_function(arg, *args,
+                                                      **kwargs), None
 
         if isinstance(arg, dict):
 
             # aggregate based on the passed dict
             if _axis != 0:  # pragma: no cover
                 raise ValueError('Can only pass dict with axis=0')
 
-            obj = self._selected_obj
+            selected_obj = obj._selected_obj
 
             def nested_renaming_depr(level=4):
                 # deprecation of nested renaming
@@ -388,16 +390,16 @@ def nested_renaming_depr(level=4):
                     if isinstance(v, dict):
                         is_nested_renamer = True
 
-                        if k not in obj.columns:
+                        if k not in selected_obj.columns:
                             msg = ('cannot perform renaming for {key} with a '
                                    'nested dictionary').format(key=k)
                             raise SpecificationError(msg)
                         nested_renaming_depr(4 + (_level or 0))
 
-                    elif isinstance(obj, ABCSeries):
+                    elif isinstance(selected_obj, ABCSeries):
                         nested_renaming_depr()
-                    elif isinstance(obj, ABCDataFrame) and \
-                            k not in obj.columns:
+                    elif isinstance(selected_obj, ABCDataFrame) and \
+                            k not in selected_obj.columns:
                         raise KeyError(
                             "Column '{col}' does not exist!".format(col=k))
 
@@ -407,8 +409,8 @@ def nested_renaming_depr(level=4):
                 # deprecation of renaming keys
                 # GH 15931
                 keys = list(compat.iterkeys(arg))
-                if (isinstance(obj, ABCDataFrame) and
-                        len(obj.columns.intersection(keys)) != len(keys)):
+                if (isinstance(selected_obj, ABCDataFrame) and len(
+                        selected_obj.columns.intersection(keys)) != len(keys)):
                     nested_renaming_depr()
 
             from pandas.core.reshape.concat import concat
@@ -417,7 +419,7 @@ def _agg_1dim(name, how, subset=None):
                 """
                 aggregate a 1-dim with how
                 """
-                colg = self._gotitem(name, ndim=1, subset=subset)
+                colg = obj._gotitem(name, ndim=1, subset=subset)
                 if colg.ndim != 1:
                     raise SpecificationError("nested dictionary is ambiguous "
                                              "in aggregation")
@@ -427,8 +429,8 @@ def _agg_2dim(name, how):
                 """
                 aggregate a 2-dim with how
                 """
-                colg = self._gotitem(self._selection, ndim=2,
-                                     subset=obj)
+                colg = obj._gotitem(obj._selection, ndim=2,
+                                    subset=selected_obj)
                 return colg.aggregate(how, _level=None)
 
             def _agg(arg, func):
@@ -458,20 +460,22 @@ def _agg(arg, func):
 
                 else:
 
-                    if self._selection is not None:
+                    if obj._selection is not None:
                         keys = None
 
             # some selection on the object
-            elif self._selection is not None:
+            elif obj._selection is not None:
 
-                sl = set(self._selection_list)
+                sl = set(obj._selection_list)
 
                 # we are a Series like object,
                 # but may have multiple aggregations
                 if len(sl) == 1:
 
-                    result = _agg(arg, lambda fname,
-                                  agg_how: _agg_1dim(self._selection, agg_how))
+                    result = _agg(
+                        arg,
+                        lambda fname, agg_how: _agg_1dim(
+                            obj._selection, agg_how))
 
                 # we are selecting the same set as we are aggregating
                 elif not len(sl - set(keys)):
@@ -516,7 +520,7 @@ def is_any_frame():
                 return concat([result[k] for k in keys],
                               keys=keys, axis=1), True
 
-            elif isinstance(self, ABCSeries) and is_any_series():
+            elif isinstance(obj, ABCSeries) and is_any_series():
 
                 # we have a dict of Series
                 # return a MI Series
@@ -541,20 +545,20 @@ def is_any_frame():
 
                 # we have a dict of scalars
                 result = Series(result,
-                                name=getattr(self, 'name', None))
+                                name=getattr(obj, 'name', None))
 
             return result, True
         elif is_list_like(arg) and arg not in compat.string_types:
             # we require a list, but not an 'str'
-            return self._aggregate_multiple_funcs(arg,
-                                                  _level=_level,
-                                                  _axis=_axis), None
+            return obj._aggregate_multiple_funcs(arg,
+                                                 _level=_level,
+                                                 _axis=_axis), None
         else:
             result = None
 
-        f = self._is_cython_func(arg)
-        if f and not args and not kwargs:
-            return getattr(self, f)(), None
+        f = obj._is_cython_func(arg)
+        if f is not None:
+            return getattr(obj, f)(*args, **kwargs), None
 
         # caller can react
         return result, True
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5818,13 +5818,11 @@ def _gotitem(self,
     def aggregate(self, func, axis=0, *args, **kwargs):
         axis = self._get_axis_number(axis)
 
-        # TODO: flipped axis
         result = None
-        if axis == 0:
-            try:
-                result, how = self._aggregate(func, axis=0, *args, **kwargs)
-            except TypeError:
-                pass
+        try:
+            result, how = self._aggregate(func, axis=axis, *args, **kwargs)
+        except TypeError:
+            pass
         if result is None:
             return self.apply(func, axis=axis, args=args, **kwargs)
         return result
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -4086,7 +4086,10 @@ def _post_process_cython_aggregate(self, obj):
     def aggregate(self, arg, *args, **kwargs):
 
         _level = kwargs.pop('_level', None)
-        result, how = self._aggregate(arg, _level=_level, *args, **kwargs)
+        _agg_kwargs = kwargs.copy()
+        axis = _agg_kwargs.pop('axis', 0)
+        result, how = self._aggregate(arg, axis, _level=_level,
+                                      *args, **_agg_kwargs)
         if how is None:
             return result
 
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
@@ -1056,3 +1056,72 @@ def test_non_callable_aggregates(self):
         expected = df.size
 
         assert result == expected
+
+    @pytest.mark.parametrize("frame, expected_dict", [
+        [DataFrame(), {
+            'sum': Series(),
+            'max': Series(),
+            'min': Series(),
+            'all': Series(dtype=bool),
+            'any': Series(dtype=bool),
+            'mean': Series(),
+            'prod': Series(),
+            'std': Series(),
+            'var': Series(),
+            'median': Series(),
+            'cumprod': DataFrame(),
+            'cumsum': DataFrame(),
+        }],
+        [DataFrame([[np.nan, 1], [1, 2]]), {
+            'sum': Series([1., 3]),
+            'max': Series([1., 2]),
+            'min': Series([1., 1]),
+            'all': Series([True, True]),
+            'any': Series([True, True]),
+            'mean': Series([1, 1.5]),
+            'prod': Series([1., 2]),
+            'std': Series([np.nan, 0.707107]),
+            'var': Series([np.nan, 0.5]),
+            'median': Series([1, 1.5]),
+            'cumprod': DataFrame([[np.nan, 1], [1., 2.]]),
+            'cumsum': DataFrame([[np.nan, 1], [1., 3.]]),
+        }],
+        [DataFrame([['a', 'b'], ['b', 'a']]), {
+            'sum': Series(['ab', 'ba']),
+            'max': Series(['b', 'b']),
+            'min': Series(['a', 'a']),
+            'all': Series([True, True]),
+            'any': Series([True, True]),
+            'mean': Series([], index=pd.Index([], dtype='int64')),
+            'prod': Series([], index=pd.Index([], dtype='int64')),
+            'std': Series([], index=pd.Index([], dtype='int64')),
+            'var': Series([], index=pd.Index([], dtype='int64')),
+            'median': Series([], index=pd.Index([], dtype='int64')),
+            'cumprod': TypeError,
+            'cumsum': DataFrame([['a', 'b'], ['ab', 'ba']]),
+        }],
+    ])
+    @pytest.mark.parametrize("axis", [0, 1], ids=lambda x: "axis {}".format(x))
+    def test_agg_function_input(self, cython_table_items,
+                                frame, expected_dict, axis):
+        # GH21123
+        # test if using items in _cython_table gives correct results
+        np_func, str_func = cython_table_items
+        expected = expected_dict[str_func]
+
+        if isinstance(expected, type) and issubclass(expected, Exception):
+            with pytest.raises(expected):
+                # e.g. DataFrame(['a b'.split()]).cumprod() will raise
+                frame.agg(np_func, axis=axis)
+            with pytest.raises(expected):
+                frame.agg(str_func, axis=axis)
+            return
+
+        result = frame.agg(np_func, axis=axis)
+        result_str_func = frame.agg(str_func, axis=axis)
+        if str_func in ('cumprod', 'cumsum'):
+            tm.assert_frame_equal(result, expected)
+            tm.assert_frame_equal(result_str_func, expected)
+        else:
+            tm.assert_series_equal(result, expected)
+            tm.assert_series_equal(result_str_func, expected)
diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py
@@ -331,6 +331,76 @@ def test_non_callable_aggregates(self):
                                        ('mean', 1.5)]))
         assert_series_equal(result[expected.index], expected)
 
+    @pytest.mark.parametrize("series, expected_dict", [
+        [Series(), {
+            'sum': 0,
+            'max': np.nan,
+            'min': np.nan,
+            'all': True,
+            'any': False,
+            'mean': np.nan,
+            'prod': 1,
+            'std': np.nan,
+            'var': np.nan,
+            'median': np.nan,
+            'cumprod': Series([], Index([])),
+            'cumsum': Series([], Index([])),
+        }],
+        [Series([np.nan, 1, 2, 3]), {
+            'sum': 6,
+            'max': 3,
+            'min': 1,
+            'all': True,
+            'any': True,
+            'mean': 2,
+            'prod': 6,
+            'std': 1,
+            'var': 1,
+            'median': 2,
+            'cumprod': Series([np.nan, 1, 2, 6]),
+            'cumsum': Series([np.nan, 1, 3, 6]),
+        }],
+        [Series('a b c'.split()), {
+            'sum': 'abc',
+            'max': 'c',
+            'min': 'a',
+            'all': 'c',  # see GH12863
+            'any': 'a',
+            'mean': TypeError,  # mean raises TypeError
+            'prod': TypeError,
+            'std': TypeError,
+            'var': TypeError,
+            'median': TypeError,
+            'cumprod': TypeError,
+            'cumsum': Series(['a', 'ab', 'abc']),
+        }],
+    ])
+    def test_agg_cython_table_input(self, cython_table_items,
+                                    series, expected_dict):
+        # GH21123
+        # test if using items in _cython_table gives correct results
+        np_func, str_func = cython_table_items
+        expected = expected_dict[str_func]
+
+        if isinstance(expected, type) and issubclass(expected, Exception):
+            with pytest.raises(expected):
+                series.agg(np_func)
+            with pytest.raises(expected):
+                series.agg(str_func)
+            return
+
+        result = series.agg(np_func)
+        result_str_func = series.agg(str_func)
+        if str_func in ('cumprod', 'cumsum'):
+            tm.assert_series_equal(result, expected)
+            tm.assert_series_equal(result_str_func, expected)
+        elif tm.is_number(expected):
+            assert np.isclose(result, expected, equal_nan=True)
+            assert np.isclose(result_str_func, expected, equal_nan=True)
+        else:
+            assert result == expected
+            assert result_str_func == expected
+
 
 class TestSeriesMap(TestData):
 

Original file line number	Diff line number	Diff line change
`@@ -94,3 +94,8 @@ Categorical`
`94`	`94`	`^^^^^^^^^^^`
`95`	`95`
`96`	`96`	`-`
	`97`	`+`
	`98`	`+Numeric`
	`99`	`+^^^^^^^`
	`100`	`+`
	`101`	+- :meth:`~DataFrame.agg` now correctly handles built-in methods like ``sum`` when ``axis=1`` (:issue:`21134`)