Fix tests for bug where df.agg(..., axis=1) gives wrong result

tp · tp · commit 9c1325660aa9 · 2018-06-09T22:50:14.000+01:00
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -119,7 +119,7 @@ Offsets
 Numeric
 ^^^^^^^
 
--
+- :meth:`~DataFrame.agg` now handles built-in methods like ``sum`` in the same manner when ``axis=1`` as when ``axis=0`` (:issue:`21224`)
 -
 -
 
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -170,3 +170,11 @@ def string_dtype(request):
     * 'U'
     """
     return request.param
+
+
+@pytest.fixture(params=[0, 1], ids=lambda x: "axis {}".format(x))
+def axis(request):
+    """
+    Fixture for returning the axis numbers of a DataFrame.
+    """
+    return request.param
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5829,7 +5829,6 @@ def aggregate(self, func, axis=0, *args, **kwargs):
             return self.apply(func, axis=axis, args=args, **kwargs)
         return result
 
-    @Appender(NDFrame._aggregate.__doc__, indents=2)
     def _aggregate(self, arg, axis=0, *args, **kwargs):
         obj = self.T if axis == 1 else self
         return super(DataFrame, obj)._aggregate(arg, *args, **kwargs)
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -1795,7 +1795,7 @@ def error():
                     error()
                 raise
             except:
-                error()
+                raise
 
     def _is_scalar_access(self, key):
         # this is a shortcut accessor to both .loc and .iloc
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
@@ -6,6 +6,7 @@
 
 import operator
 from datetime import datetime
+from itertools import chain
 
 import warnings
 import numpy as np
@@ -21,6 +22,38 @@
 from pandas.tests.frame.common import TestData
 
 
+def _get_cython_table_params(frame, func_names_and_expected):
+    """combine frame, functions from SelectionMixin._cython_table
+    keys and expected result.
+
+    Parameters
+    ----------
+    frame : DataFrame
+        A symmetrical DataFrame
+    func_names_and_expected : Sequence of two-item sequences
+        The first item of each pair is the name of an NDFrame method
+        ('sum', 'prod', etc.). The second item is the expected return value.
+
+    Returns
+    -------
+    results : list
+        List of three-item tuples (DataFrame, function, expected result)
+    """
+    table = pd.core.base.SelectionMixin._cython_table
+    if compat.PY36:
+        table = list(table.items())
+    else:  # dicts have random order in Python<3.6, which xdist doesn't like
+        table = sorted(((key, value) for key, value in table.items()),
+                       key=lambda x: x[0].__class__.__name__)
+    results = []
+    for func_name, expected in func_names_and_expected:
+        results.append((frame, func_name, expected))
+        results += [
+            (frame, func, expected) for func, name in table
+            if name == func_name]
+    return results
+
+
 class TestDataFrameApply(TestData):
 
     def test_apply(self):
@@ -950,38 +983,47 @@ def test_agg_dict_nested_renaming_depr(self):
             df.agg({'A': {'foo': 'min'},
                     'B': {'bar': 'max'}})
 
-    def test_agg_reduce(self):
+    def test_agg_reduce(self, axis):
+        other_axis = abs(axis - 1)
+        name1, name2 = self.frame.axes[other_axis].unique()[:2]
+
         # all reducers
-        expected = zip_frames(self.frame.mean().to_frame(),
-                              self.frame.max().to_frame(),
-                              self.frame.sum().to_frame()).T
+        expected = zip_frames(self.frame.mean(axis=axis).to_frame(),
+                              self.frame.max(axis=axis).to_frame(),
+                              self.frame.sum(axis=axis).to_frame()).T
         expected.index = ['mean', 'max', 'sum']
-        result = self.frame.agg(['mean', 'max', 'sum'])
+        result = self.frame.agg(['mean', 'max', 'sum'], axis=axis)
         assert_frame_equal(result, expected)
 
         # dict input with scalars
-        result = self.frame.agg({'A': 'mean', 'B': 'sum'})
-        expected = Series([self.frame.A.mean(), self.frame.B.sum()],
-                          index=['A', 'B'])
+        func = {name1: 'mean', name2: 'sum'}
+        result = self.frame.agg(func, axis=axis)
+        expected = Series([self.frame.loc(other_axis)[name1].mean(),
+                           self.frame.loc(other_axis)[name2].sum()],
+                          index=[name1, name2])
         assert_series_equal(result.reindex_like(expected), expected)
 
         # dict input with lists
-        result = self.frame.agg({'A': ['mean'], 'B': ['sum']})
-        expected = DataFrame({'A': Series([self.frame.A.mean()],
-                                          index=['mean']),
-                              'B': Series([self.frame.B.sum()],
-                                          index=['sum'])})
+        func = {name1: ['mean'], name2: ['sum']}
+        result = self.frame.agg(func, axis=axis)
+        expected = DataFrame({
+            name1: Series([self.frame.loc(other_axis)[name1].mean()],
+                          index=['mean']),
+            name2: Series([self.frame.loc(other_axis)[name2].sum()],
+                          index=['sum'])})
         assert_frame_equal(result.reindex_like(expected), expected)
 
         # dict input with lists with multiple
-        result = self.frame.agg({'A': ['mean', 'sum'],
-                                 'B': ['sum', 'max']})
-        expected = DataFrame({'A': Series([self.frame.A.mean(),
-                                           self.frame.A.sum()],
-                                          index=['mean', 'sum']),
-                              'B': Series([self.frame.B.sum(),
-                                           self.frame.B.max()],
-                                          index=['sum', 'max'])})
+        func = {name1: ['mean', 'sum'],
+                name2: ['sum', 'max']}
+        result = self.frame.agg(func, axis=axis)
+        expected = DataFrame({
+            name1: Series([self.frame.loc(other_axis)[name1].mean(),
+                           self.frame.loc(other_axis)[name1].sum()],
+                          index=['mean', 'sum']),
+            name2: Series([self.frame.loc(other_axis)[name2].sum(),
+                           self.frame.loc(other_axis)[name2].max()],
+                          index=['sum', 'max'])})
         assert_frame_equal(result.reindex_like(expected), expected)
 
     def test_nuiscance_columns(self):
@@ -1057,72 +1099,66 @@ def test_non_callable_aggregates(self):
 
         assert result == expected
 
-    @pytest.mark.parametrize("frame, expected_dict", [
-        [DataFrame(), {
-            'sum': Series(),
-            'max': Series(),
-            'min': Series(),
-            'all': Series(dtype=bool),
-            'any': Series(dtype=bool),
-            'mean': Series(),
-            'prod': Series(),
-            'std': Series(),
-            'var': Series(),
-            'median': Series(),
-            'cumprod': DataFrame(),
-            'cumsum': DataFrame(),
-        }],
-        [DataFrame([[np.nan, 1], [1, 2]]), {
-            'sum': Series([1., 3]),
-            'max': Series([1., 2]),
-            'min': Series([1., 1]),
-            'all': Series([True, True]),
-            'any': Series([True, True]),
-            'mean': Series([1, 1.5]),
-            'prod': Series([1., 2]),
-            'std': Series([np.nan, 0.707107]),
-            'var': Series([np.nan, 0.5]),
-            'median': Series([1, 1.5]),
-            'cumprod': DataFrame([[np.nan, 1], [1., 2.]]),
-            'cumsum': DataFrame([[np.nan, 1], [1., 3.]]),
-        }],
-        [DataFrame([['a', 'b'], ['b', 'a']]), {
-            'sum': Series(['ab', 'ba']),
-            'max': Series(['b', 'b']),
-            'min': Series(['a', 'a']),
-            'all': Series([True, True]),
-            'any': Series([True, True]),
-            'mean': Series([], index=pd.Index([], dtype='int64')),
-            'prod': Series([], index=pd.Index([], dtype='int64')),
-            'std': Series([], index=pd.Index([], dtype='int64')),
-            'var': Series([], index=pd.Index([], dtype='int64')),
-            'median': Series([], index=pd.Index([], dtype='int64')),
-            'cumprod': TypeError,
-            'cumsum': DataFrame([['a', 'b'], ['ab', 'ba']]),
-        }],
-    ])
-    @pytest.mark.parametrize("axis", [0, 1], ids=lambda x: "axis {}".format(x))
-    def test_agg_cython_table(self, cython_table_items,
-                              frame, expected_dict, axis):
+    @pytest.mark.parametrize("df, func, expected", chain(
+        _get_cython_table_params(
+            DataFrame(), [
+                ('sum', Series()),
+                ('max', Series()),
+                ('min', Series()),
+                ('all', Series(dtype=bool)),
+                ('any', Series(dtype=bool)),
+                ('mean', Series()),
+                ('prod', Series()),
+                ('std', Series()),
+                ('var', Series()),
+                ('median', Series()),
+            ]),
+        _get_cython_table_params(
+            DataFrame([[np.nan, 1], [1, 2]]), [
+                ('sum', Series([1., 3])),
+                ('max', Series([1., 2])),
+                ('min', Series([1., 1])),
+                ('all', Series([True, True])),
+                ('any', Series([True, True])),
+                ('mean', Series([1, 1.5])),
+                ('prod', Series([1., 2])),
+                ('std', Series([np.nan, 0.707107])),
+                ('var', Series([np.nan, 0.5])),
+                ('median', Series([1, 1.5])),
+            ]),
+    ))
+    def test_agg_cython_table(self, df, func, expected, axis):
         # GH21224
-        # test if using items in pandas.core.base.SelectionMixin._cython_table
-        # in agg gives correct results
-        np_func, str_func = cython_table_items
-        expected = expected_dict[str_func]
-
-        if isinstance(expected, type) and issubclass(expected, Exception):
-            with pytest.raises(expected):
-                # e.g. DataFrame(['a b'.split()]).cumprod() will raise
-                frame.agg(np_func, axis=axis)
-            with pytest.raises(expected):
-                frame.agg(str_func, axis=axis)
-            return
-
-        result = frame.agg(np_func, axis=axis)
-        result_str_func = frame.agg(str_func, axis=axis)
-        if str_func in ('cumprod', 'cumsum'):
-            tm.assert_frame_equal(result, expected)
-            tm.assert_frame_equal(result_str_func, expected)
-        else:
-            tm.assert_series_equal(result, expected)
-            tm.assert_series_equal(result_str_func, expected)
+        # test reducing functions in
+        # pandas.core.base.SelectionMixin._cython_table
+        result = df.agg(func, axis=axis)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("df, func, expected", chain(
+        _get_cython_table_params(
+            DataFrame(), [
+                ('cumprod', DataFrame()),
+                ('cumsum', DataFrame()),
+            ]),
+        _get_cython_table_params(
+            DataFrame([[np.nan, 1], [1, 2]]), [
+                ('cumprod', DataFrame([[np.nan, 1], [1., 2.]])),
+                ('cumsum', DataFrame([[np.nan, 1], [1., 3.]])),
+            ]),
+    ))
+    def test_agg_cython_table_transform(self, df, func, expected, axis):
+        # GH21224
+        # test transforming functions in
+        # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
+        result = df.agg(func, axis=axis)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("df, func, expected", _get_cython_table_params(
+        DataFrame([['a', 'b'], ['b', 'a']]), [
+            ['cumprod', TypeError],
+        ]),
+    )
+    def test_agg_cython_table_raises(self, df, func, expected, axis):
+        # GH21224
+        with pytest.raises(expected):
+            df.agg(func, axis=axis)
diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py

Original file line number	Diff line number	Diff line change
`@@ -119,7 +119,7 @@ Offsets`
`119`	`119`	`Numeric`
`120`	`120`	`^^^^^^^`
`121`	`121`
`122`		`--`
	`122`	+- :meth:`~DataFrame.agg` now handles built-in methods like ``sum`` in the same manner when axis=1 as when axis=0 (:issue:`21224`)
`123`	`123`	`-`
`124`	`124`	`-`
`125`	`125`