Skip to content

BUG: df.agg(sum, axis=1) uses different method than when axis=0 #21222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v0.23.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,8 @@ Categorical
^^^^^^^^^^^

-

Numeric
^^^^^^^

- :meth:`~DataFrame.agg` now correctly handles built-in methods like ``sum`` when ``axis=1`` (:issue:`21134`)
17 changes: 17 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,3 +149,20 @@ def tz_aware_fixture(request):
Fixture for trying explicit timezones: {0}
"""
return request.param


@pytest.fixture(
    # params: dict iteration order is not guaranteed (Python 3.5 randomizes
    # dict access) and xdist doesn't like parameters whose order differs
    # between runs. Sort the items by the key's ``__name__`` so that the
    # parameter list is built in a predetermined order.
    # ``sorted`` already returns a new list, so no extra ``list()`` call
    # is needed.
    # GH 21123
    params=sorted(pd.core.base.SelectionMixin._cython_table.items(),
                  key=lambda x: x[0].__name__),
    ids=lambda x: "({}-{!r})_fixture".format(x[0].__name__, x[1]),
)
def cython_table_items(request):
    """
    Fixture for returning the items in
    pandas.core.base.SelectionMixin._cython_table

    Each parameter is a single ``(key, value)`` item of the table; the
    test id is built from the key's ``__name__`` and the ``repr`` of the
    value.
    """
    return request.param
58 changes: 31 additions & 27 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,13 +316,14 @@ def _try_aggregate_string_function(self, arg, *args, **kwargs):

raise ValueError("{arg} is an unknown string function".format(arg=arg))

def _aggregate(self, arg, *args, **kwargs):
def _aggregate(self, arg, axis=0, *args, **kwargs):
"""
provide an implementation for the aggregators
Parameters
----------
arg : string, dict, function
axis : int
*args : args to pass on to the function
**kwargs : kwargs to pass on to the function
Expand All @@ -335,25 +336,26 @@ def _aggregate(self, arg, *args, **kwargs):
how can be a string describing the required post-processing, or
None if not required
"""
obj = self if axis == 0 else self.T
is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
is_nested_renamer = False

_axis = kwargs.pop('_axis', None)
if _axis is None:
_axis = getattr(self, 'axis', 0)
_axis = getattr(obj, 'axis', 0)
_level = kwargs.pop('_level', None)

if isinstance(arg, compat.string_types):
return self._try_aggregate_string_function(arg, *args,
**kwargs), None
return obj._try_aggregate_string_function(arg, *args,
**kwargs), None

if isinstance(arg, dict):

# aggregate based on the passed dict
if _axis != 0: # pragma: no cover
raise ValueError('Can only pass dict with axis=0')

obj = self._selected_obj
selected_obj = obj._selected_obj

def nested_renaming_depr(level=4):
# deprecation of nested renaming
Expand Down Expand Up @@ -388,16 +390,16 @@ def nested_renaming_depr(level=4):
if isinstance(v, dict):
is_nested_renamer = True

if k not in obj.columns:
if k not in selected_obj.columns:
msg = ('cannot perform renaming for {key} with a '
'nested dictionary').format(key=k)
raise SpecificationError(msg)
nested_renaming_depr(4 + (_level or 0))

elif isinstance(obj, ABCSeries):
elif isinstance(selected_obj, ABCSeries):
nested_renaming_depr()
elif isinstance(obj, ABCDataFrame) and \
k not in obj.columns:
elif isinstance(selected_obj, ABCDataFrame) and \
k not in selected_obj.columns:
raise KeyError(
"Column '{col}' does not exist!".format(col=k))

Expand All @@ -407,8 +409,8 @@ def nested_renaming_depr(level=4):
# deprecation of renaming keys
# GH 15931
keys = list(compat.iterkeys(arg))
if (isinstance(obj, ABCDataFrame) and
len(obj.columns.intersection(keys)) != len(keys)):
if (isinstance(selected_obj, ABCDataFrame) and len(
selected_obj.columns.intersection(keys)) != len(keys)):
nested_renaming_depr()

from pandas.core.reshape.concat import concat
Expand All @@ -417,7 +419,7 @@ def _agg_1dim(name, how, subset=None):
"""
aggregate a 1-dim with how
"""
colg = self._gotitem(name, ndim=1, subset=subset)
colg = obj._gotitem(name, ndim=1, subset=subset)
if colg.ndim != 1:
raise SpecificationError("nested dictionary is ambiguous "
"in aggregation")
Expand All @@ -427,8 +429,8 @@ def _agg_2dim(name, how):
"""
aggregate a 2-dim with how
"""
colg = self._gotitem(self._selection, ndim=2,
subset=obj)
colg = obj._gotitem(obj._selection, ndim=2,
subset=selected_obj)
return colg.aggregate(how, _level=None)

def _agg(arg, func):
Expand Down Expand Up @@ -458,20 +460,22 @@ def _agg(arg, func):

else:

if self._selection is not None:
if obj._selection is not None:
keys = None

# some selection on the object
elif self._selection is not None:
elif obj._selection is not None:

sl = set(self._selection_list)
sl = set(obj._selection_list)

# we are a Series like object,
# but may have multiple aggregations
if len(sl) == 1:

result = _agg(arg, lambda fname,
agg_how: _agg_1dim(self._selection, agg_how))
result = _agg(
arg,
lambda fname, agg_how: _agg_1dim(
obj._selection, agg_how))

# we are selecting the same set as we are aggregating
elif not len(sl - set(keys)):
Expand Down Expand Up @@ -516,7 +520,7 @@ def is_any_frame():
return concat([result[k] for k in keys],
keys=keys, axis=1), True

elif isinstance(self, ABCSeries) and is_any_series():
elif isinstance(obj, ABCSeries) and is_any_series():

# we have a dict of Series
# return a MI Series
Expand All @@ -541,20 +545,20 @@ def is_any_frame():

# we have a dict of scalars
result = Series(result,
name=getattr(self, 'name', None))
name=getattr(obj, 'name', None))

return result, True
elif is_list_like(arg) and arg not in compat.string_types:
# we require a list, but not an 'str'
return self._aggregate_multiple_funcs(arg,
_level=_level,
_axis=_axis), None
return obj._aggregate_multiple_funcs(arg,
_level=_level,
_axis=_axis), None
else:
result = None

f = self._is_cython_func(arg)
if f and not args and not kwargs:
return getattr(self, f)(), None
f = obj._is_cython_func(arg)
if f is not None:
return getattr(obj, f)(*args, **kwargs), None

# caller can react
return result, True
Expand Down
10 changes: 4 additions & 6 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5818,13 +5818,11 @@ def _gotitem(self,
def aggregate(self, func, axis=0, *args, **kwargs):
axis = self._get_axis_number(axis)

# TODO: flipped axis
result = None
if axis == 0:
try:
result, how = self._aggregate(func, axis=0, *args, **kwargs)
except TypeError:
pass
try:
result, how = self._aggregate(func, axis=axis, *args, **kwargs)
except TypeError:
pass
if result is None:
return self.apply(func, axis=axis, args=args, **kwargs)
return result
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -4086,7 +4086,10 @@ def _post_process_cython_aggregate(self, obj):
def aggregate(self, arg, *args, **kwargs):

_level = kwargs.pop('_level', None)
result, how = self._aggregate(arg, _level=_level, *args, **kwargs)
_agg_kwargs = kwargs.copy()
axis = _agg_kwargs.pop('axis', 0)
result, how = self._aggregate(arg, axis, _level=_level,
*args, **_agg_kwargs)
if how is None:
return result

Expand Down
69 changes: 69 additions & 0 deletions pandas/tests/frame/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1056,3 +1056,72 @@ def test_non_callable_aggregates(self):
expected = df.size

assert result == expected

@pytest.mark.parametrize("frame, expected_dict", [
    (DataFrame(), {
        'sum': Series(),
        'max': Series(),
        'min': Series(),
        'all': Series(dtype=bool),
        'any': Series(dtype=bool),
        'mean': Series(),
        'prod': Series(),
        'std': Series(),
        'var': Series(),
        'median': Series(),
        'cumprod': DataFrame(),
        'cumsum': DataFrame(),
    }),
    (DataFrame([[np.nan, 1], [1, 2]]), {
        'sum': Series([1., 3]),
        'max': Series([1., 2]),
        'min': Series([1., 1]),
        'all': Series([True, True]),
        'any': Series([True, True]),
        'mean': Series([1, 1.5]),
        'prod': Series([1., 2]),
        'std': Series([np.nan, 0.707107]),
        'var': Series([np.nan, 0.5]),
        'median': Series([1, 1.5]),
        'cumprod': DataFrame([[np.nan, 1], [1., 2.]]),
        'cumsum': DataFrame([[np.nan, 1], [1., 3.]]),
    }),
    (DataFrame([['a', 'b'], ['b', 'a']]), {
        'sum': Series(['ab', 'ba']),
        'max': Series(['b', 'b']),
        'min': Series(['a', 'a']),
        'all': Series([True, True]),
        'any': Series([True, True]),
        'mean': Series([], index=pd.Index([], dtype='int64')),
        'prod': Series([], index=pd.Index([], dtype='int64')),
        'std': Series([], index=pd.Index([], dtype='int64')),
        'var': Series([], index=pd.Index([], dtype='int64')),
        'median': Series([], index=pd.Index([], dtype='int64')),
        'cumprod': TypeError,
        'cumsum': DataFrame([['a', 'b'], ['ab', 'ba']]),
    }),
])
@pytest.mark.parametrize("axis", [0, 1], ids=lambda x: "axis {}".format(x))
def test_agg_function_input(self, cython_table_items,
                            frame, expected_dict, axis):
    """
    Check ``frame.agg`` against every item of the cython table.

    The aggregation is requested twice, once with the function object and
    once with its string name, and both results are compared to the entry
    of ``expected_dict`` keyed by the string name. An entry that is an
    exception class means both calls are expected to raise it.
    """
    # GH21123
    # test if using items in _cython_table gives correct results
    np_func, str_func = cython_table_items
    agg_inputs = (np_func, str_func)
    expected = expected_dict[str_func]

    should_raise = (isinstance(expected, type) and
                    issubclass(expected, Exception))
    if should_raise:
        # e.g. DataFrame(['a b'.split()]).cumprod() will raise
        for agg_input in agg_inputs:
            with pytest.raises(expected):
                frame.agg(agg_input, axis=axis)
        return

    # compute both results up front, then compare them in the same order
    outcomes = [frame.agg(agg_input, axis=axis) for agg_input in agg_inputs]

    # cumulative functions keep the frame shape, the others reduce it
    if str_func in ('cumprod', 'cumsum'):
        check_equal = tm.assert_frame_equal
    else:
        check_equal = tm.assert_series_equal

    for outcome in outcomes:
        check_equal(outcome, expected)
70 changes: 70 additions & 0 deletions pandas/tests/series/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,76 @@ def test_non_callable_aggregates(self):
('mean', 1.5)]))
assert_series_equal(result[expected.index], expected)

@pytest.mark.parametrize("series, expected_dict", [
    (Series(), {
        'sum': 0,
        'max': np.nan,
        'min': np.nan,
        'all': True,
        'any': False,
        'mean': np.nan,
        'prod': 1,
        'std': np.nan,
        'var': np.nan,
        'median': np.nan,
        'cumprod': Series([], Index([])),
        'cumsum': Series([], Index([])),
    }),
    (Series([np.nan, 1, 2, 3]), {
        'sum': 6,
        'max': 3,
        'min': 1,
        'all': True,
        'any': True,
        'mean': 2,
        'prod': 6,
        'std': 1,
        'var': 1,
        'median': 2,
        'cumprod': Series([np.nan, 1, 2, 6]),
        'cumsum': Series([np.nan, 1, 3, 6]),
    }),
    (Series('a b c'.split()), {
        'sum': 'abc',
        'max': 'c',
        'min': 'a',
        'all': 'c',  # see GH12863
        'any': 'a',
        'mean': TypeError,  # mean raises TypeError
        'prod': TypeError,
        'std': TypeError,
        'var': TypeError,
        'median': TypeError,
        'cumprod': TypeError,
        'cumsum': Series(['a', 'ab', 'abc']),
    }),
])
def test_agg_cython_table_input(self, cython_table_items,
                                series, expected_dict):
    """
    Check ``series.agg`` against every item of the cython table.

    The aggregation is requested twice, once with the function object and
    once with its string name, and both results are compared to the entry
    of ``expected_dict`` keyed by the string name. An entry that is an
    exception class means both calls are expected to raise it.
    """
    # GH21123
    # test if using items in _cython_table gives correct results
    np_func, str_func = cython_table_items
    agg_inputs = (np_func, str_func)
    expected = expected_dict[str_func]

    should_raise = (isinstance(expected, type) and
                    issubclass(expected, Exception))
    if should_raise:
        for agg_input in agg_inputs:
            with pytest.raises(expected):
                series.agg(agg_input)
        return

    # compute both results up front, then compare them in the same order
    outcomes = [series.agg(agg_input) for agg_input in agg_inputs]

    if str_func in ('cumprod', 'cumsum'):
        # cumulative functions return a Series
        for outcome in outcomes:
            tm.assert_series_equal(outcome, expected)
    elif tm.is_number(expected):
        # numeric scalars: approximate comparison, NaN matches NaN
        for outcome in outcomes:
            assert np.isclose(outcome, expected, equal_nan=True)
    else:
        for outcome in outcomes:
            assert outcome == expected


class TestSeriesMap(TestData):

Expand Down