diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index caf2365a54ec8..7cc6c2fa7b88c 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -10,285 +10,298 @@ import pandas as pd from pandas import concat, DataFrame, Index, MultiIndex, Series -from pandas.core.groupby import SpecificationError +from pandas.core.groupby import Grouping, SpecificationError from pandas.compat import OrderedDict import pandas.util.testing as tm -class TestGroupByAggregate(object): - - def setup_method(self, method): - self.ts = tm.makeTimeSeries() - - self.seriesd = tm.getSeriesData() - self.tsd = tm.getTimeSeriesData() - self.frame = DataFrame(self.seriesd) - self.tsframe = DataFrame(self.tsd) - - self.df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - - self.df_mixed_floats = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.array(np.random.randn(8), dtype='float32')}) - - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - self.mframe = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) - - self.three_group = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - def test_agg_regression1(self): - grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.agg(np.mean) - expected = grouped.mean() - tm.assert_frame_equal(result, expected) - - def test_agg_must_agg(self): - grouped = self.df.groupby('A')['C'] - - msg = "Must produce aggregated value" - with tm.assert_raises_regex(Exception, msg): - grouped.agg(lambda x: x.describe()) - with tm.assert_raises_regex(Exception, msg): - grouped.agg(lambda x: x.index[:2]) - - def test_agg_ser_multi_key(self): - # TODO(wesm): unused - ser = self.df.C # noqa - - f = lambda x: x.sum() - results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f) - expected = self.df.groupby(['A', 'B']).sum()['C'] - tm.assert_series_equal(results, expected) - - def test_agg_apply_corner(self): - # nothing to group, all NA - grouped = self.ts.groupby(self.ts * np.nan) - assert self.ts.dtype == np.float64 - - # groupby float64 values results in Float64Index - exp = Series([], dtype=np.float64, - index=pd.Index([], dtype=np.float64)) - tm.assert_series_equal(grouped.sum(), exp) - tm.assert_series_equal(grouped.agg(np.sum), exp) - tm.assert_series_equal(grouped.apply(np.sum), exp, - check_index_type=False) - - # DataFrame - grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan) - exp_df = DataFrame(columns=self.tsframe.columns, dtype=float, - index=pd.Index([], dtype=np.float64)) - tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False) - tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) - tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], - check_names=False) - - def test_agg_grouping_is_list_tuple(self): - from pandas.core.groupby import Grouping - - df = tm.makeTimeDataFrame() - - grouped = df.groupby(lambda x: x.year) - grouper = grouped.grouper.groupings[0].grouper - grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper)) - - result = grouped.agg(np.mean) - expected = grouped.mean() - tm.assert_frame_equal(result, expected) - - grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper)) - - result = grouped.agg(np.mean) - expected = grouped.mean() - tm.assert_frame_equal(result, expected) - - def test_agg_python_multiindex(self): - grouped = self.mframe.groupby(['A', 'B']) - - result = grouped.agg(np.mean) - expected = grouped.mean() - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize('groupbyfunc', [ - lambda x: x.weekday(), - [lambda x: x.month, lambda x: x.weekday()], - ]) - def test_aggregate_str_func(self, groupbyfunc): - grouped = self.tsframe.groupby(groupbyfunc) - - # single series - result = grouped['A'].agg('std') - expected = grouped['A'].std() - tm.assert_series_equal(result, expected) - - # group frame by function name - result = grouped.aggregate('var') - expected = grouped.var() - tm.assert_frame_equal(result, expected) - - # group frame by function dict - result = grouped.agg(OrderedDict([['A', 'var'], - ['B', 'std'], - ['C', 'mean'], - ['D', 'sem']])) - expected = DataFrame(OrderedDict([['A', grouped['A'].var()], - ['B', grouped['B'].std()], - ['C', grouped['C'].mean()], - ['D', grouped['D'].sem()]])) - tm.assert_frame_equal(result, expected) - - def test_aggregate_item_by_item(self): - df = self.df.copy() - df['E'] = ['a'] * len(self.df) - grouped = self.df.groupby('A') - - aggfun = lambda ser: ser.size - result = grouped.agg(aggfun) - foo = (self.df.A == 'foo').sum() - bar = (self.df.A == 'bar').sum() - K = len(result.columns) - - # GH5782 - # odd comparisons can result here, so cast to make easy - exp = pd.Series(np.array([foo] * K), index=list('BCD'), - dtype=np.float64, name='foo') - tm.assert_series_equal(result.xs('foo'), exp) - - exp = pd.Series(np.array([bar] * K), index=list('BCD'), - dtype=np.float64, name='bar') - tm.assert_almost_equal(result.xs('bar'), exp) - - def aggfun(ser): - return ser.size - - result = DataFrame().groupby(self.df.A).agg(aggfun) - assert isinstance(result, DataFrame) - assert len(result) == 0 - - def test_wrap_agg_out(self): - grouped = self.three_group.groupby(['A', 'B']) - - def func(ser): - if ser.dtype == np.object: - raise TypeError - else: - return ser.sum() - - result = grouped.aggregate(func) - exp_grouped = self.three_group.loc[:, self.three_group.columns != 'C'] - expected = exp_grouped.groupby(['A', 'B']).aggregate(func) - tm.assert_frame_equal(result, expected) - - def test_agg_multiple_functions_maintain_order(self): - # GH #610 - funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)] - result = self.df.groupby('A')['C'].agg(funcs) - exp_cols = Index(['mean', 'max', 'min']) - - tm.assert_index_equal(result.columns, exp_cols) - - def test_multiple_functions_tuples_and_non_tuples(self): - # #1359 - funcs = [('foo', 'mean'), 'std'] - ex_funcs = [('foo', 'mean'), ('std', 'std')] - - result = self.df.groupby('A')['C'].agg(funcs) - expected = self.df.groupby('A')['C'].agg(ex_funcs) - tm.assert_frame_equal(result, expected) - - result = self.df.groupby('A').agg(funcs) - expected = self.df.groupby('A').agg(ex_funcs) - tm.assert_frame_equal(result, expected) - - def test_agg_multiple_functions_too_many_lambdas(self): - grouped = self.df.groupby('A') - funcs = ['mean', lambda x: x.mean(), lambda x: x.std()] - - msg = 'Function names must be unique, found multiple named ' - with tm.assert_raises_regex(SpecificationError, msg): - grouped.agg(funcs) - - def test_more_flexible_frame_multi_function(self): - grouped = self.df.groupby('A') - - exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]])) - exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]])) - - expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1) - expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1) - - d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]]) - result = grouped.aggregate(d) +@pytest.fixture +def ts(): + return tm.makeTimeSeries() + + +@pytest.fixture +def tsframe(): + return DataFrame(tm.getTimeSeriesData()) + + +@pytest.fixture +def df(): + return DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + +@pytest.fixture +def mframe(): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + return DataFrame(np.random.randn(10, 3), + index=index, + columns=['A', 'B', 'C']) + + +@pytest.fixture +def three_group(): + return DataFrame( + {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', + 'bar', 'bar', 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', 'one', 'one', + 'one', 'two', 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', + 'shiny', 'dull', 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + +def test_agg_regression1(tsframe): + grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + +def test_agg_must_agg(df): + grouped = df.groupby('A')['C'] + + msg = "Must produce aggregated value" + with tm.assert_raises_regex(Exception, msg): + grouped.agg(lambda x: x.describe()) + with tm.assert_raises_regex(Exception, msg): + grouped.agg(lambda x: x.index[:2]) + + +def test_agg_ser_multi_key(df): + # TODO(wesm): unused + ser = df.C # noqa + + f = lambda x: x.sum() + results = df.C.groupby([df.A, df.B]).aggregate(f) + expected = df.groupby(['A', 'B']).sum()['C'] + tm.assert_series_equal(results, expected) + + +def test_agg_apply_corner(ts, tsframe): + # nothing to group, all NA + grouped = ts.groupby(ts * np.nan) + assert ts.dtype == np.float64 + + # groupby float64 values results in Float64Index + exp = Series([], dtype=np.float64, + index=pd.Index([], dtype=np.float64)) + tm.assert_series_equal(grouped.sum(), exp) + tm.assert_series_equal(grouped.agg(np.sum), exp) + tm.assert_series_equal(grouped.apply(np.sum), exp, + check_index_type=False) + + # DataFrame + grouped = tsframe.groupby(tsframe['A'] * np.nan) + exp_df = DataFrame(columns=tsframe.columns, dtype=float, + index=pd.Index([], dtype=np.float64)) + tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False) + tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) + tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], + check_names=False) + + +def test_agg_grouping_is_list_tuple(ts): + df = tm.makeTimeDataFrame() + + grouped = df.groupby(lambda x: x.year) + grouper = grouped.grouper.groupings[0].grouper + grouped.grouper.groupings[0] = Grouping(ts.index, list(grouper)) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + grouped.grouper.groupings[0] = Grouping(ts.index, tuple(grouper)) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + +def test_agg_python_multiindex(mframe): + grouped = mframe.groupby(['A', 'B']) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize('groupbyfunc', [ + lambda x: x.weekday(), + [lambda x: x.month, lambda x: x.weekday()], +]) +def test_aggregate_str_func(tsframe, groupbyfunc): + grouped = tsframe.groupby(groupbyfunc) + + # single series + result = grouped['A'].agg('std') + expected = grouped['A'].std() + tm.assert_series_equal(result, expected) + + # group frame by function name + result = grouped.aggregate('var') + expected = grouped.var() + tm.assert_frame_equal(result, expected) + + # group frame by function dict + result = grouped.agg(OrderedDict([['A', 'var'], + ['B', 'std'], + ['C', 'mean'], + ['D', 'sem']])) + expected = DataFrame(OrderedDict([['A', grouped['A'].var()], + ['B', grouped['B'].std()], + ['C', grouped['C'].mean()], + ['D', grouped['D'].sem()]])) + tm.assert_frame_equal(result, expected) + + +def test_aggregate_item_by_item(df): + grouped = df.groupby('A') + + aggfun = lambda ser: ser.size + result = grouped.agg(aggfun) + foo = (df.A == 'foo').sum() + bar = (df.A == 'bar').sum() + K = len(result.columns) + + # GH5782 + # odd comparisons can result here, so cast to make easy + exp = pd.Series(np.array([foo] * K), index=list('BCD'), + dtype=np.float64, name='foo') + tm.assert_series_equal(result.xs('foo'), exp) - tm.assert_frame_equal(result, expected) + exp = pd.Series(np.array([bar] * K), index=list('BCD'), + dtype=np.float64, name='bar') + tm.assert_almost_equal(result.xs('bar'), exp) - # be careful - result = grouped.aggregate(OrderedDict([['C', np.mean], - ['D', [np.mean, np.std]]])) - expected = grouped.aggregate(OrderedDict([['C', np.mean], - ['D', [np.mean, np.std]]])) - tm.assert_frame_equal(result, expected) + def aggfun(ser): + return ser.size - def foo(x): - return np.mean(x) + result = DataFrame().groupby(df.A).agg(aggfun) + assert isinstance(result, DataFrame) + assert len(result) == 0 - def bar(x): - return np.std(x, ddof=1) - # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - d = OrderedDict([['C', np.mean], - ['D', OrderedDict([['foo', np.mean], - ['bar', np.std]])]]) - result = grouped.aggregate(d) +def test_wrap_agg_out(three_group): + grouped = three_group.groupby(['A', 'B']) - d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]]) + def func(ser): + if ser.dtype == np.object: + raise TypeError + else: + return ser.sum() + + result = grouped.aggregate(func) + exp_grouped = three_group.loc[:, three_group.columns != 'C'] + expected = exp_grouped.groupby(['A', 'B']).aggregate(func) + tm.assert_frame_equal(result, expected) + + +def test_agg_multiple_functions_maintain_order(df): + # GH #610 + funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)] + result = df.groupby('A')['C'].agg(funcs) + exp_cols = Index(['mean', 'max', 'min']) + + tm.assert_index_equal(result.columns, exp_cols) + + +def test_multiple_functions_tuples_and_non_tuples(df): + # #1359 + funcs = [('foo', 'mean'), 'std'] + ex_funcs = [('foo', 'mean'), ('std', 'std')] + + result = df.groupby('A')['C'].agg(funcs) + expected = df.groupby('A')['C'].agg(ex_funcs) + tm.assert_frame_equal(result, expected) + + result = df.groupby('A').agg(funcs) + expected = df.groupby('A').agg(ex_funcs) + tm.assert_frame_equal(result, expected) + + +def test_agg_multiple_functions_too_many_lambdas(df): + grouped = df.groupby('A') + funcs = ['mean', lambda x: x.mean(), lambda x: x.std()] + + msg = 'Function names must be unique, found multiple named ' + with tm.assert_raises_regex(SpecificationError, msg): + grouped.agg(funcs) + + +def test_more_flexible_frame_multi_function(df): + grouped = df.groupby('A') + + exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]])) + exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]])) + + expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1) + expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1) + + d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]]) + result = grouped.aggregate(d) + + tm.assert_frame_equal(result, expected) + + # be careful + result = grouped.aggregate(OrderedDict([['C', np.mean], + ['D', [np.mean, np.std]]])) + expected = grouped.aggregate(OrderedDict([['C', np.mean], + ['D', [np.mean, np.std]]])) + tm.assert_frame_equal(result, expected) + + def foo(x): + return np.mean(x) + + def bar(x): + return np.std(x, ddof=1) + + # this uses column selection & renaming + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + d = OrderedDict([['C', np.mean], + ['D', OrderedDict([['foo', np.mean], + ['bar', np.std]])]]) + result = grouped.aggregate(d) + + d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]]) + expected = grouped.aggregate(d) + + tm.assert_frame_equal(result, expected) + + +def test_multi_function_flexible_mix(df): + # GH #1268 + grouped = df.groupby('A') + + # Expected + d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])], + ['D', {'sum': 'sum'}]]) + # this uses column selection & renaming + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): expected = grouped.aggregate(d) - tm.assert_frame_equal(result, expected) - - def test_multi_function_flexible_mix(self): - # GH #1268 - grouped = self.df.groupby('A') - - # Expected - d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])], - ['D', {'sum': 'sum'}]]) - # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = grouped.aggregate(d) - - # Test 1 - d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])], - ['D', 'sum']]) - # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = grouped.aggregate(d) - tm.assert_frame_equal(result, expected) - - # Test 2 - d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])], - ['D', ['sum']]]) - # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = grouped.aggregate(d) - tm.assert_frame_equal(result, expected) + # Test 1 + d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])], + ['D', 'sum']]) + # this uses column selection & renaming + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = grouped.aggregate(d) + tm.assert_frame_equal(result, expected) + + # Test 2 + d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])], + ['D', ['sum']]]) + # this uses column selection & renaming + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = grouped.aggregate(d) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index f8e44b1548819..575eae1916f4c 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -8,13 +8,15 @@ import pytest -from datetime import datetime, timedelta +import datetime as dt from functools import partial import numpy as np import pandas as pd -from pandas import date_range, DataFrame, Index, MultiIndex, Series +from pandas import ( + date_range, DataFrame, Index, MultiIndex, PeriodIndex, period_range, Series +) from pandas.core.groupby import SpecificationError from pandas.io.formats.printing import pprint_thing import pandas.util.testing as tm @@ -50,7 +52,8 @@ def test_agg_datetimes_mixed(): 'value': [x[2] for x in data]}) data = [[row[0], - datetime.strptime(row[1], '%Y-%m-%d').date() if row[1] else None, + (dt.datetime.strptime(row[1], '%Y-%m-%d').date() + if row[1] else None), row[2]] for row in data] @@ -68,7 +71,6 @@ def test_agg_datetimes_mixed(): def test_agg_period_index(): - from pandas import period_range, PeriodIndex prng = period_range('2012-1-1', freq='M', periods=3) df = DataFrame(np.random.randn(3, 2), index=prng) rs = df.groupby(level=0).sum() @@ -125,7 +127,7 @@ def test_agg_dict_parameter_cast_result_dtypes(): def test_agg_cast_results_dtypes(): # similar to GH12821 # xref #11444 - u = [datetime(2015, x + 1, 1) for x in range(12)] + u = [dt.datetime(2015, x + 1, 1) for x in range(12)] v = list('aaabbbbbbccd') df = pd.DataFrame({'X': v, 'Y': u}) @@ -292,9 +294,7 @@ def test_agg_nested_dicts(): def test_agg_item_by_item_raise_typeerror(): - from numpy.random import randint - - df = DataFrame(randint(10, size=(20, 10))) + df = DataFrame(np.random.randint(10, size=(20, 10))) def raiseException(df): pprint_thing('----------------------------------------') @@ -344,7 +344,6 @@ def P1(a): except Exception: return np.nan - import datetime as dt df = DataFrame({'col1': [1, 2, 3, 4], 'col2': [10, 25, 26, 31], 'date': [dt.date(2013, 2, 10), dt.date(2013, 2, 10), @@ -403,7 +402,8 @@ def test_agg_timezone_round_trip(): # GH 15426 ts = pd.Timestamp("2016-01-01 12:00:00", tz='US/Pacific') df = pd.DataFrame({'a': 1, - 'b': [ts + timedelta(minutes=nn) for nn in range(10)]}) + 'b': [ts + dt.timedelta(minutes=nn) + for nn in range(10)]}) result1 = df.groupby('a')['b'].agg(np.min).iloc[0] result2 = df.groupby('a')['b'].agg(lambda x: np.min(x)).iloc[0]