|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | + |
| 3 | +""" |
| 4 | +Tests for groupby .agg behavior. Note that .apply is tested more generally in test_groupby.py. |
| 5 | +""" |
| 6 | + |
| 7 | +import pytest |
| 8 | + |
| 9 | +import numpy as np |
| 10 | +import pandas as pd |
| 11 | + |
| 12 | +from pandas import concat, DataFrame, Index, MultiIndex, Series |
| 13 | +from pandas.core.groupby import SpecificationError |
| 14 | +from pandas.compat import OrderedDict |
| 15 | +import pandas.util.testing as tm |
| 16 | + |
| 17 | + |
| 18 | +class TestGroupByAggregate(object): |
| 19 | + |
def setup_method(self, method):
    """Populate the fixture attributes that the tests read from ``self``.

    ``method`` is accepted but not used in the body.
    """
    # Time-series based fixtures built from the pandas testing helpers.
    self.ts = tm.makeTimeSeries()
    self.seriesd = tm.getSeriesData()
    self.tsd = tm.getTimeSeriesData()
    self.frame = DataFrame(self.seriesd)
    self.tsframe = DataFrame(self.tsd)

    # String key columns used by both ``df`` and ``df_mixed_floats``.
    key_a = ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo']
    key_b = ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three']

    self.df = DataFrame({
        'A': key_a,
        'B': key_b,
        'C': np.random.randn(8),
        'D': np.random.randn(8),
    })

    # Same layout as ``df`` but with a float32 ``D`` column.
    self.df_mixed_floats = DataFrame({
        'A': key_a,
        'B': key_b,
        'C': np.random.randn(8),
        'D': np.random.randn(8).astype('float32'),
    })

    # Frame indexed by a two-level MultiIndex ('first', 'second').
    levels = [['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']]
    labels = [[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
              [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]]
    index = MultiIndex(levels=levels, labels=labels,
                       names=['first', 'second'])
    self.mframe = DataFrame(np.random.randn(10, 3), index=index,
                            columns=['A', 'B', 'C'])

    # Three string columns (A, B, C) plus three random float columns.
    self.three_group = DataFrame({
        'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
              'foo', 'foo', 'foo'],
        'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
              'two', 'two', 'one'],
        'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
              'dull', 'shiny', 'shiny', 'shiny'],
        'D': np.random.randn(11),
        'E': np.random.randn(11),
        'F': np.random.randn(11),
    })
| 58 | + |
| 59 | + def test_agg_regression1(self): |
| 60 | + grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) |
| 61 | + result = grouped.agg(np.mean) |
| 62 | + expected = grouped.mean() |
| 63 | + tm.assert_frame_equal(result, expected) |
| 64 | + |
def test_agg_must_agg(self):
    """``agg`` raises when the function does not reduce each group."""
    grouped = self.df.groupby('A')['C']
    msg = "Must produce aggregated value"

    # Neither callable collapses a group to a single value.
    non_reducers = (
        lambda x: x.describe(),
        lambda x: x.index[:2],
    )
    for func in non_reducers:
        with tm.assert_raises_regex(Exception, msg):
            grouped.agg(func)
| 73 | + |
| 74 | + def test_agg_ser_multi_key(self): |
| 75 | + # TODO(wesm): unused |
| 76 | + ser = self.df.C # noqa |
| 77 | + |
| 78 | + f = lambda x: x.sum() |
| 79 | + results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f) |
| 80 | + expected = self.df.groupby(['A', 'B']).sum()['C'] |
| 81 | + tm.assert_series_equal(results, expected) |
| 82 | + |
def test_agg_apply_corner(self):
    """Grouping on all-NaN keys gives empty results for sum, agg and apply."""
    # nothing to group, all NA
    nan_keys = self.ts * np.nan
    grouped = self.ts.groupby(nan_keys)
    assert self.ts.dtype == np.float64

    # groupby float64 values results in Float64Index
    empty_index = pd.Index([], dtype=np.float64)
    exp = Series([], dtype=np.float64, index=empty_index)
    tm.assert_series_equal(grouped.sum(), exp)
    tm.assert_series_equal(grouped.agg(np.sum), exp)
    tm.assert_series_equal(grouped.apply(np.sum), exp,
                           check_index_type=False)

    # DataFrame
    frame_keys = self.tsframe['A'] * np.nan
    grouped = self.tsframe.groupby(frame_keys)
    exp_df = DataFrame(columns=self.tsframe.columns, dtype=float,
                       index=pd.Index([], dtype=np.float64))
    tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False)
    tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
    # apply is compared against the same empty frame with no columns
    no_columns = exp_df.iloc[:, :0]
    tm.assert_frame_equal(grouped.apply(np.sum), no_columns,
                          check_names=False)
| 104 | + |
def test_agg_grouping_is_list_tuple(self):
    """``agg(np.mean)`` equals ``mean()`` when the grouper is a list or tuple."""
    from pandas.core.groupby import Grouping

    df = tm.makeTimeDataFrame()
    grouped = df.groupby(lambda x: x.year)

    # Capture the original grouper once; it is re-wrapped for each container.
    grouper = grouped.grouper.groupings[0].grouper

    for container in (list, tuple):
        grouped.grouper.groupings[0] = Grouping(self.ts.index,
                                                container(grouper))
        tm.assert_frame_equal(grouped.agg(np.mean), grouped.mean())
| 123 | + |
| 124 | + def test_agg_python_multiindex(self): |
| 125 | + grouped = self.mframe.groupby(['A', 'B']) |
| 126 | + |
| 127 | + result = grouped.agg(np.mean) |
| 128 | + expected = grouped.mean() |
| 129 | + tm.assert_frame_equal(result, expected) |
| 130 | + |
@pytest.mark.parametrize('groupbyfunc', [
    lambda x: x.weekday(),
    [lambda x: x.month, lambda x: x.weekday()],
])
def test_aggregate_str_func(self, groupbyfunc):
    """Aggregating by function name matches calling the named method."""
    grouped = self.tsframe.groupby(groupbyfunc)

    # single series
    column_a = grouped['A']
    tm.assert_series_equal(column_a.agg('std'), grouped['A'].std())

    # group frame by function name
    tm.assert_frame_equal(grouped.aggregate('var'), grouped.var())

    # group frame by function dict
    how = OrderedDict([['A', 'var'],
                       ['B', 'std'],
                       ['C', 'mean'],
                       ['D', 'sem']])
    result = grouped.agg(how)
    # Expected frame: call the same-named method on each selected column.
    expected = DataFrame(OrderedDict(
        [col, getattr(grouped[col], name)()] for col, name in how.items()))
    tm.assert_frame_equal(result, expected)
| 158 | + |
def test_aggregate_item_by_item(self):
    """Check ``agg`` with a function that returns each column's size.

    For ``self.df`` grouped by ``A``, every aggregated column of a group
    must equal that group's row count.  Aggregating an empty frame grouped
    by ``self.df.A`` must give an empty DataFrame.
    """
    def aggfun(ser):
        return ser.size

    grouped = self.df.groupby('A')
    result = grouped.agg(aggfun)

    # Row counts of the two groups, used as the expected value per column.
    foo = (self.df.A == 'foo').sum()
    bar = (self.df.A == 'bar').sum()
    K = len(result.columns)

    # GH5782
    # odd comparisons can result here, so cast to make easy
    exp = pd.Series(np.array([foo] * K), index=list('BCD'),
                    dtype=np.float64, name='foo')
    tm.assert_series_equal(result.xs('foo'), exp)

    exp = pd.Series(np.array([bar] * K), index=list('BCD'),
                    dtype=np.float64, name='bar')
    tm.assert_almost_equal(result.xs('bar'), exp)

    # Empty frame: the result is still a DataFrame and has no rows.
    result = DataFrame().groupby(self.df.A).agg(aggfun)
    assert isinstance(result, DataFrame)
    assert len(result) == 0
| 186 | + |
| 187 | + def test_wrap_agg_out(self): |
| 188 | + grouped = self.three_group.groupby(['A', 'B']) |
| 189 | + |
| 190 | + def func(ser): |
| 191 | + if ser.dtype == np.object: |
| 192 | + raise TypeError |
| 193 | + else: |
| 194 | + return ser.sum() |
| 195 | + |
| 196 | + result = grouped.aggregate(func) |
| 197 | + exp_grouped = self.three_group.loc[:, self.three_group.columns != 'C'] |
| 198 | + expected = exp_grouped.groupby(['A', 'B']).aggregate(func) |
| 199 | + tm.assert_frame_equal(result, expected) |
| 200 | + |
| 201 | + def test_agg_multiple_functions_maintain_order(self): |
| 202 | + # GH #610 |
| 203 | + funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)] |
| 204 | + result = self.df.groupby('A')['C'].agg(funcs) |
| 205 | + exp_cols = Index(['mean', 'max', 'min']) |
| 206 | + |
| 207 | + tm.assert_index_equal(result.columns, exp_cols) |
| 208 | + |
| 209 | + def test_multiple_functions_tuples_and_non_tuples(self): |
| 210 | + # #1359 |
| 211 | + funcs = [('foo', 'mean'), 'std'] |
| 212 | + ex_funcs = [('foo', 'mean'), ('std', 'std')] |
| 213 | + |
| 214 | + result = self.df.groupby('A')['C'].agg(funcs) |
| 215 | + expected = self.df.groupby('A')['C'].agg(ex_funcs) |
| 216 | + tm.assert_frame_equal(result, expected) |
| 217 | + |
| 218 | + result = self.df.groupby('A').agg(funcs) |
| 219 | + expected = self.df.groupby('A').agg(ex_funcs) |
| 220 | + tm.assert_frame_equal(result, expected) |
| 221 | + |
def test_agg_multiple_functions_too_many_lambdas(self):
    """Two lambdas in one function list raise ``SpecificationError``."""
    msg = 'Function names must be unique, found multiple named <lambda>'
    funcs = ['mean', lambda x: x.mean(), lambda x: x.std()]
    grouped = self.df.groupby('A')

    with tm.assert_raises_regex(SpecificationError, msg):
        grouped.agg(funcs)
| 229 | + |
def test_more_flexible_frame_multi_function(self):
    """Exercise dict-of-lists and nested-dict specs for ``aggregate``."""
    grouped = self.df.groupby('A')
    columns = ('C', 'D')

    # Expected two-level frame, assembled from one aggregation per function.
    exmean = grouped.agg(OrderedDict([col, np.mean] for col in columns))
    exstd = grouped.agg(OrderedDict([col, np.std] for col in columns))
    expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1)
    expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)

    both_per_column = OrderedDict(
        [col, [np.mean, np.std]] for col in columns)
    result = grouped.aggregate(both_per_column)
    tm.assert_frame_equal(result, expected)

    # be careful
    # NOTE(review): both sides below come from the same call with an equal
    # spec, so this only checks that the mixed scalar/list spec runs and is
    # repeatable -- confirm whether an independent expectation was intended.
    def mixed_spec():
        return OrderedDict([['C', np.mean],
                            ['D', [np.mean, np.std]]])

    result = grouped.aggregate(mixed_spec())
    expected = grouped.aggregate(mixed_spec())
    tm.assert_frame_equal(result, expected)

    def foo(x):
        return np.mean(x)

    def bar(x):
        return np.std(x, ddof=1)

    # this uses column selection & renaming
    renaming_spec = OrderedDict([
        ['C', np.mean],
        ['D', OrderedDict([['foo', np.mean],
                           ['bar', np.std]])]])
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = grouped.aggregate(renaming_spec)

    # Same aggregation spelled with functions named ``foo`` and ``bar``.
    expected = grouped.aggregate(
        OrderedDict([['C', [np.mean]], ['D', [foo, bar]]]))

    tm.assert_frame_equal(result, expected)
| 268 | + |
def test_multi_function_flexible_mix(self):
    """Mixed nested-dict specs for ``D`` all give the same aggregated frame."""
    # GH #1268
    grouped = self.df.groupby('A')

    def spec(d_how):
        # 'C' always uses the renaming dict; only the 'D' entry varies.
        return OrderedDict([
            ['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
            ['D', d_how]])

    # Expected
    # this uses column selection & renaming
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        expected = grouped.aggregate(spec({'sum': 'sum'}))

    # Test 1 passes a bare name for 'D', Test 2 a one-element list.
    for d_how in ('sum', ['sum']):
        # this uses column selection & renaming
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = grouped.aggregate(spec(d_how))
        tm.assert_frame_equal(result, expected)
0 commit comments