Skip to content

Commit faeac49

Browse files
alysivji authored and jreback committed
TST: organize and cleanup pandas/tests/groupby/test_aggregate.py (pandas-dev#18931)
1 parent 8433562 commit faeac49

File tree

6 files changed

+985
-961
lines changed

6 files changed

+985
-961
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
 .ipynb_checkpoints
 .tags
 .cache/
+.vscode/
 
 # Compiled source #
 ###################

pandas/tests/groupby/aggregate/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,294 @@
1+
# -*- coding: utf-8 -*-
2+
3+
"""
4+
test .agg behavior / note that .apply is tested generally in test_groupby.py
5+
"""
6+
7+
import pytest
8+
9+
import numpy as np
10+
import pandas as pd
11+
12+
from pandas import concat, DataFrame, Index, MultiIndex, Series
13+
from pandas.core.groupby import SpecificationError
14+
from pandas.compat import OrderedDict
15+
import pandas.util.testing as tm
16+
17+
18+
class TestGroupByAggregate(object):
    """Tests for ``.agg`` / ``.aggregate`` on Series and DataFrame groupbys."""

    def setup_method(self, method):
        """Rebuild the fixtures (random data) shared by the tests below."""
        self.ts = tm.makeTimeSeries()

        self.seriesd = tm.getSeriesData()
        self.tsd = tm.getTimeSeriesData()
        self.frame = DataFrame(self.seriesd)
        self.tsframe = DataFrame(self.tsd)

        self.df = DataFrame(
            {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
             'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
             'C': np.random.randn(8),
             'D': np.random.randn(8)})

        # same layout as self.df, but column 'D' is float32
        self.df_mixed_floats = DataFrame(
            {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
             'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
             'C': np.random.randn(8),
             'D': np.array(np.random.randn(8), dtype='float32')})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.mframe = DataFrame(np.random.randn(10, 3), index=index,
                                columns=['A', 'B', 'C'])

        self.three_group = DataFrame(
            {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
                   'foo', 'foo', 'foo'],
             'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
                   'two', 'two', 'one'],
             'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
                   'dull', 'shiny', 'shiny', 'shiny'],
             'D': np.random.randn(11),
             'E': np.random.randn(11),
             'F': np.random.randn(11)})

    def test_agg_regression1(self):
        """``agg(np.mean)`` on a two-function grouping equals ``mean()``."""
        grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

    def test_agg_must_agg(self):
        """Functions that do not reduce to a single value are rejected."""
        grouped = self.df.groupby('A')['C']

        msg = "Must produce aggregated value"
        with tm.assert_raises_regex(Exception, msg):
            grouped.agg(lambda x: x.describe())
        with tm.assert_raises_regex(Exception, msg):
            grouped.agg(lambda x: x.index[:2])

    def test_agg_ser_multi_key(self):
        """A Series grouped by two key Series: callable agg equals sum()."""
        f = lambda x: x.sum()
        results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f)
        expected = self.df.groupby(['A', 'B']).sum()['C']
        tm.assert_series_equal(results, expected)

    def test_agg_apply_corner(self):
        """Grouping on all-NaN keys yields empty results for sum/agg/apply."""
        # nothing to group, all NA
        grouped = self.ts.groupby(self.ts * np.nan)
        assert self.ts.dtype == np.float64

        # groupby float64 values results in Float64Index
        exp = Series([], dtype=np.float64,
                     index=pd.Index([], dtype=np.float64))
        tm.assert_series_equal(grouped.sum(), exp)
        tm.assert_series_equal(grouped.agg(np.sum), exp)
        tm.assert_series_equal(grouped.apply(np.sum), exp,
                               check_index_type=False)

        # DataFrame
        grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan)
        exp_df = DataFrame(columns=self.tsframe.columns, dtype=float,
                           index=pd.Index([], dtype=np.float64))
        tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False)
        tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
        # apply is compared against the expected frame with no columns
        tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0],
                              check_names=False)

    def test_agg_grouping_is_list_tuple(self):
        """``agg`` works when the Grouping's grouper is a list or a tuple."""
        from pandas.core.groupby import Grouping

        df = tm.makeTimeDataFrame()

        grouped = df.groupby(lambda x: x.year)
        grouper = grouped.grouper.groupings[0].grouper
        # NOTE(review): the Grouping is built on self.ts.index, not df.index;
        # presumably both helpers produce the same index -- confirm.
        grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper))

        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

        grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper))

        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

    def test_agg_python_multiindex(self):
        """``agg(np.mean)`` equals ``mean()`` when grouping by two columns."""
        grouped = self.mframe.groupby(['A', 'B'])

        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize('groupbyfunc', [
        lambda x: x.weekday(),
        [lambda x: x.month, lambda x: x.weekday()],
    ])
    def test_aggregate_str_func(self, groupbyfunc):
        """Function names given as strings dispatch to the named method."""
        grouped = self.tsframe.groupby(groupbyfunc)

        # single series
        result = grouped['A'].agg('std')
        expected = grouped['A'].std()
        tm.assert_series_equal(result, expected)

        # group frame by function name
        result = grouped.aggregate('var')
        expected = grouped.var()
        tm.assert_frame_equal(result, expected)

        # group frame by function dict
        result = grouped.agg(OrderedDict([['A', 'var'],
                                          ['B', 'std'],
                                          ['C', 'mean'],
                                          ['D', 'sem']]))
        expected = DataFrame(OrderedDict([['A', grouped['A'].var()],
                                          ['B', grouped['B'].std()],
                                          ['C', grouped['C'].mean()],
                                          ['D', grouped['D'].sem()]]))
        tm.assert_frame_equal(result, expected)

    def test_aggregate_item_by_item(self):
        """A size-returning agg gives the group size in every column."""
        grouped = self.df.groupby('A')

        aggfun = lambda ser: ser.size
        result = grouped.agg(aggfun)
        foo = (self.df.A == 'foo').sum()
        bar = (self.df.A == 'bar').sum()
        K = len(result.columns)

        # GH5782
        # odd comparisons can result here, so cast to make easy
        exp = pd.Series(np.array([foo] * K), index=list('BCD'),
                        dtype=np.float64, name='foo')
        tm.assert_series_equal(result.xs('foo'), exp)

        exp = pd.Series(np.array([bar] * K), index=list('BCD'),
                        dtype=np.float64, name='bar')
        tm.assert_series_equal(result.xs('bar'), exp)

        # same reduction as above, this time as a named function
        def aggfun(ser):
            return ser.size

        # an empty frame grouped by an external key yields an empty frame
        result = DataFrame().groupby(self.df.A).agg(aggfun)
        assert isinstance(result, DataFrame)
        assert len(result) == 0

    def test_wrap_agg_out(self):
        """Columns whose aggregation raises TypeError are left out."""
        grouped = self.three_group.groupby(['A', 'B'])

        def func(ser):
            # builtin ``object`` instead of the removed ``np.object`` alias
            if ser.dtype == object:
                raise TypeError
            else:
                return ser.sum()

        result = grouped.aggregate(func)
        # expected: the same aggregation with object column 'C' dropped
        exp_grouped = self.three_group.loc[:, self.three_group.columns != 'C']
        expected = exp_grouped.groupby(['A', 'B']).aggregate(func)
        tm.assert_frame_equal(result, expected)

    def test_agg_multiple_functions_maintain_order(self):
        """Result columns keep the order of the ``(name, func)`` list."""
        # GH #610
        funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)]
        result = self.df.groupby('A')['C'].agg(funcs)
        exp_cols = Index(['mean', 'max', 'min'])

        tm.assert_index_equal(result.columns, exp_cols)

    def test_multiple_functions_tuples_and_non_tuples(self):
        """A bare name in a func list behaves like ``(name, name)``."""
        # #1359
        funcs = [('foo', 'mean'), 'std']
        ex_funcs = [('foo', 'mean'), ('std', 'std')]

        result = self.df.groupby('A')['C'].agg(funcs)
        expected = self.df.groupby('A')['C'].agg(ex_funcs)
        tm.assert_frame_equal(result, expected)

        result = self.df.groupby('A').agg(funcs)
        expected = self.df.groupby('A').agg(ex_funcs)
        tm.assert_frame_equal(result, expected)

    def test_agg_multiple_functions_too_many_lambdas(self):
        """Two lambdas in one func list raise SpecificationError."""
        grouped = self.df.groupby('A')
        funcs = ['mean', lambda x: x.mean(), lambda x: x.std()]

        msg = 'Function names must be unique, found multiple named <lambda>'
        with tm.assert_raises_regex(SpecificationError, msg):
            grouped.agg(funcs)

    def test_more_flexible_frame_multi_function(self):
        """Dict specs mapping columns to one function or a list of them."""
        grouped = self.df.groupby('A')

        exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]]))
        exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]]))

        # assemble (column, function) MultiIndex columns from the two frames
        expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1)
        expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)

        d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]])
        result = grouped.aggregate(d)

        tm.assert_frame_equal(result, expected)

        # be careful
        # NOTE(review): result and expected are built from the identical
        # expression, so this only checks that the call succeeds and is
        # repeatable -- consider an independently constructed expectation.
        result = grouped.aggregate(OrderedDict([['C', np.mean],
                                                ['D', [np.mean, np.std]]]))
        expected = grouped.aggregate(OrderedDict([['C', np.mean],
                                                  ['D', [np.mean, np.std]]]))
        tm.assert_frame_equal(result, expected)

        def foo(x):
            return np.mean(x)

        def bar(x):
            # presumably ddof=1 mirrors how groupby evaluates np.std -- confirm
            return np.std(x, ddof=1)

        # this uses column selection & renaming
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            d = OrderedDict([['C', np.mean],
                             ['D', OrderedDict([['foo', np.mean],
                                                ['bar', np.std]])]])
            result = grouped.aggregate(d)

        d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]])
        expected = grouped.aggregate(d)

        tm.assert_frame_equal(result, expected)

    def test_multi_function_flexible_mix(self):
        """Nested-dict renaming mixed with dict / str / list specs."""
        # GH #1268
        grouped = self.df.groupby('A')

        # Expected
        d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
                         ['D', {'sum': 'sum'}]])
        # this uses column selection & renaming
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            expected = grouped.aggregate(d)

        # Test 1
        d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
                         ['D', 'sum']])
        # this uses column selection & renaming
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = grouped.aggregate(d)
        tm.assert_frame_equal(result, expected)

        # Test 2
        d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
                         ['D', ['sum']]])
        # this uses column selection & renaming
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = grouped.aggregate(d)
        tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)