Skip to content

Commit 8376ec9

Browse files
jreback, GuessWhoSamFoo
authored and committed
TST: separate out grouping-type tests
1 parent 0f5f2af commit 8376ec9

File tree

6 files changed

+1304
-1199
lines changed

6 files changed

+1304
-1199
lines changed
+371
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,371 @@
1+
# -*- coding: utf-8 -*-
2+
3+
""" test function application """
4+
5+
import pytest
6+
7+
from string import ascii_lowercase
8+
from pandas import (date_range, Timestamp,
9+
Index, MultiIndex, DataFrame, Series)
10+
from pandas.util.testing import assert_frame_equal, assert_series_equal
11+
from pandas.compat import product as cart_product
12+
13+
import numpy as np
14+
15+
import pandas.util.testing as tm
16+
import pandas as pd
17+
from .common import MixIn
18+
19+
20+
# describe
21+
# --------------------------------
22+
23+
class TestDescribe(MixIn):
    """Tests for ``describe`` on grouped Series and DataFrame objects.

    The ``mframe``, ``df`` and ``tsframe`` attributes are expected to be
    supplied by ``MixIn``.
    """

    def test_apply_describe_bug(self):
        # smoke test only: describing a level-grouped frame must not raise
        self.mframe.groupby(level='first').describe()

    def test_series_describe_multikey(self):
        series = tm.makeTimeSeries()
        by_year_month = series.groupby([lambda x: x.year,
                                        lambda x: x.month])
        described = by_year_month.describe()

        # each describe column must agree with the matching reduction
        for stat in ('mean', 'std', 'min'):
            tm.assert_series_equal(described[stat],
                                   getattr(by_year_month, stat)(),
                                   check_names=False)

    def test_series_describe_single(self):
        series = tm.makeTimeSeries()
        by_month = series.groupby(lambda x: x.month)
        result = by_month.apply(lambda x: x.describe())
        expected = by_month.describe().stack()
        tm.assert_series_equal(result, expected)

    def test_series_index_name(self):
        c_only = self.df.loc[:, ['C']]
        aggregated = c_only.groupby(self.df['A']).agg(lambda x: x.mean())
        assert aggregated.index.name == 'A'

    def test_frame_describe_multikey(self):
        by_year_month = self.tsframe.groupby([lambda x: x.year,
                                              lambda x: x.month])
        result = by_year_month.describe()

        # build the expectation column by column, nesting each column's
        # describe output under the column name
        pieces = []
        for col in self.tsframe:
            described = by_year_month[col].describe()
            n_stats = len(described.columns)
            nested_columns = pd.MultiIndex(
                [[col] * n_stats, described.columns],
                [[0] * n_stats, range(n_stats)])
            pieces.append(pd.DataFrame(described.values,
                                       columns=nested_columns,
                                       index=described.index))
        expected = pd.concat(pieces, axis=1)
        tm.assert_frame_equal(result, expected)

        # same check when grouping along the columns
        column_to_group = {'A': 0, 'B': 0,
                           'C': 1, 'D': 1}
        result = self.tsframe.groupby(column_to_group, axis=1).describe()
        expected = self.tsframe.describe().T
        expected.index = pd.MultiIndex(
            [[0, 0, 1, 1], expected.index],
            [range(4), range(len(expected.index))])
        tm.assert_frame_equal(result, expected)

    def test_frame_describe_tupleindex(self):

        # GH 14848 - regression from 0.19.0 to 0.19.1
        tuple_keys = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
        df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3,
                         'y': [10, 20, 30, 40, 50] * 3,
                         'z': [100, 200, 300, 400, 500] * 3})
        df1['k'] = tuple_keys
        df2 = df1.rename(columns={'k': 'key'})

        with pytest.raises(ValueError):
            df1.groupby('k').describe()
        with pytest.raises(ValueError):
            df2.groupby('key').describe()

    def test_frame_describe_unstacked_format(self):
        # GH 4792
        stamps = [pd.Timestamp('2011-01-06 10:59:05', tz=None),
                  pd.Timestamp('2011-01-06 12:43:33', tz=None),
                  pd.Timestamp('2011-01-06 12:54:09', tz=None)]
        prices = dict(zip(stamps, [24990, 25499, 25499]))
        volumes = dict(zip(stamps, [1500000000, 5000000000, 100000000]))
        df = pd.DataFrame({'PRICE': prices,
                           'VOLUME': volumes})

        result = df.groupby('PRICE').VOLUME.describe()

        # one row of describe() values per distinct price
        rows = [df[df.PRICE == price].VOLUME.describe().values.tolist()
                for price in (24990, 25499)]
        expected = pd.DataFrame(rows,
                                index=pd.Index([24990, 25499], name='PRICE'),
                                columns=['count', 'mean', 'std', 'min',
                                         '25%', '50%', '75%', 'max'])
        tm.assert_frame_equal(result, expected)
103+
104+
105+
# nunique
106+
# --------------------------------
107+
108+
class TestNUnique(MixIn):
    """Tests for ``nunique`` on grouped Series and DataFrame objects."""

    def test_series_groupby_nunique(self):

        def check_nunique(df, keys, as_index=True):
            # compare the groupby nunique against applying Series.nunique
            # per group, for every sort/dropna combination
            for sort, dropna in cart_product((False, True), repeat=2):
                grouped = df.groupby(keys, as_index=as_index, sort=sort)
                fast = grouped['julie'].nunique(dropna=dropna)

                grouped = df.groupby(keys, as_index=as_index, sort=sort)
                slow = grouped['julie'].apply(Series.nunique, dropna=dropna)
                if not as_index:
                    slow = slow.reset_index(drop=True)

                assert_series_equal(fast, slow, check_names=False)

        days = date_range('2015-08-23', periods=10)
        letters = list(ascii_lowercase)

        for n, m in cart_product(10 ** np.arange(2, 6), (10, 100, 1000)):
            frame = DataFrame({
                'jim': np.random.choice(letters, n),
                'joe': np.random.choice(days, n),
                'julie': np.random.randint(0, m, n)
            })

            for keys in (['jim'], ['jim', 'joe']):
                check_nunique(frame, keys)

            # blank out regularly spaced rows in each column
            null_slices = ((1, 17, 'jim'),
                           (3, 37, 'joe'),
                           (7, 19, 'julie'),
                           (8, 19, 'julie'),
                           (9, 19, 'julie'))
            for start, step, column in null_slices:
                frame.loc[start::step, column] = None

            for as_index in (True, False):
                for keys in (['jim'], ['jim', 'joe']):
                    check_nunique(frame, keys, as_index=as_index)

    def test_nunique(self):
        df = DataFrame({
            'A': list('abbacc'),
            'B': list('abxacc'),
            'C': list('abbacx'),
        })

        expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
        result = df.groupby('A', as_index=False).nunique()
        tm.assert_frame_equal(result, expected)

        # as_index
        expected.index = Index(list('abc'), name='A')
        result = df.groupby('A').nunique()
        tm.assert_frame_equal(result, expected)

        # with na
        with_nulls = df.replace({'x': None})
        result = with_nulls.groupby('A').nunique(dropna=False)
        tm.assert_frame_equal(result, expected)

        # dropna
        expected = DataFrame({col: [1] * 3 for col in 'ABC'},
                             index=Index(list('abc'), name='A'))
        result = with_nulls.groupby('A').nunique()
        tm.assert_frame_equal(result, expected)

    def test_nunique_with_object(self):
        # GH 11077
        records = [[100, 1, 'Alice'],
                   [200, 2, 'Bob'],
                   [300, 3, 'Charlie'],
                   [-400, 4, 'Dan'],
                   [500, 5, 'Edith']]
        data = pd.DataFrame(records, columns=['amount', 'id', 'name'])

        result = data.groupby(['id', 'amount'])['name'].nunique()
        expected = pd.Series(
            [1] * 5,
            name='name',
            index=MultiIndex.from_arrays([data.id, data.amount]))
        tm.assert_series_equal(result, expected)

    def test_nunique_with_empty_series(self):
        # GH 12553
        empty = pd.Series(name='name')
        result = empty.groupby(level=0).nunique()
        expected = pd.Series(name='name', dtype='int64')
        tm.assert_series_equal(result, expected)

    def test_nunique_with_timegrouper(self):
        # GH 13453
        stamps = [Timestamp('2016-06-28 09:35:35'),
                  Timestamp('2016-06-28 16:09:30'),
                  Timestamp('2016-06-28 16:46:28')]
        frame = pd.DataFrame({'time': stamps,
                              'data': ['1', '2', '3']}).set_index('time')

        hourly = frame.groupby(pd.Grouper(freq='h'))['data']
        result = hourly.nunique()

        hourly = frame.groupby(pd.Grouper(freq='h'))['data']
        expected = hourly.apply(pd.Series.nunique)
        tm.assert_series_equal(result, expected)
211+
212+
213+
# count
214+
# --------------------------------
215+
216+
class TestCount(MixIn):
    """Tests for ``count`` on grouped Series and DataFrame objects."""

    def test_groupby_timedelta_cython_count(self):
        deltas = np.arange(4).astype('timedelta64[ns]')
        df = DataFrame({'g': list('ab' * 2),
                        'delt': deltas})
        expected = Series([2, 2],
                          index=pd.Index(['a', 'b'], name='g'),
                          name='delt')
        result = df.groupby('g').delt.count()
        tm.assert_series_equal(expected, result)

    def test_count(self):
        n = 1 << 15
        dr = date_range('2015-08-30', periods=n // 10, freq='T')
        letters = list(ascii_lowercase)

        df = DataFrame({
            '1st': np.random.choice(letters, n),
            '2nd': np.random.randint(0, 5, n),
            '3rd': np.random.randn(n).round(3),
            '4th': np.random.randint(-10, 10, n),
            '5th': np.random.choice(dr, n),
            '6th': np.random.randn(n).round(3),
            '7th': np.random.randn(n).round(3),
            '8th': np.random.choice(dr, n) - np.random.choice(dr, 1),
            '9th': np.random.choice(letters, n)
        })

        # knock out random rows in every column except '1st', '2nd', '4th'
        for col in df.columns.drop(['1st', '2nd', '4th']):
            df.loc[np.random.choice(n, n // 10), col] = np.nan

        df['9th'] = df['9th'].astype('category')

        for key in '1st', '2nd', ['1st', '2nd']:
            counted = df.groupby(key).count()
            applied = df.groupby(key).apply(DataFrame.count)
            applied = applied.drop(key, axis=1)
            assert_frame_equal(counted, applied)

        # GH5610
        # count counts non-nulls
        df = pd.DataFrame([[1, 2, 'foo'],
                           [1, np.nan, 'bar'],
                           [3, np.nan, np.nan]],
                          columns=['A', 'B', 'C'])

        count_as = df.groupby('A').count()
        count_not_as = df.groupby('A', as_index=False).count()

        expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'],
                             index=[1, 3])
        expected.index.name = 'A'
        assert_frame_equal(count_not_as, expected.reset_index())
        assert_frame_equal(count_as, expected)

        count_B = df.groupby('A')['B'].count()
        assert_series_equal(count_B, expected['B'])

    def test_count_object(self):
        # (values of column 'a', expected per-group counts)
        cases = (
            (['a'] * 3 + ['b'] * 3, [3, 3]),
            (['a', np.nan, np.nan] + ['b'] * 3, [1, 3]),
        )
        for a_values, counts in cases:
            df = pd.DataFrame({'a': a_values,
                               'c': [2] * 3 + [3] * 3})
            result = df.groupby('c').a.count()
            expected = pd.Series(counts,
                                 index=pd.Index([2, 3], name='c'),
                                 name='a')
            tm.assert_series_equal(result, expected)

    def test_count_cross_type(self):  # GH8169
        left_block = np.random.randint(0, 5, (100, 2))
        right_block = np.random.randint(0, 2, (100, 2))
        vals = np.hstack((left_block, right_block))

        df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd'])
        df[df == 2] = np.nan
        expected = df.groupby(['c', 'd']).count()

        # counts must not change when 'a' and 'b' are cast to another dtype
        for dtype in ['float32', 'object']:
            for col in ('a', 'b'):
                df[col] = df[col].astype(dtype)
            result = df.groupby(['c', 'd']).count()
            tm.assert_frame_equal(result, expected)

    def test_lower_int_prec_count(self):
        df = DataFrame({'a': np.array([0, 1, 2, 100], np.int8),
                        'b': np.array([1, 2, 3, 6], np.uint32),
                        'c': np.array([4, 5, 6, 8], np.int16),
                        'grp': list('ab' * 2)})
        result = df.groupby('grp').count()
        expected = DataFrame({'a': [2, 2],
                              'b': [2, 2],
                              'c': [2, 2]},
                             index=pd.Index(list('ab'), name='grp'))
        tm.assert_frame_equal(result, expected)

    def test_count_uses_size_on_exception(self):
        class RaisingObjectException(Exception):
            pass

        class RaisingObject(object):

            def __init__(self, msg='I will raise inside Cython'):
                super(RaisingObject, self).__init__()
                self.msg = msg

            def __eq__(self, other):
                # gets called in Cython to check that raising calls the method
                raise RaisingObjectException(self.msg)

        raisers = [RaisingObject() for _ in range(4)]
        df = DataFrame({'a': raisers,
                        'grp': list('ab' * 2)})
        result = df.groupby('grp').count()
        expected = DataFrame({'a': [2, 2]},
                             index=pd.Index(list('ab'), name='grp'))
        tm.assert_frame_equal(result, expected)
339+
340+
341+
# size
342+
# --------------------------------
343+
344+
class TestSize(MixIn):
    """Tests for ``size`` on grouped objects.

    The ``df`` attribute is expected to be supplied by ``MixIn``.
    """

    def test_size(self):
        # the reported size of every group must match the group's length
        for keys in (['A', 'B'], 'A', 'B'):
            grouped = self.df.groupby(keys)
            sizes = grouped.size()
            for key, group in grouped:
                assert sizes[key] == len(group)

        df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc'))
        for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])):
            sized = df.groupby(key, sort=sort).size()
            applied = df.groupby(key, sort=sort)['c'].apply(
                lambda a: a.shape[0])
            assert_series_equal(sized, applied, check_names=False)

        # GH11699
        df = DataFrame([], columns=['A', 'B'])
        empty_sizes = Series([], dtype='int64', index=Index([], name='A'))
        assert_series_equal(df.groupby('A').size(), empty_sizes)

0 commit comments

Comments
 (0)