Skip to content

Commit 3d6fcdc

Browse files
mroeschke authored and jreback committed
API: Reformat output of groupby.describe (#4792)
closes #4792

Author: Matt Roeschke <[email protected]>
Author: Matthew Roeschke <[email protected]>

Closes #15260 from mroeschke/fix_4792 and squashes the following commits:

618bc46 [Matthew Roeschke] Merge branch 'master' into fix_4792
184378d [Matt Roeschke] TST: groupby.describe levels don't appear as column (#4792)
1 parent e303e26 commit 3d6fcdc

File tree

6 files changed: +150 -59 lines changed

doc/source/whatsnew/v0.20.0.txt

+53
Original file line number | Diff line number | Diff line change
@@ -356,6 +356,59 @@ New Behavior:
356356
In [11]: index.memory_usage(deep=True)
357357
Out[11]: 260
358358

359+
.. _whatsnew_0200.api_breaking.groupby_describe:
360+
361+
Groupby Describe Formatting
362+
^^^^^^^^^^^^^^^^^^^^^^^^^^^
363+
364+
The output formatting of ``groupby.describe()`` now labels the ``describe()`` metrics in the columns instead of the index.
365+
This format is consistent with ``groupby.agg()`` when applying multiple functions at once. (:issue:`4792`)
366+
367+
Previous Behavior:
368+
369+
.. code-block:: ipython
370+
371+
In [1]: df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})
372+
373+
In [2]: df.groupby('A').describe()
374+
Out[2]:
375+
B
376+
A
377+
1 count 2.000000
378+
mean 1.500000
379+
std 0.707107
380+
min 1.000000
381+
25% 1.250000
382+
50% 1.500000
383+
75% 1.750000
384+
max 2.000000
385+
2 count 2.000000
386+
mean 3.500000
387+
std 0.707107
388+
min 3.000000
389+
25% 3.250000
390+
50% 3.500000
391+
75% 3.750000
392+
max 4.000000
393+
394+
In [3]: df.groupby('A').agg([np.mean, np.std, np.min, np.max])
395+
Out[3]:
396+
B
397+
mean std amin amax
398+
A
399+
1 1.5 0.707107 1 2
400+
2 3.5 0.707107 3 4
401+
402+
New Behavior:
403+
404+
.. ipython:: python
405+
406+
df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})
407+
408+
df.groupby('A').describe()
409+
410+
df.groupby('A').agg([np.mean, np.std, np.min, np.max])
411+
359412
.. _whatsnew_0200.api:
360413

361414
Other API Changes

pandas/core/groupby.py

+18 -1
Original file line number | Diff line number | Diff line change
@@ -80,7 +80,6 @@
8080
'mean', 'sum', 'min', 'max',
8181
'cumcount',
8282
'resample',
83-
'describe',
8483
'rank', 'quantile',
8584
'fillna',
8685
'mad',
@@ -1138,6 +1137,16 @@ def ohlc(self):
11381137
return self._apply_to_column_groupbys(
11391138
lambda x: x._cython_agg_general('ohlc'))
11401139

1140+
@Appender(DataFrame.describe.__doc__)
1141+
@Substitution(name='groupby')
1142+
@Appender(_doc_template)
1143+
def describe(self, **kwargs):
1144+
self._set_group_selection()
1145+
result = self.apply(lambda x: x.describe(**kwargs))
1146+
if self.axis == 1:
1147+
return result.T
1148+
return result.unstack()
1149+
11411150
@Substitution(name='groupby')
11421151
@Appender(_doc_template)
11431152
def resample(self, rule, *args, **kwargs):
@@ -3039,6 +3048,14 @@ def nlargest(self, n=5, keep='first'):
30393048
def nsmallest(self, n=5, keep='first'):
30403049
return self.apply(lambda x: x.nsmallest(n=n, keep=keep))
30413050

3051+
@Appender(Series.describe.__doc__)
3052+
def describe(self, **kwargs):
3053+
self._set_group_selection()
3054+
result = self.apply(lambda x: x.describe(**kwargs))
3055+
if self.axis == 1:
3056+
return result.T
3057+
return result.unstack()
3058+
30423059
def value_counts(self, normalize=False, sort=True, ascending=False,
30433060
bins=None, dropna=True):
30443061

pandas/tests/formats/test_format.py

+9 -24
Original file line number | Diff line number | Diff line change
@@ -3545,30 +3545,15 @@ def test_to_latex_multiindex(self):
35453545
self.assertEqual(result, expected)
35463546

35473547
result = df.groupby('a').describe().to_latex()
3548-
expected = r"""\begin{tabular}{llr}
3549-
\toprule
3550-
& & c \\
3551-
a & {} & \\
3552-
\midrule
3553-
0 & count & 2.000000 \\
3554-
& mean & 1.500000 \\
3555-
& std & 0.707107 \\
3556-
& min & 1.000000 \\
3557-
& 25\% & 1.250000 \\
3558-
& 50\% & 1.500000 \\
3559-
& 75\% & 1.750000 \\
3560-
& max & 2.000000 \\
3561-
1 & count & 2.000000 \\
3562-
& mean & 3.500000 \\
3563-
& std & 0.707107 \\
3564-
& min & 3.000000 \\
3565-
& 25\% & 3.250000 \\
3566-
& 50\% & 3.500000 \\
3567-
& 75\% & 3.750000 \\
3568-
& max & 4.000000 \\
3569-
\bottomrule
3570-
\end{tabular}
3571-
"""
3548+
expected = ('\\begin{tabular}{lrrrrrrrr}\n\\toprule\n{} & c & '
3549+
' & & & & & & '
3550+
'\\\\\n{} & count & mean & std & min & 25\\% & '
3551+
'50\\% & 75\\% & max \\\\\na & & & '
3552+
' & & & & & \\\\\n\\midrule\n0 '
3553+
'& 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 '
3554+
'& 2.0 \\\\\n1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 '
3555+
'& 3.5 & 3.75 & 4.0 '
3556+
'\\\\\n\\bottomrule\n\\end{tabular}\n')
35723557

35733558
self.assertEqual(result, expected)
35743559

pandas/tests/groupby/test_categorical.py

+14 -8
Original file line number | Diff line number | Diff line change
@@ -107,17 +107,20 @@ def test_groupby_categorical(self):
107107
exp_cats = Categorical(ord_labels, ordered=True,
108108
categories=['foo', 'bar', 'baz', 'qux'])
109109
expected = ord_data.groupby(exp_cats, sort=False).describe()
110-
expected.index.names = [None, None]
111110
assert_frame_equal(desc_result, expected)
112111

113112
# GH 10460
114113
expc = Categorical.from_codes(np.arange(4).repeat(8),
115114
levels, ordered=True)
116115
exp = CategoricalIndex(expc)
117-
self.assert_index_equal(desc_result.index.get_level_values(0), exp)
116+
self.assert_index_equal((desc_result.stack()
117+
.index
118+
.get_level_values(0)), exp)
118119
exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
119120
'75%', 'max'] * 4)
120-
self.assert_index_equal(desc_result.index.get_level_values(1), exp)
121+
self.assert_index_equal((desc_result.stack()
122+
.index
123+
.get_level_values(1)), exp)
121124

122125
def test_groupby_datetime_categorical(self):
123126
# GH9049: ensure backward compatibility
@@ -144,7 +147,6 @@ def test_groupby_datetime_categorical(self):
144147
ord_labels = cats.take_nd(idx)
145148
ord_data = data.take(idx)
146149
expected = ord_data.groupby(ord_labels).describe()
147-
expected.index.names = [None, None]
148150
assert_frame_equal(desc_result, expected)
149151
tm.assert_index_equal(desc_result.index, expected.index)
150152
tm.assert_index_equal(
@@ -155,10 +157,14 @@ def test_groupby_datetime_categorical(self):
155157
expc = Categorical.from_codes(
156158
np.arange(4).repeat(8), levels, ordered=True)
157159
exp = CategoricalIndex(expc)
158-
self.assert_index_equal(desc_result.index.get_level_values(0), exp)
160+
self.assert_index_equal((desc_result.stack()
161+
.index
162+
.get_level_values(0)), exp)
159163
exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
160164
'75%', 'max'] * 4)
161-
self.assert_index_equal(desc_result.index.get_level_values(1), exp)
165+
self.assert_index_equal((desc_result.stack()
166+
.index
167+
.get_level_values(1)), exp)
162168

163169
def test_groupby_categorical_index(self):
164170

@@ -195,8 +201,8 @@ def test_groupby_describe_categorical_columns(self):
195201
df = DataFrame(np.random.randn(20, 4), columns=cats)
196202
result = df.groupby([1, 2, 3, 4] * 5).describe()
197203

198-
tm.assert_index_equal(result.columns, cats)
199-
tm.assert_categorical_equal(result.columns.values, cats.values)
204+
tm.assert_index_equal(result.stack().columns, cats)
205+
tm.assert_categorical_equal(result.stack().columns.values, cats.values)
200206

201207
def test_groupby_unstack_categorical(self):
202208
# GH11558 (example is taken from the original issue)

pandas/tests/groupby/test_groupby.py

+52 -22
Original file line number | Diff line number | Diff line change
@@ -1085,7 +1085,7 @@ def test_attr_wrapper(self):
10851085
for name, gp in grouped:
10861086
expected[name] = gp.describe()
10871087
expected = DataFrame(expected).T
1088-
assert_frame_equal(result.unstack(), expected)
1088+
assert_frame_equal(result, expected)
10891089

10901090
# get attribute
10911091
result = grouped.dtype
@@ -1097,7 +1097,7 @@ def test_attr_wrapper(self):
10971097
def test_series_describe_multikey(self):
10981098
ts = tm.makeTimeSeries()
10991099
grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
1100-
result = grouped.describe().unstack()
1100+
result = grouped.describe()
11011101
assert_series_equal(result['mean'], grouped.mean(), check_names=False)
11021102
assert_series_equal(result['std'], grouped.std(), check_names=False)
11031103
assert_series_equal(result['min'], grouped.min(), check_names=False)
@@ -1106,7 +1106,7 @@ def test_series_describe_single(self):
11061106
ts = tm.makeTimeSeries()
11071107
grouped = ts.groupby(lambda x: x.month)
11081108
result = grouped.apply(lambda x: x.describe())
1109-
expected = grouped.describe()
1109+
expected = grouped.describe().stack()
11101110
assert_series_equal(result, expected)
11111111

11121112
def test_series_index_name(self):
@@ -1117,17 +1117,27 @@ def test_series_index_name(self):
11171117
def test_frame_describe_multikey(self):
11181118
grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
11191119
result = grouped.describe()
1120-
1120+
desc_groups = []
11211121
for col in self.tsframe:
1122-
expected = grouped[col].describe()
1123-
assert_series_equal(result[col], expected, check_names=False)
1122+
group = grouped[col].describe()
1123+
group_col = pd.MultiIndex([[col] * len(group.columns),
1124+
group.columns],
1125+
[[0] * len(group.columns),
1126+
range(len(group.columns))])
1127+
group = pd.DataFrame(group.values,
1128+
columns=group_col,
1129+
index=group.index)
1130+
desc_groups.append(group)
1131+
expected = pd.concat(desc_groups, axis=1)
1132+
tm.assert_frame_equal(result, expected)
11241133

11251134
groupedT = self.tsframe.groupby({'A': 0, 'B': 0,
11261135
'C': 1, 'D': 1}, axis=1)
11271136
result = groupedT.describe()
1128-
1129-
for name, group in groupedT:
1130-
assert_frame_equal(result[name], group.describe())
1137+
expected = self.tsframe.describe().T
1138+
expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index],
1139+
[range(4), range(len(expected.index))])
1140+
tm.assert_frame_equal(result, expected)
11311141

11321142
def test_frame_describe_tupleindex(self):
11331143

@@ -1137,10 +1147,27 @@ def test_frame_describe_tupleindex(self):
11371147
'z': [100, 200, 300, 400, 500] * 3})
11381148
df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
11391149
df2 = df1.rename(columns={'k': 'key'})
1140-
result = df1.groupby('k').describe()
1141-
expected = df2.groupby('key').describe()
1142-
expected.index.set_names(result.index.names, inplace=True)
1143-
assert_frame_equal(result, expected)
1150+
tm.assertRaises(ValueError, lambda: df1.groupby('k').describe())
1151+
tm.assertRaises(ValueError, lambda: df2.groupby('key').describe())
1152+
1153+
def test_frame_describe_unstacked_format(self):
1154+
# GH 4792
1155+
prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990,
1156+
pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499,
1157+
pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499}
1158+
volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000,
1159+
pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000,
1160+
pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000}
1161+
df = pd.DataFrame({'PRICE': prices,
1162+
'VOLUME': volumes})
1163+
result = df.groupby('PRICE').VOLUME.describe()
1164+
data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
1165+
df[df.PRICE == 25499].VOLUME.describe().values.tolist()]
1166+
expected = pd.DataFrame(data,
1167+
index=pd.Index([24990, 25499], name='PRICE'),
1168+
columns=['count', 'mean', 'std', 'min',
1169+
'25%', '50%', '75%', 'max'])
1170+
tm.assert_frame_equal(result, expected)
11441171

11451172
def test_frame_groupby(self):
11461173
grouped = self.tsframe.groupby(lambda x: x.weekday())
@@ -2545,16 +2572,21 @@ def test_non_cython_api(self):
25452572
assert_frame_equal(result, expected)
25462573

25472574
# describe
2548-
expected = DataFrame(dict(B=concat(
2549-
[df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()],
2550-
keys=[1, 3])))
2551-
expected.index.names = ['A', None]
2575+
expected_index = pd.Index([1, 3], name='A')
2576+
expected_col = pd.MultiIndex(levels=[['B'],
2577+
['count', 'mean', 'std', 'min',
2578+
'25%', '50%', '75%', 'max']],
2579+
labels=[[0] * 8, list(range(8))])
2580+
expected = pd.DataFrame([[1.0, 2.0, nan, 2.0, 2.0, 2.0, 2.0, 2.0],
2581+
[0.0, nan, nan, nan, nan, nan, nan, nan]],
2582+
index=expected_index,
2583+
columns=expected_col)
25522584
result = g.describe()
25532585
assert_frame_equal(result, expected)
25542586

2555-
expected = concat(
2556-
[df.loc[[0, 1], ['A', 'B']].describe(),
2557-
df.loc[[2], ['A', 'B']].describe()], keys=[0, 1])
2587+
expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T,
2588+
df[df.A == 3].describe().unstack().to_frame().T])
2589+
expected.index = pd.Index([0, 1])
25582590
result = gni.describe()
25592591
assert_frame_equal(result, expected)
25602592

@@ -3872,7 +3904,6 @@ def test_groupby_whitelist(self):
38723904
'tail',
38733905
'cumcount',
38743906
'resample',
3875-
'describe',
38763907
'rank',
38773908
'quantile',
38783909
'fillna',
@@ -3909,7 +3940,6 @@ def test_groupby_whitelist(self):
39093940
'tail',
39103941
'cumcount',
39113942
'resample',
3912-
'describe',
39133943
'rank',
39143944
'quantile',
39153945
'fillna',

pandas/tests/test_generic.py

+4 -4
Original file line number | Diff line number | Diff line change
@@ -1267,10 +1267,10 @@ def test_describe_typefiltering_groupby(self):
12671267
'numD': np.arange(24.) + .5,
12681268
'ts': tm.makeTimeSeries()[:24].index})
12691269
G = df.groupby('catA')
1270-
self.assertTrue(G.describe(include=['number']).shape == (16, 2))
1271-
self.assertTrue(G.describe(include=['number', 'object']).shape == (22,
1272-
3))
1273-
self.assertTrue(G.describe(include='all').shape == (26, 4))
1270+
self.assertTrue(G.describe(include=['number']).shape == (2, 16))
1271+
self.assertTrue(G.describe(include=['number', 'object']).shape == (2,
1272+
33))
1273+
self.assertTrue(G.describe(include='all').shape == (2, 52))
12741274

12751275
def test_describe_multi_index_df_column_names(self):
12761276
""" Test that column names persist after the describe operation."""

0 commit comments

Comments
 (0)