Skip to content

Commit 9643f01

Browse files
committed
TST: groupby.describe levels don't appear as column (#4792)
Restructure describe def; Fix another test; Refactoring tests; linting & patch groupby tests; add whatsnew; fix docstring; fix more tests; Added api example and documentation to describe; fix potential pep8 complaint; adjust doc description; renamed original test and add agg example in doc; simplify example
1 parent f93714b commit 9643f01

File tree

6 files changed

+154
-59
lines changed

6 files changed

+154
-59
lines changed

doc/source/whatsnew/v0.20.0.txt

+53
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,58 @@ New Behavior:
352352
In [11]: index.memory_usage(deep=True)
353353
Out[11]: 260
354354

355+
.. _whatsnew_0200.api_breaking.groupby_describe:
356+
357+
Groupby Describe Formatting
358+
^^^^^^^^^^^^^^^^^^^^^^^^^^^
359+
360+
The output formatting of ``groupby.describe()`` now labels the ``describe()`` metrics in the columns instead of the index.
361+
This format is consistent with ``groupby.agg()`` when applying multiple functions at once. (:issue:`4792`)
362+
363+
Previous Behavior:
364+
365+
.. code-block:: ipython
366+
367+
In [1]: df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})
368+
369+
In [2]: df.groupby('A').B.describe()
370+
Out[2]:
371+
A
372+
1 count 2.000000
373+
mean 1.500000
374+
std 0.707107
375+
min 1.000000
376+
25% 1.250000
377+
50% 1.500000
378+
75% 1.750000
379+
max 2.000000
380+
2 count 2.000000
381+
mean 3.500000
382+
std 0.707107
383+
min 3.000000
384+
25% 3.250000
385+
50% 3.500000
386+
75% 3.750000
387+
max 4.000000
388+
Name: B, dtype: float64
389+
390+
In [3]: df.groupby('A').B.agg([np.mean, np.std, np.min, np.max])
391+
Out[3]:
392+
mean std amin amax
393+
A
394+
1 1.5 0.707107 1 2
395+
2 3.5 0.707107 3 4
396+
397+
New Behavior:
398+
399+
.. ipython:: python
400+
401+
df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})
402+
403+
df.groupby('A').B.describe()
404+
405+
df.groupby('A').B.agg([np.mean, np.std, np.min, np.max])
406+
355407
.. _whatsnew_0200.api:
356408

357409
Other API Changes
@@ -366,6 +418,7 @@ Other API Changes
366418
- ``inplace`` arguments now require a boolean value, else a ``ValueError`` is raised (:issue:`14189`)
367419
- ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype``
368420
- ``DataFrame.asof()`` will return a null-filled ``Series`` instead of the scalar ``NaN`` if a match is not found (:issue:`15118`)
421+
369422
.. _whatsnew_0200.deprecations:
370423

371424
Deprecations

pandas/core/groupby.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,6 @@
8080
'mean', 'sum', 'min', 'max',
8181
'cumcount',
8282
'resample',
83-
'describe',
8483
'rank', 'quantile',
8584
'fillna',
8685
'mad',
@@ -1138,6 +1137,15 @@ def ohlc(self):
11381137
return self._apply_to_column_groupbys(
11391138
lambda x: x._cython_agg_general('ohlc'))
11401139

1140+
@Appender(DataFrame.describe.__doc__)
1141+
@Substitution(name='groupby')
1142+
@Appender(_doc_template)
1143+
def describe(self, **kwargs):
1144+
result = self.apply(lambda x: x.describe(**kwargs))
1145+
if self.axis == 1:
1146+
return result.T
1147+
return result.unstack()
1148+
11411149
@Substitution(name='groupby')
11421150
@Appender(_doc_template)
11431151
def resample(self, rule, *args, **kwargs):
@@ -3039,6 +3047,13 @@ def nlargest(self, n=5, keep='first'):
30393047
def nsmallest(self, n=5, keep='first'):
30403048
return self.apply(lambda x: x.nsmallest(n=n, keep=keep))
30413049

3050+
@Appender(Series.describe.__doc__)
3051+
def describe(self, **kwargs):
3052+
result = self.apply(lambda x: x.describe(**kwargs))
3053+
if self.axis == 1:
3054+
return result.T
3055+
return result.unstack()
3056+
30423057
def value_counts(self, normalize=False, sort=True, ascending=False,
30433058
bins=None, dropna=True):
30443059

pandas/tests/formats/test_format.py

+15-24
Original file line numberDiff line numberDiff line change
@@ -3544,30 +3544,21 @@ def test_to_latex_multiindex(self):
35443544
self.assertEqual(result, expected)
35453545

35463546
result = df.groupby('a').describe().to_latex()
3547-
expected = r"""\begin{tabular}{llr}
3548-
\toprule
3549-
& & c \\
3550-
a & {} & \\
3551-
\midrule
3552-
0 & count & 2.000000 \\
3553-
& mean & 1.500000 \\
3554-
& std & 0.707107 \\
3555-
& min & 1.000000 \\
3556-
& 25\% & 1.250000 \\
3557-
& 50\% & 1.500000 \\
3558-
& 75\% & 1.750000 \\
3559-
& max & 2.000000 \\
3560-
1 & count & 2.000000 \\
3561-
& mean & 3.500000 \\
3562-
& std & 0.707107 \\
3563-
& min & 3.000000 \\
3564-
& 25\% & 3.250000 \\
3565-
& 50\% & 3.500000 \\
3566-
& 75\% & 3.750000 \\
3567-
& max & 4.000000 \\
3568-
\bottomrule
3569-
\end{tabular}
3570-
"""
3547+
expected = ('\\begin{tabular}{lrrrrrrrrrrrrrrrr}\n\\toprule\n{} & '
3548+
'a & & & & & & & & '
3549+
'c & & & & & & & '
3550+
' \\\\\n{} & count & mean & std & min & 25\\% & 50\\% '
3551+
'& 75\\% & max & count & mean & std & min & '
3552+
'25\\% & 50\\% & 75\\% & max \\\\\na & & & '
3553+
' & & & & & & & & '
3554+
' & & & & & '
3555+
'\\\\\n\\midrule\n0 & 2.0 & 0.0 & 0.0 & 0.0 & 0.0 & '
3556+
' 0.0 & 0.0 & 0.0 & 2.0 & 1.5 & 0.707107 & 1.0 & '
3557+
'1.25 & 1.5 & 1.75 & 2.0 \\\\\n1 & 2.0 & 1.0 & 0.0 '
3558+
'& 1.0 & 1.0 & 1.0 & 1.0 & 1.0 & 2.0 & 3.5 & '
3559+
'0.707107 & 3.0 & 3.25 & 3.5 & 3.75 & 4.0 '
3560+
'\\\\\n\\bottomrule\n\\end{tabular}\n')
3561+
35713562

35723563
self.assertEqual(result, expected)
35733564

pandas/tests/groupby/test_categorical.py

+14-8
Original file line numberDiff line numberDiff line change
@@ -159,17 +159,20 @@ def test_groupby_categorical(self):
159159
exp_cats = Categorical(ord_labels, ordered=True,
160160
categories=['foo', 'bar', 'baz', 'qux'])
161161
expected = ord_data.groupby(exp_cats, sort=False).describe()
162-
expected.index.names = [None, None]
163162
assert_frame_equal(desc_result, expected)
164163

165164
# GH 10460
166165
expc = Categorical.from_codes(np.arange(4).repeat(8),
167166
levels, ordered=True)
168167
exp = CategoricalIndex(expc)
169-
self.assert_index_equal(desc_result.index.get_level_values(0), exp)
168+
self.assert_index_equal((desc_result.stack()
169+
.index
170+
.get_level_values(0)), exp)
170171
exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
171172
'75%', 'max'] * 4)
172-
self.assert_index_equal(desc_result.index.get_level_values(1), exp)
173+
self.assert_index_equal((desc_result.stack()
174+
.index
175+
.get_level_values(1)), exp)
173176

174177
def test_groupby_datetime_categorical(self):
175178
# GH9049: ensure backward compatibility
@@ -196,7 +199,6 @@ def test_groupby_datetime_categorical(self):
196199
ord_labels = cats.take_nd(idx)
197200
ord_data = data.take(idx)
198201
expected = ord_data.groupby(ord_labels).describe()
199-
expected.index.names = [None, None]
200202
assert_frame_equal(desc_result, expected)
201203
tm.assert_index_equal(desc_result.index, expected.index)
202204
tm.assert_index_equal(
@@ -207,10 +209,14 @@ def test_groupby_datetime_categorical(self):
207209
expc = Categorical.from_codes(
208210
np.arange(4).repeat(8), levels, ordered=True)
209211
exp = CategoricalIndex(expc)
210-
self.assert_index_equal(desc_result.index.get_level_values(0), exp)
212+
self.assert_index_equal((desc_result.stack()
213+
.index
214+
.get_level_values(0)), exp)
211215
exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
212216
'75%', 'max'] * 4)
213-
self.assert_index_equal(desc_result.index.get_level_values(1), exp)
217+
self.assert_index_equal((desc_result.stack()
218+
.index
219+
.get_level_values(1)), exp)
214220

215221
def test_groupby_categorical_index(self):
216222

@@ -246,8 +252,8 @@ def test_groupby_describe_categorical_columns(self):
246252
df = DataFrame(np.random.randn(20, 4), columns=cats)
247253
result = df.groupby([1, 2, 3, 4] * 5).describe()
248254

249-
tm.assert_index_equal(result.columns, cats)
250-
tm.assert_categorical_equal(result.columns.values, cats.values)
255+
tm.assert_index_equal(result.stack().columns, cats)
256+
tm.assert_categorical_equal(result.stack().columns.values, cats.values)
251257

252258
def test_groupby_unstack_categorical(self):
253259
# GH11558 (example is taken from the original issue)

pandas/tests/groupby/test_groupby.py

+52-22
Original file line numberDiff line numberDiff line change
@@ -1447,7 +1447,7 @@ def test_attr_wrapper(self):
14471447
for name, gp in grouped:
14481448
expected[name] = gp.describe()
14491449
expected = DataFrame(expected).T
1450-
assert_frame_equal(result.unstack(), expected)
1450+
assert_frame_equal(result, expected)
14511451

14521452
# get attribute
14531453
result = grouped.dtype
@@ -1459,7 +1459,7 @@ def test_attr_wrapper(self):
14591459
def test_series_describe_multikey(self):
14601460
ts = tm.makeTimeSeries()
14611461
grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
1462-
result = grouped.describe().unstack()
1462+
result = grouped.describe()
14631463
assert_series_equal(result['mean'], grouped.mean(), check_names=False)
14641464
assert_series_equal(result['std'], grouped.std(), check_names=False)
14651465
assert_series_equal(result['min'], grouped.min(), check_names=False)
@@ -1468,7 +1468,7 @@ def test_series_describe_single(self):
14681468
ts = tm.makeTimeSeries()
14691469
grouped = ts.groupby(lambda x: x.month)
14701470
result = grouped.apply(lambda x: x.describe())
1471-
expected = grouped.describe()
1471+
expected = grouped.describe().stack()
14721472
assert_series_equal(result, expected)
14731473

14741474
def test_series_index_name(self):
@@ -1479,17 +1479,27 @@ def test_series_index_name(self):
14791479
def test_frame_describe_multikey(self):
14801480
grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
14811481
result = grouped.describe()
1482-
1482+
desc_groups = []
14831483
for col in self.tsframe:
1484-
expected = grouped[col].describe()
1485-
assert_series_equal(result[col], expected, check_names=False)
1484+
group = grouped[col].describe()
1485+
group_col = pd.MultiIndex([[col] * len(group.columns),
1486+
group.columns],
1487+
[[0] * len(group.columns),
1488+
range(len(group.columns))])
1489+
group = pd.DataFrame(group.values,
1490+
columns=group_col,
1491+
index=group.index)
1492+
desc_groups.append(group)
1493+
expected = pd.concat(desc_groups, axis=1)
1494+
tm.assert_frame_equal(result, expected)
14861495

14871496
groupedT = self.tsframe.groupby({'A': 0, 'B': 0,
14881497
'C': 1, 'D': 1}, axis=1)
14891498
result = groupedT.describe()
1490-
1491-
for name, group in groupedT:
1492-
assert_frame_equal(result[name], group.describe())
1499+
expected = self.tsframe.describe().T
1500+
expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index],
1501+
[range(4), range(len(expected.index))])
1502+
tm.assert_frame_equal(result, expected)
14931503

14941504
def test_frame_describe_tupleindex(self):
14951505

@@ -1499,10 +1509,27 @@ def test_frame_describe_tupleindex(self):
14991509
'z': [100, 200, 300, 400, 500] * 3})
15001510
df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
15011511
df2 = df1.rename(columns={'k': 'key'})
1502-
result = df1.groupby('k').describe()
1503-
expected = df2.groupby('key').describe()
1504-
expected.index.set_names(result.index.names, inplace=True)
1505-
assert_frame_equal(result, expected)
1512+
tm.assertRaises(ValueError, lambda: df1.groupby('k').describe())
1513+
tm.assertRaises(ValueError, lambda: df2.groupby('key').describe())
1514+
1515+
def test_frame_describe_unstacked_format(self):
1516+
# GH 4792
1517+
prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990,
1518+
pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499,
1519+
pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499}
1520+
volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000,
1521+
pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000,
1522+
pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000}
1523+
df = pd.DataFrame({'PRICE': prices,
1524+
'VOLUME': volumes})
1525+
result = df.groupby('PRICE').VOLUME.describe()
1526+
data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
1527+
df[df.PRICE == 25499].VOLUME.describe().values.tolist()]
1528+
expected = pd.DataFrame(data,
1529+
index=pd.Index([24990, 25499], name='PRICE'),
1530+
columns=['count', 'mean', 'std', 'min',
1531+
'25%', '50%', '75%', 'max'])
1532+
tm.assert_frame_equal(result, expected)
15061533

15071534
def test_frame_groupby(self):
15081535
grouped = self.tsframe.groupby(lambda x: x.weekday())
@@ -2994,16 +3021,21 @@ def test_non_cython_api(self):
29943021
assert_frame_equal(result, expected)
29953022

29963023
# describe
2997-
expected = DataFrame(dict(B=concat(
2998-
[df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()],
2999-
keys=[1, 3])))
3000-
expected.index.names = ['A', None]
3024+
expected_index = pd.Index([1, 3], name='A')
3025+
expected_col = pd.MultiIndex(levels=[['B'],
3026+
['count', 'mean', 'std', 'min',
3027+
'25%', '50%', '75%', 'max']],
3028+
labels=[[0] * 8, list(range(8))])
3029+
expected = pd.DataFrame([[1.0, 2.0, nan, 2.0, 2.0, 2.0, 2.0, 2.0],
3030+
[0.0, nan, nan, nan, nan, nan, nan, nan]],
3031+
index=expected_index,
3032+
columns=expected_col)
30013033
result = g.describe()
30023034
assert_frame_equal(result, expected)
30033035

3004-
expected = concat(
3005-
[df.loc[[0, 1], ['A', 'B']].describe(),
3006-
df.loc[[2], ['A', 'B']].describe()], keys=[0, 1])
3036+
expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T,
3037+
df[df.A == 3].describe().unstack().to_frame().T])
3038+
expected.index = pd.Index([0, 1])
30073039
result = gni.describe()
30083040
assert_frame_equal(result, expected)
30093041

@@ -5157,7 +5189,6 @@ def test_groupby_whitelist(self):
51575189
'tail',
51585190
'cumcount',
51595191
'resample',
5160-
'describe',
51615192
'rank',
51625193
'quantile',
51635194
'fillna',
@@ -5194,7 +5225,6 @@ def test_groupby_whitelist(self):
51945225
'tail',
51955226
'cumcount',
51965227
'resample',
5197-
'describe',
51985228
'rank',
51995229
'quantile',
52005230
'fillna',

pandas/tests/test_generic.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -1269,10 +1269,10 @@ def test_describe_typefiltering_groupby(self):
12691269
'numD': np.arange(24.) + .5,
12701270
'ts': tm.makeTimeSeries()[:24].index})
12711271
G = df.groupby('catA')
1272-
self.assertTrue(G.describe(include=['number']).shape == (16, 2))
1273-
self.assertTrue(G.describe(include=['number', 'object']).shape == (22,
1274-
3))
1275-
self.assertTrue(G.describe(include='all').shape == (26, 4))
1272+
self.assertTrue(G.describe(include=['number']).shape == (2, 16))
1273+
self.assertTrue(G.describe(include=['number', 'object']).shape == (2,
1274+
44))
1275+
self.assertTrue(G.describe(include='all').shape == (2, 65))
12761276

12771277
def test_describe_multi_index_df_column_names(self):
12781278
""" Test that column names persist after the describe operation."""

0 commit comments

Comments
 (0)