Skip to content

Commit 184378d

Browse files
committed
TST: groupby.describe levels don't appear as column (pandas-dev#4792)
Restructure describe def; Fix another test; Refactoring tests; linting & patch groupby tests; add whatsnew; fix docstring; fix more tests; Added api example and documentation to describe; fix potential pep8 complaint; adjust doc description; renamed original test and add agg example in doc; simplify example; Eliminate grouper from result; simplify example in the whatsnew
1 parent f93714b commit 184378d

File tree

6 files changed

+151
-59
lines changed

6 files changed

+151
-59
lines changed

doc/source/whatsnew/v0.20.0.txt

+54
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,59 @@ New Behavior:
352352
In [11]: index.memory_usage(deep=True)
353353
Out[11]: 260
354354

355+
.. _whatsnew_0200.api_breaking.groupby_describe:
356+
357+
Groupby Describe Formatting
358+
^^^^^^^^^^^^^^^^^^^^^^^^^^^
359+
360+
The output formatting of ``groupby.describe()`` now labels the ``describe()`` metrics in the columns instead of the index.
361+
This format is consistent with ``groupby.agg()`` when applying multiple functions at once. (:issue:`4792`)
362+
363+
Previous Behavior:
364+
365+
.. code-block:: ipython
366+
367+
In [1]: df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})
368+
369+
In [2]: df.groupby('A').describe()
370+
Out[2]:
371+
B
372+
A
373+
1 count 2.000000
374+
mean 1.500000
375+
std 0.707107
376+
min 1.000000
377+
25% 1.250000
378+
50% 1.500000
379+
75% 1.750000
380+
max 2.000000
381+
2 count 2.000000
382+
mean 3.500000
383+
std 0.707107
384+
min 3.000000
385+
25% 3.250000
386+
50% 3.500000
387+
75% 3.750000
388+
max 4.000000
389+
390+
In [3]: df.groupby('A').agg([np.mean, np.std, np.min, np.max])
391+
Out[3]:
392+
B
393+
mean std amin amax
394+
A
395+
1 1.5 0.707107 1 2
396+
2 3.5 0.707107 3 4
397+
398+
New Behavior:
399+
400+
.. ipython:: python
401+
402+
df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})
403+
404+
df.groupby('A').describe()
405+
406+
df.groupby('A').agg([np.mean, np.std, np.min, np.max])
407+
355408
.. _whatsnew_0200.api:
356409

357410
Other API Changes
@@ -366,6 +419,7 @@ Other API Changes
366419
- ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`)
367420
- ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype``
368421
- ``DataFrame.asof()`` will return a null filled ``Series`` instead of the scalar ``NaN`` if a match is not found (:issue:`15118`)
422+
369423
.. _whatsnew_0200.deprecations:
370424

371425
Deprecations

pandas/core/groupby.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,6 @@
8080
'mean', 'sum', 'min', 'max',
8181
'cumcount',
8282
'resample',
83-
'describe',
8483
'rank', 'quantile',
8584
'fillna',
8685
'mad',
@@ -1138,6 +1137,16 @@ def ohlc(self):
11381137
return self._apply_to_column_groupbys(
11391138
lambda x: x._cython_agg_general('ohlc'))
11401139

1140+
@Appender(DataFrame.describe.__doc__)
1141+
@Substitution(name='groupby')
1142+
@Appender(_doc_template)
1143+
def describe(self, **kwargs):
1144+
self._set_group_selection()
1145+
result = self.apply(lambda x: x.describe(**kwargs))
1146+
if self.axis == 1:
1147+
return result.T
1148+
return result.unstack()
1149+
11411150
@Substitution(name='groupby')
11421151
@Appender(_doc_template)
11431152
def resample(self, rule, *args, **kwargs):
@@ -3039,6 +3048,14 @@ def nlargest(self, n=5, keep='first'):
30393048
def nsmallest(self, n=5, keep='first'):
30403049
return self.apply(lambda x: x.nsmallest(n=n, keep=keep))
30413050

3051+
@Appender(Series.describe.__doc__)
3052+
def describe(self, **kwargs):
3053+
self._set_group_selection()
3054+
result = self.apply(lambda x: x.describe(**kwargs))
3055+
if self.axis == 1:
3056+
return result.T
3057+
return result.unstack()
3058+
30423059
def value_counts(self, normalize=False, sort=True, ascending=False,
30433060
bins=None, dropna=True):
30443061

pandas/tests/formats/test_format.py

+9-24
Original file line numberDiff line numberDiff line change
@@ -3544,30 +3544,15 @@ def test_to_latex_multiindex(self):
35443544
self.assertEqual(result, expected)
35453545

35463546
result = df.groupby('a').describe().to_latex()
3547-
expected = r"""\begin{tabular}{llr}
3548-
\toprule
3549-
& & c \\
3550-
a & {} & \\
3551-
\midrule
3552-
0 & count & 2.000000 \\
3553-
& mean & 1.500000 \\
3554-
& std & 0.707107 \\
3555-
& min & 1.000000 \\
3556-
& 25\% & 1.250000 \\
3557-
& 50\% & 1.500000 \\
3558-
& 75\% & 1.750000 \\
3559-
& max & 2.000000 \\
3560-
1 & count & 2.000000 \\
3561-
& mean & 3.500000 \\
3562-
& std & 0.707107 \\
3563-
& min & 3.000000 \\
3564-
& 25\% & 3.250000 \\
3565-
& 50\% & 3.500000 \\
3566-
& 75\% & 3.750000 \\
3567-
& max & 4.000000 \\
3568-
\bottomrule
3569-
\end{tabular}
3570-
"""
3547+
expected = ('\\begin{tabular}{lrrrrrrrr}\n\\toprule\n{} & c & '
3548+
' & & & & & & '
3549+
'\\\\\n{} & count & mean & std & min & 25\\% & '
3550+
'50\\% & 75\\% & max \\\\\na & & & '
3551+
' & & & & & \\\\\n\\midrule\n0 '
3552+
'& 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 '
3553+
'& 2.0 \\\\\n1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 '
3554+
'& 3.5 & 3.75 & 4.0 '
3555+
'\\\\\n\\bottomrule\n\\end{tabular}\n')
35713556

35723557
self.assertEqual(result, expected)
35733558

pandas/tests/groupby/test_categorical.py

+14-8
Original file line numberDiff line numberDiff line change
@@ -159,17 +159,20 @@ def test_groupby_categorical(self):
159159
exp_cats = Categorical(ord_labels, ordered=True,
160160
categories=['foo', 'bar', 'baz', 'qux'])
161161
expected = ord_data.groupby(exp_cats, sort=False).describe()
162-
expected.index.names = [None, None]
163162
assert_frame_equal(desc_result, expected)
164163

165164
# GH 10460
166165
expc = Categorical.from_codes(np.arange(4).repeat(8),
167166
levels, ordered=True)
168167
exp = CategoricalIndex(expc)
169-
self.assert_index_equal(desc_result.index.get_level_values(0), exp)
168+
self.assert_index_equal((desc_result.stack()
169+
.index
170+
.get_level_values(0)), exp)
170171
exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
171172
'75%', 'max'] * 4)
172-
self.assert_index_equal(desc_result.index.get_level_values(1), exp)
173+
self.assert_index_equal((desc_result.stack()
174+
.index
175+
.get_level_values(1)), exp)
173176

174177
def test_groupby_datetime_categorical(self):
175178
# GH9049: ensure backward compatibility
@@ -196,7 +199,6 @@ def test_groupby_datetime_categorical(self):
196199
ord_labels = cats.take_nd(idx)
197200
ord_data = data.take(idx)
198201
expected = ord_data.groupby(ord_labels).describe()
199-
expected.index.names = [None, None]
200202
assert_frame_equal(desc_result, expected)
201203
tm.assert_index_equal(desc_result.index, expected.index)
202204
tm.assert_index_equal(
@@ -207,10 +209,14 @@ def test_groupby_datetime_categorical(self):
207209
expc = Categorical.from_codes(
208210
np.arange(4).repeat(8), levels, ordered=True)
209211
exp = CategoricalIndex(expc)
210-
self.assert_index_equal(desc_result.index.get_level_values(0), exp)
212+
self.assert_index_equal((desc_result.stack()
213+
.index
214+
.get_level_values(0)), exp)
211215
exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
212216
'75%', 'max'] * 4)
213-
self.assert_index_equal(desc_result.index.get_level_values(1), exp)
217+
self.assert_index_equal((desc_result.stack()
218+
.index
219+
.get_level_values(1)), exp)
214220

215221
def test_groupby_categorical_index(self):
216222

@@ -246,8 +252,8 @@ def test_groupby_describe_categorical_columns(self):
246252
df = DataFrame(np.random.randn(20, 4), columns=cats)
247253
result = df.groupby([1, 2, 3, 4] * 5).describe()
248254

249-
tm.assert_index_equal(result.columns, cats)
250-
tm.assert_categorical_equal(result.columns.values, cats.values)
255+
tm.assert_index_equal(result.stack().columns, cats)
256+
tm.assert_categorical_equal(result.stack().columns.values, cats.values)
251257

252258
def test_groupby_unstack_categorical(self):
253259
# GH11558 (example is taken from the original issue)

pandas/tests/groupby/test_groupby.py

+52-22
Original file line numberDiff line numberDiff line change
@@ -1447,7 +1447,7 @@ def test_attr_wrapper(self):
14471447
for name, gp in grouped:
14481448
expected[name] = gp.describe()
14491449
expected = DataFrame(expected).T
1450-
assert_frame_equal(result.unstack(), expected)
1450+
assert_frame_equal(result, expected)
14511451

14521452
# get attribute
14531453
result = grouped.dtype
@@ -1459,7 +1459,7 @@ def test_attr_wrapper(self):
14591459
def test_series_describe_multikey(self):
14601460
ts = tm.makeTimeSeries()
14611461
grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
1462-
result = grouped.describe().unstack()
1462+
result = grouped.describe()
14631463
assert_series_equal(result['mean'], grouped.mean(), check_names=False)
14641464
assert_series_equal(result['std'], grouped.std(), check_names=False)
14651465
assert_series_equal(result['min'], grouped.min(), check_names=False)
@@ -1468,7 +1468,7 @@ def test_series_describe_single(self):
14681468
ts = tm.makeTimeSeries()
14691469
grouped = ts.groupby(lambda x: x.month)
14701470
result = grouped.apply(lambda x: x.describe())
1471-
expected = grouped.describe()
1471+
expected = grouped.describe().stack()
14721472
assert_series_equal(result, expected)
14731473

14741474
def test_series_index_name(self):
@@ -1479,17 +1479,27 @@ def test_series_index_name(self):
14791479
def test_frame_describe_multikey(self):
14801480
grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
14811481
result = grouped.describe()
1482-
1482+
desc_groups = []
14831483
for col in self.tsframe:
1484-
expected = grouped[col].describe()
1485-
assert_series_equal(result[col], expected, check_names=False)
1484+
group = grouped[col].describe()
1485+
group_col = pd.MultiIndex([[col] * len(group.columns),
1486+
group.columns],
1487+
[[0] * len(group.columns),
1488+
range(len(group.columns))])
1489+
group = pd.DataFrame(group.values,
1490+
columns=group_col,
1491+
index=group.index)
1492+
desc_groups.append(group)
1493+
expected = pd.concat(desc_groups, axis=1)
1494+
tm.assert_frame_equal(result, expected)
14861495

14871496
groupedT = self.tsframe.groupby({'A': 0, 'B': 0,
14881497
'C': 1, 'D': 1}, axis=1)
14891498
result = groupedT.describe()
1490-
1491-
for name, group in groupedT:
1492-
assert_frame_equal(result[name], group.describe())
1499+
expected = self.tsframe.describe().T
1500+
expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index],
1501+
[range(4), range(len(expected.index))])
1502+
tm.assert_frame_equal(result, expected)
14931503

14941504
def test_frame_describe_tupleindex(self):
14951505

@@ -1499,10 +1509,27 @@ def test_frame_describe_tupleindex(self):
14991509
'z': [100, 200, 300, 400, 500] * 3})
15001510
df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
15011511
df2 = df1.rename(columns={'k': 'key'})
1502-
result = df1.groupby('k').describe()
1503-
expected = df2.groupby('key').describe()
1504-
expected.index.set_names(result.index.names, inplace=True)
1505-
assert_frame_equal(result, expected)
1512+
tm.assertRaises(ValueError, lambda: df1.groupby('k').describe())
1513+
tm.assertRaises(ValueError, lambda: df2.groupby('key').describe())
1514+
1515+
def test_frame_describe_unstacked_format(self):
1516+
# GH 4792
1517+
prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990,
1518+
pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499,
1519+
pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499}
1520+
volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000,
1521+
pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000,
1522+
pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000}
1523+
df = pd.DataFrame({'PRICE': prices,
1524+
'VOLUME': volumes})
1525+
result = df.groupby('PRICE').VOLUME.describe()
1526+
data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
1527+
df[df.PRICE == 25499].VOLUME.describe().values.tolist()]
1528+
expected = pd.DataFrame(data,
1529+
index=pd.Index([24990, 25499], name='PRICE'),
1530+
columns=['count', 'mean', 'std', 'min',
1531+
'25%', '50%', '75%', 'max'])
1532+
tm.assert_frame_equal(result, expected)
15061533

15071534
def test_frame_groupby(self):
15081535
grouped = self.tsframe.groupby(lambda x: x.weekday())
@@ -2994,16 +3021,21 @@ def test_non_cython_api(self):
29943021
assert_frame_equal(result, expected)
29953022

29963023
# describe
2997-
expected = DataFrame(dict(B=concat(
2998-
[df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()],
2999-
keys=[1, 3])))
3000-
expected.index.names = ['A', None]
3024+
expected_index = pd.Index([1, 3], name='A')
3025+
expected_col = pd.MultiIndex(levels=[['B'],
3026+
['count', 'mean', 'std', 'min',
3027+
'25%', '50%', '75%', 'max']],
3028+
labels=[[0] * 8, list(range(8))])
3029+
expected = pd.DataFrame([[1.0, 2.0, nan, 2.0, 2.0, 2.0, 2.0, 2.0],
3030+
[0.0, nan, nan, nan, nan, nan, nan, nan]],
3031+
index=expected_index,
3032+
columns=expected_col)
30013033
result = g.describe()
30023034
assert_frame_equal(result, expected)
30033035

3004-
expected = concat(
3005-
[df.loc[[0, 1], ['A', 'B']].describe(),
3006-
df.loc[[2], ['A', 'B']].describe()], keys=[0, 1])
3036+
expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T,
3037+
df[df.A == 3].describe().unstack().to_frame().T])
3038+
expected.index = pd.Index([0, 1])
30073039
result = gni.describe()
30083040
assert_frame_equal(result, expected)
30093041

@@ -5157,7 +5189,6 @@ def test_groupby_whitelist(self):
51575189
'tail',
51585190
'cumcount',
51595191
'resample',
5160-
'describe',
51615192
'rank',
51625193
'quantile',
51635194
'fillna',
@@ -5194,7 +5225,6 @@ def test_groupby_whitelist(self):
51945225
'tail',
51955226
'cumcount',
51965227
'resample',
5197-
'describe',
51985228
'rank',
51995229
'quantile',
52005230
'fillna',

pandas/tests/test_generic.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -1269,10 +1269,10 @@ def test_describe_typefiltering_groupby(self):
12691269
'numD': np.arange(24.) + .5,
12701270
'ts': tm.makeTimeSeries()[:24].index})
12711271
G = df.groupby('catA')
1272-
self.assertTrue(G.describe(include=['number']).shape == (16, 2))
1273-
self.assertTrue(G.describe(include=['number', 'object']).shape == (22,
1274-
3))
1275-
self.assertTrue(G.describe(include='all').shape == (26, 4))
1272+
self.assertTrue(G.describe(include=['number']).shape == (2, 16))
1273+
self.assertTrue(G.describe(include=['number', 'object']).shape == (2,
1274+
33))
1275+
self.assertTrue(G.describe(include='all').shape == (2, 52))
12761276

12771277
def test_describe_multi_index_df_column_names(self):
12781278
""" Test that column names persist after the describe operation."""

0 commit comments

Comments
 (0)