Skip to content

Commit 4b5d367

Browse files
committed
TST: groupby.describe levels don't appear as column (#4792)
Restructure describe def; fix another test; refactor tests; linting & patch groupby tests; add whatsnew; fix docstring; fix more tests; add API example and documentation to describe.
1 parent df9fc4f commit 4b5d367

File tree

6 files changed

+159
-54
lines changed

6 files changed

+159
-54
lines changed

doc/source/whatsnew/v0.20.0.txt

+63
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,68 @@ New Behavior:
352352
In [11]: index.memory_usage(deep=True)
353353
Out[11]: 260
354354

355+
.. _whatsnew_0200.api_breaking.groupby_describe:
356+
357+
Groupby Describe Formatting
358+
^^^^^^^^^^^^^^^^^^^^^^^^^^^
359+
360+
The output formatting of ``groupby.describe()`` now labels the ``describe()`` metrics in the columns instead of the index.
361+
This format is consistent with ``groupby.ohlc()`` (:issue:`4792`).
362+
363+
Previous Behavior:
364+
365+
.. code-block:: ipython
366+
367+
In [1]: df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})
368+
369+
In [2]: df.groupby('A').B.describe()
370+
Out[2]:
371+
A
372+
1 count 2.000000
373+
mean 1.500000
374+
std 0.707107
375+
min 1.000000
376+
25% 1.250000
377+
50% 1.500000
378+
75% 1.750000
379+
max 2.000000
380+
2 count 2.000000
381+
mean 3.500000
382+
std 0.707107
383+
min 3.000000
384+
25% 3.250000
385+
50% 3.500000
386+
75% 3.750000
387+
max 4.000000
388+
Name: B, dtype: float64
389+
390+
In [3]: df.groupby('A').B.ohlc()
391+
Out[3]:
392+
open high low close
393+
A
394+
1 1 2 1 2
395+
2 3 4 3 4
396+
397+
New Behavior:
398+
399+
.. code-block:: ipython
400+
401+
In [1]: df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})
402+
403+
In [2]: df.groupby('A').B.describe()
404+
Out[2]:
405+
count mean std min 25% 50% 75% max
406+
A
407+
1 2.0 1.5 0.707107 1.0 1.25 1.5 1.75 2.0
408+
2 2.0 3.5 0.707107 3.0 3.25 3.5 3.75 4.0
409+
410+
In [3]: df.groupby('A').B.ohlc()
411+
Out[3]:
412+
open high low close
413+
A
414+
1 1 2 1 2
415+
2 3 4 3 4
416+
355417
.. _whatsnew_0200.api:
356418

357419
Other API Changes
@@ -366,6 +428,7 @@ Other API Changes
366428
- ``inplace`` arguments now require a boolean value; otherwise a ``ValueError`` is raised (:issue:`14189`)
367429
- ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype``
368430
- ``DataFrame.asof()`` will return a null-filled ``Series`` instead of the scalar ``NaN`` if a match is not found (:issue:`15118`)
431+
369432
.. _whatsnew_0200.deprecations:
370433

371434
Deprecations

pandas/core/groupby.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,6 @@
8080
'mean', 'sum', 'min', 'max',
8181
'cumcount',
8282
'resample',
83-
'describe',
8483
'rank', 'quantile',
8584
'fillna',
8685
'mad',
@@ -1138,6 +1137,15 @@ def ohlc(self):
11381137
return self._apply_to_column_groupbys(
11391138
lambda x: x._cython_agg_general('ohlc'))
11401139

1140+
@Appender(DataFrame.describe.__doc__)
1141+
@Substitution(name='groupby')
1142+
@Appender(_doc_template)
1143+
def describe(self, **kwargs):
1144+
result = self.apply(lambda x: x.describe(**kwargs))
1145+
if self.axis == 1:
1146+
return result.T
1147+
return result.unstack()
1148+
11411149
@Substitution(name='groupby')
11421150
@Appender(_doc_template)
11431151
def resample(self, rule, *args, **kwargs):
@@ -3039,6 +3047,13 @@ def nlargest(self, n=5, keep='first'):
30393047
def nsmallest(self, n=5, keep='first'):
30403048
return self.apply(lambda x: x.nsmallest(n=n, keep=keep))
30413049

3050+
@Appender(Series.describe.__doc__)
3051+
def describe(self, **kwargs):
3052+
result = self.apply(lambda x: x.describe(**kwargs))
3053+
if self.axis == 1:
3054+
return result.T
3055+
return result.unstack()
3056+
30423057
def value_counts(self, normalize=False, sort=True, ascending=False,
30433058
bins=None, dropna=True):
30443059

pandas/tests/formats/test_format.py

+6-19
Original file line numberDiff line numberDiff line change
@@ -3544,27 +3544,14 @@ def test_to_latex_multiindex(self):
35443544
self.assertEqual(result, expected)
35453545

35463546
result = df.groupby('a').describe().to_latex()
3547-
expected = r"""\begin{tabular}{llr}
3547+
expected = r"""\begin{tabular}{lrrrrrrrrrrrrrrrr}
35483548
\toprule
3549-
& & c \\
3550-
a & {} & \\
3549+
{} & a & & & & & & & & c & & & & & & & \\
3550+
{} & count & mean & std & min & 25\% & 50\% & 75\% & max & count & mean & std & min & 25\% & 50\% & 75\% & max \\
3551+
a & & & & & & & & & & & & & & & & \\
35513552
\midrule
3552-
0 & count & 2.000000 \\
3553-
& mean & 1.500000 \\
3554-
& std & 0.707107 \\
3555-
& min & 1.000000 \\
3556-
& 25\% & 1.250000 \\
3557-
& 50\% & 1.500000 \\
3558-
& 75\% & 1.750000 \\
3559-
& max & 2.000000 \\
3560-
1 & count & 2.000000 \\
3561-
& mean & 3.500000 \\
3562-
& std & 0.707107 \\
3563-
& min & 3.000000 \\
3564-
& 25\% & 3.250000 \\
3565-
& 50\% & 3.500000 \\
3566-
& 75\% & 3.750000 \\
3567-
& max & 4.000000 \\
3553+
0 & 2.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 & 2.0 \\
3554+
1 & 2.0 & 1.0 & 0.0 & 1.0 & 1.0 & 1.0 & 1.0 & 1.0 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 & 3.5 & 3.75 & 4.0 \\
35683555
\bottomrule
35693556
\end{tabular}
35703557
"""

pandas/tests/groupby/test_categorical.py

+14-8
Original file line numberDiff line numberDiff line change
@@ -159,17 +159,20 @@ def test_groupby_categorical(self):
159159
exp_cats = Categorical(ord_labels, ordered=True,
160160
categories=['foo', 'bar', 'baz', 'qux'])
161161
expected = ord_data.groupby(exp_cats, sort=False).describe()
162-
expected.index.names = [None, None]
163162
assert_frame_equal(desc_result, expected)
164163

165164
# GH 10460
166165
expc = Categorical.from_codes(np.arange(4).repeat(8),
167166
levels, ordered=True)
168167
exp = CategoricalIndex(expc)
169-
self.assert_index_equal(desc_result.index.get_level_values(0), exp)
168+
self.assert_index_equal((desc_result.stack()
169+
.index
170+
.get_level_values(0)), exp)
170171
exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
171172
'75%', 'max'] * 4)
172-
self.assert_index_equal(desc_result.index.get_level_values(1), exp)
173+
self.assert_index_equal((desc_result.stack()
174+
.index
175+
.get_level_values(1)), exp)
173176

174177
def test_groupby_datetime_categorical(self):
175178
# GH9049: ensure backward compatibility
@@ -196,7 +199,6 @@ def test_groupby_datetime_categorical(self):
196199
ord_labels = cats.take_nd(idx)
197200
ord_data = data.take(idx)
198201
expected = ord_data.groupby(ord_labels).describe()
199-
expected.index.names = [None, None]
200202
assert_frame_equal(desc_result, expected)
201203
tm.assert_index_equal(desc_result.index, expected.index)
202204
tm.assert_index_equal(
@@ -207,10 +209,14 @@ def test_groupby_datetime_categorical(self):
207209
expc = Categorical.from_codes(
208210
np.arange(4).repeat(8), levels, ordered=True)
209211
exp = CategoricalIndex(expc)
210-
self.assert_index_equal(desc_result.index.get_level_values(0), exp)
212+
self.assert_index_equal((desc_result.stack()
213+
.index
214+
.get_level_values(0)), exp)
211215
exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
212216
'75%', 'max'] * 4)
213-
self.assert_index_equal(desc_result.index.get_level_values(1), exp)
217+
self.assert_index_equal((desc_result.stack()
218+
.index
219+
.get_level_values(1)), exp)
214220

215221
def test_groupby_categorical_index(self):
216222

@@ -246,8 +252,8 @@ def test_groupby_describe_categorical_columns(self):
246252
df = DataFrame(np.random.randn(20, 4), columns=cats)
247253
result = df.groupby([1, 2, 3, 4] * 5).describe()
248254

249-
tm.assert_index_equal(result.columns, cats)
250-
tm.assert_categorical_equal(result.columns.values, cats.values)
255+
tm.assert_index_equal(result.stack().columns, cats)
256+
tm.assert_categorical_equal(result.stack().columns.values, cats.values)
251257

252258
def test_groupby_unstack_categorical(self):
253259
# GH11558 (example is taken from the original issue)

pandas/tests/groupby/test_groupby.py

+56-22
Original file line numberDiff line numberDiff line change
@@ -1447,7 +1447,7 @@ def test_attr_wrapper(self):
14471447
for name, gp in grouped:
14481448
expected[name] = gp.describe()
14491449
expected = DataFrame(expected).T
1450-
assert_frame_equal(result.unstack(), expected)
1450+
assert_frame_equal(result, expected)
14511451

14521452
# get attribute
14531453
result = grouped.dtype
@@ -1459,7 +1459,7 @@ def test_attr_wrapper(self):
14591459
def test_series_describe_multikey(self):
14601460
ts = tm.makeTimeSeries()
14611461
grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
1462-
result = grouped.describe().unstack()
1462+
result = grouped.describe()
14631463
assert_series_equal(result['mean'], grouped.mean(), check_names=False)
14641464
assert_series_equal(result['std'], grouped.std(), check_names=False)
14651465
assert_series_equal(result['min'], grouped.min(), check_names=False)
@@ -1468,7 +1468,7 @@ def test_series_describe_single(self):
14681468
ts = tm.makeTimeSeries()
14691469
grouped = ts.groupby(lambda x: x.month)
14701470
result = grouped.apply(lambda x: x.describe())
1471-
expected = grouped.describe()
1471+
expected = grouped.describe().stack()
14721472
assert_series_equal(result, expected)
14731473

14741474
def test_series_index_name(self):
@@ -1479,17 +1479,27 @@ def test_series_index_name(self):
14791479
def test_frame_describe_multikey(self):
14801480
grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
14811481
result = grouped.describe()
1482-
1482+
desc_groups = []
14831483
for col in self.tsframe:
1484-
expected = grouped[col].describe()
1485-
assert_series_equal(result[col], expected, check_names=False)
1484+
group = grouped[col].describe()
1485+
group_col = pd.MultiIndex([[col] * len(group.columns),
1486+
group.columns],
1487+
[[0] * len(group.columns),
1488+
range(len(group.columns))])
1489+
group = pd.DataFrame(group.values,
1490+
columns=group_col,
1491+
index=group.index)
1492+
desc_groups.append(group)
1493+
expected = pd.concat(desc_groups, axis=1)
1494+
tm.assert_frame_equal(result, expected)
14861495

14871496
groupedT = self.tsframe.groupby({'A': 0, 'B': 0,
14881497
'C': 1, 'D': 1}, axis=1)
14891498
result = groupedT.describe()
1490-
1491-
for name, group in groupedT:
1492-
assert_frame_equal(result[name], group.describe())
1499+
expected = self.tsframe.describe().T
1500+
expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index],
1501+
[range(4), range(len(expected.index))])
1502+
tm.assert_frame_equal(result, expected)
14931503

14941504
def test_frame_describe_tupleindex(self):
14951505

@@ -1499,10 +1509,27 @@ def test_frame_describe_tupleindex(self):
14991509
'z': [100, 200, 300, 400, 500] * 3})
15001510
df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
15011511
df2 = df1.rename(columns={'k': 'key'})
1502-
result = df1.groupby('k').describe()
1503-
expected = df2.groupby('key').describe()
1504-
expected.index.set_names(result.index.names, inplace=True)
1505-
assert_frame_equal(result, expected)
1512+
tm.assertRaises(ValueError, lambda: df1.groupby('k').describe())
1513+
tm.assertRaises(ValueError, lambda: df2.groupby('key').describe())
1514+
1515+
def test_frame_describe_multiindex_level_not_as_column(self):
1516+
# GH 4792
1517+
prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990,
1518+
pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499,
1519+
pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499}
1520+
volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000,
1521+
pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000,
1522+
pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000}
1523+
df = pd.DataFrame({'PRICE': prices,
1524+
'VOLUME': volumes})
1525+
result = df.groupby('PRICE').VOLUME.describe()
1526+
data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
1527+
df[df.PRICE == 25499].VOLUME.describe().values.tolist()]
1528+
expected = pd.DataFrame(data,
1529+
index=pd.Index([24990, 25499], name='PRICE'),
1530+
columns=['count', 'mean', 'std', 'min',
1531+
'25%', '50%', '75%', 'max'])
1532+
tm.assert_frame_equal(result, expected)
15061533

15071534
def test_frame_groupby(self):
15081535
grouped = self.tsframe.groupby(lambda x: x.weekday())
@@ -2994,16 +3021,25 @@ def test_non_cython_api(self):
29943021
assert_frame_equal(result, expected)
29953022

29963023
# describe
2997-
expected = DataFrame(dict(B=concat(
2998-
[df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()],
2999-
keys=[1, 3])))
3000-
expected.index.names = ['A', None]
3024+
expected = pd.concat([(df[df.A == 1].B
3025+
.describe()
3026+
.to_frame()
3027+
.unstack()
3028+
.to_frame()
3029+
.T),
3030+
(df[df.A == 3].B
3031+
.describe()
3032+
.to_frame()
3033+
.unstack()
3034+
.to_frame()
3035+
.T)])
3036+
expected.index = pd.Index([1, 3], name='A')
30013037
result = g.describe()
30023038
assert_frame_equal(result, expected)
30033039

3004-
expected = concat(
3005-
[df.loc[[0, 1], ['A', 'B']].describe(),
3006-
df.loc[[2], ['A', 'B']].describe()], keys=[0, 1])
3040+
expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T,
3041+
df[df.A == 3].describe().unstack().to_frame().T])
3042+
expected.index = pd.Index([0, 1])
30073043
result = gni.describe()
30083044
assert_frame_equal(result, expected)
30093045

@@ -5149,7 +5185,6 @@ def test_groupby_whitelist(self):
51495185
'tail',
51505186
'cumcount',
51515187
'resample',
5152-
'describe',
51535188
'rank',
51545189
'quantile',
51555190
'fillna',
@@ -5186,7 +5221,6 @@ def test_groupby_whitelist(self):
51865221
'tail',
51875222
'cumcount',
51885223
'resample',
5189-
'describe',
51905224
'rank',
51915225
'quantile',
51925226
'fillna',

pandas/tests/test_generic.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -1269,10 +1269,10 @@ def test_describe_typefiltering_groupby(self):
12691269
'numD': np.arange(24.) + .5,
12701270
'ts': tm.makeTimeSeries()[:24].index})
12711271
G = df.groupby('catA')
1272-
self.assertTrue(G.describe(include=['number']).shape == (16, 2))
1273-
self.assertTrue(G.describe(include=['number', 'object']).shape == (22,
1274-
3))
1275-
self.assertTrue(G.describe(include='all').shape == (26, 4))
1272+
self.assertTrue(G.describe(include=['number']).shape == (2, 16))
1273+
self.assertTrue(G.describe(include=['number', 'object']).shape == (2,
1274+
44))
1275+
self.assertTrue(G.describe(include='all').shape == (2, 65))
12761276

12771277
def test_describe_multi_index_df_column_names(self):
12781278
""" Test that column names persist after the describe operation."""

0 commit comments

Comments
 (0)