Skip to content

API: Reformat output of groupby.describe (#4792) #15260

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,59 @@ New Behavior:
In [11]: index.memory_usage(deep=True)
Out[11]: 260

.. _whatsnew_0200.api_breaking.groupby_describe:

Groupby Describe Formatting
^^^^^^^^^^^^^^^^^^^^^^^^^^^

The output formatting of ``groupby.describe()`` now labels the ``describe()`` metrics in the columns instead of the index.
This format is consistent with ``groupby.agg()`` when applying multiple functions at once. (:issue:`4792`)

Previous Behavior:

.. code-block:: ipython

   In [1]: df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})

   In [2]: df.groupby('A').describe()
   Out[2]:
                   B
   A
   1 count  2.000000
     mean   1.500000
     std    0.707107
     min    1.000000
     25%    1.250000
     50%    1.500000
     75%    1.750000
     max    2.000000
   2 count  2.000000
     mean   3.500000
     std    0.707107
     min    3.000000
     25%    3.250000
     50%    3.500000
     75%    3.750000
     max    4.000000

   In [3]: df.groupby('A').agg([np.mean, np.std, np.min, np.max])
   Out[3]:
        B
     mean       std amin amax
   A
   1  1.5  0.707107    1    2
   2  3.5  0.707107    3    4

New Behavior:

.. ipython:: python

   df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})

   df.groupby('A').describe()

   df.groupby('A').agg([np.mean, np.std, np.min, np.max])

.. _whatsnew_0200.api:

Other API Changes
Expand Down
19 changes: 18 additions & 1 deletion pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@
'mean', 'sum', 'min', 'max',
'cumcount',
'resample',
'describe',
'rank', 'quantile',
'fillna',
'mad',
Expand Down Expand Up @@ -1138,6 +1137,16 @@ def ohlc(self):
return self._apply_to_column_groupbys(
lambda x: x._cython_agg_general('ohlc'))

@Appender(DataFrame.describe.__doc__)
@Substitution(name='groupby')
@Appender(_doc_template)
def describe(self, **kwargs):
    # NOTE: deliberately no docstring here -- the decorators above assemble
    # ``__doc__`` from ``_doc_template`` and ``DataFrame.describe.__doc__``.

    # Set the group selection before applying (helper defined elsewhere
    # in this file).
    self._set_group_selection()

    # Run ``describe`` on every group, forwarding the caller's keyword
    # arguments unchanged.
    described = self.apply(lambda group: group.describe(**kwargs))

    # GH 4792: for axis=1 grouping the applied result is transposed;
    # otherwise the innermost index level (the statistic labels) is
    # unstacked into the columns.
    return described.T if self.axis == 1 else described.unstack()

@Substitution(name='groupby')
@Appender(_doc_template)
def resample(self, rule, *args, **kwargs):
Expand Down Expand Up @@ -3039,6 +3048,14 @@ def nlargest(self, n=5, keep='first'):
def nsmallest(self, n=5, keep='first'):
return self.apply(lambda x: x.nsmallest(n=n, keep=keep))

@Appender(Series.describe.__doc__)
def describe(self, **kwargs):
    # NOTE: deliberately no docstring here -- ``Appender`` supplies the
    # documentation from ``Series.describe.__doc__`` at decoration time.

    # Set the group selection before applying (helper defined elsewhere
    # in this file).
    self._set_group_selection()

    # Describe each group individually, passing the keyword arguments through.
    per_group = self.apply(lambda grp: grp.describe(**kwargs))

    # axis=1 grouping: hand back the transposed result as-is.
    if self.axis == 1:
        return per_group.T

    # GH 4792: unstack the innermost index level (the statistic labels)
    # into the columns.
    return per_group.unstack()

def value_counts(self, normalize=False, sort=True, ascending=False,
bins=None, dropna=True):

Expand Down
33 changes: 9 additions & 24 deletions pandas/tests/formats/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -3545,30 +3545,15 @@ def test_to_latex_multiindex(self):
self.assertEqual(result, expected)

result = df.groupby('a').describe().to_latex()
expected = r"""\begin{tabular}{llr}
\toprule
& & c \\
a & {} & \\
\midrule
0 & count & 2.000000 \\
& mean & 1.500000 \\
& std & 0.707107 \\
& min & 1.000000 \\
& 25\% & 1.250000 \\
& 50\% & 1.500000 \\
& 75\% & 1.750000 \\
& max & 2.000000 \\
1 & count & 2.000000 \\
& mean & 3.500000 \\
& std & 0.707107 \\
& min & 3.000000 \\
& 25\% & 3.250000 \\
& 50\% & 3.500000 \\
& 75\% & 3.750000 \\
& max & 4.000000 \\
\bottomrule
\end{tabular}
"""
expected = ('\\begin{tabular}{lrrrrrrrr}\n\\toprule\n{} & c & '
' & & & & & & '
'\\\\\n{} & count & mean & std & min & 25\\% & '
'50\\% & 75\\% & max \\\\\na & & & '
' & & & & & \\\\\n\\midrule\n0 '
'& 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 '
'& 2.0 \\\\\n1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 '
'& 3.5 & 3.75 & 4.0 '
'\\\\\n\\bottomrule\n\\end{tabular}\n')

self.assertEqual(result, expected)

Expand Down
22 changes: 14 additions & 8 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,17 +107,20 @@ def test_groupby_categorical(self):
exp_cats = Categorical(ord_labels, ordered=True,
categories=['foo', 'bar', 'baz', 'qux'])
expected = ord_data.groupby(exp_cats, sort=False).describe()
expected.index.names = [None, None]
assert_frame_equal(desc_result, expected)

# GH 10460
expc = Categorical.from_codes(np.arange(4).repeat(8),
levels, ordered=True)
exp = CategoricalIndex(expc)
self.assert_index_equal(desc_result.index.get_level_values(0), exp)
self.assert_index_equal((desc_result.stack()
.index
.get_level_values(0)), exp)
exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
'75%', 'max'] * 4)
self.assert_index_equal(desc_result.index.get_level_values(1), exp)
self.assert_index_equal((desc_result.stack()
.index
.get_level_values(1)), exp)

def test_groupby_datetime_categorical(self):
# GH9049: ensure backward compatibility
Expand All @@ -144,7 +147,6 @@ def test_groupby_datetime_categorical(self):
ord_labels = cats.take_nd(idx)
ord_data = data.take(idx)
expected = ord_data.groupby(ord_labels).describe()
expected.index.names = [None, None]
assert_frame_equal(desc_result, expected)
tm.assert_index_equal(desc_result.index, expected.index)
tm.assert_index_equal(
Expand All @@ -155,10 +157,14 @@ def test_groupby_datetime_categorical(self):
expc = Categorical.from_codes(
np.arange(4).repeat(8), levels, ordered=True)
exp = CategoricalIndex(expc)
self.assert_index_equal(desc_result.index.get_level_values(0), exp)
self.assert_index_equal((desc_result.stack()
.index
.get_level_values(0)), exp)
exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
'75%', 'max'] * 4)
self.assert_index_equal(desc_result.index.get_level_values(1), exp)
self.assert_index_equal((desc_result.stack()
.index
.get_level_values(1)), exp)

def test_groupby_categorical_index(self):

Expand Down Expand Up @@ -195,8 +201,8 @@ def test_groupby_describe_categorical_columns(self):
df = DataFrame(np.random.randn(20, 4), columns=cats)
result = df.groupby([1, 2, 3, 4] * 5).describe()

tm.assert_index_equal(result.columns, cats)
tm.assert_categorical_equal(result.columns.values, cats.values)
tm.assert_index_equal(result.stack().columns, cats)
tm.assert_categorical_equal(result.stack().columns.values, cats.values)

def test_groupby_unstack_categorical(self):
# GH11558 (example is taken from the original issue)
Expand Down
74 changes: 52 additions & 22 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1086,7 +1086,7 @@ def test_attr_wrapper(self):
for name, gp in grouped:
expected[name] = gp.describe()
expected = DataFrame(expected).T
assert_frame_equal(result.unstack(), expected)
assert_frame_equal(result, expected)

# get attribute
result = grouped.dtype
Expand All @@ -1098,7 +1098,7 @@ def test_attr_wrapper(self):
def test_series_describe_multikey(self):
ts = tm.makeTimeSeries()
grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
result = grouped.describe().unstack()
result = grouped.describe()
assert_series_equal(result['mean'], grouped.mean(), check_names=False)
assert_series_equal(result['std'], grouped.std(), check_names=False)
assert_series_equal(result['min'], grouped.min(), check_names=False)
Expand All @@ -1107,7 +1107,7 @@ def test_series_describe_single(self):
ts = tm.makeTimeSeries()
grouped = ts.groupby(lambda x: x.month)
result = grouped.apply(lambda x: x.describe())
expected = grouped.describe()
expected = grouped.describe().stack()
assert_series_equal(result, expected)

def test_series_index_name(self):
Expand All @@ -1118,17 +1118,27 @@ def test_series_index_name(self):
def test_frame_describe_multikey(self):
grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
result = grouped.describe()

desc_groups = []
for col in self.tsframe:
expected = grouped[col].describe()
assert_series_equal(result[col], expected, check_names=False)
group = grouped[col].describe()
group_col = pd.MultiIndex([[col] * len(group.columns),
group.columns],
[[0] * len(group.columns),
range(len(group.columns))])
group = pd.DataFrame(group.values,
columns=group_col,
index=group.index)
desc_groups.append(group)
expected = pd.concat(desc_groups, axis=1)
tm.assert_frame_equal(result, expected)

groupedT = self.tsframe.groupby({'A': 0, 'B': 0,
'C': 1, 'D': 1}, axis=1)
result = groupedT.describe()

for name, group in groupedT:
assert_frame_equal(result[name], group.describe())
expected = self.tsframe.describe().T
expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index],
[range(4), range(len(expected.index))])
tm.assert_frame_equal(result, expected)

def test_frame_describe_tupleindex(self):

Expand All @@ -1138,10 +1148,27 @@ def test_frame_describe_tupleindex(self):
'z': [100, 200, 300, 400, 500] * 3})
df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
df2 = df1.rename(columns={'k': 'key'})
result = df1.groupby('k').describe()
expected = df2.groupby('key').describe()
expected.index.set_names(result.index.names, inplace=True)
assert_frame_equal(result, expected)
tm.assertRaises(ValueError, lambda: df1.groupby('k').describe())
tm.assertRaises(ValueError, lambda: df2.groupby('key').describe())

def test_frame_describe_unstacked_format(self):
# GH 4792
prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990,
pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499,
pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499}
volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000,
pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000,
pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000}
df = pd.DataFrame({'PRICE': prices,
'VOLUME': volumes})
result = df.groupby('PRICE').VOLUME.describe()
data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
df[df.PRICE == 25499].VOLUME.describe().values.tolist()]
expected = pd.DataFrame(data,
index=pd.Index([24990, 25499], name='PRICE'),
columns=['count', 'mean', 'std', 'min',
'25%', '50%', '75%', 'max'])
tm.assert_frame_equal(result, expected)

def test_frame_groupby(self):
grouped = self.tsframe.groupby(lambda x: x.weekday())
Expand Down Expand Up @@ -2546,16 +2573,21 @@ def test_non_cython_api(self):
assert_frame_equal(result, expected)

# describe
expected = DataFrame(dict(B=concat(
[df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()],
keys=[1, 3])))
expected.index.names = ['A', None]
expected_index = pd.Index([1, 3], name='A')
expected_col = pd.MultiIndex(levels=[['B'],
['count', 'mean', 'std', 'min',
'25%', '50%', '75%', 'max']],
labels=[[0] * 8, list(range(8))])
expected = pd.DataFrame([[1.0, 2.0, nan, 2.0, 2.0, 2.0, 2.0, 2.0],
[0.0, nan, nan, nan, nan, nan, nan, nan]],
index=expected_index,
columns=expected_col)
result = g.describe()
assert_frame_equal(result, expected)

expected = concat(
[df.loc[[0, 1], ['A', 'B']].describe(),
df.loc[[2], ['A', 'B']].describe()], keys=[0, 1])
expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T,
df[df.A == 3].describe().unstack().to_frame().T])
expected.index = pd.Index([0, 1])
result = gni.describe()
assert_frame_equal(result, expected)

Expand Down Expand Up @@ -3873,7 +3905,6 @@ def test_groupby_whitelist(self):
'tail',
'cumcount',
'resample',
'describe',
'rank',
'quantile',
'fillna',
Expand Down Expand Up @@ -3910,7 +3941,6 @@ def test_groupby_whitelist(self):
'tail',
'cumcount',
'resample',
'describe',
'rank',
'quantile',
'fillna',
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1267,10 +1267,10 @@ def test_describe_typefiltering_groupby(self):
'numD': np.arange(24.) + .5,
'ts': tm.makeTimeSeries()[:24].index})
G = df.groupby('catA')
self.assertTrue(G.describe(include=['number']).shape == (16, 2))
self.assertTrue(G.describe(include=['number', 'object']).shape == (22,
3))
self.assertTrue(G.describe(include='all').shape == (26, 4))
self.assertTrue(G.describe(include=['number']).shape == (2, 16))
self.assertTrue(G.describe(include=['number', 'object']).shape == (2,
33))
self.assertTrue(G.describe(include='all').shape == (2, 52))

def test_describe_multi_index_df_column_names(self):
""" Test that column names persist after the describe operation."""
Expand Down