diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 9eae2b7a33923..2300382a09940 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -356,6 +356,59 @@ New Behavior: In [11]: index.memory_usage(deep=True) Out[11]: 260 +.. _whatsnew_0200.api_breaking.groupby_describe: + +Groupby Describe Formatting +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The output formatting of ``groupby.describe()`` now labels the ``describe()`` metrics in the columns instead of the index. +This format is consistent with ``groupby.agg()`` when applying multiple functions at once. (:issue:`4792`) + +Previous Behavior: + +.. code-block:: ipython + + In [1]: df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]}) + + In [2]: df.groupby('A').describe() + Out[2]: + B + A + 1 count 2.000000 + mean 1.500000 + std 0.707107 + min 1.000000 + 25% 1.250000 + 50% 1.500000 + 75% 1.750000 + max 2.000000 + 2 count 2.000000 + mean 3.500000 + std 0.707107 + min 3.000000 + 25% 3.250000 + 50% 3.500000 + 75% 3.750000 + max 4.000000 + + In [3]: df.groupby('A').agg([np.mean, np.std, np.min, np.max]) + Out[3]: + B + mean std amin amax + A + 1 1.5 0.707107 1 2 + 2 3.5 0.707107 3 4 + +New Behavior: + +.. ipython:: python + + df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]}) + + df.groupby('A').describe() + + df.groupby('A').agg([np.mean, np.std, np.min, np.max]) + .. _whatsnew_0200.api: Other API Changes diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 99220232114ce..5ba3791de607b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -80,7 +80,6 @@ 'mean', 'sum', 'min', 'max', 'cumcount', 'resample', - 'describe', 'rank', 'quantile', 'fillna', 'mad', @@ -1138,6 +1137,16 @@ def ohlc(self): return self._apply_to_column_groupbys( lambda x: x._cython_agg_general('ohlc')) + @Appender(DataFrame.describe.__doc__) + @Substitution(name='groupby') + @Appender(_doc_template) + def describe(self, **kwargs): + self._set_group_selection() + result = self.apply(lambda x: x.describe(**kwargs)) + if self.axis == 1: + return result.T + return result.unstack() + @Substitution(name='groupby') @Appender(_doc_template) def resample(self, rule, *args, **kwargs): @@ -3039,6 +3048,14 @@ def nlargest(self, n=5, keep='first'): def nsmallest(self, n=5, keep='first'): return self.apply(lambda x: x.nsmallest(n=n, keep=keep)) + @Appender(Series.describe.__doc__) + def describe(self, **kwargs): + self._set_group_selection() + result = self.apply(lambda x: x.describe(**kwargs)) + if self.axis == 1: + return result.T + return result.unstack() + def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True): diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index a9553d9ea10cb..99cc70ae36f6b 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -3545,30 +3545,15 @@ def test_to_latex_multiindex(self): self.assertEqual(result, expected) result = df.groupby('a').describe().to_latex() - expected = r"""\begin{tabular}{llr} -\toprule - & & c \\ -a & {} & \\ -\midrule -0 & count & 2.000000 \\ - & mean & 1.500000 \\ - & std & 0.707107 \\ - & min & 1.000000 \\ - & 25\% & 1.250000 \\ - & 50\% & 1.500000 \\ - & 75\% & 1.750000 \\ - & max & 2.000000 \\ -1 & count & 2.000000 \\ - & mean & 3.500000 \\ - & std & 0.707107 \\ - & min & 3.000000 \\ - & 25\% & 3.250000 \\ - & 50\% & 3.500000 \\ - & 75\% & 3.750000 \\ - & max & 4.000000 \\ -\bottomrule -\end{tabular} -""" + expected = ('\\begin{tabular}{lrrrrrrrr}\n\\toprule\n{} & c & ' + ' & & & & & & ' + '\\\\\n{} & count & mean & std & min & 25\\% & ' + '50\\% & 75\\% & max \\\\\na & & & ' + ' & & & & & \\\\\n\\midrule\n0 ' + '& 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 ' + '& 2.0 \\\\\n1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 ' + '& 3.5 & 3.75 & 4.0 ' + '\\\\\n\\bottomrule\n\\end{tabular}\n') self.assertEqual(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8952b520f4f78..eebd0e0f490c1 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -107,17 +107,20 @@ def test_groupby_categorical(self): exp_cats = Categorical(ord_labels, ordered=True, categories=['foo', 'bar', 'baz', 'qux']) expected = ord_data.groupby(exp_cats, sort=False).describe() - expected.index.names = [None, None] assert_frame_equal(desc_result, expected) # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - self.assert_index_equal(desc_result.index.get_level_values(0), exp) + self.assert_index_equal((desc_result.stack() + .index + .get_level_values(0)), exp) exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] * 4) - self.assert_index_equal(desc_result.index.get_level_values(1), exp) + self.assert_index_equal((desc_result.stack() + .index + .get_level_values(1)), exp) def test_groupby_datetime_categorical(self): # GH9049: ensure backward compatibility @@ -144,7 +147,6 @@ def test_groupby_datetime_categorical(self): ord_labels = cats.take_nd(idx) ord_data = data.take(idx) expected = ord_data.groupby(ord_labels).describe() - expected.index.names = [None, None] assert_frame_equal(desc_result, expected) tm.assert_index_equal(desc_result.index, expected.index) tm.assert_index_equal( @@ -155,10 +157,14 @@ def test_groupby_datetime_categorical(self): expc = Categorical.from_codes( np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - self.assert_index_equal(desc_result.index.get_level_values(0), exp) + self.assert_index_equal((desc_result.stack() + .index + .get_level_values(0)), exp) exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] * 4) - self.assert_index_equal(desc_result.index.get_level_values(1), exp) + self.assert_index_equal((desc_result.stack() + .index + .get_level_values(1)), exp) def test_groupby_categorical_index(self): @@ -195,8 +201,8 @@ def test_groupby_describe_categorical_columns(self): df = DataFrame(np.random.randn(20, 4), columns=cats) result = df.groupby([1, 2, 3, 4] * 5).describe() - tm.assert_index_equal(result.columns, cats) - tm.assert_categorical_equal(result.columns.values, cats.values) + tm.assert_index_equal(result.stack().columns, cats) + tm.assert_categorical_equal(result.stack().columns.values, cats.values) def test_groupby_unstack_categorical(self): # GH11558 (example is taken from the original issue) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 458e869130190..63355d52fbd29 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1086,7 +1086,7 @@ def test_attr_wrapper(self): for name, gp in grouped: expected[name] = gp.describe() expected = DataFrame(expected).T - assert_frame_equal(result.unstack(), expected) + assert_frame_equal(result, expected) # get attribute result = grouped.dtype @@ -1098,7 +1098,7 @@ def test_attr_wrapper(self): def test_series_describe_multikey(self): ts = tm.makeTimeSeries() grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe().unstack() + result = grouped.describe() assert_series_equal(result['mean'], grouped.mean(), check_names=False) assert_series_equal(result['std'], grouped.std(), check_names=False) assert_series_equal(result['min'], grouped.min(), check_names=False) @@ -1107,7 +1107,7 @@ def test_series_describe_single(self): ts = tm.makeTimeSeries() grouped = ts.groupby(lambda x: x.month) result = grouped.apply(lambda x: x.describe()) - expected = grouped.describe() + expected = grouped.describe().stack() assert_series_equal(result, expected) def test_series_index_name(self): @@ -1118,17 +1118,27 @@ def test_series_index_name(self): def test_frame_describe_multikey(self): grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() - + desc_groups = [] for col in self.tsframe: - expected = grouped[col].describe() - assert_series_equal(result[col], expected, check_names=False) + group = grouped[col].describe() + group_col = pd.MultiIndex([[col] * len(group.columns), + group.columns], + [[0] * len(group.columns), + range(len(group.columns))]) + group = pd.DataFrame(group.values, + columns=group_col, + index=group.index) + desc_groups.append(group) + expected = pd.concat(desc_groups, axis=1) + tm.assert_frame_equal(result, expected) groupedT = self.tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, axis=1) result = groupedT.describe() - - for name, group in groupedT: - assert_frame_equal(result[name], group.describe()) + expected = self.tsframe.describe().T + expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index], + [range(4), range(len(expected.index))]) + tm.assert_frame_equal(result, expected) def test_frame_describe_tupleindex(self): @@ -1138,10 +1148,27 @@ def test_frame_describe_tupleindex(self): 'z': [100, 200, 300, 400, 500] * 3}) df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 df2 = df1.rename(columns={'k': 'key'}) - result = df1.groupby('k').describe() - expected = df2.groupby('key').describe() - expected.index.set_names(result.index.names, inplace=True) - assert_frame_equal(result, expected) + tm.assertRaises(ValueError, lambda: df1.groupby('k').describe()) + tm.assertRaises(ValueError, lambda: df2.groupby('key').describe()) + + def test_frame_describe_unstacked_format(self): + # GH 4792 + prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990, + pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499, + pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499} + volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, + pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, + pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000} + df = pd.DataFrame({'PRICE': prices, + 'VOLUME': volumes}) + result = df.groupby('PRICE').VOLUME.describe() + data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(), + df[df.PRICE == 25499].VOLUME.describe().values.tolist()] + expected = pd.DataFrame(data, + index=pd.Index([24990, 25499], name='PRICE'), + columns=['count', 'mean', 'std', 'min', + '25%', '50%', '75%', 'max']) + tm.assert_frame_equal(result, expected) def test_frame_groupby(self): grouped = self.tsframe.groupby(lambda x: x.weekday()) @@ -2546,16 +2573,21 @@ def test_non_cython_api(self): assert_frame_equal(result, expected) # describe - expected = DataFrame(dict(B=concat( - [df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()], - keys=[1, 3]))) - expected.index.names = ['A', None] + expected_index = pd.Index([1, 3], name='A') + expected_col = pd.MultiIndex(levels=[['B'], + ['count', 'mean', 'std', 'min', + '25%', '50%', '75%', 'max']], + labels=[[0] * 8, list(range(8))]) + expected = pd.DataFrame([[1.0, 2.0, nan, 2.0, 2.0, 2.0, 2.0, 2.0], + [0.0, nan, nan, nan, nan, nan, nan, nan]], + index=expected_index, + columns=expected_col) result = g.describe() assert_frame_equal(result, expected) - expected = concat( - [df.loc[[0, 1], ['A', 'B']].describe(), - df.loc[[2], ['A', 'B']].describe()], keys=[0, 1]) + expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T, + df[df.A == 3].describe().unstack().to_frame().T]) + expected.index = pd.Index([0, 1]) result = gni.describe() assert_frame_equal(result, expected) @@ -3873,7 +3905,6 @@ def test_groupby_whitelist(self): 'tail', 'cumcount', 'resample', - 'describe', 'rank', 'quantile', 'fillna', @@ -3910,7 +3941,6 @@ def test_groupby_whitelist(self): 'tail', 'cumcount', 'resample', - 'describe', 'rank', 'quantile', 'fillna', diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index bb341c26d454e..e84e2d6809e7b 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1267,10 +1267,10 @@ def test_describe_typefiltering_groupby(self): 'numD': np.arange(24.) + .5, 'ts': tm.makeTimeSeries()[:24].index}) G = df.groupby('catA') - self.assertTrue(G.describe(include=['number']).shape == (16, 2)) - self.assertTrue(G.describe(include=['number', 'object']).shape == (22, - 3)) - self.assertTrue(G.describe(include='all').shape == (26, 4)) + self.assertTrue(G.describe(include=['number']).shape == (2, 16)) + self.assertTrue(G.describe(include=['number', 'object']).shape == (2, + 33)) + self.assertTrue(G.describe(include='all').shape == (2, 52)) def test_describe_multi_index_df_column_names(self): """ Test that column names persist after the describe operation."""