diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 9eae2b7a33923..2300382a09940 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -356,6 +356,59 @@ New Behavior:
    In [11]: index.memory_usage(deep=True)
    Out[11]: 260
 
+.. _whatsnew_0200.api_breaking.groupby_describe:
+
+Groupby Describe Formatting
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The output of ``groupby.describe()`` now places the ``describe()`` metrics (``count``, ``mean``, etc.) in the columns instead of the index, so each group occupies a single row.
+This format is consistent with ``groupby.agg()`` when applying multiple functions at once. (:issue:`4792`)
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [1]: df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})
+
+   In [2]: df.groupby('A').describe()
+   Out[2]: 
+                   B
+   A                
+   1 count  2.000000
+     mean   1.500000
+     std    0.707107
+     min    1.000000
+     25%    1.250000
+     50%    1.500000
+     75%    1.750000
+     max    2.000000
+   2 count  2.000000
+     mean   3.500000
+     std    0.707107
+     min    3.000000
+     25%    3.250000
+     50%    3.500000
+     75%    3.750000
+     max    4.000000
+
+   In [3]: df.groupby('A').agg([np.mean, np.std, np.min, np.max])
+   Out[3]: 
+        B                    
+     mean       std amin amax
+   A                         
+   1  1.5  0.707107    1    2
+   2  3.5  0.707107    3    4
+
+New Behavior:
+
+.. ipython:: python
+
+   df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})
+
+   df.groupby('A').describe()
+
+   df.groupby('A').agg([np.mean, np.std, np.min, np.max])
+
 .. _whatsnew_0200.api:
 
 Other API Changes
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 99220232114ce..5ba3791de607b 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -80,7 +80,6 @@
     'mean', 'sum', 'min', 'max',
     'cumcount',
     'resample',
-    'describe',
     'rank', 'quantile',
     'fillna',
     'mad',
@@ -1138,6 +1137,16 @@ def ohlc(self):
         return self._apply_to_column_groupbys(
             lambda x: x._cython_agg_general('ohlc'))
 
+    @Appender(DataFrame.describe.__doc__)
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def describe(self, **kwargs):  # kwargs are forwarded to x.describe()
+        self._set_group_selection()
+        result = self.apply(lambda x: x.describe(**kwargs))  # per-group stats
+        if self.axis == 1:
+            return result.T  # axis=1 grouping: transpose instead of unstack
+        return result.unstack()  # move inner index level into the columns
+
     @Substitution(name='groupby')
     @Appender(_doc_template)
     def resample(self, rule, *args, **kwargs):
@@ -3039,6 +3048,14 @@ def nlargest(self, n=5, keep='first'):
     def nsmallest(self, n=5, keep='first'):
         return self.apply(lambda x: x.nsmallest(n=n, keep=keep))
 
+    @Appender(Series.describe.__doc__)
+    def describe(self, **kwargs):  # kwargs are forwarded to x.describe()
+        self._set_group_selection()
+        result = self.apply(lambda x: x.describe(**kwargs))  # per-group stats
+        if self.axis == 1:
+            return result.T  # axis=1 grouping: transpose instead of unstack
+        return result.unstack()  # metric labels become the columns
+
     def value_counts(self, normalize=False, sort=True, ascending=False,
                      bins=None, dropna=True):
 
diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py
index a9553d9ea10cb..99cc70ae36f6b 100644
--- a/pandas/tests/formats/test_format.py
+++ b/pandas/tests/formats/test_format.py
@@ -3545,30 +3545,15 @@ def test_to_latex_multiindex(self):
         self.assertEqual(result, expected)
 
         result = df.groupby('a').describe().to_latex()
-        expected = r"""\begin{tabular}{llr}
-\toprule
-  &       &         c \\
-a & {} &           \\
-\midrule
-0 & count &  2.000000 \\
-  & mean &  1.500000 \\
-  & std &  0.707107 \\
-  & min &  1.000000 \\
-  & 25\% &  1.250000 \\
-  & 50\% &  1.500000 \\
-  & 75\% &  1.750000 \\
-  & max &  2.000000 \\
-1 & count &  2.000000 \\
-  & mean &  3.500000 \\
-  & std &  0.707107 \\
-  & min &  3.000000 \\
-  & 25\% &  3.250000 \\
-  & 50\% &  3.500000 \\
-  & 75\% &  3.750000 \\
-  & max &  4.000000 \\
-\bottomrule
-\end{tabular}
-"""
+        expected = ('\\begin{tabular}{lrrrrrrrr}\n\\toprule\n{} &     c &     '
+                    ' &           &      &       &      &       &      '
+                    '\\\\\n{} & count & mean &       std &  min &   25\\% &  '
+                    '50\\% &   75\\% &  max \\\\\na &       &      &          '
+                    ' &      &       &      &       &      \\\\\n\\midrule\n0 '
+                    '&   2.0 &  1.5 &  0.707107 &  1.0 &  1.25 &  1.5 &  1.75 '
+                    '&  2.0 \\\\\n1 &   2.0 &  3.5 &  0.707107 &  3.0 &  3.25 '
+                    '&  3.5 &  3.75 &  4.0 '
+                    '\\\\\n\\bottomrule\n\\end{tabular}\n')
 
         self.assertEqual(result, expected)
 
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 8952b520f4f78..eebd0e0f490c1 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -107,17 +107,20 @@ def test_groupby_categorical(self):
         exp_cats = Categorical(ord_labels, ordered=True,
                                categories=['foo', 'bar', 'baz', 'qux'])
         expected = ord_data.groupby(exp_cats, sort=False).describe()
-        expected.index.names = [None, None]
         assert_frame_equal(desc_result, expected)
 
         # GH 10460
         expc = Categorical.from_codes(np.arange(4).repeat(8),
                                       levels, ordered=True)
         exp = CategoricalIndex(expc)
-        self.assert_index_equal(desc_result.index.get_level_values(0), exp)
+        self.assert_index_equal((desc_result.stack()
+                                            .index
+                                            .get_level_values(0)), exp)
         exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                      '75%', 'max'] * 4)
-        self.assert_index_equal(desc_result.index.get_level_values(1), exp)
+        self.assert_index_equal((desc_result.stack()
+                                            .index
+                                            .get_level_values(1)), exp)
 
     def test_groupby_datetime_categorical(self):
         # GH9049: ensure backward compatibility
@@ -144,7 +147,6 @@ def test_groupby_datetime_categorical(self):
         ord_labels = cats.take_nd(idx)
         ord_data = data.take(idx)
         expected = ord_data.groupby(ord_labels).describe()
-        expected.index.names = [None, None]
         assert_frame_equal(desc_result, expected)
         tm.assert_index_equal(desc_result.index, expected.index)
         tm.assert_index_equal(
@@ -155,10 +157,14 @@ def test_groupby_datetime_categorical(self):
         expc = Categorical.from_codes(
             np.arange(4).repeat(8), levels, ordered=True)
         exp = CategoricalIndex(expc)
-        self.assert_index_equal(desc_result.index.get_level_values(0), exp)
+        self.assert_index_equal((desc_result.stack()
+                                            .index
+                                            .get_level_values(0)), exp)
         exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                      '75%', 'max'] * 4)
-        self.assert_index_equal(desc_result.index.get_level_values(1), exp)
+        self.assert_index_equal((desc_result.stack()
+                                            .index
+                                            .get_level_values(1)), exp)
 
     def test_groupby_categorical_index(self):
 
@@ -195,8 +201,8 @@ def test_groupby_describe_categorical_columns(self):
         df = DataFrame(np.random.randn(20, 4), columns=cats)
         result = df.groupby([1, 2, 3, 4] * 5).describe()
 
-        tm.assert_index_equal(result.columns, cats)
-        tm.assert_categorical_equal(result.columns.values, cats.values)
+        tm.assert_index_equal(result.stack().columns, cats)
+        tm.assert_categorical_equal(result.stack().columns.values, cats.values)
 
     def test_groupby_unstack_categorical(self):
         # GH11558 (example is taken from the original issue)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 458e869130190..63355d52fbd29 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -1086,7 +1086,7 @@ def test_attr_wrapper(self):
         for name, gp in grouped:
             expected[name] = gp.describe()
         expected = DataFrame(expected).T
-        assert_frame_equal(result.unstack(), expected)
+        assert_frame_equal(result, expected)
 
         # get attribute
         result = grouped.dtype
@@ -1098,7 +1098,7 @@ def test_attr_wrapper(self):
     def test_series_describe_multikey(self):
         ts = tm.makeTimeSeries()
         grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
-        result = grouped.describe().unstack()
+        result = grouped.describe()
         assert_series_equal(result['mean'], grouped.mean(), check_names=False)
         assert_series_equal(result['std'], grouped.std(), check_names=False)
         assert_series_equal(result['min'], grouped.min(), check_names=False)
@@ -1107,7 +1107,7 @@ def test_series_describe_single(self):
         ts = tm.makeTimeSeries()
         grouped = ts.groupby(lambda x: x.month)
         result = grouped.apply(lambda x: x.describe())
-        expected = grouped.describe()
+        expected = grouped.describe().stack()
         assert_series_equal(result, expected)
 
     def test_series_index_name(self):
@@ -1118,17 +1118,27 @@ def test_series_index_name(self):
     def test_frame_describe_multikey(self):
         grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
         result = grouped.describe()
-
+        desc_groups = []
         for col in self.tsframe:
-            expected = grouped[col].describe()
-            assert_series_equal(result[col], expected, check_names=False)
+            group = grouped[col].describe()
+            group_col = pd.MultiIndex([[col] * len(group.columns),
+                                       group.columns],
+                                      [[0] * len(group.columns),
+                                       range(len(group.columns))])
+            group = pd.DataFrame(group.values,
+                                 columns=group_col,
+                                 index=group.index)
+            desc_groups.append(group)
+        expected = pd.concat(desc_groups, axis=1)
+        tm.assert_frame_equal(result, expected)
 
         groupedT = self.tsframe.groupby({'A': 0, 'B': 0,
                                          'C': 1, 'D': 1}, axis=1)
         result = groupedT.describe()
-
-        for name, group in groupedT:
-            assert_frame_equal(result[name], group.describe())
+        expected = self.tsframe.describe().T
+        expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index],
+                                       [range(4), range(len(expected.index))])
+        tm.assert_frame_equal(result, expected)
 
     def test_frame_describe_tupleindex(self):
 
@@ -1138,10 +1148,27 @@ def test_frame_describe_tupleindex(self):
                          'z': [100, 200, 300, 400, 500] * 3})
         df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
         df2 = df1.rename(columns={'k': 'key'})
-        result = df1.groupby('k').describe()
-        expected = df2.groupby('key').describe()
-        expected.index.set_names(result.index.names, inplace=True)
-        assert_frame_equal(result, expected)
+        tm.assertRaises(ValueError, lambda: df1.groupby('k').describe())
+        tm.assertRaises(ValueError, lambda: df2.groupby('key').describe())
+
+    def test_frame_describe_unstacked_format(self):
+        # GH 4792
+        prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990,
+                  pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499,
+                  pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499}
+        volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000,
+                   pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000,
+                   pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000}
+        df = pd.DataFrame({'PRICE': prices,
+                           'VOLUME': volumes})
+        result = df.groupby('PRICE').VOLUME.describe()  # one row per PRICE
+        data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
+                df[df.PRICE == 25499].VOLUME.describe().values.tolist()]
+        expected = pd.DataFrame(data,  # built without groupby
+                                index=pd.Index([24990, 25499], name='PRICE'),
+                                columns=['count', 'mean', 'std', 'min',
+                                         '25%', '50%', '75%', 'max'])
+        tm.assert_frame_equal(result, expected)  # metrics are the columns
 
     def test_frame_groupby(self):
         grouped = self.tsframe.groupby(lambda x: x.weekday())
@@ -2546,16 +2573,21 @@ def test_non_cython_api(self):
         assert_frame_equal(result, expected)
 
         # describe
-        expected = DataFrame(dict(B=concat(
-            [df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()],
-            keys=[1, 3])))
-        expected.index.names = ['A', None]
+        expected_index = pd.Index([1, 3], name='A')
+        expected_col = pd.MultiIndex(levels=[['B'],
+                                             ['count', 'mean', 'std', 'min',
+                                              '25%', '50%', '75%', 'max']],
+                                     labels=[[0] * 8, list(range(8))])
+        expected = pd.DataFrame([[1.0, 2.0, nan, 2.0, 2.0, 2.0, 2.0, 2.0],
+                                 [0.0, nan, nan, nan, nan, nan, nan, nan]],
+                                index=expected_index,
+                                columns=expected_col)
         result = g.describe()
         assert_frame_equal(result, expected)
 
-        expected = concat(
-            [df.loc[[0, 1], ['A', 'B']].describe(),
-             df.loc[[2], ['A', 'B']].describe()], keys=[0, 1])
+        expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T,
+                              df[df.A == 3].describe().unstack().to_frame().T])
+        expected.index = pd.Index([0, 1])
         result = gni.describe()
         assert_frame_equal(result, expected)
 
@@ -3873,7 +3905,6 @@ def test_groupby_whitelist(self):
             'tail',
             'cumcount',
             'resample',
-            'describe',
             'rank',
             'quantile',
             'fillna',
@@ -3910,7 +3941,6 @@ def test_groupby_whitelist(self):
             'tail',
             'cumcount',
             'resample',
-            'describe',
             'rank',
             'quantile',
             'fillna',
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
index bb341c26d454e..e84e2d6809e7b 100644
--- a/pandas/tests/test_generic.py
+++ b/pandas/tests/test_generic.py
@@ -1267,10 +1267,10 @@ def test_describe_typefiltering_groupby(self):
                         'numD': np.arange(24.) + .5,
                         'ts': tm.makeTimeSeries()[:24].index})
         G = df.groupby('catA')
-        self.assertTrue(G.describe(include=['number']).shape == (16, 2))
-        self.assertTrue(G.describe(include=['number', 'object']).shape == (22,
-                                                                           3))
-        self.assertTrue(G.describe(include='all').shape == (26, 4))
+        self.assertTrue(G.describe(include=['number']).shape == (2, 16))
+        self.assertTrue(G.describe(include=['number', 'object']).shape == (2,
+                                                                           33))
+        self.assertTrue(G.describe(include='all').shape == (2, 52))
 
     def test_describe_multi_index_df_column_names(self):
         """ Test that column names persist after the describe operation."""