TST: groupby.describe levels don't appear as column (#4792)

mroeschke · mroeschke · commit 4b5d367f128e · 2017-02-02T22:52:37.000-08:00
Restructure describe def

Fix another test

Refactoring tests

linting & patch groupby tests

add whatsnew

fix docstring

fix more tests

Added api example and documentation to describe
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -352,6 +352,68 @@ New Behavior:
    In [11]: index.memory_usage(deep=True)
    Out[11]: 260
 
+.. _whatsnew_0200.api_breaking.groupby_describe:
+
+Groupby Describe Formatting
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The output formatting of ``groupby.describe()`` now labels the ``describe()`` metrics in the columns instead of the index.
+This format is consistent with ``groupby.ohlc()`` (:issue:`4792`).
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [1]: df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})
+
+   In [2]: df.groupby('A').B.describe()
+   Out[2]: 
+   A       
+   1  count    2.000000
+      mean     1.500000
+      std      0.707107
+      min      1.000000
+      25%      1.250000
+      50%      1.500000
+      75%      1.750000
+      max      2.000000
+   2  count    2.000000
+      mean     3.500000
+      std      0.707107
+      min      3.000000
+      25%      3.250000
+      50%      3.500000
+      75%      3.750000
+      max      4.000000
+   Name: B, dtype: float64
+
+   In [3]: df.groupby('A').B.ohlc()
+   Out[3]: 
+      open  high  low  close
+   A                        
+   1     1     2    1      2
+   2     3     4    3      4
+
+New Behavior:
+
+.. code-block:: ipython
+
+   In [1]: df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]})
+
+   In [2]: df.groupby('A').B.describe()
+   Out[2]: 
+      count  mean       std  min   25%  50%   75%  max
+   A                                                  
+   1    2.0   1.5  0.707107  1.0  1.25  1.5  1.75  2.0
+   2    2.0   3.5  0.707107  3.0  3.25  3.5  3.75  4.0
+
+   In [3]: df.groupby('A').B.ohlc()
+   Out[3]: 
+      open  high  low  close
+   A                        
+   1     1     2    1      2
+   2     3     4    3      4
+
 .. _whatsnew_0200.api:
 
 Other API Changes
@@ -366,6 +428,7 @@ Other API Changes
 - ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`)
 - ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype``
  - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`)
+
 .. _whatsnew_0200.deprecations:
 
 Deprecations
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -80,7 +80,6 @@
     'mean', 'sum', 'min', 'max',
     'cumcount',
     'resample',
-    'describe',
     'rank', 'quantile',
     'fillna',
     'mad',
@@ -1138,6 +1137,15 @@ def ohlc(self):
         return self._apply_to_column_groupbys(
             lambda x: x._cython_agg_general('ohlc'))
 
+    @Appender(DataFrame.describe.__doc__)
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def describe(self, **kwargs):
+        result = self.apply(lambda x: x.describe(**kwargs))
+        if self.axis == 1:
+            return result.T
+        return result.unstack()
+
     @Substitution(name='groupby')
     @Appender(_doc_template)
     def resample(self, rule, *args, **kwargs):
@@ -3039,6 +3047,13 @@ def nlargest(self, n=5, keep='first'):
     def nsmallest(self, n=5, keep='first'):
         return self.apply(lambda x: x.nsmallest(n=n, keep=keep))
 
+    @Appender(Series.describe.__doc__)
+    def describe(self, **kwargs):
+        result = self.apply(lambda x: x.describe(**kwargs))
+        if self.axis == 1:
+            return result.T
+        return result.unstack()
+
     def value_counts(self, normalize=False, sort=True, ascending=False,
                      bins=None, dropna=True):
 
diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py
@@ -3544,27 +3544,14 @@ def test_to_latex_multiindex(self):
         self.assertEqual(result, expected)
 
         result = df.groupby('a').describe().to_latex()
-        expected = r"""\begin{tabular}{llr}
+        expected = r"""\begin{tabular}{lrrrrrrrrrrrrrrrr}
 \toprule
-  &       &         c \\
-a & {} &           \\
+{} &     a &      &      &      &      &      &      &      &     c &      &           &      &       &      &       &      \\
+{} & count & mean &  std &  min &  25\% &  50\% &  75\% &  max & count & mean &       std &  min &   25\% &  50\% &   75\% &  max \\
+a &       &      &      &      &      &      &      &      &       &      &           &      &       &      &       &      \\
 \midrule
-0 & count &  2.000000 \\
-  & mean &  1.500000 \\
-  & std &  0.707107 \\
-  & min &  1.000000 \\
-  & 25\% &  1.250000 \\
-  & 50\% &  1.500000 \\
-  & 75\% &  1.750000 \\
-  & max &  2.000000 \\
-1 & count &  2.000000 \\
-  & mean &  3.500000 \\
-  & std &  0.707107 \\
-  & min &  3.000000 \\
-  & 25\% &  3.250000 \\
-  & 50\% &  3.500000 \\
-  & 75\% &  3.750000 \\
-  & max &  4.000000 \\
+0 &   2.0 &  0.0 &  0.0 &  0.0 &  0.0 &  0.0 &  0.0 &  0.0 &   2.0 &  1.5 &  0.707107 &  1.0 &  1.25 &  1.5 &  1.75 &  2.0 \\
+1 &   2.0 &  1.0 &  0.0 &  1.0 &  1.0 &  1.0 &  1.0 &  1.0 &   2.0 &  3.5 &  0.707107 &  3.0 &  3.25 &  3.5 &  3.75 &  4.0 \\
 \bottomrule
 \end{tabular}
 """
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -159,17 +159,20 @@ def test_groupby_categorical(self):
         exp_cats = Categorical(ord_labels, ordered=True,
                                categories=['foo', 'bar', 'baz', 'qux'])
         expected = ord_data.groupby(exp_cats, sort=False).describe()
-        expected.index.names = [None, None]
         assert_frame_equal(desc_result, expected)
 
         # GH 10460
         expc = Categorical.from_codes(np.arange(4).repeat(8),
                                       levels, ordered=True)
         exp = CategoricalIndex(expc)
-        self.assert_index_equal(desc_result.index.get_level_values(0), exp)
+        self.assert_index_equal((desc_result.stack()
+                                            .index
+                                            .get_level_values(0)), exp)
         exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                      '75%', 'max'] * 4)
-        self.assert_index_equal(desc_result.index.get_level_values(1), exp)
+        self.assert_index_equal((desc_result.stack()
+                                            .index
+                                            .get_level_values(1)), exp)
 
     def test_groupby_datetime_categorical(self):
         # GH9049: ensure backward compatibility
@@ -196,7 +199,6 @@ def test_groupby_datetime_categorical(self):
         ord_labels = cats.take_nd(idx)
         ord_data = data.take(idx)
         expected = ord_data.groupby(ord_labels).describe()
-        expected.index.names = [None, None]
         assert_frame_equal(desc_result, expected)
         tm.assert_index_equal(desc_result.index, expected.index)
         tm.assert_index_equal(
@@ -207,10 +209,14 @@ def test_groupby_datetime_categorical(self):
         expc = Categorical.from_codes(
             np.arange(4).repeat(8), levels, ordered=True)
         exp = CategoricalIndex(expc)
-        self.assert_index_equal(desc_result.index.get_level_values(0), exp)
+        self.assert_index_equal((desc_result.stack()
+                                            .index
+                                            .get_level_values(0)), exp)
         exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                      '75%', 'max'] * 4)
-        self.assert_index_equal(desc_result.index.get_level_values(1), exp)
+        self.assert_index_equal((desc_result.stack()
+                                            .index
+                                            .get_level_values(1)), exp)
 
     def test_groupby_categorical_index(self):
 
@@ -246,8 +252,8 @@ def test_groupby_describe_categorical_columns(self):
         df = DataFrame(np.random.randn(20, 4), columns=cats)
         result = df.groupby([1, 2, 3, 4] * 5).describe()
 
-        tm.assert_index_equal(result.columns, cats)
-        tm.assert_categorical_equal(result.columns.values, cats.values)
+        tm.assert_index_equal(result.stack().columns, cats)
+        tm.assert_categorical_equal(result.stack().columns.values, cats.values)
 
     def test_groupby_unstack_categorical(self):
         # GH11558 (example is taken from the original issue)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -1447,7 +1447,7 @@ def test_attr_wrapper(self):
         for name, gp in grouped:
             expected[name] = gp.describe()
         expected = DataFrame(expected).T
-        assert_frame_equal(result.unstack(), expected)
+        assert_frame_equal(result, expected)
 
         # get attribute
         result = grouped.dtype
@@ -1459,7 +1459,7 @@ def test_attr_wrapper(self):
     def test_series_describe_multikey(self):
         ts = tm.makeTimeSeries()
         grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
-        result = grouped.describe().unstack()
+        result = grouped.describe()
         assert_series_equal(result['mean'], grouped.mean(), check_names=False)
         assert_series_equal(result['std'], grouped.std(), check_names=False)
         assert_series_equal(result['min'], grouped.min(), check_names=False)
@@ -1468,7 +1468,7 @@ def test_series_describe_single(self):
         ts = tm.makeTimeSeries()
         grouped = ts.groupby(lambda x: x.month)
         result = grouped.apply(lambda x: x.describe())
-        expected = grouped.describe()
+        expected = grouped.describe().stack()
         assert_series_equal(result, expected)
 
     def test_series_index_name(self):
@@ -1479,17 +1479,27 @@ def test_series_index_name(self):
     def test_frame_describe_multikey(self):
         grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
         result = grouped.describe()
-
+        desc_groups = []
         for col in self.tsframe:
-            expected = grouped[col].describe()
-            assert_series_equal(result[col], expected, check_names=False)
+            group = grouped[col].describe()
+            group_col = pd.MultiIndex([[col] * len(group.columns),
+                                       group.columns],
+                                      [[0] * len(group.columns),
+                                       range(len(group.columns))])
+            group = pd.DataFrame(group.values,
+                                 columns=group_col,
+                                 index=group.index)
+            desc_groups.append(group)
+        expected = pd.concat(desc_groups, axis=1)
+        tm.assert_frame_equal(result, expected)
 
         groupedT = self.tsframe.groupby({'A': 0, 'B': 0,
                                          'C': 1, 'D': 1}, axis=1)
         result = groupedT.describe()
-
-        for name, group in groupedT:
-            assert_frame_equal(result[name], group.describe())
+        expected = self.tsframe.describe().T
+        expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index],
+                                       [range(4), range(len(expected.index))])
+        tm.assert_frame_equal(result, expected)
 
     def test_frame_describe_tupleindex(self):
 
@@ -1499,10 +1509,27 @@ def test_frame_describe_tupleindex(self):
                          'z': [100, 200, 300, 400, 500] * 3})
         df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
         df2 = df1.rename(columns={'k': 'key'})
-        result = df1.groupby('k').describe()
-        expected = df2.groupby('key').describe()
-        expected.index.set_names(result.index.names, inplace=True)
-        assert_frame_equal(result, expected)
+        tm.assertRaises(ValueError, lambda: df1.groupby('k').describe())
+        tm.assertRaises(ValueError, lambda: df2.groupby('key').describe())
+
+    def test_frame_describe_multiindex_level_not_as_column(self):
+        # GH 4792
+        prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990,
+                  pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499,
+                  pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499}
+        volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000,
+                   pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000,
+                   pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000}
+        df = pd.DataFrame({'PRICE': prices,
+                           'VOLUME': volumes})
+        result = df.groupby('PRICE').VOLUME.describe()
+        data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
+                df[df.PRICE == 25499].VOLUME.describe().values.tolist()]
+        expected = pd.DataFrame(data,
+                                index=pd.Index([24990, 25499], name='PRICE'),
+                                columns=['count', 'mean', 'std', 'min',
+                                         '25%', '50%', '75%', 'max'])
+        tm.assert_frame_equal(result, expected)
 
     def test_frame_groupby(self):
         grouped = self.tsframe.groupby(lambda x: x.weekday())
@@ -2994,16 +3021,25 @@ def test_non_cython_api(self):
         assert_frame_equal(result, expected)
 
         # describe
-        expected = DataFrame(dict(B=concat(
-            [df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()],
-            keys=[1, 3])))
-        expected.index.names = ['A', None]
+        expected = pd.concat([(df[df.A == 1].B
+                                            .describe()
+                                            .to_frame()
+                                            .unstack()
+                                            .to_frame()
+                                            .T),
+                              (df[df.A == 3].B
+                                            .describe()
+                                            .to_frame()
+                                            .unstack()
+                                            .to_frame()
+                                            .T)])
+        expected.index = pd.Index([1, 3], name='A')
         result = g.describe()
         assert_frame_equal(result, expected)
 
-        expected = concat(
-            [df.loc[[0, 1], ['A', 'B']].describe(),
-             df.loc[[2], ['A', 'B']].describe()], keys=[0, 1])
+        expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T,
+                              df[df.A == 3].describe().unstack().to_frame().T])
+        expected.index = pd.Index([0, 1])
         result = gni.describe()
         assert_frame_equal(result, expected)
 
@@ -5149,7 +5185,6 @@ def test_groupby_whitelist(self):
             'tail',
             'cumcount',
             'resample',
-            'describe',
             'rank',
             'quantile',
             'fillna',
@@ -5186,7 +5221,6 @@ def test_groupby_whitelist(self):
             'tail',
             'cumcount',
             'resample',
-            'describe',
             'rank',
             'quantile',
             'fillna',
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
@@ -1269,10 +1269,10 @@ def test_describe_typefiltering_groupby(self):
                         'numD': np.arange(24.) + .5,
                         'ts': tm.makeTimeSeries()[:24].index})
         G = df.groupby('catA')
-        self.assertTrue(G.describe(include=['number']).shape == (16, 2))
-        self.assertTrue(G.describe(include=['number', 'object']).shape == (22,
-                                                                           3))
-        self.assertTrue(G.describe(include='all').shape == (26, 4))
+        self.assertTrue(G.describe(include=['number']).shape == (2, 16))
+        self.assertTrue(G.describe(include=['number', 'object']).shape == (2,
+                                                                           44))
+        self.assertTrue(G.describe(include='all').shape == (2, 65))
 
     def test_describe_multi_index_df_column_names(self):
         """ Test that column names persist after the describe operation."""