BUG: .describe lost CategoricalIndex info

sinhrks · sinhrks · commit 729554d7ce2a · 2016-03-30T02:30:59.000+09:00
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
@@ -160,12 +160,11 @@ Bug Fixes
 
 
 
-
-
 - Bug in ``concat`` raises ``AttributeError`` when input data contains tz-aware datetime and timedelta (:issue:`12620`)
 
 
 
 
 - Bug in ``pivot_table`` when ``margins=True`` and ``dropna=True`` where nulls still contributed to margin count (:issue:`12577`)
 - Bug in ``Series.name`` when ``name`` attribute can be a hashable type (:issue:`12610`)
+- Bug in ``.describe()`` where ``CategoricalIndex`` information on the columns was lost (:issue:`11558`)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -4899,7 +4899,9 @@ def describe_1d(data, percentiles):
             for name in idxnames:
                 if name not in names:
                     names.append(name)
+
         d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
+        d.columns = self.columns._shallow_copy(values=d.columns.values)
         d.columns.names = data.columns.names
         return d
 
diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
@@ -327,8 +327,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
         result._reset_identity()
         return result
 
-    def _shallow_copy(self, values=None, **kwargs):
-        """
+    _index_shared_docs['_shallow_copy'] = """
         create a new Index with the same class as the caller, don't copy the
         data, use the same object attributes with passed in attributes taking
         precedence
@@ -340,6 +339,8 @@ def _shallow_copy(self, values=None, **kwargs):
         values : the values to create the new Index, optional
         kwargs : updates the default attributes for this Index
         """
+    @Appender(_index_shared_docs['_shallow_copy'])
+    def _shallow_copy(self, values=None, **kwargs):
         if values is None:
             values = self.values
         attributes = self._get_attributes_dict()
diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py
@@ -7,7 +7,7 @@
                                     deprecate_kwarg)
 from pandas.core.missing import _clean_reindex_fill_method
 from pandas.core.config import get_option
-from pandas.indexes.base import Index
+from pandas.indexes.base import Index, _index_shared_docs
 import pandas.core.base as base
 import pandas.core.common as com
 import pandas.indexes.base as ibase
@@ -136,6 +136,19 @@ def _simple_new(cls, values, name=None, categories=None, ordered=None,
         result._reset_identity()
         return result
 
+    @Appender(_index_shared_docs['_shallow_copy'])
+    def _shallow_copy(self, values=None, categories=None, ordered=None,
+                      **kwargs):
+        # categories and ordered are properties, so they cannot be stored in
+        # the attributes dict; default them from self and pass them explicitly
+        if categories is None:
+            categories = self.categories
+        if ordered is None:
+            ordered = self.ordered
+        return super(CategoricalIndex,
+                     self)._shallow_copy(values=values, categories=categories,
+                                         ordered=ordered, **kwargs)
+
     def _is_dtype_compat(self, other):
         """
         *this is an internal non-public method*
diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py
@@ -28,7 +28,8 @@
 from pandas.core.config import get_option
 
 from pandas.indexes.base import (Index, _ensure_index, _ensure_frozen,
-                                 _get_na_value, InvalidIndexError)
+                                 _get_na_value, InvalidIndexError,
+                                 _index_shared_docs)
 import pandas.indexes.base as ibase
 
 
@@ -381,6 +382,7 @@ def view(self, cls=None):
     def _shallow_copy_with_infer(self, values=None, **kwargs):
         return self._shallow_copy(values, **kwargs)
 
+    @Appender(_index_shared_docs['_shallow_copy'])
     def _shallow_copy(self, values=None, **kwargs):
         if values is not None:
             if 'name' in kwargs:
diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py
@@ -6,7 +6,7 @@
 
 from pandas import compat
 from pandas.compat import lrange, range
-from pandas.indexes.base import Index
+from pandas.indexes.base import Index, _index_shared_docs
 from pandas.util.decorators import Appender, cache_readonly
 import pandas.core.common as com
 import pandas.indexes.base as ibase
@@ -225,9 +225,8 @@ def has_duplicates(self):
     def tolist(self):
         return lrange(self._start, self._stop, self._step)
 
+    @Appender(_index_shared_docs['_shallow_copy'])
     def _shallow_copy(self, values=None, **kwargs):
-        """ create a new Index, don't copy the data, use the same object attributes
-            with passed in attributes taking precedence """
         if values is None:
             return RangeIndex(name=self.name, fastpath=True,
                               **dict(self._get_data_as_items()))
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -257,6 +257,52 @@ def test_bool_describe_in_mixed_frame(self):
                              index=['count', 'unique', 'top', 'freq'])
         assert_frame_equal(result, expected)
 
+    def test_describe_categorical_columns(self):
+        # GH 11558
+        columns = pd.CategoricalIndex(['int1', 'int2', 'obj'],
+                                      ordered=True, name='XXX')
+        df = DataFrame({'int1': [10, 20, 30, 40, 50],
+                        'int2': [10, 20, 30, 40, 50],
+                        'obj': ['A', 0, None, 'X', 1]},
+                       columns=columns)
+        result = df.describe()
+
+        exp_columns = pd.CategoricalIndex(['int1', 'int2'],
+                                          categories=['int1', 'int2', 'obj'],
+                                          ordered=True, name='XXX')
+        expected = DataFrame({'int1': [5, 30, df.int1.std(),
+                                       10, 20, 30, 40, 50],
+                              'int2': [5, 30, df.int2.std(),
+                                       10, 20, 30, 40, 50]},
+                             index=['count', 'mean', 'std', 'min', '25%',
+                                    '50%', '75%', 'max'],
+                             columns=exp_columns)
+        tm.assert_frame_equal(result, expected)
+        tm.assert_categorical_equal(result.columns.values,
+                                    expected.columns.values)
+
+    def test_describe_datetime_columns(self):
+        columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'],
+                                   freq='MS', tz='US/Eastern', name='XXX')
+        df = DataFrame({0: [10, 20, 30, 40, 50],
+                        1: [10, 20, 30, 40, 50],
+                        2: ['A', 0, None, 'X', 1]})
+        df.columns = columns
+        result = df.describe()
+
+        exp_columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01'],
+                                       freq='MS', tz='US/Eastern', name='XXX')
+        expected = DataFrame({0: [5, 30, df.iloc[:, 0].std(),
+                                  10, 20, 30, 40, 50],
+                              1: [5, 30, df.iloc[:, 1].std(),
+                                  10, 20, 30, 40, 50]},
+                             index=['count', 'mean', 'std', 'min', '25%',
+                                    '50%', '75%', 'max'])
+        expected.columns = exp_columns
+        tm.assert_frame_equal(result, expected)
+        self.assertEqual(result.columns.freq, 'MS')
+        self.assertEqual(result.columns.tz, expected.columns.tz)
+
     def test_reduce_mixed_frame(self):
         # GH 6806
         df = DataFrame({
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -2768,7 +2768,7 @@ def test_value_counts_with_nan(self):
             pd.Series([2, 1, 3],
                       index=pd.CategoricalIndex(["a", "b", np.nan])))
 
-        with tm.assert_produces_warning(FutureWarning):
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
             s = pd.Series(pd.Categorical(
                 ["a", "b", "a"], categories=["a", "b", np.nan]))
             tm.assert_series_equal(
@@ -2779,7 +2779,7 @@ def test_value_counts_with_nan(self):
                 pd.Series([2, 1, 0],
                           index=pd.CategoricalIndex(["a", "b", np.nan])))
 
-        with tm.assert_produces_warning(FutureWarning):
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
             s = pd.Series(pd.Categorical(
                 ["a", "b", None, "a", None, None], categories=["a", "b", np.nan
                                                                ]))
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -2821,8 +2821,8 @@ def test_non_cython_api(self):
 
         # describe
         expected = DataFrame(dict(B=concat(
-            [df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()
-             ], keys=[1, 3])))
+            [df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()],
+            keys=[1, 3])))
         expected.index.names = ['A', None]
         result = g.describe()
         assert_frame_equal(result, expected)
@@ -4008,6 +4008,36 @@ def test_groupby_categorical_index(self):
                 [0, 1, 2, 3], levels, ordered=True), name='cats')
         assert_frame_equal(result, expected)
 
+    def test_groupby_describe_categorical_columns(self):
+        # GH 11558
+        cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'],
+                                   categories=['foo', 'bar', 'baz', 'qux'],
+                                   ordered=True)
+        df = DataFrame(np.random.randn(20, 4), columns=cats)
+        result = df.groupby([1, 2, 3, 4] * 5).describe()
+
+        tm.assert_index_equal(result.columns, cats)
+        tm.assert_categorical_equal(result.columns.values, cats.values)
+
+    def test_groupby_unstack_categorical(self):
+        # GH11558 (example is taken from the original issue)
+        df = pd.DataFrame({'a': range(10),
+                           'medium': ['A', 'B'] * 5,
+                           'artist': list('XYXXY') * 2})
+        df['medium'] = df['medium'].astype('category')
+
+        gcat = df.groupby(['artist', 'medium'])['a'].count().unstack()
+        result = gcat.describe()
+
+        exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False,
+                                          name='medium')
+        tm.assert_index_equal(result.columns, exp_columns)
+        tm.assert_categorical_equal(result.columns.values, exp_columns.values)
+
+        result = gcat['A'] + gcat['B']
+        expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist'))
+        tm.assert_series_equal(result, expected)
+
     def test_groupby_groups_datetimeindex(self):
         # #1430
         from pandas.tseries.api import DatetimeIndex