diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index e6ea9217347ea..55750fe5700c6 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -160,8 +160,6 @@ Bug Fixes - - - Bug in ``concat`` raises ``AttributeError`` when input data contains tz-aware datetime and timedelta (:issue:`12620`) @@ -169,3 +167,4 @@ Bug Fixes - Bug in ``pivot_table`` when ``margins=True`` and ``dropna=True`` where nulls still contributed to margin count (:issue:`12577`) - Bug in ``Series.name`` when ``name`` attribute can be a hashable type (:issue:`12610`) +- Bug in ``.describe()`` resets categorical columns information (:issue:`11558`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7233e063afe8e..4f9fa260182f7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4899,7 +4899,9 @@ def describe_1d(data, percentiles): for name in idxnames: if name not in names: names.append(name) + d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1) + d.columns = self.columns._shallow_copy(values=d.columns.values) d.columns.names = data.columns.names return d diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 58828d52b60dd..588521e6810b3 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -327,8 +327,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): result._reset_identity() return result - def _shallow_copy(self, values=None, **kwargs): - """ + _index_shared_docs['_shallow_copy'] = """ create a new Index with the same class as the caller, don't copy the data, use the same object attributes with passed in attributes taking precedence @@ -340,6 +339,8 @@ def _shallow_copy(self, values=None, **kwargs): values : the values to create the new Index, optional kwargs : updates the default attributes for this Index """ + @Appender(_index_shared_docs['_shallow_copy']) + def _shallow_copy(self, values=None, **kwargs): if values is None: values = self.values attributes = self._get_attributes_dict() diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index cc25b5fc6fb8c..a9390c76f26a6 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -7,7 +7,7 @@ deprecate_kwarg) from pandas.core.missing import _clean_reindex_fill_method from pandas.core.config import get_option -from pandas.indexes.base import Index +from pandas.indexes.base import Index, _index_shared_docs import pandas.core.base as base import pandas.core.common as com import pandas.indexes.base as ibase @@ -136,6 +136,19 @@ def _simple_new(cls, values, name=None, categories=None, ordered=None, result._reset_identity() return result + @Appender(_index_shared_docs['_shallow_copy']) + def _shallow_copy(self, values=None, categories=None, ordered=None, + **kwargs): + # categories and ordered can't be part of attributes, + # as these are properties + if categories is None: + categories = self.categories + if ordered is None: + ordered = self.ordered + return super(CategoricalIndex, + self)._shallow_copy(values=values, categories=categories, + ordered=ordered, **kwargs) + def _is_dtype_compat(self, other): """ *this is an internal non-public method* diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index d14568ceca258..4a77282b3877a 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -28,7 +28,8 @@ from pandas.core.config import get_option from pandas.indexes.base import (Index, _ensure_index, _ensure_frozen, - _get_na_value, InvalidIndexError) + _get_na_value, InvalidIndexError, + _index_shared_docs) import pandas.indexes.base as ibase @@ -381,6 +382,7 @@ def view(self, cls=None): def _shallow_copy_with_infer(self, values=None, **kwargs): return self._shallow_copy(values, **kwargs) + @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, **kwargs): if values is not None: if 'name' in kwargs: diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py index 4b06af9240436..dbee753af855c 100644 --- a/pandas/indexes/range.py +++ b/pandas/indexes/range.py @@ -6,7 +6,7 @@ from pandas import compat from pandas.compat import lrange, range -from pandas.indexes.base import Index +from pandas.indexes.base import Index, _index_shared_docs from pandas.util.decorators import Appender, cache_readonly import pandas.core.common as com import pandas.indexes.base as ibase @@ -225,9 +225,8 @@ def has_duplicates(self): def tolist(self): return lrange(self._start, self._stop, self._step) + @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, **kwargs): - """ create a new Index, don't copy the data, use the same object attributes - with passed in attributes taking precedence """ if values is None: return RangeIndex(name=self.name, fastpath=True, **dict(self._get_data_as_items())) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index d9cad6a542fb3..74682c506c769 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -257,6 +257,52 @@ def test_bool_describe_in_mixed_frame(self): index=['count', 'unique', 'top', 'freq']) assert_frame_equal(result, expected) + def test_describe_categorical_columns(self): + # GH 11558 + columns = pd.CategoricalIndex(['int1', 'int2', 'obj'], + ordered=True, name='XXX') + df = DataFrame({'int1': [10, 20, 30, 40, 50], + 'int2': [10, 20, 30, 40, 50], + 'obj': ['A', 0, None, 'X', 1]}, + columns=columns) + result = df.describe() + + exp_columns = pd.CategoricalIndex(['int1', 'int2'], + categories=['int1', 'int2', 'obj'], + ordered=True, name='XXX') + expected = DataFrame({'int1': [5, 30, df.int1.std(), + 10, 20, 30, 40, 50], + 'int2': [5, 30, df.int2.std(), + 10, 20, 30, 40, 50]}, + index=['count', 'mean', 'std', 'min', '25%', + '50%', '75%', 'max'], + columns=exp_columns) + tm.assert_frame_equal(result, expected) + tm.assert_categorical_equal(result.columns.values, + expected.columns.values) + + def test_describe_datetime_columns(self): + columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], + freq='MS', tz='US/Eastern', name='XXX') + df = DataFrame({0: [10, 20, 30, 40, 50], + 1: [10, 20, 30, 40, 50], + 2: ['A', 0, None, 'X', 1]}) + df.columns = columns + result = df.describe() + + exp_columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01'], + freq='MS', tz='US/Eastern', name='XXX') + expected = DataFrame({0: [5, 30, df.iloc[:, 0].std(), + 10, 20, 30, 40, 50], + 1: [5, 30, df.iloc[:, 1].std(), + 10, 20, 30, 40, 50]}, + index=['count', 'mean', 'std', 'min', '25%', + '50%', '75%', 'max']) + expected.columns = exp_columns + tm.assert_frame_equal(result, expected) + self.assertEqual(result.columns.freq, 'MS') + self.assertEqual(result.columns.tz, expected.columns.tz) + def test_reduce_mixed_frame(self): # GH 6806 df = DataFrame({ diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 9aeff41ee70f5..b60cbcba45dd8 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -2768,7 +2768,7 @@ def test_value_counts_with_nan(self): pd.Series([2, 1, 3], index=pd.CategoricalIndex(["a", "b", np.nan]))) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): s = pd.Series(pd.Categorical( ["a", "b", "a"], categories=["a", "b", np.nan])) tm.assert_series_equal( @@ -2779,7 +2779,7 @@ def test_value_counts_with_nan(self): pd.Series([2, 1, 0], index=pd.CategoricalIndex(["a", "b", np.nan]))) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): s = pd.Series(pd.Categorical( ["a", "b", None, "a", None, None], categories=["a", "b", np.nan ])) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index ac98e377f1719..ff9fd7dfb5980 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2821,8 +2821,8 @@ def test_non_cython_api(self): # describe expected = DataFrame(dict(B=concat( - [df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe() - ], keys=[1, 3]))) + [df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()], + keys=[1, 3]))) expected.index.names = ['A', None] result = g.describe() assert_frame_equal(result, expected) @@ -4008,6 +4008,36 @@ def test_groupby_categorical_index(self): [0, 1, 2, 3], levels, ordered=True), name='cats') assert_frame_equal(result, expected) + def test_groupby_describe_categorical_columns(self): + # GH 11558 + cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'], + categories=['foo', 'bar', 'baz', 'qux'], + ordered=True) + df = DataFrame(np.random.randn(20, 4), columns=cats) + result = df.groupby([1, 2, 3, 4] * 5).describe() + + tm.assert_index_equal(result.columns, cats) + tm.assert_categorical_equal(result.columns.values, cats.values) + + def test_groupby_unstack_categorical(self): + # GH11558 (example is taken from the original issue) + df = pd.DataFrame({'a': range(10), + 'medium': ['A', 'B'] * 5, + 'artist': list('XYXXY') * 2}) + df['medium'] = df['medium'].astype('category') + + gcat = df.groupby(['artist', 'medium'])['a'].count().unstack() + result = gcat.describe() + + exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False, + name='medium') + tm.assert_index_equal(result.columns, exp_columns) + tm.assert_categorical_equal(result.columns.values, exp_columns.values) + + result = gcat['A'] + gcat['B'] + expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist')) + tm.assert_series_equal(result, expected) + def test_groupby_groups_datetimeindex(self): # #1430 from pandas.tseries.api import DatetimeIndex