Skip to content

BUG: .describe lost CategoricalIndex info #12675

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions doc/source/whatsnew/v0.18.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -160,12 +160,11 @@ Bug Fixes





- Bug in ``concat`` raises ``AttributeError`` when input data contains tz-aware datetime and timedelta (:issue:`12620`)




- Bug in ``pivot_table`` when ``margins=True`` and ``dropna=True`` where nulls still contributed to margin count (:issue:`12577`)
- Bug in ``Series.name`` when ``name`` attribute can be a hashable type (:issue:`12610`)
- Bug in ``.describe()`` resets categorical columns information (:issue:`11558`)
2 changes: 2 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4899,7 +4899,9 @@ def describe_1d(data, percentiles):
for name in idxnames:
if name not in names:
names.append(name)

d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
d.columns = self.columns._shallow_copy(values=d.columns.values)
d.columns.names = data.columns.names
return d

Expand Down
5 changes: 3 additions & 2 deletions pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,8 +327,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
result._reset_identity()
return result

def _shallow_copy(self, values=None, **kwargs):
"""
_index_shared_docs['_shallow_copy'] = """
create a new Index with the same class as the caller, don't copy the
data, use the same object attributes with passed in attributes taking
precedence
Expand All @@ -340,6 +339,8 @@ def _shallow_copy(self, values=None, **kwargs):
values : the values to create the new Index, optional
kwargs : updates the default attributes for this Index
"""
@Appender(_index_shared_docs['_shallow_copy'])
def _shallow_copy(self, values=None, **kwargs):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know whether we have _shallow_copy defined for any other classes; if so, we need to add docs there like you did here.

if values is None:
values = self.values
attributes = self._get_attributes_dict()
Expand Down
15 changes: 14 additions & 1 deletion pandas/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
deprecate_kwarg)
from pandas.core.missing import _clean_reindex_fill_method
from pandas.core.config import get_option
from pandas.indexes.base import Index
from pandas.indexes.base import Index, _index_shared_docs
import pandas.core.base as base
import pandas.core.common as com
import pandas.indexes.base as ibase
Expand Down Expand Up @@ -136,6 +136,19 @@ def _simple_new(cls, values, name=None, categories=None, ordered=None,
result._reset_identity()
return result

@Appender(_index_shared_docs['_shallow_copy'])
def _shallow_copy(self, values=None, categories=None, ordered=None,
**kwargs):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see why this is necessary; _simple_new already handles this. If categories or ordered is not None they will be set; otherwise they won't, and the result will simply be a reference to the data, which is a CategoricalIndex at this point.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback Not now, because _simple_new is a class method.

# on current master
pd.CategoricalIndex([1, 2, 3], ordered=True)._simple_new([1, 2])
# CategoricalIndex([1, 2], categories=[1, 2], ordered=False, dtype='category')

# categories and ordered can't be part of attributes,
# as these are properties
if categories is None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's possible that _simple_new needs a change instead.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on the current rules, shouldn't _shallow_copy be used if metadata of caller is needed?

  • _simple_new: It returns a new Index with the same type as the caller. All metadata (such as name) must be provided by the caller. Using _shallow_copy is recommended because it fills in this metadata unless otherwise specified.
  • _shallow_copy: It returns a new Index with the same type (using _simple_new), but fills in the caller's metadata unless otherwise specified. Passed kwargs will overwrite the corresponding metadata.

categories = self.categories
if ordered is None:
ordered = self.ordered
return super(CategoricalIndex,
self)._shallow_copy(values=values, categories=categories,
ordered=ordered, **kwargs)

def _is_dtype_compat(self, other):
"""
*this is an internal non-public method*
Expand Down
4 changes: 3 additions & 1 deletion pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
from pandas.core.config import get_option

from pandas.indexes.base import (Index, _ensure_index, _ensure_frozen,
_get_na_value, InvalidIndexError)
_get_na_value, InvalidIndexError,
_index_shared_docs)
import pandas.indexes.base as ibase


Expand Down Expand Up @@ -381,6 +382,7 @@ def view(self, cls=None):
def _shallow_copy_with_infer(self, values=None, **kwargs):
    """
    Forward ``values`` and ``kwargs`` unchanged to ``_shallow_copy``.

    No inference step is performed here; the call is a plain delegation
    to ``self._shallow_copy``.
    """
    return self._shallow_copy(values, **kwargs)

@Appender(_index_shared_docs['_shallow_copy'])
def _shallow_copy(self, values=None, **kwargs):
if values is not None:
if 'name' in kwargs:
Expand Down
5 changes: 2 additions & 3 deletions pandas/indexes/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from pandas import compat
from pandas.compat import lrange, range
from pandas.indexes.base import Index
from pandas.indexes.base import Index, _index_shared_docs
from pandas.util.decorators import Appender, cache_readonly
import pandas.core.common as com
import pandas.indexes.base as ibase
Expand Down Expand Up @@ -225,9 +225,8 @@ def has_duplicates(self):
def tolist(self):
    """
    Return the result of ``lrange`` over this index's ``_start``,
    ``_stop`` and ``_step``.
    """
    # NOTE(review): lrange comes from pandas.compat; presumably it
    # materializes range(start, stop, step) as a list -- confirm.
    return lrange(self._start, self._stop, self._step)

@Appender(_index_shared_docs['_shallow_copy'])
def _shallow_copy(self, values=None, **kwargs):
""" create a new Index, don't copy the data, use the same object attributes
with passed in attributes taking precedence """
if values is None:
return RangeIndex(name=self.name, fastpath=True,
**dict(self._get_data_as_items()))
Expand Down
46 changes: 46 additions & 0 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,52 @@ def test_bool_describe_in_mixed_frame(self):
index=['count', 'unique', 'top', 'freq'])
assert_frame_equal(result, expected)

def test_describe_categorical_columns(self):
    """describe() must keep CategoricalIndex column metadata (GH 11558)."""
    stat_labels = ['count', 'mean', 'std', 'min', '25%',
                   '50%', '75%', 'max']
    cat_cols = pd.CategoricalIndex(['int1', 'int2', 'obj'],
                                   ordered=True, name='XXX')
    raw = {'int1': [10, 20, 30, 40, 50],
           'int2': [10, 20, 30, 40, 50],
           'obj': ['A', 0, None, 'X', 1]}
    frame = DataFrame(raw, columns=cat_cols)
    described = frame.describe()

    # The expected columns cover only 'int1' and 'int2', yet they must
    # still carry the full category set, the ordered flag and the name.
    wanted_cols = pd.CategoricalIndex(['int1', 'int2'],
                                      categories=['int1', 'int2', 'obj'],
                                      ordered=True, name='XXX')
    wanted_data = {}
    for col in ('int1', 'int2'):
        wanted_data[col] = [5, 30, frame[col].std(), 10, 20, 30, 40, 50]
    wanted = DataFrame(wanted_data, index=stat_labels, columns=wanted_cols)

    tm.assert_frame_equal(described, wanted)
    tm.assert_categorical_equal(described.columns.values,
                                wanted.columns.values)

def test_describe_datetime_columns(self):
    """describe() must keep freq, tz and name of DatetimeIndex columns."""
    index_opts = dict(freq='MS', tz='US/Eastern', name='XXX')
    stat_labels = ['count', 'mean', 'std', 'min', '25%',
                   '50%', '75%', 'max']

    frame = DataFrame({0: [10, 20, 30, 40, 50],
                       1: [10, 20, 30, 40, 50],
                       2: ['A', 0, None, 'X', 1]})
    frame.columns = pd.DatetimeIndex(
        ['2011-01-01', '2011-02-01', '2011-03-01'], **index_opts)
    described = frame.describe()

    # Build the expectation with integer labels first, then swap in the
    # datetime columns, mirroring how the input frame was assembled.
    wanted_data = {}
    for pos in (0, 1):
        wanted_data[pos] = [5, 30, frame.iloc[:, pos].std(),
                            10, 20, 30, 40, 50]
    wanted = DataFrame(wanted_data, index=stat_labels)
    wanted.columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01'],
                                      **index_opts)

    tm.assert_frame_equal(described, wanted)
    out_cols = described.columns
    self.assertEqual(out_cols.freq, 'MS')
    self.assertEqual(out_cols.tz, wanted.columns.tz)

def test_reduce_mixed_frame(self):
# GH 6806
df = DataFrame({
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2768,7 +2768,7 @@ def test_value_counts_with_nan(self):
pd.Series([2, 1, 3],
index=pd.CategoricalIndex(["a", "b", np.nan])))

with tm.assert_produces_warning(FutureWarning):
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
s = pd.Series(pd.Categorical(
["a", "b", "a"], categories=["a", "b", np.nan]))
tm.assert_series_equal(
Expand All @@ -2779,7 +2779,7 @@ def test_value_counts_with_nan(self):
pd.Series([2, 1, 0],
index=pd.CategoricalIndex(["a", "b", np.nan])))

with tm.assert_produces_warning(FutureWarning):
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
s = pd.Series(pd.Categorical(
["a", "b", None, "a", None, None], categories=["a", "b", np.nan
]))
Expand Down
34 changes: 32 additions & 2 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2821,8 +2821,8 @@ def test_non_cython_api(self):

# describe
expected = DataFrame(dict(B=concat(
[df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()
], keys=[1, 3])))
[df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()],
keys=[1, 3])))
expected.index.names = ['A', None]
result = g.describe()
assert_frame_equal(result, expected)
Expand Down Expand Up @@ -4008,6 +4008,36 @@ def test_groupby_categorical_index(self):
[0, 1, 2, 3], levels, ordered=True), name='cats')
assert_frame_equal(result, expected)

def test_groupby_describe_categorical_columns(self):
    """GroupBy.describe() must keep CategoricalIndex columns (GH 11558)."""
    cat_cols = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'],
                                   categories=['foo', 'bar', 'baz', 'qux'],
                                   ordered=True)
    values = np.random.randn(20, 4)
    frame = DataFrame(values, columns=cat_cols)

    group_keys = [1, 2, 3, 4] * 5
    out_cols = frame.groupby(group_keys).describe().columns

    # Only the column index is checked, so the random values are
    # irrelevant to the outcome.
    tm.assert_index_equal(out_cols, cat_cols)
    tm.assert_categorical_equal(out_cols.values, cat_cols.values)

def test_groupby_unstack_categorical(self):
    """Unstacked categorical columns survive describe() (GH 11558).

    The example is taken from the original issue.
    """
    frame = pd.DataFrame({'a': range(10),
                          'medium': ['A', 'B'] * 5,
                          'artist': list('XYXXY') * 2})
    frame['medium'] = frame['medium'].astype('category')

    counts = frame.groupby(['artist', 'medium'])['a'].count()
    gcat = counts.unstack()
    described_cols = gcat.describe().columns

    wanted_cols = pd.CategoricalIndex(['A', 'B'], ordered=False,
                                      name='medium')
    tm.assert_index_equal(described_cols, wanted_cols)
    tm.assert_categorical_equal(described_cols.values, wanted_cols.values)

    # Column selection and arithmetic on the unstacked frame must work too.
    total = gcat['A'] + gcat['B']
    wanted_total = pd.Series([6, 4],
                             index=pd.Index(['X', 'Y'], name='artist'))
    tm.assert_series_equal(total, wanted_total)

def test_groupby_groups_datetimeindex(self):
# #1430
from pandas.tseries.api import DatetimeIndex
Expand Down