Skip to content

Commit 729554d

Browse files
committed
BUG: .describe lost CategoricalIndex info
1 parent 5870731 commit 729554d

File tree

9 files changed

+105
-13
lines changed

9 files changed

+105
-13
lines changed

doc/source/whatsnew/v0.18.1.txt

+1-2
Original file line numberDiff line numberDiff line change
@@ -160,12 +160,11 @@ Bug Fixes
160160

161161

162162

163-
164-
165163
- Bug in ``concat`` raising ``AttributeError`` when input data contains tz-aware datetime and timedelta (:issue:`12620`)
166164

167165

168166

169167

170168
- Bug in ``pivot_table`` when ``margins=True`` and ``dropna=True`` where nulls still contributed to margin count (:issue:`12577`)
171169
- Bug in ``Series.name`` when ``name`` attribute can be a hashable type (:issue:`12610`)
170+
- Bug in ``.describe()`` resetting categorical columns information (:issue:`11558`)

pandas/core/generic.py

+2
Original file line numberDiff line numberDiff line change
@@ -4899,7 +4899,9 @@ def describe_1d(data, percentiles):
48994899
for name in idxnames:
49004900
if name not in names:
49014901
names.append(name)
4902+
49024903
d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
4904+
d.columns = self.columns._shallow_copy(values=d.columns.values)
49034905
d.columns.names = data.columns.names
49044906
return d
49054907

pandas/indexes/base.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -327,8 +327,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
327327
result._reset_identity()
328328
return result
329329

330-
def _shallow_copy(self, values=None, **kwargs):
331-
"""
330+
_index_shared_docs['_shallow_copy'] = """
332331
create a new Index with the same class as the caller, don't copy the
333332
data, use the same object attributes with passed in attributes taking
334333
precedence
@@ -340,6 +339,8 @@ def _shallow_copy(self, values=None, **kwargs):
340339
values : the values to create the new Index, optional
341340
kwargs : updates the default attributes for this Index
342341
"""
342+
@Appender(_index_shared_docs['_shallow_copy'])
343+
def _shallow_copy(self, values=None, **kwargs):
343344
if values is None:
344345
values = self.values
345346
attributes = self._get_attributes_dict()

pandas/indexes/category.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
deprecate_kwarg)
88
from pandas.core.missing import _clean_reindex_fill_method
99
from pandas.core.config import get_option
10-
from pandas.indexes.base import Index
10+
from pandas.indexes.base import Index, _index_shared_docs
1111
import pandas.core.base as base
1212
import pandas.core.common as com
1313
import pandas.indexes.base as ibase
@@ -136,6 +136,19 @@ def _simple_new(cls, values, name=None, categories=None, ordered=None,
136136
result._reset_identity()
137137
return result
138138

139+
@Appender(_index_shared_docs['_shallow_copy'])
140+
def _shallow_copy(self, values=None, categories=None, ordered=None,
141+
**kwargs):
142+
# categories and ordered can't be part of attributes,
143+
# as these are properties
144+
if categories is None:
145+
categories = self.categories
146+
if ordered is None:
147+
ordered = self.ordered
148+
return super(CategoricalIndex,
149+
self)._shallow_copy(values=values, categories=categories,
150+
ordered=ordered, **kwargs)
151+
139152
def _is_dtype_compat(self, other):
140153
"""
141154
*this is an internal non-public method*

pandas/indexes/multi.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@
2828
from pandas.core.config import get_option
2929

3030
from pandas.indexes.base import (Index, _ensure_index, _ensure_frozen,
31-
_get_na_value, InvalidIndexError)
31+
_get_na_value, InvalidIndexError,
32+
_index_shared_docs)
3233
import pandas.indexes.base as ibase
3334

3435

@@ -381,6 +382,7 @@ def view(self, cls=None):
381382
def _shallow_copy_with_infer(self, values=None, **kwargs):
382383
return self._shallow_copy(values, **kwargs)
383384

385+
@Appender(_index_shared_docs['_shallow_copy'])
384386
def _shallow_copy(self, values=None, **kwargs):
385387
if values is not None:
386388
if 'name' in kwargs:

pandas/indexes/range.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from pandas import compat
88
from pandas.compat import lrange, range
9-
from pandas.indexes.base import Index
9+
from pandas.indexes.base import Index, _index_shared_docs
1010
from pandas.util.decorators import Appender, cache_readonly
1111
import pandas.core.common as com
1212
import pandas.indexes.base as ibase
@@ -225,9 +225,8 @@ def has_duplicates(self):
225225
def tolist(self):
226226
return lrange(self._start, self._stop, self._step)
227227

228+
@Appender(_index_shared_docs['_shallow_copy'])
228229
def _shallow_copy(self, values=None, **kwargs):
229-
""" create a new Index, don't copy the data, use the same object attributes
230-
with passed in attributes taking precedence """
231230
if values is None:
232231
return RangeIndex(name=self.name, fastpath=True,
233232
**dict(self._get_data_as_items()))

pandas/tests/frame/test_analytics.py

+46
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,52 @@ def test_bool_describe_in_mixed_frame(self):
257257
index=['count', 'unique', 'top', 'freq'])
258258
assert_frame_equal(result, expected)
259259

260+
def test_describe_categorical_columns(self):
261+
# GH 11558
262+
columns = pd.CategoricalIndex(['int1', 'int2', 'obj'],
263+
ordered=True, name='XXX')
264+
df = DataFrame({'int1': [10, 20, 30, 40, 50],
265+
'int2': [10, 20, 30, 40, 50],
266+
'obj': ['A', 0, None, 'X', 1]},
267+
columns=columns)
268+
result = df.describe()
269+
270+
exp_columns = pd.CategoricalIndex(['int1', 'int2'],
271+
categories=['int1', 'int2', 'obj'],
272+
ordered=True, name='XXX')
273+
expected = DataFrame({'int1': [5, 30, df.int1.std(),
274+
10, 20, 30, 40, 50],
275+
'int2': [5, 30, df.int2.std(),
276+
10, 20, 30, 40, 50]},
277+
index=['count', 'mean', 'std', 'min', '25%',
278+
'50%', '75%', 'max'],
279+
columns=exp_columns)
280+
tm.assert_frame_equal(result, expected)
281+
tm.assert_categorical_equal(result.columns.values,
282+
expected.columns.values)
283+
284+
def test_describe_datetime_columns(self):
285+
columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'],
286+
freq='MS', tz='US/Eastern', name='XXX')
287+
df = DataFrame({0: [10, 20, 30, 40, 50],
288+
1: [10, 20, 30, 40, 50],
289+
2: ['A', 0, None, 'X', 1]})
290+
df.columns = columns
291+
result = df.describe()
292+
293+
exp_columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01'],
294+
freq='MS', tz='US/Eastern', name='XXX')
295+
expected = DataFrame({0: [5, 30, df.iloc[:, 0].std(),
296+
10, 20, 30, 40, 50],
297+
1: [5, 30, df.iloc[:, 1].std(),
298+
10, 20, 30, 40, 50]},
299+
index=['count', 'mean', 'std', 'min', '25%',
300+
'50%', '75%', 'max'])
301+
expected.columns = exp_columns
302+
tm.assert_frame_equal(result, expected)
303+
self.assertEqual(result.columns.freq, 'MS')
304+
self.assertEqual(result.columns.tz, expected.columns.tz)
305+
260306
def test_reduce_mixed_frame(self):
261307
# GH 6806
262308
df = DataFrame({

pandas/tests/test_categorical.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -2768,7 +2768,7 @@ def test_value_counts_with_nan(self):
27682768
pd.Series([2, 1, 3],
27692769
index=pd.CategoricalIndex(["a", "b", np.nan])))
27702770

2771-
with tm.assert_produces_warning(FutureWarning):
2771+
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
27722772
s = pd.Series(pd.Categorical(
27732773
["a", "b", "a"], categories=["a", "b", np.nan]))
27742774
tm.assert_series_equal(
@@ -2779,7 +2779,7 @@ def test_value_counts_with_nan(self):
27792779
pd.Series([2, 1, 0],
27802780
index=pd.CategoricalIndex(["a", "b", np.nan])))
27812781

2782-
with tm.assert_produces_warning(FutureWarning):
2782+
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
27832783
s = pd.Series(pd.Categorical(
27842784
["a", "b", None, "a", None, None], categories=["a", "b", np.nan
27852785
]))

pandas/tests/test_groupby.py

+32-2
Original file line numberDiff line numberDiff line change
@@ -2821,8 +2821,8 @@ def test_non_cython_api(self):
28212821

28222822
# describe
28232823
expected = DataFrame(dict(B=concat(
2824-
[df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()
2825-
], keys=[1, 3])))
2824+
[df.loc[[0, 1], 'B'].describe(), df.loc[[2], 'B'].describe()],
2825+
keys=[1, 3])))
28262826
expected.index.names = ['A', None]
28272827
result = g.describe()
28282828
assert_frame_equal(result, expected)
@@ -4008,6 +4008,36 @@ def test_groupby_categorical_index(self):
40084008
[0, 1, 2, 3], levels, ordered=True), name='cats')
40094009
assert_frame_equal(result, expected)
40104010

4011+
def test_groupby_describe_categorical_columns(self):
4012+
# GH 11558
4013+
cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'],
4014+
categories=['foo', 'bar', 'baz', 'qux'],
4015+
ordered=True)
4016+
df = DataFrame(np.random.randn(20, 4), columns=cats)
4017+
result = df.groupby([1, 2, 3, 4] * 5).describe()
4018+
4019+
tm.assert_index_equal(result.columns, cats)
4020+
tm.assert_categorical_equal(result.columns.values, cats.values)
4021+
4022+
def test_groupby_unstack_categorical(self):
4023+
# GH11558 (example is taken from the original issue)
4024+
df = pd.DataFrame({'a': range(10),
4025+
'medium': ['A', 'B'] * 5,
4026+
'artist': list('XYXXY') * 2})
4027+
df['medium'] = df['medium'].astype('category')
4028+
4029+
gcat = df.groupby(['artist', 'medium'])['a'].count().unstack()
4030+
result = gcat.describe()
4031+
4032+
exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False,
4033+
name='medium')
4034+
tm.assert_index_equal(result.columns, exp_columns)
4035+
tm.assert_categorical_equal(result.columns.values, exp_columns.values)
4036+
4037+
result = gcat['A'] + gcat['B']
4038+
expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist'))
4039+
tm.assert_series_equal(result, expected)
4040+
40114041
def test_groupby_groups_datetimeindex(self):
40124042
# #1430
40134043
from pandas.tseries.api import DatetimeIndex

0 commit comments

Comments
 (0)