Skip to content

Commit 711ad3e

Browse files
committed
BUG: coerce Categorical in record array creation (GH8626)
BUG: Categorical not created properly with to_frame() from Series (GH8626) BUG: handle astype with passed pd.Categorical (GH8626)
1 parent a30d6ee commit 711ad3e

File tree

6 files changed

+169
-20
lines changed

6 files changed

+169
-20
lines changed

doc/source/whatsnew/v0.15.1.txt

+34
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,41 @@ Experimental
4848
Bug Fixes
4949
~~~~~~~~~
5050

51+
52+
- Bug in coercing ``Categorical`` to a records array, e.g. ``df.to_records()`` (:issue:`8626`)
53+
- Bug in ``Categorical`` not created properly with ``Series.to_frame()`` (:issue:`8626`)
54+
- Bug in ``astype`` when passed the ``pd.Categorical`` class instead of a categorical dtype (this now raises ``TypeError`` correctly) (:issue:`8626`)
5155
- Bug in ``cut``/``qcut`` when using ``Series`` and ``retbins=True`` (:issue:`8589`)
56+
57+
58+
59+
60+
61+
62+
63+
64+
5265
- Bug in numeric index operations of add/sub with Float/Index Index with numpy arrays (:issue:`8608`)
66+
67+
68+
69+
70+
71+
72+
5373
- Bug in ix/loc block splitting on setitem (manifests with integer-like dtypes, e.g. datetime64) (:issue:`8607`)
74+
75+
76+
77+
78+
79+
80+
81+
82+
83+
84+
85+
86+
87+
5488
- Fix ``shape`` attribute for ``MultiIndex`` (:issue:`8609`)

pandas/core/categorical.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,7 @@ class Categorical(PandasObject):
187187

188188
# For comparisons, so that numpy uses our implementation of the compare ops, which raise
189189
__array_priority__ = 1000
190+
_typ = 'categorical'
190191
ordered = False
191192
name = None
192193

@@ -1464,4 +1465,3 @@ def _convert_to_list_like(list_like):
14641465
else:
14651466
# is this reached?
14661467
return [list_like]
1467-

pandas/core/common.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,10 @@ class AmbiguousIndexError(PandasError, KeyError):
5656
def create_pandas_abc_type(name, attr, comp):
5757
@classmethod
5858
def _check(cls, inst):
59-
return getattr(inst, attr, None) in comp
59+
result = getattr(inst, attr, None)
60+
if result is None:
61+
return False
62+
return result in comp
6063
dct = dict(__instancecheck__=_check,
6164
__subclasscheck__=_check)
6265
meta = type("ABCBase", (type,), dct)
@@ -78,6 +81,7 @@ def _check(cls, inst):
7881
'sparse_time_series'))
7982
ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp",
8083
('sparse_array', 'sparse_series'))
84+
# NOTE: the trailing comma makes ``comp`` a 1-tuple; a bare ("categorical") is
# just a string, which would turn the ``in`` check into a substring match.
ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ", ("categorical",))
8185

8286

8387
class _ABCGeneric(type):

pandas/core/frame.py

+40-15
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
2727
_default_index, _maybe_upcast, _is_sequence,
2828
_infer_dtype_from_scalar, _values_from_object,
29-
is_list_like, _get_dtype, _maybe_box_datetimelike)
29+
is_list_like, _get_dtype, _maybe_box_datetimelike,
30+
is_categorical_dtype)
3031
from pandas.core.generic import NDFrame, _shared_docs
3132
from pandas.core.index import Index, MultiIndex, _ensure_index
3233
from pandas.core.indexing import (_maybe_droplevels,
@@ -332,6 +333,8 @@ def _init_dict(self, data, index, columns, dtype=None):
332333

333334
def _init_ndarray(self, values, index, columns, dtype=None,
334335
copy=False):
336+
# input must be a ndarray, list, Series, index
337+
335338
if isinstance(values, Series):
336339
if columns is None:
337340
if values.name is not None:
@@ -345,9 +348,41 @@ def _init_ndarray(self, values, index, columns, dtype=None,
345348
if not len(values) and columns is not None and len(columns):
346349
values = np.empty((0, 1), dtype=object)
347350

351+
# helper to create the axes as indexes
352+
def _get_axes(N, K, index=index, columns=columns):
353+
# return axes or defaults
354+
355+
if index is None:
356+
index = _default_index(N)
357+
else:
358+
index = _ensure_index(index)
359+
360+
if columns is None:
361+
columns = _default_index(K)
362+
else:
363+
columns = _ensure_index(columns)
364+
return index, columns
365+
366+
# we could have a categorical type passed or coerced to 'category'
367+
# recast this to an _arrays_to_mgr
368+
if is_categorical_dtype(getattr(values,'dtype',None)) or is_categorical_dtype(dtype):
369+
370+
if not hasattr(values,'dtype'):
371+
values = _prep_ndarray(values, copy=copy)
372+
values = values.ravel()
373+
elif copy:
374+
values = values.copy()
375+
376+
index, columns = _get_axes(len(values),1)
377+
return _arrays_to_mgr([ values ], columns, index, columns,
378+
dtype=dtype)
379+
380+
# by definition an array here
381+
# the dtypes will be coerced to a single dtype
348382
values = _prep_ndarray(values, copy=copy)
349383

350384
if dtype is not None:
385+
351386
if values.dtype != dtype:
352387
try:
353388
values = values.astype(dtype)
@@ -356,18 +391,7 @@ def _init_ndarray(self, values, index, columns, dtype=None,
356391
% (dtype, orig))
357392
raise_with_traceback(e)
358393

359-
N, K = values.shape
360-
361-
if index is None:
362-
index = _default_index(N)
363-
else:
364-
index = _ensure_index(index)
365-
366-
if columns is None:
367-
columns = _default_index(K)
368-
else:
369-
columns = _ensure_index(columns)
370-
394+
index, columns = _get_axes(*values.shape)
371395
return create_block_manager_from_blocks([values.T], [columns, index])
372396

373397
@property
@@ -877,7 +901,7 @@ def to_records(self, index=True, convert_datetime64=True):
877901
else:
878902
ix_vals = [self.index.values]
879903

880-
arrays = ix_vals + [self[c].values for c in self.columns]
904+
arrays = ix_vals + [self[c].get_values() for c in self.columns]
881905

882906
count = 0
883907
index_names = list(self.index.names)
@@ -890,7 +914,7 @@ def to_records(self, index=True, convert_datetime64=True):
890914
index_names = ['index']
891915
names = index_names + lmap(str, self.columns)
892916
else:
893-
arrays = [self[c].values for c in self.columns]
917+
arrays = [self[c].get_values() for c in self.columns]
894918
names = lmap(str, self.columns)
895919

896920
dtype = np.dtype([(x, v.dtype) for x, v in zip(names, arrays)])
@@ -4729,6 +4753,7 @@ def convert(v):
47294753
values = convert(values)
47304754

47314755
else:
4756+
47324757
# drop subclass info, do not copy data
47334758
values = np.asarray(values)
47344759
if copy:

pandas/core/internals.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,21 @@ def is_datelike(self):
9292
""" return True if I am datelike (datetime or timedelta) """
9393
return self.is_datetime or self.is_timedelta
9494

95+
def is_categorical_astype(self, dtype):
96+
"""
97+
validate that the passed dtype can be used to astype to a categorical
98+
(raising TypeError if it cannot); returns a boolean indicating whether the dtype is categorical
99+
"""
100+
if com.is_categorical_dtype(dtype):
101+
if dtype == com.CategoricalDtype():
102+
return True
103+
104+
# this is a pd.Categorical, but is not
105+
# a valid type for astypeing
106+
raise TypeError("invalid type {0} for astype".format(dtype))
107+
108+
return False
109+
95110
def to_dense(self):
96111
return self.values.view()
97112

@@ -345,7 +360,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
345360

346361
# may need to convert to categorical
347362
# this is only called for non-categoricals
348-
if com.is_categorical_dtype(dtype):
363+
if self.is_categorical_astype(dtype):
349364
return make_block(Categorical(self.values),
350365
ndim=self.ndim,
351366
placement=self.mgr_locs)
@@ -1682,7 +1697,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
16821697
raise on an except if raise == True
16831698
"""
16841699

1685-
if dtype == com.CategoricalDtype():
1700+
if self.is_categorical_astype(dtype):
16861701
values = self.values
16871702
else:
16881703
values = np.array(self.values).astype(dtype)

pandas/tests/test_categorical.py

+72-1
Original file line numberDiff line numberDiff line change
@@ -1072,6 +1072,41 @@ def test_construction_series(self):
10721072
df = DataFrame({'x': Series(['a', 'b', 'c'],dtype='category')}, index=index)
10731073
tm.assert_frame_equal(df, expected)
10741074

1075+
def test_construction_frame(self):
1076+
1077+
# GH8626
1078+
1079+
# dict creation
1080+
df = DataFrame({ 'A' : list('abc') },dtype='category')
1081+
expected = Series(list('abc'),dtype='category')
1082+
tm.assert_series_equal(df['A'],expected)
1083+
1084+
# to_frame
1085+
s = Series(list('abc'),dtype='category')
1086+
result = s.to_frame()
1087+
expected = Series(list('abc'),dtype='category')
1088+
tm.assert_series_equal(result[0],expected)
1089+
result = s.to_frame(name='foo')
1090+
expected = Series(list('abc'),dtype='category')
1091+
tm.assert_series_equal(result['foo'],expected)
1092+
1093+
# list-like creation
1094+
df = DataFrame(list('abc'),dtype='category')
1095+
expected = Series(list('abc'),dtype='category')
1096+
tm.assert_series_equal(df[0],expected)
1097+
1098+
# these coerce back to object as the data is spread across columns
1099+
1100+
# ndim != 1
1101+
df = DataFrame([pd.Categorical(list('abc'))])
1102+
expected = DataFrame([list('abc')])
1103+
tm.assert_frame_equal(df,expected)
1104+
1105+
# mixed
1106+
df = DataFrame([pd.Categorical(list('abc')),list('def')])
1107+
expected = DataFrame([list('abc'),list('def')])
1108+
tm.assert_frame_equal(df,expected)
1109+
10751110
def test_reindex(self):
10761111

10771112
index = pd.date_range('20000101', periods=3)
@@ -2223,6 +2258,42 @@ def cmp(a,b):
22232258
# array conversion
22242259
tm.assert_almost_equal(np.array(s),np.array(s.values))
22252260

2261+
# valid conversion
2262+
for valid in [lambda x: x.astype('category'),
2263+
lambda x: x.astype(com.CategoricalDtype()),
2264+
lambda x: x.astype('object').astype('category'),
2265+
lambda x: x.astype('object').astype(com.CategoricalDtype())]:
2266+
2267+
result = valid(s)
2268+
tm.assert_series_equal(result,s)
2269+
2270+
# invalid conversion (these are NOT a dtype)
2271+
for invalid in [lambda x: x.astype(pd.Categorical),
2272+
lambda x: x.astype('object').astype(pd.Categorical)]:
2273+
self.assertRaises(TypeError, lambda : invalid(s))
2274+
2275+
2276+
def test_to_records(self):
2277+
2278+
# GH8626
2279+
2280+
# dict creation
2281+
df = DataFrame({ 'A' : list('abc') },dtype='category')
2282+
expected = Series(list('abc'),dtype='category')
2283+
tm.assert_series_equal(df['A'],expected)
2284+
2285+
# list-like creation
2286+
df = DataFrame(list('abc'),dtype='category')
2287+
expected = Series(list('abc'),dtype='category')
2288+
tm.assert_series_equal(df[0],expected)
2289+
2290+
# to record array
2291+
# this coerces
2292+
result = df.to_records()
2293+
expected = np.rec.array([(0, 'a'), (1, 'b'), (2, 'c')],
2294+
dtype=[('index', '<i8'), ('0', 'O')])
2295+
tm.assert_almost_equal(result,expected)
2296+
22262297
def test_numeric_like_ops(self):
22272298

22282299
# numeric ops should not succeed
@@ -2262,7 +2333,7 @@ def get_dir(s):
22622333

22632334
def test_pickle_v0_14_1(self):
22642335
cat = pd.Categorical(values=['a', 'b', 'c'],
2265-
levels=['a', 'b', 'c', 'd'],
2336+
categories=['a', 'b', 'c', 'd'],
22662337
name='foobar', ordered=False)
22672338
pickle_path = os.path.join(tm.get_data_path(),
22682339
'categorical_0_14_1.pickle')

0 commit comments

Comments
 (0)