BUG: coerce Categorical in record array creation (GH8626)

jreback · jreback · commit 711ad3e42f1c · 2014-10-27T20:02:43.000-04:00
BUG: Categorical not created properly with to_frame() from Series (GH8626)
BUG: handle astype with passed pd.Categorical (GH8626)
diff --git a/doc/source/whatsnew/v0.15.1.txt b/doc/source/whatsnew/v0.15.1.txt
@@ -48,7 +48,41 @@ Experimental
 Bug Fixes
 ~~~~~~~~~
 
+
+- Bug in coercing ``Categorical`` to a records array, e.g. ``df.to_records()`` (:issue:`8626`)
+- Bug in ``Categorical`` not created properly with ``Series.to_frame()`` (:issue:`8626`)
+- Bug in ``astype`` coercion to ``Categorical`` when the ``pd.Categorical`` class is passed rather than a dtype (this now raises ``TypeError`` correctly) (:issue:`8626`)
 - Bug in ``cut``/``qcut`` when using ``Series`` and ``retbins=True`` (:issue:`8589`)
+
+
+
+
+
+
+
+
+
 - Bug in numeric index operations of add/sub with Float/Index Index with numpy arrays (:issue:`8608`)
+
+
+
+
+
+
+
 - Bug in ix/loc block splitting on setitem (manifests with integer-like dtypes, e.g. datetime64) (:issue:`8607`)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 - Fix ``shape`` attribute for ``MultiIndex`` (:issue:`8609`)
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -187,6 +187,7 @@ class Categorical(PandasObject):
 
     # For comparisons, so that numpy uses our implementation if the compare ops, which raise
     __array_priority__ = 1000
+    _typ = 'categorical'
     ordered = False
     name = None
 
@@ -1464,4 +1465,3 @@ def _convert_to_list_like(list_like):
     else:
         # is this reached?
         return [list_like]
-
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -56,7 +56,10 @@ class AmbiguousIndexError(PandasError, KeyError):
 def create_pandas_abc_type(name, attr, comp):
     @classmethod
     def _check(cls, inst):
-        return getattr(inst, attr, None) in comp
+        result = getattr(inst, attr, None)
+        if result is None:
+            return False
+        return result in comp
     dct = dict(__instancecheck__=_check,
                __subclasscheck__=_check)
     meta = type("ABCBase", (type,), dct)
@@ -78,6 +81,9 @@ def _check(cls, inst):
                                           'sparse_time_series'))
 ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp",
                                         ('sparse_array', 'sparse_series'))
+# comp must be a 1-tuple: a bare string would make ``in`` a substring test
+ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ",
+                                        ("categorical",))
 
 
 class _ABCGeneric(type):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -26,7 +26,8 @@
 from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
                                 _default_index, _maybe_upcast, _is_sequence,
                                 _infer_dtype_from_scalar, _values_from_object,
-                                is_list_like, _get_dtype, _maybe_box_datetimelike)
+                                is_list_like, _get_dtype, _maybe_box_datetimelike,
+                                is_categorical_dtype)
 from pandas.core.generic import NDFrame, _shared_docs
 from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas.core.indexing import (_maybe_droplevels,
@@ -332,6 +333,8 @@ def _init_dict(self, data, index, columns, dtype=None):
 
     def _init_ndarray(self, values, index, columns, dtype=None,
                       copy=False):
+        # input must be a ndarray, list, Series, index
+
         if isinstance(values, Series):
             if columns is None:
                 if values.name is not None:
@@ -345,9 +348,41 @@ def _init_ndarray(self, values, index, columns, dtype=None,
             if not len(values) and columns is not None and len(columns):
                 values = np.empty((0, 1), dtype=object)
 
+        # helper: build the (index, columns) axes for N rows and K columns
+        def _get_axes(N, K, index=index, columns=columns):
+            # use the passed index/columns when given, else default ones
+
+            if index is None:
+                index = _default_index(N)
+            else:
+                index = _ensure_index(index)
+
+            if columns is None:
+                columns = _default_index(K)
+            else:
+                columns = _ensure_index(columns)
+            return index, columns
+
+        # we could have a categorical type passed or coerced to 'category'
+        # recast this to an _arrays_to_mgr
+        if is_categorical_dtype(getattr(values,'dtype',None)) or is_categorical_dtype(dtype):
+
+            if not hasattr(values,'dtype'):
+                values = _prep_ndarray(values, copy=copy)
+                values = values.ravel()
+            elif copy:
+                values = values.copy()
+
+            index, columns = _get_axes(len(values),1)
+            return _arrays_to_mgr([ values ], columns, index, columns,
+                                  dtype=dtype)
+
+        # by definition an array here
+        # the dtypes will be coerced to a single dtype
         values = _prep_ndarray(values, copy=copy)
 
         if dtype is not None:
+
             if values.dtype != dtype:
                 try:
                     values = values.astype(dtype)
@@ -356,18 +391,7 @@ def _init_ndarray(self, values, index, columns, dtype=None,
                                    % (dtype, orig))
                     raise_with_traceback(e)
 
-        N, K = values.shape
-
-        if index is None:
-            index = _default_index(N)
-        else:
-            index = _ensure_index(index)
-
-        if columns is None:
-            columns = _default_index(K)
-        else:
-            columns = _ensure_index(columns)
-
+        index, columns = _get_axes(*values.shape)
         return create_block_manager_from_blocks([values.T], [columns, index])
 
     @property
@@ -877,7 +901,7 @@ def to_records(self, index=True, convert_datetime64=True):
                 else:
                     ix_vals = [self.index.values]
 
-            arrays = ix_vals + [self[c].values for c in self.columns]
+            arrays = ix_vals + [self[c].get_values() for c in self.columns]
 
             count = 0
             index_names = list(self.index.names)
@@ -890,7 +914,7 @@ def to_records(self, index=True, convert_datetime64=True):
                 index_names = ['index']
             names = index_names + lmap(str, self.columns)
         else:
-            arrays = [self[c].values for c in self.columns]
+            arrays = [self[c].get_values() for c in self.columns]
             names = lmap(str, self.columns)
 
         dtype = np.dtype([(x, v.dtype) for x, v in zip(names, arrays)])
@@ -4729,6 +4753,7 @@ def convert(v):
             values = convert(values)
 
     else:
+
         # drop subclass info, do not copy data
         values = np.asarray(values)
         if copy:
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -92,6 +92,21 @@ def is_datelike(self):
         """ return True if I am a non-datelike """
         return self.is_datetime or self.is_timedelta
 
+    def is_categorical_astype(self, dtype):
+        """
+        Return True if ``dtype`` is the categorical dtype and False if it is
+        not categorical; raise TypeError for any other categorical-like type.
+        """
+        if com.is_categorical_dtype(dtype):
+            if dtype == com.CategoricalDtype():
+                return True
+
+            # categorical-like but not the dtype itself (e.g. the
+            # pd.Categorical class), so it is not a valid astype target
+            raise TypeError("invalid type {0} for astype".format(dtype))
+
+        return False
+
     def to_dense(self):
         return self.values.view()
 
@@ -345,7 +360,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
 
         # may need to convert to categorical
         # this is only called for non-categoricals
-        if com.is_categorical_dtype(dtype):
+        if self.is_categorical_astype(dtype):
             return make_block(Categorical(self.values),
                               ndim=self.ndim,
                               placement=self.mgr_locs)
@@ -1682,7 +1697,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
         raise on an except if raise == True
         """
 
-        if dtype == com.CategoricalDtype():
+        if self.is_categorical_astype(dtype):
             values = self.values
         else:
             values = np.array(self.values).astype(dtype)
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -1072,6 +1072,41 @@ def test_construction_series(self):
         df = DataFrame({'x': Series(['a', 'b', 'c'],dtype='category')}, index=index)
         tm.assert_frame_equal(df, expected)
 
+    def test_construction_frame(self):
+
+        # GH8626
+
+        # dict creation
+        df = DataFrame({ 'A' : list('abc') },dtype='category')
+        expected = Series(list('abc'),dtype='category')
+        tm.assert_series_equal(df['A'],expected)
+
+        # to_frame
+        s = Series(list('abc'),dtype='category')
+        result = s.to_frame()
+        expected = Series(list('abc'),dtype='category')
+        tm.assert_series_equal(result[0],expected)
+        result = s.to_frame(name='foo')
+        expected = Series(list('abc'),dtype='category')
+        tm.assert_series_equal(result['foo'],expected)
+
+        # list-like creation
+        df = DataFrame(list('abc'),dtype='category')
+        expected = Series(list('abc'),dtype='category')
+        tm.assert_series_equal(df[0],expected)
+
+        # these coerces back to object as its spread across columns
+
+        # ndim != 1
+        df = DataFrame([pd.Categorical(list('abc'))])
+        expected = DataFrame([list('abc')])
+        tm.assert_frame_equal(df,expected)
+
+        # mixed
+        df = DataFrame([pd.Categorical(list('abc')),list('def')])
+        expected = DataFrame([list('abc'),list('def')])
+        tm.assert_frame_equal(df,expected)
+
     def test_reindex(self):
 
         index = pd.date_range('20000101', periods=3)
@@ -2223,6 +2258,42 @@ def cmp(a,b):
         # array conversion
         tm.assert_almost_equal(np.array(s),np.array(s.values))
 
+        # valid conversion
+        for valid in [lambda x: x.astype('category'),
+                      lambda x: x.astype(com.CategoricalDtype()),
+                      lambda x: x.astype('object').astype('category'),
+                      lambda x: x.astype('object').astype(com.CategoricalDtype())]:
+
+            result = valid(s)
+            tm.assert_series_equal(result,s)
+
+        # invalid conversion (these are NOT a dtype)
+        for invalid in [lambda x: x.astype(pd.Categorical),
+                        lambda x: x.astype('object').astype(pd.Categorical)]:
+            self.assertRaises(TypeError, lambda : invalid(s))
+
+
+    def test_to_records(self):
+
+        # GH8626
+
+        # dict creation
+        df = DataFrame({ 'A' : list('abc') },dtype='category')
+        expected = Series(list('abc'),dtype='category')
+        tm.assert_series_equal(df['A'],expected)
+
+        # list-like creation
+        df = DataFrame(list('abc'),dtype='category')
+        expected = Series(list('abc'),dtype='category')
+        tm.assert_series_equal(df[0],expected)
+
+        # to record array
+        # this coerces
+        result = df.to_records()
+        expected = np.rec.array([(0, 'a'), (1, 'b'), (2, 'c')],
+                                dtype=[('index', '<i8'), ('0', 'O')])
+        tm.assert_almost_equal(result,expected)
+
     def test_numeric_like_ops(self):
 
         # numeric ops should not succeed
@@ -2262,7 +2333,7 @@ def get_dir(s):
 
     def test_pickle_v0_14_1(self):
         cat = pd.Categorical(values=['a', 'b', 'c'],
-                             levels=['a', 'b', 'c', 'd'],
+                             categories=['a', 'b', 'c', 'd'],
                              name='foobar', ordered=False)
         pickle_path = os.path.join(tm.get_data_path(),
                                    'categorical_0_14_1.pickle')