Merge pull request #8705 from jreback/to_panel

jreback · jreback · commit 9d8b3a130205 · 2014-11-02T16:51:43.000-05:00
ENH/BUG: support Categorical in to_panel reshaping (GH8704)
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -13,6 +13,7 @@
 from pandas.core.indexing import _is_null_slice
 from pandas.tseries.period import PeriodIndex
 import pandas.core.common as com
+from pandas.util.decorators import cache_readonly
 
 from pandas.core.common import isnull
 from pandas.util.terminal import get_terminal_size
@@ -174,9 +175,6 @@ class Categorical(PandasObject):
     >>> a.min()
     'c'
     """
-    ndim = 1
-    """Number of dimensions (always 1!)"""
-
     dtype = com.CategoricalDtype()
     """The dtype (always "category")"""
 
@@ -256,6 +254,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
                 dtype = 'object' if isnull(values).any() else None
                 values = _sanitize_array(values, None, dtype=dtype)
 
+
         if categories is None:
             try:
                 codes, categories = factorize(values, sort=True)
@@ -270,6 +269,11 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
                     # give us one by specifying categories
                     raise TypeError("'values' is not ordered, please explicitly specify the "
                                     "categories order by passing in a categories argument.")
+            except ValueError:
+
+                # FIXME: assumes any ValueError raised in the try block means > 1 ndim input
+                raise NotImplementedError("> 1 ndim Categorical are not supported at this time")
+
         else:
             # there were two ways if categories are present
             # - the old one, where each value is a int pointer to the levels array -> not anymore
@@ -305,8 +309,13 @@ def copy(self):
         return Categorical(values=self._codes.copy(),categories=self.categories,
                            name=self.name, ordered=self.ordered, fastpath=True)
 
+    @cache_readonly
+    def ndim(self):
+        """Number of dimensions of the Categorical, taken from ``self._codes.ndim``."""
+        return self._codes.ndim
+
     @classmethod
-    def from_array(cls, data):
+    def from_array(cls, data, **kwargs):
         """
         Make a Categorical type from a single array-like object.
 
@@ -318,7 +327,7 @@ def from_array(cls, data):
             Can be an Index or array-like. The categories are assumed to be
             the unique values of `data`.
         """
-        return Categorical(data)
+        return Categorical(data, **kwargs)
 
     @classmethod
     def from_codes(cls, codes, categories, ordered=False, name=None):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -241,15 +241,19 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
             if isinstance(data, types.GeneratorType):
                 data = list(data)
             if len(data) > 0:
-                if index is None and isinstance(data[0], Series):
-                    index = _get_names_from_index(data)
-
                 if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
                     arrays, columns = _to_arrays(data, columns, dtype=dtype)
                     columns = _ensure_index(columns)
 
+                    # set the index
                     if index is None:
-                        index = _default_index(len(data))
+                        if isinstance(data[0], Series):
+                            index = _get_names_from_index(data)
+                        elif isinstance(data[0], Categorical):
+                            index = _default_index(len(data[0]))
+                        else:
+                            index = _default_index(len(data))
+
                     mgr = _arrays_to_mgr(arrays, columns, index, columns,
                                          dtype=dtype)
                 else:
@@ -1053,7 +1057,6 @@ def to_panel(self):
         panel : Panel
         """
         from pandas.core.panel import Panel
-        from pandas.core.reshape import block2d_to_blocknd
 
         # only support this kind for now
         if (not isinstance(self.index, MultiIndex) or  # pragma: no cover
@@ -1073,29 +1076,24 @@ def to_panel(self):
             selfsorted = self
 
         major_axis, minor_axis = selfsorted.index.levels
-
         major_labels, minor_labels = selfsorted.index.labels
-
         shape = len(major_axis), len(minor_axis)
 
-        new_blocks = []
-        for block in selfsorted._data.blocks:
-            newb = block2d_to_blocknd(
-                values=block.values.T,
-                placement=block.mgr_locs, shape=shape,
-                labels=[major_labels, minor_labels],
-                ref_items=selfsorted.columns)
-            new_blocks.append(newb)
-
         # preserve names, if any
         major_axis = major_axis.copy()
         major_axis.name = self.index.names[0]
 
         minor_axis = minor_axis.copy()
         minor_axis.name = self.index.names[1]
 
+        # create new axes
         new_axes = [selfsorted.columns, major_axis, minor_axis]
-        new_mgr = create_block_manager_from_blocks(new_blocks, new_axes)
+
+        # create new manager
+        new_mgr = selfsorted._data.reshape_nd(axes=new_axes,
+                                              labels=[major_labels, minor_labels],
+                                              shape=shape,
+                                              ref_items=selfsorted.columns)
 
         return Panel(new_mgr)
 
@@ -4808,6 +4806,10 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None):
         return _list_of_series_to_arrays(data, columns,
                                          coerce_float=coerce_float,
                                          dtype=dtype)
+    elif isinstance(data[0], Categorical):
+        if columns is None:
+            columns = _default_index(len(data))
+        return data, columns
     elif (isinstance(data, (np.ndarray, Series, Index))
           and data.dtype.names is not None):
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -11,7 +11,7 @@
 from pandas.core.common import (_possibly_downcast_to_dtype, isnull,
                                 _NS_DTYPE, _TD_DTYPE, ABCSeries, is_list_like,
                                 ABCSparseSeries, _infer_dtype_from_scalar,
-                                _is_null_datelike_scalar,
+                                _is_null_datelike_scalar, _maybe_promote,
                                 is_timedelta64_dtype, is_datetime64_dtype,
                                 _possibly_infer_to_datetimelike, array_equivalent)
 from pandas.core.index import Index, MultiIndex, _ensure_index
@@ -177,6 +177,24 @@ def _slice(self, slicer):
         """ return a slice of my values """
         return self.values[slicer]
 
+    def reshape_nd(self, labels, shape, ref_items):
+        """
+        Return a new block with this block's values pivoted to an nd block.
+
+        Parameters
+        ----------
+        labels : list of new axis labels
+        shape : new shape
+        ref_items : new ref_items
+        """
+
+        return _block2d_to_blocknd(
+            values=self.get_values().T,
+            placement=self.mgr_locs,
+            shape=shape,
+            labels=labels,
+            ref_items=ref_items)
+
     def getitem_block(self, slicer, new_mgr_locs=None):
         """
         Perform __getitem__-like, return result as block.
@@ -2573,6 +2591,10 @@ def comp(s):
         bm._consolidate_inplace()
         return bm
 
+    def reshape_nd(self, axes, **kwargs):
+        """ 2d -> nd reshape: dispatch 'reshape_nd' through self.apply with the new axes """
+        return self.apply('reshape_nd', axes=axes, **kwargs)
+
     def is_consolidated(self):
         """
         Return True if more than one block with the same dtype
@@ -3895,6 +3917,43 @@ def _concat_indexes(indexes):
     return indexes[0].append(indexes[1:])
 
 
+def _block2d_to_blocknd(values, placement, shape, labels, ref_items):
+    """
+    Pivot 2d block values to the nd shape addressed by ``labels``;
+    ``ref_items`` is accepted for call compatibility but is not used.
+    """
+    panel_shape = (len(placement),) + shape
+
+    # TODO: lexsort depth needs to be 2!!
+
+    # Create observation selection vector using major and minor
+    # labels, for converting to panel format.
+    selector = _factor_indexer(shape[1:], labels)
+    mask = np.zeros(np.prod(shape), dtype=bool)
+    mask.put(selector, True)
+
+    if mask.all():
+        pvalues = np.empty(panel_shape, dtype=values.dtype)
+    else:
+        dtype, fill_value = _maybe_promote(values.dtype)
+        pvalues = np.empty(panel_shape, dtype=dtype)
+        pvalues.fill(fill_value)
+
+    for i in range(len(placement)):
+        pvalues[i].flat[mask] = values[:, i]
+
+    return make_block(pvalues, placement=placement)
+
+
+def _factor_indexer(shape, labels):
+    """
+    Flatten a list of label arrays into one indexer: level k is weighted
+    by prod(shape[k:]) (the last level by 1) and summed across levels.
+    """
+    mult = np.array(shape)[::-1].cumprod()[::-1]
+    return com._ensure_platform_int(
+        np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T)
+
 def _get_blkno_placements(blknos, blk_count, group=True):
     """
 
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
@@ -59,7 +59,12 @@ class _Unstacker(object):
     """
 
     def __init__(self, values, index, level=-1, value_columns=None):
+
+        self.is_categorical = None
         if values.ndim == 1:
+            if isinstance(values, Categorical):
+                self.is_categorical = values
+                values = np.array(values)
             values = values[:, np.newaxis]
         self.values = values
         self.value_columns = value_columns
@@ -175,6 +180,12 @@ def get_result(self):
             else:
                 index = index.take(self.unique_groups)
 
+        # may need to coerce categoricals here
+        if self.is_categorical is not None:
+            values = [ Categorical.from_array(values[:,i],
+                                              categories=self.is_categorical.categories)
+                       for i in range(values.shape[-1]) ]
+
         return DataFrame(values, index=index, columns=columns)
 
     def get_new_values(self):
@@ -1188,40 +1199,3 @@ def make_axis_dummies(frame, axis='minor', transform=None):
     values = values.take(labels, axis=0)
 
     return DataFrame(values, columns=items, index=frame.index)
-
-
-def block2d_to_blocknd(values, placement, shape, labels, ref_items):
-    """ pivot to the labels shape """
-    from pandas.core.internals import make_block
-
-    panel_shape = (len(placement),) + shape
-
-    # TODO: lexsort depth needs to be 2!!
-
-    # Create observation selection vector using major and minor
-    # labels, for converting to panel format.
-    selector = factor_indexer(shape[1:], labels)
-    mask = np.zeros(np.prod(shape), dtype=bool)
-    mask.put(selector, True)
-
-    if mask.all():
-        pvalues = np.empty(panel_shape, dtype=values.dtype)
-    else:
-        dtype, fill_value = _maybe_promote(values.dtype)
-        pvalues = np.empty(panel_shape, dtype=dtype)
-        pvalues.fill(fill_value)
-
-    values = values
-    for i in range(len(placement)):
-        pvalues[i].flat[mask] = values[:, i]
-
-    return make_block(pvalues, placement=placement)
-
-
-def factor_indexer(shape, labels):
-    """ given a tuple of shape and a list of Categorical labels, return the
-    expanded label indexer
-    """
-    mult = np.array(shape)[::-1].cumprod()[::-1]
-    return com._ensure_platform_int(
-        np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T)
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -23,8 +23,7 @@
 from pandas.core.algorithms import match, unique
 from pandas.core.categorical import Categorical
 from pandas.core.common import _asarray_tuplesafe
-from pandas.core.internals import BlockManager, make_block
-from pandas.core.reshape import block2d_to_blocknd, factor_indexer
+from pandas.core.internals import BlockManager, make_block, _block2d_to_blocknd, _factor_indexer
 from pandas.core.index import _ensure_index
 from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type
 import pandas.core.common as com
@@ -332,7 +331,7 @@ def read_hdf(path_or_buf, key, **kwargs):
         key, auto_close=auto_close, **kwargs)
 
     if isinstance(path_or_buf, string_types):
-        
+
         try:
             exists = os.path.exists(path_or_buf)
 
@@ -3537,7 +3536,7 @@ def read(self, where=None, columns=None, **kwargs):
         labels = [f.codes for f in factors]
 
         # compute the key
-        key = factor_indexer(N[1:], labels)
+        key = _factor_indexer(N[1:], labels)
 
         objs = []
         if len(unique(key)) == len(key):
@@ -3556,7 +3555,7 @@ def read(self, where=None, columns=None, **kwargs):
 
                 take_labels = [l.take(sorter) for l in labels]
                 items = Index(c.values)
-                block = block2d_to_blocknd(
+                block = _block2d_to_blocknd(
                     values=sorted_values, placement=np.arange(len(items)),
                     shape=tuple(N), labels=take_labels, ref_items=items)
 
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -1121,18 +1121,45 @@ def test_construction_frame(self):
         expected = Series(list('abc'),dtype='category')
         tm.assert_series_equal(df[0],expected)
 
-        # these coerces back to object as its spread across columns
-
         # ndim != 1
         df = DataFrame([pd.Categorical(list('abc'))])
-        expected = DataFrame([list('abc')])
+        expected = DataFrame({ 0 : Series(list('abc'),dtype='category')})
+        tm.assert_frame_equal(df,expected)
+
+        df = DataFrame([pd.Categorical(list('abc')),pd.Categorical(list('abd'))])
+        expected = DataFrame({ 0 : Series(list('abc'),dtype='category'),
+                               1 : Series(list('abd'),dtype='category')},columns=[0,1])
         tm.assert_frame_equal(df,expected)
 
         # mixed
         df = DataFrame([pd.Categorical(list('abc')),list('def')])
-        expected = DataFrame([list('abc'),list('def')])
+        expected = DataFrame({ 0 : Series(list('abc'),dtype='category'),
+                               1 : list('def')},columns=[0,1])
         tm.assert_frame_equal(df,expected)
 
+        # invalid (shape)
+        self.assertRaises(ValueError, lambda : DataFrame([pd.Categorical(list('abc')),pd.Categorical(list('abdefg'))]))
+
+        # ndim > 1
+        self.assertRaises(NotImplementedError, lambda : pd.Categorical(np.array([list('abcd')])))
+
+    def test_reshaping(self):
+        # unstacking a categorical column should give Categorical-backed columns
+        p = tm.makePanel()
+        p['str'] = 'foo'
+        df = p.to_frame()
+        df['category'] = df['str'].astype('category')
+        result = df['category'].unstack()
+
+        c = Categorical(['foo']*len(p.major_axis))
+        expected = DataFrame({'A' : c.copy(),
+                              'B' : c.copy(),
+                              'C' : c.copy(),
+                              'D' : c.copy()},
+                             columns=Index(list('ABCD'),name='minor'),
+                             index=p.major_axis.set_names('major'))
+        tm.assert_frame_equal(result, expected)
+
     def test_reindex(self):
 
         index = pd.date_range('20000101', periods=3)
diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py