From 6d1945d8a94957a4a6ff05da463f045cb6c9cb0c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 1 Nov 2014 20:36:02 -0400 Subject: [PATCH] ENH/BUG: support Categorical in to_panel reshaping (GH8704) CLN: move block2d_to_blocknd support code to core/internal.py TST/BUG: support Categorical reshaping via .unstack --- pandas/core/categorical.py | 19 +++++++--- pandas/core/frame.py | 36 ++++++++++--------- pandas/core/internals.py | 61 +++++++++++++++++++++++++++++++- pandas/core/reshape.py | 48 ++++++------------------- pandas/io/pytables.py | 9 +++-- pandas/tests/test_categorical.py | 35 +++++++++++++++--- pandas/tests/test_panel.py | 12 +++++++ 7 files changed, 151 insertions(+), 69 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 150da65580223..dd23897a3f7e9 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -13,6 +13,7 @@ from pandas.core.indexing import _is_null_slice from pandas.tseries.period import PeriodIndex import pandas.core.common as com +from pandas.util.decorators import cache_readonly from pandas.core.common import isnull from pandas.util.terminal import get_terminal_size @@ -174,9 +175,6 @@ class Categorical(PandasObject): >>> a.min() 'c' """ - ndim = 1 - """Number of dimensions (always 1!)""" - dtype = com.CategoricalDtype() """The dtype (always "category")""" @@ -256,6 +254,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa dtype = 'object' if isnull(values).any() else None values = _sanitize_array(values, None, dtype=dtype) + if categories is None: try: codes, categories = factorize(values, sort=True) @@ -270,6 +269,11 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa # give us one by specifying categories raise TypeError("'values' is not ordered, please explicitly specify the " "categories order by passing in a categories argument.") + except ValueError: + + ### FIXME #### + raise NotImplementedError("> 1 ndim Categorical are not supported at this time") + else: # there were two ways if categories are present # - the old one, where each value is a int pointer to the levels array -> not anymore @@ -305,8 +309,13 @@ def copy(self): return Categorical(values=self._codes.copy(),categories=self.categories, name=self.name, ordered=self.ordered, fastpath=True) + @cache_readonly + def ndim(self): + """Number of dimensions of the Categorical """ + return self._codes.ndim + @classmethod - def from_array(cls, data): + def from_array(cls, data, **kwargs): """ Make a Categorical type from a single array-like object. @@ -318,7 +327,7 @@ def from_array(cls, data): Can be an Index or array-like. The categories are assumed to be the unique values of `data`. """ - return Categorical(data) + return Categorical(data, **kwargs) @classmethod def from_codes(cls, codes, categories, ordered=False, name=None): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4350d5aba3846..a734baf28464b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -241,15 +241,19 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, if isinstance(data, types.GeneratorType): data = list(data) if len(data) > 0: - if index is None and isinstance(data[0], Series): - index = _get_names_from_index(data) - if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1: arrays, columns = _to_arrays(data, columns, dtype=dtype) columns = _ensure_index(columns) + # set the index if index is None: - index = _default_index(len(data)) + if isinstance(data[0], Series): + index = _get_names_from_index(data) + elif isinstance(data[0], Categorical): + index = _default_index(len(data[0])) + else: + index = _default_index(len(data)) + mgr = _arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) else: @@ -1053,7 +1057,6 @@ def to_panel(self): panel : Panel """ from pandas.core.panel import Panel - from pandas.core.reshape import block2d_to_blocknd # only support this kind for now if (not isinstance(self.index, MultiIndex) or # pragma: no cover @@ -1073,20 +1076,9 @@ def to_panel(self): selfsorted = self major_axis, minor_axis = selfsorted.index.levels - major_labels, minor_labels = selfsorted.index.labels - shape = len(major_axis), len(minor_axis) - new_blocks = [] - for block in selfsorted._data.blocks: - newb = block2d_to_blocknd( - values=block.values.T, - placement=block.mgr_locs, shape=shape, - labels=[major_labels, minor_labels], - ref_items=selfsorted.columns) - new_blocks.append(newb) - # preserve names, if any major_axis = major_axis.copy() major_axis.name = self.index.names[0] @@ -1094,8 +1086,14 @@ def to_panel(self): minor_axis = minor_axis.copy() minor_axis.name = self.index.names[1] + # create new axes new_axes = [selfsorted.columns, major_axis, minor_axis] - new_mgr = create_block_manager_from_blocks(new_blocks, new_axes) + + # create new manager + new_mgr = selfsorted._data.reshape_nd(axes=new_axes, + labels=[major_labels, minor_labels], + shape=shape, + ref_items=selfsorted.columns) return Panel(new_mgr) @@ -4808,6 +4806,10 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None): return _list_of_series_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) + elif isinstance(data[0], Categorical): + if columns is None: + columns = _default_index(len(data)) + return data, columns elif (isinstance(data, (np.ndarray, Series, Index)) and data.dtype.names is not None): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f3f88583b2445..bb81258efe4c5 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -11,7 +11,7 @@ from pandas.core.common import (_possibly_downcast_to_dtype, isnull, _NS_DTYPE, _TD_DTYPE, ABCSeries, is_list_like, ABCSparseSeries, _infer_dtype_from_scalar, - _is_null_datelike_scalar, + _is_null_datelike_scalar, _maybe_promote, is_timedelta64_dtype, is_datetime64_dtype, _possibly_infer_to_datetimelike, array_equivalent) from pandas.core.index import Index, MultiIndex, _ensure_index @@ -177,6 +177,24 @@ def _slice(self, slicer): """ return a slice of my values """ return self.values[slicer] + def reshape_nd(self, labels, shape, ref_items): + """ + Parameters + ---------- + labels : list of new axis labels + shape : new shape + ref_items : new ref_items + + return a new block that is transformed to a nd block + """ + + return _block2d_to_blocknd( + values=self.get_values().T, + placement=self.mgr_locs, + shape=shape, + labels=labels, + ref_items=ref_items) + def getitem_block(self, slicer, new_mgr_locs=None): """ Perform __getitem__-like, return result as block. @@ -2573,6 +2591,10 @@ def comp(s): bm._consolidate_inplace() return bm + def reshape_nd(self, axes, **kwargs): + """ a 2d-nd reshape operation on a BlockManager """ + return self.apply('reshape_nd', axes=axes, **kwargs) + def is_consolidated(self): """ Return True if more than one block with the same dtype @@ -3895,6 +3917,43 @@ def _concat_indexes(indexes): return indexes[0].append(indexes[1:]) +def _block2d_to_blocknd(values, placement, shape, labels, ref_items): + """ pivot to the labels shape """ + from pandas.core.internals import make_block + + panel_shape = (len(placement),) + shape + + # TODO: lexsort depth needs to be 2!! + + # Create observation selection vector using major and minor + # labels, for converting to panel format. + selector = _factor_indexer(shape[1:], labels) + mask = np.zeros(np.prod(shape), dtype=bool) + mask.put(selector, True) + + if mask.all(): + pvalues = np.empty(panel_shape, dtype=values.dtype) + else: + dtype, fill_value = _maybe_promote(values.dtype) + pvalues = np.empty(panel_shape, dtype=dtype) + pvalues.fill(fill_value) + + values = values + for i in range(len(placement)): + pvalues[i].flat[mask] = values[:, i] + + return make_block(pvalues, placement=placement) + + +def _factor_indexer(shape, labels): + """ + given a tuple of shape and a list of Categorical labels, return the + expanded label indexer + """ + mult = np.array(shape)[::-1].cumprod()[::-1] + return com._ensure_platform_int( + np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T) + def _get_blkno_placements(blknos, blk_count, group=True): """ diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index bb6f6f4d00cd8..5cbf392f246ed 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -59,7 +59,12 @@ class _Unstacker(object): """ def __init__(self, values, index, level=-1, value_columns=None): + + self.is_categorical = None if values.ndim == 1: + if isinstance(values, Categorical): + self.is_categorical = values + values = np.array(values) values = values[:, np.newaxis] self.values = values self.value_columns = value_columns @@ -175,6 +180,12 @@ def get_result(self): else: index = index.take(self.unique_groups) + # may need to coerce categoricals here + if self.is_categorical is not None: + values = [ Categorical.from_array(values[:,i], + categories=self.is_categorical.categories) + for i in range(values.shape[-1]) ] + return DataFrame(values, index=index, columns=columns) def get_new_values(self): @@ -1188,40 +1199,3 @@ def make_axis_dummies(frame, axis='minor', transform=None): values = values.take(labels, axis=0) return DataFrame(values, columns=items, index=frame.index) - - -def block2d_to_blocknd(values, placement, shape, labels, ref_items): - """ pivot to the labels shape """ - from pandas.core.internals import make_block - - panel_shape = (len(placement),) + shape - - # TODO: lexsort depth needs to be 2!! - - # Create observation selection vector using major and minor - # labels, for converting to panel format. - selector = factor_indexer(shape[1:], labels) - mask = np.zeros(np.prod(shape), dtype=bool) - mask.put(selector, True) - - if mask.all(): - pvalues = np.empty(panel_shape, dtype=values.dtype) - else: - dtype, fill_value = _maybe_promote(values.dtype) - pvalues = np.empty(panel_shape, dtype=dtype) - pvalues.fill(fill_value) - - values = values - for i in range(len(placement)): - pvalues[i].flat[mask] = values[:, i] - - return make_block(pvalues, placement=placement) - - -def factor_indexer(shape, labels): - """ given a tuple of shape and a list of Categorical labels, return the - expanded label indexer - """ - mult = np.array(shape)[::-1].cumprod()[::-1] - return com._ensure_platform_int( - np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f1745fe8579bb..6f8a774356293 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -23,8 +23,7 @@ from pandas.core.algorithms import match, unique from pandas.core.categorical import Categorical from pandas.core.common import _asarray_tuplesafe -from pandas.core.internals import BlockManager, make_block -from pandas.core.reshape import block2d_to_blocknd, factor_indexer +from pandas.core.internals import BlockManager, make_block, _block2d_to_blocknd, _factor_indexer from pandas.core.index import _ensure_index from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type import pandas.core.common as com @@ -332,7 +331,7 @@ def read_hdf(path_or_buf, key, **kwargs): key, auto_close=auto_close, **kwargs) if isinstance(path_or_buf, string_types): - + try: exists = os.path.exists(path_or_buf) @@ -3537,7 +3536,7 @@ def read(self, where=None, columns=None, **kwargs): labels = [f.codes for f in factors] # compute the key - key = factor_indexer(N[1:], labels) + key = _factor_indexer(N[1:], labels) objs = [] if len(unique(key)) == len(key): @@ -3556,7 +3555,7 @@ def read(self, where=None, columns=None, **kwargs): take_labels = [l.take(sorter) for l in labels] items = Index(c.values) - block = block2d_to_blocknd( + block = _block2d_to_blocknd( values=sorted_values, placement=np.arange(len(items)), shape=tuple(N), labels=take_labels, ref_items=items) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 4bc7084c93b6b..624c6cf9688d6 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1121,18 +1121,45 @@ def test_construction_frame(self): expected = Series(list('abc'),dtype='category') tm.assert_series_equal(df[0],expected) - # these coerces back to object as its spread across columns - # ndim != 1 df = DataFrame([pd.Categorical(list('abc'))]) - expected = DataFrame([list('abc')]) + expected = DataFrame({ 0 : Series(list('abc'),dtype='category')}) + tm.assert_frame_equal(df,expected) + + df = DataFrame([pd.Categorical(list('abc')),pd.Categorical(list('abd'))]) + expected = DataFrame({ 0 : Series(list('abc'),dtype='category'), + 1 : Series(list('abd'),dtype='category')},columns=[0,1]) tm.assert_frame_equal(df,expected) # mixed df = DataFrame([pd.Categorical(list('abc')),list('def')]) - expected = DataFrame([list('abc'),list('def')]) + expected = DataFrame({ 0 : Series(list('abc'),dtype='category'), + 1 : list('def')},columns=[0,1]) tm.assert_frame_equal(df,expected) + # invalid (shape) + self.assertRaises(ValueError, lambda : DataFrame([pd.Categorical(list('abc')),pd.Categorical(list('abdefg'))])) + + # ndim > 1 + self.assertRaises(NotImplementedError, lambda : pd.Categorical(np.array([list('abcd')]))) + + def test_reshaping(self): + + p = tm.makePanel() + p['str'] = 'foo' + df = p.to_frame() + df['category'] = df['str'].astype('category') + result = df['category'].unstack() + + c = Categorical(['foo']*len(p.major_axis)) + expected = DataFrame({'A' : c.copy(), + 'B' : c.copy(), + 'C' : c.copy(), + 'D' : c.copy()}, + columns=Index(list('ABCD'),name='minor'), + index=p.major_axis.set_names('major')) + tm.assert_frame_equal(result, expected) + def test_reindex(self): index = pd.date_range('20000101', periods=3) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 14e4e32acae9f..01d086f57718c 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1501,6 +1501,18 @@ def test_to_frame_mixed(self): # Previously, this was mutating the underlying index and changing its name assert_frame_equal(wp['bool'], panel['bool'], check_names=False) + # GH 8704 + # with categorical + df = panel.to_frame() + df['category'] = df['str'].astype('category') + + # to_panel + # TODO: this converts back to object + p = df.to_panel() + expected = panel.copy() + expected['category'] = 'foo' + assert_panel_equal(p,expected) + def test_to_frame_multi_major(self): idx = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')])