Skip to content

ENH/BUG: support Categorical in to_panel reshaping (GH8704) #8705

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 2, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from pandas.core.indexing import _is_null_slice
from pandas.tseries.period import PeriodIndex
import pandas.core.common as com
from pandas.util.decorators import cache_readonly

from pandas.core.common import isnull
from pandas.util.terminal import get_terminal_size
Expand Down Expand Up @@ -174,9 +175,6 @@ class Categorical(PandasObject):
>>> a.min()
'c'
"""
ndim = 1
"""Number of dimensions (always 1!)"""

dtype = com.CategoricalDtype()
"""The dtype (always "category")"""

Expand Down Expand Up @@ -256,6 +254,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
dtype = 'object' if isnull(values).any() else None
values = _sanitize_array(values, None, dtype=dtype)


if categories is None:
try:
codes, categories = factorize(values, sort=True)
Expand All @@ -270,6 +269,11 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
# give us one by specifying categories
raise TypeError("'values' is not ordered, please explicitly specify the "
"categories order by passing in a categories argument.")
except ValueError:

### FIXME ####
raise NotImplementedError("> 1 ndim Categorical are not supported at this time")

else:
# there were two ways if categories are present
# - the old one, where each value is a int pointer to the levels array -> not anymore
Expand Down Expand Up @@ -305,8 +309,13 @@ def copy(self):
return Categorical(values=self._codes.copy(),categories=self.categories,
name=self.name, ordered=self.ordered, fastpath=True)

@cache_readonly
def ndim(self):
    """
    Number of dimensions of the Categorical.

    The value is read from the ``ndim`` attribute of the underlying
    ``_codes`` array and exposed through ``cache_readonly``.
    """
    codes = self._codes
    return codes.ndim

@classmethod
def from_array(cls, data):
def from_array(cls, data, **kwargs):
"""
Make a Categorical type from a single array-like object.

Expand All @@ -318,7 +327,7 @@ def from_array(cls, data):
Can be an Index or array-like. The categories are assumed to be
the unique values of `data`.
"""
return Categorical(data)
return Categorical(data, **kwargs)

@classmethod
def from_codes(cls, codes, categories, ordered=False, name=None):
Expand Down
36 changes: 19 additions & 17 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,15 +241,19 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
if isinstance(data, types.GeneratorType):
data = list(data)
if len(data) > 0:
if index is None and isinstance(data[0], Series):
index = _get_names_from_index(data)

if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
arrays, columns = _to_arrays(data, columns, dtype=dtype)
columns = _ensure_index(columns)

# set the index
if index is None:
index = _default_index(len(data))
if isinstance(data[0], Series):
index = _get_names_from_index(data)
elif isinstance(data[0], Categorical):
index = _default_index(len(data[0]))
else:
index = _default_index(len(data))

mgr = _arrays_to_mgr(arrays, columns, index, columns,
dtype=dtype)
else:
Expand Down Expand Up @@ -1053,7 +1057,6 @@ def to_panel(self):
panel : Panel
"""
from pandas.core.panel import Panel
from pandas.core.reshape import block2d_to_blocknd

# only support this kind for now
if (not isinstance(self.index, MultiIndex) or # pragma: no cover
Expand All @@ -1073,29 +1076,24 @@ def to_panel(self):
selfsorted = self

major_axis, minor_axis = selfsorted.index.levels

major_labels, minor_labels = selfsorted.index.labels

shape = len(major_axis), len(minor_axis)

new_blocks = []
for block in selfsorted._data.blocks:
newb = block2d_to_blocknd(
values=block.values.T,
placement=block.mgr_locs, shape=shape,
labels=[major_labels, minor_labels],
ref_items=selfsorted.columns)
new_blocks.append(newb)

# preserve names, if any
major_axis = major_axis.copy()
major_axis.name = self.index.names[0]

minor_axis = minor_axis.copy()
minor_axis.name = self.index.names[1]

# create new axes
new_axes = [selfsorted.columns, major_axis, minor_axis]
new_mgr = create_block_manager_from_blocks(new_blocks, new_axes)

# create new manager
new_mgr = selfsorted._data.reshape_nd(axes=new_axes,
labels=[major_labels, minor_labels],
shape=shape,
ref_items=selfsorted.columns)

return Panel(new_mgr)

Expand Down Expand Up @@ -4808,6 +4806,10 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None):
return _list_of_series_to_arrays(data, columns,
coerce_float=coerce_float,
dtype=dtype)
elif isinstance(data[0], Categorical):
if columns is None:
columns = _default_index(len(data))
return data, columns
elif (isinstance(data, (np.ndarray, Series, Index))
and data.dtype.names is not None):

Expand Down
61 changes: 60 additions & 1 deletion pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pandas.core.common import (_possibly_downcast_to_dtype, isnull,
_NS_DTYPE, _TD_DTYPE, ABCSeries, is_list_like,
ABCSparseSeries, _infer_dtype_from_scalar,
_is_null_datelike_scalar,
_is_null_datelike_scalar, _maybe_promote,
is_timedelta64_dtype, is_datetime64_dtype,
_possibly_infer_to_datetimelike, array_equivalent)
from pandas.core.index import Index, MultiIndex, _ensure_index
Expand Down Expand Up @@ -177,6 +177,24 @@ def _slice(self, slicer):
""" return a slice of my values """
return self.values[slicer]

def reshape_nd(self, labels, shape, ref_items):
    """
    Return a new block that is this block transformed to a nd block.

    Parameters
    ----------
    labels : list of new axis labels
    shape : new shape
    ref_items : new ref_items

    Returns
    -------
    The block built by ``_block2d_to_blocknd`` from the transposed
    values of this block, keeping this block's ``mgr_locs`` as the
    placement.
    """
    # the pivot helper expects the transposed values of the block
    transposed = self.get_values().T
    locs = self.mgr_locs

    return _block2d_to_blocknd(values=transposed,
                               placement=locs,
                               shape=shape,
                               labels=labels,
                               ref_items=ref_items)

def getitem_block(self, slicer, new_mgr_locs=None):
"""
Perform __getitem__-like, return result as block.
Expand Down Expand Up @@ -2573,6 +2591,10 @@ def comp(s):
bm._consolidate_inplace()
return bm

def reshape_nd(self, axes, **kwargs):
    """
    A 2d-nd reshape operation on a BlockManager.

    Forwards to ``self.apply`` with the method name ``'reshape_nd'``,
    passing ``axes`` and every additional keyword argument through
    unchanged, and returns whatever ``apply`` returns.
    """
    reshaped = self.apply('reshape_nd', axes=axes, **kwargs)
    return reshaped

def is_consolidated(self):
"""
Return True if more than one block with the same dtype
Expand Down Expand Up @@ -3895,6 +3917,43 @@ def _concat_indexes(indexes):
return indexes[0].append(indexes[1:])


def _block2d_to_blocknd(values, placement, shape, labels, ref_items):
    """
    Pivot 2d values to the shape described by the labels.

    Parameters
    ----------
    values : 2d array; column ``i`` holds the observations written
        into item ``i`` of the result
    placement : placement of the resulting block; its length gives the
        number of items
    shape : tuple, shape of the labelled axes
    labels : list of label arrays, handed to ``_factor_indexer``
    ref_items : accepted but not used by this function

    Returns
    -------
    The block created by ``make_block`` from the pivoted values.
    """
    from pandas.core.internals import make_block

    n_items = len(placement)
    panel_shape = (n_items,) + shape

    # TODO: lexsort depth needs to be 2!!

    # Flat positions (within one item's plane) of the observed
    # label combinations, turned into a boolean selection vector.
    selector = _factor_indexer(shape[1:], labels)
    mask = np.zeros(np.prod(shape), dtype=bool)
    mask.put(selector, True)

    if not mask.all():
        # some cells are unobserved: promote the dtype so that they can
        # hold the fill value, and pre-fill the whole result with it
        dtype, fill_value = _maybe_promote(values.dtype)
        pvalues = np.empty(panel_shape, dtype=dtype)
        pvalues.fill(fill_value)
    else:
        # every cell is observed, the original dtype can be kept
        pvalues = np.empty(panel_shape, dtype=values.dtype)

    for item_pos in range(n_items):
        pvalues[item_pos].flat[mask] = values[:, item_pos]

    return make_block(pvalues, placement=placement)


def _factor_indexer(shape, labels):
    """
    Given a tuple of shape and a list of Categorical labels, return the
    expanded label indexer.

    Each label array is weighted by the reversed cumulative product of
    ``shape`` (with a trailing weight of 1) and the weighted labels are
    summed per observation; the sum is passed through
    ``com._ensure_platform_int``.
    """
    mult = np.array(shape)[::-1].cumprod()[::-1]

    # one row per observation, one column per label array
    stacked = np.array(labels).T
    weights = np.append(mult, [1])

    flat_index = np.sum(stacked * weights, axis=1).T
    return com._ensure_platform_int(flat_index)

def _get_blkno_placements(blknos, blk_count, group=True):
"""

Expand Down
48 changes: 11 additions & 37 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,12 @@ class _Unstacker(object):
"""

def __init__(self, values, index, level=-1, value_columns=None):

self.is_categorical = None
if values.ndim == 1:
if isinstance(values, Categorical):
self.is_categorical = values
values = np.array(values)
values = values[:, np.newaxis]
self.values = values
self.value_columns = value_columns
Expand Down Expand Up @@ -175,6 +180,12 @@ def get_result(self):
else:
index = index.take(self.unique_groups)

# may need to coerce categoricals here
if self.is_categorical is not None:
values = [ Categorical.from_array(values[:,i],
categories=self.is_categorical.categories)
for i in range(values.shape[-1]) ]

return DataFrame(values, index=index, columns=columns)

def get_new_values(self):
Expand Down Expand Up @@ -1188,40 +1199,3 @@ def make_axis_dummies(frame, axis='minor', transform=None):
values = values.take(labels, axis=0)

return DataFrame(values, columns=items, index=frame.index)


def block2d_to_blocknd(values, placement, shape, labels, ref_items):
    """
    Pivot 2d values to the shape described by the labels.

    Parameters
    ----------
    values : 2d array; column ``i`` holds the observations written
        into item ``i`` of the result
    placement : placement of the resulting block; its length gives the
        number of items
    shape : tuple, shape of the labelled axes
    labels : list of label arrays, handed to ``factor_indexer``
    ref_items : accepted but not used by this function

    Returns
    -------
    The block created by ``make_block`` from the pivoted values.
    """
    from pandas.core.internals import make_block

    item_count = len(placement)
    panel_shape = (item_count,) + shape

    # TODO: lexsort depth needs to be 2!!

    # Build the observation selection vector from the major and minor
    # labels, for converting to panel format.
    selector = factor_indexer(shape[1:], labels)
    mask = np.zeros(np.prod(shape), dtype=bool)
    mask.put(selector, True)

    if not mask.all():
        # unobserved cells exist: promote the dtype and pre-fill with
        # the matching fill value
        dtype, fill_value = _maybe_promote(values.dtype)
        pvalues = np.empty(panel_shape, dtype=dtype)
        pvalues.fill(fill_value)
    else:
        pvalues = np.empty(panel_shape, dtype=values.dtype)

    for pos in range(item_count):
        pvalues[pos].flat[mask] = values[:, pos]

    return make_block(pvalues, placement=placement)


def factor_indexer(shape, labels):
    """
    Given a tuple of shape and a list of Categorical labels, return the
    expanded label indexer.

    The label arrays are weighted by the reversed cumulative product of
    ``shape`` (plus a trailing weight of 1), summed per observation and
    passed through ``com._ensure_platform_int``.
    """
    mult = np.array(shape)[::-1].cumprod()[::-1]

    label_matrix = np.array(labels).T
    multipliers = np.append(mult, [1])

    expanded = np.sum(label_matrix * multipliers, axis=1).T
    return com._ensure_platform_int(expanded)
9 changes: 4 additions & 5 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@
from pandas.core.algorithms import match, unique
from pandas.core.categorical import Categorical
from pandas.core.common import _asarray_tuplesafe
from pandas.core.internals import BlockManager, make_block
from pandas.core.reshape import block2d_to_blocknd, factor_indexer
from pandas.core.internals import BlockManager, make_block, _block2d_to_blocknd, _factor_indexer
from pandas.core.index import _ensure_index
from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type
import pandas.core.common as com
Expand Down Expand Up @@ -332,7 +331,7 @@ def read_hdf(path_or_buf, key, **kwargs):
key, auto_close=auto_close, **kwargs)

if isinstance(path_or_buf, string_types):

try:
exists = os.path.exists(path_or_buf)

Expand Down Expand Up @@ -3537,7 +3536,7 @@ def read(self, where=None, columns=None, **kwargs):
labels = [f.codes for f in factors]

# compute the key
key = factor_indexer(N[1:], labels)
key = _factor_indexer(N[1:], labels)

objs = []
if len(unique(key)) == len(key):
Expand All @@ -3556,7 +3555,7 @@ def read(self, where=None, columns=None, **kwargs):

take_labels = [l.take(sorter) for l in labels]
items = Index(c.values)
block = block2d_to_blocknd(
block = _block2d_to_blocknd(
values=sorted_values, placement=np.arange(len(items)),
shape=tuple(N), labels=take_labels, ref_items=items)

Expand Down
35 changes: 31 additions & 4 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1121,18 +1121,45 @@ def test_construction_frame(self):
expected = Series(list('abc'),dtype='category')
tm.assert_series_equal(df[0],expected)

# these coerce back to object as they are spread across columns

# ndim != 1
df = DataFrame([pd.Categorical(list('abc'))])
expected = DataFrame([list('abc')])
expected = DataFrame({ 0 : Series(list('abc'),dtype='category')})
tm.assert_frame_equal(df,expected)

df = DataFrame([pd.Categorical(list('abc')),pd.Categorical(list('abd'))])
expected = DataFrame({ 0 : Series(list('abc'),dtype='category'),
1 : Series(list('abd'),dtype='category')},columns=[0,1])
tm.assert_frame_equal(df,expected)

# mixed
df = DataFrame([pd.Categorical(list('abc')),list('def')])
expected = DataFrame([list('abc'),list('def')])
expected = DataFrame({ 0 : Series(list('abc'),dtype='category'),
1 : list('def')},columns=[0,1])
tm.assert_frame_equal(df,expected)

# invalid (shape)
self.assertRaises(ValueError, lambda : DataFrame([pd.Categorical(list('abc')),pd.Categorical(list('abdefg'))]))

# ndim > 1
self.assertRaises(NotImplementedError, lambda : pd.Categorical(np.array([list('abcd')])))

def test_reshaping(self):
    """
    Unstacking a categorical column taken from ``Panel.to_frame()``
    must give a frame whose columns are all Categoricals.
    """
    panel = tm.makePanel()
    panel['str'] = 'foo'
    frame = panel.to_frame()
    frame['category'] = frame['str'].astype('category')
    result = frame['category'].unstack()

    # every unstacked column is expected to hold the same categorical
    cat = Categorical(['foo'] * len(panel.major_axis))
    minor_labels = list('ABCD')
    data = dict((label, cat.copy()) for label in minor_labels)
    expected = DataFrame(data,
                         columns=Index(minor_labels, name='minor'),
                         index=panel.major_axis.set_names('major'))
    tm.assert_frame_equal(result, expected)

def test_reindex(self):

index = pd.date_range('20000101', periods=3)
Expand Down
Loading