Skip to content

Commit 9d8b3a1

Browse files
committed
Merge pull request #8705 from jreback/to_panel
ENH/BUG: support Categorical in to_panel reshaping (GH8704)
2 parents daa7265 + 6d1945d commit 9d8b3a1

File tree

7 files changed

+151
-69
lines changed

7 files changed

+151
-69
lines changed

pandas/core/categorical.py

+14-5
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from pandas.core.indexing import _is_null_slice
1414
from pandas.tseries.period import PeriodIndex
1515
import pandas.core.common as com
16+
from pandas.util.decorators import cache_readonly
1617

1718
from pandas.core.common import isnull
1819
from pandas.util.terminal import get_terminal_size
@@ -174,9 +175,6 @@ class Categorical(PandasObject):
174175
>>> a.min()
175176
'c'
176177
"""
177-
ndim = 1
178-
"""Number of dimensions (always 1!)"""
179-
180178
dtype = com.CategoricalDtype()
181179
"""The dtype (always "category")"""
182180

@@ -256,6 +254,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
256254
dtype = 'object' if isnull(values).any() else None
257255
values = _sanitize_array(values, None, dtype=dtype)
258256

257+
259258
if categories is None:
260259
try:
261260
codes, categories = factorize(values, sort=True)
@@ -270,6 +269,11 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
270269
# give us one by specifying categories
271270
raise TypeError("'values' is not ordered, please explicitly specify the "
272271
"categories order by passing in a categories argument.")
272+
except ValueError:
273+
274+
### FIXME ####
275+
raise NotImplementedError("> 1 ndim Categorical are not supported at this time")
276+
273277
else:
274278
# there were two ways if categories are present
275279
# - the old one, where each value is an int pointer to the levels array -> not anymore
@@ -305,8 +309,13 @@ def copy(self):
305309
return Categorical(values=self._codes.copy(),categories=self.categories,
306310
name=self.name, ordered=self.ordered, fastpath=True)
307311

312+
@cache_readonly
313+
def ndim(self):
314+
"""Number of dimensions of the Categorical """
315+
return self._codes.ndim
316+
308317
@classmethod
309-
def from_array(cls, data):
318+
def from_array(cls, data, **kwargs):
310319
"""
311320
Make a Categorical type from a single array-like object.
312321
@@ -318,7 +327,7 @@ def from_array(cls, data):
318327
Can be an Index or array-like. The categories are assumed to be
319328
the unique values of `data`.
320329
"""
321-
return Categorical(data)
330+
return Categorical(data, **kwargs)
322331

323332
@classmethod
324333
def from_codes(cls, codes, categories, ordered=False, name=None):

pandas/core/frame.py

+19-17
Original file line numberDiff line numberDiff line change
@@ -241,15 +241,19 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
241241
if isinstance(data, types.GeneratorType):
242242
data = list(data)
243243
if len(data) > 0:
244-
if index is None and isinstance(data[0], Series):
245-
index = _get_names_from_index(data)
246-
247244
if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
248245
arrays, columns = _to_arrays(data, columns, dtype=dtype)
249246
columns = _ensure_index(columns)
250247

248+
# set the index
251249
if index is None:
252-
index = _default_index(len(data))
250+
if isinstance(data[0], Series):
251+
index = _get_names_from_index(data)
252+
elif isinstance(data[0], Categorical):
253+
index = _default_index(len(data[0]))
254+
else:
255+
index = _default_index(len(data))
256+
253257
mgr = _arrays_to_mgr(arrays, columns, index, columns,
254258
dtype=dtype)
255259
else:
@@ -1053,7 +1057,6 @@ def to_panel(self):
10531057
panel : Panel
10541058
"""
10551059
from pandas.core.panel import Panel
1056-
from pandas.core.reshape import block2d_to_blocknd
10571060

10581061
# only support this kind for now
10591062
if (not isinstance(self.index, MultiIndex) or # pragma: no cover
@@ -1073,29 +1076,24 @@ def to_panel(self):
10731076
selfsorted = self
10741077

10751078
major_axis, minor_axis = selfsorted.index.levels
1076-
10771079
major_labels, minor_labels = selfsorted.index.labels
1078-
10791080
shape = len(major_axis), len(minor_axis)
10801081

1081-
new_blocks = []
1082-
for block in selfsorted._data.blocks:
1083-
newb = block2d_to_blocknd(
1084-
values=block.values.T,
1085-
placement=block.mgr_locs, shape=shape,
1086-
labels=[major_labels, minor_labels],
1087-
ref_items=selfsorted.columns)
1088-
new_blocks.append(newb)
1089-
10901082
# preserve names, if any
10911083
major_axis = major_axis.copy()
10921084
major_axis.name = self.index.names[0]
10931085

10941086
minor_axis = minor_axis.copy()
10951087
minor_axis.name = self.index.names[1]
10961088

1089+
# create new axes
10971090
new_axes = [selfsorted.columns, major_axis, minor_axis]
1098-
new_mgr = create_block_manager_from_blocks(new_blocks, new_axes)
1091+
1092+
# create new manager
1093+
new_mgr = selfsorted._data.reshape_nd(axes=new_axes,
1094+
labels=[major_labels, minor_labels],
1095+
shape=shape,
1096+
ref_items=selfsorted.columns)
10991097

11001098
return Panel(new_mgr)
11011099

@@ -4808,6 +4806,10 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None):
48084806
return _list_of_series_to_arrays(data, columns,
48094807
coerce_float=coerce_float,
48104808
dtype=dtype)
4809+
elif isinstance(data[0], Categorical):
4810+
if columns is None:
4811+
columns = _default_index(len(data))
4812+
return data, columns
48114813
elif (isinstance(data, (np.ndarray, Series, Index))
48124814
and data.dtype.names is not None):
48134815

pandas/core/internals.py

+60-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from pandas.core.common import (_possibly_downcast_to_dtype, isnull,
1212
_NS_DTYPE, _TD_DTYPE, ABCSeries, is_list_like,
1313
ABCSparseSeries, _infer_dtype_from_scalar,
14-
_is_null_datelike_scalar,
14+
_is_null_datelike_scalar, _maybe_promote,
1515
is_timedelta64_dtype, is_datetime64_dtype,
1616
_possibly_infer_to_datetimelike, array_equivalent)
1717
from pandas.core.index import Index, MultiIndex, _ensure_index
@@ -177,6 +177,24 @@ def _slice(self, slicer):
177177
""" return a slice of my values """
178178
return self.values[slicer]
179179

180+
def reshape_nd(self, labels, shape, ref_items):
181+
"""
182+
Parameters
183+
----------
184+
labels : list of new axis labels
185+
shape : new shape
186+
ref_items : new ref_items
187+
188+
return a new block that is transformed to an nd block
189+
"""
190+
191+
return _block2d_to_blocknd(
192+
values=self.get_values().T,
193+
placement=self.mgr_locs,
194+
shape=shape,
195+
labels=labels,
196+
ref_items=ref_items)
197+
180198
def getitem_block(self, slicer, new_mgr_locs=None):
181199
"""
182200
Perform __getitem__-like, return result as block.
@@ -2573,6 +2591,10 @@ def comp(s):
25732591
bm._consolidate_inplace()
25742592
return bm
25752593

2594+
def reshape_nd(self, axes, **kwargs):
2595+
""" a 2d-nd reshape operation on a BlockManager """
2596+
return self.apply('reshape_nd', axes=axes, **kwargs)
2597+
25762598
def is_consolidated(self):
25772599
"""
25782600
Return True if more than one block with the same dtype
@@ -3895,6 +3917,43 @@ def _concat_indexes(indexes):
38953917
return indexes[0].append(indexes[1:])
38963918

38973919

3920+
def _block2d_to_blocknd(values, placement, shape, labels, ref_items):
3921+
""" pivot to the labels shape """
3922+
from pandas.core.internals import make_block
3923+
3924+
panel_shape = (len(placement),) + shape
3925+
3926+
# TODO: lexsort depth needs to be 2!!
3927+
3928+
# Create observation selection vector using major and minor
3929+
# labels, for converting to panel format.
3930+
selector = _factor_indexer(shape[1:], labels)
3931+
mask = np.zeros(np.prod(shape), dtype=bool)
3932+
mask.put(selector, True)
3933+
3934+
if mask.all():
3935+
pvalues = np.empty(panel_shape, dtype=values.dtype)
3936+
else:
3937+
dtype, fill_value = _maybe_promote(values.dtype)
3938+
pvalues = np.empty(panel_shape, dtype=dtype)
3939+
pvalues.fill(fill_value)
3940+
3941+
values = values
3942+
for i in range(len(placement)):
3943+
pvalues[i].flat[mask] = values[:, i]
3944+
3945+
return make_block(pvalues, placement=placement)
3946+
3947+
3948+
def _factor_indexer(shape, labels):
3949+
"""
3950+
given a tuple of shape and a list of Categorical labels, return the
3951+
expanded label indexer
3952+
"""
3953+
mult = np.array(shape)[::-1].cumprod()[::-1]
3954+
return com._ensure_platform_int(
3955+
np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T)
3956+
38983957
def _get_blkno_placements(blknos, blk_count, group=True):
38993958
"""
39003959

pandas/core/reshape.py

+11-37
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,12 @@ class _Unstacker(object):
5959
"""
6060

6161
def __init__(self, values, index, level=-1, value_columns=None):
62+
63+
self.is_categorical = None
6264
if values.ndim == 1:
65+
if isinstance(values, Categorical):
66+
self.is_categorical = values
67+
values = np.array(values)
6368
values = values[:, np.newaxis]
6469
self.values = values
6570
self.value_columns = value_columns
@@ -175,6 +180,12 @@ def get_result(self):
175180
else:
176181
index = index.take(self.unique_groups)
177182

183+
# may need to coerce categoricals here
184+
if self.is_categorical is not None:
185+
values = [ Categorical.from_array(values[:,i],
186+
categories=self.is_categorical.categories)
187+
for i in range(values.shape[-1]) ]
188+
178189
return DataFrame(values, index=index, columns=columns)
179190

180191
def get_new_values(self):
@@ -1188,40 +1199,3 @@ def make_axis_dummies(frame, axis='minor', transform=None):
11881199
values = values.take(labels, axis=0)
11891200

11901201
return DataFrame(values, columns=items, index=frame.index)
1191-
1192-
1193-
def block2d_to_blocknd(values, placement, shape, labels, ref_items):
1194-
""" pivot to the labels shape """
1195-
from pandas.core.internals import make_block
1196-
1197-
panel_shape = (len(placement),) + shape
1198-
1199-
# TODO: lexsort depth needs to be 2!!
1200-
1201-
# Create observation selection vector using major and minor
1202-
# labels, for converting to panel format.
1203-
selector = factor_indexer(shape[1:], labels)
1204-
mask = np.zeros(np.prod(shape), dtype=bool)
1205-
mask.put(selector, True)
1206-
1207-
if mask.all():
1208-
pvalues = np.empty(panel_shape, dtype=values.dtype)
1209-
else:
1210-
dtype, fill_value = _maybe_promote(values.dtype)
1211-
pvalues = np.empty(panel_shape, dtype=dtype)
1212-
pvalues.fill(fill_value)
1213-
1214-
values = values
1215-
for i in range(len(placement)):
1216-
pvalues[i].flat[mask] = values[:, i]
1217-
1218-
return make_block(pvalues, placement=placement)
1219-
1220-
1221-
def factor_indexer(shape, labels):
1222-
""" given a tuple of shape and a list of Categorical labels, return the
1223-
expanded label indexer
1224-
"""
1225-
mult = np.array(shape)[::-1].cumprod()[::-1]
1226-
return com._ensure_platform_int(
1227-
np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T)

pandas/io/pytables.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,7 @@
2323
from pandas.core.algorithms import match, unique
2424
from pandas.core.categorical import Categorical
2525
from pandas.core.common import _asarray_tuplesafe
26-
from pandas.core.internals import BlockManager, make_block
27-
from pandas.core.reshape import block2d_to_blocknd, factor_indexer
26+
from pandas.core.internals import BlockManager, make_block, _block2d_to_blocknd, _factor_indexer
2827
from pandas.core.index import _ensure_index
2928
from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type
3029
import pandas.core.common as com
@@ -332,7 +331,7 @@ def read_hdf(path_or_buf, key, **kwargs):
332331
key, auto_close=auto_close, **kwargs)
333332

334333
if isinstance(path_or_buf, string_types):
335-
334+
336335
try:
337336
exists = os.path.exists(path_or_buf)
338337

@@ -3537,7 +3536,7 @@ def read(self, where=None, columns=None, **kwargs):
35373536
labels = [f.codes for f in factors]
35383537

35393538
# compute the key
3540-
key = factor_indexer(N[1:], labels)
3539+
key = _factor_indexer(N[1:], labels)
35413540

35423541
objs = []
35433542
if len(unique(key)) == len(key):
@@ -3556,7 +3555,7 @@ def read(self, where=None, columns=None, **kwargs):
35563555

35573556
take_labels = [l.take(sorter) for l in labels]
35583557
items = Index(c.values)
3559-
block = block2d_to_blocknd(
3558+
block = _block2d_to_blocknd(
35603559
values=sorted_values, placement=np.arange(len(items)),
35613560
shape=tuple(N), labels=take_labels, ref_items=items)
35623561

pandas/tests/test_categorical.py

+31-4
Original file line numberDiff line numberDiff line change
@@ -1121,18 +1121,45 @@ def test_construction_frame(self):
11211121
expected = Series(list('abc'),dtype='category')
11221122
tm.assert_series_equal(df[0],expected)
11231123

1124-
# these coerce back to object as it's spread across columns
1125-
11261124
# ndim != 1
11271125
df = DataFrame([pd.Categorical(list('abc'))])
1128-
expected = DataFrame([list('abc')])
1126+
expected = DataFrame({ 0 : Series(list('abc'),dtype='category')})
1127+
tm.assert_frame_equal(df,expected)
1128+
1129+
df = DataFrame([pd.Categorical(list('abc')),pd.Categorical(list('abd'))])
1130+
expected = DataFrame({ 0 : Series(list('abc'),dtype='category'),
1131+
1 : Series(list('abd'),dtype='category')},columns=[0,1])
11291132
tm.assert_frame_equal(df,expected)
11301133

11311134
# mixed
11321135
df = DataFrame([pd.Categorical(list('abc')),list('def')])
1133-
expected = DataFrame([list('abc'),list('def')])
1136+
expected = DataFrame({ 0 : Series(list('abc'),dtype='category'),
1137+
1 : list('def')},columns=[0,1])
11341138
tm.assert_frame_equal(df,expected)
11351139

1140+
# invalid (shape)
1141+
self.assertRaises(ValueError, lambda : DataFrame([pd.Categorical(list('abc')),pd.Categorical(list('abdefg'))]))
1142+
1143+
# ndim > 1
1144+
self.assertRaises(NotImplementedError, lambda : pd.Categorical(np.array([list('abcd')])))
1145+
1146+
def test_reshaping(self):
1147+
1148+
p = tm.makePanel()
1149+
p['str'] = 'foo'
1150+
df = p.to_frame()
1151+
df['category'] = df['str'].astype('category')
1152+
result = df['category'].unstack()
1153+
1154+
c = Categorical(['foo']*len(p.major_axis))
1155+
expected = DataFrame({'A' : c.copy(),
1156+
'B' : c.copy(),
1157+
'C' : c.copy(),
1158+
'D' : c.copy()},
1159+
columns=Index(list('ABCD'),name='minor'),
1160+
index=p.major_axis.set_names('major'))
1161+
tm.assert_frame_equal(result, expected)
1162+
11361163
def test_reindex(self):
11371164

11381165
index = pd.date_range('20000101', periods=3)

0 commit comments

Comments
 (0)