From adf2afc1913383462fa31057939031ba328dd3aa Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 31 Jan 2016 19:44:46 -0600 Subject: [PATCH] ENH: allow categoricals in msgpack --- pandas/io/packers.py | 29 ++++++++++++++++++++++++++--- pandas/io/tests/test_packers.py | 7 +++++-- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index a16f3600736b8..fc81acdbefd08 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -47,11 +47,12 @@ from pandas.compat import u from pandas import (Timestamp, Period, Series, DataFrame, # noqa Index, MultiIndex, Float64Index, Int64Index, - Panel, RangeIndex, PeriodIndex, DatetimeIndex) + Panel, RangeIndex, PeriodIndex, DatetimeIndex, + Categorical) from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel from pandas.sparse.array import BlockIndex, IntIndex from pandas.core.generic import NDFrame -from pandas.core.common import needs_i8_conversion +from pandas.core.common import needs_i8_conversion, is_categorical_dtype from pandas.io.common import get_filepath_or_buffer from pandas.core.internals import BlockManager, make_block import pandas.core.internals as internals @@ -170,6 +171,7 @@ def read(fh): # this is platform int, which we need to remap to np.int64 # for compat on windows platforms 7: np.dtype('int64'), + 'category': 'category' } @@ -209,6 +211,14 @@ def convert(values): if dtype == np.object_: return v.tolist() + if is_categorical_dtype(dtype): + return { + 'codes': {'dtype': values.codes.dtype.name, + 'data': convert(values.codes)}, + 'categories': {'dtype': values.categories.dtype.name, + 'data': convert(values.categories.values)} + } + if compressor == 'zlib': # return string arrays like they are @@ -242,6 +252,15 @@ def unconvert(values, dtype, compress=None): if as_is_ext: values = values.data + if is_categorical_dtype(dtype): + return Categorical.from_codes( + unconvert(values['codes']['data'], + dtype_for(values['codes']['dtype']), + compress=compress), + unconvert(values['categories']['data'], + dtype_for(values['categories']['dtype']), + compress=compress)) + if dtype == np.object_: return np.array(values, dtype=object) @@ -495,11 +514,15 @@ def decode(obj): elif typ == 'series': dtype = dtype_for(obj['dtype']) + ctor_dtype = dtype + if is_categorical_dtype(dtype): + # Series ctor doesn't take dtype with categorical + ctor_dtype = None index = obj['index'] return globals()[obj['klass']](unconvert(obj['data'], dtype, obj['compress']), index=index, - dtype=dtype, + dtype=ctor_dtype, name=obj['name']) elif typ == 'block_manager': axes = obj['axes'] diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 6905225600ae6..9d10732aebe17 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -9,7 +9,7 @@ from pandas import compat from pandas.compat import u from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, - date_range, period_range, Index) + date_range, period_range, Index, Categorical) from pandas.io.packers import to_msgpack, read_msgpack import pandas.util.testing as tm from pandas.util.testing import (ensure_clean, assert_index_equal, @@ -330,11 +330,13 @@ def setUp(self): 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], 'D': date_range('1/1/2009', periods=5), 'E': [0., 1, Timestamp('20100101'), 'foo', 2.], + 'F': Categorical(['a', 'b', 'c', 'd', 'e']) } self.d['float'] = Series(data['A']) self.d['int'] = Series(data['B']) self.d['mixed'] = Series(data['E']) + self.d['categorical'] = Series(data['F']) def test_basic(self): @@ -356,13 +358,14 @@ def setUp(self): 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], 'D': date_range('1/1/2009', periods=5), 'E': [0., 1, Timestamp('20100101'), 'foo', 2.], + 'F': Categorical(['a', 'b', 'c', 'd', 'e']) } self.frame = { 'float': DataFrame(dict(A=data['A'], B=Series(data['A']) + 1)), 'int': DataFrame(dict(A=data['B'], B=Series(data['B']) + 1)), 'mixed': DataFrame(dict([(k, data[k]) - for k in ['A', 'B', 'C', 'D']]))} + for k in ['A', 'B', 'C', 'D', 'F']]))} self.panel = { 'float': Panel(dict(ItemA=self.frame['float'],