diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index cc2269afa6e61..182562f055164 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -78,6 +78,7 @@ Other Enhancements - ``pd.read_csv()`` now supports opening ZIP files that contains a single CSV, via extension inference or explict ``compression='zip'`` (:issue:`12175`) - ``pd.read_csv()`` now supports opening files using xz compression, via extension inference or explicit ``compression='xz'`` is specified; ``xz`` compressions is also supported by ``DataFrame.to_csv`` in the same way (:issue:`11852`) - ``pd.read_msgpack()`` now always gives writeable ndarrays even when compression is used (:issue:`12359`). +- ``to_msgpack()`` and ``pd.read_msgpack()`` now support serializing and de-serializing categoricals (:issue:`12573`) - ``interpolate()`` now supports ``method='akima'`` (:issue:`7588`). - ``Index.take`` now handles ``allow_fill`` and ``fill_value`` consistently (:issue:`12631`) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index d1cef04121fbb..f009793172e31 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -49,13 +49,15 @@ from pandas.compat import u, u_safe from pandas import (Timestamp, Period, Series, DataFrame, # noqa Index, MultiIndex, Float64Index, Int64Index, - Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT) + Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT, + Categorical) from pandas.tslib import NaTType from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel from pandas.sparse.array import BlockIndex, IntIndex from pandas.core.generic import NDFrame from pandas.core.common import ( PerformanceWarning, + is_categorical_dtype, needs_i8_conversion, pandas_dtype, ) @@ -226,6 +228,7 @@ def read(fh): # this is platform int, which we need to remap to np.int64 # for compat on windows platforms 7: np.dtype('int64'), + 'category': 'category' } @@ -262,6 +265,9 @@ def convert(values): v = values.ravel() # 
convert object + if is_categorical_dtype(values): + return values + if dtype == np.object_: return v.tolist() @@ -298,6 +304,9 @@ def unconvert(values, dtype, compress=None): if as_is_ext: values = values.data + if is_categorical_dtype(dtype): + return values + if dtype == np.object_: return np.array(values, dtype=object) @@ -393,6 +402,16 @@ def encode(obj): u'dtype': u(obj.dtype.name), u'data': convert(obj.values), u'compress': compressor} + + elif isinstance(obj, Categorical): + return {u'typ': u'category', + u'klass': u(obj.__class__.__name__), + u'name': getattr(obj, 'name', None), + u'codes': obj.codes, + u'categories': obj.categories, + u'ordered': obj.ordered, + u'compress': compressor} + elif isinstance(obj, Series): if isinstance(obj, SparseSeries): raise NotImplementedError( @@ -576,10 +595,18 @@ def decode(obj): result = result.tz_localize('UTC').tz_convert(tz) return result + elif typ == u'category': + from_codes = globals()[obj[u'klass']].from_codes + return from_codes(codes=obj[u'codes'], + categories=obj[u'categories'], + ordered=obj[u'ordered'], + name=obj[u'name']) + elif typ == u'series': dtype = dtype_for(obj[u'dtype']) pd_dtype = pandas_dtype(dtype) np_dtype = pandas_dtype(dtype).base + index = obj[u'index'] result = globals()[obj[u'klass']](unconvert(obj[u'data'], dtype, obj[u'compress']), diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 276763989d7cf..4c0e71698ad79 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -9,13 +9,15 @@ from pandas import compat from pandas.compat import u from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, - date_range, period_range, Index) + date_range, period_range, Index, Categorical) from pandas.core.common import PerformanceWarning from pandas.io.packers import to_msgpack, read_msgpack import pandas.util.testing as tm -from pandas.util.testing import (ensure_clean, assert_index_equal, - assert_series_equal, +from 
pandas.util.testing import (ensure_clean, + assert_categorical_equal, assert_frame_equal, + assert_index_equal, + assert_series_equal, patch) from pandas.tests.test_panel import assert_panel_equal @@ -335,7 +337,7 @@ def setUp(self): 'E': [0., 1, Timestamp('20100101'), 'foo', 2.], 'F': [Timestamp('20130102', tz='US/Eastern')] * 2 + [Timestamp('20130603', tz='CET')] * 3, - 'G': [Timestamp('20130102', tz='US/Eastern')] * 5 + 'G': [Timestamp('20130102', tz='US/Eastern')] * 5, } self.d['float'] = Series(data['A']) @@ -353,6 +355,29 @@ def test_basic(self): assert_series_equal(i, i_rec) +class TestCategorical(TestPackers): + + def setUp(self): + super(TestCategorical, self).setUp() + + self.d = {} + + self.d['plain_str'] = Categorical(['a', 'b', 'c', 'd', 'e']) + self.d['plain_str_ordered'] = Categorical(['a', 'b', 'c', 'd', 'e'], + ordered=True) + + self.d['plain_int'] = Categorical([5, 6, 7, 8]) + self.d['plain_int_ordered'] = Categorical([5, 6, 7, 8], ordered=True) + + def test_basic(self): + + # run multiple times here + for n in range(10): + for s, i in self.d.items(): + i_rec = self.encode_decode(i) + assert_categorical_equal(i, i_rec) + + class TestNDFrame(TestPackers): def setUp(self): @@ -365,7 +390,9 @@ def setUp(self): 'D': date_range('1/1/2009', periods=5), 'E': [0., 1, Timestamp('20100101'), 'foo', 2.], 'F': [Timestamp('20130102', tz='US/Eastern')] * 5, - 'G': [Timestamp('20130603', tz='CET')] * 5 + 'G': [Timestamp('20130603', tz='CET')] * 5, + 'H': Categorical(['a', 'b', 'c', 'd', 'e']), + 'I': Categorical(['a', 'b', 'c', 'd', 'e'], ordered=True), } self.frame = {