ENH: allow categoricals in msgpack

pwaller · pwaller · commit ff87282c4669 · 2016-04-25T16:34:43.000+01:00
DOC: support for categoricals in read_msgpack

Add TestCategorical test cases

Add Categorical ordered=True ndframe test
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
@@ -78,6 +78,7 @@ Other Enhancements
 - ``pd.read_csv()`` now supports opening ZIP files that contains a single CSV, via extension inference or explict ``compression='zip'`` (:issue:`12175`)
 - ``pd.read_csv()`` now supports opening files using xz compression, via extension inference or explicit ``compression='xz'`` is specified; ``xz`` compressions is also supported by ``DataFrame.to_csv`` in the same way (:issue:`11852`)
 - ``pd.read_msgpack()`` now always gives writeable ndarrays even when compression is used (:issue:`12359`).
+- ``pd.to_msgpack()`` and ``pd.read_msgpack()`` now support serializing and de-serializing categoricals with msgpack (:issue:`12573`)
 - ``interpolate()`` now supports ``method='akima'`` (:issue:`7588`).
 - ``Index.take`` now handles ``allow_fill`` and ``fill_value`` consistently (:issue:`12631`)
 
diff --git a/pandas/io/packers.py b/pandas/io/packers.py
@@ -49,13 +49,15 @@
 from pandas.compat import u, u_safe
 from pandas import (Timestamp, Period, Series, DataFrame,  # noqa
                     Index, MultiIndex, Float64Index, Int64Index,
-                    Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT)
+                    Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT,
+                    Categorical)
 from pandas.tslib import NaTType
 from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel
 from pandas.sparse.array import BlockIndex, IntIndex
 from pandas.core.generic import NDFrame
 from pandas.core.common import (
     PerformanceWarning,
+    is_categorical_dtype,
     needs_i8_conversion,
     pandas_dtype,
 )
@@ -226,6 +228,7 @@ def read(fh):
               # this is platform int, which we need to remap to np.int64
               # for compat on windows platforms
               7: np.dtype('int64'),
+              'category': 'category'
               }
 
 
@@ -262,6 +265,9 @@ def convert(values):
     v = values.ravel()
 
     # convert object
+    if is_categorical_dtype(values):
+        return values
+
     if dtype == np.object_:
         return v.tolist()
 
@@ -298,6 +304,9 @@ def unconvert(values, dtype, compress=None):
     if as_is_ext:
         values = values.data
 
+    if is_categorical_dtype(dtype):
+        return values
+
     if dtype == np.object_:
         return np.array(values, dtype=object)
 
@@ -393,6 +402,16 @@ def encode(obj):
                     u'dtype': u(obj.dtype.name),
                     u'data': convert(obj.values),
                     u'compress': compressor}
+
+    elif isinstance(obj, Categorical):
+        return {u'typ': u'category',
+                u'klass': u(obj.__class__.__name__),
+                u'name': getattr(obj, 'name', None),
+                u'codes': obj.codes,
+                u'categories': obj.categories,
+                u'ordered': obj.ordered,
+                u'compress': compressor}
+
     elif isinstance(obj, Series):
         if isinstance(obj, SparseSeries):
             raise NotImplementedError(
@@ -576,10 +595,18 @@ def decode(obj):
             result = result.tz_localize('UTC').tz_convert(tz)
         return result
 
+    elif typ == u'category':
+        from_codes = globals()[obj[u'klass']].from_codes
+        return from_codes(codes=obj[u'codes'],
+                          categories=obj[u'categories'],
+                          ordered=obj[u'ordered'],
+                          name=obj[u'name'])
+
     elif typ == u'series':
         dtype = dtype_for(obj[u'dtype'])
         pd_dtype = pandas_dtype(dtype)
         np_dtype = pandas_dtype(dtype).base
+
         index = obj[u'index']
         result = globals()[obj[u'klass']](unconvert(obj[u'data'], dtype,
                                                     obj[u'compress']),
diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py
@@ -9,13 +9,15 @@
 from pandas import compat
 from pandas.compat import u
 from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range,
-                    date_range, period_range, Index)
+                    date_range, period_range, Index, Categorical)
 from pandas.core.common import PerformanceWarning
 from pandas.io.packers import to_msgpack, read_msgpack
 import pandas.util.testing as tm
-from pandas.util.testing import (ensure_clean, assert_index_equal,
-                                 assert_series_equal,
+from pandas.util.testing import (ensure_clean,
+                                 assert_categorical_equal,
                                  assert_frame_equal,
+                                 assert_index_equal,
+                                 assert_series_equal,
                                  patch)
 from pandas.tests.test_panel import assert_panel_equal
 
@@ -335,7 +337,7 @@ def setUp(self):
             'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
             'F': [Timestamp('20130102', tz='US/Eastern')] * 2 +
                  [Timestamp('20130603', tz='CET')] * 3,
-            'G': [Timestamp('20130102', tz='US/Eastern')] * 5
+            'G': [Timestamp('20130102', tz='US/Eastern')] * 5,
         }
 
         self.d['float'] = Series(data['A'])
@@ -353,6 +355,29 @@ def test_basic(self):
                 assert_series_equal(i, i_rec)
 
 
+class TestCategorical(TestPackers):
+
+    def setUp(self):
+        super(TestCategorical, self).setUp()
+
+        self.d = {}
+
+        self.d['plain_str'] = Categorical(['a', 'b', 'c', 'd', 'e'])
+        self.d['plain_str_ordered'] = Categorical(['a', 'b', 'c', 'd', 'e'],
+                                                  ordered=True)
+
+        self.d['plain_int'] = Categorical([5, 6, 7, 8])
+        self.d['plain_int_ordered'] = Categorical([5, 6, 7, 8], ordered=True)
+
+    def test_basic(self):
+
+        # run multiple times here
+        for n in range(10):
+            for s, i in self.d.items():
+                i_rec = self.encode_decode(i)
+                assert_categorical_equal(i, i_rec)
+
+
 class TestNDFrame(TestPackers):
 
     def setUp(self):
@@ -365,7 +390,9 @@ def setUp(self):
             'D': date_range('1/1/2009', periods=5),
             'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
             'F': [Timestamp('20130102', tz='US/Eastern')] * 5,
-            'G': [Timestamp('20130603', tz='CET')] * 5
+            'G': [Timestamp('20130603', tz='CET')] * 5,
+            'H': Categorical(['a', 'b', 'c', 'd', 'e']),
+            'I': Categorical(['a', 'b', 'c', 'd', 'e'], ordered=True),
         }
 
         self.frame = {