Merge pull request #8519 from immerrr/fix-categoricalblock-pickling

jreback · jreback · commit 56dbb8cc0cf8 · 2014-10-10T08:50:13.000-04:00
BUG: fix CategoricalBlock pickling
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -657,7 +657,7 @@ Categoricals in Series/DataFrame
 :class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new
 methods to manipulate. Thanks to Jan Schulz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`,
 :issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`,
-:issue:`8075`, :issue:`8076`, :issue:`8143`, :issue:`8453`).
+:issue:`8075`, :issue:`8076`, :issue:`8143`, :issue:`8453`, :issue:`8518`).
 
 For full docs, see the :ref:`categorical introduction <categorical>` and the
 :ref:`API documentation <api.categorical>`.
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -187,6 +187,8 @@ class Categorical(PandasObject):
 
     # For comparisons, so that numpy uses our implementation if the compare ops, which raise
     __array_priority__ = 1000
+    ordered = False  # class-level defaults: state restored by __setstate__
+    name = None      # (e.g. a pre-0.15.0 pickle) may not carry these keys
 
     def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False,
                  levels=None):
@@ -718,6 +720,21 @@ def __array__(self, dtype=None):
             return np.asarray(ret, dtype)
         return ret
 
+    def __setstate__(self, state):
+        """Restore the attributes held in the pickled ``state`` dict."""
+        if not isinstance(state, dict):
+            raise Exception('invalid pickle state')
+
+        # Pre-0.15.0 state uses 'labels'/'_levels'; map them to the new keys.
+        if '_codes' not in state and 'labels' in state:
+            state['_codes'] = state.pop('labels')
+        if '_categories' not in state and '_levels' in state:
+            state['_categories'] = \
+                self._validate_categories(state.pop('_levels'))
+
+        for k, v in compat.iteritems(state):  # set every entry on self
+            setattr(self, k, v)
+
     @property
     def T(self):
         return self
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -1070,16 +1070,19 @@ class NonConsolidatableMixIn(object):
     def __init__(self, values, placement,
                  ndim=None, fastpath=False,):
 
+        # Placement must be converted to BlockPlacement via property setter
+        # before ndim logic, because placement may be a slice which doesn't
+        # have a length.
+        self.mgr_locs = placement
+
         # kludgetastic
         if ndim is None:
-            if len(placement) != 1:
+            if len(self.mgr_locs) != 1:
                 ndim = 1
             else:
                 ndim = 2
         self.ndim = ndim
 
-        self.mgr_locs = placement
-
         if not isinstance(values, self._holder):
             raise TypeError("values must be {0}".format(self._holder.__name__))
 
@@ -1852,6 +1855,7 @@ def get_values(self, dtype=None):
                       .reshape(self.values.shape)
         return self.values
 
+
 class SparseBlock(NonConsolidatableMixIn, Block):
     """ implement as a list of sparse arrays of the same dtype """
     __slots__ = ()
@@ -1861,27 +1865,6 @@ class SparseBlock(NonConsolidatableMixIn, Block):
     _ftype = 'sparse'
     _holder = SparseArray
 
-    def __init__(self, values, placement,
-                 ndim=None, fastpath=False,):
-
-        # Placement must be converted to BlockPlacement via property setter
-        # before ndim logic, because placement may be a slice which doesn't
-        # have a length.
-        self.mgr_locs = placement
-
-        # kludgetastic
-        if ndim is None:
-            if len(self.mgr_locs) != 1:
-                ndim = 1
-            else:
-                ndim = 2
-        self.ndim = ndim
-
-        if not isinstance(values, SparseArray):
-            raise TypeError("values must be SparseArray")
-
-        self.values = values
-
     @property
     def shape(self):
         return (len(self.mgr_locs), self.sp_index.length)
diff --git a/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle b/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle
diff --git a/pandas/io/tests/generate_legacy_pickles.py b/pandas/io/tests/generate_legacy_pickles.py
@@ -60,7 +60,7 @@ def create_data():
     from pandas import (Series,TimeSeries,DataFrame,Panel,
                         SparseSeries,SparseTimeSeries,SparseDataFrame,SparsePanel,
                         Index,MultiIndex,PeriodIndex,
-                        date_range,period_range,bdate_range,Timestamp)
+                        date_range,period_range,bdate_range,Timestamp,Categorical)
     nan = np.nan
 
     data = {
@@ -85,7 +85,8 @@ def create_data():
                   mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2],
                                                                                                     [3,4,3,4,5]])),
                                                                                            names=['one','two'])),
-                  dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']))
+                  dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']),
+                  cat=Series(Categorical(['foo', 'bar', 'baz'])))
 
     frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)),
                  int = DataFrame(dict(A = series['int']  , B = series['int']   + 1)),
@@ -95,7 +96,11 @@ def create_data():
                                                                        ['one','two','one','two','three']])),
                                                              names=['first','second'])),
                  dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
-                               columns=['A', 'B', 'A']))
+                               columns=['A', 'B', 'A']),
+                 cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))),
+                 cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']),
+                                              B=np.arange(3))),
+    )
     panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)),
                  dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                              items=['A', 'B', 'A']))
diff --git a/pandas/tests/data/categorical_0_14_1.pickle b/pandas/tests/data/categorical_0_14_1.pickle
@@ -0,0 +1,94 @@
+ccopy_reg
+_reconstructor
+p0
+(cpandas.core.categorical
+Categorical
+p1
+c__builtin__
+object
+p2
+Ntp3
+Rp4
+(dp5
+S'_levels'
+p6
+cnumpy.core.multiarray
+_reconstruct
+p7
+(cpandas.core.index
+Index
+p8
+(I0
+tp9
+S'b'
+p10
+tp11
+Rp12
+((I1
+(I4
+tp13
+cnumpy
+dtype
+p14
+(S'O8'
+p15
+I0
+I1
+tp16
+Rp17
+(I3
+S'|'
+p18
+NNNI-1
+I-1
+I63
+tp19
+bI00
+(lp20
+S'a'
+p21
+ag10
+aS'c'
+p22
+aS'd'
+p23
+atp24
+(Ntp25
+tp26
+bsS'labels'
+p27
+g7
+(cnumpy
+ndarray
+p28
+(I0
+tp29
+g10
+tp30
+Rp31
+(I1
+(I3
+tp32
+g14
+(S'i8'
+p33
+I0
+I1
+tp34
+Rp35
+(I3
+S'<'
+p36
+NNNI-1
+I-1
+I0
+tp37
+bI00
+S'\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00'
+p38
+tp39
+bsS'name'
+p40
+S'foobar'
+p41
+sb.
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -2,6 +2,8 @@
 
 from datetime import datetime
 from pandas.compat import range, lrange, u
+import os
+import pickle
 import re
 from distutils.version import LooseVersion
 
@@ -21,16 +23,6 @@ def setUp(self):
         self.factor = Categorical.from_array(['a', 'b', 'b', 'a',
                                               'a', 'c', 'c', 'c'])
 
-    def assert_categorical_equal(self, res, exp):
-        if not com.array_equivalent(res.categories, exp.categories):
-            raise AssertionError('categories not equivalent: {0} vs {1}.'.format(res.categories,
-                                                                                 exp.categories))
-        if not com.array_equivalent(res.codes, exp.codes):
-            raise AssertionError('codes not equivalent: {0} vs {1}.'.format(res.codes,
-                                                                            exp.codes))
-        self.assertEqual(res.ordered, exp.ordered, "ordered not the same")
-        self.assertEqual(res.name, exp.name, "name not the same")
-
     def test_getitem(self):
         self.assertEqual(self.factor[0], 'a')
         self.assertEqual(self.factor[-1], 'c')
@@ -2268,6 +2260,21 @@ def get_dir(s):
         results = get_dir(s)
         tm.assert_almost_equal(results,list(sorted(set(ok_for_cat))))
 
+    def test_pickle_v0_14_1(self):
+        cat = pd.Categorical(values=['a', 'b', 'c'],
+                             levels=['a', 'b', 'c', 'd'],
+                             name='foobar', ordered=False)
+        pickle_path = os.path.join(tm.get_data_path(),
+                                   'categorical_0_14_1.pickle')
+        # This code was executed once on v0.14.1 to generate the pickle:
+        #
+        # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
+        #                   name='foobar')
+        # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
+        #
+        self.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
@@ -11,7 +11,7 @@
 import pandas.util.testing as tm
 import pandas as pd
 from pandas.util.testing import (
-    assert_almost_equal, assert_frame_equal, randn)
+    assert_almost_equal, assert_frame_equal, randn, assert_series_equal)
 from pandas.compat import zip, u
 
 
@@ -363,6 +363,15 @@ def test_non_unique_pickle(self):
         mgr2 = self.round_trip_pickle(mgr)
         assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
 
+    def test_categorical_block_pickle(self):
+        mgr = create_mgr('a: category')
+        mgr2 = self.round_trip_pickle(mgr)
+        assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
+
+        smgr = create_single_mgr('category')
+        smgr2 = self.round_trip_pickle(smgr)
+        assert_series_equal(Series(smgr), Series(smgr2))
+
     def test_get_scalar(self):
         for item in self.mgr.items:
             for i, index in enumerate(self.mgr.axes[1]):
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
@@ -121,6 +121,16 @@ def assert_numpy_array_equivalent(self, np_array, assert_equal, strict_nan=False
             return
         raise AssertionError('{0} is not equivalent to {1}.'.format(np_array, assert_equal))
 
+    def assert_categorical_equal(self, res, exp):
+        if not array_equivalent(res.categories, exp.categories):
+            raise AssertionError('categories not equivalent: {0} vs {1}.'.format(res.categories,
+                                                                                 exp.categories))
+        if not array_equivalent(res.codes, exp.codes):
+            raise AssertionError('codes not equivalent: {0} vs {1}.'.format(res.codes,
+                                                                            exp.codes))
+        self.assertEqual(res.ordered, exp.ordered, "ordered not the same")
+        self.assertEqual(res.name, exp.name, "name not the same")
+
     def assertIs(self, first, second, msg=''):
         """Checks that 'first' is 'second'"""
         a, b = first, second