Merge pull request #7370 from immerrr/blockmanager-new-pickle-format

jreback · jreback · commit eb7724e2d8fb · 2014-07-01T06:11:23.000-04:00
ENH: change BlockManager pickle format to work with dup items
diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
@@ -135,6 +135,9 @@ Enhancements
 - All offsets ``apply``, ``rollforward`` and ``rollback`` can now handle ``np.datetime64``; previously this resulted in ``ApplyTypeError`` (:issue:`7452`)
 
 - ``Period`` and ``PeriodIndex`` can contain ``NaT`` in their values (:issue:`7485`)
+- Support pickling ``Series``, ``DataFrame`` and ``Panel`` objects with
+  non-unique labels along the *item* axis (``index``, ``columns`` and ``items``
+  respectively) (:issue:`7370`).
 
 - ``read_csv`` and ``read_table`` can now read index columns from the first
   line after the header when using the C engine (:issue:`6893`)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -1603,16 +1603,19 @@ class SparseBlock(Block):
     def __init__(self, values, placement,
                  ndim=None, fastpath=False,):
 
+        # Placement must be converted to BlockPlacement via property setter
+        # before ndim logic, because placement may be a slice which doesn't
+        # have a length.
+        self.mgr_locs = placement
+
         # kludgetastic
         if ndim is None:
-            if len(placement) != 1:
+            if len(self.mgr_locs) != 1:
                 ndim = 1
             else:
                 ndim = 2
         self.ndim = ndim
 
-        self.mgr_locs = placement
-
         if not isinstance(values, SparseArray):
             raise TypeError("values must be SparseArray")
 
@@ -2050,26 +2053,44 @@ def __getstate__(self):
         block_values = [b.values for b in self.blocks]
         block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
         axes_array = [ax for ax in self.axes]
-        return axes_array, block_values, block_items
 
-    def __setstate__(self, state):
-        # discard anything after 3rd, support beta pickling format for a little
-        # while longer
-        ax_arrays, bvalues, bitems = state[:3]
+        extra_state = {
+            '0.14.1': {
+                'axes': axes_array,
+                'blocks': [dict(values=b.values,
+                                mgr_locs=b.mgr_locs.indexer)
+                           for b in self.blocks]
+            }
+        }
 
-        self.axes = [_ensure_index(ax) for ax in ax_arrays]
-
-        blocks = []
-        for values, items in zip(bvalues, bitems):
+        # The first three elements of the state are kept for forward
+        # compatibility with 0.13.1, which reads only ``state[:3]``.
+        return axes_array, block_values, block_items, extra_state
 
+    def __setstate__(self, state):
+        def unpickle_block(values, mgr_locs):
             # numpy < 1.7 pickle compat
             if values.dtype == 'M8[us]':
                 values = values.astype('M8[ns]')
-
-            blk = make_block(values,
-                             placement=self.axes[0].get_indexer(items))
-            blocks.append(blk)
-        self.blocks = tuple(blocks)
+            return make_block(values, placement=mgr_locs)
+
+        if (isinstance(state, tuple) and len(state) >= 4
+            and '0.14.1' in state[3]):
+            state = state[3]['0.14.1']
+            self.axes = [_ensure_index(ax) for ax in state['axes']]
+            self.blocks = tuple(
+                unpickle_block(b['values'], b['mgr_locs'])
+                for b in state['blocks'])
+        else:
+            # discard anything after 3rd, support beta pickling format for a
+            # little while longer
+            ax_arrays, bvalues, bitems = state[:3]
+
+            self.axes = [_ensure_index(ax) for ax in ax_arrays]
+            self.blocks = tuple(
+                unpickle_block(values,
+                               self.axes[0].get_indexer(items))
+                for values, items in zip(bvalues, bitems))
 
         self._post_setstate()
 
diff --git a/pandas/io/tests/generate_legacy_pickles.py b/pandas/io/tests/generate_legacy_pickles.py
@@ -80,15 +80,21 @@ def create_data():
                   ts = TimeSeries(np.arange(10).astype(np.int64),index=date_range('20130101',periods=10)),
                   mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2],
                                                                                                     [3,4,3,4,5]])),
-                                                                                           names=['one','two'])))
+                                                                                           names=['one','two'])),
+                  dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']))
+
     frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)),
                  int = DataFrame(dict(A = series['int']  , B = series['int']   + 1)),
                  mixed = DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])),
                  mi = DataFrame(dict(A = np.arange(5).astype(np.float64), B = np.arange(5).astype(np.int64)),
                                 index=MultiIndex.from_tuples(tuple(zip(*[['bar','bar','baz','baz','baz'],
                                                                        ['one','two','one','two','three']])),
-                                                             names=['first','second'])))
-    panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)))
+                                                             names=['first','second'])),
+                 dup = DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
+                                 columns=['A', 'B', 'A']))
+    panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)),
+                 dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
+                             items=['A', 'B', 'A']))
 
 
 
diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
@@ -352,6 +352,16 @@ def test_pickle(self):
         self.assertFalse(mgr2._is_consolidated)
         self.assertFalse(mgr2._known_consolidated)
 
+    def test_non_unique_pickle(self):
+        import pickle
+        mgr = create_mgr('a,a,a:f8')
+        mgr2 = pickle.loads(pickle.dumps(mgr))
+        assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
+
+        mgr = create_mgr('a: f8; a: i8')
+        mgr2 = pickle.loads(pickle.dumps(mgr))
+        assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
+
     def test_get_scalar(self):
         for item in self.mgr.items:
             for i, index in enumerate(self.mgr.axes[1]):