From a9761d1331694620de13afdd7c70315203ef510a Mon Sep 17 00:00:00 2001 From: immerrr Date: Wed, 18 Jun 2014 22:10:25 +0400 Subject: [PATCH] ENH: change BlockManager pickle format to work with dup items --- doc/source/v0.14.1.txt | 3 ++ pandas/core/internals.py | 55 +++++++++++++++------- pandas/io/tests/generate_legacy_pickles.py | 12 +++-- pandas/tests/test_internals.py | 10 ++++ 4 files changed, 60 insertions(+), 20 deletions(-) diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt index 1aaf77625cf7f..96b611bc9afec 100644 --- a/doc/source/v0.14.1.txt +++ b/doc/source/v0.14.1.txt @@ -135,6 +135,9 @@ Enhancements - All offsets ``apply``, ``rollforward`` and ``rollback`` can now handle ``np.datetime64``, previously results in ``ApplyTypeError`` (:issue:`7452`) - ``Period`` and ``PeriodIndex`` can contain ``NaT`` in its values (:issue:`7485`) +- Support pickling ``Series``, ``DataFrame`` and ``Panel`` objects with + non-unique labels along *item* axis (``index``, ``columns`` and ``items`` + respectively) (:issue:`7370`). .. _whatsnew_0141.performance: diff --git a/pandas/core/internals.py b/pandas/core/internals.py index accaf4ea5cd29..4f7f36dd4a14d 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1603,16 +1603,19 @@ class SparseBlock(Block): def __init__(self, values, placement, ndim=None, fastpath=False,): + # Placement must be converted to BlockPlacement via property setter + # before ndim logic, because placement may be a slice which doesn't + # have a length. + self.mgr_locs = placement + # kludgetastic if ndim is None: - if len(placement) != 1: + if len(self.mgr_locs) != 1: ndim = 1 else: ndim = 2 self.ndim = ndim - self.mgr_locs = placement - if not isinstance(values, SparseArray): raise TypeError("values must be SparseArray") @@ -2050,26 +2053,44 @@ def __getstate__(self): block_values = [b.values for b in self.blocks] block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] axes_array = [ax for ax in self.axes] - return axes_array, block_values, block_items - def __setstate__(self, state): - # discard anything after 3rd, support beta pickling format for a little - # while longer - ax_arrays, bvalues, bitems = state[:3] + extra_state = { + '0.14.1': { + 'axes': axes_array, + 'blocks': [dict(values=b.values, + mgr_locs=b.mgr_locs.indexer) + for b in self.blocks] + } + } - self.axes = [_ensure_index(ax) for ax in ax_arrays] - - blocks = [] - for values, items in zip(bvalues, bitems): + # First three elements of the state are to maintain forward + # compatibility with 0.13.1. + return axes_array, block_values, block_items, extra_state + def __setstate__(self, state): + def unpickle_block(values, mgr_locs): # numpy < 1.7 pickle compat if values.dtype == 'M8[us]': values = values.astype('M8[ns]') - - blk = make_block(values, - placement=self.axes[0].get_indexer(items)) - blocks.append(blk) - self.blocks = tuple(blocks) + return make_block(values, placement=mgr_locs) + + if (isinstance(state, tuple) and len(state) >= 4 + and '0.14.1' in state[3]): + state = state[3]['0.14.1'] + self.axes = [_ensure_index(ax) for ax in state['axes']] + self.blocks = tuple( + unpickle_block(b['values'], b['mgr_locs']) + for b in state['blocks']) + else: + # discard anything after 3rd, support beta pickling format for a + # little while longer + ax_arrays, bvalues, bitems = state[:3] + + self.axes = [_ensure_index(ax) for ax in ax_arrays] + self.blocks = tuple( + unpickle_block(values, + self.axes[0].get_indexer(items)) + for values, items in zip(bvalues, bitems)) self._post_setstate() diff --git a/pandas/io/tests/generate_legacy_pickles.py b/pandas/io/tests/generate_legacy_pickles.py index 08f63b0179db2..48d0fd57d831b 100644 --- a/pandas/io/tests/generate_legacy_pickles.py +++ b/pandas/io/tests/generate_legacy_pickles.py @@ -80,15 +80,21 @@ def create_data(): ts = TimeSeries(np.arange(10).astype(np.int64),index=date_range('20130101',periods=10)), mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2], [3,4,3,4,5]])), - names=['one','two']))) + names=['one','two'])), + dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A'])) + frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)), int = DataFrame(dict(A = series['int'] , B = series['int'] + 1)), mixed = DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])), mi = DataFrame(dict(A = np.arange(5).astype(np.float64), B = np.arange(5).astype(np.int64)), index=MultiIndex.from_tuples(tuple(zip(*[['bar','bar','baz','baz','baz'], ['one','two','one','two','three']])), - names=['first','second']))) - panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1))) + names=['first','second'])), + dup = DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), + columns=['A', 'B', 'A'])) + panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)), + dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64), + items=['A', 'B', 'A'])) diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index e8308c09cef90..8a9010084fd99 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -352,6 +352,16 @@ def test_pickle(self): self.assertFalse(mgr2._is_consolidated) self.assertFalse(mgr2._known_consolidated) + def test_non_unique_pickle(self): + import pickle + mgr = create_mgr('a,a,a:f8') + mgr2 = pickle.loads(pickle.dumps(mgr)) + assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + + mgr = create_mgr('a: f8; a: i8') + mgr2 = pickle.loads(pickle.dumps(mgr)) + assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + def test_get_scalar(self): for item in self.mgr.items: for i, index in enumerate(self.mgr.axes[1]):