Skip to content

Commit eb7724e

Browse files
committed
Merge pull request #7370 from immerrr/blockmanager-new-pickle-format
ENH: change BlockManager pickle format to work with dup items
2 parents 255e82a + a9761d1 commit eb7724e

File tree

4 files changed

+60
-20
lines changed

4 files changed

+60
-20
lines changed

doc/source/v0.14.1.txt

+3
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,9 @@ Enhancements
135135
- All offsets' ``apply``, ``rollforward`` and ``rollback`` methods can now handle ``np.datetime64``, which previously resulted in ``ApplyTypeError`` (:issue:`7452`)
136136

137137
- ``Period`` and ``PeriodIndex`` can contain ``NaT`` in their values (:issue:`7485`)
138+
- Support pickling ``Series``, ``DataFrame`` and ``Panel`` objects with
139+
non-unique labels along *item* axis (``index``, ``columns`` and ``items``
140+
respectively) (:issue:`7370`).
138141

139142
- ``read_csv`` and ``read_table`` can now read index columns from the first
140143
line after the header when using the C engine (:issue:`6893`)

pandas/core/internals.py

+38-17
Original file line numberDiff line numberDiff line change
@@ -1603,16 +1603,19 @@ class SparseBlock(Block):
16031603
def __init__(self, values, placement,
16041604
ndim=None, fastpath=False,):
16051605

1606+
# Placement must be converted to BlockPlacement via property setter
1607+
# before ndim logic, because placement may be a slice which doesn't
1608+
# have a length.
1609+
self.mgr_locs = placement
1610+
16061611
# kludgetastic
16071612
if ndim is None:
1608-
if len(placement) != 1:
1613+
if len(self.mgr_locs) != 1:
16091614
ndim = 1
16101615
else:
16111616
ndim = 2
16121617
self.ndim = ndim
16131618

1614-
self.mgr_locs = placement
1615-
16161619
if not isinstance(values, SparseArray):
16171620
raise TypeError("values must be SparseArray")
16181621

@@ -2050,26 +2053,44 @@ def __getstate__(self):
20502053
block_values = [b.values for b in self.blocks]
20512054
block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
20522055
axes_array = [ax for ax in self.axes]
2053-
return axes_array, block_values, block_items
20542056

2055-
def __setstate__(self, state):
2056-
# discard anything after 3rd, support beta pickling format for a little
2057-
# while longer
2058-
ax_arrays, bvalues, bitems = state[:3]
2057+
extra_state = {
2058+
'0.14.1': {
2059+
'axes': axes_array,
2060+
'blocks': [dict(values=b.values,
2061+
mgr_locs=b.mgr_locs.indexer)
2062+
for b in self.blocks]
2063+
}
2064+
}
20592065

2060-
self.axes = [_ensure_index(ax) for ax in ax_arrays]
2061-
2062-
blocks = []
2063-
for values, items in zip(bvalues, bitems):
2066+
# First three elements of the state are to maintain forward
2067+
# compatibility with 0.13.1.
2068+
return axes_array, block_values, block_items, extra_state
20642069

2070+
def __setstate__(self, state):
2071+
def unpickle_block(values, mgr_locs):
20652072
# numpy < 1.7 pickle compat
20662073
if values.dtype == 'M8[us]':
20672074
values = values.astype('M8[ns]')
2068-
2069-
blk = make_block(values,
2070-
placement=self.axes[0].get_indexer(items))
2071-
blocks.append(blk)
2072-
self.blocks = tuple(blocks)
2075+
return make_block(values, placement=mgr_locs)
2076+
2077+
if (isinstance(state, tuple) and len(state) >= 4
2078+
and '0.14.1' in state[3]):
2079+
state = state[3]['0.14.1']
2080+
self.axes = [_ensure_index(ax) for ax in state['axes']]
2081+
self.blocks = tuple(
2082+
unpickle_block(b['values'], b['mgr_locs'])
2083+
for b in state['blocks'])
2084+
else:
2085+
# discard anything after 3rd, support beta pickling format for a
2086+
# little while longer
2087+
ax_arrays, bvalues, bitems = state[:3]
2088+
2089+
self.axes = [_ensure_index(ax) for ax in ax_arrays]
2090+
self.blocks = tuple(
2091+
unpickle_block(values,
2092+
self.axes[0].get_indexer(items))
2093+
for values, items in zip(bvalues, bitems))
20732094

20742095
self._post_setstate()
20752096

pandas/io/tests/generate_legacy_pickles.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -80,15 +80,21 @@ def create_data():
8080
ts = TimeSeries(np.arange(10).astype(np.int64),index=date_range('20130101',periods=10)),
8181
mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2],
8282
[3,4,3,4,5]])),
83-
names=['one','two'])))
83+
names=['one','two'])),
84+
dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']))
85+
8486
frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)),
8587
int = DataFrame(dict(A = series['int'] , B = series['int'] + 1)),
8688
mixed = DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])),
8789
mi = DataFrame(dict(A = np.arange(5).astype(np.float64), B = np.arange(5).astype(np.int64)),
8890
index=MultiIndex.from_tuples(tuple(zip(*[['bar','bar','baz','baz','baz'],
8991
['one','two','one','two','three']])),
90-
names=['first','second'])))
91-
panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)))
92+
names=['first','second'])),
93+
dup = DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
94+
columns=['A', 'B', 'A']))
95+
panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)),
96+
dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
97+
items=['A', 'B', 'A']))
9298

9399

94100

pandas/tests/test_internals.py

+10
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,16 @@ def test_pickle(self):
352352
self.assertFalse(mgr2._is_consolidated)
353353
self.assertFalse(mgr2._known_consolidated)
354354

355+
def test_non_unique_pickle(self):
356+
import pickle
357+
mgr = create_mgr('a,a,a:f8')
358+
mgr2 = pickle.loads(pickle.dumps(mgr))
359+
assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
360+
361+
mgr = create_mgr('a: f8; a: i8')
362+
mgr2 = pickle.loads(pickle.dumps(mgr))
363+
assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
364+
355365
def test_get_scalar(self):
356366
for item in self.mgr.items:
357367
for i, index in enumerate(self.mgr.axes[1]):

0 commit comments

Comments
 (0)