Skip to content

ENH: change BlockManager pickle format to work with dup items #7370

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 1, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/v0.14.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,9 @@ Enhancements
- All offsets ``apply``, ``rollforward`` and ``rollback`` can now handle ``np.datetime64``; previously this resulted in ``ApplyTypeError`` (:issue:`7452`)

- ``Period`` and ``PeriodIndex`` can contain ``NaT`` in their values (:issue:`7485`)
- Support pickling ``Series``, ``DataFrame`` and ``Panel`` objects with
non-unique labels along the *item* axis (``index``, ``columns`` and ``items``
respectively) (:issue:`7370`).


.. _whatsnew_0141.performance:
Expand Down
55 changes: 38 additions & 17 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1603,16 +1603,19 @@ class SparseBlock(Block):
def __init__(self, values, placement,
ndim=None, fastpath=False,):

# Placement must be converted to BlockPlacement via property setter
# before ndim logic, because placement may be a slice which doesn't
# have a length.
self.mgr_locs = placement

# kludgetastic
if ndim is None:
if len(placement) != 1:
if len(self.mgr_locs) != 1:
ndim = 1
else:
ndim = 2
self.ndim = ndim

self.mgr_locs = placement

if not isinstance(values, SparseArray):
raise TypeError("values must be SparseArray")

Expand Down Expand Up @@ -2050,26 +2053,44 @@ def __getstate__(self):
block_values = [b.values for b in self.blocks]
block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
axes_array = [ax for ax in self.axes]
return axes_array, block_values, block_items

def __setstate__(self, state):
# discard anything after 3rd, support beta pickling format for a little
# while longer
ax_arrays, bvalues, bitems = state[:3]
extra_state = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why don't you add a version tag instead? Doing it this way seems kind of odd.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's an "upcoming" version tag, as in the first stable version having this serialization format, serving as a key into this dictionary. Or do you mean something else?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I meant that having the version as the key is odd. Why not store it as a key-value pair in the dict, e.g. 'version': '0.14.1'?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I figured, something like this:

for ver in list(supported_versions):
    if ver in state:
        setstate_ver(state[ver])

would be easier on the eye than:

for ver in list(supported_versions):
     for d in state.values():
        if d['version'] == ver:
            setstate(d)
            break

But it's not a strong opinion, rather a gut feeling. If you insist, I'll make the version a dictionary element again.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, either way is fine. Just trying to make it easy on future versions.

'0.14.1': {
'axes': axes_array,
'blocks': [dict(values=b.values,
mgr_locs=b.mgr_locs.indexer)
for b in self.blocks]
}
}

self.axes = [_ensure_index(ax) for ax in ax_arrays]

blocks = []
for values, items in zip(bvalues, bitems):
# First three elements of the state are to maintain forward
# compatibility with 0.13.1.
return axes_array, block_values, block_items, extra_state

def __setstate__(self, state):
def unpickle_block(values, mgr_locs):
# numpy < 1.7 pickle compat
if values.dtype == 'M8[us]':
values = values.astype('M8[ns]')

blk = make_block(values,
placement=self.axes[0].get_indexer(items))
blocks.append(blk)
self.blocks = tuple(blocks)
return make_block(values, placement=mgr_locs)

if (isinstance(state, tuple) and len(state) >= 4
and '0.14.1' in state[3]):
state = state[3]['0.14.1']
self.axes = [_ensure_index(ax) for ax in state['axes']]
self.blocks = tuple(
unpickle_block(b['values'], b['mgr_locs'])
for b in state['blocks'])
else:
# discard anything after 3rd, support beta pickling format for a
# little while longer
ax_arrays, bvalues, bitems = state[:3]

self.axes = [_ensure_index(ax) for ax in ax_arrays]
self.blocks = tuple(
unpickle_block(values,
self.axes[0].get_indexer(items))
for values, items in zip(bvalues, bitems))

self._post_setstate()

Expand Down
12 changes: 9 additions & 3 deletions pandas/io/tests/generate_legacy_pickles.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,21 @@ def create_data():
ts = TimeSeries(np.arange(10).astype(np.int64),index=date_range('20130101',periods=10)),
mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2],
[3,4,3,4,5]])),
names=['one','two'])))
names=['one','two'])),
dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']))

frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)),
int = DataFrame(dict(A = series['int'] , B = series['int'] + 1)),
mixed = DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])),
mi = DataFrame(dict(A = np.arange(5).astype(np.float64), B = np.arange(5).astype(np.int64)),
index=MultiIndex.from_tuples(tuple(zip(*[['bar','bar','baz','baz','baz'],
['one','two','one','two','three']])),
names=['first','second'])))
panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)))
names=['first','second'])),
dup = DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
columns=['A', 'B', 'A']))
panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)),
dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
items=['A', 'B', 'A']))



Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,16 @@ def test_pickle(self):
self.assertFalse(mgr2._is_consolidated)
self.assertFalse(mgr2._known_consolidated)

def test_non_unique_pickle(self):
    """Check that managers with repeated item labels survive pickling.

    Each manager is built from a ``create_mgr`` spec string, serialized
    and deserialized with ``pickle``, and then compared against the
    original by wrapping both in ``DataFrame`` objects.
    """
    import pickle

    # Both specs repeat the label 'a'; presumably the first puts all items
    # in one float64 block and the second spreads them over two blocks of
    # different dtypes -- confirm against create_mgr's spec grammar.
    specs = ('a,a,a:f8', 'a: f8; a: i8')
    for spec in specs:
        original = create_mgr(spec)
        restored = pickle.loads(pickle.dumps(original))
        assert_frame_equal(DataFrame(original), DataFrame(restored))

def test_get_scalar(self):
for item in self.mgr.items:
for i, index in enumerate(self.mgr.axes[1]):
Expand Down