Skip to content

Commit 73598c1

Browse files
committed
Merge pull request #7794 from immerrr/fix-unpickling-series-with-non-unique-index
BUG: fix reading pre-0.14.1 pickles of containers with one block and dup items
2 parents 47ba06e + 4434f48 commit 73598c1

File tree

6 files changed

+32
-5
lines changed

6 files changed

+32
-5
lines changed

doc/source/v0.15.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,8 @@ Bug Fixes
192192

193193
- Bug in pickles containing ``DateOffset`` that may raise ``AttributeError`` when the ``normalize`` attribute is referred to internally (:issue:`7748`)
194194

195+
- Bug in pickle deserialization that failed for pre-0.14.1 containers with duplicate items while trying to avoid ambiguity
196+
when matching block and manager items; when there is only one block, there is no ambiguity (:issue:`7794`)
195197

196198

197199
- Bug where ``is_superperiod`` and ``is_subperiod`` cannot handle higher frequencies than ``S`` (:issue:`7760`, :issue:`7772`, :issue:`7803`)

pandas/core/internals.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -2271,10 +2271,23 @@ def unpickle_block(values, mgr_locs):
22712271
ax_arrays, bvalues, bitems = state[:3]
22722272

22732273
self.axes = [_ensure_index(ax) for ax in ax_arrays]
2274+
2275+
if len(bitems) == 1 and self.axes[0].equals(bitems[0]):
2276+
# This is a workaround for pre-0.14.1 pickles that didn't
2277+
# support unpickling multi-block frames/panels with non-unique
2278+
# columns/items, because given a manager with items ["a", "b",
2279+
# "a"] there's no way of knowing which block's "a" is where.
2280+
#
2281+
# Single-block case can be supported under the assumption that
2282+
# block items corresponded to manager items 1-to-1.
2283+
all_mgr_locs = [slice(0, len(bitems[0]))]
2284+
else:
2285+
all_mgr_locs = [self.axes[0].get_indexer(blk_items)
2286+
for blk_items in bitems]
2287+
22742288
self.blocks = tuple(
2275-
unpickle_block(values,
2276-
self.axes[0].get_indexer(items))
2277-
for values, items in zip(bvalues, bitems))
2289+
unpickle_block(values, mgr_locs)
2290+
for values, mgr_locs in zip(bvalues, all_mgr_locs))
22782291

22792292
self._post_setstate()
22802293

Binary file not shown.
Binary file not shown.
Binary file not shown.

pandas/io/tests/generate_legacy_pickles.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
""" self-contained to write legacy pickle files """
22
from __future__ import print_function
33

4+
45
def _create_sp_series():
56

67
import numpy as np
@@ -53,6 +54,7 @@ def _create_sp_frame():
5354
def create_data():
5455
""" create the pickle data """
5556

57+
from distutils.version import LooseVersion
5658
import numpy as np
5759
import pandas
5860
from pandas import (Series,TimeSeries,DataFrame,Panel,
@@ -92,13 +94,23 @@ def create_data():
9294
index=MultiIndex.from_tuples(tuple(zip(*[['bar','bar','baz','baz','baz'],
9395
['one','two','one','two','three']])),
9496
names=['first','second'])),
95-
dup = DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
96-
columns=['A', 'B', 'A']))
97+
dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
98+
columns=['A', 'B', 'A']))
9799
panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)),
98100
dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
99101
items=['A', 'B', 'A']))
100102

103+
if LooseVersion(pandas.__version__) >= '0.14.1':
104+
# Pre-0.14.1 versions generated non-unpicklable mixed-type frames and
105+
# panels if their columns/items were non-unique.
106+
mixed_dup_df = DataFrame(data)
107+
mixed_dup_df.columns = list("ABCDA")
108+
109+
mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int']))
110+
mixed_dup_panel.items = ['ItemA', 'ItemA']
101111

112+
frame['mixed_dup'] = mixed_dup_df
113+
panel['mixed_dup'] = mixed_dup_panel
102114

103115
return dict( series = series,
104116
frame = frame,

0 commit comments

Comments
 (0)