Skip to content

Commit 56dbb8c

Browse files
committed
Merge pull request #8519 from immerrr/fix-categoricalblock-pickling
BUG: fix CategoricalBlock pickling
2 parents ce79c80 + b68017e commit 56dbb8c

File tree

9 files changed

+164
-39
lines changed

9 files changed

+164
-39
lines changed

doc/source/v0.15.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -657,7 +657,7 @@ Categoricals in Series/DataFrame
657657
:class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new
658658
methods to manipulate. Thanks to Jan Schulz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`,
659659
:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`,
660-
:issue:`8075`, :issue:`8076`, :issue:`8143`, :issue:`8453`).
660+
:issue:`8075`, :issue:`8076`, :issue:`8143`, :issue:`8453`, :issue:`8518`).
661661

662662
For full docs, see the :ref:`categorical introduction <categorical>` and the
663663
:ref:`API documentation <api.categorical>`.

pandas/core/categorical.py

+17
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,8 @@ class Categorical(PandasObject):
187187

188188
# For comparisons, so that numpy uses our implementation of the compare ops, which raise
189189
__array_priority__ = 1000
190+
ordered = False
191+
name = None
190192

191193
def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False,
192194
levels=None):
@@ -718,6 +720,21 @@ def __array__(self, dtype=None):
718720
return np.asarray(ret, dtype)
719721
return ret
720722

723+
def __setstate__(self, state):
    """Restore this object's attributes from a pickled state dict.

    Necessary for making this object picklable.  State written by
    pre-0.15.0 versions is translated before it is applied: a ``labels``
    entry is restored as ``_codes`` and a ``_levels`` entry is passed
    through ``self._validate_categories`` and restored as ``_categories``.

    Parameters
    ----------
    state : dict
        Mapping of attribute name to attribute value.

    Raises
    ------
    Exception
        If ``state`` is not a dict.
    """
    if not isinstance(state, dict):
        raise Exception('invalid pickle state')

    # Work on a shallow copy so that renaming the legacy keys below never
    # mutates the dict owned by the caller.
    state = dict(state)

    # Provide compatibility with pre-0.15.0 Categoricals.
    if '_codes' not in state and 'labels' in state:
        state['_codes'] = state.pop('labels')
    if '_categories' not in state and '_levels' in state:
        state['_categories'] = self._validate_categories(
            state.pop('_levels'))

    for k, v in compat.iteritems(state):
        setattr(self, k, v)
737+
721738
@property
722739
def T(self):
723740
return self

pandas/core/internals.py

+7-24
Original file line numberDiff line numberDiff line change
@@ -1070,16 +1070,19 @@ class NonConsolidatableMixIn(object):
10701070
def __init__(self, values, placement,
10711071
ndim=None, fastpath=False,):
10721072

1073+
# Placement must be converted to BlockPlacement via property setter
1074+
# before ndim logic, because placement may be a slice which doesn't
1075+
# have a length.
1076+
self.mgr_locs = placement
1077+
10731078
# kludgetastic
10741079
if ndim is None:
1075-
if len(placement) != 1:
1080+
if len(self.mgr_locs) != 1:
10761081
ndim = 1
10771082
else:
10781083
ndim = 2
10791084
self.ndim = ndim
10801085

1081-
self.mgr_locs = placement
1082-
10831086
if not isinstance(values, self._holder):
10841087
raise TypeError("values must be {0}".format(self._holder.__name__))
10851088

@@ -1852,6 +1855,7 @@ def get_values(self, dtype=None):
18521855
.reshape(self.values.shape)
18531856
return self.values
18541857

1858+
18551859
class SparseBlock(NonConsolidatableMixIn, Block):
18561860
""" implement as a list of sparse arrays of the same dtype """
18571861
__slots__ = ()
@@ -1861,27 +1865,6 @@ class SparseBlock(NonConsolidatableMixIn, Block):
18611865
_ftype = 'sparse'
18621866
_holder = SparseArray
18631867

1864-
def __init__(self, values, placement,
1865-
ndim=None, fastpath=False,):
1866-
1867-
# Placement must be converted to BlockPlacement via property setter
1868-
# before ndim logic, because placement may be a slice which doesn't
1869-
# have a length.
1870-
self.mgr_locs = placement
1871-
1872-
# kludgetastic
1873-
if ndim is None:
1874-
if len(self.mgr_locs) != 1:
1875-
ndim = 1
1876-
else:
1877-
ndim = 2
1878-
self.ndim = ndim
1879-
1880-
if not isinstance(values, SparseArray):
1881-
raise TypeError("values must be SparseArray")
1882-
1883-
self.values = values
1884-
18851868
@property
18861869
def shape(self):
18871870
return (len(self.mgr_locs), self.sp_index.length)
Binary file not shown.

pandas/io/tests/generate_legacy_pickles.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def create_data():
6060
from pandas import (Series,TimeSeries,DataFrame,Panel,
6161
SparseSeries,SparseTimeSeries,SparseDataFrame,SparsePanel,
6262
Index,MultiIndex,PeriodIndex,
63-
date_range,period_range,bdate_range,Timestamp)
63+
date_range,period_range,bdate_range,Timestamp,Categorical)
6464
nan = np.nan
6565

6666
data = {
@@ -85,7 +85,8 @@ def create_data():
8585
mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2],
8686
[3,4,3,4,5]])),
8787
names=['one','two'])),
88-
dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']))
88+
dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']),
89+
cat=Series(Categorical(['foo', 'bar', 'baz'])))
8990

9091
frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)),
9192
int = DataFrame(dict(A = series['int'] , B = series['int'] + 1)),
@@ -95,7 +96,11 @@ def create_data():
9596
['one','two','one','two','three']])),
9697
names=['first','second'])),
9798
dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
98-
columns=['A', 'B', 'A']))
99+
columns=['A', 'B', 'A']),
100+
cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))),
101+
cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']),
102+
B=np.arange(3))),
103+
)
99104
panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)),
100105
dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
101106
items=['A', 'B', 'A']))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
ccopy_reg
2+
_reconstructor
3+
p0
4+
(cpandas.core.categorical
5+
Categorical
6+
p1
7+
c__builtin__
8+
object
9+
p2
10+
Ntp3
11+
Rp4
12+
(dp5
13+
S'_levels'
14+
p6
15+
cnumpy.core.multiarray
16+
_reconstruct
17+
p7
18+
(cpandas.core.index
19+
Index
20+
p8
21+
(I0
22+
tp9
23+
S'b'
24+
p10
25+
tp11
26+
Rp12
27+
((I1
28+
(I4
29+
tp13
30+
cnumpy
31+
dtype
32+
p14
33+
(S'O8'
34+
p15
35+
I0
36+
I1
37+
tp16
38+
Rp17
39+
(I3
40+
S'|'
41+
p18
42+
NNNI-1
43+
I-1
44+
I63
45+
tp19
46+
bI00
47+
(lp20
48+
S'a'
49+
p21
50+
ag10
51+
aS'c'
52+
p22
53+
aS'd'
54+
p23
55+
atp24
56+
(Ntp25
57+
tp26
58+
bsS'labels'
59+
p27
60+
g7
61+
(cnumpy
62+
ndarray
63+
p28
64+
(I0
65+
tp29
66+
g10
67+
tp30
68+
Rp31
69+
(I1
70+
(I3
71+
tp32
72+
g14
73+
(S'i8'
74+
p33
75+
I0
76+
I1
77+
tp34
78+
Rp35
79+
(I3
80+
S'<'
81+
p36
82+
NNNI-1
83+
I-1
84+
I0
85+
tp37
86+
bI00
87+
S'\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00'
88+
p38
89+
tp39
90+
bsS'name'
91+
p40
92+
S'foobar'
93+
p41
94+
sb.

pandas/tests/test_categorical.py

+17-10
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
from datetime import datetime
44
from pandas.compat import range, lrange, u
5+
import os
6+
import pickle
57
import re
68
from distutils.version import LooseVersion
79

@@ -21,16 +23,6 @@ def setUp(self):
2123
self.factor = Categorical.from_array(['a', 'b', 'b', 'a',
2224
'a', 'c', 'c', 'c'])
2325

24-
def assert_categorical_equal(self, res, exp):
25-
if not com.array_equivalent(res.categories, exp.categories):
26-
raise AssertionError('categories not equivalent: {0} vs {1}.'.format(res.categories,
27-
exp.categories))
28-
if not com.array_equivalent(res.codes, exp.codes):
29-
raise AssertionError('codes not equivalent: {0} vs {1}.'.format(res.codes,
30-
exp.codes))
31-
self.assertEqual(res.ordered, exp.ordered, "ordered not the same")
32-
self.assertEqual(res.name, exp.name, "name not the same")
33-
3426
def test_getitem(self):
3527
self.assertEqual(self.factor[0], 'a')
3628
self.assertEqual(self.factor[-1], 'c')
@@ -2268,6 +2260,21 @@ def get_dir(s):
22682260
results = get_dir(s)
22692261
tm.assert_almost_equal(results,list(sorted(set(ok_for_cat))))
22702262

2263+
def test_pickle_v0_14_1(self):
    """A Categorical pickled by pandas v0.14.1 must load as an equal object."""
    # Build the expected value with ``categories=`` rather than the legacy
    # ``levels=`` keyword, so the test does not depend on the old spelling.
    expected = pd.Categorical(values=['a', 'b', 'c'],
                              categories=['a', 'b', 'c', 'd'],
                              name='foobar', ordered=False)
    pickle_path = os.path.join(tm.get_data_path(),
                               'categorical_0_14_1.pickle')
    # This code was executed once on v0.14.1 to generate the pickle:
    #
    # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
    #                   name='foobar')
    # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
    #
    result = pd.read_pickle(pickle_path)
    # Argument order follows the helper's (res, exp) signature so failure
    # messages report "<loaded> vs <expected>".
    self.assert_categorical_equal(result, expected)
2276+
2277+
22712278
if __name__ == '__main__':
22722279
import nose
22732280
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

pandas/tests/test_internals.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import pandas.util.testing as tm
1212
import pandas as pd
1313
from pandas.util.testing import (
14-
assert_almost_equal, assert_frame_equal, randn)
14+
assert_almost_equal, assert_frame_equal, randn, assert_series_equal)
1515
from pandas.compat import zip, u
1616

1717

@@ -363,6 +363,15 @@ def test_non_unique_pickle(self):
363363
mgr2 = self.round_trip_pickle(mgr)
364364
assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
365365

366+
def test_categorical_block_pickle(self):
    """Managers built from a 'category' spec must survive a pickle round trip."""
    # Manager created from the 'a: category' description, compared as frames.
    frame_mgr = create_mgr('a: category')
    frame_mgr_restored = self.round_trip_pickle(frame_mgr)
    assert_frame_equal(DataFrame(frame_mgr), DataFrame(frame_mgr_restored))

    # Single manager created from 'category', compared as series.
    series_mgr = create_single_mgr('category')
    series_mgr_restored = self.round_trip_pickle(series_mgr)
    assert_series_equal(Series(series_mgr), Series(series_mgr_restored))
374+
366375
def test_get_scalar(self):
367376
for item in self.mgr.items:
368377
for i, index in enumerate(self.mgr.axes[1]):

pandas/util/testing.py

+10
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,16 @@ def assert_numpy_array_equivalent(self, np_array, assert_equal, strict_nan=False
121121
return
122122
raise AssertionError('{0} is not equivalent to {1}.'.format(np_array, assert_equal))
123123

124+
def assert_categorical_equal(self, res, exp):
    """Check that two categoricals agree on categories, codes, ordered and name.

    ``categories`` and ``codes`` are compared with ``array_equivalent``;
    a mismatch raises ``AssertionError`` naming the offending attribute.
    ``ordered`` and ``name`` are then compared with ``self.assertEqual``.
    """
    categories_match = array_equivalent(res.categories, exp.categories)
    if not categories_match:
        message = 'categories not equivalent: {0} vs {1}.'
        raise AssertionError(message.format(res.categories, exp.categories))

    # Codes are only inspected once the categories are known to agree.
    codes_match = array_equivalent(res.codes, exp.codes)
    if not codes_match:
        message = 'codes not equivalent: {0} vs {1}.'
        raise AssertionError(message.format(res.codes, exp.codes))

    scalar_checks = (('ordered', "ordered not the same"),
                     ('name', "name not the same"))
    for attr, failure in scalar_checks:
        self.assertEqual(getattr(res, attr), getattr(exp, attr), failure)
133+
124134
def assertIs(self, first, second, msg=''):
125135
"""Checks that 'first' is 'second'"""
126136
a, b = first, second

0 commit comments

Comments
 (0)