Skip to content

Commit 2fd0a06

Browse files
pwaller authored and jreback committed
ENH: allow categoricals in msgpack
closes #12573 xref #8632
1 parent 8890cc1 commit 2fd0a06

File tree

3 files changed

+68
-16
lines changed

3 files changed

+68
-16
lines changed

doc/source/whatsnew/v0.18.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ Other Enhancements
7878
- ``pd.read_csv()`` now supports opening ZIP files that contain a single CSV, via extension inference or explicit ``compression='zip'`` (:issue:`12175`)
7979
- ``pd.read_csv()`` now supports opening files using xz compression, via extension inference or when ``compression='xz'`` is explicitly specified; ``xz`` compression is also supported by ``DataFrame.to_csv`` in the same way (:issue:`11852`)
8080
- ``pd.read_msgpack()`` now always gives writeable ndarrays even when compression is used (:issue:`12359`).
81+
- ``to_msgpack()`` and ``pd.read_msgpack()`` now support serializing and de-serializing categoricals with msgpack (:issue:`12573`)
8182
- ``interpolate()`` now supports ``method='akima'`` (:issue:`7588`).
8283
- ``Index.take`` now handles ``allow_fill`` and ``fill_value`` consistently (:issue:`12631`)
8384

pandas/io/packers.py

+35-11
Original file line numberDiff line numberDiff line change
@@ -49,16 +49,15 @@
4949
from pandas.compat import u, u_safe
5050
from pandas import (Timestamp, Period, Series, DataFrame, # noqa
5151
Index, MultiIndex, Float64Index, Int64Index,
52-
Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT)
52+
Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT,
53+
Categorical)
5354
from pandas.tslib import NaTType
5455
from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel
5556
from pandas.sparse.array import BlockIndex, IntIndex
5657
from pandas.core.generic import NDFrame
57-
from pandas.core.common import (
58-
PerformanceWarning,
59-
needs_i8_conversion,
60-
pandas_dtype,
61-
)
58+
from pandas.core.common import (PerformanceWarning,
59+
is_categorical_dtype, is_object_dtype,
60+
needs_i8_conversion, pandas_dtype)
6261
from pandas.io.common import get_filepath_or_buffer
6362
from pandas.core.internals import BlockManager, make_block
6463
import pandas.core.internals as internals
@@ -226,6 +225,7 @@ def read(fh):
226225
# this is platform int, which we need to remap to np.int64
227226
# for compat on windows platforms
228227
7: np.dtype('int64'),
228+
'category': 'category'
229229
}
230230

231231

@@ -257,14 +257,17 @@ def convert(values):
257257
""" convert the numpy values to a list """
258258

259259
dtype = values.dtype
260+
261+
if is_categorical_dtype(values):
262+
return values
263+
264+
elif is_object_dtype(dtype):
265+
return values.ravel().tolist()
266+
260267
if needs_i8_conversion(dtype):
261268
values = values.view('i8')
262269
v = values.ravel()
263270

264-
# convert object
265-
if dtype == np.object_:
266-
return v.tolist()
267-
268271
if compressor == 'zlib':
269272
_check_zlib()
270273

@@ -298,7 +301,10 @@ def unconvert(values, dtype, compress=None):
298301
if as_is_ext:
299302
values = values.data
300303

301-
if dtype == np.object_:
304+
if is_categorical_dtype(dtype):
305+
return values
306+
307+
elif is_object_dtype(dtype):
302308
return np.array(values, dtype=object)
303309

304310
dtype = pandas_dtype(dtype).base
@@ -393,6 +399,16 @@ def encode(obj):
393399
u'dtype': u(obj.dtype.name),
394400
u'data': convert(obj.values),
395401
u'compress': compressor}
402+
403+
elif isinstance(obj, Categorical):
404+
return {u'typ': u'category',
405+
u'klass': u(obj.__class__.__name__),
406+
u'name': getattr(obj, 'name', None),
407+
u'codes': obj.codes,
408+
u'categories': obj.categories,
409+
u'ordered': obj.ordered,
410+
u'compress': compressor}
411+
396412
elif isinstance(obj, Series):
397413
if isinstance(obj, SparseSeries):
398414
raise NotImplementedError(
@@ -576,10 +592,18 @@ def decode(obj):
576592
result = result.tz_localize('UTC').tz_convert(tz)
577593
return result
578594

595+
elif typ == u'category':
596+
from_codes = globals()[obj[u'klass']].from_codes
597+
return from_codes(codes=obj[u'codes'],
598+
categories=obj[u'categories'],
599+
ordered=obj[u'ordered'],
600+
name=obj[u'name'])
601+
579602
elif typ == u'series':
580603
dtype = dtype_for(obj[u'dtype'])
581604
pd_dtype = pandas_dtype(dtype)
582605
np_dtype = pandas_dtype(dtype).base
606+
583607
index = obj[u'index']
584608
result = globals()[obj[u'klass']](unconvert(obj[u'data'], dtype,
585609
obj[u'compress']),

pandas/io/tests/test_packers.py

+32-5
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@
99
from pandas import compat
1010
from pandas.compat import u
1111
from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range,
12-
date_range, period_range, Index)
12+
date_range, period_range, Index, Categorical)
1313
from pandas.core.common import PerformanceWarning
1414
from pandas.io.packers import to_msgpack, read_msgpack
1515
import pandas.util.testing as tm
16-
from pandas.util.testing import (ensure_clean, assert_index_equal,
17-
assert_series_equal,
16+
from pandas.util.testing import (ensure_clean,
17+
assert_categorical_equal,
1818
assert_frame_equal,
19+
assert_index_equal,
20+
assert_series_equal,
1921
patch)
2022
from pandas.tests.test_panel import assert_panel_equal
2123

@@ -335,7 +337,7 @@ def setUp(self):
335337
'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
336338
'F': [Timestamp('20130102', tz='US/Eastern')] * 2 +
337339
[Timestamp('20130603', tz='CET')] * 3,
338-
'G': [Timestamp('20130102', tz='US/Eastern')] * 5
340+
'G': [Timestamp('20130102', tz='US/Eastern')] * 5,
339341
}
340342

341343
self.d['float'] = Series(data['A'])
@@ -353,6 +355,29 @@ def test_basic(self):
353355
assert_series_equal(i, i_rec)
354356

355357

358+
class TestCategorical(TestPackers):
359+
360+
def setUp(self):
361+
super(TestCategorical, self).setUp()
362+
363+
self.d = {}
364+
365+
self.d['plain_str'] = Categorical(['a', 'b', 'c', 'd', 'e'])
366+
self.d['plain_str_ordered'] = Categorical(['a', 'b', 'c', 'd', 'e'],
367+
ordered=True)
368+
369+
self.d['plain_int'] = Categorical([5, 6, 7, 8])
370+
self.d['plain_int_ordered'] = Categorical([5, 6, 7, 8], ordered=True)
371+
372+
def test_basic(self):
373+
374+
# run multiple times here
375+
for n in range(10):
376+
for s, i in self.d.items():
377+
i_rec = self.encode_decode(i)
378+
assert_categorical_equal(i, i_rec)
379+
380+
356381
class TestNDFrame(TestPackers):
357382

358383
def setUp(self):
@@ -365,7 +390,9 @@ def setUp(self):
365390
'D': date_range('1/1/2009', periods=5),
366391
'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
367392
'F': [Timestamp('20130102', tz='US/Eastern')] * 5,
368-
'G': [Timestamp('20130603', tz='CET')] * 5
393+
'G': [Timestamp('20130603', tz='CET')] * 5,
394+
'H': Categorical(['a', 'b', 'c', 'd', 'e']),
395+
'I': Categorical(['a', 'b', 'c', 'd', 'e'], ordered=True),
369396
}
370397

371398
self.frame = {

0 commit comments

Comments
 (0)