Skip to content

Commit ff87282

Browse files
committed
ENH: allow categoricals in msgpack
DOC: support for categoricals in read_msgpack Add TestCategorical test cases Add Categorical ordered=True ndframe test
1 parent a544e9e commit ff87282

File tree

3 files changed

+61
-6
lines changed

3 files changed

+61
-6
lines changed

doc/source/whatsnew/v0.18.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ Other Enhancements
7878
- ``pd.read_csv()`` now supports opening ZIP files that contain a single CSV, via extension inference or explicit ``compression='zip'`` (:issue:`12175`)
7979
- ``pd.read_csv()`` now supports opening files using xz compression, via extension inference or when ``compression='xz'`` is explicitly specified; ``xz`` compression is also supported by ``DataFrame.to_csv`` in the same way (:issue:`11852`)
8080
- ``pd.read_msgpack()`` now always gives writeable ndarrays even when compression is used (:issue:`12359`).
81+
- ``pd.to_msgpack()`` and ``pd.read_msgpack()`` now support serializing and de-serializing categoricals with msgpack (:issue:`12573`)
8182
- ``interpolate()`` now supports ``method='akima'`` (:issue:`7588`).
8283
- ``Index.take`` now handles ``allow_fill`` and ``fill_value`` consistently (:issue:`12631`)
8384

pandas/io/packers.py

+28-1
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,15 @@
4949
from pandas.compat import u, u_safe
5050
from pandas import (Timestamp, Period, Series, DataFrame, # noqa
5151
Index, MultiIndex, Float64Index, Int64Index,
52-
Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT)
52+
Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT,
53+
Categorical)
5354
from pandas.tslib import NaTType
5455
from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel
5556
from pandas.sparse.array import BlockIndex, IntIndex
5657
from pandas.core.generic import NDFrame
5758
from pandas.core.common import (
5859
PerformanceWarning,
60+
is_categorical_dtype,
5961
needs_i8_conversion,
6062
pandas_dtype,
6163
)
@@ -226,6 +228,7 @@ def read(fh):
226228
# this is platform int, which we need to remap to np.int64
227229
# for compat on windows platforms
228230
7: np.dtype('int64'),
231+
'category': 'category'
229232
}
230233

231234

@@ -262,6 +265,9 @@ def convert(values):
262265
v = values.ravel()
263266

264267
# convert object
268+
if is_categorical_dtype(values):
269+
return values
270+
265271
if dtype == np.object_:
266272
return v.tolist()
267273

@@ -298,6 +304,9 @@ def unconvert(values, dtype, compress=None):
298304
if as_is_ext:
299305
values = values.data
300306

307+
if is_categorical_dtype(dtype):
308+
return values
309+
301310
if dtype == np.object_:
302311
return np.array(values, dtype=object)
303312

@@ -393,6 +402,16 @@ def encode(obj):
393402
u'dtype': u(obj.dtype.name),
394403
u'data': convert(obj.values),
395404
u'compress': compressor}
405+
406+
elif isinstance(obj, Categorical):
407+
return {u'typ': u'category',
408+
u'klass': u(obj.__class__.__name__),
409+
u'name': getattr(obj, 'name', None),
410+
u'codes': obj.codes,
411+
u'categories': obj.categories,
412+
u'ordered': obj.ordered,
413+
u'compress': compressor}
414+
396415
elif isinstance(obj, Series):
397416
if isinstance(obj, SparseSeries):
398417
raise NotImplementedError(
@@ -576,10 +595,18 @@ def decode(obj):
576595
result = result.tz_localize('UTC').tz_convert(tz)
577596
return result
578597

598+
elif typ == u'category':
599+
from_codes = globals()[obj[u'klass']].from_codes
600+
return from_codes(codes=obj[u'codes'],
601+
categories=obj[u'categories'],
602+
ordered=obj[u'ordered'],
603+
name=obj[u'name'])
604+
579605
elif typ == u'series':
580606
dtype = dtype_for(obj[u'dtype'])
581607
pd_dtype = pandas_dtype(dtype)
582608
np_dtype = pandas_dtype(dtype).base
609+
583610
index = obj[u'index']
584611
result = globals()[obj[u'klass']](unconvert(obj[u'data'], dtype,
585612
obj[u'compress']),

pandas/io/tests/test_packers.py

+32-5
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@
99
from pandas import compat
1010
from pandas.compat import u
1111
from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range,
12-
date_range, period_range, Index)
12+
date_range, period_range, Index, Categorical)
1313
from pandas.core.common import PerformanceWarning
1414
from pandas.io.packers import to_msgpack, read_msgpack
1515
import pandas.util.testing as tm
16-
from pandas.util.testing import (ensure_clean, assert_index_equal,
17-
assert_series_equal,
16+
from pandas.util.testing import (ensure_clean,
17+
assert_categorical_equal,
1818
assert_frame_equal,
19+
assert_index_equal,
20+
assert_series_equal,
1921
patch)
2022
from pandas.tests.test_panel import assert_panel_equal
2123

@@ -335,7 +337,7 @@ def setUp(self):
335337
'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
336338
'F': [Timestamp('20130102', tz='US/Eastern')] * 2 +
337339
[Timestamp('20130603', tz='CET')] * 3,
338-
'G': [Timestamp('20130102', tz='US/Eastern')] * 5
340+
'G': [Timestamp('20130102', tz='US/Eastern')] * 5,
339341
}
340342

341343
self.d['float'] = Series(data['A'])
@@ -353,6 +355,29 @@ def test_basic(self):
353355
assert_series_equal(i, i_rec)
354356

355357

358+
class TestCategorical(TestPackers):
    """Round-trip tests for bare ``Categorical`` objects.

    Each fixture is passed through ``self.encode_decode`` (inherited from
    ``TestPackers``; presumably a to_msgpack/read_msgpack round trip --
    confirm against the base class) and compared with the original using
    ``assert_categorical_equal``.
    """

    def setUp(self):
        """Build string and integer categoricals, unordered and ordered."""
        super(TestCategorical, self).setUp()

        letters = ['a', 'b', 'c', 'd', 'e']
        numbers = [5, 6, 7, 8]

        # Fixture name -> Categorical to push through the packer.
        self.d = {
            'plain_str': Categorical(letters),
            'plain_str_ordered': Categorical(letters, ordered=True),
            'plain_int': Categorical(numbers),
            'plain_int_ordered': Categorical(numbers, ordered=True),
        }

    def test_basic(self):
        """Every fixture must compare equal after encode/decode."""
        # run multiple times here
        for _ in range(10):
            for original in self.d.values():
                roundtripped = self.encode_decode(original)
                assert_categorical_equal(original, roundtripped)
379+
380+
356381
class TestNDFrame(TestPackers):
357382

358383
def setUp(self):
@@ -365,7 +390,9 @@ def setUp(self):
365390
'D': date_range('1/1/2009', periods=5),
366391
'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
367392
'F': [Timestamp('20130102', tz='US/Eastern')] * 5,
368-
'G': [Timestamp('20130603', tz='CET')] * 5
393+
'G': [Timestamp('20130603', tz='CET')] * 5,
394+
'H': Categorical(['a', 'b', 'c', 'd', 'e']),
395+
'I': Categorical(['a', 'b', 'c', 'd', 'e'], ordered=True),
369396
}
370397

371398
self.frame = {

0 commit comments

Comments
 (0)