Skip to content

ENH: allow categoricals in msgpack #12573

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ Other Enhancements
- ``pd.read_csv()`` now supports opening ZIP files that contain a single CSV, via extension inference or explicit ``compression='zip'`` (:issue:`12175`)
- ``pd.read_csv()`` now supports opening files using xz compression, via extension inference or when explicit ``compression='xz'`` is specified; ``xz`` compression is also supported by ``DataFrame.to_csv`` in the same way (:issue:`11852`)
- ``pd.read_msgpack()`` now always gives writeable ndarrays even when compression is used (:issue:`12359`).
- ``pd.to_msgpack()`` and ``pd.read_msgpack()`` now support serializing and de-serializing categoricals with msgpack (:issue:`12573`)
- ``interpolate()`` now supports ``method='akima'`` (:issue:`7588`).
- ``Index.take`` now handles ``allow_fill`` and ``fill_value`` consistently (:issue:`12631`)

Expand Down
29 changes: 28 additions & 1 deletion pandas/io/packers.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,15 @@
from pandas.compat import u, u_safe
from pandas import (Timestamp, Period, Series, DataFrame, # noqa
Index, MultiIndex, Float64Index, Int64Index,
Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT)
Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT,
Categorical)
from pandas.tslib import NaTType
from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel
from pandas.sparse.array import BlockIndex, IntIndex
from pandas.core.generic import NDFrame
from pandas.core.common import (
PerformanceWarning,
is_categorical_dtype,
needs_i8_conversion,
pandas_dtype,
)
Expand Down Expand Up @@ -226,6 +228,7 @@ def read(fh):
# this is platform int, which we need to remap to np.int64
# for compat on windows platforms
7: np.dtype('int64'),
'category': 'category'
}


Expand Down Expand Up @@ -262,6 +265,9 @@ def convert(values):
v = values.ravel()

# convert object
if is_categorical_dtype(values):
return values

if dtype == np.object_:
return v.tolist()

Expand Down Expand Up @@ -298,6 +304,9 @@ def unconvert(values, dtype, compress=None):
if as_is_ext:
values = values.data

if is_categorical_dtype(dtype):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here, should be in decode

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

return values

if dtype == np.object_:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

return np.array(values, dtype=object)

Expand Down Expand Up @@ -393,6 +402,16 @@ def encode(obj):
u'dtype': u(obj.dtype.name),
u'data': convert(obj.values),
u'compress': compressor}

elif isinstance(obj, Categorical):
return {u'typ': u'category',
u'klass': u(obj.__class__.__name__),
u'name': getattr(obj, 'name', None),
u'codes': obj.codes,
u'categories': obj.categories,
u'ordered': obj.ordered,
u'compress': compressor}

elif isinstance(obj, Series):
if isinstance(obj, SparseSeries):
raise NotImplementedError(
Expand Down Expand Up @@ -576,10 +595,18 @@ def decode(obj):
result = result.tz_localize('UTC').tz_convert(tz)
return result

elif typ == u'category':
from_codes = globals()[obj[u'klass']].from_codes
return from_codes(codes=obj[u'codes'],
categories=obj[u'categories'],
ordered=obj[u'ordered'],
name=obj[u'name'])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove name

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done


elif typ == u'series':
dtype = dtype_for(obj[u'dtype'])
pd_dtype = pandas_dtype(dtype)
np_dtype = pandas_dtype(dtype).base

index = obj[u'index']
result = globals()[obj[u'klass']](unconvert(obj[u'data'], dtype,
obj[u'compress']),
Expand Down
37 changes: 32 additions & 5 deletions pandas/io/tests/test_packers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@
from pandas import compat
from pandas.compat import u
from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range,
date_range, period_range, Index)
date_range, period_range, Index, Categorical)
from pandas.core.common import PerformanceWarning
from pandas.io.packers import to_msgpack, read_msgpack
import pandas.util.testing as tm
from pandas.util.testing import (ensure_clean, assert_index_equal,
assert_series_equal,
from pandas.util.testing import (ensure_clean,
assert_categorical_equal,
assert_frame_equal,
assert_index_equal,
assert_series_equal,
patch)
from pandas.tests.test_panel import assert_panel_equal

Expand Down Expand Up @@ -335,7 +337,7 @@ def setUp(self):
'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
'F': [Timestamp('20130102', tz='US/Eastern')] * 2 +
[Timestamp('20130603', tz='CET')] * 3,
'G': [Timestamp('20130102', tz='US/Eastern')] * 5
'G': [Timestamp('20130102', tz='US/Eastern')] * 5,
}

self.d['float'] = Series(data['A'])
Expand All @@ -353,6 +355,29 @@ def test_basic(self):
assert_series_equal(i, i_rec)


class TestCategorical(TestPackers):
    """Check that ``Categorical`` objects survive ``self.encode_decode``."""

    def setUp(self):
        super(TestCategorical, self).setUp()

        # fixtures: string-valued and integer-valued categoricals, each in
        # an unordered and an ordered flavour
        letters = ['a', 'b', 'c', 'd', 'e']
        numbers = [5, 6, 7, 8]
        self.d = {
            'plain_str': Categorical(letters),
            'plain_str_ordered': Categorical(letters, ordered=True),
            'plain_int': Categorical(numbers),
            'plain_int_ordered': Categorical(numbers, ordered=True),
        }

    def test_basic(self):

        # run the round trip ten times over every fixture
        for _ in range(10):
            for expected in self.d.values():
                result = self.encode_decode(expected)
                assert_categorical_equal(expected, result)


class TestNDFrame(TestPackers):

def setUp(self):
Expand All @@ -365,7 +390,9 @@ def setUp(self):
'D': date_range('1/1/2009', periods=5),
'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
'F': [Timestamp('20130102', tz='US/Eastern')] * 5,
'G': [Timestamp('20130603', tz='CET')] * 5
'G': [Timestamp('20130603', tz='CET')] * 5,
'H': Categorical(['a', 'b', 'c', 'd', 'e']),
'I': Categorical(['a', 'b', 'c', 'd', 'e'], ordered=True),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm in a strange position here. I added this test - it passes, but I didn't add the relevant code in convert/unconvert to pass the ordered parameter through. What gives?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That check is not used at all! I was just going to tell you to take it out. A Categorical is fully serialized/deserialized via encode/decode, so the dtype is NEVER category, except when it's a Series, and that case is already handled.

}

self.frame = {
Expand Down