-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: allow categoricals in msgpack #12573
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,13 +49,15 @@ | |
from pandas.compat import u, u_safe | ||
from pandas import (Timestamp, Period, Series, DataFrame, # noqa | ||
Index, MultiIndex, Float64Index, Int64Index, | ||
Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT) | ||
Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT, | ||
Categorical) | ||
from pandas.tslib import NaTType | ||
from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel | ||
from pandas.sparse.array import BlockIndex, IntIndex | ||
from pandas.core.generic import NDFrame | ||
from pandas.core.common import ( | ||
PerformanceWarning, | ||
is_categorical_dtype, | ||
needs_i8_conversion, | ||
pandas_dtype, | ||
) | ||
|
@@ -226,6 +228,7 @@ def read(fh): | |
# this is platform int, which we need to remap to np.int64 | ||
# for compat on windows platforms | ||
7: np.dtype('int64'), | ||
'category': 'category' | ||
} | ||
|
||
|
||
|
@@ -262,6 +265,9 @@ def convert(values): | |
v = values.ravel() | ||
|
||
# convert object | ||
if is_categorical_dtype(values): | ||
return values | ||
|
||
if dtype == np.object_: | ||
return v.tolist() | ||
|
||
|
@@ -298,6 +304,9 @@ def unconvert(values, dtype, compress=None): | |
if as_is_ext: | ||
values = values.data | ||
|
||
if is_categorical_dtype(dtype): | ||
return values | ||
|
||
if dtype == np.object_: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Comment: same here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reply: Done. |
||
return np.array(values, dtype=object) | ||
|
||
|
@@ -393,6 +402,16 @@ def encode(obj): | |
u'dtype': u(obj.dtype.name), | ||
u'data': convert(obj.values), | ||
u'compress': compressor} | ||
|
||
elif isinstance(obj, Categorical): | ||
return {u'typ': u'category', | ||
u'klass': u(obj.__class__.__name__), | ||
u'name': getattr(obj, 'name', None), | ||
u'codes': obj.codes, | ||
u'categories': obj.categories, | ||
u'ordered': obj.ordered, | ||
u'compress': compressor} | ||
|
||
elif isinstance(obj, Series): | ||
if isinstance(obj, SparseSeries): | ||
raise NotImplementedError( | ||
|
@@ -576,10 +595,18 @@ def decode(obj): | |
result = result.tz_localize('UTC').tz_convert(tz) | ||
return result | ||
|
||
elif typ == u'category': | ||
from_codes = globals()[obj[u'klass']].from_codes | ||
return from_codes(codes=obj[u'codes'], | ||
categories=obj[u'categories'], | ||
ordered=obj[u'ordered'], | ||
name=obj[u'name']) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Comment: remove name. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reply: Done. |
||
|
||
elif typ == u'series': | ||
dtype = dtype_for(obj[u'dtype']) | ||
pd_dtype = pandas_dtype(dtype) | ||
np_dtype = pandas_dtype(dtype).base | ||
|
||
index = obj[u'index'] | ||
result = globals()[obj[u'klass']](unconvert(obj[u'data'], dtype, | ||
obj[u'compress']), | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,13 +9,15 @@ | |
from pandas import compat | ||
from pandas.compat import u | ||
from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, | ||
date_range, period_range, Index) | ||
date_range, period_range, Index, Categorical) | ||
from pandas.core.common import PerformanceWarning | ||
from pandas.io.packers import to_msgpack, read_msgpack | ||
import pandas.util.testing as tm | ||
from pandas.util.testing import (ensure_clean, assert_index_equal, | ||
assert_series_equal, | ||
from pandas.util.testing import (ensure_clean, | ||
assert_categorical_equal, | ||
assert_frame_equal, | ||
assert_index_equal, | ||
assert_series_equal, | ||
patch) | ||
from pandas.tests.test_panel import assert_panel_equal | ||
|
||
|
@@ -335,7 +337,7 @@ def setUp(self): | |
'E': [0., 1, Timestamp('20100101'), 'foo', 2.], | ||
'F': [Timestamp('20130102', tz='US/Eastern')] * 2 + | ||
[Timestamp('20130603', tz='CET')] * 3, | ||
'G': [Timestamp('20130102', tz='US/Eastern')] * 5 | ||
'G': [Timestamp('20130102', tz='US/Eastern')] * 5, | ||
} | ||
|
||
self.d['float'] = Series(data['A']) | ||
|
@@ -353,6 +355,29 @@ def test_basic(self): | |
assert_series_equal(i, i_rec) | ||
|
||
|
||
class TestCategorical(TestPackers): | ||
|
||
def setUp(self): | ||
super(TestCategorical, self).setUp() | ||
|
||
self.d = {} | ||
|
||
self.d['plain_str'] = Categorical(['a', 'b', 'c', 'd', 'e']) | ||
self.d['plain_str_ordered'] = Categorical(['a', 'b', 'c', 'd', 'e'], | ||
ordered=True) | ||
|
||
self.d['plain_int'] = Categorical([5, 6, 7, 8]) | ||
self.d['plain_int_ordered'] = Categorical([5, 6, 7, 8], ordered=True) | ||
|
||
def test_basic(self): | ||
|
||
# run multiple times here | ||
for n in range(10): | ||
for s, i in self.d.items(): | ||
i_rec = self.encode_decode(i) | ||
assert_categorical_equal(i, i_rec) | ||
|
||
|
||
class TestNDFrame(TestPackers): | ||
|
||
def setUp(self): | ||
|
@@ -365,7 +390,9 @@ def setUp(self): | |
'D': date_range('1/1/2009', periods=5), | ||
'E': [0., 1, Timestamp('20100101'), 'foo', 2.], | ||
'F': [Timestamp('20130102', tz='US/Eastern')] * 5, | ||
'G': [Timestamp('20130603', tz='CET')] * 5 | ||
'G': [Timestamp('20130603', tz='CET')] * 5, | ||
'H': Categorical(['a', 'b', 'c', 'd', 'e']), | ||
'I': Categorical(['a', 'b', 'c', 'd', 'e'], ordered=True), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Comment: I'm in a strange position here. I added this test - it passes, but I didn't add the relevant code in convert/unconvert to pass the ordered parameter through. What gives? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reply: That check is not used at all! I was just going to tell you to take it out. a |
||
} | ||
|
||
self.frame = { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same here, should be in `decode`
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.