From 98671ee1a33c44cdee2f7b2dce06a0f7e385ebb6 Mon Sep 17 00:00:00 2001
From: jreback
Date: Fri, 3 May 2013 15:24:01 -0400
Subject: [PATCH 1/5] ENH: support for msgpack serialization/deserialization

DOC: install.rst mention
DOC: added license from msgpack_numpy
PERF: changed Timestamp and DatetimeIndex serialization for speedups
add vb_suite benchmarks
ENH: added to_msgpack method in generic.py, and default import into pandas
TST: allow packers to always be imported, fail on usage with no msgpack installed
---
 LICENSES/MSGPACK_NUMPY_LICENSE  |  33 +++
 doc/source/install.rst          |   1 +
 pandas/__init__.py              |   1 +
 pandas/core/generic.py          |   4 +
 pandas/core/internals.py        |   1 -
 pandas/io/packers.py            | 378 ++++++++++++++++++++++++++++++++
 pandas/io/tests/test_packers.py | 313 ++++++++++++++++++++++++++
 vb_suite/packers.py             |  80 +++++++
 vb_suite/suite.py               |   1 +
 9 files changed, 811 insertions(+), 1 deletion(-)
 create mode 100644 LICENSES/MSGPACK_NUMPY_LICENSE
 create mode 100644 pandas/io/packers.py
 create mode 100644 pandas/io/tests/test_packers.py
 create mode 100644 vb_suite/packers.py

diff --git a/LICENSES/MSGPACK_NUMPY_LICENSE b/LICENSES/MSGPACK_NUMPY_LICENSE
new file mode 100644
index 0000000000000..57ea631f0f66d
--- /dev/null
+++ b/LICENSES/MSGPACK_NUMPY_LICENSE
@@ -0,0 +1,33 @@
+.. -*- rst -*-
+
+License
+=======
+
+Copyright (c) 2013, Lev Givon.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+  copyright notice, this list of conditions and the following
+  disclaimer in the documentation and/or other materials provided
+  with the distribution.
+* Neither the name of Lev Givon nor the names of any
+  contributors may be used to endorse or promote products derived
+  from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/doc/source/install.rst b/doc/source/install.rst
index 9d14d1b11c6b1..360ded91c86f0 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -93,6 +93,7 @@ Optional Dependencies
     version. Version 0.17.1 or higher.
   * `SciPy `__: miscellaneous statistical functions
   * `PyTables `__: necessary for HDF5-based storage
+  * `msgpack `__: necessary for msgpack-based serialization
   * `matplotlib `__: for plotting
   * `statsmodels `__
      * Needed for parts of :mod:`pandas.stats`
diff --git a/pandas/__init__.py b/pandas/__init__.py
index bf5bcc81bc21e..3aee9b2ab67d8 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -32,6 +32,7 @@
 from pandas.io.parsers import (read_csv, read_table, read_clipboard,
                                read_fwf, to_clipboard, ExcelFile,
                                ExcelWriter)
+from pandas.io.packers import read_msgpack, to_msgpack
 from pandas.io.pytables import HDFStore, Term, get_store, read_hdf
 from pandas.io.html import read_html
 from pandas.util.testing import debug
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index ed90aab715cfd..571ab4fab07ce 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -487,6 +487,10 @@ def to_hdf(self, path_or_buf, key, **kwargs):
         from pandas.io import pytables
         return pytables.to_hdf(path_or_buf, key, self, **kwargs)
 
+    def to_msgpack(self, path_or_buf, **kwargs):
+        from pandas.io import packers
+        return packers.to_msgpack(path_or_buf, self, **kwargs)
+
 # install the indexers
 for _name, _indexer in indexing.get_indexers_list():
     PandasObject._create_indexer(_name,_indexer)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 5c0f9253beb62..4628773491d61 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -15,7 +15,6 @@
 from pandas.tslib import Timestamp
 from pandas.util import py3compat
 
-
 class Block(object):
     """
     Canonical n-dimensional unit of homogeneous dtype contained in a pandas
diff --git a/pandas/io/packers.py b/pandas/io/packers.py
new file mode 100644
index 0000000000000..81c85965da4b9
--- /dev/null
+++ b/pandas/io/packers.py
@@ -0,0 +1,378 @@
+"""
+Msgpack serializer support for reading and writing pandas data structures
+to disk
+"""
+
+# portions of the msgpack_numpy package, by Lev Givon, were incorporated
+# into this module (and test_packers.py)
+
+"""
+License
+=======
+
+Copyright (c) 2013, Lev Givon.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+  copyright notice, this list of conditions and the following
+  disclaimer in the documentation and/or other materials provided
+  with the distribution.
+* Neither the name of Lev Givon nor the names of any
+  contributors may be used to endorse or promote products derived
+  from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + +from datetime import datetime +import time +import re +import copy +import itertools +import warnings + +import numpy as np +from pandas import ( + Timestamp, Period, Series, TimeSeries, DataFrame, Panel, Panel4D, + Index, MultiIndex, Int64Index, PeriodIndex, DatetimeIndex, NaT +) +from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel +from pandas.sparse.array import BlockIndex, IntIndex +from pandas.tseries.api import PeriodIndex, DatetimeIndex +from pandas.core.index import Int64Index, _ensure_index +import pandas.core.common as com +from pandas.core.common import needs_i8_conversion +from pandas.core.internals import BlockManager, make_block +import pandas.core.internals as internals + +try: + import msgpack + from msgpack import _packer, _unpacker + _USE_MSGPACK = True +except: + _USE_MSGPACK = False + +def to_msgpack(path, obj, **kwargs): + """ + msgpack (serialize) object to input file path + + Parameters + ---------- + path : string + File path + obj : any object + """ + if not _USE_MSGPACK: + raise Exception("please install msgpack to create msgpack stores!") + f = open(path, 'wb') + try: + f.write(msgpack.packb(obj)) + finally: + f.close() + + +def read_msgpack(path): + """ + Load msgpack pandas object from the specified + file path + + Parameters + ---------- + path : string + File path + + Returns + ------- + obj : type of object stored in file + """ + if not _USE_MSGPACK: + raise Exception("please install msgpack to read msgpack stores!") + with open(path,'rb') as fh: + return msgpack.unpackb(fh.read()) + +dtype_dict = { 'datetime64[ns]' : np.dtype('M8[ns]'), + 'timedelta64[ns]' : np.dtype('m8[ns]') } + +def dtype_for(t): + if t in dtype_dict: + return dtype_dict[t] + return np.typeDict[t] + +c2f_dict = {'complex': np.float64, + 'complex128': np.float64, + 'complex256': np.float128, + 'complex64': np.float32} + +def c2f(r, i, ctype_name): + """ + Convert strings to complex number instance with specified numpy type. 
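+
+    A doctest sketch of the expected behavior (example added for
+    illustration; it assumes numpy's default complex repr):
+
+    >>> c2f('1.0', '2.0', 'complex128')  # illustrative only
+    (1+2j)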
+ """ + + ftype = c2f_dict[ctype_name] + return np.typeDict[ctype_name](ftype(r)+1j*ftype(i)) + +def convert(values): + """ convert the numpy values to a list """ + + dtype = values.dtype + if needs_i8_conversion(dtype): + values = values.view('i8') + return values.ravel().tolist() + + +def encode(obj): + """ + Data encoder + """ + + if isinstance(obj, Index): + if isinstance(obj, PeriodIndex): + return {'typ' : 'period_index', + 'klass' : obj.__class__.__name__, + 'name' : getattr(obj,'name',None), + 'dtype': obj.dtype.name, + 'data': obj.tolist() } + elif isinstance(obj, DatetimeIndex): + return {'typ' : 'datetime_index', + 'klass' : obj.__class__.__name__, + 'name' : getattr(obj,'name',None), + 'dtype': obj.dtype.name, + 'data': obj.values.view('i8').tolist(), + 'freq' : obj.freqstr, + 'tz' : obj.tz} + elif isinstance(obj, MultiIndex): + return {'typ' : 'multi_index', + 'klass' : obj.__class__.__name__, + 'names' : getattr(obj,'names',None), + 'dtype': obj.dtype.name, + 'data': obj.tolist() } + else: + return {'typ' : 'index', + 'klass' : obj.__class__.__name__, + 'name' : getattr(obj,'name',None), + 'dtype': obj.dtype.name, + 'data': obj.tolist() } + elif isinstance(obj, Series): + if isinstance(obj, SparseSeries): + import pdb; pdb.set_trace() + else: + return {'typ' : 'series', + 'klass' : obj.__class__.__name__, + 'name' : getattr(obj,'name',None), + 'index' : obj.index, + 'dtype': obj.dtype.name, + 'data': convert(obj.values) } + elif isinstance(obj, DataFrame): + if isinstance(obj, SparseDataFrame): + import pdb; pdb.set_trace() + else: + + data = obj._data + if not data.is_consolidated(): + data = data.consolidate() + + # the block manager + return {'typ' : 'dataframe', + 'klass' : obj.__class__.__name__, + 'axes' : data.axes, + 'blocks' : [ { 'items' : b.items, + 'values' : convert(b.values), + 'shape' : b.values.shape, + 'dtype' : b.dtype.name, + 'klass' : b.__class__.__name__ + } for b in data.blocks ] } + + elif isinstance(obj, datetime): + if isinstance(obj, Timestamp): + tz = obj.tzinfo + if tz is not None: + tz = tz.zone + offset = obj.offset + if offset is not None: + offset = offset.freqstr + return {'typ' : 'timestamp', + 'value': obj.value, + 'offset' : offset, + 'tz' : tz} + return { 'typ' : 'datetime', + 'data' : obj.isoformat() } + elif isinstance(obj, Period): + return {'typ' : 'period', + 'ordinal' : obj.ordinal, + 'freq' : obj.freq } + elif isinstance(obj, np.ndarray): + return {'typ' : 'ndarray', + 'shape': obj.shape, + 'ndim': obj.ndim, + 'dtype': obj.dtype.name, + 'data': convert(obj)} + elif isinstance(obj, np.number): + if np.iscomplexobj(obj): + return {'typ' : 'np_scalar', + 'sub_typ' : 'np_complex', + 'dtype': obj.dtype.name, + 'real': obj.real.__repr__(), + 'imag': obj.imag.__repr__()} + else: + return {'typ' : 'np_scalar', + 'dtype': obj.dtype.name, + 'data': obj.__repr__()} + elif isinstance(obj, complex): + return {'typ' : 'np_complex', + 'real': obj.real.__repr__(), + 'imag': obj.imag.__repr__()} + else: + import pdb; pdb.set_trace() + return obj + +def decode(obj): + """ + Decoder for deserializing numpy data types. 
+ """ + + typ = obj.get('typ') + if typ is None: + return obj + elif typ == 'timestamp': + return Timestamp(obj['value'],tz=obj['tz'],offset=obj['offset']) + elif typ == 'period': + return Period(ordinal=obj['ordinal'],freq=obj['freq']) + elif typ == 'index': + dtype = dtype_for(obj['dtype']) + data = obj['data'] + return globals()[obj['klass']](data,dtype=dtype,name=obj['name']) + elif typ == 'multi_index': + return globals()[obj['klass']].from_tuples(obj['data'],names=obj['names']) + elif typ == 'period_index': + return globals()[obj['klass']](obj['data'],name=obj['name']) + elif typ == 'datetime_index': + return globals()[obj['klass']](obj['data'],freq=obj['freq'],tz=obj['tz'],name=obj['name']) + elif typ == 'series': + dtype = dtype_for(obj['dtype']) + index = obj['index'] + return globals()[obj['klass']](obj['data'],index=index,dtype=dtype,name=obj['name']) + elif typ == 'dataframe': + axes = obj['axes'] + + def create_block(b): + dtype = dtype_for(b['dtype']) + return make_block(np.array(b['values'],dtype=dtype).reshape(b['shape']),b['items'],axes[0],klass=getattr(internals,b['klass'])) + + blocks = [ create_block(b) for b in obj['blocks'] ] + return globals()[obj['klass']](BlockManager(blocks, axes)) + elif typ == 'datetime': + import pdb; pdb.set_trace() + return datetime.fromtimestamp(obj['data']) + elif typ == 'ndarray': + return np.array(obj['data'], + dtype=np.typeDict[obj['dtype']], + ndmin=obj['ndim']).reshape(obj['shape']) + elif typ == 'np_scalar': + if obj.get('sub_typ') == 'np_complex': + return c2f(obj['real'], obj['imag'], obj['dtype']) + else: + return np.typeDict[obj['dtype']](obj['data']) + elif typ == 'np_complex': + return complex(obj['real']+'+'+obj['imag']+'j') + elif isinstance(obj, (dict,list,set)): + return obj + else: + import pdb; pdb.set_trace() + return obj + +def pack(o, stream, default=encode, + encoding='utf-8', unicode_errors='strict'): + """ + Pack an object and write it to a stream. + """ + + _packer.pack(o, stream, default=default, + encoding=encoding, + unicode_errors=unicode_errors) +def packb(o, default=encode, + encoding='utf-8', unicode_errors='strict', use_single_float=False): + """ + Pack an object and return the packed bytes. + """ + + return _packer.packb(o, default=default, encoding=encoding, + unicode_errors=unicode_errors, + use_single_float=use_single_float) + +def unpack(stream, object_hook=decode, list_hook=None, use_list=None, + encoding='utf-8', unicode_errors='strict', object_pairs_hook=None): + """ + Unpack a packed object from a stream. + """ + + return _unpacker.unpack(stream, object_hook=object_hook, + list_hook=list_hook, use_list=use_list, + encoding=encoding, + unicode_errors=unicode_errors, + object_pairs_hook=object_pairs_hook) +def unpackb(packed, object_hook=decode, + list_hook=None, use_list=None, encoding='utf-8', + unicode_errors='strict', object_pairs_hook=None): + """ + Unpack a packed object. 
+ """ + + return _unpacker.unpackb(packed, object_hook=object_hook, + list_hook=list_hook, + use_list=use_list, encoding=encoding, + unicode_errors=unicode_errors, + object_pairs_hook=object_pairs_hook) + +if _USE_MSGPACK: + + class Packer(_packer.Packer): + def __init__(self, default=encode, + encoding='utf-8', + unicode_errors='strict', + use_single_float=False): + super(Packer, self).__init__(default=default, + encoding=encoding, + unicode_errors=unicode_errors, + use_single_float=use_single_float) + + class Unpacker(_unpacker.Unpacker): + def __init__(self, file_like=None, read_size=0, use_list=None, + object_hook=decode, + object_pairs_hook=None, list_hook=None, encoding='utf-8', + unicode_errors='strict', max_buffer_size=0): + super(Unpacker, self).__init__(file_like=file_like, + read_size=read_size, + use_list=use_list, + object_hook=object_hook, + object_pairs_hook=object_pairs_hook, + list_hook=list_hook, + encoding=encoding, + unicode_errors=unicode_errors, + max_buffer_size=max_buffer_size) + + setattr(msgpack, 'Packer', Packer) + setattr(msgpack, 'Unpacker', Unpacker) + setattr(msgpack, 'load', unpack) + setattr(msgpack, 'loads', unpackb) + setattr(msgpack, 'dump', pack) + setattr(msgpack, 'dumps', packb) + setattr(msgpack, 'pack', pack) + setattr(msgpack, 'packb', packb) + setattr(msgpack, 'unpack', unpack) + setattr(msgpack, 'unpackb', unpackb) diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py new file mode 100644 index 0000000000000..08986cb082131 --- /dev/null +++ b/pandas/io/tests/test_packers.py @@ -0,0 +1,313 @@ +import nose +import unittest +import os +import sys +import warnings + +import datetime +import numpy as np + +from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, + date_range, period_range, Index, SparseSeries, SparseDataFrame, + SparsePanel) +import pandas.util.testing as tm +from pandas.util.testing import ensure_clean +from pandas.tests.test_series import assert_series_equal +from pandas.tests.test_frame import assert_frame_equal +from pandas import concat, Timestamp, tslib + +from numpy.testing.decorators import slow +nan = np.nan + +from pandas.io.packers import to_msgpack, read_msgpack, _USE_MSGPACK + +if not _USE_MSGPACK: + raise nose.SkipTest('no msgpack') + +_multiprocess_can_split_ = False + +class Test(unittest.TestCase): + + def setUp(self): + self.path = '__%s__.msg' % tm.rands(10) + + def tearDown(self): + pass + + def encode_decode(self, x): + with ensure_clean(self.path) as p: + to_msgpack(p,x) + return read_msgpack(p) + +class TestNumpy(Test): + + def test_numpy_scalar_float(self): + x = np.float32(np.random.rand()) + x_rec = self.encode_decode(x) + assert x == x_rec and type(x) == type(x_rec) + + def test_numpy_scalar_complex(self): + x = np.complex64(np.random.rand()+1j*np.random.rand()) + x_rec = self.encode_decode(x) + assert x == x_rec and type(x) == type(x_rec) + + def test_scalar_float(self): + x = np.random.rand() + x_rec = self.encode_decode(x) + assert x == x_rec and type(x) == type(x_rec) + + def test_scalar_complex(self): + x = np.random.rand()+1j*np.random.rand() + x_rec = self.encode_decode(x) + assert x == x_rec and type(x) == type(x_rec) + + def test_list_numpy_float(self): + x = [np.float32(np.random.rand()) for i in xrange(5)] + x_rec = self.encode_decode(x) + assert all(map(lambda x,y: x == y, x, x_rec)) and \ + all(map(lambda x,y: type(x) == type(y), x, x_rec)) + + def test_list_numpy_float_complex(self): + x = [np.float32(np.random.rand()) for i in xrange(5)] + \ + 
[np.complex128(np.random.rand()+1j*np.random.rand()) for i in xrange(5)] + x_rec = self.encode_decode(x) + assert all(map(lambda x,y: x == y, x, x_rec)) and \ + all(map(lambda x,y: type(x) == type(y), x, x_rec)) + + def test_list_float(self): + x = [np.random.rand() for i in xrange(5)] + x_rec = self.encode_decode(x) + assert all(map(lambda x,y: x == y, x, x_rec)) and \ + all(map(lambda x,y: type(x) == type(y), x, x_rec)) + + def test_list_float_complex(self): + x = [np.random.rand() for i in xrange(5)] + \ + [(np.random.rand()+1j*np.random.rand()) for i in xrange(5)] + x_rec = self.encode_decode(x) + assert all(map(lambda x,y: x == y, x, x_rec)) and \ + all(map(lambda x,y: type(x) == type(y), x, x_rec)) + + def test_dict_float(self): + x = {'foo': 1.0, 'bar': 2.0} + x_rec = self.encode_decode(x) + assert all(map(lambda x,y: x == y, x.values(), x_rec.values())) and \ + all(map(lambda x,y: type(x) == type(y), x.values(), x_rec.values())) + + def test_dict_complex(self): + x = {'foo': 1.0+1.0j, 'bar': 2.0+2.0j} + x_rec = self.encode_decode(x) + assert all(map(lambda x,y: x == y, x.values(), x_rec.values())) and \ + all(map(lambda x,y: type(x) == type(y), x.values(), x_rec.values())) + + def test_dict_numpy_float(self): + x = {'foo': np.float32(1.0), 'bar': np.float32(2.0)} + x_rec = self.encode_decode(x) + assert all(map(lambda x,y: x == y, x.values(), x_rec.values())) and \ + all(map(lambda x,y: type(x) == type(y), x.values(), x_rec.values())) + + def test_dict_numpy_complex(self): + x = {'foo': np.complex128(1.0+1.0j), 'bar': np.complex128(2.0+2.0j)} + x_rec = self.encode_decode(x) + assert all(map(lambda x,y: x == y, x.values(), x_rec.values())) and \ + all(map(lambda x,y: type(x) == type(y), x.values(), x_rec.values())) + + def test_numpy_array_float(self): + x = np.random.rand(5).astype(np.float32) + x_rec = self.encode_decode(x) + assert all(map(lambda x,y: x == y, x, x_rec)) and \ + x.dtype == x_rec.dtype + def test_numpy_array_complex(self): + x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) + x_rec = self.encode_decode(x) + assert all(map(lambda x,y: x == y, x, x_rec)) and \ + x.dtype == x_rec.dtype + + def test_list_mixed(self): + x = [1.0, np.float32(3.5), np.complex128(4.25), u'foo'] + x_rec = self.encode_decode(x) + assert all(map(lambda x,y: x == y, x, x_rec)) and \ + all(map(lambda x,y: type(x) == type(y), x, x_rec)) +class TestBasic(Test): + + def test_timestamp(self): + + for i in [ Timestamp('20130101'), Timestamp('20130101',tz='US/Eastern'), + Timestamp('201301010501') ]: + i_rec = self.encode_decode(i) + self.assert_(i == i_rec) + +class TestIndex(Test): + + def setUp(self): + super(TestIndex, self).setUp() + + self.d = { + 'string' : tm.makeStringIndex(100), + 'date' : tm.makeDateIndex(100), + 'int' : tm.makeIntIndex(100), + 'float' : tm.makeFloatIndex(100), + 'empty' : Index([]), + 'tuple' : Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])), + 'period' : Index(period_range('2012-1-1', freq='M', periods=3)), + 'date2' : Index(date_range('2013-01-1', periods=10)), + 'bdate' : Index(bdate_range('2013-01-02',periods=10)), + } + + self.mi = { + 'reg' : MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'), ('foo', 'two'), + ('qux', 'one'), ('qux', 'two')], names=['first','second']), + } + + def test_basic_index(self): + + for s, i in self.d.items(): + i_rec = self.encode_decode(i) + self.assert_(i.equals(i_rec)) + + def test_multi_index(self): + + for s, i in self.mi.items(): + i_rec = self.encode_decode(i) + self.assert_(i.equals(i_rec)) + + def 
test_unicode(self): + i = tm.makeUnicodeIndex(100) + i_rec = self.encode_decode(i) + self.assert_(i.equals(i_rec)) + +class TestSeries(Test): + + def setUp(self): + super(TestSeries, self).setUp() + + self.d = {} + + + s = tm.makeStringSeries() + s.name = 'string' + self.d['string'] = s + + s = tm.makeObjectSeries() + s.name = 'object' + self.d['object'] = s + + s = Series(tslib.iNaT, dtype='M8[ns]', index=range(5)) + self.d['date'] = s + + data = { + 'A': [0., 1., 2., 3., np.nan], + 'B': [0, 1, 0, 1, 0], + 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], + 'D': date_range('1/1/2009', periods=5), + 'E' : [0., 1, Timestamp('20100101'),'foo',2.], + } + + self.d['float'] = Series(data['A']) + self.d['int'] = Series(data['B']) + self.d['mixed'] = Series(data['E']) + + def test_basic(self): + + for s, i in self.d.items(): + i_rec = self.encode_decode(i) + assert_series_equal(i,i_rec) + +class TestFrame(Test): + + def setUp(self): + super(TestFrame, self).setUp() + + data = { + 'A': [0., 1., 2., 3., np.nan], + 'B': [0, 1, 0, 1, 0], + 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], + 'D': date_range('1/1/2009', periods=5), + 'E' : [0., 1, Timestamp('20100101'),'foo',2.], + } + + self.d = { 'float' : DataFrame(dict(A = data['A'], B = Series(data['A']) + 1)), + 'int' : DataFrame(dict(A = data['B'], B = Series(data['B']) + 1)), + 'mixed' : DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])) } + + def test_basic(self): + + for s, i in self.d.items(): + i_rec = self.encode_decode(i) + assert_frame_equal(i,i_rec) + + def test_multi(self): + + i_rec = self.encode_decode(self.d) + for k in self.d.keys(): + assert_frame_equal(self.d[k],i_rec[k]) + + l = [ self.d['float'], self.d['float'].A, self.d['float'].B, None ] + l_rec = self.encode_decode(l) + self.assert_(len(l) == len(l_rec)) + assert_frame_equal(l[0],l_rec[0]) + assert_series_equal(l[1],l_rec[1]) + assert_series_equal(l[2],l_rec[2]) + self.assert_(l[3] == l_rec[3]) + +def _create_sp_series(): + + # nan-based + arr = np.arange(15, dtype=float) + index = np.arange(15) + arr[7:12] = nan + arr[-1:] = nan + + date_index = bdate_range('1/1/2011', periods=len(index)) + bseries = SparseSeries(arr, index=index, kind='block') + bseries.name = 'bseries' + return bseries + +def _create_sp_frame(): + + data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + 'C': np.arange(10), + 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} + + dates = bdate_range('1/1/2011', periods=10) + return SparseDataFrame(data, index=dates) + +def create_data(): + """ create the pickle data """ + + data = { + 'A': [0., 1., 2., 3., np.nan], + 'B': [0, 1, 0, 1, 0], + 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], + 'D': date_range('1/1/2009', periods=5), + 'E' : [0., 1, Timestamp('20100101'),'foo',2.], + } + + index = dict(int = Index(np.arange(10)), + date = date_range('20130101',periods=10)) + mi = dict(reg = MultiIndex.from_tuples(zip([['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]), + names=['first', 'second'])) + series = dict(float = Series(data['A']), + int = Series(data['B']), + mixed = Series(data['E'])) + frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)), + int = DataFrame(dict(A = series['int'] , B = series['int'] + 1)), + mixed = DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']]))) + panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1))) + + + + return dict( series = series, + frame = frame, + 
panel = panel,
+                 index = index,
+                 mi = mi,
+                 sp_series = dict(float = _create_sp_series()),
+                 sp_frame = dict(float = _create_sp_frame())
+                 )
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
+                   exit=False)
diff --git a/vb_suite/packers.py b/vb_suite/packers.py
new file mode 100644
index 0000000000000..6733b5fa6dfb8
--- /dev/null
+++ b/vb_suite/packers.py
@@ -0,0 +1,80 @@
+from vbench.api import Benchmark
+from datetime import datetime
+
+start_date = datetime(2013, 5, 1)
+
+common_setup = """from pandas_vb_common import *
+import os
+from pandas.io import packers
+from pandas.core import common as com
+
+f = '__test__.msg'
+def remove(f):
+    try:
+        os.remove(f)
+    except:
+        pass
+
+"""
+
+#----------------------------------------------------------------------
+# read a pack
+
+setup1 = common_setup + """
+index = date_range('20000101',periods=25000,freq='H')
+df = DataFrame({'float1' : randn(25000),
+                'float2' : randn(25000)},
+               index=index)
+remove(f)
+packers.to_msgpack(f,df)
+"""
+
+read_pack = Benchmark("packers.read_msgpack(f)", setup1,
+                      start_date=start_date)
+
+
+#----------------------------------------------------------------------
+# write to a pack
+
+setup2 = common_setup + """
+index = date_range('20000101',periods=25000,freq='H')
+df = DataFrame({'float1' : randn(25000),
+                'float2' : randn(25000)},
+               index=index)
+remove(f)
+"""
+
+write_pack = Benchmark(
+    "packers.to_msgpack(f,df)", setup2, cleanup="remove(f)",
+    start_date=start_date)
+
+#----------------------------------------------------------------------
+# read a pickle
+
+setup1 = common_setup + """
+index = date_range('20000101',periods=25000,freq='H')
+df = DataFrame({'float1' : randn(25000),
+                'float2' : randn(25000)},
+               index=index)
+remove(f)
+df.save(f)
+"""
+
+read_pickle = Benchmark("com.load(f)", setup1,
+                        start_date=start_date)
+
+
+#----------------------------------------------------------------------
+# write to a pickle
+
+setup2 = common_setup + """
+index = date_range('20000101',periods=25000,freq='H')
+df = DataFrame({'float1' : randn(25000),
+                'float2' : randn(25000)},
+               index=index)
+remove(f)
+"""
+
+write_pickle = Benchmark(
+    "df.save(f)", setup2, cleanup="remove(f)",
+    start_date=start_date)
diff --git a/vb_suite/suite.py b/vb_suite/suite.py
index 905c4371837cc..4ac967dc1664a 100644
--- a/vb_suite/suite.py
+++ b/vb_suite/suite.py
@@ -16,6 +16,7 @@
           'join_merge',
           'miscellaneous',
           'panel_ctor',
+          'packers',
           'parser',
           'reindex',
           'replace',
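
A minimal sketch of the surface area added by this first patch; ``df`` here is
a placeholder DataFrame, not a name defined in the patch itself::

    import pandas as pd

    df.to_msgpack('foo.msg')      # NDFrame method added in pandas/core/generic.py
    pd.read_msgpack('foo.msg')    # module-level reader exported in pandas/__init__.py
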
From 4870ad95b23c7ac77880e3d388deb0ca48c66a4a Mon Sep 17 00:00:00 2001
From: jreback
Date: Fri, 3 May 2013 19:37:05 -0400
Subject: [PATCH 2/5] DOC: added mentions in release notes, v0.11.1, basics

ENH: provide automatic list if multiple args passed to to_msgpack
DOC: changed docs to 0.12
ENH: iterator support for stream unpacking
---
 RELEASE.rst                     |   3 +
 doc/source/basics.rst           |  41 ------------
 doc/source/io.rst               |  73 +++++++++++++++++++++
 doc/source/v0.11.1.txt          |   9 ++-
 doc/source/v0.12.0.txt          |  35 ++++++++++
 doc/source/whatsnew.rst         |   2 +
 pandas/io/packers.py            | 113 +++++++++++++++++---------------
 pandas/io/tests/test_packers.py |  34 ++++++++--
 8 files changed, 207 insertions(+), 103 deletions(-)
 create mode 100644 doc/source/v0.12.0.txt

diff --git a/RELEASE.rst b/RELEASE.rst
index 77e8e85db6a76..cefb18c9f0ddf 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -32,6 +32,8 @@ pandas 0.11.1
   - pd.read_html() can now parse HTML string, files or urls and return dataframes
     courtesy of @cpcloud. (GH3477_)
+  - ``pd.read_msgpack()`` and ``pd.to_msgpack()`` are now a supported method of serialization
+    of arbitrary pandas (and Python) objects in a lightweight, portable binary format (GH686_)
 
 **Improvements to existing features**
 
@@ -75,6 +77,7 @@ pandas 0.11.1
 
 .. _GH3164: https://github.com/pydata/pandas/issues/3164
 .. _GH2786: https://github.com/pydata/pandas/issues/2786
+.. _GH686: https://github.com/pydata/pandas/issues/686
 .. _GH2194: https://github.com/pydata/pandas/issues/2194
 .. _GH3230: https://github.com/pydata/pandas/issues/3230
 .. _GH3251: https://github.com/pydata/pandas/issues/3251
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index 5739fe0922d6d..c6f036d9541a6 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -1192,47 +1192,6 @@ While float dtypes are unchanged.
    casted
    casted.dtypes
 
-.. _basics.serialize:
-
-Pickling and serialization
---------------------------
-
-All pandas objects are equipped with ``save`` methods which use Python's
-``cPickle`` module to save data structures to disk using the pickle format.
-
-.. ipython:: python
-
-   df
-   df.save('foo.pickle')
-
-The ``load`` function in the ``pandas`` namespace can be used to load any
-pickled pandas object (or any other pickled object) from file:
-
-
-.. ipython:: python
-
-   load('foo.pickle')
-
-There is also a ``save`` function which takes any object as its first argument:
-
-.. ipython:: python
-
-   save(df, 'foo.pickle')
-   load('foo.pickle')
-
-.. ipython:: python
-   :suppress:
-
-   import os
-   os.remove('foo.pickle')
-
-.. warning::
-
-   Loading pickled data received from untrusted sources can be unsafe.
-
-   See: http://docs.python.org/2.7/library/pickle.html
-
-
 Working with package options
 ----------------------------
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 9001ae393d552..1d90cf98067ea 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -981,6 +981,79 @@ one can use the ExcelWriter class, as in the following example:
 
 .. _io.hdf5:
 
+.. _basics.serialize:
+
+Serialization
+-------------
+
+msgpack
+~~~~~~~
+
+Starting in 0.12.0, pandas supports the ``msgpack`` format for
+object serialization. This is a lightweight, portable binary format, similar
+to binary JSON, that is highly space efficient, and provides good performance
+both for writing (serialization) and reading (deserialization).
+
+.. ipython:: python
+
+   df = DataFrame(np.random.rand(5,2),columns=list('AB'))
+   df.to_msgpack('foo.msg')
+   pd.read_msgpack('foo.msg')
+   s = Series(np.random.rand(5),index=date_range('20130101',periods=5))
+
+You can pass a list of objects and you will receive them back on deserialization.
+
+.. ipython:: python
+
+   pd.to_msgpack('foo.msg', df, 'foo', np.array([1,2,3]), s)
+   pd.read_msgpack('foo.msg')
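+
+Note that because the unpacker is constructed with ``use_list=False``, a python
+``list`` nested inside a packed object comes back as a ``tuple``. A small sketch
+of this round-trip oddity (example added for illustration):
+
+.. ipython:: python
+
+   pd.to_msgpack('foo.msg', {'a': [1, 2, 3]})   # the nested list returns as a tuple
+   pd.read_msgpack('foo.msg')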
+
+.. ipython:: python
+   :suppress:
+   :okexcept:
+
+   os.remove('foo.msg')
+
+
+pickling
+~~~~~~~~
+
+All pandas objects are equipped with ``save`` methods which use Python's
+``cPickle`` module to save data structures to disk using the pickle format.
+
+.. ipython:: python
+
+   df
+   df.save('foo.pickle')
+
+The ``load`` function in the ``pandas`` namespace can be used to load any
+pickled pandas object (or any other pickled object) from file:
+
+.. ipython:: python
+
+   load('foo.pickle')
+
+There is also a ``save`` function which takes any object as its first argument:
+
+.. ipython:: python
+
+   save(df, 'foo.pickle')
+   load('foo.pickle')
+
+.. ipython:: python
+   :suppress:
+
+   import os
+   os.remove('foo.pickle')
+
+.. warning::
+
+   Loading pickled data received from untrusted sources can be unsafe.
+
+   See: http://docs.python.org/2.7/library/pickle.html
+
+
 HDF5 (PyTables)
 ---------------
diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt
index 5cfb24d71e19b..d87e9eea8a35e 100644
--- a/doc/source/v0.11.1.txt
+++ b/doc/source/v0.11.1.txt
@@ -1,10 +1,9 @@
-.. _whatsnew_0120:
+.. _whatsnew_0111:
 
-v0.12.0 (??)
+v0.11.1 (??)
 ------------------------
 
-This is a major release from 0.11.0 and includes many new features and
-enhancements along with a large number of bug fixes.
+This is a minor release from 0.11.0 and includes a small number of enhancements and bug fixes.
 
 API changes
 ~~~~~~~~~~~
@@ -12,7 +11,7 @@ API changes
 Enhancements
 ~~~~~~~~~~~~
 
-  - pd.read_html() can now parse HTML string, files or urls and return dataframes
+  - ``pd.read_html()`` can now parse HTML string, files or urls and return dataframes
     courtesy of @cpcloud. (GH3477_)
 
 See the `full release notes
diff --git a/doc/source/v0.12.0.txt b/doc/source/v0.12.0.txt
new file mode 100644
index 0000000000000..c680af0221a38
--- /dev/null
+++ b/doc/source/v0.12.0.txt
@@ -0,0 +1,35 @@
+.. _whatsnew_0120:
+
+v0.12.0 (??????)
+----------------
+
+This is a major release from 0.11.1 and includes many new features and
+enhancements along with a large number of bug fixes. There are also a
+number of important API changes that long-time pandas users should
+pay close attention to.
+
+Enhancements
+~~~~~~~~~~~~
+
+- ``pd.read_msgpack()`` and ``pd.to_msgpack()`` are now a supported method of serialization
+  of arbitrary pandas (and Python) objects in a lightweight, portable binary format
+
+  .. ipython:: python
+
+     df = DataFrame(np.random.rand(5,2),columns=list('AB'))
+     df.to_msgpack('foo.msg')
+     pd.read_msgpack('foo.msg')
+
+     s = Series(np.random.rand(5),index=date_range('20130101',periods=5))
+     pd.to_msgpack('foo.msg', df, s)
+     pd.read_msgpack('foo.msg')
+
+  .. ipython:: python
+     :suppress:
+     :okexcept:
+
+     os.remove('foo.msg')
+
+See the `full release notes
+`__ or issue tracker
+on GitHub for a complete list.
diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst
index 81bd39dd0e70f..a02e41176ced1 100644
--- a/doc/source/whatsnew.rst
+++ b/doc/source/whatsnew.rst
@@ -18,6 +18,8 @@ These are new features and improvements of note in each release.
 
 .. include:: v0.12.0.txt
 
+.. include:: v0.11.1.txt
+
 .. include:: v0.11.0.txt
 
 .. 
include:: v0.10.1.txt diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 81c85965da4b9..ecd0ee1d02f5f 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -68,7 +68,7 @@ except: _USE_MSGPACK = False -def to_msgpack(path, obj, **kwargs): +def to_msgpack(path, *args, **kwargs): """ msgpack (serialize) object to input file path @@ -76,18 +76,30 @@ def to_msgpack(path, obj, **kwargs): ---------- path : string File path - obj : any object + args : an object or objects to serialize + + append : boolean whether to append to an existing msgpack + (default is False) """ if not _USE_MSGPACK: raise Exception("please install msgpack to create msgpack stores!") - f = open(path, 'wb') + + append = kwargs.get('append') + if append: + f = open(path, 'a+b') + else: + f = open(path, 'wb') try: - f.write(msgpack.packb(obj)) + if len(args) == 1: + f.write(pack(args[0])) + else: + for a in args: + f.write(pack(a)) finally: f.close() -def read_msgpack(path): +def read_msgpack(path, iterator=False, **kwargs): """ Load msgpack pandas object from the specified file path @@ -96,15 +108,24 @@ def read_msgpack(path): ---------- path : string File path + iterator : boolean, if True, return an iterator to the unpacker + (default is False) Returns ------- obj : type of object stored in file + """ if not _USE_MSGPACK: raise Exception("please install msgpack to read msgpack stores!") + if iterator: + return Iterator(path) + with open(path,'rb') as fh: - return msgpack.unpackb(fh.read()) + l = list(unpack(fh)) + if len(l) == 1: + return l[0] + return l dtype_dict = { 'datetime64[ns]' : np.dtype('M8[ns]'), 'timedelta64[ns]' : np.dtype('m8[ns]') } @@ -296,48 +317,29 @@ def create_block(b): import pdb; pdb.set_trace() return obj -def pack(o, stream, default=encode, - encoding='utf-8', unicode_errors='strict'): - """ - Pack an object and write it to a stream. - """ - - _packer.pack(o, stream, default=default, - encoding=encoding, - unicode_errors=unicode_errors) -def packb(o, default=encode, - encoding='utf-8', unicode_errors='strict', use_single_float=False): +def pack(o, default=encode, + encoding='utf-8', unicode_errors='strict', use_single_float=False): """ Pack an object and return the packed bytes. """ - return _packer.packb(o, default=default, encoding=encoding, - unicode_errors=unicode_errors, - use_single_float=use_single_float) - -def unpack(stream, object_hook=decode, list_hook=None, use_list=None, - encoding='utf-8', unicode_errors='strict', object_pairs_hook=None): - """ - Unpack a packed object from a stream. - """ + return Packer(default=default, encoding=encoding, + unicode_errors=unicode_errors, + use_single_float=use_single_float).pack(o) - return _unpacker.unpack(stream, object_hook=object_hook, - list_hook=list_hook, use_list=use_list, - encoding=encoding, - unicode_errors=unicode_errors, - object_pairs_hook=object_pairs_hook) -def unpackb(packed, object_hook=decode, - list_hook=None, use_list=None, encoding='utf-8', - unicode_errors='strict', object_pairs_hook=None): +def unpack(packed, object_hook=decode, + list_hook=None, use_list=False, encoding='utf-8', + unicode_errors='strict', object_pairs_hook=None): """ - Unpack a packed object. 
+    Unpack a packed object, return an iterator
+    Note: packed lists will be returned as tuples
     """
 
-    return _unpacker.unpackb(packed, object_hook=object_hook,
-                             list_hook=list_hook,
-                             use_list=use_list, encoding=encoding,
-                             unicode_errors=unicode_errors,
-                             object_pairs_hook=object_pairs_hook)
+    return Unpacker(packed, object_hook=object_hook,
+                    list_hook=list_hook,
+                    use_list=use_list, encoding=encoding,
+                    unicode_errors=unicode_errors,
+                    object_pairs_hook=object_pairs_hook)
 
 if _USE_MSGPACK:
 
@@ -352,7 +354,7 @@ def __init__(self, default=encode,
                                          use_single_float=use_single_float)
 
     class Unpacker(_unpacker.Unpacker):
-        def __init__(self, file_like=None, read_size=0, use_list=None,
+        def __init__(self, file_like=None, read_size=0, use_list=False,
                      object_hook=decode,
                      object_pairs_hook=None, list_hook=None, encoding='utf-8',
                      unicode_errors='strict', max_buffer_size=0):
@@ -365,14 +367,21 @@ def __init__(self, file_like=None, read_size=0, use_list=None,
                                            encoding=encoding,
                                            unicode_errors=unicode_errors,
                                            max_buffer_size=max_buffer_size)
-
-    setattr(msgpack, 'Packer', Packer)
-    setattr(msgpack, 'Unpacker', Unpacker)
-    setattr(msgpack, 'load', unpack)
-    setattr(msgpack, 'loads', unpackb)
-    setattr(msgpack, 'dump', pack)
-    setattr(msgpack, 'dumps', packb)
-    setattr(msgpack, 'pack', pack)
-    setattr(msgpack, 'packb', packb)
-    setattr(msgpack, 'unpack', unpack)
-    setattr(msgpack, 'unpackb', unpackb)
+
+class Iterator(object):
+    """ manage the unpacking iteration,
+        close the file on completion """
+
+    def __init__(self, path, **kwargs):
+        self.path = path
+        self.kwargs = kwargs
+
+    def __iter__(self):
+
+        # open before the try block so a failed open does not leave
+        # fh unbound when the finally clause runs
+        fh = open(self.path, 'rb')
+        try:
+            for o in unpack(fh):
+                yield o
+        finally:
+            fh.close()
diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py
index 08986cb082131..294b99d20a85b 100644
--- a/pandas/io/tests/test_packers.py
+++ b/pandas/io/tests/test_packers.py
@@ -26,6 +26,19 @@
 
 _multiprocess_can_split_ = False
 
+def check_arbitrary(a, b):
+
+    if isinstance(a,(list,tuple)) and isinstance(b,(list,tuple)):
+        assert(len(a) == len(b))
+        for a_, b_ in zip(a,b):
+            check_arbitrary(a_,b_)
+    elif isinstance(a,DataFrame):
+        assert_frame_equal(a,b)
+    elif isinstance(a,Series):
+        assert_series_equal(a,b)
+    else:
+        assert(a == b)
+
 class Test(unittest.TestCase):
 
     def setUp(self):
@@ -241,13 +254,24 @@ def test_multi(self):
         for k in self.d.keys():
             assert_frame_equal(self.d[k],i_rec[k])
 
+        l = tuple([ self.d['float'], self.d['float'].A, self.d['float'].B, None ])
+        l_rec = self.encode_decode(l)
+        check_arbitrary(l,l_rec)
+
+        # this is an oddity in that packed lists will be returned as tuples
         l = [ self.d['float'], self.d['float'].A, self.d['float'].B, None ]
         l_rec = self.encode_decode(l)
-        self.assert_(len(l) == len(l_rec))
-        assert_frame_equal(l[0],l_rec[0])
-        assert_series_equal(l[1],l_rec[1])
-        assert_series_equal(l[2],l_rec[2])
-        self.assert_(l[3] == l_rec[3])
+        self.assert_(isinstance(l_rec,tuple))
+        check_arbitrary(l,l_rec)
+
+    def test_iterator(self):
+
+        l = [ self.d['float'], self.d['float'].A, self.d['float'].B, None ]
+
+        with ensure_clean(self.path) as path:
+            to_msgpack(path,*l)
+            for i, packed in enumerate(read_msgpack(path, iterator=True)):
+                check_arbitrary(packed,l[i])
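
A minimal usage sketch of the API as of this patch; ``append`` and ``iterator``
are the options introduced here, while ``df`` and ``df2`` are placeholder
objects, not names from the patch::

    from pandas.io.packers import to_msgpack, read_msgpack

    to_msgpack('store.msg', df)                  # write a single object
    to_msgpack('store.msg', df2, append=True)    # append a second object
    for obj in read_msgpack('store.msg', iterator=True):
        print obj                                # objects stream back one at a time
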
From c9a9e3e1e4fefad203ecd25a1dfa31d038f18dd5 Mon Sep 17 00:00:00 2001
From: jreback
Date: Sat, 4 May 2013 11:13:08 -0400
Subject: [PATCH 3/5] ENH: added support for
 Panel,SparseSeries,SparseDataFrame,SparsePanel,IntIndex,BlockIndex
---
 doc/source/io.rst               |  15 ++++
 doc/source/v0.12.0.txt          |   7 ++
 pandas/io/packers.py            |  59 +++++++++++--
 pandas/io/tests/test_packers.py | 145 ++++++++++++++++++--------------
 4 files changed, 156 insertions(+), 70 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 1d90cf98067ea..9df6f4d3ecca6 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1008,6 +1008,21 @@ You can pass a list of objects and you will receive them back on deserialization
    pd.to_msgpack('foo.msg', df, 'foo', np.array([1,2,3]), s)
    pd.read_msgpack('foo.msg')
 
+You can pass ``iterator=True`` to iterate over the unpacked results.
+
+.. ipython:: python
+
+   for o in pd.read_msgpack('foo.msg',iterator=True):
+       print o
+
+
+You can pass ``append=True`` to the writer to append to an existing pack.
+
+.. ipython:: python
+
+   df.to_msgpack('foo.msg',append=True)
+   pd.read_msgpack('foo.msg')
+
 .. ipython:: python
    :suppress:
    :okexcept:
diff --git a/doc/source/v0.12.0.txt b/doc/source/v0.12.0.txt
index c680af0221a38..ccb9347135c48 100644
--- a/doc/source/v0.12.0.txt
+++ b/doc/source/v0.12.0.txt
@@ -24,6 +24,13 @@ Enhancements
      pd.to_msgpack('foo.msg', df, s)
      pd.read_msgpack('foo.msg')
 
+  You can pass ``iterator=True`` to iterate over the unpacked results.
+
+  .. ipython:: python
+
+     for o in pd.read_msgpack('foo.msg',iterator=True):
+         print o
+
   .. ipython:: python
      :suppress:
      :okexcept:
diff --git a/pandas/io/packers.py b/pandas/io/packers.py
index ecd0ee1d02f5f..c31b3c4b98719 100644
--- a/pandas/io/packers.py
+++ b/pandas/io/packers.py
@@ -57,6 +57,7 @@
 from pandas.tseries.api import PeriodIndex, DatetimeIndex
 from pandas.core.index import Int64Index, _ensure_index
 import pandas.core.common as com
+from pandas.core.generic import NDFrame
 from pandas.core.common import needs_i8_conversion
 from pandas.core.internals import BlockManager, make_block
 import pandas.core.internals as internals
@@ -162,6 +163,7 @@ def encode(obj):
     Data encoder
     """
 
+    tobj = type(obj)
     if isinstance(obj, Index):
         if isinstance(obj, PeriodIndex):
             return {'typ' : 'period_index',
@@ -191,7 +193,15 @@ def encode(obj):
                     'data': obj.tolist() }
     elif isinstance(obj, Series):
         if isinstance(obj, SparseSeries):
-            import pdb; pdb.set_trace()
+            d = {'typ' : 'sparse_series',
+                 'klass' : obj.__class__.__name__,
+                 'dtype': obj.dtype.name,
+                 'index' : obj.index,
+                 'sp_index' : obj.sp_index,
+                 'sp_values' : convert(obj.sp_values)}
+            for f in ['name','fill_value','kind']:
+                d[f] = getattr(obj,f,None)
+            return d
         else:
             return {'typ' : 'series',
                     'klass' : obj.__class__.__name__,
@@ -199,9 +209,23 @@ def encode(obj):
                     'index' : obj.index,
                     'dtype': obj.dtype.name,
                     'data': convert(obj.values) }
-    elif isinstance(obj, DataFrame):
+    elif issubclass(tobj, NDFrame):
         if isinstance(obj, SparseDataFrame):
-            import pdb; pdb.set_trace()
+            d = {'typ' : 'sparse_dataframe',
+                 'klass' : obj.__class__.__name__,
+                 'columns' : obj.columns }
+            for f in ['default_fill_value','default_kind']:
+                d[f] = getattr(obj,f,None)
+            d['data'] = dict([ (name,ss) for name,ss in obj.iteritems() ])
+            return d
+        elif isinstance(obj, SparsePanel):
+            d = {'typ' : 'sparse_panel',
+                 'klass' : obj.__class__.__name__,
+                 'items' : obj.items }
+            for f in ['default_fill_value','default_kind']:
+                d[f] = getattr(obj,f,None)
+            d['data'] = dict([ (name,df) for name,df in obj.iteritems() ])
+            return d
         else:
 
             data = obj._data
@@ -209,7 +233,7 @@ def encode(obj):
                 data = data.consolidate()
 
             # the block manager
-            return {'typ' : 'dataframe',
+            return {'typ' : 'block_manager',
                     'klass' : obj.__class__.__name__,
                     'axes' : data.axes,
                     'blocks' : [ { 'items' : 
b.items, @@ -237,6 +261,17 @@ def encode(obj): return {'typ' : 'period', 'ordinal' : obj.ordinal, 'freq' : obj.freq } + elif isinstance(obj, BlockIndex): + return { 'typ' : 'block_index', + 'klass' : obj.__class__.__name__, + 'blocs' : obj.blocs, + 'blengths' : obj.blengths, + 'length' : obj.length } + elif isinstance(obj, IntIndex): + return { 'typ' : 'int_index', + 'klass' : obj.__class__.__name__, + 'indices' : obj.indices, + 'length' : obj.length } elif isinstance(obj, np.ndarray): return {'typ' : 'ndarray', 'shape': obj.shape, @@ -288,7 +323,7 @@ def decode(obj): dtype = dtype_for(obj['dtype']) index = obj['index'] return globals()[obj['klass']](obj['data'],index=index,dtype=dtype,name=obj['name']) - elif typ == 'dataframe': + elif typ == 'block_manager': axes = obj['axes'] def create_block(b): @@ -300,6 +335,20 @@ def create_block(b): elif typ == 'datetime': import pdb; pdb.set_trace() return datetime.fromtimestamp(obj['data']) + elif typ == 'sparse_series': + dtype = dtype_for(obj['dtype']) + return globals()[obj['klass']](np.array(obj['sp_values'],dtype=dtype),sparse_index=obj['sp_index'], + index=obj['index'],fill_value=obj['fill_value'],kind=obj['kind'],name=obj['name']) + elif typ == 'sparse_dataframe': + return globals()[obj['klass']](obj['data'], + columns=obj['columns'],default_fill_value=obj['default_fill_value'],default_kind=obj['default_kind']) + elif typ == 'sparse_panel': + return globals()[obj['klass']](obj['data'], + items=obj['items'],default_fill_value=obj['default_fill_value'],default_kind=obj['default_kind']) + elif typ == 'block_index': + return globals()[obj['klass']](obj['length'],obj['blocs'],obj['blengths']) + elif typ == 'int_index': + return globals()[obj['klass']](obj['length'],obj['indices']) elif typ == 'ndarray': return np.array(obj['data'], dtype=np.typeDict[obj['dtype']], diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 294b99d20a85b..741df495fad7d 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -14,6 +14,10 @@ from pandas.util.testing import ensure_clean from pandas.tests.test_series import assert_series_equal from pandas.tests.test_frame import assert_frame_equal +from pandas.tests.test_panel import assert_panel_equal + +import pandas +from pandas.sparse.tests.test_sparse import assert_sp_series_equal, assert_sp_frame_equal from pandas import concat, Timestamp, tslib from numpy.testing.decorators import slow @@ -32,6 +36,8 @@ def check_arbitrary(a, b): assert(len(a) == len(b)) for a_, b_ in zip(a,b): check_arbitrary(a_,b_) + elif isinstance(a,Panel): + assert_panel_equal(a,b) elif isinstance(a,DataFrame): assert_frame_equal(a,b) elif isinstance(a,Series): @@ -225,10 +231,10 @@ def test_basic(self): i_rec = self.encode_decode(i) assert_series_equal(i,i_rec) -class TestFrame(Test): +class TestNDFrame(Test): def setUp(self): - super(TestFrame, self).setUp() + super(TestNDFrame, self).setUp() data = { 'A': [0., 1., 2., 3., np.nan], @@ -238,98 +244,107 @@ def setUp(self): 'E' : [0., 1, Timestamp('20100101'),'foo',2.], } - self.d = { 'float' : DataFrame(dict(A = data['A'], B = Series(data['A']) + 1)), - 'int' : DataFrame(dict(A = data['B'], B = Series(data['B']) + 1)), - 'mixed' : DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])) } + self.frame = { 'float' : DataFrame(dict(A = data['A'], B = Series(data['A']) + 1)), + 'int' : DataFrame(dict(A = data['B'], B = Series(data['B']) + 1)), + 'mixed' : DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])) } + + self.panel = { 
'float' : Panel(dict(ItemA = self.frame['float'], ItemB = self.frame['float']+1)) } - def test_basic(self): + def test_basic_frame(self): - for s, i in self.d.items(): + for s, i in self.frame.items(): i_rec = self.encode_decode(i) assert_frame_equal(i,i_rec) + def test_basic_panel(self): + + for s, i in self.panel.items(): + i_rec = self.encode_decode(i) + assert_panel_equal(i,i_rec) + def test_multi(self): - i_rec = self.encode_decode(self.d) - for k in self.d.keys(): - assert_frame_equal(self.d[k],i_rec[k]) + i_rec = self.encode_decode(self.frame) + for k in self.frame.keys(): + assert_frame_equal(self.frame[k],i_rec[k]) - l = tuple([ self.d['float'], self.d['float'].A, self.d['float'].B, None ]) + l = tuple([ self.frame['float'], self.frame['float'].A, self.frame['float'].B, None ]) l_rec = self.encode_decode(l) check_arbitrary(l,l_rec) # this is an oddity in that packed lists will be returned as tuples - l = [ self.d['float'], self.d['float'].A, self.d['float'].B, None ] + l = [ self.frame['float'], self.frame['float'].A, self.frame['float'].B, None ] l_rec = self.encode_decode(l) self.assert_(isinstance(l_rec,tuple)) check_arbitrary(l,l_rec) def test_iterator(self): - l = [ self.d['float'], self.d['float'].A, self.d['float'].B, None ] + l = [ self.frame['float'], self.frame['float'].A, self.frame['float'].B, None ] with ensure_clean(self.path) as path: to_msgpack(path,*l) for i, packed in enumerate(read_msgpack(path, iterator=True)): check_arbitrary(packed,l[i]) -def _create_sp_series(): +class TestSparse(Test): - # nan-based - arr = np.arange(15, dtype=float) - index = np.arange(15) - arr[7:12] = nan - arr[-1:] = nan + def _check_roundtrip(self, obj, comparator, **kwargs): - date_index = bdate_range('1/1/2011', periods=len(index)) - bseries = SparseSeries(arr, index=index, kind='block') - bseries.name = 'bseries' - return bseries + i_rec = self.encode_decode(obj) + comparator(obj,i_rec,**kwargs) -def _create_sp_frame(): + def test_sparse_series(self): - data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], - 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'C': np.arange(10), - 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} - - dates = bdate_range('1/1/2011', periods=10) - return SparseDataFrame(data, index=dates) + s = tm.makeStringSeries() + s[3:5] = np.nan + ss = s.to_sparse() + self._check_roundtrip(ss, tm.assert_series_equal, + check_series_type=True) + + ss2 = s.to_sparse(kind='integer') + self._check_roundtrip(ss2, tm.assert_series_equal, + check_series_type=True) + + ss3 = s.to_sparse(fill_value=0) + self._check_roundtrip(ss3, tm.assert_series_equal, + check_series_type=True) + + def test_sparse_frame(self): + + s = tm.makeDataFrame() + s.ix[3:5, 1:3] = np.nan + s.ix[8:10, -2] = np.nan + ss = s.to_sparse() + + self._check_roundtrip(ss, tm.assert_frame_equal, + check_frame_type=True) + + ss2 = s.to_sparse(kind='integer') + self._check_roundtrip(ss2, tm.assert_frame_equal, + check_frame_type=True) + + ss3 = s.to_sparse(fill_value=0) + self._check_roundtrip(ss3, tm.assert_frame_equal, + check_frame_type=True) + + def test_sparse_panel(self): + + items = ['x', 'y', 'z'] + p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items)) + sp = p.to_sparse() + + self._check_roundtrip(sp, tm.assert_panel_equal, + check_panel_type=True) + + sp2 = p.to_sparse(kind='integer') + self._check_roundtrip(sp2, tm.assert_panel_equal, + check_panel_type=True) + + sp3 = p.to_sparse(fill_value=0) + self._check_roundtrip(sp3, tm.assert_panel_equal, + check_panel_type=True) -def create_data(): - """ 
create the pickle data """ - - data = { - 'A': [0., 1., 2., 3., np.nan], - 'B': [0, 1, 0, 1, 0], - 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], - 'D': date_range('1/1/2009', periods=5), - 'E' : [0., 1, Timestamp('20100101'),'foo',2.], - } - - index = dict(int = Index(np.arange(10)), - date = date_range('20130101',periods=10)) - mi = dict(reg = MultiIndex.from_tuples(zip([['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]), - names=['first', 'second'])) - series = dict(float = Series(data['A']), - int = Series(data['B']), - mixed = Series(data['E'])) - frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)), - int = DataFrame(dict(A = series['int'] , B = series['int'] + 1)), - mixed = DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']]))) - panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1))) - - - - return dict( series = series, - frame = frame, - panel = panel, - index = index, - mi = mi, - sp_series = dict(float = _create_sp_series()), - sp_frame = dict(float = _create_sp_frame()) - ) if __name__ == '__main__': import nose From a55e7e4e2f977b2bbfb8099c5d5530f157022a70 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 4 May 2013 18:43:12 -0400 Subject: [PATCH 4/5] ENH: handle np.datetime64,np.timedelta64,date,timedelta types --- pandas/io/packers.py | 65 ++++++++++++++++++++++----------- pandas/io/tests/test_packers.py | 16 ++++++++ 2 files changed, 60 insertions(+), 21 deletions(-) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index c31b3c4b98719..f90ff34cfde92 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -40,12 +40,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
""" -from datetime import datetime +from datetime import datetime, date, timedelta import time import re import copy import itertools import warnings +from dateutil.parser import parse import numpy as np from pandas import ( @@ -128,8 +129,12 @@ def read_msgpack(path, iterator=False, **kwargs): return l[0] return l -dtype_dict = { 'datetime64[ns]' : np.dtype('M8[ns]'), - 'timedelta64[ns]' : np.dtype('m8[ns]') } +dtype_dict = { 21 : np.dtype('M8[ns]'), + u'datetime64[ns]' : np.dtype('M8[ns]'), + u'datetime64[us]' : np.dtype('M8[us]'), + 22 : np.dtype('m8[ns]'), + u'timedelta64[ns]' : np.dtype('m8[ns]'), + u'timedelta64[us]' : np.dtype('m8[us]') } def dtype_for(t): if t in dtype_dict: @@ -169,13 +174,13 @@ def encode(obj): return {'typ' : 'period_index', 'klass' : obj.__class__.__name__, 'name' : getattr(obj,'name',None), - 'dtype': obj.dtype.name, + 'dtype': obj.dtype.num, 'data': obj.tolist() } elif isinstance(obj, DatetimeIndex): return {'typ' : 'datetime_index', 'klass' : obj.__class__.__name__, 'name' : getattr(obj,'name',None), - 'dtype': obj.dtype.name, + 'dtype': obj.dtype.num, 'data': obj.values.view('i8').tolist(), 'freq' : obj.freqstr, 'tz' : obj.tz} @@ -183,19 +188,19 @@ def encode(obj): return {'typ' : 'multi_index', 'klass' : obj.__class__.__name__, 'names' : getattr(obj,'names',None), - 'dtype': obj.dtype.name, + 'dtype': obj.dtype.num, 'data': obj.tolist() } else: return {'typ' : 'index', 'klass' : obj.__class__.__name__, 'name' : getattr(obj,'name',None), - 'dtype': obj.dtype.name, + 'dtype': obj.dtype.num, 'data': obj.tolist() } elif isinstance(obj, Series): if isinstance(obj, SparseSeries): d = {'typ' : 'sparse_series', 'klass' : obj.__class__.__name__, - 'dtype': obj.dtype.name, + 'dtype': obj.dtype.num, 'index' : obj.index, 'sp_index' : obj.sp_index, 'sp_values' : convert(obj.sp_values)} @@ -207,7 +212,7 @@ def encode(obj): 'klass' : obj.__class__.__name__, 'name' : getattr(obj,'name',None), 'index' : obj.index, - 'dtype': obj.dtype.name, + 'dtype': obj.dtype.num, 'data': convert(obj.values) } elif issubclass(tobj, NDFrame): if isinstance(obj, SparseDataFrame): @@ -239,11 +244,11 @@ def encode(obj): 'blocks' : [ { 'items' : b.items, 'values' : convert(b.values), 'shape' : b.values.shape, - 'dtype' : b.dtype.name, + 'dtype' : b.dtype.num, 'klass' : b.__class__.__name__ } for b in data.blocks ] } - elif isinstance(obj, datetime): + elif isinstance(obj, (datetime,date,timedelta)): if isinstance(obj, Timestamp): tz = obj.tzinfo if tz is not None: @@ -255,8 +260,16 @@ def encode(obj): 'value': obj.value, 'offset' : offset, 'tz' : tz} - return { 'typ' : 'datetime', - 'data' : obj.isoformat() } + elif isinstance(obj, timedelta): + return { 'typ' : 'timedelta', + 'data' : (obj.days,obj.seconds,obj.microseconds) } + elif isinstance(obj, datetime): + return { 'typ' : 'datetime', + 'data' : obj.isoformat() } + elif isinstance(obj, date): + return { 'typ' : 'date', + 'data' : obj.isoformat() } + raise Exception("cannot encode this datetimelike object: %s" % obj) elif isinstance(obj, Period): return {'typ' : 'period', 'ordinal' : obj.ordinal, @@ -276,8 +289,11 @@ def encode(obj): return {'typ' : 'ndarray', 'shape': obj.shape, 'ndim': obj.ndim, - 'dtype': obj.dtype.name, + 'dtype': obj.dtype.num, 'data': convert(obj)} + elif isinstance(obj, np.timedelta64): + return { 'typ' : 'np_timedelta64', + 'data' : obj.view('i8') } elif isinstance(obj, np.number): if np.iscomplexobj(obj): return {'typ' : 'np_scalar', @@ -293,9 +309,8 @@ def encode(obj): return {'typ' : 'np_complex', 'real': 
@@ -333,8 +348,11 @@ def create_block(b):
         blocks = [ create_block(b) for b in obj['blocks'] ]
         return globals()[obj['klass']](BlockManager(blocks, axes))
     elif typ == 'datetime':
-        import pdb; pdb.set_trace()
-        return datetime.fromtimestamp(obj['data'])
+        return parse(obj['data'])
+    elif typ == 'date':
+        return parse(obj['data']).date()
+    elif typ == 'timedelta':
+        return timedelta(*obj['data'])
     elif typ == 'sparse_series':
         dtype = dtype_for(obj['dtype'])
         return globals()[obj['klass']](np.array(obj['sp_values'], dtype=dtype), sparse_index=obj['sp_index'],
@@ -353,17 +371,22 @@ def create_block(b):
         return np.array(obj['data'],
                         dtype=np.typeDict[obj['dtype']],
                         ndmin=obj['ndim']).reshape(obj['shape'])
+    elif typ == 'np_timedelta64':
+        return np.timedelta64(obj['data'])
     elif typ == 'np_scalar':
         if obj.get('sub_typ') == 'np_complex':
             return c2f(obj['real'], obj['imag'], obj['dtype'])
         else:
-            return np.typeDict[obj['dtype']](obj['data'])
+            dtype = dtype_for(obj['dtype'])
+            try:
+                return dtype(obj['data'])
+            except:
+                return dtype.type(obj['data'])
     elif typ == 'np_complex':
         return complex(obj['real'] + '+' + obj['imag'] + 'j')
     elif isinstance(obj, (dict, list, set)):
         return obj
     else:
-        import pdb; pdb.set_trace()
         return obj

 def pack(o, default=encode,
diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py
index 741df495fad7d..f9e25f3956d38 100644
--- a/pandas/io/tests/test_packers.py
+++ b/pandas/io/tests/test_packers.py
@@ -155,6 +155,22 @@ def test_timestamp(self):
         i_rec = self.encode_decode(i)
         self.assert_(i == i_rec)

+    def test_datetimes(self):
+
+        for i in [ datetime.datetime(2013, 1, 1),
+                   datetime.datetime(2013, 1, 1, 5, 1),
+                   datetime.date(2013, 1, 1),
+                   np.datetime64('2013-01-05 2:15') ]:
+            i_rec = self.encode_decode(i)
+            self.assert_(i == i_rec)
+
+    def test_timedeltas(self):
+
+        for i in [ datetime.timedelta(days=1),
+                   datetime.timedelta(days=1, seconds=10),
+                   np.timedelta64(1000000) ]:
+            i_rec = self.encode_decode(i)
+            self.assert_(i == i_rec)
+
+
 class TestIndex(Test):

     def setUp(self):
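Concretely, the new scalar paths round-trip like this (a small sketch calling encode/decode directly rather than through a file; the reprs shown are from Python 2-era datetime):

    import datetime
    import numpy as np
    from pandas.io.packers import encode, decode

    decode(encode(datetime.datetime(2013, 1, 1, 5, 1)))
    # datetime.datetime(2013, 1, 1, 5, 1)   -- rebuilt via dateutil.parser.parse

    decode(encode(datetime.timedelta(days=1, seconds=10)))
    # datetime.timedelta(1, 10)             -- rebuilt from (days, seconds, microseconds)

    decode(encode(np.timedelta64(1000000)))
    # numpy.timedelta64(1000000)            -- stored as its i8 view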
From 5a02cdf0f36fd37853583671ec348882a01b30e2 Mon Sep 17 00:00:00 2001
From: jreback
Date: Sun, 5 May 2013 20:25:45 -0400
Subject: [PATCH 5/5] TST: added compression (zlib/blosc) via big hack

---
 pandas/io/packers.py | 93 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 78 insertions(+), 15 deletions(-)

diff --git a/pandas/io/packers.py b/pandas/io/packers.py
index f90ff34cfde92..bc32c3c4d4011 100644
--- a/pandas/io/packers.py
+++ b/pandas/io/packers.py
@@ -70,6 +70,18 @@
 except:
     _USE_MSGPACK = False

+import zlib
+
+try:
+    import blosc
+    _BLOSC = True
+except:
+    _BLOSC = False
+
+## until we can pass this into our conversion functions,
+## this is pretty hacky
+compressor = None
+
 def to_msgpack(path, *args, **kwargs):
     """
     msgpack (serialize) object to input file path
@@ -82,10 +94,13 @@ def to_msgpack(path, *args, **kwargs):
     append : boolean whether to append to an existing msgpack
              (default is False)
+    compress : compressor to use ('zlib' or 'blosc'); defaults to None (no compression)
     """
     if not _USE_MSGPACK:
         raise Exception("please install msgpack to create msgpack stores!")

+    global compressor
+    compressor = kwargs.get('compress')
     append = kwargs.get('append')
     if append:
         f = open(path, 'a+b')
@@ -154,14 +169,60 @@ def c2f(r, i, ctype_name):
     ftype = c2f_dict[ctype_name]
     return np.typeDict[ctype_name](ftype(r) + 1j * ftype(i))

+
 def convert(values):
     """ convert the numpy values to a list """

     dtype = values.dtype
     if needs_i8_conversion(dtype):
         values = values.view('i8')
-    return values.ravel().tolist()
+    v = values.ravel()
+
+    if compressor == 'zlib':
+
+        # return string arrays like they are
+        if dtype == np.object_:
+            return v.tolist()
+
+        # convert to a bytes array
+        v = v.tostring()
+        return zlib.compress(v)
+
+    elif compressor == 'blosc' and _BLOSC:
+
+        # return string arrays like they are
+        if dtype == np.object_:
+            return v.tolist()
+
+        # convert to a bytes array
+        v = v.tostring()
+        return blosc.compress(v, typesize=dtype.itemsize)
+
+    # uncompressed, as a plain list
+    return v.tolist()
+
+def unconvert(values, dtype, compress):
+
+    if dtype == np.object_:
+        return np.array(values, dtype=object)
+
+    if compress == 'zlib':
+
+        values = zlib.decompress(values)
+        return np.frombuffer(values, dtype=dtype)
+
+    elif compress == 'blosc':
+
+        if not _BLOSC:
+            raise Exception("cannot uncompress w/o blosc")
+
+        # decompress
+        values = blosc.decompress(values)
+        return np.frombuffer(values, dtype=dtype)
+
+    # uncompressed, from a plain list
+    return np.array(values, dtype=dtype)

 def encode(obj):
     """
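The contract between convert() and unconvert() is simply bytes-in/bytes-out around the raveled values; a self-contained sketch of the zlib path (array contents illustrative):

    import zlib
    import numpy as np

    arr = np.arange(10, dtype='float64')

    # what convert() produces when compressor == 'zlib'
    packed = zlib.compress(arr.tostring())

    # what unconvert() does on the way back
    out = np.frombuffer(zlib.decompress(packed), dtype=np.dtype('float64'))
    assert (out == arr).all()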
@@ -203,7 +264,8 @@ def encode(obj):
                  'dtype': obj.dtype.num,
                  'index' : obj.index,
                  'sp_index' : obj.sp_index,
-                 'sp_values' : convert(obj.sp_values)}
+                 'sp_values' : convert(obj.sp_values),
+                 'compress' : compressor}
             for f in ['name', 'fill_value', 'kind']:
                 d[f] = getattr(obj, f, None)
             return d
@@ -213,7 +275,8 @@ def encode(obj):
                 'name' : getattr(obj, 'name', None),
                 'index' : obj.index,
                 'dtype': obj.dtype.num,
-                'data': convert(obj.values) }
+                'data': convert(obj.values),
+                'compress' : compressor}
     elif issubclass(tobj, NDFrame):
         if isinstance(obj, SparseDataFrame):
             d = {'typ' : 'sparse_dataframe',
@@ -245,7 +308,8 @@ def encode(obj):
                                'values' : convert(b.values),
                                'shape' : b.values.shape,
                                'dtype' : b.dtype.num,
-                               'klass' : b.__class__.__name__
+                               'klass' : b.__class__.__name__,
+                               'compress' : compressor
                              } for b in data.blocks ] }
     elif isinstance(obj, (datetime, date, timedelta)):
@@ -290,7 +354,8 @@ def encode(obj):
                 'shape': obj.shape,
                 'ndim': obj.ndim,
                 'dtype': obj.dtype.num,
-                'data': convert(obj)}
+                'data': convert(obj),
+                'compress' : compressor }
     elif isinstance(obj, np.timedelta64):
         return { 'typ' : 'np_timedelta64',
                  'data' : obj.view('i8') }
@@ -337,13 +402,13 @@ def decode(obj):
     elif typ == 'series':
         dtype = dtype_for(obj['dtype'])
         index = obj['index']
-        return globals()[obj['klass']](obj['data'], index=index, dtype=dtype, name=obj['name'])
+        return globals()[obj['klass']](unconvert(obj['data'], dtype, obj['compress']), index=index, name=obj['name'])
     elif typ == 'block_manager':
         axes = obj['axes']

         def create_block(b):
             dtype = dtype_for(b['dtype'])
-            return make_block(np.array(b['values'], dtype=dtype).reshape(b['shape']), b['items'], axes[0], klass=getattr(internals, b['klass']))
+            return make_block(unconvert(b['values'], dtype, b['compress']).reshape(b['shape']), b['items'], axes[0], klass=getattr(internals, b['klass']))

         blocks = [ create_block(b) for b in obj['blocks'] ]
         return globals()[obj['klass']](BlockManager(blocks, axes))
@@ -355,7 +420,7 @@ def create_block(b):
         return timedelta(*obj['data'])
     elif typ == 'sparse_series':
         dtype = dtype_for(obj['dtype'])
-        return globals()[obj['klass']](np.array(obj['sp_values'], dtype=dtype), sparse_index=obj['sp_index'],
+        return globals()[obj['klass']](unconvert(obj['sp_values'], dtype, obj['compress']), sparse_index=obj['sp_index'],
                                        index=obj['index'], fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name'])
     elif typ == 'sparse_dataframe':
         return globals()[obj['klass']](obj['data'],
@@ -368,9 +433,7 @@ def create_block(b):
     elif typ == 'int_index':
         return globals()[obj['klass']](obj['length'], obj['indices'])
     elif typ == 'ndarray':
-        return np.array(obj['data'],
-                        dtype=np.typeDict[obj['dtype']],
-                        ndmin=obj['ndim']).reshape(obj['shape'])
+        return unconvert(obj['data'], np.typeDict[obj['dtype']], obj['compress']).reshape(obj['shape'])
     elif typ == 'np_timedelta64':
         return np.timedelta64(obj['data'])
     elif typ == 'np_scalar':
@@ -390,7 +453,7 @@ def create_block(b):
         return obj

 def pack(o, default=encode,
-         encoding='utf-8', unicode_errors='strict', use_single_float=False):
+         encoding=None, unicode_errors='strict', use_single_float=False):
     """
     Pack an object and return the packed bytes.
     """
@@ -400,7 +463,7 @@ def pack(o, default=encode,
                   use_single_float=use_single_float).pack(o)

 def unpack(packed, object_hook=decode,
-           list_hook=None, use_list=False, encoding='utf-8',
+           list_hook=None, use_list=False, encoding=None,
            unicode_errors='strict', object_pairs_hook=None):
     """
     Unpack a packed object, return an iterator
@@ -417,7 +480,7 @@ def unpack(packed, object_hook=decode,

 class Packer(_packer.Packer):
     def __init__(self, default=encode,
-                 encoding='utf-8',
+                 encoding=None,
                  unicode_errors='strict',
                  use_single_float=False):
         super(Packer, self).__init__(default=default,
@@ -428,7 +491,7 @@ def __init__(self, default=encode,

 class Unpacker(_unpacker.Unpacker):
     def __init__(self, file_like=None, read_size=0, use_list=False,
                  object_hook=decode,
-                 object_pairs_hook=None, list_hook=None, encoding='utf-8',
+                 object_pairs_hook=None, list_hook=None, encoding=None,
                  unicode_errors='strict', max_buffer_size=0):
         super(Unpacker, self).__init__(file_like=file_like,
                                        read_size=read_size,