diff --git a/LICENSES/MSGPACK_NUMPY_LICENSE b/LICENSES/MSGPACK_NUMPY_LICENSE
new file mode 100644
index 0000000000000..57ea631f0f66d
--- /dev/null
+++ b/LICENSES/MSGPACK_NUMPY_LICENSE
@@ -0,0 +1,33 @@
+.. -*- rst -*-
+
+License
+=======
+
+Copyright (c) 2013, Lev Givon.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+  copyright notice, this list of conditions and the following
+  disclaimer in the documentation and/or other materials provided
+  with the distribution.
+* Neither the name of Lev Givon nor the names of any
+  contributors may be used to endorse or promote products derived
+  from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/RELEASE.rst b/RELEASE.rst
index 77e8e85db6a76..cefb18c9f0ddf 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -32,6 +32,8 @@ pandas 0.11.1
   - pd.read_html() can now parse HTML string, files or urls and return
     dataframes courtesy of @cpcloud. (GH3477_)
+  - ``pd.read_msgpack()`` and ``pd.to_msgpack()`` are now a supported method of serializing
+    arbitrary pandas (and Python) objects in a lightweight, portable binary format (GH686_)

**Improvements to existing features**

@@ -75,6 +77,7 @@ pandas 0.11.1
 .. _GH3164: https://github.com/pydata/pandas/issues/3164
 .. _GH2786: https://github.com/pydata/pandas/issues/2786
+.. _GH686: https://github.com/pydata/pandas/issues/686
 .. _GH2194: https://github.com/pydata/pandas/issues/2194
 .. _GH3230: https://github.com/pydata/pandas/issues/3230
 .. _GH3251: https://github.com/pydata/pandas/issues/3251
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index 5739fe0922d6d..c6f036d9541a6 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -1192,47 +1192,6 @@ While float dtypes are unchanged.
    casted
    casted.dtypes

-.. _basics.serialize:
-
-Pickling and serialization
---------------------------
-
-All pandas objects are equipped with ``save`` methods which use Python's
-``cPickle`` module to save data structures to disk using the pickle format.
-
-.. ipython:: python
-
-   df
-   df.save('foo.pickle')
-
-The ``load`` function in the ``pandas`` namespace can be used to load any
-pickled pandas object (or any other pickled object) from file:
-
-
-.. ipython:: python
-
-   load('foo.pickle')
-
-There is also a ``save`` function which takes any object as its first argument:
-
-.. ipython:: python
-
-   save(df, 'foo.pickle')
-   load('foo.pickle')
-
-.. ipython:: python
-   :suppress:
-
-   import os
-   os.remove('foo.pickle')
-
-.. warning::
-
-   Loading pickled data received from untrusted sources can be unsafe.
-
-   See: http://docs.python.org/2.7/library/pickle.html
-
-
 Working with package options
 ----------------------------
diff --git a/doc/source/install.rst b/doc/source/install.rst
index 9d14d1b11c6b1..360ded91c86f0 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -93,6 +93,7 @@ Optional Dependencies
     version. Version 0.17.1 or higher.
   * `SciPy `__: miscellaneous statistical functions
   * `PyTables `__: necessary for HDF5-based storage
+  * `msgpack `__: necessary for msgpack-based serialization
   * `matplotlib `__: for plotting
   * `statsmodels `__
      * Needed for parts of :mod:`pandas.stats`
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 9001ae393d552..9df6f4d3ecca6 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -981,6 +981,94 @@ one can use the ExcelWriter class, as in the following example:

 .. _io.hdf5:

+.. _basics.serialize:
+
+Serialization
+-------------
+
+msgpack
+~~~~~~~
+
+Starting in 0.12.0, pandas supports the ``msgpack`` format for object
+serialization. This is a lightweight, portable binary format, similar to
+binary JSON, that is space efficient and provides good performance for both
+writing (serialization) and reading (deserialization).
+
+.. ipython:: python
+
+   df = DataFrame(np.random.rand(5,2),columns=list('AB'))
+   df.to_msgpack('foo.msg')
+   pd.read_msgpack('foo.msg')
+   s = Series(np.random.rand(5),index=date_range('20130101',periods=5))
+
+You can pass a list of objects and you will receive them back on deserialization.
+
+.. ipython:: python
+
+   pd.to_msgpack('foo.msg', df, 'foo', np.array([1,2,3]), s)
+   pd.read_msgpack('foo.msg')
+
+You can pass ``iterator=True`` to iterate over the unpacked results.
+
+.. ipython:: python
+
+   for o in pd.read_msgpack('foo.msg',iterator=True):
+       print o
+
+You can pass ``append=True`` to the writer to append to an existing pack.
+
+.. ipython:: python
+
+   df.to_msgpack('foo.msg',append=True)
+   pd.read_msgpack('foo.msg')
+
+.. ipython:: python
+   :suppress:
+   :okexcept:
+
+   os.remove('foo.msg')
+
+pickling
+~~~~~~~~
+
+All pandas objects are equipped with ``save`` methods which use Python's
+``cPickle`` module to save data structures to disk using the pickle format.
+
+.. ipython:: python
+
+   df
+   df.save('foo.pickle')
+
+The ``load`` function in the ``pandas`` namespace can be used to load any
+pickled pandas object (or any other pickled object) from file:
+
+.. ipython:: python
+
+   load('foo.pickle')
+
+There is also a ``save`` function which takes any object as its first argument:
+
+.. ipython:: python
+
+   save(df, 'foo.pickle')
+   load('foo.pickle')
+
+.. ipython:: python
+   :suppress:
+
+   import os
+   os.remove('foo.pickle')
+
+.. warning::
+
+   Loading pickled data received from untrusted sources can be unsafe.
+
+   See: http://docs.python.org/2.7/library/pickle.html
+
 HDF5 (PyTables)
 ---------------
diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt
index 5cfb24d71e19b..d87e9eea8a35e 100644
--- a/doc/source/v0.11.1.txt
+++ b/doc/source/v0.11.1.txt
@@ -1,10 +1,9 @@
-.. _whatsnew_0120:
+.. _whatsnew_0111:

-v0.12.0 (??)
+v0.11.1 (??)
 ------------------------

-This is a major release from 0.11.0 and includes many new features and
-enhancements along with a large number of bug fixes.
+This is a minor release from 0.11.0 and includes a small number of
+enhancements and bug fixes.
 API changes
 ~~~~~~~~~~~

@@ -12,7 +11,7 @@ API changes
 Enhancements
 ~~~~~~~~~~~~

-  - pd.read_html() can now parse HTML string, files or urls and return dataframes
+  - ``pd.read_html()`` can now parse HTML strings, files or urls and return dataframes
     courtesy of @cpcloud. (GH3477_)

 See the `full release notes
diff --git a/doc/source/v0.12.0.txt b/doc/source/v0.12.0.txt
new file mode 100644
index 0000000000000..ccb9347135c48
--- /dev/null
+++ b/doc/source/v0.12.0.txt
@@ -0,0 +1,42 @@
+.. _whatsnew_0120:
+
+v0.12.0 (??????)
+----------------
+
+This is a major release from 0.11.1 and includes many new features and
+enhancements along with a large number of bug fixes. There are also a
+number of important API changes that long-time pandas users should
+pay close attention to.
+
+Enhancements
+~~~~~~~~~~~~
+
+- ``pd.read_msgpack()`` and ``pd.to_msgpack()`` are now a supported method of serializing
+  arbitrary pandas (and Python) objects in a lightweight, portable binary format
+
+  .. ipython:: python
+
+     df = DataFrame(np.random.rand(5,2),columns=list('AB'))
+     df.to_msgpack('foo.msg')
+     pd.read_msgpack('foo.msg')
+
+     s = Series(np.random.rand(5),index=date_range('20130101',periods=5))
+     pd.to_msgpack('foo.msg', df, s)
+     pd.read_msgpack('foo.msg')
+
+  You can pass ``iterator=True`` to iterate over the unpacked results
+
+  .. ipython:: python
+
+     for o in pd.read_msgpack('foo.msg',iterator=True):
+         print o
+
+  .. ipython:: python
+     :suppress:
+     :okexcept:
+
+     os.remove('foo.msg')
+
+See the `full release notes
+`__ or issue tracker
+on GitHub for a complete list.
diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst
index 81bd39dd0e70f..a02e41176ced1 100644
--- a/doc/source/whatsnew.rst
+++ b/doc/source/whatsnew.rst
@@ -18,6 +18,8 @@ These are new features and improvements of note in each release.

 .. include:: v0.12.0.txt

+.. include:: v0.11.1.txt
+
 .. include:: v0.11.0.txt

 .. include:: v0.10.1.txt
diff --git a/pandas/__init__.py b/pandas/__init__.py
index bf5bcc81bc21e..3aee9b2ab67d8 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -32,6 +32,7 @@
 from pandas.io.parsers import (read_csv, read_table, read_clipboard,
                                read_fwf, to_clipboard, ExcelFile,
                                ExcelWriter)
+from pandas.io.packers import read_msgpack, to_msgpack
 from pandas.io.pytables import HDFStore, Term, get_store, read_hdf
 from pandas.io.html import read_html
 from pandas.util.testing import debug
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index ed90aab715cfd..571ab4fab07ce 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -487,6 +487,10 @@ def to_hdf(self, path_or_buf, key, **kwargs):
         from pandas.io import pytables
         return pytables.to_hdf(path_or_buf, key, self, **kwargs)

+    def to_msgpack(self, path_or_buf, **kwargs):
+        from pandas.io import packers
+        return packers.to_msgpack(path_or_buf, self, **kwargs)
+
 # install the indexerse
 for _name, _indexer in indexing.get_indexers_list():
     PandasObject._create_indexer(_name,_indexer)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 5c0f9253beb62..4628773491d61 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -15,7 +15,6 @@
 from pandas.tslib import Timestamp
 from pandas.util import py3compat

-
 class Block(object):
     """
     Canonical n-dimensional unit of homogeneous dtype contained in a pandas
diff --git a/pandas/io/packers.py b/pandas/io/packers.py
new file mode 100644
index 0000000000000..bc32c3c4d4011
--- /dev/null
+++ b/pandas/io/packers.py
@@ -0,0 +1,522 @@
+"""
+Msgpack serializer support for reading and writing pandas data structures
+to disk
+"""
+
+# portions of the msgpack_numpy package, by Lev Givon, were incorporated
+# into this module (and test_packers.py)
+
+"""
+License
+=======
+
+Copyright (c) 2013, Lev Givon.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+  copyright notice, this list of conditions and the following
+  disclaimer in the documentation and/or other materials provided
+  with the distribution.
+* Neither the name of Lev Givon nor the names of any
+  contributors may be used to endorse or promote products derived
+  from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+""" + +from datetime import datetime, date, timedelta +import time +import re +import copy +import itertools +import warnings +from dateutil.parser import parse + +import numpy as np +from pandas import ( + Timestamp, Period, Series, TimeSeries, DataFrame, Panel, Panel4D, + Index, MultiIndex, Int64Index, PeriodIndex, DatetimeIndex, NaT +) +from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel +from pandas.sparse.array import BlockIndex, IntIndex +from pandas.tseries.api import PeriodIndex, DatetimeIndex +from pandas.core.index import Int64Index, _ensure_index +import pandas.core.common as com +from pandas.core.generic import NDFrame +from pandas.core.common import needs_i8_conversion +from pandas.core.internals import BlockManager, make_block +import pandas.core.internals as internals + +try: + import msgpack + from msgpack import _packer, _unpacker + _USE_MSGPACK = True +except: + _USE_MSGPACK = False + +import zlib + +try: + import blosc + _BLOSC = True +except: + _BLOSC = False + +## until we can pass this into our conversion functions, +## this is pretty hacky +compressor = None + +def to_msgpack(path, *args, **kwargs): + """ + msgpack (serialize) object to input file path + + Parameters + ---------- + path : string + File path + args : an object or objects to serialize + + append : boolean whether to append to an existing msgpack + (default is False) + compress : type of compressor (zlib or blosc), default to None (no compression) + """ + if not _USE_MSGPACK: + raise Exception("please install msgpack to create msgpack stores!") + + global compressor + compressor = kwargs.get('compress') + append = kwargs.get('append') + if append: + f = open(path, 'a+b') + else: + f = open(path, 'wb') + try: + if len(args) == 1: + f.write(pack(args[0])) + else: + for a in args: + f.write(pack(a)) + finally: + f.close() + + +def read_msgpack(path, iterator=False, **kwargs): + """ + Load msgpack pandas object from the specified + file path + + Parameters + ---------- + path : string + File path + iterator : boolean, if True, return an iterator to the unpacker + (default is False) + + Returns + ------- + obj : type of object stored in file + + """ + if not _USE_MSGPACK: + raise Exception("please install msgpack to read msgpack stores!") + if iterator: + return Iterator(path) + + with open(path,'rb') as fh: + l = list(unpack(fh)) + if len(l) == 1: + return l[0] + return l + +dtype_dict = { 21 : np.dtype('M8[ns]'), + u'datetime64[ns]' : np.dtype('M8[ns]'), + u'datetime64[us]' : np.dtype('M8[us]'), + 22 : np.dtype('m8[ns]'), + u'timedelta64[ns]' : np.dtype('m8[ns]'), + u'timedelta64[us]' : np.dtype('m8[us]') } + +def dtype_for(t): + if t in dtype_dict: + return dtype_dict[t] + return np.typeDict[t] + +c2f_dict = {'complex': np.float64, + 'complex128': np.float64, + 'complex256': np.float128, + 'complex64': np.float32} + +def c2f(r, i, ctype_name): + """ + Convert strings to complex number instance with specified numpy type. 
+ """ + + ftype = c2f_dict[ctype_name] + return np.typeDict[ctype_name](ftype(r)+1j*ftype(i)) + + +def convert(values): + """ convert the numpy values to a list """ + + dtype = values.dtype + if needs_i8_conversion(dtype): + values = values.view('i8') + v = values.ravel() + + if compressor == 'zlib': + + # return string arrays like they are + if dtype == np.object_: + return v.tolist() + + # convert to a bytes array + v = v.tostring() + return zlib.compress(v) + + elif compressor == 'blosc' and _BLOSC: + + # return string arrays like they are + if dtype == np.object_: + return v.tolist() + + # convert to a bytes array + v = v.tostring() + return blosc.compress(v,typesize=dtype.itemsize) + + # as a list + return v.tolist() + +def unconvert(values, dtype, compress): + + if dtype == np.object_: + return np.array(values,dtype=object) + + if compress == 'zlib': + + values = zlib.decompress(values) + return np.frombuffer(values,dtype=dtype) + + elif compress == 'blosc': + + if not _BLOSC: + raise Exception("cannot uncompress w/o blosc") + + # decompress + values = blosc.decompress(values) + + return np.frombuffer(values,dtype=dtype) + + # as a list + return np.array(values,dtype=dtype) + +def encode(obj): + """ + Data encoder + """ + + tobj = type(obj) + if isinstance(obj, Index): + if isinstance(obj, PeriodIndex): + return {'typ' : 'period_index', + 'klass' : obj.__class__.__name__, + 'name' : getattr(obj,'name',None), + 'dtype': obj.dtype.num, + 'data': obj.tolist() } + elif isinstance(obj, DatetimeIndex): + return {'typ' : 'datetime_index', + 'klass' : obj.__class__.__name__, + 'name' : getattr(obj,'name',None), + 'dtype': obj.dtype.num, + 'data': obj.values.view('i8').tolist(), + 'freq' : obj.freqstr, + 'tz' : obj.tz} + elif isinstance(obj, MultiIndex): + return {'typ' : 'multi_index', + 'klass' : obj.__class__.__name__, + 'names' : getattr(obj,'names',None), + 'dtype': obj.dtype.num, + 'data': obj.tolist() } + else: + return {'typ' : 'index', + 'klass' : obj.__class__.__name__, + 'name' : getattr(obj,'name',None), + 'dtype': obj.dtype.num, + 'data': obj.tolist() } + elif isinstance(obj, Series): + if isinstance(obj, SparseSeries): + d = {'typ' : 'sparse_series', + 'klass' : obj.__class__.__name__, + 'dtype': obj.dtype.num, + 'index' : obj.index, + 'sp_index' : obj.sp_index, + 'sp_values' : convert(obj.sp_values), + 'compress' : compressor} + for f in ['name','fill_value','kind']: + d[f] = getattr(obj,f,None) + return d + else: + return {'typ' : 'series', + 'klass' : obj.__class__.__name__, + 'name' : getattr(obj,'name',None), + 'index' : obj.index, + 'dtype': obj.dtype.num, + 'data': convert(obj.values), + 'compress' : compressor} + elif issubclass(tobj, NDFrame): + if isinstance(obj, SparseDataFrame): + d = {'typ' : 'sparse_dataframe', + 'klass' : obj.__class__.__name__, + 'columns' : obj.columns } + for f in ['default_fill_value','default_kind']: + d[f] = getattr(obj,f,None) + d['data'] = dict([ (name,ss) for name,ss in obj.iteritems() ]) + return d + elif isinstance(obj, SparsePanel): + d = {'typ' : 'sparse_panel', + 'klass' : obj.__class__.__name__, + 'items' : obj.items } + for f in ['default_fill_value','default_kind']: + d[f] = getattr(obj,f,None) + d['data'] = dict([ (name,df) for name,df in obj.iteritems() ]) + return d + else: + + data = obj._data + if not data.is_consolidated(): + data = data.consolidate() + + # the block manager + return {'typ' : 'block_manager', + 'klass' : obj.__class__.__name__, + 'axes' : data.axes, + 'blocks' : [ { 'items' : b.items, + 'values' : 
+                    'blocks' : [ { 'items' : b.items,
+                                   'values' : convert(b.values),
+                                   'shape' : b.values.shape,
+                                   'dtype' : b.dtype.num,
+                                   'klass' : b.__class__.__name__,
+                                   'compress' : compressor
+                                   } for b in data.blocks ] }
+
+    elif isinstance(obj, (datetime,date,timedelta)):
+        if isinstance(obj, Timestamp):
+            tz = obj.tzinfo
+            if tz is not None:
+                tz = tz.zone
+            offset = obj.offset
+            if offset is not None:
+                offset = offset.freqstr
+            return {'typ' : 'timestamp',
+                    'value': obj.value,
+                    'offset' : offset,
+                    'tz' : tz}
+        elif isinstance(obj, timedelta):
+            return { 'typ' : 'timedelta',
+                     'data' : (obj.days,obj.seconds,obj.microseconds) }
+        elif isinstance(obj, datetime):
+            return { 'typ' : 'datetime',
+                     'data' : obj.isoformat() }
+        elif isinstance(obj, date):
+            return { 'typ' : 'date',
+                     'data' : obj.isoformat() }
+        raise Exception("cannot encode this datetimelike object: %s" % obj)
+    elif isinstance(obj, Period):
+        return {'typ' : 'period',
+                'ordinal' : obj.ordinal,
+                'freq' : obj.freq }
+    elif isinstance(obj, BlockIndex):
+        return { 'typ' : 'block_index',
+                 'klass' : obj.__class__.__name__,
+                 'blocs' : obj.blocs,
+                 'blengths' : obj.blengths,
+                 'length' : obj.length }
+    elif isinstance(obj, IntIndex):
+        return { 'typ' : 'int_index',
+                 'klass' : obj.__class__.__name__,
+                 'indices' : obj.indices,
+                 'length' : obj.length }
+    elif isinstance(obj, np.ndarray):
+        return {'typ' : 'ndarray',
+                'shape': obj.shape,
+                'ndim': obj.ndim,
+                'dtype': obj.dtype.num,
+                'data': convert(obj),
+                'compress' : compressor }
+    elif isinstance(obj, np.timedelta64):
+        return { 'typ' : 'np_timedelta64',
+                 'data' : obj.view('i8') }
+    elif isinstance(obj, np.number):
+        if np.iscomplexobj(obj):
+            return {'typ' : 'np_scalar',
+                    'sub_typ' : 'np_complex',
+                    'dtype': obj.dtype.name,
+                    'real': obj.real.__repr__(),
+                    'imag': obj.imag.__repr__()}
+        else:
+            return {'typ' : 'np_scalar',
+                    'dtype': obj.dtype.name,
+                    'data': obj.__repr__()}
+    elif isinstance(obj, complex):
+        return {'typ' : 'np_complex',
+                'real': obj.real.__repr__(),
+                'imag': obj.imag.__repr__()}
+
+    return obj
+
+def decode(obj):
+    """
+    Decoder for deserializing numpy data types.
+ """ + + typ = obj.get('typ') + if typ is None: + return obj + elif typ == 'timestamp': + return Timestamp(obj['value'],tz=obj['tz'],offset=obj['offset']) + elif typ == 'period': + return Period(ordinal=obj['ordinal'],freq=obj['freq']) + elif typ == 'index': + dtype = dtype_for(obj['dtype']) + data = obj['data'] + return globals()[obj['klass']](data,dtype=dtype,name=obj['name']) + elif typ == 'multi_index': + return globals()[obj['klass']].from_tuples(obj['data'],names=obj['names']) + elif typ == 'period_index': + return globals()[obj['klass']](obj['data'],name=obj['name']) + elif typ == 'datetime_index': + return globals()[obj['klass']](obj['data'],freq=obj['freq'],tz=obj['tz'],name=obj['name']) + elif typ == 'series': + dtype = dtype_for(obj['dtype']) + index = obj['index'] + return globals()[obj['klass']](unconvert(obj['data'],dtype,obj['compress']),index=index,name=obj['name']) + elif typ == 'block_manager': + axes = obj['axes'] + + def create_block(b): + dtype = dtype_for(b['dtype']) + return make_block(unconvert(b['values'],dtype,b['compress']).reshape(b['shape']),b['items'],axes[0],klass=getattr(internals,b['klass'])) + + blocks = [ create_block(b) for b in obj['blocks'] ] + return globals()[obj['klass']](BlockManager(blocks, axes)) + elif typ == 'datetime': + return parse(obj['data']) + elif typ == 'date': + return parse(obj['data']).date() + elif typ == 'timedelta': + return timedelta(*obj['data']) + elif typ == 'sparse_series': + dtype = dtype_for(obj['dtype']) + return globals()[obj['klass']](unconvert(obj['sp_values'],dtype,obj['compress']),sparse_index=obj['sp_index'], + index=obj['index'],fill_value=obj['fill_value'],kind=obj['kind'],name=obj['name']) + elif typ == 'sparse_dataframe': + return globals()[obj['klass']](obj['data'], + columns=obj['columns'],default_fill_value=obj['default_fill_value'],default_kind=obj['default_kind']) + elif typ == 'sparse_panel': + return globals()[obj['klass']](obj['data'], + items=obj['items'],default_fill_value=obj['default_fill_value'],default_kind=obj['default_kind']) + elif typ == 'block_index': + return globals()[obj['klass']](obj['length'],obj['blocs'],obj['blengths']) + elif typ == 'int_index': + return globals()[obj['klass']](obj['length'],obj['indices']) + elif typ == 'ndarray': + return unconvert(obj['data'],np.typeDict[obj['dtype']],obj['compress']).reshape(obj['shape']) + elif typ == 'np_timedelta64': + return np.timedelta64(obj['data']) + elif typ == 'np_scalar': + if obj.get('sub_typ') == 'np_complex': + return c2f(obj['real'], obj['imag'], obj['dtype']) + else: + dtype = dtype_for(obj['dtype']) + try: + return dtype(obj['data']) + except: + return dtype.type(obj['data']) + elif typ == 'np_complex': + return complex(obj['real']+'+'+obj['imag']+'j') + elif isinstance(obj, (dict,list,set)): + return obj + else: + return obj + +def pack(o, default=encode, + encoding=None, unicode_errors='strict', use_single_float=False): + """ + Pack an object and return the packed bytes. 
+ """ + + return Packer(default=default, encoding=encoding, + unicode_errors=unicode_errors, + use_single_float=use_single_float).pack(o) + +def unpack(packed, object_hook=decode, + list_hook=None, use_list=False, encoding=None, + unicode_errors='strict', object_pairs_hook=None): + """ + Unpack a packed object, return an iterator + Note: packed lists will be returned as tuples + """ + + return Unpacker(packed, object_hook=object_hook, + list_hook=list_hook, + use_list=use_list, encoding=encoding, + unicode_errors=unicode_errors, + object_pairs_hook=object_pairs_hook) + +if _USE_MSGPACK: + + class Packer(_packer.Packer): + def __init__(self, default=encode, + encoding=None, + unicode_errors='strict', + use_single_float=False): + super(Packer, self).__init__(default=default, + encoding=encoding, + unicode_errors=unicode_errors, + use_single_float=use_single_float) + + class Unpacker(_unpacker.Unpacker): + def __init__(self, file_like=None, read_size=0, use_list=False, + object_hook=decode, + object_pairs_hook=None, list_hook=None, encoding=None, + unicode_errors='strict', max_buffer_size=0): + super(Unpacker, self).__init__(file_like=file_like, + read_size=read_size, + use_list=use_list, + object_hook=object_hook, + object_pairs_hook=object_pairs_hook, + list_hook=list_hook, + encoding=encoding, + unicode_errors=unicode_errors, + max_buffer_size=max_buffer_size) + +class Iterator(object): + """ manage the unpacking iteration, + close the file on completion """ + + def __init__(self, path, **kwargs): + self.path = path + self.kwargs = kwargs + + def __iter__(self): + + try: + fh = open(self.path,'rb') + unpacker = unpack(fh) + for o in unpacker: + yield o + finally: + fh.close() diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py new file mode 100644 index 0000000000000..f9e25f3956d38 --- /dev/null +++ b/pandas/io/tests/test_packers.py @@ -0,0 +1,368 @@ +import nose +import unittest +import os +import sys +import warnings + +import datetime +import numpy as np + +from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, + date_range, period_range, Index, SparseSeries, SparseDataFrame, + SparsePanel) +import pandas.util.testing as tm +from pandas.util.testing import ensure_clean +from pandas.tests.test_series import assert_series_equal +from pandas.tests.test_frame import assert_frame_equal +from pandas.tests.test_panel import assert_panel_equal + +import pandas +from pandas.sparse.tests.test_sparse import assert_sp_series_equal, assert_sp_frame_equal +from pandas import concat, Timestamp, tslib + +from numpy.testing.decorators import slow +nan = np.nan + +from pandas.io.packers import to_msgpack, read_msgpack, _USE_MSGPACK + +if not _USE_MSGPACK: + raise nose.SkipTest('no msgpack') + +_multiprocess_can_split_ = False + +def check_arbitrary(a, b): + + if isinstance(a,(list,tuple)) and isinstance(b,(list,tuple)): + assert(len(a) == len(b)) + for a_, b_ in zip(a,b): + check_arbitrary(a_,b_) + elif isinstance(a,Panel): + assert_panel_equal(a,b) + elif isinstance(a,DataFrame): + assert_frame_equal(a,b) + elif isinstance(a,Series): + assert_series_equal(a,b) + else: + assert(a == b) + +class Test(unittest.TestCase): + + def setUp(self): + self.path = '__%s__.msg' % tm.rands(10) + + def tearDown(self): + pass + + def encode_decode(self, x): + with ensure_clean(self.path) as p: + to_msgpack(p,x) + return read_msgpack(p) + +class TestNumpy(Test): + + def test_numpy_scalar_float(self): + x = np.float32(np.random.rand()) + x_rec = self.encode_decode(x) + assert x 
+
+    def test_numpy_scalar_complex(self):
+        x = np.complex64(np.random.rand()+1j*np.random.rand())
+        x_rec = self.encode_decode(x)
+        assert x == x_rec and type(x) == type(x_rec)
+
+    def test_scalar_float(self):
+        x = np.random.rand()
+        x_rec = self.encode_decode(x)
+        assert x == x_rec and type(x) == type(x_rec)
+
+    def test_scalar_complex(self):
+        x = np.random.rand()+1j*np.random.rand()
+        x_rec = self.encode_decode(x)
+        assert x == x_rec and type(x) == type(x_rec)
+
+    def test_list_numpy_float(self):
+        x = [np.float32(np.random.rand()) for i in xrange(5)]
+        x_rec = self.encode_decode(x)
+        assert all(map(lambda x,y: x == y, x, x_rec)) and \
+            all(map(lambda x,y: type(x) == type(y), x, x_rec))
+
+    def test_list_numpy_float_complex(self):
+        x = [np.float32(np.random.rand()) for i in xrange(5)] + \
+            [np.complex128(np.random.rand()+1j*np.random.rand()) for i in xrange(5)]
+        x_rec = self.encode_decode(x)
+        assert all(map(lambda x,y: x == y, x, x_rec)) and \
+            all(map(lambda x,y: type(x) == type(y), x, x_rec))
+
+    def test_list_float(self):
+        x = [np.random.rand() for i in xrange(5)]
+        x_rec = self.encode_decode(x)
+        assert all(map(lambda x,y: x == y, x, x_rec)) and \
+            all(map(lambda x,y: type(x) == type(y), x, x_rec))
+
+    def test_list_float_complex(self):
+        x = [np.random.rand() for i in xrange(5)] + \
+            [(np.random.rand()+1j*np.random.rand()) for i in xrange(5)]
+        x_rec = self.encode_decode(x)
+        assert all(map(lambda x,y: x == y, x, x_rec)) and \
+            all(map(lambda x,y: type(x) == type(y), x, x_rec))
+
+    def test_dict_float(self):
+        x = {'foo': 1.0, 'bar': 2.0}
+        x_rec = self.encode_decode(x)
+        assert all(map(lambda x,y: x == y, x.values(), x_rec.values())) and \
+            all(map(lambda x,y: type(x) == type(y), x.values(), x_rec.values()))
+
+    def test_dict_complex(self):
+        x = {'foo': 1.0+1.0j, 'bar': 2.0+2.0j}
+        x_rec = self.encode_decode(x)
+        assert all(map(lambda x,y: x == y, x.values(), x_rec.values())) and \
+            all(map(lambda x,y: type(x) == type(y), x.values(), x_rec.values()))
+
+    def test_dict_numpy_float(self):
+        x = {'foo': np.float32(1.0), 'bar': np.float32(2.0)}
+        x_rec = self.encode_decode(x)
+        assert all(map(lambda x,y: x == y, x.values(), x_rec.values())) and \
+            all(map(lambda x,y: type(x) == type(y), x.values(), x_rec.values()))
+
+    def test_dict_numpy_complex(self):
+        x = {'foo': np.complex128(1.0+1.0j), 'bar': np.complex128(2.0+2.0j)}
+        x_rec = self.encode_decode(x)
+        assert all(map(lambda x,y: x == y, x.values(), x_rec.values())) and \
+            all(map(lambda x,y: type(x) == type(y), x.values(), x_rec.values()))
+
+    def test_numpy_array_float(self):
+        x = np.random.rand(5).astype(np.float32)
+        x_rec = self.encode_decode(x)
+        assert all(map(lambda x,y: x == y, x, x_rec)) and \
+            x.dtype == x_rec.dtype
+
+    def test_numpy_array_complex(self):
+        x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128)
+        x_rec = self.encode_decode(x)
+        assert all(map(lambda x,y: x == y, x, x_rec)) and \
+            x.dtype == x_rec.dtype
+
+    def test_list_mixed(self):
+        x = [1.0, np.float32(3.5), np.complex128(4.25), u'foo']
+        x_rec = self.encode_decode(x)
+        assert all(map(lambda x,y: x == y, x, x_rec)) and \
+            all(map(lambda x,y: type(x) == type(y), x, x_rec))
+
+class TestBasic(Test):
+
+    def test_timestamp(self):
+
+        for i in [ Timestamp('20130101'), Timestamp('20130101',tz='US/Eastern'),
+                   Timestamp('201301010501') ]:
+            i_rec = self.encode_decode(i)
+            self.assert_(i == i_rec)
+
+    def test_datetimes(self):
+
+        for i in [ datetime.datetime(2013,1,1),
+                   datetime.datetime(2013,1,1,5,1),
+                   datetime.date(2013,1,1),
+                   np.datetime64('2013-01-05 2:15') ]:
+            i_rec = self.encode_decode(i)
+            self.assert_(i == i_rec)
+
+    def test_timedeltas(self):
+
+        for i in [ datetime.timedelta(days=1),
+                   datetime.timedelta(days=1,seconds=10),
+                   np.timedelta64(1000000) ]:
+            i_rec = self.encode_decode(i)
+            self.assert_(i == i_rec)
+
+
+class TestIndex(Test):
+
+    def setUp(self):
+        super(TestIndex, self).setUp()
+
+        self.d = {
+            'string' : tm.makeStringIndex(100),
+            'date' : tm.makeDateIndex(100),
+            'int' : tm.makeIntIndex(100),
+            'float' : tm.makeFloatIndex(100),
+            'empty' : Index([]),
+            'tuple' : Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])),
+            'period' : Index(period_range('2012-1-1', freq='M', periods=3)),
+            'date2' : Index(date_range('2013-01-1', periods=10)),
+            'bdate' : Index(bdate_range('2013-01-02',periods=10)),
+        }
+
+        self.mi = {
+            'reg' : MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'),
+                                            ('foo', 'two'), ('qux', 'one'),
+                                            ('qux', 'two')],
+                                           names=['first','second']),
+        }
+
+    def test_basic_index(self):
+
+        for s, i in self.d.items():
+            i_rec = self.encode_decode(i)
+            self.assert_(i.equals(i_rec))
+
+    def test_multi_index(self):
+
+        for s, i in self.mi.items():
+            i_rec = self.encode_decode(i)
+            self.assert_(i.equals(i_rec))
+
+    def test_unicode(self):
+        i = tm.makeUnicodeIndex(100)
+        i_rec = self.encode_decode(i)
+        self.assert_(i.equals(i_rec))
+
+class TestSeries(Test):
+
+    def setUp(self):
+        super(TestSeries, self).setUp()
+
+        self.d = {}
+
+        s = tm.makeStringSeries()
+        s.name = 'string'
+        self.d['string'] = s
+
+        s = tm.makeObjectSeries()
+        s.name = 'object'
+        self.d['object'] = s
+
+        s = Series(tslib.iNaT, dtype='M8[ns]', index=range(5))
+        self.d['date'] = s
+
+        data = {
+            'A': [0., 1., 2., 3., np.nan],
+            'B': [0, 1, 0, 1, 0],
+            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
+            'D': date_range('1/1/2009', periods=5),
+            'E' : [0., 1, Timestamp('20100101'),'foo',2.],
+        }
+
+        self.d['float'] = Series(data['A'])
+        self.d['int'] = Series(data['B'])
+        self.d['mixed'] = Series(data['E'])
+
+    def test_basic(self):
+
+        for s, i in self.d.items():
+            i_rec = self.encode_decode(i)
+            assert_series_equal(i,i_rec)
+
+class TestNDFrame(Test):
+
+    def setUp(self):
+        super(TestNDFrame, self).setUp()
+
+        data = {
+            'A': [0., 1., 2., 3., np.nan],
+            'B': [0, 1, 0, 1, 0],
+            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
+            'D': date_range('1/1/2009', periods=5),
+            'E' : [0., 1, Timestamp('20100101'),'foo',2.],
+        }
+
+        self.frame = {
+            'float' : DataFrame(dict(A = data['A'], B = Series(data['A']) + 1)),
+            'int' : DataFrame(dict(A = data['B'], B = Series(data['B']) + 1)),
+            'mixed' : DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])) }
+
+        self.panel = {
+            'float' : Panel(dict(ItemA = self.frame['float'],
+                                 ItemB = self.frame['float']+1)) }
+
+    def test_basic_frame(self):
+
+        for s, i in self.frame.items():
+            i_rec = self.encode_decode(i)
+            assert_frame_equal(i,i_rec)
+
+    def test_basic_panel(self):
+
+        for s, i in self.panel.items():
+            i_rec = self.encode_decode(i)
+            assert_panel_equal(i,i_rec)
+
+    def test_multi(self):
+
+        i_rec = self.encode_decode(self.frame)
+        for k in self.frame.keys():
+            assert_frame_equal(self.frame[k],i_rec[k])
+
+        l = tuple([ self.frame['float'], self.frame['float'].A,
+                    self.frame['float'].B, None ])
+        l_rec = self.encode_decode(l)
+        check_arbitrary(l,l_rec)
+
+        # this is an oddity in that packed lists will be returned as tuples
+        l = [ self.frame['float'], self.frame['float'].A,
+              self.frame['float'].B, None ]
+        l_rec = self.encode_decode(l)
+        self.assert_(isinstance(l_rec,tuple))
+        check_arbitrary(l,l_rec)
+
+    def test_iterator(self):
+
+        l = [ self.frame['float'], self.frame['float'].A,
+              self.frame['float'].B, None ]
+
+        with ensure_clean(self.path) as path:
+            to_msgpack(path,*l)
+            for i, packed in enumerate(read_msgpack(path, iterator=True)):
+                check_arbitrary(packed,l[i])
+
+class TestSparse(Test):
+
+    def _check_roundtrip(self, obj, comparator, **kwargs):
+
+        i_rec = self.encode_decode(obj)
+        comparator(obj,i_rec,**kwargs)
+
+    def test_sparse_series(self):
+
+        s = tm.makeStringSeries()
+        s[3:5] = np.nan
+        ss = s.to_sparse()
+        self._check_roundtrip(ss, tm.assert_series_equal,
+                              check_series_type=True)
+
+        ss2 = s.to_sparse(kind='integer')
+        self._check_roundtrip(ss2, tm.assert_series_equal,
+                              check_series_type=True)
+
+        ss3 = s.to_sparse(fill_value=0)
+        self._check_roundtrip(ss3, tm.assert_series_equal,
+                              check_series_type=True)
+
+    def test_sparse_frame(self):
+
+        s = tm.makeDataFrame()
+        s.ix[3:5, 1:3] = np.nan
+        s.ix[8:10, -2] = np.nan
+        ss = s.to_sparse()
+
+        self._check_roundtrip(ss, tm.assert_frame_equal,
+                              check_frame_type=True)
+
+        ss2 = s.to_sparse(kind='integer')
+        self._check_roundtrip(ss2, tm.assert_frame_equal,
+                              check_frame_type=True)
+
+        ss3 = s.to_sparse(fill_value=0)
+        self._check_roundtrip(ss3, tm.assert_frame_equal,
+                              check_frame_type=True)
+
+    def test_sparse_panel(self):
+
+        items = ['x', 'y', 'z']
+        p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items))
+        sp = p.to_sparse()
+
+        self._check_roundtrip(sp, tm.assert_panel_equal,
+                              check_panel_type=True)
+
+        sp2 = p.to_sparse(kind='integer')
+        self._check_roundtrip(sp2, tm.assert_panel_equal,
+                              check_panel_type=True)
+
+        sp3 = p.to_sparse(fill_value=0)
+        self._check_roundtrip(sp3, tm.assert_panel_equal,
+                              check_panel_type=True)
+
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
+                   exit=False)
diff --git a/vb_suite/packers.py b/vb_suite/packers.py
new file mode 100644
index 0000000000000..6733b5fa6dfb8
--- /dev/null
+++ b/vb_suite/packers.py
@@ -0,0 +1,80 @@
+from vbench.api import Benchmark
+from datetime import datetime
+
+start_date = datetime(2013, 5, 1)
+
+common_setup = """from pandas_vb_common import *
+import os
+from pandas.io import packers
+from pandas.core import common as com
+
+f = '__test__.msg'
+def remove(f):
+    try:
+        os.remove(f)
+    except OSError:
+        pass
+
+"""
+
+#----------------------------------------------------------------------
+# read a pack
+
+setup1 = common_setup + """
+index = date_range('20000101',periods=25000,freq='H')
+df = DataFrame({'float1' : randn(25000),
+                'float2' : randn(25000)},
+               index=index)
+remove(f)
+packers.to_msgpack(f,df)
+"""
+
+read_pack = Benchmark("packers.read_msgpack(f)", setup1,
+                      start_date=start_date)
+
+
+#----------------------------------------------------------------------
+# write to a pack
+
+setup2 = common_setup + """
+index = date_range('20000101',periods=25000,freq='H')
+df = DataFrame({'float1' : randn(25000),
+                'float2' : randn(25000)},
+               index=index)
+remove(f)
+"""
+
+write_pack = Benchmark(
+    "packers.to_msgpack(f,df)", setup2, cleanup="remove(f)",
+    start_date=start_date)
+
+#----------------------------------------------------------------------
+# read a pickle
+
+setup1 = common_setup + """
+index = date_range('20000101',periods=25000,freq='H')
+df = DataFrame({'float1' : randn(25000),
+                'float2' : randn(25000)},
+               index=index)
+remove(f)
+df.save(f)
+"""
+
+read_pickle = Benchmark("com.load(f)", setup1,
+                        start_date=start_date)
+
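+#----------------------------------------------------------------------
+# write to a compressed pack
+#
+# Illustrative sketch, not part of the original suite: it exercises the
+# optional ``compress`` keyword that packers.to_msgpack() accepts in this
+# PR. zlib is in the stdlib, so the benchmark adds no new dependency;
+# the ``setup3`` and ``write_pack_zlib`` names are ours.
+
+setup3 = common_setup + """
+index = date_range('20000101',periods=25000,freq='H')
+df = DataFrame({'float1' : randn(25000),
+                'float2' : randn(25000)},
+               index=index)
+remove(f)
+"""
+
+write_pack_zlib = Benchmark(
+    "packers.to_msgpack(f,df,compress='zlib')", setup3, cleanup="remove(f)",
+    start_date=start_date)
+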
+#----------------------------------------------------------------------
+# write to a pickle
+
+setup2 = common_setup + """
+index = date_range('20000101',periods=25000,freq='H')
+df = DataFrame({'float1' : randn(25000),
+                'float2' : randn(25000)},
+               index=index)
+remove(f)
+"""
+
+write_pickle = Benchmark(
+    "df.save(f)", setup2, cleanup="remove(f)",
+    start_date=start_date)
diff --git a/vb_suite/suite.py b/vb_suite/suite.py
index 905c4371837cc..4ac967dc1664a 100644
--- a/vb_suite/suite.py
+++ b/vb_suite/suite.py
@@ -16,6 +16,7 @@
            'join_merge',
            'miscellaneous',
            'panel_ctor',
+           'packers',
            'parser',
            'reindex',
            'replace',
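
For quick reference, a minimal round-trip sketch of the API this diff adds (a usage
illustration rather than part of the patch; it assumes ``msgpack`` is installed, the
``compress='zlib'`` path needs only the stdlib, and the file names are arbitrary)::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.rand(5, 2), columns=list('AB'))

    # plain round-trip through the new msgpack serializer
    df.to_msgpack('example.msg')
    out = pd.read_msgpack('example.msg')

    # the same round-trip through the optional zlib compression path,
    # which is threaded through convert()/unconvert() above
    df.to_msgpack('example_z.msg', compress='zlib')
    out_z = pd.read_msgpack('example_z.msg')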