Skip to content

Commit 5a02cdf

Browse files
committed
TST: added compression (zlib/blosc) via big hack
1 parent a55e7e4 commit 5a02cdf

File tree

1 file changed

+78
-15
lines changed

1 file changed

+78
-15
lines changed

pandas/io/packers.py

+78-15
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,18 @@
7070
except:
7171
_USE_MSGPACK = False
7272

73+
import zlib

# blosc is an optional dependency; fall back to no-blosc support if it
# is not installed. Catch only ImportError -- a bare ``except:`` would
# also swallow SystemExit/KeyboardInterrupt.
try:
    import blosc
    _BLOSC = True
except ImportError:
    _BLOSC = False

# HACK: module-level compression selector ('zlib', 'blosc', or None).
# Until we can pass this into our conversion functions, to_msgpack()
# sets this global and convert() reads it.
compressor = None
84+
7385
def to_msgpack(path, *args, **kwargs):
7486
"""
7587
msgpack (serialize) object to input file path
@@ -82,10 +94,13 @@ def to_msgpack(path, *args, **kwargs):
8294
8395
append : boolean whether to append to an existing msgpack
8496
(default is False)
97+
compress : type of compressor (zlib or blosc), default to None (no compression)
8598
"""
8699
if not _USE_MSGPACK:
87100
raise Exception("please install msgpack to create msgpack stores!")
88101

102+
global compressor
103+
compressor = kwargs.get('compress')
89104
append = kwargs.get('append')
90105
if append:
91106
f = open(path, 'a+b')
@@ -154,14 +169,60 @@ def c2f(r, i, ctype_name):
154169
ftype = c2f_dict[ctype_name]
155170
return np.typeDict[ctype_name](ftype(r)+1j*ftype(i))
156171

172+
157173
def convert(values):
    """ convert the numpy values to a serializable form (compressed
    bytes when the module-level ``compressor`` is set, else a list) """

    dtype = values.dtype
    if needs_i8_conversion(dtype):
        # datetime-like values are serialized as their i8 representation
        values = values.view('i8')
    v = values.ravel()

    # object arrays (e.g. strings) cannot be compressed as a raw byte
    # buffer; always serialize them as a plain list
    if dtype == np.object_:
        return v.tolist()

    if compressor == 'zlib':
        # tobytes() is the supported spelling; tostring() was
        # deprecated and has been removed from modern NumPy
        return zlib.compress(v.tobytes())

    elif compressor == 'blosc' and _BLOSC:
        # NOTE(review): when 'blosc' is requested but blosc is not
        # installed we silently fall through to a plain list, while
        # encode() still records 'compress': 'blosc' -- decode would
        # then misinterpret the payload; confirm this is intended
        return blosc.compress(v.tobytes(), typesize=dtype.itemsize)

    # no compression: as a list
    return v.tolist()
203+
204+
def unconvert(values, dtype, compress):
    """ reverse of convert(): reconstruct an ndarray of ``dtype`` from
    serialized ``values``, decompressing per ``compress``
    ('zlib', 'blosc', or None) """

    # object arrays were serialized as plain lists
    if dtype == np.object_:
        return np.array(values, dtype=object)

    if compress == 'zlib':
        values = zlib.decompress(values)
        # np.frombuffer over an immutable bytes object yields a
        # READ-ONLY array; copy so callers can mutate the result
        return np.frombuffer(values, dtype=dtype).copy()

    elif compress == 'blosc':
        if not _BLOSC:
            raise Exception("cannot uncompress w/o blosc")

        # decompress, then copy for the same writability reason as above
        values = blosc.decompress(values)
        return np.frombuffer(values, dtype=dtype).copy()

    # no compression: values is a plain list
    return np.array(values, dtype=dtype)
165226

166227
def encode(obj):
167228
"""
@@ -203,7 +264,8 @@ def encode(obj):
203264
'dtype': obj.dtype.num,
204265
'index' : obj.index,
205266
'sp_index' : obj.sp_index,
206-
'sp_values' : convert(obj.sp_values)}
267+
'sp_values' : convert(obj.sp_values),
268+
'compress' : compressor}
207269
for f in ['name','fill_value','kind']:
208270
d[f] = getattr(obj,f,None)
209271
return d
@@ -213,7 +275,8 @@ def encode(obj):
213275
'name' : getattr(obj,'name',None),
214276
'index' : obj.index,
215277
'dtype': obj.dtype.num,
216-
'data': convert(obj.values) }
278+
'data': convert(obj.values),
279+
'compress' : compressor}
217280
elif issubclass(tobj, NDFrame):
218281
if isinstance(obj, SparseDataFrame):
219282
d = {'typ' : 'sparse_dataframe',
@@ -245,7 +308,8 @@ def encode(obj):
245308
'values' : convert(b.values),
246309
'shape' : b.values.shape,
247310
'dtype' : b.dtype.num,
248-
'klass' : b.__class__.__name__
311+
'klass' : b.__class__.__name__,
312+
'compress' : compressor
249313
} for b in data.blocks ] }
250314

251315
elif isinstance(obj, (datetime,date,timedelta)):
@@ -290,7 +354,8 @@ def encode(obj):
290354
'shape': obj.shape,
291355
'ndim': obj.ndim,
292356
'dtype': obj.dtype.num,
293-
'data': convert(obj)}
357+
'data': convert(obj),
358+
'compress' : compressor }
294359
elif isinstance(obj, np.timedelta64):
295360
return { 'typ' : 'np_timedelta64',
296361
'data' : obj.view('i8') }
@@ -337,13 +402,13 @@ def decode(obj):
337402
elif typ == 'series':
338403
dtype = dtype_for(obj['dtype'])
339404
index = obj['index']
340-
return globals()[obj['klass']](obj['data'],index=index,dtype=dtype,name=obj['name'])
405+
return globals()[obj['klass']](unconvert(obj['data'],dtype,obj['compress']),index=index,name=obj['name'])
341406
elif typ == 'block_manager':
342407
axes = obj['axes']
343408

344409
def create_block(b):
345410
dtype = dtype_for(b['dtype'])
346-
return make_block(np.array(b['values'],dtype=dtype).reshape(b['shape']),b['items'],axes[0],klass=getattr(internals,b['klass']))
411+
return make_block(unconvert(b['values'],dtype,b['compress']).reshape(b['shape']),b['items'],axes[0],klass=getattr(internals,b['klass']))
347412

348413
blocks = [ create_block(b) for b in obj['blocks'] ]
349414
return globals()[obj['klass']](BlockManager(blocks, axes))
@@ -355,7 +420,7 @@ def create_block(b):
355420
return timedelta(*obj['data'])
356421
elif typ == 'sparse_series':
357422
dtype = dtype_for(obj['dtype'])
358-
return globals()[obj['klass']](np.array(obj['sp_values'],dtype=dtype),sparse_index=obj['sp_index'],
423+
return globals()[obj['klass']](unconvert(obj['sp_values'],dtype,obj['compress']),sparse_index=obj['sp_index'],
359424
index=obj['index'],fill_value=obj['fill_value'],kind=obj['kind'],name=obj['name'])
360425
elif typ == 'sparse_dataframe':
361426
return globals()[obj['klass']](obj['data'],
@@ -368,9 +433,7 @@ def create_block(b):
368433
elif typ == 'int_index':
369434
return globals()[obj['klass']](obj['length'],obj['indices'])
370435
elif typ == 'ndarray':
371-
return np.array(obj['data'],
372-
dtype=np.typeDict[obj['dtype']],
373-
ndmin=obj['ndim']).reshape(obj['shape'])
436+
return unconvert(obj['data'],np.typeDict[obj['dtype']],obj['compress']).reshape(obj['shape'])
374437
elif typ == 'np_timedelta64':
375438
return np.timedelta64(obj['data'])
376439
elif typ == 'np_scalar':
@@ -390,7 +453,7 @@ def create_block(b):
390453
return obj
391454

392455
def pack(o, default=encode,
393-
encoding='utf-8', unicode_errors='strict', use_single_float=False):
456+
encoding=None, unicode_errors='strict', use_single_float=False):
394457
"""
395458
Pack an object and return the packed bytes.
396459
"""
@@ -400,7 +463,7 @@ def pack(o, default=encode,
400463
use_single_float=use_single_float).pack(o)
401464

402465
def unpack(packed, object_hook=decode,
403-
list_hook=None, use_list=False, encoding='utf-8',
466+
list_hook=None, use_list=False, encoding=None,
404467
unicode_errors='strict', object_pairs_hook=None):
405468
"""
406469
Unpack a packed object, return an iterator
@@ -417,7 +480,7 @@ def unpack(packed, object_hook=decode,
417480

418481
class Packer(_packer.Packer):
419482
def __init__(self, default=encode,
420-
encoding='utf-8',
483+
encoding=None,
421484
unicode_errors='strict',
422485
use_single_float=False):
423486
super(Packer, self).__init__(default=default,
@@ -428,7 +491,7 @@ def __init__(self, default=encode,
428491
class Unpacker(_unpacker.Unpacker):
429492
def __init__(self, file_like=None, read_size=0, use_list=False,
430493
object_hook=decode,
431-
object_pairs_hook=None, list_hook=None, encoding='utf-8',
494+
object_pairs_hook=None, list_hook=None, encoding=None,
432495
unicode_errors='strict', max_buffer_size=0):
433496
super(Unpacker, self).__init__(file_like=file_like,
434497
read_size=read_size,

0 commit comments

Comments
 (0)