70
70
except :
71
71
_USE_MSGPACK = False
72
72
73
import zlib

# blosc is an optional, faster compressor; fall back gracefully when absent.
# Catch ImportError only — a bare except would also hide unrelated failures
# (e.g. a broken blosc install raising something else worth seeing).
try:
    import blosc
    _BLOSC = True
except ImportError:
    _BLOSC = False

## until we can pass this into our conversion functions,
## this is pretty hacky
# module-level compression choice ('zlib', 'blosc', or None for none);
# set by to_msgpack() from its `compress` kwarg and read by convert()
compressor = None
84
+
73
85
def to_msgpack (path , * args , ** kwargs ):
74
86
"""
75
87
msgpack (serialize) object to input file path
@@ -82,10 +94,13 @@ def to_msgpack(path, *args, **kwargs):
82
94
83
95
append : boolean whether to append to an existing msgpack
84
96
(default is False)
97
+ compress : type of compressor (zlib or blosc), default to None (no compression)
85
98
"""
86
99
if not _USE_MSGPACK :
87
100
raise Exception ("please install msgpack to create msgpack stores!" )
88
101
102
+ global compressor
103
+ compressor = kwargs .get ('compress' )
89
104
append = kwargs .get ('append' )
90
105
if append :
91
106
f = open (path , 'a+b' )
@@ -154,14 +169,60 @@ def c2f(r, i, ctype_name):
154
169
ftype = c2f_dict [ctype_name ]
155
170
return np .typeDict [ctype_name ](ftype (r )+ 1j * ftype (i ))
156
171
172
+
157
173
def convert(values):
    """ convert the numpy values to a list, or to compressed bytes when the
    module-level ``compressor`` is set to 'zlib' or 'blosc'

    Parameters
    ----------
    values : ndarray

    Returns
    -------
    list (object dtype or no compression) or compressed bytes
    """

    dtype = values.dtype
    if needs_i8_conversion(dtype):
        # datetime64/timedelta64 serialize as their raw i8 representation
        values = values.view('i8')
    v = values.ravel()

    if compressor == 'zlib':

        # return string arrays like they are
        if dtype == np.object_:
            return v.tolist()

        # convert to a bytes array
        # NOTE: tobytes() replaces tostring(), which was deprecated in
        # NumPy 1.19 and removed in NumPy 2.0
        v = v.tobytes()
        return zlib.compress(v)

    elif compressor == 'blosc' and _BLOSC:

        # return string arrays like they are
        if dtype == np.object_:
            return v.tolist()

        # convert to a bytes array
        v = v.tobytes()
        return blosc.compress(v, typesize=dtype.itemsize)

    # as a list
    return v.tolist()
203
+
204
def unconvert(values, dtype, compress):
    """ rebuild an ndarray of ``dtype`` from serialized ``values``,
    decompressing first when ``compress`` is 'zlib' or 'blosc' """

    # object arrays were stored as plain lists — no decompression step
    if dtype == np.object_:
        return np.array(values, dtype=object)

    if compress == 'zlib':
        raw = zlib.decompress(values)
        return np.frombuffer(raw, dtype=dtype)

    elif compress == 'blosc':

        if not _BLOSC:
            raise Exception("cannot uncompress w/o blosc")

        # decompress
        raw = blosc.decompress(values)
        return np.frombuffer(raw, dtype=dtype)

    # uncompressed: values is a plain list
    return np.array(values, dtype=dtype)
165
226
166
227
def encode (obj ):
167
228
"""
@@ -203,7 +264,8 @@ def encode(obj):
203
264
'dtype' : obj .dtype .num ,
204
265
'index' : obj .index ,
205
266
'sp_index' : obj .sp_index ,
206
- 'sp_values' : convert (obj .sp_values )}
267
+ 'sp_values' : convert (obj .sp_values ),
268
+ 'compress' : compressor }
207
269
for f in ['name' ,'fill_value' ,'kind' ]:
208
270
d [f ] = getattr (obj ,f ,None )
209
271
return d
@@ -213,7 +275,8 @@ def encode(obj):
213
275
'name' : getattr (obj ,'name' ,None ),
214
276
'index' : obj .index ,
215
277
'dtype' : obj .dtype .num ,
216
- 'data' : convert (obj .values ) }
278
+ 'data' : convert (obj .values ),
279
+ 'compress' : compressor }
217
280
elif issubclass (tobj , NDFrame ):
218
281
if isinstance (obj , SparseDataFrame ):
219
282
d = {'typ' : 'sparse_dataframe' ,
@@ -245,7 +308,8 @@ def encode(obj):
245
308
'values' : convert (b .values ),
246
309
'shape' : b .values .shape ,
247
310
'dtype' : b .dtype .num ,
248
- 'klass' : b .__class__ .__name__
311
+ 'klass' : b .__class__ .__name__ ,
312
+ 'compress' : compressor
249
313
} for b in data .blocks ] }
250
314
251
315
elif isinstance (obj , (datetime ,date ,timedelta )):
@@ -290,7 +354,8 @@ def encode(obj):
290
354
'shape' : obj .shape ,
291
355
'ndim' : obj .ndim ,
292
356
'dtype' : obj .dtype .num ,
293
- 'data' : convert (obj )}
357
+ 'data' : convert (obj ),
358
+ 'compress' : compressor }
294
359
elif isinstance (obj , np .timedelta64 ):
295
360
return { 'typ' : 'np_timedelta64' ,
296
361
'data' : obj .view ('i8' ) }
@@ -337,13 +402,13 @@ def decode(obj):
337
402
elif typ == 'series' :
338
403
dtype = dtype_for (obj ['dtype' ])
339
404
index = obj ['index' ]
340
- return globals ()[obj ['klass' ]](obj ['data' ],index = index , dtype = dtype ,name = obj ['name' ])
405
+ return globals ()[obj ['klass' ]](unconvert ( obj ['data' ],dtype , obj [ 'compress' ]), index = index ,name = obj ['name' ])
341
406
elif typ == 'block_manager' :
342
407
axes = obj ['axes' ]
343
408
344
409
def create_block (b ):
345
410
dtype = dtype_for (b ['dtype' ])
346
- return make_block (np . array (b ['values' ],dtype = dtype ).reshape (b ['shape' ]),b ['items' ],axes [0 ],klass = getattr (internals ,b ['klass' ]))
411
+ return make_block (unconvert (b ['values' ],dtype , b [ 'compress' ] ).reshape (b ['shape' ]),b ['items' ],axes [0 ],klass = getattr (internals ,b ['klass' ]))
347
412
348
413
blocks = [ create_block (b ) for b in obj ['blocks' ] ]
349
414
return globals ()[obj ['klass' ]](BlockManager (blocks , axes ))
@@ -355,7 +420,7 @@ def create_block(b):
355
420
return timedelta (* obj ['data' ])
356
421
elif typ == 'sparse_series' :
357
422
dtype = dtype_for (obj ['dtype' ])
358
- return globals ()[obj ['klass' ]](np . array (obj ['sp_values' ],dtype = dtype ),sparse_index = obj ['sp_index' ],
423
+ return globals ()[obj ['klass' ]](unconvert (obj ['sp_values' ],dtype , obj [ 'compress' ] ),sparse_index = obj ['sp_index' ],
359
424
index = obj ['index' ],fill_value = obj ['fill_value' ],kind = obj ['kind' ],name = obj ['name' ])
360
425
elif typ == 'sparse_dataframe' :
361
426
return globals ()[obj ['klass' ]](obj ['data' ],
@@ -368,9 +433,7 @@ def create_block(b):
368
433
elif typ == 'int_index' :
369
434
return globals ()[obj ['klass' ]](obj ['length' ],obj ['indices' ])
370
435
elif typ == 'ndarray' :
371
- return np .array (obj ['data' ],
372
- dtype = np .typeDict [obj ['dtype' ]],
373
- ndmin = obj ['ndim' ]).reshape (obj ['shape' ])
436
+ return unconvert (obj ['data' ],np .typeDict [obj ['dtype' ]],obj ['compress' ]).reshape (obj ['shape' ])
374
437
elif typ == 'np_timedelta64' :
375
438
return np .timedelta64 (obj ['data' ])
376
439
elif typ == 'np_scalar' :
@@ -390,7 +453,7 @@ def create_block(b):
390
453
return obj
391
454
392
455
def pack (o , default = encode ,
393
- encoding = 'utf-8' , unicode_errors = 'strict' , use_single_float = False ):
456
+ encoding = None , unicode_errors = 'strict' , use_single_float = False ):
394
457
"""
395
458
Pack an object and return the packed bytes.
396
459
"""
@@ -400,7 +463,7 @@ def pack(o, default=encode,
400
463
use_single_float = use_single_float ).pack (o )
401
464
402
465
def unpack (packed , object_hook = decode ,
403
- list_hook = None , use_list = False , encoding = 'utf-8' ,
466
+ list_hook = None , use_list = False , encoding = None ,
404
467
unicode_errors = 'strict' , object_pairs_hook = None ):
405
468
"""
406
469
Unpack a packed object, return an iterator
@@ -417,7 +480,7 @@ def unpack(packed, object_hook=decode,
417
480
418
481
class Packer (_packer .Packer ):
419
482
def __init__ (self , default = encode ,
420
- encoding = 'utf-8' ,
483
+ encoding = None ,
421
484
unicode_errors = 'strict' ,
422
485
use_single_float = False ):
423
486
super (Packer , self ).__init__ (default = default ,
@@ -428,7 +491,7 @@ def __init__(self, default=encode,
428
491
class Unpacker (_unpacker .Unpacker ):
429
492
def __init__ (self , file_like = None , read_size = 0 , use_list = False ,
430
493
object_hook = decode ,
431
- object_pairs_hook = None , list_hook = None , encoding = 'utf-8' ,
494
+ object_pairs_hook = None , list_hook = None , encoding = None ,
432
495
unicode_errors = 'strict' , max_buffer_size = 0 ):
433
496
super (Unpacker , self ).__init__ (file_like = file_like ,
434
497
read_size = read_size ,
0 commit comments