Skip to content

Commit 025a0cd

Browse files
committed
add compression support for pickle
1 parent e503d40 commit 025a0cd

File tree

4 files changed

+68
-15
lines changed

4 files changed

+68
-15
lines changed

pandas/core/generic.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -1256,17 +1256,19 @@ def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
12561256
if_exists=if_exists, index=index, index_label=index_label,
12571257
chunksize=chunksize, dtype=dtype)
12581258

1259-
def to_pickle(self, path):
1259+
def to_pickle(self, path, compression='infer'):
12601260
"""
12611261
Pickle (serialize) object to input file path.
12621262
12631263
Parameters
12641264
----------
12651265
path : string
12661266
File path
1267+
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
1268+
.. versionadded:: 0.19.2
12671269
"""
12681270
from pandas.io.pickle import to_pickle
1269-
return to_pickle(self, path)
1271+
return to_pickle(self, path, compression=compression)
12701272

12711273
def to_clipboard(self, excel=None, sep=None, **kwargs):
12721274
"""

pandas/io/common.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@ def _infer_compression(filepath_or_buffer, compression):
296296

297297

298298
def _get_handle(path_or_buf, mode, encoding=None, compression=None,
299-
memory_map=False):
299+
memory_map=False, is_text=True):
300300
"""
301301
Get file handle for given path/buffer and mode.
302302
@@ -311,7 +311,8 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
311311
Supported compression protocols are gzip, bz2, zip, and xz
312312
memory_map : boolean, default False
313313
See parsers._parser_params for more information.
314-
314+
is_text : boolean, default True
315+
Whether the file/buffer is in text format (csv, json, etc.) or in binary format (pickle, etc.).
315316
Returns
316317
-------
317318
f : file-like
@@ -385,13 +386,16 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
385386
elif encoding:
386387
# Python 3 and encoding
387388
f = open(path_or_buf, mode, encoding=encoding)
388-
else:
389+
elif is_text:
389390
# Python 3 and no explicit encoding
390391
f = open(path_or_buf, mode, errors='replace')
392+
else:
393+
# Python 3 and binary mode
394+
f = open(path_or_buf, mode)
391395
handles.append(f)
392396

393397
# in Python 3, convert BytesIO or fileobjects passed with an encoding
394-
if compat.PY3 and (compression or isinstance(f, compat.BytesIO)):
398+
if compat.PY3 and is_text and (compression or isinstance(f, compat.BytesIO)):
395399
from io import TextIOWrapper
396400
f = TextIOWrapper(f, encoding=encoding)
397401
handles.append(f)

pandas/io/pickle.py

+22-9
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,10 @@
44
from numpy.lib.format import read_array, write_array
55
from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
66
from pandas.types.common import is_datetime64_dtype, _NS_DTYPE
7+
from pandas.io.common import _get_handle, _infer_compression
78

89

9-
def to_pickle(obj, path):
10+
def to_pickle(obj, path, compression='infer'):
1011
"""
1112
Pickle (serialize) object to input file path
1213
@@ -15,12 +16,16 @@ def to_pickle(obj, path):
1516
obj : any object
1617
path : string
1718
File path
19+
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
20+
.. versionadded:: 0.19.2
1821
"""
19-
with open(path, 'wb') as f:
22+
inferred_compression = _infer_compression(path, compression)
23+
f, fh = _get_handle(path, 'wb', compression=inferred_compression, is_text=False)
24+
with f:
2025
pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
2126

2227

23-
def read_pickle(path):
28+
def read_pickle(path, compression='infer'):
2429
"""
2530
Load pickled pandas object (or any other pickled object) from the specified
2631
file path
@@ -32,12 +37,16 @@ def read_pickle(path):
3237
----------
3338
path : string
3439
File path
40+
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
41+
.. versionadded:: 0.19.2
3542
3643
Returns
3744
-------
3845
unpickled : type of object stored in file
3946
"""
4047

48+
inferred_compression = _infer_compression(path, compression)
49+
4150
def try_read(path, encoding=None):
4251
# try with cPickle
4352
# try with current pickle, if we have a Type Error then
@@ -48,18 +57,21 @@ def try_read(path, encoding=None):
4857
# cpickle
4958
# GH 6899
5059
try:
51-
with open(path, 'rb') as fh:
52-
return pkl.load(fh)
60+
f, fh = _get_handle(path, 'rb', compression=inferred_compression, is_text=False)
61+
with f:
62+
return pkl.load(f)
5363
except Exception:
5464
# reg/patched pickle
5565
try:
56-
with open(path, 'rb') as fh:
57-
return pc.load(fh, encoding=encoding, compat=False)
66+
f, fh = _get_handle(path, 'rb', compression=inferred_compression, is_text=False)
67+
with f:
68+
return pc.load(f, encoding=encoding, compat=False)
5869

5970
# compat pickle
6071
except:
61-
with open(path, 'rb') as fh:
62-
return pc.load(fh, encoding=encoding, compat=True)
72+
f, fh = _get_handle(path, 'rb', compression=inferred_compression, is_text=False)
73+
with f:
74+
return pc.load(f, encoding=encoding, compat=True)
6375

6476
try:
6577
return try_read(path)
@@ -68,6 +80,7 @@ def try_read(path, encoding=None):
6880
return try_read(path, encoding='latin1')
6981
raise
7082

83+
7184
# compat with sparse pickle / unpickle
7285

7386

pandas/io/tests/test_pickle.py

+34
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,40 @@ def test_pickle_v0_15_2(self):
284284
#
285285
tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
286286

287+
def compression_explicit(self, compression):
    """Round-trip a DataFrame through a pickle with an explicit codec.

    The same ``compression`` value is handed to both ``to_pickle`` and
    ``read_pickle``; the frame read back must equal the one written.

    Parameters
    ----------
    compression : value forwarded unchanged as the ``compression`` keyword
    """
    # issue 11666
    with tm.ensure_clean(self.path) as tmp_path:
        expected = tm.makeDataFrame()
        expected.to_pickle(tmp_path, compression=compression)
        result = pandas.read_pickle(tmp_path, compression=compression)
        tm.assert_frame_equal(expected, result)
293+
294+
def test_compression_explicit(self):
    """Yield one ``compression_explicit`` check per explicit codec value."""
    # NOTE(review): 'xz' presumably needs lzma support in the running
    # interpreter -- confirm whether a skip is required where it is missing.
    for codec in (None, 'gzip', 'bz2', 'xz'):
        yield self.compression_explicit, codec
298+
299+
def compression_explicit_bad(self, compression):
    """Check that an unsupported ``compression`` value is rejected.

    Writing the pickle is expected to raise ``ValueError`` whose message
    matches "Unrecognized compression type".

    Parameters
    ----------
    compression : value forwarded unchanged as the ``compression`` keyword
    """
    expected_msg = "Unrecognized compression type"
    with tm.assertRaisesRegexp(ValueError, expected_msg):
        with tm.ensure_clean(self.path) as tmp_path:
            frame = tm.makeDataFrame()
            frame.to_pickle(tmp_path, compression=compression)
304+
305+
def test_compression_explicit_bad(self):
    """Yield one ``compression_explicit_bad`` check per invalid codec value."""
    for bad_codec in ('', 'None', 'bad', '7z'):
        yield self.compression_explicit_bad, bad_codec
309+
310+
def compression_infer(self, ext):
    """Round-trip a DataFrame without passing ``compression`` explicitly.

    ``ext`` is appended to ``self.path`` to form the file name; both
    ``to_pickle`` and ``read_pickle`` are called with their default
    ``compression`` argument, and the frame read back must equal the one
    written.

    Parameters
    ----------
    ext : string
        Suffix appended to ``self.path``
    """
    target = self.path + ext
    with tm.ensure_clean(target) as tmp_path:
        expected = tm.makeDataFrame()
        expected.to_pickle(tmp_path)
        result = pandas.read_pickle(tmp_path)
        tm.assert_frame_equal(expected, result)
315+
316+
def test_compression_infer(self):
    """Yield one ``compression_infer`` check per file-name suffix."""
    # NOTE(review): '.xz' presumably needs lzma support in the running
    # interpreter -- confirm whether a skip is required where it is missing.
    for suffix in ('', '.gz', '.bz2', '.xz', '.who_am_i'):
        yield self.compression_infer, suffix
320+
287321

288322
if __name__ == '__main__':
289323
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)