Commit f187514

add compression support for pickle
1 parent daba8e5 commit f187514

File tree

5 files changed: +89 -27 lines

    pandas/core/generic.py
    pandas/io/common.py
    pandas/io/parsers.py
    pandas/io/pickle.py
    pandas/io/tests/test_pickle.py

pandas/core/generic.py (+3, -2)

@@ -1200,17 +1200,18 @@ def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
                    if_exists=if_exists, index=index, index_label=index_label,
                    chunksize=chunksize, dtype=dtype)

-    def to_pickle(self, path):
+    def to_pickle(self, path, compression='infer'):
         """
         Pickle (serialize) object to input file path.

         Parameters
         ----------
         path : string
             File path
+        compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
         """
         from pandas.io.pickle import to_pickle
-        return to_pickle(self, path)
+        return to_pickle(self, path, compression)

     def to_clipboard(self, excel=None, sep=None, **kwargs):
         """

pandas/io/common.py (+39, -2)

@@ -285,8 +285,45 @@ def ZipFile(*args, **kwargs):
     ZipFile = zipfile.ZipFile


-def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
+def _get_inferred_compression(filepath_or_buffer, compression):
+    if compression == 'infer':
+        if isinstance(filepath_or_buffer, compat.string_types):
+            if filepath_or_buffer.endswith('.gz'):
+                inferred_compression = 'gzip'
+            elif filepath_or_buffer.endswith('.bz2'):
+                inferred_compression = 'bz2'
+            elif filepath_or_buffer.endswith('.zip'):
+                inferred_compression = 'zip'
+            elif filepath_or_buffer.endswith('.xz'):
+                inferred_compression = 'xz'
+            else:
+                inferred_compression = None
+        else:
+            inferred_compression = None
+    else:
+        inferred_compression = compression
+    return inferred_compression
+
+
+def _get_handle(path, mode, encoding=None, compression=None, memory_map=False, is_txt=True):
     """Gets file handle for given path and mode.
+
+    Parameters
+    ----------
+    path : string
+        file path
+    mode : string
+        mode to open file, like 'wb', 'rb', etc
+    encoding : string, default None
+        encoding for text file
+    compression : string, default None
+        {None, 'gzip', 'bz2', 'zip', 'xz'}
+    is_txt : bool, default True
+        True for text files (csv, json), False for binary files (pickle)
+
+    Returns
+    -------
+    opened file handle for I/O
     """
     if compression is not None:
         if encoding is not None and not compat.PY3:

@@ -320,7 +357,7 @@ def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
         else:
             raise ValueError('Unrecognized compression type: %s' %
                              compression)
-        if compat.PY3:
+        if compat.PY3 and is_txt:
             from io import TextIOWrapper
             f = TextIOWrapper(f, encoding=encoding)
         return f
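For reference, the new _get_inferred_compression helper resolves the compression argument as sketched below (illustrative calls; the paths are made up):

    import io
    from pandas.io.common import _get_inferred_compression

    _get_inferred_compression('data.pkl.gz', 'infer')   # -> 'gzip'
    _get_inferred_compression('data.pkl.xz', 'infer')   # -> 'xz'
    _get_inferred_compression('data.pkl', 'infer')      # -> None (unknown extension)
    _get_inferred_compression(io.BytesIO(), 'infer')    # -> None (not a string path)
    _get_inferred_compression('data.pkl', 'bz2')        # -> 'bz2' (explicit value passed through)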

pandas/io/parsers.py (+2, -16)

@@ -27,7 +27,7 @@
 from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
                               _get_handle, UnicodeReader, UTF8Recoder,
                               BaseIterator, CParserError, EmptyDataError,
-                              ParserWarning, _NA_VALUES)
+                              ParserWarning, _NA_VALUES, _get_inferred_compression)
 from pandas.tseries import tools

 from pandas.util.decorators import Appender

@@ -353,21 +353,7 @@ def _read(filepath_or_buffer, kwds):
     # extension. If we're reading from a URL, the `get_filepath_or_buffer`
     # will use header info to determine compression, so use what it finds in
     # that case.
-    inferred_compression = kwds.get('compression')
-    if inferred_compression == 'infer':
-        if isinstance(filepath_or_buffer, compat.string_types):
-            if filepath_or_buffer.endswith('.gz'):
-                inferred_compression = 'gzip'
-            elif filepath_or_buffer.endswith('.bz2'):
-                inferred_compression = 'bz2'
-            elif filepath_or_buffer.endswith('.zip'):
-                inferred_compression = 'zip'
-            elif filepath_or_buffer.endswith('.xz'):
-                inferred_compression = 'xz'
-            else:
-                inferred_compression = None
-        else:
-            inferred_compression = None
+    inferred_compression = _get_inferred_compression(filepath_or_buffer, kwds.get('compression'))

     filepath_or_buffer, _, compression = get_filepath_or_buffer(
         filepath_or_buffer, encoding,
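The read_csv path keeps its previous behaviour and simply routes through the shared helper. For example, assuming a gzip-compressed CSV at the hypothetical path 'data.csv.gz':

    import pandas as pd

    # compression='infer' is read_csv's default in this version of pandas,
    # so the '.gz' suffix alone is enough
    df = pd.read_csv('data.csv.gz')

    # equivalent explicit form
    df = pd.read_csv('data.csv.gz', compression='gzip')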

pandas/io/pickle.py (+23, -7)

@@ -4,9 +4,10 @@
 from numpy.lib.format import read_array, write_array
 from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
 from pandas.types.common import is_datetime64_dtype, _NS_DTYPE
+from pandas.io.common import _get_handle, _get_inferred_compression


-def to_pickle(obj, path):
+def to_pickle(obj, path, compression='infer'):
     """
     Pickle (serialize) object to input file path


@@ -15,12 +16,18 @@ def to_pickle(obj, path):
     obj : any object
     path : string
         File path
+    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
     """
-    with open(path, 'wb') as f:
+    inferred_compression = _get_inferred_compression(path, compression)
+    if inferred_compression:
+        f = _get_handle(path, 'wb', compression=inferred_compression, is_txt=False)
+    else:
+        f = open(path, 'wb')
+    with f:
         pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)


-def read_pickle(path):
+def read_pickle(path, compression='infer'):
     """
     Load pickled pandas object (or any other pickled object) from the specified
     file path

@@ -32,12 +39,21 @@ def read_pickle(path):
     ----------
     path : string
         File path
+    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'

     Returns
     -------
     unpickled : type of object stored in file
     """

+    inferred_compression = _get_inferred_compression(path, compression)
+
+    def openfile():
+        if inferred_compression:
+            return _get_handle(path, 'rb', compression=inferred_compression, is_txt=False)
+        else:
+            return open(path, 'rb')
+
     def try_read(path, encoding=None):
         # try with cPickle
         # try with current pickle, if we have a Type Error then

@@ -48,17 +64,17 @@ def try_read(path, encoding=None):
         # cpickle
         # GH 6899
         try:
-            with open(path, 'rb') as fh:
-                return pkl.load(fh)
+            with openfile() as f:
+                return pkl.load(f)
         except Exception:
             # reg/patched pickle
             try:
-                with open(path, 'rb') as fh:
+                with openfile() as fh:
                     return pc.load(fh, encoding=encoding, compat=False)

             # compat pickle
             except:
-                with open(path, 'rb') as fh:
+                with openfile() as fh:
                     return pc.load(fh, encoding=encoding, compat=True)

     try:
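Since inference only looks at the path, an extension that carries no hint needs the codec spelled out on both sides. A quick sketch of that case (the '.bin' path is illustrative):

    import pandas as pd
    from pandas.io.pickle import to_pickle, read_pickle

    df = pd.DataFrame({'a': range(3)})

    # '.bin' is not a recognised suffix, so inference yields None;
    # pass the codec explicitly when writing and when reading back
    to_pickle(df, 'df.bin', compression='xz')
    df2 = read_pickle('df.bin', compression='xz')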

pandas/io/tests/test_pickle.py (+22)

@@ -284,6 +284,28 @@ def test_pickle_v0_15_2(self):
         #
         tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path))

+    def compression_explicit(self, compression):
+        with tm.ensure_clean(self.path) as path:
+            df = tm.makeDataFrame()
+            df.to_pickle(path, compression)
+            tm.assert_frame_equal(df, pandas.read_pickle(path, compression))
+
+    def test_compression_explicit(self):
+        compressions = [None, 'gzip', 'bz2', 'xz']
+        for c in compressions:
+            yield self.compression_explicit, c
+
+    def compression_infer(self, ext):
+        with tm.ensure_clean(self.path + ext) as p:
+            df = tm.makeDataFrame()
+            df.to_pickle(p)
+            tm.assert_frame_equal(df, pandas.read_pickle(p))
+
+    def test_compression_infer(self):
+        extensions = ['', '.gz', '.bz2', '.xz']
+        for ext in extensions:
+            yield self.compression_infer, ext
+

 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
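The new tests are nose-style generators, so each yield runs as its own case. One inferred-compression case is roughly equivalent to this standalone round-trip (a sketch using pandas' test utilities directly; the file name is made up):

    import pandas
    import pandas.util.testing as tm

    df = tm.makeDataFrame()
    with tm.ensure_clean('__compressed__.pkl.gz') as path:
        df.to_pickle(path)  # gzip inferred from the '.gz' suffix
        tm.assert_frame_equal(df, pandas.read_pickle(path))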
