
add compression support for 'read_pickle' and 'to_pickle' #13317


Closed
wants to merge 11 commits
8 changes: 6 additions & 2 deletions pandas/core/generic.py
@@ -1256,17 +1256,21 @@ def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
if_exists=if_exists, index=index, index_label=index_label,
chunksize=chunksize, dtype=dtype)

def to_pickle(self, path):
def to_pickle(self, path, compression='infer'):
"""
Pickle (serialize) object to input file path.

Parameters
----------
path : string
File path
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
Contributor

need a versionadded tag

a string representing the compression to use in the output file

.. versionadded:: 0.20.0
"""
from pandas.io.pickle import to_pickle
return to_pickle(self, path)
return to_pickle(self, path, compression=compression)

def to_clipboard(self, excel=None, sep=None, **kwargs):
"""
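For context, a minimal usage sketch of the API this diff adds (file names are illustrative; behaviour follows the docstrings in this PR, with compression='infer' as the default):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})

# explicit compression
df.to_pickle('frame.pkl.gz', compression='gzip')
df2 = pd.read_pickle('frame.pkl.gz', compression='gzip')

# compression='infer' (the default) picks gzip/bz2/xz from the file extension
df.to_pickle('frame.pkl.bz2')
df3 = pd.read_pickle('frame.pkl.bz2')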
28 changes: 20 additions & 8 deletions pandas/io/common.py
@@ -14,6 +14,7 @@

try:
from s3fs import S3File

Contributor

pls turn off whitespace changes, they will not pass linting

need_text_wrapping = (BytesIO, S3File)
except ImportError:
need_text_wrapping = (BytesIO,)
@@ -28,20 +29,21 @@

try:
import pathlib

_PATHLIB_INSTALLED = True
except ImportError:
_PATHLIB_INSTALLED = False


try:
from py.path import local as LocalPath

_PY_PATH_INSTALLED = True
except:
_PY_PATH_INSTALLED = False


if compat.PY3:
from urllib.request import urlopen, pathname2url

_urlopen = urlopen
from urllib.parse import urlparse as parse_url
from urllib.parse import (uses_relative, uses_netloc, uses_params,
@@ -58,13 +60,13 @@
from contextlib import contextmanager, closing # noqa
from functools import wraps # noqa


# @wraps(_urlopen)
@contextmanager
def urlopen(*args, **kwargs):
with closing(_urlopen(*args, **kwargs)) as f:
yield f


_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard('')

@@ -75,6 +77,7 @@ class ParserError(ValueError):
"""
pass


# gh-12665: Alias for now and remove later.
CParserError = ParserError

@@ -109,12 +112,14 @@ class BaseIterator(object):
"""Subclass this and provide a "__next__()" method to obtain an iterator.
Useful only when the object being iterated is non-reusable (e.g. OK for a
parser, not for an in-memory table, yes for its iterator)."""

def __iter__(self):
return self

def __next__(self):
raise AbstractMethodError(self)


if not compat.PY3:
BaseIterator.next = lambda self: self.__next__()

@@ -302,7 +307,7 @@ def _infer_compression(filepath_or_buffer, compression):


def _get_handle(path_or_buf, mode, encoding=None, compression=None,
memory_map=False):
memory_map=False, is_text=True):
"""
Get file handle for given path/buffer and mode.

@@ -317,7 +322,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
Supported compression protocols are gzip, bz2, zip, and xz
memory_map : boolean, default False
See parsers._parser_params for more information.

is_text : boolean, default True
whether file/buffer is in text format (csv, json, etc.), or in binary
mode (pickle, etc.)
Returns
-------
f : file-like
@@ -391,13 +398,17 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
elif encoding:
# Python 3 and encoding
f = open(path_or_buf, mode, encoding=encoding)
else:
elif is_text:
# Python 3 and no explicit encoding
f = open(path_or_buf, mode, errors='replace')
else:
# Python 3 and binary mode
f = open(path_or_buf, mode)
handles.append(f)

# in Python 3, convert BytesIO or fileobjects passed with an encoding
if compat.PY3 and (compression or isinstance(f, need_text_wrapping)):
if compat.PY3 and is_text and\
Contributor

Is there an opportunity to simplify things? What is the relationship between need_text_wrapping, compression, and is_text? I will think about this, but currently the logic for when TextIOWrapper gets applied is confusing.

Contributor Author

_get_handle needs to deal with various situations:

  • py2 or py3
  • binary (pickle, msgpack) or text (csv)
  • if text, what's the encoding
  • compression
  • memory map
  • open for read or write

maybe we can split _get_handle into two or more functions to make each single function simpler?

Contributor

I opened #15008: would love it if you could migrate your above comment to that issue. This way we can keep this pull request focussed and minimalist.

(compression or isinstance(f, need_text_wrapping)):
from io import TextIOWrapper
f = TextIOWrapper(f, encoding=encoding)
handles.append(f)
@@ -454,7 +465,6 @@ def __next__(self):


class UTF8Recoder(BaseIterator):

"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
@@ -477,6 +487,7 @@ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
# ignore encoding
return csv.reader(f, dialect=dialect, **kwds)


def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
return csv.writer(f, dialect=dialect, **kwds)
else:
@@ -498,6 +509,7 @@ def __next__(self):
row = next(self.reader)
return [compat.text_type(s, "utf-8") for s in row]


class UnicodeWriter:

"""
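To make the thread above about need_text_wrapping, compression, and is_text easier to follow, here is a rough sketch of the wrapping condition this diff introduces (my reading of the change; needs_text_wrapper is a hypothetical name, not part of the PR):

def needs_text_wrapper(is_py3, is_text, compression, handle, need_text_wrapping):
    # On Python 3, wrap the raw byte handle in TextIOWrapper only when the caller
    # asked for text and either a compression wrapper is involved or the handle is
    # a bytes-only object (BytesIO, S3File).
    return bool(is_py3 and is_text and
                (compression or isinstance(handle, need_text_wrapping)))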
60 changes: 49 additions & 11 deletions pandas/io/pickle.py
@@ -4,9 +4,10 @@
from numpy.lib.format import read_array, write_array
from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
from pandas.types.common import is_datetime64_dtype, _NS_DTYPE
from pandas.io.common import _get_handle, _infer_compression


def to_pickle(obj, path):
def to_pickle(obj, path, compression='infer'):
"""
Pickle (serialize) object to input file path

@@ -15,12 +16,23 @@ def to_pickle(obj, path):
obj : any object
path : string
File path
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
Contributor

add a versionadded

a string representing the compression to use in the output file

.. versionadded:: 0.20.0
"""
with open(path, 'wb') as f:
inferred_compression = _infer_compression(path, compression)
f, fh = _get_handle(path, 'wb',
compression=inferred_compression,
is_text=False)
try:
pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
Member

I think we should close the file handles in fh explicitly, instead of closing f by using it in a with statement

Member

and the same below in the read function

Contributor Author

do you mean

for h in fh:
    h.close()

I don't know if it is safe to close all handles in this way

Member

yes, something like that. In principle, all handles in fh are opened by pandas itself (not passed by the user) and should be able to be closed.

finally:
for _f in fh:
_f.close()


def read_pickle(path):
def read_pickle(path, compression='infer'):
"""
Load pickled pandas object (or any other pickled object) from the specified
file path
@@ -32,12 +44,21 @@ def read_pickle(path):
----------
path : string
Contributor

It shouldn't be too hard to get read_pickle to also support reading from URLs. @goldenbull -- not sure if you are also interested in adding this feature. Ultimately all read methods should support compression and URL reading.

Contributor Author

the original API doesn't support URL reading; this feature can be added in the future.

Member

Would be a nice enhancement (and you're welcome to work on this!), but let's leave that for another PR

Contributor

> let's leave that for another PR

It would fit well with a pull request to accomplish #15008.

File path
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2 or xz if path is a string ending in '.gz', '.bz2', or '.xz',
respectively, and no decompression otherwise.
Set to None for no decompression.

.. versionadded:: 0.20.0

Returns
-------
unpickled : type of object stored in file
"""

inferred_compression = _infer_compression(path, compression)

def try_read(path, encoding=None):
# try with cPickle
# try with current pickle, if we have a Type Error then
@@ -48,26 +69,43 @@ def try_read(path, encoding=None):
# cpickle
# GH 6899
Contributor

so these routines need to be changed to read in the file once e.g.

f, fh = _get_handle(....)
try:
    buffer = BytesIO(f.read())
finally:
    for _f in fh:
        _f.close()

then the operations are all

try:
    buffer.seek(0)
    pc.load(buffer, ...)
except:
    ...

etc. IOW, all we do is seek to the beginning of the buffer each time, rather than read the file in (potentially) 4 times.

try:
with open(path, 'rb') as fh:
return pkl.load(fh)
f, fh = _get_handle(path, 'rb',
compression=inferred_compression,
is_text=False)
try:
return pkl.load(f)
finally:
for _f in fh:
_f.close()
except Exception:
# reg/patched pickle
try:
with open(path, 'rb') as fh:
return pc.load(fh, encoding=encoding, compat=False)

f, fh = _get_handle(path, 'rb',
compression=inferred_compression,
is_text=False)
try:
return pc.load(f, encoding=encoding, compat=False)
finally:
for _f in fh:
_f.close()
# compat pickle
except:
with open(path, 'rb') as fh:
return pc.load(fh, encoding=encoding, compat=True)

f, fh = _get_handle(path, 'rb',
compression=inferred_compression,
is_text=False)
try:
return pc.load(f, encoding=encoding, compat=True)
finally:
for _f in fh:
_f.close()
try:
return try_read(path)
except:
if PY3:
return try_read(path, encoding='latin1')
raise


# compat with sparse pickle / unpickle


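As a follow-up to the review thread above, a minimal sketch of the suggested read-once pattern (a hypothetical helper, not part of this diff; it assumes the BytesIO and _get_handle imports already used in this module): the file is read and decompressed a single time, and each fallback unpickling attempt just rewinds the in-memory buffer.

from pandas.compat import BytesIO
from pandas.io.common import _get_handle

def _read_bytes_once(path, inferred_compression):
    # open, read, and close immediately so later fallbacks never re-open the file
    f, fh = _get_handle(path, 'rb',
                        compression=inferred_compression,
                        is_text=False)
    try:
        return BytesIO(f.read())
    finally:
        for _f in fh:
            _f.close()

# each attempt then rewinds the same buffer instead of re-reading the file, e.g.
#     buf = _read_bytes_once(path, inferred_compression)
#     buf.seek(0); pkl.load(buf)
#     buf.seek(0); pc.load(buf, encoding=encoding, compat=False)
#     buf.seek(0); pc.load(buf, encoding=encoding, compat=True)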
40 changes: 40 additions & 0 deletions pandas/io/tests/test_pickle.py
@@ -284,6 +284,46 @@ def test_pickle_v0_15_2(self):
#
tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path))

def compression_explicit(self, compression):
Contributor

add these in another class, TestPickleCompression

# issue 11666
if compression == 'xz':
tm._skip_if_no_lzma()
with tm.ensure_clean(self.path) as path:
Contributor

can you add the issue reference as a comment

df = tm.makeDataFrame()
df.to_pickle(path, compression=compression)
df2 = pd.read_pickle(path, compression=compression)
Contributor

I like these roundtrip pickle tests, but I'm worried about a failure scenario where both to_pickle and read_pickle are broken in the same way. Therefore, I think it makes sense to also test reading of pickles which were compressed externally. I.e. use the command line to compress a pickled file, track in repo, and use these files to test read_pickle.

Contributor Author

Yes, that will be more persuasive and reliable. One way is to add some prepared .pickle.xz file into pandas/io/tests/data, but reading these files may fail as the pickle version varies. Using the command line may be better but involves more environment configurations. I don't see a clear and clean way to do the test yet. Any suggestions?

Contributor

> Using the command line may be better but involves more environment configurations.

I was only suggesting using your local command line "to add some prepared .pickle.xz file into pandas/io/tests/data".

> reading these files may fail as the pickle version varies.

What about using a pickle with protocol version 2? pickle.load should automatically detect the version 2, which is supported in 2.7 and 3.

Contributor Author

🆗

tm.assert_frame_equal(df, df2)

def test_compression_explicit(self):
compressions = [None, 'gzip', 'bz2', 'xz']
Contributor

Do you also want to test/support single-file zip compression?

Contributor Author

Not this version, maybe next version. Actually, in most cases I don't think it's a good design to compress one pickle file into a zip file and then enforce that there is only one file in that zip. Maybe it's useful to compress multiple pickle files into one zip archive, but that's another story.

Contributor

> Actually, in most cases I don't think it's a good design to compress one pickle file into a zip file and then enforce that there is only one file in that zip.

Agreed, but _get_handle supports reading the single-file zip... so you're going to have this feature anyway. I personally would not use zip to compress a single file, but I have received datasets that use this method, so it's handy to be able to read them.

Currently, to_csv supports only ‘gzip’, ‘bz2’, ‘xz’ compression. I like this approach -- support reading, but do not make it easy to write.

So I'm not terribly concerned if you don't test for reading the zipped pickle, but since it will work, I think people may use it.

I will open an issue and tag you about moving to a more uniform compression API in the longer term.

Contributor Author

agreed, generous on input, strict on output

for c in compressions:
yield self.compression_explicit, c

def compression_explicit_bad(self, compression):
Contributor

I'm not sure it makes sense to test for invalid compression values or for compression inference. This is because you use _infer_compression, which gets tested for read_table. It seems repetitive to have every read method test compression inference now that we've consolidated things. Interested whether others agree?

with tm.assertRaisesRegexp(ValueError,
"Unrecognized compression type"):
with tm.ensure_clean(self.path) as path:
df = tm.makeDataFrame()
df.to_pickle(path, compression=compression)

def test_compression_explicit_bad(self):
compressions = ['', 'None', 'bad', '7z']
for c in compressions:
yield self.compression_explicit_bad, c

def compression_infer(self, ext):
if ext == '.xz':
tm._skip_if_no_lzma()
with tm.ensure_clean(self.path + ext) as path:
df = tm.makeDataFrame()
df.to_pickle(path)
tm.assert_frame_equal(df, pd.read_pickle(path))

def test_compression_infer(self):
extensions = ['', '.gz', '.bz2', '.xz', '.who_am_i']
for ext in extensions:
yield self.compression_infer, ext


if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
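Following up on the fixture discussion above, a minimal sketch (hypothetical, not part of this PR) of preparing an externally compressed test file for pandas/io/tests/data; protocol 2 is used because, as noted in the thread, it is readable on both Python 2.7 and 3:

import gzip
import pickle

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})
raw = pickle.dumps(df, protocol=2)  # protocol 2 loads on Python 2.7 and 3
with gzip.open('data.pickle.gz', 'wb') as fh:
    fh.write(raw)  # read back later with pd.read_pickle('data.pickle.gz')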