pickle compression code update

goldenbull · goldenbull · commit 6df661101f05 · 2016-12-29T10:52:46.000+08:00
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -1265,7 +1265,9 @@ def to_pickle(self, path, compression='infer'):
         path : string
             File path
         compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
-            .. versionadded:: 0.19.2
+            A string representing the compression to use in the output file.
+
+            .. versionadded:: 0.20.0
         """
         from pandas.io.pickle import to_pickle
         return to_pickle(self, path, compression=compression)
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -14,6 +14,7 @@
 
 try:
     from s3fs import S3File
+
     need_text_wrapping = (BytesIO, S3File)
 except ImportError:
     need_text_wrapping = (BytesIO,)
@@ -28,20 +29,21 @@
 
 try:
     import pathlib
+
     _PATHLIB_INSTALLED = True
 except ImportError:
     _PATHLIB_INSTALLED = False
 
-
 try:
     from py.path import local as LocalPath
+
     _PY_PATH_INSTALLED = True
 except:
     _PY_PATH_INSTALLED = False
 
-
 if compat.PY3:
     from urllib.request import urlopen, pathname2url
+
     _urlopen = urlopen
     from urllib.parse import urlparse as parse_url
     from urllib.parse import (uses_relative, uses_netloc, uses_params,
@@ -58,13 +60,13 @@
     from contextlib import contextmanager, closing  # noqa
     from functools import wraps  # noqa
 
+
     # @wraps(_urlopen)
     @contextmanager
     def urlopen(*args, **kwargs):
         with closing(_urlopen(*args, **kwargs)) as f:
             yield f
 
-
 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
 _VALID_URLS.discard('')
 
@@ -75,6 +77,7 @@ class ParserError(ValueError):
     """
     pass
 
+
 # gh-12665: Alias for now and remove later.
 CParserError = ParserError
 
@@ -109,12 +112,14 @@ class BaseIterator(object):
     """Subclass this and provide a "__next__()" method to obtain an iterator.
     Useful only when the object being iterated is non-reusable (e.g. OK for a
     parser, not for an in-memory table, yes for its iterator)."""
+
     def __iter__(self):
         return self
 
     def __next__(self):
         raise AbstractMethodError(self)
 
+
 if not compat.PY3:
     BaseIterator.next = lambda self: self.__next__()
 
@@ -318,7 +323,8 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
     memory_map : boolean, default False
         See parsers._parser_params for more information.
     is_text : boolean, default True
-        whether file/buffer is in text format (csv, json, etc.), or in binary mode (pickle, etc.)
+        whether file/buffer is in text format (csv, json, etc.), or in binary
+        mode (pickle, etc.)
     Returns
     -------
     f : file-like
@@ -401,7 +407,8 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
         handles.append(f)
 
     # in Python 3, convert BytesIO or fileobjects passed with an encoding
-    if compat.PY3 and is_text and (compression or isinstance(f, need_text_wrapping)):
+    if compat.PY3 and is_text and\
+            (compression or isinstance(f, need_text_wrapping)):
         from io import TextIOWrapper
         f = TextIOWrapper(f, encoding=encoding)
         handles.append(f)
@@ -458,7 +465,6 @@ def __next__(self):
 
 
 class UTF8Recoder(BaseIterator):
-
     """
     Iterator that reads an encoded stream and reencodes the input to UTF-8
     """
@@ -481,6 +487,7 @@ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
         # ignore encoding
         return csv.reader(f, dialect=dialect, **kwds)
 
+
     def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
         return csv.writer(f, dialect=dialect, **kwds)
 else:
@@ -502,6 +509,7 @@ def __next__(self):
             row = next(self.reader)
             return [compat.text_type(s, "utf-8") for s in row]
 
+
     class UnicodeWriter:
 
         """
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
@@ -17,12 +17,19 @@ def to_pickle(obj, path, compression='infer'):
     path : string
         File path
     compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
-        .. versionadded:: 0.19.2
+        A string representing the compression to use in the output file.
+
+        .. versionadded:: 0.20.0
     """
     inferred_compression = _infer_compression(path, compression)
-    f, fh = _get_handle(path, 'wb', compression=inferred_compression, is_text=False)
-    with f:
+    f, fh = _get_handle(path, 'wb',
+                        compression=inferred_compression,
+                        is_text=False)
+    try:
         pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
+    finally:
+        for _f in fh:
+            _f.close()
 
 
 def read_pickle(path, compression='infer'):
@@ -38,7 +45,12 @@ def read_pickle(path, compression='infer'):
     path : string
         File path
     compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
-        .. versionadded:: 0.19.2
+        For on-the-fly decompression of on-disk data. If 'infer', then use
+        gzip, bz2 or xz if path is a string ending in '.gz', '.bz2', or '.xz',
+        respectively, and no decompression otherwise.
+        Set to None for no decompression.
+
+        .. versionadded:: 0.20.0
 
     Returns
     -------
@@ -57,22 +69,35 @@ def try_read(path, encoding=None):
         # cpickle
         # GH 6899
         try:
-            f, fh = _get_handle(path, 'rb', compression=inferred_compression, is_text=False)
-            with f:
+            f, fh = _get_handle(path, 'rb',
+                                compression=inferred_compression,
+                                is_text=False)
+            try:
                 return pkl.load(f)
+            finally:
+                for _f in fh:
+                    _f.close()
         except Exception:
             # reg/patched pickle
             try:
-                f, fh = _get_handle(path, 'rb', compression=inferred_compression, is_text=False)
-                with f:
+                f, fh = _get_handle(path, 'rb',
+                                    compression=inferred_compression,
+                                    is_text=False)
+                try:
                     return pc.load(f, encoding=encoding, compat=False)
-
+                finally:
+                    for _f in fh:
+                        _f.close()
             # compat pickle
             except:
-                f, fh = _get_handle(path, 'rb', compression=inferred_compression, is_text=False)
-                with f:
+                f, fh = _get_handle(path, 'rb',
+                                    compression=inferred_compression,
+                                    is_text=False)
+                try:
                     return pc.load(f, encoding=encoding, compat=True)
-
+                finally:
+                    for _f in fh:
+                        _f.close()
     try:
         return try_read(path)
     except:
diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py
@@ -286,18 +286,22 @@ def test_pickle_v0_15_2(self):
 
     def compression_explicit(self, compression):
         # issue 11666
+        if compression == 'xz':
+            tm._skip_if_no_lzma()
         with tm.ensure_clean(self.path) as path:
             df = tm.makeDataFrame()
             df.to_pickle(path, compression=compression)
-            tm.assert_frame_equal(df, pandas.read_pickle(path, compression=compression))
+            df2 = pd.read_pickle(path, compression=compression)
+            tm.assert_frame_equal(df, df2)
 
     def test_compression_explicit(self):
         compressions = [None, 'gzip', 'bz2', 'xz']
         for c in compressions:
             yield self.compression_explicit, c
 
     def compression_explicit_bad(self, compression):
-        with tm.assertRaisesRegexp(ValueError, "Unrecognized compression type"):
+        with tm.assertRaisesRegexp(ValueError,
+                                   "Unrecognized compression type"):
             with tm.ensure_clean(self.path) as path:
                 df = tm.makeDataFrame()
                 df.to_pickle(path, compression=compression)
@@ -308,10 +312,12 @@ def test_compression_explicit_bad(self):
             yield self.compression_explicit_bad, c
 
     def compression_infer(self, ext):
-        with tm.ensure_clean(self.path + ext) as p:
+        if ext == '.xz':
+            tm._skip_if_no_lzma()
+        with tm.ensure_clean(self.path + ext) as path:
             df = tm.makeDataFrame()
-            df.to_pickle(p)
-            tm.assert_frame_equal(df, pandas.read_pickle(p))
+            df.to_pickle(path)
+            tm.assert_frame_equal(df, pd.read_pickle(path))
 
     def test_compression_infer(self):
         extensions = ['', '.gz', '.bz2', '.xz', '.who_am_i']