Add zip decompression support to read_pickle; refactor the repeated handle open/close code into a lambda-based wrapper.

goldenbull · goldenbull · commit 1cb810bf8ba2 · 2016-12-30T15:28:23.000+08:00
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -14,7 +14,6 @@
 
 try:
     from s3fs import S3File
-
     need_text_wrapping = (BytesIO, S3File)
 except ImportError:
     need_text_wrapping = (BytesIO,)
@@ -29,21 +28,20 @@
 
 try:
     import pathlib
-
     _PATHLIB_INSTALLED = True
 except ImportError:
     _PATHLIB_INSTALLED = False
 
+
 try:
     from py.path import local as LocalPath
-
     _PY_PATH_INSTALLED = True
 except:
     _PY_PATH_INSTALLED = False
 
+
 if compat.PY3:
     from urllib.request import urlopen, pathname2url
-
     _urlopen = urlopen
     from urllib.parse import urlparse as parse_url
     from urllib.parse import (uses_relative, uses_netloc, uses_params,
@@ -60,13 +58,13 @@
     from contextlib import contextmanager, closing  # noqa
     from functools import wraps  # noqa
 
-
     # @wraps(_urlopen)
     @contextmanager
     def urlopen(*args, **kwargs):
         with closing(_urlopen(*args, **kwargs)) as f:
             yield f
 
+
 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
 _VALID_URLS.discard('')
 
@@ -77,7 +75,6 @@ class ParserError(ValueError):
     """
     pass
 
-
 # gh-12665: Alias for now and remove later.
 CParserError = ParserError
 
@@ -112,14 +109,12 @@ class BaseIterator(object):
     """Subclass this and provide a "__next__()" method to obtain an iterator.
     Useful only when the object being iterated is non-reusable (e.g. OK for a
     parser, not for an in-memory table, yes for its iterator)."""
-
     def __iter__(self):
         return self
 
     def __next__(self):
         raise AbstractMethodError(self)
 
-
 if not compat.PY3:
     BaseIterator.next = lambda self: self.__next__()
 
@@ -465,6 +460,7 @@ def __next__(self):
 
 
 class UTF8Recoder(BaseIterator):
+
     """
     Iterator that reads an encoded stream and reencodes the input to UTF-8
     """
@@ -487,7 +483,6 @@ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
         # ignore encoding
         return csv.reader(f, dialect=dialect, **kwds)
 
-
     def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
         return csv.writer(f, dialect=dialect, **kwds)
 else:
@@ -509,7 +504,6 @@ def __next__(self):
             row = next(self.reader)
             return [compat.text_type(s, "utf-8") for s in row]
 
-
     class UnicodeWriter:
 
         """
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
@@ -44,10 +44,10 @@ def read_pickle(path, compression='infer'):
     ----------
     path : string
         File path
-    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
+    compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
         For on-the-fly decompression of on-disk data. If 'infer', then use
-        gzip, bz2 or xz if path is a string ending in '.gz', '.bz2', or 'xz',
-        respectively, and no decompression otherwise.
+        gzip, bz2, xz or zip if path is a string ending in '.gz', '.bz2',
+        '.xz', or '.zip' respectively, and no decompression otherwise.
         Set to None for no decompression.
 
         .. versionadded:: 0.20.0
@@ -59,6 +59,17 @@ def read_pickle(path, compression='infer'):
 
     inferred_compression = _infer_compression(path, compression)
 
+    def read_wrapper(func):
+        # open the file handle(s), apply func, and always close them
+        f, fh = _get_handle(path, 'rb',
+                            compression=inferred_compression,
+                            is_text=False)
+        try:
+            return func(f)
+        finally:
+            for _f in fh:
+                _f.close()
+
     def try_read(path, encoding=None):
         # try with cPickle
         # try with current pickle, if we have a Type Error then
@@ -69,35 +80,16 @@ def try_read(path, encoding=None):
         # cpickle
         # GH 6899
         try:
-            f, fh = _get_handle(path, 'rb',
-                                compression=inferred_compression,
-                                is_text=False)
-            try:
-                return pkl.load(f)
-            finally:
-                for _f in fh:
-                    _f.close()
+            return read_wrapper(lambda f: pkl.load(f))
         except Exception:
             # reg/patched pickle
             try:
-                f, fh = _get_handle(path, 'rb',
-                                    compression=inferred_compression,
-                                    is_text=False)
-                try:
-                    return pc.load(f, encoding=encoding, compat=False)
-                finally:
-                    for _f in fh:
-                        _f.close()
+                return read_wrapper(
+                    lambda f: pc.load(f, encoding=encoding, compat=False))
             # compat pickle
             except:
-                f, fh = _get_handle(path, 'rb',
-                                    compression=inferred_compression,
-                                    is_text=False)
-                try:
-                    return pc.load(f, encoding=encoding, compat=True)
-                finally:
-                    for _f in fh:
-                        _f.close()
+                return read_wrapper(
+                    lambda f: pc.load(f, encoding=encoding, compat=True))
     try:
         return try_read(path)
     except:
diff --git a/pandas/io/tests/data/pickle_compression/data.pickle.zip b/pandas/io/tests/data/pickle_compression/data.pickle.zip
diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py
@@ -324,7 +324,9 @@ def test_compression_infer(self):
         for ext in extensions:
             yield self.compression_infer, ext
 
-    def compression_prepared_data(self, ext):
+    def decompression_prepared_data(self, ext):
+        if ext == '.xz':
+            tm._skip_if_no_lzma()
         pickle_path = os.path.join(tm.get_data_path(),
                                    'pickle_compression',
                                    'data.pickle')
@@ -333,10 +335,10 @@ def compression_prepared_data(self, ext):
         data2 = pd.read_pickle(compressed_path)
         tm.assert_frame_equal(data1, data2)
 
-    def test_compression_prepared_data(self):
-        extensions = ['.gz', '.bz2', '.xz']
+    def test_decompression_prepared_data(self):
+        extensions = ['.gz', '.bz2', '.xz', '.zip']
         for ext in extensions:
-            yield self.compression_prepared_data, ext
+            yield self.decompression_prepared_data, ext
 
 
 if __name__ == '__main__':
diff --git a/setup.py b/setup.py
@@ -660,6 +660,7 @@ def pxd(name):
       package_data={'pandas.io': ['tests/data/legacy_hdf/*.h5',
                                   'tests/data/legacy_pickle/*/*.pickle',
                                   'tests/data/legacy_msgpack/*/*.msgpack',
+                                  'tests/data/pickle_compression/*',
                                   'tests/data/*.csv*',
                                   'tests/data/*.dta',
                                   'tests/data/*.pickle',