Skip to content

Commit 189dd8e

Browse files
mingglijavadnoorb
authored and committed
EHN: allow zip compression in to_pickle, to_json, to_csv (pandas-dev#20394)
1 parent 3a76199 commit 189dd8e

File tree

13 files changed

+86
-86
lines changed

13 files changed

+86
-86
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,7 @@ Other Enhancements
344344
- :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports it, rather than inserting row by row.
345345
``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`)
346346
- :func:`read_html` now accepts a ``displayed_only`` keyword argument to control whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`)
347+
- zip compression is supported via ``compression='zip'`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`)
347348

348349
.. _whatsnew_0230.api_breaking:
349350

pandas/conftest.py

-10
Original file line numberDiff line numberDiff line change
@@ -75,16 +75,6 @@ def compression(request):
7575
return request.param
7676

7777

78-
@pytest.fixture(params=[None, 'gzip', 'bz2',
79-
pytest.param('xz', marks=td.skip_if_no_lzma)])
80-
def compression_no_zip(request):
81-
"""
82-
Fixture for trying common compression types in compression tests
83-
except zip
84-
"""
85-
return request.param
86-
87-
8878
@pytest.fixture(scope='module')
8979
def datetime_tz_utc():
9080
from datetime import timezone

pandas/core/frame.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1654,9 +1654,9 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
16541654
A string representing the encoding to use in the output file,
16551655
defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
16561656
compression : string, optional
1657-
a string representing the compression to use in the output file,
1658-
allowed values are 'gzip', 'bz2', 'xz',
1659-
only used when the first argument is a filename
1657+
A string representing the compression to use in the output file.
1658+
Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
1659+
used when the first argument is a filename.
16601660
line_terminator : string, default ``'\n'``
16611661
The newline character or character sequence to use in the output
16621662
file

pandas/core/generic.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -1814,9 +1814,9 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,
18141814
18151815
.. versionadded:: 0.19.0
18161816
1817-
compression : {None, 'gzip', 'bz2', 'xz'}
1817+
compression : {None, 'gzip', 'bz2', 'zip', 'xz'}
18181818
A string representing the compression to use in the output file,
1819-
only used when the first argument is a filename
1819+
only used when the first argument is a filename.
18201820
18211821
.. versionadded:: 0.21.0
18221822
@@ -2133,7 +2133,8 @@ def to_pickle(self, path, compression='infer',
21332133
----------
21342134
path : str
21352135
File path where the pickled object will be stored.
2136-
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
2136+
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, \
2137+
default 'infer'
21372138
A string representing the compression to use in the output file. By
21382139
default, infers from the file extension in specified path.
21392140

pandas/core/series.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -3633,9 +3633,9 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='',
36333633
a string representing the encoding to use if the contents are
36343634
non-ascii, for python versions prior to 3
36353635
compression : string, optional
3636-
a string representing the compression to use in the output file,
3637-
allowed values are 'gzip', 'bz2', 'xz', only used when the first
3638-
argument is a filename
3636+
A string representing the compression to use in the output file.
3637+
Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
3638+
used when the first argument is a filename.
36393639
date_format: string, default None
36403640
Format string for datetime objects.
36413641
decimal: string, default '.'

pandas/io/common.py

+33-12
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import codecs
66
import mmap
77
from contextlib import contextmanager, closing
8+
from zipfile import ZipFile
89

910
from pandas.compat import StringIO, BytesIO, string_types, text_type
1011
from pandas import compat
@@ -363,18 +364,20 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
363364

364365
# ZIP Compression
365366
elif compression == 'zip':
366-
import zipfile
367-
zip_file = zipfile.ZipFile(path_or_buf)
368-
zip_names = zip_file.namelist()
369-
if len(zip_names) == 1:
370-
f = zip_file.open(zip_names.pop())
371-
elif len(zip_names) == 0:
372-
raise ValueError('Zero files found in ZIP file {}'
373-
.format(path_or_buf))
374-
else:
375-
raise ValueError('Multiple files found in ZIP file.'
376-
' Only one file per ZIP: {}'
377-
.format(zip_names))
367+
zf = BytesZipFile(path_or_buf, mode)
368+
if zf.mode == 'w':
369+
f = zf
370+
elif zf.mode == 'r':
371+
zip_names = zf.namelist()
372+
if len(zip_names) == 1:
373+
f = zf.open(zip_names.pop())
374+
elif len(zip_names) == 0:
375+
raise ValueError('Zero files found in ZIP file {}'
376+
.format(path_or_buf))
377+
else:
378+
raise ValueError('Multiple files found in ZIP file.'
379+
' Only one file per ZIP: {}'
380+
.format(zip_names))
378381

379382
# XZ Compression
380383
elif compression == 'xz':
@@ -425,6 +428,24 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
425428
return f, handles
426429

427430

431+
class BytesZipFile(ZipFile, BytesIO):
432+
"""
433+
Wrapper for standard library class ZipFile and allow the returned file-like
434+
handle to accept byte strings via `write` method.
435+
436+
BytesIO provides attributes of file-like object and ZipFile.writestr writes
437+
bytes strings into a member of the archive.
438+
"""
439+
# GH 17778
440+
def __init__(self, file, mode='r', **kwargs):
441+
if mode in ['wb', 'rb']:
442+
mode = mode.replace('b', '')
443+
super(BytesZipFile, self).__init__(file, mode, **kwargs)
444+
445+
def write(self, data):
446+
super(BytesZipFile, self).writestr(self.filename, data)
447+
448+
428449
class MMapWrapper(BaseIterator):
429450
"""
430451
Wrapper for the Python's mmap class so that it can be properly read in

pandas/io/formats/csvs.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,8 @@ def save(self):
133133
else:
134134
f, handles = _get_handle(self.path_or_buf, self.mode,
135135
encoding=encoding,
136-
compression=self.compression)
137-
close = True
136+
compression=None)
137+
close = True if self.compression is None else False
138138

139139
try:
140140
writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -151,6 +151,16 @@ def save(self):
151151
self._save()
152152

153153
finally:
154+
# GH 17778 handles compression for byte strings.
155+
if not close and self.compression:
156+
f.close()
157+
with open(self.path_or_buf, 'r') as f:
158+
data = f.read()
159+
f, handles = _get_handle(self.path_or_buf, self.mode,
160+
encoding=encoding,
161+
compression=self.compression)
162+
f.write(data)
163+
close = True
154164
if close:
155165
f.close()
156166

pandas/io/pickle.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
1818
Any python object.
1919
path : str
2020
File path where the pickled object will be stored.
21-
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
21+
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
2222
A string representing the compression to use in the output file. By
2323
default, infers from the file extension in specified path.
2424
@@ -74,7 +74,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
7474
if protocol < 0:
7575
protocol = pkl.HIGHEST_PROTOCOL
7676
try:
77-
pkl.dump(obj, f, protocol=protocol)
77+
f.write(pkl.dumps(obj, protocol=protocol))
7878
finally:
7979
for _f in fh:
8080
_f.close()
@@ -93,7 +93,7 @@ def read_pickle(path, compression='infer'):
9393
----------
9494
path : str
9595
File path where the pickled object will be loaded.
96-
compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
96+
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
9797
For on-the-fly decompression of on-disk data. If 'infer', then use
9898
gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
9999
or '.zip' respectively, and no decompression otherwise.

pandas/tests/frame/test_to_csv.py

+5-18
Original file line numberDiff line numberDiff line change
@@ -919,43 +919,30 @@ def test_to_csv_path_is_none(self):
919919
recons = pd.read_csv(StringIO(csv_str), index_col=0)
920920
assert_frame_equal(self.frame, recons)
921921

922-
def test_to_csv_compression(self, compression_no_zip):
922+
def test_to_csv_compression(self, compression):
923923

924924
df = DataFrame([[0.123456, 0.234567, 0.567567],
925925
[12.32112, 123123.2, 321321.2]],
926926
index=['A', 'B'], columns=['X', 'Y', 'Z'])
927927

928928
with ensure_clean() as filename:
929929

930-
df.to_csv(filename, compression=compression_no_zip)
930+
df.to_csv(filename, compression=compression)
931931

932932
# test the round trip - to_csv -> read_csv
933-
rs = read_csv(filename, compression=compression_no_zip,
933+
rs = read_csv(filename, compression=compression,
934934
index_col=0)
935935
assert_frame_equal(df, rs)
936936

937937
# explicitly make sure file is compressed
938-
with tm.decompress_file(filename, compression_no_zip) as fh:
938+
with tm.decompress_file(filename, compression) as fh:
939939
text = fh.read().decode('utf8')
940940
for col in df.columns:
941941
assert col in text
942942

943-
with tm.decompress_file(filename, compression_no_zip) as fh:
943+
with tm.decompress_file(filename, compression) as fh:
944944
assert_frame_equal(df, read_csv(fh, index_col=0))
945945

946-
def test_to_csv_compression_value_error(self):
947-
# GH7615
948-
# use the compression kw in to_csv
949-
df = DataFrame([[0.123456, 0.234567, 0.567567],
950-
[12.32112, 123123.2, 321321.2]],
951-
index=['A', 'B'], columns=['X', 'Y', 'Z'])
952-
953-
with ensure_clean() as filename:
954-
# zip compression is not supported and should raise ValueError
955-
import zipfile
956-
pytest.raises(zipfile.BadZipfile, df.to_csv,
957-
filename, compression="zip")
958-
959946
def test_to_csv_date_format(self):
960947
with ensure_clean('__tmp_to_csv_date_format__') as path:
961948
dt_index = self.tsframe.index

pandas/tests/io/json/test_compression.py

+13-23
Original file line numberDiff line numberDiff line change
@@ -5,32 +5,22 @@
55
from pandas.util.testing import assert_frame_equal, assert_raises_regex
66

77

8-
def test_compression_roundtrip(compression_no_zip):
8+
def test_compression_roundtrip(compression):
99
df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
1010
[12.32112, 123123.2, 321321.2]],
1111
index=['A', 'B'], columns=['X', 'Y', 'Z'])
1212

1313
with tm.ensure_clean() as path:
14-
df.to_json(path, compression=compression_no_zip)
14+
df.to_json(path, compression=compression)
1515
assert_frame_equal(df, pd.read_json(path,
16-
compression=compression_no_zip))
16+
compression=compression))
1717

1818
# explicitly ensure file was compressed.
19-
with tm.decompress_file(path, compression_no_zip) as fh:
19+
with tm.decompress_file(path, compression) as fh:
2020
result = fh.read().decode('utf8')
2121
assert_frame_equal(df, pd.read_json(result))
2222

2323

24-
def test_compress_zip_value_error():
25-
df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
26-
[12.32112, 123123.2, 321321.2]],
27-
index=['A', 'B'], columns=['X', 'Y', 'Z'])
28-
29-
with tm.ensure_clean() as path:
30-
import zipfile
31-
pytest.raises(zipfile.BadZipfile, df.to_json, path, compression="zip")
32-
33-
3424
def test_read_zipped_json():
3525
uncompressed_path = tm.get_data_path("tsframe_v012.json")
3626
uncompressed_df = pd.read_json(uncompressed_path)
@@ -41,7 +31,7 @@ def test_read_zipped_json():
4131
assert_frame_equal(uncompressed_df, compressed_df)
4232

4333

44-
def test_with_s3_url(compression_no_zip):
34+
def test_with_s3_url(compression):
4535
boto3 = pytest.importorskip('boto3')
4636
pytest.importorskip('s3fs')
4737
moto = pytest.importorskip('moto')
@@ -52,35 +42,35 @@ def test_with_s3_url(compression_no_zip):
5242
bucket = conn.create_bucket(Bucket="pandas-test")
5343

5444
with tm.ensure_clean() as path:
55-
df.to_json(path, compression=compression_no_zip)
45+
df.to_json(path, compression=compression)
5646
with open(path, 'rb') as f:
5747
bucket.put_object(Key='test-1', Body=f)
5848

5949
roundtripped_df = pd.read_json('s3://pandas-test/test-1',
60-
compression=compression_no_zip)
50+
compression=compression)
6151
assert_frame_equal(df, roundtripped_df)
6252

6353

64-
def test_lines_with_compression(compression_no_zip):
54+
def test_lines_with_compression(compression):
6555

6656
with tm.ensure_clean() as path:
6757
df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
6858
df.to_json(path, orient='records', lines=True,
69-
compression=compression_no_zip)
59+
compression=compression)
7060
roundtripped_df = pd.read_json(path, lines=True,
71-
compression=compression_no_zip)
61+
compression=compression)
7262
assert_frame_equal(df, roundtripped_df)
7363

7464

75-
def test_chunksize_with_compression(compression_no_zip):
65+
def test_chunksize_with_compression(compression):
7666

7767
with tm.ensure_clean() as path:
7868
df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
7969
df.to_json(path, orient='records', lines=True,
80-
compression=compression_no_zip)
70+
compression=compression)
8171

8272
res = pd.read_json(path, lines=True, chunksize=1,
83-
compression=compression_no_zip)
73+
compression=compression)
8474
roundtripped_df = pd.concat(res)
8575
assert_frame_equal(df, roundtripped_df)
8676

pandas/tests/io/test_pickle.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,7 @@ def compress_file(self, src_path, dest_path, compression):
352352
f.write(fh.read())
353353
f.close()
354354

355-
def test_write_explicit(self, compression_no_zip, get_random_path):
355+
def test_write_explicit(self, compression, get_random_path):
356356
base = get_random_path
357357
path1 = base + ".compressed"
358358
path2 = base + ".raw"
@@ -361,10 +361,10 @@ def test_write_explicit(self, compression_no_zip, get_random_path):
361361
df = tm.makeDataFrame()
362362

363363
# write to compressed file
364-
df.to_pickle(p1, compression=compression_no_zip)
364+
df.to_pickle(p1, compression=compression)
365365

366366
# decompress
367-
with tm.decompress_file(p1, compression=compression_no_zip) as f:
367+
with tm.decompress_file(p1, compression=compression) as f:
368368
with open(p2, "wb") as fh:
369369
fh.write(f.read())
370370

pandas/tests/series/test_io.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -138,26 +138,26 @@ def test_to_csv_path_is_none(self):
138138
csv_str = s.to_csv(path=None)
139139
assert isinstance(csv_str, str)
140140

141-
def test_to_csv_compression(self, compression_no_zip):
141+
def test_to_csv_compression(self, compression):
142142

143143
s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
144144
name='X')
145145

146146
with ensure_clean() as filename:
147147

148-
s.to_csv(filename, compression=compression_no_zip, header=True)
148+
s.to_csv(filename, compression=compression, header=True)
149149

150150
# test the round trip - to_csv -> read_csv
151-
rs = pd.read_csv(filename, compression=compression_no_zip,
151+
rs = pd.read_csv(filename, compression=compression,
152152
index_col=0, squeeze=True)
153153
assert_series_equal(s, rs)
154154

155155
# explicitly ensure file was compressed
156-
with tm.decompress_file(filename, compression_no_zip) as fh:
156+
with tm.decompress_file(filename, compression) as fh:
157157
text = fh.read().decode('utf8')
158158
assert s.name in text
159159

160-
with tm.decompress_file(filename, compression_no_zip) as fh:
160+
with tm.decompress_file(filename, compression) as fh:
161161
assert_series_equal(s, pd.read_csv(fh,
162162
index_col=0,
163163
squeeze=True))

pandas/util/testing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ def decompress_file(path, compression):
173173
path : str
174174
The path where the file is read from
175175
176-
compression : {'gzip', 'bz2', 'xz', None}
176+
compression : {'gzip', 'bz2', 'zip', 'xz', None}
177177
Name of the decompression to use
178178
179179
Returns

0 commit comments

Comments
 (0)