diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 4876678baaa6e..c02d988a7bc63 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -83,6 +83,7 @@ Indexing I/O ^^^ +- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`) - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`) - diff --git a/pandas/io/common.py b/pandas/io/common.py index 0827216975f15..a492b7c0b8e8e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -5,7 +5,7 @@ import codecs import mmap from contextlib import contextmanager, closing -from zipfile import ZipFile +import zipfile from pandas.compat import StringIO, BytesIO, string_types, text_type from pandas import compat @@ -428,7 +428,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, return f, handles -class BytesZipFile(ZipFile, BytesIO): +class BytesZipFile(zipfile.ZipFile, BytesIO): """ Wrapper for standard library class ZipFile and allow the returned file-like handle to accept byte strings via `write` method. @@ -437,10 +437,10 @@ class BytesZipFile(ZipFile, BytesIO): bytes strings into a member of the archive. """ # GH 17778 - def __init__(self, file, mode='r', **kwargs): + def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs): if mode in ['wb', 'rb']: mode = mode.replace('b', '') - super(BytesZipFile, self).__init__(file, mode, **kwargs) + super(BytesZipFile, self).__init__(file, mode, compression, **kwargs) def write(self, data): super(BytesZipFile, self).writestr(self.filename, data) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 0b329f64dafa3..bb7ee1b911fee 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -1,12 +1,13 @@ # -*- coding: utf-8 -*- import pytest +import os import collections from functools import partial import numpy as np -from pandas import Series, Timestamp +from pandas import Series, DataFrame, Timestamp from pandas.compat import range, lmap import pandas.core.common as com from pandas.core import ops @@ -222,3 +223,21 @@ def test_standardize_mapping(): dd = collections.defaultdict(list) assert isinstance(com.standardize_mapping(dd), partial) + + +@pytest.mark.parametrize('obj', [ + DataFrame(100 * [[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z']), + Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) +@pytest.mark.parametrize('method', ['to_pickle', 'to_json', 'to_csv']) +def test_compression_size(obj, method, compression): + if not compression: + pytest.skip("only test compression case.") + + with tm.ensure_clean() as filename: + getattr(obj, method)(filename, compression=compression) + compressed = os.path.getsize(filename) + getattr(obj, method)(filename, compression=None) + uncompressed = os.path.getsize(filename) + assert uncompressed > compressed