-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: set keyword argument so zipfile actually compresses #21144
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
86d2f72
498451d
012383f
a790148
42f5c32
74b8c34
c46fbe0
f31fc3d
3a29ab3
4775cac
fa6c433
adb6fd6
974b063
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,7 @@ | |
import codecs | ||
import mmap | ||
from contextlib import contextmanager, closing | ||
from zipfile import ZipFile | ||
import zipfile | ||
|
||
from pandas.compat import StringIO, BytesIO, string_types, text_type | ||
from pandas import compat | ||
|
@@ -428,7 +428,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, | |
return f, handles | ||
|
||
|
||
class BytesZipFile(ZipFile, BytesIO): | ||
class BytesZipFile(zipfile.ZipFile, BytesIO): | ||
""" | ||
Wrapper for standard library class ZipFile and allow the returned file-like | ||
handle to accept byte strings via `write` method. | ||
|
@@ -437,10 +437,10 @@ class BytesZipFile(ZipFile, BytesIO): | |
bytes strings into a member of the archive. | ||
""" | ||
# GH 17778 | ||
def __init__(self, file, mode='r', **kwargs): | ||
def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs): | ||
if mode in ['wb', 'rb']: | ||
mode = mode.replace('b', '') | ||
super(BytesZipFile, self).__init__(file, mode, **kwargs) | ||
super(BytesZipFile, self).__init__(file, mode, compression, **kwargs) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we add tests and a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, because you are modifying the default behavior, I'm not sure if we need a deprecation cycle for this (to be safe, we should I would imagine). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no this is a bug There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fair enough, though tests and There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks. added whatsnew and tests. |
||
|
||
def write(self, data): | ||
super(BytesZipFile, self).writestr(self.filename, data) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,13 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import pytest | ||
import os | ||
import collections | ||
from functools import partial | ||
|
||
import numpy as np | ||
|
||
from pandas import Series, Timestamp | ||
from pandas import Series, DataFrame, Timestamp | ||
from pandas.compat import range, lmap | ||
import pandas.core.common as com | ||
from pandas.core import ops | ||
|
@@ -222,3 +223,21 @@ def test_standardize_mapping(): | |
|
||
dd = collections.defaultdict(list) | ||
assert isinstance(com.standardize_mapping(dd), partial) | ||
|
||
|
||
@pytest.mark.parametrize('obj', [ | ||
DataFrame(100 * [[0.123456, 0.234567, 0.567567], | ||
[12.32112, 123123.2, 321321.2]], | ||
columns=['X', 'Y', 'Z']), | ||
Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) | ||
@pytest.mark.parametrize('method', ['to_pickle', 'to_json', 'to_csv']) | ||
def test_compression_size(obj, method, compression): | ||
if not compression: | ||
pytest.skip("only test compression case.") | ||
|
||
with tm.ensure_clean() as filename: | ||
getattr(obj, method)(filename, compression=compression) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does this fail under 0.23.0? (and works here clearly) |
||
compressed = os.path.getsize(filename) | ||
getattr(obj, method)(filename, compression=None) | ||
uncompressed = os.path.getsize(filename) | ||
assert uncompressed > compressed |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you reference 21144 (as well is fine) as that other issue was closed for 0.23.0
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi @jreback @WillAyd , tested 21144 on older version of pandas 0.20 on windows and same issue occurred. I think it's an existing issue unrelated to this PR. This PR only address zip compression and doesn't touch gzip at all.