Skip to content

Commit 189dd8e

Browse files
mingglijavadnoorb
authored and committed
EHN: allow zip compression in to_pickle, to_json, to_csv (pandas-dev#20394)
1 parent 3a76199 commit 189dd8e

File tree

13 files changed

+86
-86
lines changed

13 files changed

+86
-86
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,7 @@ Other Enhancements
344344
- :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports it, rather than inserting row by row.
345345
``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`)
346346
- :func:`read_html` now accepts a ``displayed_only`` keyword argument to control whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`)
347+
- zip compression is supported via ``compression='zip'`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`)
347348

348349
.. _whatsnew_0230.api_breaking:
349350

pandas/conftest.py

-10
Original file line numberDiff line numberDiff line change
@@ -75,16 +75,6 @@ def compression(request):
7575
return request.param
7676

7777

78-
@pytest.fixture(params=[None, 'gzip', 'bz2',
79-
pytest.param('xz', marks=td.skip_if_no_lzma)])
80-
def compression_no_zip(request):
81-
"""
82-
Fixture for trying common compression types in compression tests
83-
except zip
84-
"""
85-
return request.param
86-
87-
8878
@pytest.fixture(scope='module')
8979
def datetime_tz_utc():
9080
from datetime import timezone

pandas/core/frame.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1654,9 +1654,9 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
16541654
A string representing the encoding to use in the output file,
16551655
defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
16561656
compression : string, optional
1657-
a string representing the compression to use in the output file,
1658-
allowed values are 'gzip', 'bz2', 'xz',
1659-
only used when the first argument is a filename
1657+
A string representing the compression to use in the output file.
1658+
Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
1659+
used when the first argument is a filename.
16601660
line_terminator : string, default ``'\n'``
16611661
The newline character or character sequence to use in the output
16621662
file

pandas/core/generic.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -1814,9 +1814,9 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,
18141814
18151815
.. versionadded:: 0.19.0
18161816
1817-
compression : {None, 'gzip', 'bz2', 'xz'}
1817+
compression : {None, 'gzip', 'bz2', 'zip', 'xz'}
18181818
A string representing the compression to use in the output file,
1819-
only used when the first argument is a filename
1819+
only used when the first argument is a filename.
18201820
18211821
.. versionadded:: 0.21.0
18221822
@@ -2133,7 +2133,8 @@ def to_pickle(self, path, compression='infer',
21332133
----------
21342134
path : str
21352135
File path where the pickled object will be stored.
2136-
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
2136+
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, \
2137+
default 'infer'
21372138
A string representing the compression to use in the output file. By
21382139
default, infers from the file extension in specified path.
21392140

pandas/core/series.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -3633,9 +3633,9 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='',
36333633
a string representing the encoding to use if the contents are
36343634
non-ascii, for python versions prior to 3
36353635
compression : string, optional
3636-
a string representing the compression to use in the output file,
3637-
allowed values are 'gzip', 'bz2', 'xz', only used when the first
3638-
argument is a filename
3636+
A string representing the compression to use in the output file.
3637+
Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
3638+
used when the first argument is a filename.
36393639
date_format: string, default None
36403640
Format string for datetime objects.
36413641
decimal: string, default '.'

pandas/io/common.py

+33-12
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import codecs
66
import mmap
77
from contextlib import contextmanager, closing
8+
from zipfile import ZipFile
89

910
from pandas.compat import StringIO, BytesIO, string_types, text_type
1011
from pandas import compat
@@ -363,18 +364,20 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
363364

364365
# ZIP Compression
365366
elif compression == 'zip':
366-
import zipfile
367-
zip_file = zipfile.ZipFile(path_or_buf)
368-
zip_names = zip_file.namelist()
369-
if len(zip_names) == 1:
370-
f = zip_file.open(zip_names.pop())
371-
elif len(zip_names) == 0:
372-
raise ValueError('Zero files found in ZIP file {}'
373-
.format(path_or_buf))
374-
else:
375-
raise ValueError('Multiple files found in ZIP file.'
376-
' Only one file per ZIP: {}'
377-
.format(zip_names))
367+
zf = BytesZipFile(path_or_buf, mode)
368+
if zf.mode == 'w':
369+
f = zf
370+
elif zf.mode == 'r':
371+
zip_names = zf.namelist()
372+
if len(zip_names) == 1:
373+
f = zf.open(zip_names.pop())
374+
elif len(zip_names) == 0:
375+
raise ValueError('Zero files found in ZIP file {}'
376+
.format(path_or_buf))
377+
else:
378+
raise ValueError('Multiple files found in ZIP file.'
379+
' Only one file per ZIP: {}'
380+
.format(zip_names))
378381

379382
# XZ Compression
380383
elif compression == 'xz':
@@ -425,6 +428,24 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
425428
return f, handles
426429

427430

431+
class BytesZipFile(ZipFile, BytesIO):
432+
"""
433+
Wrapper for standard library class ZipFile and allow the returned file-like
434+
handle to accept byte strings via `write` method.
435+
436+
BytesIO provides attributes of file-like object and ZipFile.writestr writes
437+
bytes strings into a member of the archive.
438+
"""
439+
# GH 17778
440+
def __init__(self, file, mode='r', **kwargs):
441+
if mode in ['wb', 'rb']:
442+
mode = mode.replace('b', '')
443+
super(BytesZipFile, self).__init__(file, mode, **kwargs)
444+
445+
def write(self, data):
446+
super(BytesZipFile, self).writestr(self.filename, data)
447+
448+
428449
class MMapWrapper(BaseIterator):
429450
"""
430451
Wrapper for the Python's mmap class so that it can be properly read in

pandas/io/formats/csvs.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,8 @@ def save(self):
133133
else:
134134
f, handles = _get_handle(self.path_or_buf, self.mode,
135135
encoding=encoding,
136-
compression=self.compression)
137-
close = True
136+
compression=None)
137+
close = True if self.compression is None else False
138138

139139
try:
140140
writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -151,6 +151,16 @@ def save(self):
151151
self._save()
152152

153153
finally:
154+
# GH 17778 handles compression for byte strings.
155+
if not close and self.compression:
156+
f.close()
157+
with open(self.path_or_buf, 'r') as f:
158+
data = f.read()
159+
f, handles = _get_handle(self.path_or_buf, self.mode,
160+
encoding=encoding,
161+
compression=self.compression)
162+
f.write(data)
163+
close = True
154164
if close:
155165
f.close()
156166

pandas/io/pickle.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
1818
Any python object.
1919
path : str
2020
File path where the pickled object will be stored.
21-
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
21+
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
2222
A string representing the compression to use in the output file. By
2323
default, infers from the file extension in specified path.
2424
@@ -74,7 +74,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
7474
if protocol < 0:
7575
protocol = pkl.HIGHEST_PROTOCOL
7676
try:
77-
pkl.dump(obj, f, protocol=protocol)
77+
f.write(pkl.dumps(obj, protocol=protocol))
7878
finally:
7979
for _f in fh:
8080
_f.close()
@@ -93,7 +93,7 @@ def read_pickle(path, compression='infer'):
9393
----------
9494
path : str
9595
File path where the pickled object will be loaded.
96-
compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
96+
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
9797
For on-the-fly decompression of on-disk data. If 'infer', then use
9898
gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
9999
or '.zip' respectively, and no decompression otherwise.

pandas/tests/frame/test_to_csv.py

+5-18
Original file line numberDiff line numberDiff line change
@@ -919,43 +919,30 @@ def test_to_csv_path_is_none(self):
919919
recons = pd.read_csv(StringIO(csv_str), index_col=0)
920920
assert_frame_equal(self.frame, recons)
921921

922-
def test_to_csv_compression(self, compression_no_zip):
922+
def test_to_csv_compression(self, compression):
923923

924924
df = DataFrame([[0.123456, 0.234567, 0.567567],
925925
[12.32112, 123123.2, 321321.2]],
926926
index=['A', 'B'], columns=['X', 'Y', 'Z'])
927927

928928
with ensure_clean() as filename:
929929

930-
df.to_csv(filename, compression=compression_no_zip)
930+
df.to_csv(filename, compression=compression)
931931

932932
# test the round trip - to_csv -> read_csv
933-
rs = read_csv(filename, compression=compression_no_zip,
933+
rs = read_csv(filename, compression=compression,
934934
index_col=0)
935935
assert_frame_equal(df, rs)
936936

937937
# explicitly make sure file is compressed
938-
with tm.decompress_file(filename, compression_no_zip) as fh:
938+
with tm.decompress_file(filename, compression) as fh:
939939
text = fh.read().decode('utf8')
940940
for col in df.columns:
941941
assert col in text
942942

943-
with tm.decompress_file(filename, compression_no_zip) as fh:
943+
with tm.decompress_file(filename, compression) as fh:
944944
assert_frame_equal(df, read_csv(fh, index_col=0))
945945

946-
def test_to_csv_compression_value_error(self):
947-
# GH7615
948-
# use the compression kw in to_csv
949-
df = DataFrame([[0.123456, 0.234567, 0.567567],
950-
[12.32112, 123123.2, 321321.2]],
951-
index=['A', 'B'], columns=['X', 'Y', 'Z'])
952-
953-
with ensure_clean() as filename:
954-
# zip compression is not supported and should raise ValueError
955-
import zipfile
956-
pytest.raises(zipfile.BadZipfile, df.to_csv,
957-
filename, compression="zip")
958-
959946
def test_to_csv_date_format(self):
960947
with ensure_clean('__tmp_to_csv_date_format__') as path:
961948
dt_index = self.tsframe.index

pandas/tests/io/json/test_compression.py

+13-23
Original file line numberDiff line numberDiff line change
@@ -5,32 +5,22 @@
55
from pandas.util.testing import assert_frame_equal, assert_raises_regex
66

77

8-
def test_compression_roundtrip(compression_no_zip):
8+
def test_compression_roundtrip(compression):
99
df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
1010
[12.32112, 123123.2, 321321.2]],
1111
index=['A', 'B'], columns=['X', 'Y', 'Z'])
1212

1313
with tm.ensure_clean() as path:
14-
df.to_json(path, compression=compression_no_zip)
14+
df.to_json(path, compression=compression)
1515
assert_frame_equal(df, pd.read_json(path,
16-
compression=compression_no_zip))
16+
compression=compression))
1717

1818
# explicitly ensure file was compressed.
19-
with tm.decompress_file(path, compression_no_zip) as fh:
19+
with tm.decompress_file(path, compression) as fh:
2020
result = fh.read().decode('utf8')
2121
assert_frame_equal(df, pd.read_json(result))
2222

2323

24-
def test_compress_zip_value_error():
25-
df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
26-
[12.32112, 123123.2, 321321.2]],
27-
index=['A', 'B'], columns=['X', 'Y', 'Z'])
28-
29-
with tm.ensure_clean() as path:
30-
import zipfile
31-
pytest.raises(zipfile.BadZipfile, df.to_json, path, compression="zip")
32-
33-
3424
def test_read_zipped_json():
3525
uncompressed_path = tm.get_data_path("tsframe_v012.json")
3626
uncompressed_df = pd.read_json(uncompressed_path)
@@ -41,7 +31,7 @@ def test_read_zipped_json():
4131
assert_frame_equal(uncompressed_df, compressed_df)
4232

4333

44-
def test_with_s3_url(compression_no_zip):
34+
def test_with_s3_url(compression):
4535
boto3 = pytest.importorskip('boto3')
4636
pytest.importorskip('s3fs')
4737
moto = pytest.importorskip('moto')
@@ -52,35 +42,35 @@ def test_with_s3_url(compression_no_zip):
5242
bucket = conn.create_bucket(Bucket="pandas-test")
5343

5444
with tm.ensure_clean() as path:
55-
df.to_json(path, compression=compression_no_zip)
45+
df.to_json(path, compression=compression)
5646
with open(path, 'rb') as f:
5747
bucket.put_object(Key='test-1', Body=f)
5848

5949
roundtripped_df = pd.read_json('s3://pandas-test/test-1',
60-
compression=compression_no_zip)
50+
compression=compression)
6151
assert_frame_equal(df, roundtripped_df)
6252

6353

64-
def test_lines_with_compression(compression_no_zip):
54+
def test_lines_with_compression(compression):
6555

6656
with tm.ensure_clean() as path:
6757
df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
6858
df.to_json(path, orient='records', lines=True,
69-
compression=compression_no_zip)
59+
compression=compression)
7060
roundtripped_df = pd.read_json(path, lines=True,
71-
compression=compression_no_zip)
61+
compression=compression)
7262
assert_frame_equal(df, roundtripped_df)
7363

7464

75-
def test_chunksize_with_compression(compression_no_zip):
65+
def test_chunksize_with_compression(compression):
7666

7767
with tm.ensure_clean() as path:
7868
df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
7969
df.to_json(path, orient='records', lines=True,
80-
compression=compression_no_zip)
70+
compression=compression)
8171

8272
res = pd.read_json(path, lines=True, chunksize=1,
83-
compression=compression_no_zip)
73+
compression=compression)
8474
roundtripped_df = pd.concat(res)
8575
assert_frame_equal(df, roundtripped_df)
8676

pandas/tests/io/test_pickle.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,7 @@ def compress_file(self, src_path, dest_path, compression):
352352
f.write(fh.read())
353353
f.close()
354354

355-
def test_write_explicit(self, compression_no_zip, get_random_path):
355+
def test_write_explicit(self, compression, get_random_path):
356356
base = get_random_path
357357
path1 = base + ".compressed"
358358
path2 = base + ".raw"
@@ -361,10 +361,10 @@ def test_write_explicit(self, compression_no_zip, get_random_path):
361361
df = tm.makeDataFrame()
362362

363363
# write to compressed file
364-
df.to_pickle(p1, compression=compression_no_zip)
364+
df.to_pickle(p1, compression=compression)
365365

366366
# decompress
367-
with tm.decompress_file(p1, compression=compression_no_zip) as f:
367+
with tm.decompress_file(p1, compression=compression) as f:
368368
with open(p2, "wb") as fh:
369369
fh.write(f.read())
370370

pandas/tests/series/test_io.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -138,26 +138,26 @@ def test_to_csv_path_is_none(self):
138138
csv_str = s.to_csv(path=None)
139139
assert isinstance(csv_str, str)
140140

141-
def test_to_csv_compression(self, compression_no_zip):
141+
def test_to_csv_compression(self, compression):
142142

143143
s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
144144
name='X')
145145

146146
with ensure_clean() as filename:
147147

148-
s.to_csv(filename, compression=compression_no_zip, header=True)
148+
s.to_csv(filename, compression=compression, header=True)
149149

150150
# test the round trip - to_csv -> read_csv
151-
rs = pd.read_csv(filename, compression=compression_no_zip,
151+
rs = pd.read_csv(filename, compression=compression,
152152
index_col=0, squeeze=True)
153153
assert_series_equal(s, rs)
154154

155155
# explicitly ensure file was compressed
156-
with tm.decompress_file(filename, compression_no_zip) as fh:
156+
with tm.decompress_file(filename, compression) as fh:
157157
text = fh.read().decode('utf8')
158158
assert s.name in text
159159

160-
with tm.decompress_file(filename, compression_no_zip) as fh:
160+
with tm.decompress_file(filename, compression) as fh:
161161
assert_series_equal(s, pd.read_csv(fh,
162162
index_col=0,
163163
squeeze=True))

pandas/util/testing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ def decompress_file(path, compression):
173173
path : str
174174
The path where the file is read from
175175
176-
compression : {'gzip', 'bz2', 'xz', None}
176+
compression : {'gzip', 'bz2', 'zip', 'xz', None}
177177
Name of the decompression to use
178178
179179
Returns

0 commit comments

Comments
 (0)