-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: allow zip compression in to_pickle
, to_json
, to_csv
#20394
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
ccfd240
fd7362c
c570091
ec712b9
bf271ce
113db83
9b9e5d1
dedb853
dfa9913
67b9727
ecdf5a2
b9fab3c
5c5c161
d072ca8
cecb0ac
ed189c4
4ac9488
694c6b5
80992a3
3288691
272c6e7
d35b6af
c6034b4
71d9979
4c87e0f
fd44980
ab7a7b7
cfd0715
dd958ac
2956103
63890ec
e4966be
437d716
04886e9
099993c
6aa1493
129a55a
4531c78
ebd8e6f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
import codecs | ||
import mmap | ||
from contextlib import contextmanager, closing | ||
from zipfile import ZipFile | ||
|
||
from pandas.compat import StringIO, BytesIO, string_types, text_type | ||
from pandas import compat | ||
|
@@ -363,18 +364,20 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, | |
|
||
# ZIP Compression | ||
elif compression == 'zip': | ||
import zipfile | ||
zip_file = zipfile.ZipFile(path_or_buf) | ||
zip_names = zip_file.namelist() | ||
if len(zip_names) == 1: | ||
f = zip_file.open(zip_names.pop()) | ||
elif len(zip_names) == 0: | ||
raise ValueError('Zero files found in ZIP file {}' | ||
.format(path_or_buf)) | ||
else: | ||
raise ValueError('Multiple files found in ZIP file.' | ||
' Only one file per ZIP: {}' | ||
.format(zip_names)) | ||
zf = BytesZipFile(path_or_buf, mode) | ||
if zf.mode == 'w': | ||
f = zf | ||
elif zf.mode == 'r': | ||
zip_names = zf.namelist() | ||
if len(zip_names) == 1: | ||
f = zf.open(zip_names.pop()) | ||
elif len(zip_names) == 0: | ||
raise ValueError('Zero files found in ZIP file {}' | ||
.format(path_or_buf)) | ||
else: | ||
raise ValueError('Multiple files found in ZIP file.' | ||
' Only one file per ZIP: {}' | ||
.format(zip_names)) | ||
|
||
# XZ Compression | ||
elif compression == 'xz': | ||
|
@@ -425,6 +428,24 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, | |
return f, handles | ||
|
||
|
||
class BytesZipFile(ZipFile, BytesIO): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I personally like this location. I would keep it here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add a little more to this class's docstring, e.g. why it's needed? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added. We currently don't have the ability to write zip-compressed pickle, JSON, or CSV files, only to read them. The standard library ZipFile isn't designed to produce a writable file handle, hence the custom class. |
||
"""override write method with writestr to accept bytes.""" | ||
# GH 17778 | ||
def __init__(self, file, mode='r', **kwargs): | ||
if mode in ['wb', 'rb']: | ||
mode = mode.replace('b', '') | ||
super(BytesZipFile, self).__init__(file, mode, **kwargs) | ||
|
||
def write(self, data): | ||
super(BytesZipFile, self).writestr(self.filename, data) | ||
|
||
def writable(self): | ||
return self.mode == 'w' | ||
|
||
def readable(self): | ||
return self.mode == 'r' | ||
|
||
|
||
class MMapWrapper(BaseIterator): | ||
""" | ||
Wrapper for the Python's mmap class so that it can be properly read in | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -919,30 +919,31 @@ def test_to_csv_path_is_none(self): | |
recons = pd.read_csv(StringIO(csv_str), index_col=0) | ||
assert_frame_equal(self.frame, recons) | ||
|
||
def test_to_csv_compression(self, compression_no_zip): | ||
def test_to_csv_compression(self, compression): | ||
|
||
df = DataFrame([[0.123456, 0.234567, 0.567567], | ||
[12.32112, 123123.2, 321321.2]], | ||
index=['A', 'B'], columns=['X', 'Y', 'Z']) | ||
|
||
with ensure_clean() as filename: | ||
|
||
df.to_csv(filename, compression=compression_no_zip) | ||
df.to_csv(filename, compression=compression) | ||
|
||
# test the round trip - to_csv -> read_csv | ||
rs = read_csv(filename, compression=compression_no_zip, | ||
rs = read_csv(filename, compression=compression, | ||
index_col=0) | ||
assert_frame_equal(df, rs) | ||
|
||
# explicitly make sure file is compressed | ||
with tm.decompress_file(filename, compression_no_zip) as fh: | ||
with tm.decompress_file(filename, compression) as fh: | ||
text = fh.read().decode('utf8') | ||
for col in df.columns: | ||
assert col in text | ||
|
||
with tm.decompress_file(filename, compression_no_zip) as fh: | ||
with tm.decompress_file(filename, compression) as fh: | ||
assert_frame_equal(df, read_csv(fh, index_col=0)) | ||
|
||
@pytest.mark.xfail(reason='zip compression is now supported for csv.') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why are you xfailing this? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is an old test case, written when zip compression was not supported, that asserts a BadZipFile exception is raised. It now fails because that exception is no longer raised. The test case is now redundant and was removed in 04886e9. |
||
def test_to_csv_compression_value_error(self): | ||
# GH7615 | ||
# use the compression kw in to_csv | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,22 +5,23 @@ | |
from pandas.util.testing import assert_frame_equal, assert_raises_regex | ||
|
||
|
||
def test_compression_roundtrip(compression_no_zip): | ||
def test_compression_roundtrip(compression): | ||
df = pd.DataFrame([[0.123456, 0.234567, 0.567567], | ||
[12.32112, 123123.2, 321321.2]], | ||
index=['A', 'B'], columns=['X', 'Y', 'Z']) | ||
|
||
with tm.ensure_clean() as path: | ||
df.to_json(path, compression=compression_no_zip) | ||
df.to_json(path, compression=compression) | ||
assert_frame_equal(df, pd.read_json(path, | ||
compression=compression_no_zip)) | ||
compression=compression)) | ||
|
||
# explicitly ensure file was compressed. | ||
with tm.decompress_file(path, compression_no_zip) as fh: | ||
with tm.decompress_file(path, compression) as fh: | ||
result = fh.read().decode('utf8') | ||
assert_frame_equal(df, pd.read_json(result)) | ||
|
||
|
||
@pytest.mark.xfail(reason='zip compression is now supported for json.') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why are you xfailing this? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same as above. |
||
def test_compress_zip_value_error(): | ||
df = pd.DataFrame([[0.123456, 0.234567, 0.567567], | ||
[12.32112, 123123.2, 321321.2]], | ||
|
@@ -41,7 +42,7 @@ def test_read_zipped_json(): | |
assert_frame_equal(uncompressed_df, compressed_df) | ||
|
||
|
||
def test_with_s3_url(compression_no_zip): | ||
def test_with_s3_url(compression): | ||
boto3 = pytest.importorskip('boto3') | ||
pytest.importorskip('s3fs') | ||
moto = pytest.importorskip('moto') | ||
|
@@ -52,35 +53,35 @@ def test_with_s3_url(compression_no_zip): | |
bucket = conn.create_bucket(Bucket="pandas-test") | ||
|
||
with tm.ensure_clean() as path: | ||
df.to_json(path, compression=compression_no_zip) | ||
df.to_json(path, compression=compression) | ||
with open(path, 'rb') as f: | ||
bucket.put_object(Key='test-1', Body=f) | ||
|
||
roundtripped_df = pd.read_json('s3://pandas-test/test-1', | ||
compression=compression_no_zip) | ||
compression=compression) | ||
assert_frame_equal(df, roundtripped_df) | ||
|
||
|
||
def test_lines_with_compression(compression_no_zip): | ||
def test_lines_with_compression(compression): | ||
|
||
with tm.ensure_clean() as path: | ||
df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}') | ||
df.to_json(path, orient='records', lines=True, | ||
compression=compression_no_zip) | ||
compression=compression) | ||
roundtripped_df = pd.read_json(path, lines=True, | ||
compression=compression_no_zip) | ||
compression=compression) | ||
assert_frame_equal(df, roundtripped_df) | ||
|
||
|
||
def test_chunksize_with_compression(compression_no_zip): | ||
def test_chunksize_with_compression(compression): | ||
|
||
with tm.ensure_clean() as path: | ||
df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}') | ||
df.to_json(path, orient='records', lines=True, | ||
compression=compression_no_zip) | ||
compression=compression) | ||
|
||
res = pd.read_json(path, lines=True, chunksize=1, | ||
compression=compression_no_zip) | ||
compression=compression) | ||
roundtripped_df = pd.concat(res) | ||
assert_frame_equal(df, roundtripped_df) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -138,26 +138,26 @@ def test_to_csv_path_is_none(self): | |
csv_str = s.to_csv(path=None) | ||
assert isinstance(csv_str, str) | ||
|
||
def test_to_csv_compression(self, compression_no_zip): | ||
def test_to_csv_compression(self, compression): | ||
|
||
s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], | ||
name='X') | ||
|
||
with ensure_clean() as filename: | ||
|
||
s.to_csv(filename, compression=compression_no_zip, header=True) | ||
s.to_csv(filename, compression=compression, header=True) | ||
|
||
# test the round trip - to_csv -> read_csv | ||
rs = pd.read_csv(filename, compression=compression_no_zip, | ||
rs = pd.read_csv(filename, compression=compression, | ||
index_col=0, squeeze=True) | ||
assert_series_equal(s, rs) | ||
|
||
# explicitly ensure file was compressed | ||
with tm.decompress_file(filename, compression_no_zip) as fh: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Are there any uses of the compression_no_zip fixture left? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think so; the compression_no_zip fixture existed solely to exclude zip compression from tests, because writing zip-compressed output had not been implemented. |
||
with tm.decompress_file(filename, compression) as fh: | ||
text = fh.read().decode('utf8') | ||
assert s.name in text | ||
|
||
with tm.decompress_file(filename, compression_no_zip) as fh: | ||
with tm.decompress_file(filename, compression) as fh: | ||
assert_series_equal(s, pd.read_csv(fh, | ||
index_col=0, | ||
squeeze=True)) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's fix the docstring here, as I suggested for
frame.py
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.