Skip to content

TST: Clean up pickle compression tests #19350

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy
import pandas
import dateutil
import pandas.util._test_decorators as td


def pytest_addoption(parser):
Expand Down Expand Up @@ -73,3 +74,22 @@ def ip():
is_dateutil_gt_261 = pytest.mark.skipif(
LooseVersion(dateutil.__version__) <= LooseVersion('2.6.1'),
reason="dateutil stable version")


@pytest.fixture(params=[
    None,
    'gzip',
    'bz2',
    'zip',
    # 'xz' requires the lzma module; skip it where that is unavailable
    pytest.param('xz', marks=td.skip_if_no_lzma),
])
def compression(request):
    """
    Parametrized fixture yielding each common compression type.

    Provides None (uncompressed) plus the 'gzip', 'bz2', 'zip' and 'xz'
    codecs for use in compression round-trip tests.
    """
    return request.param


@pytest.fixture(params=[
    None,
    'gzip',
    'bz2',
    # 'xz' requires the lzma module; skip it where that is unavailable
    pytest.param('xz', marks=td.skip_if_no_lzma),
])
def compression_no_zip(request):
    """
    Parametrized fixture yielding each common compression type except 'zip'.

    Provides None (uncompressed) plus the 'gzip', 'bz2' and 'xz' codecs,
    for writers that cannot produce zip archives.
    """
    return request.param
11 changes: 0 additions & 11 deletions pandas/tests/conftest.py

This file was deleted.

11 changes: 6 additions & 5 deletions pandas/tests/frame/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,27 +919,28 @@ def test_to_csv_path_is_none(self):
recons = pd.read_csv(StringIO(csv_str), index_col=0)
assert_frame_equal(self.frame, recons)

def test_to_csv_compression(self, compression):
def test_to_csv_compression(self, compression_no_zip):

df = DataFrame([[0.123456, 0.234567, 0.567567],
[12.32112, 123123.2, 321321.2]],
index=['A', 'B'], columns=['X', 'Y', 'Z'])

with ensure_clean() as filename:

df.to_csv(filename, compression=compression)
df.to_csv(filename, compression=compression_no_zip)

# test the round trip - to_csv -> read_csv
rs = read_csv(filename, compression=compression, index_col=0)
rs = read_csv(filename, compression=compression_no_zip,
index_col=0)
assert_frame_equal(df, rs)

# explicitly make sure file is compressed
with tm.decompress_file(filename, compression) as fh:
with tm.decompress_file(filename, compression_no_zip) as fh:
text = fh.read().decode('utf8')
for col in df.columns:
assert col in text

with tm.decompress_file(filename, compression) as fh:
with tm.decompress_file(filename, compression_no_zip) as fh:
assert_frame_equal(df, read_csv(fh, index_col=0))

def test_to_csv_compression_value_error(self):
Expand Down
34 changes: 20 additions & 14 deletions pandas/tests/io/json/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,18 @@
from pandas.util.testing import assert_frame_equal, assert_raises_regex


def test_compression_roundtrip(compression):
def test_compression_roundtrip(compression_no_zip):
df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
[12.32112, 123123.2, 321321.2]],
index=['A', 'B'], columns=['X', 'Y', 'Z'])

with tm.ensure_clean() as path:
df.to_json(path, compression=compression)
assert_frame_equal(df, pd.read_json(path, compression=compression))
df.to_json(path, compression=compression_no_zip)
assert_frame_equal(df, pd.read_json(path,
compression=compression_no_zip))

# explicitly ensure file was compressed.
with tm.decompress_file(path, compression) as fh:
with tm.decompress_file(path, compression_no_zip) as fh:
result = fh.read().decode('utf8')
assert_frame_equal(df, pd.read_json(result))

Expand All @@ -40,7 +41,7 @@ def test_read_zipped_json():
assert_frame_equal(uncompressed_df, compressed_df)


def test_with_s3_url(compression):
def test_with_s3_url(compression_no_zip):
boto3 = pytest.importorskip('boto3')
pytest.importorskip('s3fs')
moto = pytest.importorskip('moto')
Expand All @@ -51,31 +52,36 @@ def test_with_s3_url(compression):
bucket = conn.create_bucket(Bucket="pandas-test")

with tm.ensure_clean() as path:
df.to_json(path, compression=compression)
df.to_json(path, compression=compression_no_zip)
with open(path, 'rb') as f:
bucket.put_object(Key='test-1', Body=f)

roundtripped_df = pd.read_json('s3://pandas-test/test-1',
compression=compression)
compression=compression_no_zip)
assert_frame_equal(df, roundtripped_df)


def test_lines_with_compression(compression):
def test_lines_with_compression(compression_no_zip):

with tm.ensure_clean() as path:
df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
df.to_json(path, orient='records', lines=True, compression=compression)
df.to_json(path, orient='records', lines=True,
compression=compression_no_zip)
roundtripped_df = pd.read_json(path, lines=True,
compression=compression)
compression=compression_no_zip)
assert_frame_equal(df, roundtripped_df)


def test_chunksize_with_compression(compression):
def test_chunksize_with_compression(compression_no_zip):

with tm.ensure_clean() as path:
df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
df.to_json(path, orient='records', lines=True, compression=compression)
df.to_json(path, orient='records', lines=True,
compression=compression_no_zip)

roundtripped_df = pd.concat(pd.read_json(path, lines=True, chunksize=1,
compression=compression))
res = pd.read_json(path, lines=True, chunksize=1,
compression=compression_no_zip)
roundtripped_df = pd.concat(res)
assert_frame_equal(df, roundtripped_df)


Expand Down
51 changes: 8 additions & 43 deletions pandas/tests/io/test_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,42 +352,7 @@ def compress_file(self, src_path, dest_path, compression):
f.write(fh.read())
f.close()

def decompress_file(self, src_path, dest_path, compression):
if compression is None:
shutil.copyfile(src_path, dest_path)
return

if compression == 'gzip':
import gzip
f = gzip.open(src_path, "r")
elif compression == 'bz2':
import bz2
f = bz2.BZ2File(src_path, "r")
elif compression == 'zip':
import zipfile
zip_file = zipfile.ZipFile(src_path)
zip_names = zip_file.namelist()
if len(zip_names) == 1:
f = zip_file.open(zip_names.pop())
else:
raise ValueError('ZIP file {} error. Only one file per ZIP.'
.format(src_path))
elif compression == 'xz':
lzma = pandas.compat.import_lzma()
f = lzma.LZMAFile(src_path, "r")
else:
msg = 'Unrecognized compression type: {}'.format(compression)
raise ValueError(msg)

with open(dest_path, "wb") as fh:
fh.write(f.read())
f.close()

@pytest.mark.parametrize('compression', [
None, 'gzip', 'bz2',
pytest.param('xz', marks=td.skip_if_no_lzma) # issue 11666
])
def test_write_explicit(self, compression, get_random_path):
def test_write_explicit(self, compression_no_zip, get_random_path):
base = get_random_path
path1 = base + ".compressed"
path2 = base + ".raw"
Expand All @@ -396,10 +361,12 @@ def test_write_explicit(self, compression, get_random_path):
df = tm.makeDataFrame()

# write to compressed file
df.to_pickle(p1, compression=compression)
df.to_pickle(p1, compression=compression_no_zip)

# decompress
self.decompress_file(p1, p2, compression=compression)
with tm.decompress_file(p1, compression=compression_no_zip) as f:
with open(p2, "wb") as fh:
fh.write(f.read())

# read decompressed file
df2 = pd.read_pickle(p2, compression=None)
Expand Down Expand Up @@ -435,17 +402,15 @@ def test_write_infer(self, ext, get_random_path):
df.to_pickle(p1)

# decompress
self.decompress_file(p1, p2, compression=compression)
with tm.decompress_file(p1, compression=compression) as f:
with open(p2, "wb") as fh:
fh.write(f.read())

# read decompressed file
df2 = pd.read_pickle(p2, compression=None)

tm.assert_frame_equal(df, df2)

@pytest.mark.parametrize('compression', [
None, 'gzip', 'bz2', "zip",
pytest.param('xz', marks=td.skip_if_no_lzma)
])
def test_read_explicit(self, compression, get_random_path):
base = get_random_path
path1 = base + ".raw"
Expand Down
15 changes: 8 additions & 7 deletions pandas/tests/series/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,28 +138,29 @@ def test_to_csv_path_is_none(self):
csv_str = s.to_csv(path=None)
assert isinstance(csv_str, str)

def test_to_csv_compression(self, compression):
def test_to_csv_compression(self, compression_no_zip):

s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
name='X')

with ensure_clean() as filename:

s.to_csv(filename, compression=compression, header=True)
s.to_csv(filename, compression=compression_no_zip, header=True)

# test the round trip - to_csv -> read_csv
rs = pd.read_csv(filename, compression=compression, index_col=0,
squeeze=True)
rs = pd.read_csv(filename, compression=compression_no_zip,
index_col=0, squeeze=True)
assert_series_equal(s, rs)

# explicitly ensure file was compressed
with tm.decompress_file(filename, compression=compression) as fh:
with tm.decompress_file(filename, compression_no_zip) as fh:
text = fh.read().decode('utf8')
assert s.name in text

with tm.decompress_file(filename, compression=compression) as fh:
with tm.decompress_file(filename, compression_no_zip) as fh:
assert_series_equal(s, pd.read_csv(fh,
index_col=0, squeeze=True))
index_col=0,
squeeze=True))


class TestSeriesIO(TestData):
Expand Down
9 changes: 9 additions & 0 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,15 @@ def decompress_file(path, compression):
elif compression == 'xz':
lzma = compat.import_lzma()
f = lzma.LZMAFile(path, 'rb')
elif compression == 'zip':
import zipfile
zip_file = zipfile.ZipFile(path)
zip_names = zip_file.namelist()
if len(zip_names) == 1:
f = zip_file.open(zip_names.pop())
else:
raise ValueError('ZIP file {} error. Only one file per ZIP.'
.format(path))
else:
msg = 'Unrecognized compression type: {}'.format(compression)
raise ValueError(msg)
Expand Down