REF: Mock all S3 Tests #20409


Merged 2 commits on Mar 23, 2018
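The tests below rely on an s3_resource pytest fixture (defined in pandas' test-suite conftest, not shown in this diff) that stands up a moto-mocked S3 bucket in place of real network access. A rough sketch of what such a fixture might look like — the fixture name, bucket name, and tips_file fixture come from the diff below; everything else is an assumption, and the real fixture also seeds compressed variants of the file:

```python
import boto3
import pytest

moto = pytest.importorskip("moto")


@pytest.fixture
def s3_resource(tips_file):
    # Start moto's in-process S3 mock so no real network traffic occurs.
    with moto.mock_s3():
        conn = boto3.resource("s3", region_name="us-east-1")
        conn.create_bucket(Bucket="pandas-test")
        # Seed the mocked bucket with the file the parser tests read back.
        with open(tips_file, "rb") as f:
            conn.Bucket("pandas-test").put_object(Key="tips.csv", Body=f)
        yield conn
```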
32 changes: 0 additions & 32 deletions asv_bench/benchmarks/io/csv.py
@@ -118,38 +118,6 @@ def time_read_uint64_na_values(self):
                 na_values=self.na_values)


class S3(object):
    # Make sure that we can read part of a file from S3 without
    # needing to download the entire thing. Use the timeit.default_timer
    # to measure wall time instead of CPU time -- we want to see
    # how long it takes to download the data.
    timer = timeit.default_timer
    params = ([None, "gzip", "bz2"], ["python", "c"])
    param_names = ["compression", "engine"]

    def setup(self, compression, engine):
        if compression == "bz2" and engine == "c" and PY2:
            # The Python 2 C parser can't read bz2 from open files.
            raise NotImplementedError
        try:
            import s3fs  # noqa
        except ImportError:
            # Skip these benchmarks if `s3fs` is not installed.
            raise NotImplementedError

        ext = ""
        if compression == "gzip":
            ext = ".gz"
        elif compression == "bz2":
            ext = ".bz2"
        self.big_fname = "s3://pandas-test/large_random.csv" + ext

    def time_read_csv_10_rows(self, compression, engine):
        # Read a small number of rows from a huge (100,000 x 50) table.
        read_csv(self.big_fname, nrows=10, compression=compression,
                 engine=engine)


class ReadCSVThousands(BaseIO):

    goal_time = 0.2
35 changes: 24 additions & 11 deletions pandas/tests/io/parser/test_network.py
@@ -4,13 +4,16 @@
Tests parsers ability to read and parse non-local files
and hence require a network connection to be read.
"""
import logging

import pytest
import numpy as np

import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas import DataFrame
from pandas.io.parsers import read_csv, read_table
from pandas.compat import BytesIO
from pandas.compat import BytesIO, StringIO


@pytest.mark.network
@@ -45,9 +48,9 @@ def check_compressed_urls(salaries_table, compression, extension, mode,
    tm.assert_frame_equal(url_table, salaries_table)


@pytest.mark.usefixtures("s3_resource")
class TestS3(object):

    @tm.network
    def test_parse_public_s3_bucket(self):
        pytest.importorskip('s3fs')
        # more of an integration test due to the not-public contents portion
@@ -66,7 +69,6 @@ def test_parse_public_s3_bucket(self):
        assert not df.empty
        tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df)

    @tm.network
    def test_parse_public_s3n_bucket(self):

        # Read from AWS s3 as "s3n" URL
@@ -76,7 +78,6 @@ def test_parse_public_s3n_bucket(self):
        tm.assert_frame_equal(read_csv(
            tm.get_data_path('tips.csv')).iloc[:10], df)

    @tm.network
    def test_parse_public_s3a_bucket(self):
        # Read from AWS s3 as "s3a" URL
        df = read_csv('s3a://pandas-test/tips.csv', nrows=10)
@@ -85,7 +86,6 @@ def test_parse_public_s3a_bucket(self):
        tm.assert_frame_equal(read_csv(
            tm.get_data_path('tips.csv')).iloc[:10], df)

    @tm.network
    def test_parse_public_s3_bucket_nrows(self):
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df = read_csv('s3://pandas-test/tips.csv' +
@@ -95,7 +95,6 @@ def test_parse_public_s3_bucket_nrows(self):
            tm.assert_frame_equal(read_csv(
                tm.get_data_path('tips.csv')).iloc[:10], df)

    @tm.network
    def test_parse_public_s3_bucket_chunked(self):
        # Read with a chunksize
        chunksize = 5
@@ -114,7 +113,6 @@ def test_parse_public_s3_bucket_chunked(self):
                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
                tm.assert_frame_equal(true_df, df)

    @tm.network
    def test_parse_public_s3_bucket_chunked_python(self):
        # Read with a chunksize using the Python parser
        chunksize = 5
@@ -133,7 +131,6 @@ def test_parse_public_s3_bucket_chunked_python(self):
                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
                tm.assert_frame_equal(true_df, df)

    @tm.network
    def test_parse_public_s3_bucket_python(self):
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
@@ -143,7 +140,6 @@ def test_parse_public_s3_bucket_python(self):
            tm.assert_frame_equal(read_csv(
                tm.get_data_path('tips.csv')), df)

    @tm.network
    def test_infer_s3_compression(self):
        for ext in ['', '.gz', '.bz2']:
            df = read_csv('s3://pandas-test/tips.csv' + ext,
@@ -153,7 +149,6 @@ def test_infer_s3_compression(self):
            tm.assert_frame_equal(read_csv(
                tm.get_data_path('tips.csv')), df)

    @tm.network
    def test_parse_public_s3_bucket_nrows_python(self):
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
@@ -163,7 +158,6 @@ def test_parse_public_s3_bucket_nrows_python(self):
            tm.assert_frame_equal(read_csv(
                tm.get_data_path('tips.csv')).iloc[:10], df)

    @tm.network
    def test_s3_fails(self):
        with pytest.raises(IOError):
            read_csv('s3://nyqpug/asdf.csv')
@@ -188,3 +182,22 @@ def test_read_csv_handles_boto_s3_object(self,

        expected = read_csv(tips_file)
        tm.assert_frame_equal(result, expected)

    def test_read_csv_chunked_download(self, s3_resource, caplog):
        # 8 MB, s3fs uses 5MB chunks
        df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
        buf = BytesIO()
        str_buf = StringIO()

        df.to_csv(str_buf)

        buf = BytesIO(str_buf.getvalue().encode('utf-8'))

        s3_resource.Bucket("pandas-test").put_object(
            Key="large-file.csv",
            Body=buf)

        with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
Contributor Author:
@martindurant does this seem like a reasonable way to test that read_csv(key, nrows=5) only triggers S3FS reading part of the object? Do you know of a better way, that's perhaps less reliant on the internals of S3FS?

Contributor:
I'm afraid I don't have a better method for you, s3fs doesn't keep a log of transactions in any data structure you could access, and the s3file used for the download will have been cleaned up as soon as read_csv is done with it.

Contributor Author:
Hmm OK. Do you know if moto keeps a record anywhere?

I dislike this test, since s3fs adding an additional logger.debug call anywhere, or changing the log message, the default block size, etc., will break it.

boto also has a callback mechanism on download_file, but I don't see that option for get_object. If I can't figure out a way to get that working, I'll try to make the test using the logger a bit less fragile.
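For reference, the download_file callback mentioned above works roughly like this — a sketch only, with placeholder file names; as noted, this managed-transfer path is not the get_object path that read_csv/s3fs actually use:

```python
import boto3


class ByteCounter(object):
    """Accumulate the byte counts boto3 reports during a managed transfer."""

    def __init__(self):
        self.total = 0

    def __call__(self, bytes_transferred):
        # boto3 invokes the callback periodically with each chunk's size.
        self.total += bytes_transferred


s3 = boto3.resource("s3")
counter = ByteCounter()
# Callback exists on the managed transfer methods (download_file /
# upload_file) but not on get_object.
s3.Bucket("pandas-test").download_file(
    Key="large-file.csv", Filename="local.csv", Callback=counter)
print(counter.total)
```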

Contributor:
Certainly you could look through all log messages captured, not just the last one. Note that you do have access to the exact s3filesystem in S3FileSystem._singleton[0], but I don't see that that helps you in this case.

You could maybe patch S3File.__exit__ to store the values of self.loc and self.end?

read_csv("s3://pandas-test/large-file.csv", nrows=5)
# log of fetch_range (start, stop)
assert ((0, 5505024) in set(x.args[-2:] for x in caplog.records))
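A rough sketch of the patching idea suggested in the thread above, assuming S3File exposes loc and end roughly as the reviewer describes (names and cleanup details would need checking against the installed s3fs; if pandas closes the handle via close() rather than a with block, that would be the method to patch instead):

```python
import s3fs
from pandas.io.parsers import read_csv


def test_read_csv_partial_download(s3_resource, monkeypatch):
    # Record (loc, end) for every S3File handle as it is closed; the
    # attribute names come from the reviewer's suggestion and are not
    # part of any documented s3fs API.
    read_ranges = []
    original_exit = s3fs.S3File.__exit__

    def recording_exit(self, *args):
        read_ranges.append((self.loc, self.end))
        return original_exit(self, *args)

    monkeypatch.setattr(s3fs.S3File, "__exit__", recording_exit)

    read_csv("s3://pandas-test/large-file.csv", nrows=5)

    # nrows=5 should pull roughly one 5 MB block, not the full ~8 MB file.
    assert read_ranges
    assert all(end < 6 * 2 ** 20 for _, end in read_ranges)
```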