From eefa29fa7ef332cada4a611ddd6bb22830a553a5 Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Thu, 16 Jul 2015 14:30:42 -0500 Subject: [PATCH] ENH: More permissive S3 reading When calling `get_bucket`, boto will by default try to establish that the S3 bucket exists by listing all of the keys that exist in it. This behavior is controlled by the "validate" keyword, which defaults to True. If your access key doesn't have permission to read everything in a bucket (even if you do have permission to read the file you're trying to access), this generates an uninformative exception. This PR sets "validate=False". This means that boto will trust you that the bucket exists, and not try to check immediately. If the bucket actually doesn't exist, the `get_contents_as_string` call a couple of lines later will generate the exception "S3ResponseError: S3ResponseError: 404 Not Found". One of the test cases expected a failure when reading the file "s3://cant_get_it/tips.csv"; with the changes in this PR, this file is now accessible. --- doc/source/whatsnew/v0.17.0.txt | 2 ++ pandas/io/common.py | 2 +- pandas/io/tests/test_parsers.py | 12 ++++++++++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 6ab299eb70eb5..b6e166619c1e8 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -389,3 +389,5 @@ Bug Fixes - Reading "famafrench" data via ``DataReader`` results in HTTP 404 error because of the website url is changed (:issue:`10591`). - Bug in `read_msgpack` where DataFrame to decode has duplicate column names (:issue:`9618`) + +- Bug in ``io.common.get_filepath_or_buffer`` which caused reading of valid S3 files to fail if the bucket also contained keys for which the user does not have read permission (:issue:`10604`) diff --git a/pandas/io/common.py b/pandas/io/common.py index 65cfdff1df14b..b341679176256 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -151,7 +151,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None): except boto.exception.NoAuthHandlerFound: conn = boto.connect_s3(anon=True) - b = conn.get_bucket(parsed_url.netloc) + b = conn.get_bucket(parsed_url.netloc, validate=False) k = boto.s3.key.Key(b) k.key = parsed_url.path filepath_or_buffer = BytesIO(k.get_contents_as_string( diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 0f0486e8ea596..a4940ebdd6079 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -4075,6 +4075,12 @@ def test_parse_public_s3_bucket(self): nt.assert_false(df.empty) tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')), df) + # Read public file from bucket with not-public contents + df = pd.read_csv('s3://cant_get_it/tips.csv') + nt.assert_true(isinstance(df, pd.DataFrame)) + nt.assert_false(df.empty) + tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')), df) + @tm.network def test_s3_fails(self): import boto @@ -4082,9 +4088,11 @@ def test_s3_fails(self): 'S3ResponseError: 404 Not Found'): pd.read_csv('s3://nyqpug/asdf.csv') + # Receive a permission error when trying to read a private bucket. + # It's irrelevant here that this isn't actually a table. with tm.assertRaisesRegexp(boto.exception.S3ResponseError, - 'S3ResponseError: 403 Forbidden'): - pd.read_csv('s3://cant_get_it/tips.csv') + 'S3ResponseError: 403 Forbidden'): + pd.read_csv('s3://cant_get_it/') def assert_same_values_and_dtype(res, exp):