
Commit eefa29f
ENH: More permissive S3 reading
When calling `get_bucket`, boto will by default try to establish that the S3 bucket exists by listing all of the keys in it. This behavior is controlled by the `validate` keyword, which defaults to `True`. If your access key does not have permission to list everything in the bucket (even if you do have permission to read the specific file you are trying to access), this generates an uninformative exception. This PR sets `validate=False`, which means boto will trust that the bucket exists and not try to check immediately. If the bucket actually does not exist, the `get_contents_as_string` call a couple of lines later raises "S3ResponseError: S3ResponseError: 404 Not Found". One of the test cases expected a failure when reading the file "s3://cant_get_it/tips.csv"; with the changes in this PR, this file is now accessible.
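The access pattern the commit relies on can be illustrated with a minimal sketch using the legacy boto (v2) API. The helper name `fetch_s3_object` and the argument names are placeholders, not part of pandas; the import is guarded so the sketch is loadable even where boto is not installed, and actually calling it requires network access:

```python
from io import BytesIO

try:
    import boto
    import boto.s3.key
except ImportError:  # boto (the legacy v2 SDK) may not be installed
    boto = None


def fetch_s3_object(bucket_name, key_path):
    """Read a single S3 object without listing the whole bucket.

    With validate=False, get_bucket() skips the initial key listing,
    so a key you can read stays accessible even when other keys in
    the same bucket are not readable by your credentials. A missing
    bucket only surfaces later, as a 404 S3ResponseError raised by
    get_contents_as_string().
    """
    if boto is None:
        raise RuntimeError("boto is required to run this sketch")
    try:
        conn = boto.connect_s3()
    except boto.exception.NoAuthHandlerFound:
        conn = boto.connect_s3(anon=True)  # fall back to anonymous access
    b = conn.get_bucket(bucket_name, validate=False)  # trust the bucket exists
    k = boto.s3.key.Key(b)
    k.key = key_path
    return BytesIO(k.get_contents_as_string())
```

For example, `fetch_s3_object('cant_get_it', 'tips.csv')` mirrors the commit's test case: the object itself is public, so the read succeeds even though listing the bucket would be forbidden.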
1 parent d25a9f3 commit eefa29f

3 files changed (+13, −3 lines)

doc/source/whatsnew/v0.17.0.txt (+2)

@@ -389,3 +389,5 @@ Bug Fixes
 - Reading "famafrench" data via ``DataReader`` results in HTTP 404 error because of the website url is changed (:issue:`10591`).

 - Bug in `read_msgpack` where DataFrame to decode has duplicate column names (:issue:`9618`)
+
+- Bug in ``io.common.get_filepath_or_buffer`` which caused reading of valid S3 files to fail if the bucket also contained keys for which the user does not have read permission (:issue:`10604`)

pandas/io/common.py (+1, −1)

@@ -151,7 +151,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
         except boto.exception.NoAuthHandlerFound:
             conn = boto.connect_s3(anon=True)

-        b = conn.get_bucket(parsed_url.netloc)
+        b = conn.get_bucket(parsed_url.netloc, validate=False)
         k = boto.s3.key.Key(b)
         k.key = parsed_url.path
         filepath_or_buffer = BytesIO(k.get_contents_as_string(

pandas/io/tests/test_parsers.py (+10, −2)

@@ -4075,16 +4075,24 @@ def test_parse_public_s3_bucket(self):
         nt.assert_false(df.empty)
         tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')), df)

+        # Read public file from bucket with not-public contents
+        df = pd.read_csv('s3://cant_get_it/tips.csv')
+        nt.assert_true(isinstance(df, pd.DataFrame))
+        nt.assert_false(df.empty)
+        tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')), df)
+
     @tm.network
     def test_s3_fails(self):
         import boto
         with tm.assertRaisesRegexp(boto.exception.S3ResponseError,
                                    'S3ResponseError: 404 Not Found'):
             pd.read_csv('s3://nyqpug/asdf.csv')

+        # Receive a permission error when trying to read a private bucket.
+        # It's irrelevant here that this isn't actually a table.
         with tm.assertRaisesRegexp(boto.exception.S3ResponseError,
-                                   'S3ResponseError: 403 Forbidden'):
-            pd.read_csv('s3://cant_get_it/tips.csv')
+                                   'S3ResponseError: 403 Forbidden'):
+            pd.read_csv('s3://cant_get_it/')


 def assert_same_values_and_dtype(res, exp):
