From eba6c5fb3fde51aae7c87cb8661ccf1ffd644995 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Fri, 19 Jun 2020 10:07:55 +0100 Subject: [PATCH 1/8] Public Bucket Read Test --- pandas/io/s3.py | 6 +++--- pandas/tests/io/test_s3.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pandas/io/s3.py b/pandas/io/s3.py index 329c861d2386a..c4d26eb81f7a7 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -16,8 +16,8 @@ def _strip_schema(url): return result.netloc + result.path -def get_fs(): - return s3fs.S3FileSystem(anon=False) +def get_fs(anon: bool = False): + return s3fs.S3FileSystem(anon=anon) def get_file_and_filesystem( @@ -38,7 +38,7 @@ def get_file_and_filesystem( # aren't valid for that bucket. # A NoCredentialsError is raised if you don't have creds # for that bucket. - fs = get_fs() + fs = get_fs(anon=True) file = fs.open(_strip_schema(filepath_or_buffer), mode) return file, fs diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 04c6979596eca..0fe451358549b 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -2,7 +2,10 @@ import pytest +import pandas.util._test_decorators as td + from pandas import read_csv +import pandas._testing as tm from pandas.io.common import is_s3_url @@ -23,3 +26,10 @@ def test_streaming_s3_objects(): for el in data: body = StreamingBody(BytesIO(el), content_length=len(el)) read_csv(body) + + +@tm.network +@td.skip_if_no("s3fs") +def test_read_without_creds_from_pub_bucket(): + result = read_csv("s3://gdelt-open-data/events/1981.csv", nrows=3) + assert len(result) == 3 From 523480c6b832f7dda0e958add35142a505c6d10d Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sat, 20 Jun 2020 00:02:15 +0100 Subject: [PATCH 2/8] Public Bucket Read Test --- pandas/tests/io/test_s3.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 0fe451358549b..285c11fcd2c3e 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -31,5 +31,7 @@ def test_streaming_s3_objects(): @tm.network @td.skip_if_no("s3fs") def test_read_without_creds_from_pub_bucket(): + # GH 34626 + # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt result = read_csv("s3://gdelt-open-data/events/1981.csv", nrows=3) assert len(result) == 3 From 5d6355546e7f64b7866205f1a3754568a43389fe Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sat, 20 Jun 2020 00:56:36 +0100 Subject: [PATCH 3/8] Handle Perm Error --- pandas/io/s3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/s3.py b/pandas/io/s3.py index c4d26eb81f7a7..bd91173faab76 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -31,7 +31,7 @@ def get_file_and_filesystem( fs = get_fs() try: file = fs.open(_strip_schema(filepath_or_buffer), mode) - except (FileNotFoundError, NoCredentialsError): + except (FileNotFoundError, NoCredentialsError, PermissionError): # boto3 has troubles when trying to access a public file # when credentialed... # An OSError is raised if you have credentials, but they From e308bf89f64a4006c4b0893d00dac27e913cfdfe Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Mon, 22 Jun 2020 22:32:15 +0100 Subject: [PATCH 4/8] Add with Creds Test --- pandas/tests/io/test_s3.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 285c11fcd2c3e..b0009dbcbbb84 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -1,4 +1,5 @@ from io import BytesIO +import os import pytest @@ -35,3 +36,21 @@ def test_read_without_creds_from_pub_bucket(): # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt result = read_csv("s3://gdelt-open-data/events/1981.csv", nrows=3) assert len(result) == 3 + + +@tm.network +@td.skip_if_no("s3fs") +def test_read_with_creds_from_pub_bucke(): + # Ensure we can read from a public bucket with credentials + # GH 34626 + # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt + + with tm.ensure_safe_environment_variables(): + # temporary workaround as moto fails for botocore >= 1.11 otherwise, + # see https://github.com/spulec/moto/issues/1924 & 1952 + os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") + os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") + df = read_csv( + "s3://gdelt-open-data/events/1981.csv", nrows=5, sep="\t", header=None, + ) + assert len(df) == 5 From 49ce4a8db4c7d9c4b7431508db0060a10a2789a4 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Thu, 9 Jul 2020 10:30:08 +0100 Subject: [PATCH 5/8] Allows Reads from Public Buckets --- pandas/io/common.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 51323c5ff3ef5..6421c9a87c530 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -201,10 +201,22 @@ def get_filepath_or_buffer( if filepath_or_buffer.startswith("s3n://"): filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://") fsspec = import_optional_dependency("fsspec") + from botocore.exceptions import NoCredentialsError + + try: + file_obj = fsspec.open( + filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) + ).open() + # GH 34626 Reads from Public Buckets without Credentials needs anon=True + except NoCredentialsError: + if storage_options is None: + storage_options = {"anon": True} + else: + storage_options["anon"] = True + file_obj = fsspec.open( + filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) + ).open() - file_obj = fsspec.open( - filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) - ).open() return file_obj, encoding, compression, True if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): From 03ee47216b8834d80bb813dbbdc6dac304ddfc77 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sun, 12 Jul 2020 20:29:37 +0100 Subject: [PATCH 6/8] Allows Reads from Public Buckets --- pandas/io/common.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 6421c9a87c530..0f6f6e2a56300 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -201,14 +201,23 @@ def get_filepath_or_buffer( if filepath_or_buffer.startswith("s3n://"): filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://") fsspec = import_optional_dependency("fsspec") - from botocore.exceptions import NoCredentialsError + + # If botocore is installed we fallback to reading with anon=True + # to allow reads from public buckets + try: + import_optional_dependency("botocore") + from botocore.exceptions import ClientError, NoCredentialsError + + err_types_to_retry_with_anon = (ClientError, NoCredentialsError) + except ImportError: + err_types_to_retry_with_anon = () try: file_obj = fsspec.open( filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) ).open() # GH 34626 Reads from Public Buckets without Credentials needs anon=True - except NoCredentialsError: + except err_types_to_retry_with_anon: if storage_options is None: storage_options = {"anon": True} else: From 2c9c755eea12f97cf59b9263ac9e1f0bb25a0705 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sun, 12 Jul 2020 22:47:37 +0100 Subject: [PATCH 7/8] Allows Reads from Public Buckets --- pandas/io/common.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 0f6f6e2a56300..b5f165e328f44 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -204,20 +204,25 @@ def get_filepath_or_buffer( # If botocore is installed we fallback to reading with anon=True # to allow reads from public buckets + err_types_to_retry_with_anon: List[Any] = [] try: import_optional_dependency("botocore") from botocore.exceptions import ClientError, NoCredentialsError - err_types_to_retry_with_anon = (ClientError, NoCredentialsError) + err_types_to_retry_with_anon = [ + ClientError, + NoCredentialsError, + PermissionError, + ] except ImportError: - err_types_to_retry_with_anon = () + pass try: file_obj = fsspec.open( filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) ).open() # GH 34626 Reads from Public Buckets without Credentials needs anon=True - except err_types_to_retry_with_anon: + except tuple(err_types_to_retry_with_anon): if storage_options is None: storage_options = {"anon": True} else: From 4d0c9803c6c2deea60cddeb4c22de7eefac6759e Mon Sep 17 00:00:00 2001 From: Ali McMaster Date: Tue, 14 Jul 2020 22:28:38 +0100 Subject: [PATCH 8/8] Update pandas/io/common.py Co-authored-by: Tom Augspurger --- pandas/io/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/common.py b/pandas/io/common.py index b5f165e328f44..32ec088f00d88 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -226,6 +226,8 @@ def get_filepath_or_buffer( if storage_options is None: storage_options = {"anon": True} else: + # don't mutate user input. + storage_options = dict(storage_options) storage_options["anon"] = True file_obj = fsspec.open( filepath_or_buffer, mode=mode or "rb", **(storage_options or {})