Skip to content

Commit 32ed15d

Browse files
authored
BUG/TST: Read from Public s3 Bucket Without Creds (#34877)
* Public Bucket Read Test
1 parent 7b718ba commit 32ed15d

File tree

2 files changed

+62
-3
lines changed

2 files changed

+62
-3
lines changed

pandas/io/common.py

+31-3
Original file line numberDiff line numberDiff line change
@@ -202,9 +202,37 @@ def get_filepath_or_buffer(
202202
filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://")
203203
fsspec = import_optional_dependency("fsspec")
204204

205-
file_obj = fsspec.open(
206-
filepath_or_buffer, mode=mode or "rb", **(storage_options or {})
207-
).open()
205+
# If botocore is installed, we fall back to reading with anon=True
206+
# to allow reads from public buckets
207+
err_types_to_retry_with_anon: List[Any] = []
208+
try:
209+
import_optional_dependency("botocore")
210+
from botocore.exceptions import ClientError, NoCredentialsError
211+
212+
err_types_to_retry_with_anon = [
213+
ClientError,
214+
NoCredentialsError,
215+
PermissionError,
216+
]
217+
except ImportError:
218+
pass
219+
220+
try:
221+
file_obj = fsspec.open(
222+
filepath_or_buffer, mode=mode or "rb", **(storage_options or {})
223+
).open()
224+
# GH 34626: reading from public buckets without credentials needs anon=True
225+
except tuple(err_types_to_retry_with_anon):
226+
if storage_options is None:
227+
storage_options = {"anon": True}
228+
else:
229+
# don't mutate user input.
230+
storage_options = dict(storage_options)
231+
storage_options["anon"] = True
232+
file_obj = fsspec.open(
233+
filepath_or_buffer, mode=mode or "rb", **(storage_options or {})
234+
).open()
235+
208236
return file_obj, encoding, compression, True
209237

210238
if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):

pandas/tests/io/test_s3.py

+31
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
from io import BytesIO
2+
import os
23

34
import pytest
45

6+
import pandas.util._test_decorators as td
7+
58
from pandas import read_csv
9+
import pandas._testing as tm
610

711

812
def test_streaming_s3_objects():
@@ -15,3 +19,30 @@ def test_streaming_s3_objects():
1519
for el in data:
1620
body = StreamingBody(BytesIO(el), content_length=len(el))
1721
read_csv(body)
22+
23+
24+
@tm.network
25+
@td.skip_if_no("s3fs")
26+
def test_read_without_creds_from_pub_bucket():
27+
# GH 34626
28+
# Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt
29+
result = read_csv("s3://gdelt-open-data/events/1981.csv", nrows=3)
30+
assert len(result) == 3
31+
32+
33+
@tm.network
34+
@td.skip_if_no("s3fs")
35+
def test_read_with_creds_from_pub_bucke():
36+
# Ensure we can read from a public bucket with credentials
37+
# GH 34626
38+
# Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt
39+
40+
with tm.ensure_safe_environment_variables():
41+
# temporary workaround as moto fails for botocore >= 1.11 otherwise,
42+
# see https://github.com/spulec/moto/issues/1924 & 1952
43+
os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key")
44+
os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret")
45+
df = read_csv(
46+
"s3://gdelt-open-data/events/1981.csv", nrows=5, sep="\t", header=None,
47+
)
48+
assert len(df) == 5

0 commit comments

Comments
 (0)