Commit 8bab235

TST: Refactor s3 resource (#53803)
* TST: Refactor s3 resource
* Fix API usage
* boto3 instead of cli
* Refactor call
* Make cleanup and bucket names more unique
* Use external error raised
1 parent a7a5b13 commit 8bab235
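
The commit replaces the monolithic session-scoped s3_resource fixture, which pre-created shared "pandas-test" and "cant_get_it" buckets, with small composable fixtures: a bare boto3 resource, public and private buckets with uuid-suffixed names, and _with_data variants that upload the sample files. Tests derive URLs from bucket.name instead of hard-coding "pandas-test". A minimal sketch of the resulting usage pattern; the test body is illustrative, not part of the commit:

import pandas as pd
import pandas._testing as tm
import pytest


@pytest.mark.single_cpu
def test_csv_roundtrip(s3_public_bucket, s3so):
    # s3_public_bucket yields a boto3 Bucket whose name carries a uuid4
    # suffix, so parallel runs cannot collide on bucket names.
    url = f"s3://{s3_public_bucket.name}/df.csv"
    df = pd.DataFrame({"a": [1, 2, 3]})
    df.to_csv(url, index=False, storage_options=s3so)
    result = pd.read_csv(url, storage_options=s3so)
    tm.assert_frame_equal(result, df)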

9 files changed: +180 -132 lines changed

pandas/tests/io/conftest.py

+58 -54

@@ -1,6 +1,7 @@
 import shlex
 import subprocess
 import time
+import uuid
 
 import pytest
 
@@ -54,13 +55,13 @@ def s3so(worker_id):
     return {"client_kwargs": {"endpoint_url": url}}
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function" if is_ci_environment() else "session")
 def monkeysession():
     with pytest.MonkeyPatch.context() as mp:
         yield mp
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function" if is_ci_environment() else "session")
 def s3_base(worker_id, monkeysession):
     """
     Fixture for mocking S3 interaction.
@@ -123,24 +124,67 @@ def s3_base(worker_id, monkeysession):
 
 
 @pytest.fixture
-def s3_resource(s3_base, tips_file, jsonl_file, feather_file):
-    """
-    Sets up S3 bucket with contents
+def s3_resource(s3_base):
+    import boto3
+
+    s3 = boto3.resource("s3", endpoint_url=s3_base)
+    return s3
+
 
-    The primary bucket name is "pandas-test". The following datasets
+
+@pytest.fixture
+def s3_public_bucket(s3_resource):
+    bucket = s3_resource.Bucket(f"pandas-test-{uuid.uuid4()}")
+    bucket.create()
+    yield bucket
+    bucket.objects.delete()
+    bucket.delete()
+
+
+@pytest.fixture
+def s3_public_bucket_with_data(s3_public_bucket, tips_file, jsonl_file, feather_file):
+    """
+    The following datasets
     are loaded.
 
     - tips.csv
     - tips.csv.gz
     - tips.csv.bz2
     - items.jsonl
+    """
+    test_s3_files = [
+        ("tips#1.csv", tips_file),
+        ("tips.csv", tips_file),
+        ("tips.csv.gz", tips_file + ".gz"),
+        ("tips.csv.bz2", tips_file + ".bz2"),
+        ("items.jsonl", jsonl_file),
+        ("simple_dataset.feather", feather_file),
+    ]
+    for s3_key, file_name in test_s3_files:
+        with open(file_name, "rb") as f:
+            s3_public_bucket.put_object(Key=s3_key, Body=f)
+    return s3_public_bucket
+
+
+@pytest.fixture
+def s3_private_bucket(s3_resource):
+    bucket = s3_resource.Bucket(f"cant_get_it-{uuid.uuid4()}")
+    bucket.create(ACL="private")
+    yield bucket
+    bucket.objects.delete()
+    bucket.delete()
+
 
-    A private bucket "cant_get_it" is also created. The boto3 s3 resource
-    is yielded by the fixture.
+@pytest.fixture
+def s3_private_bucket_with_data(s3_private_bucket, tips_file, jsonl_file, feather_file):
     """
-    import boto3
-    import s3fs
+    The following datasets
+    are loaded.
 
+    - tips.csv
+    - tips.csv.gz
+    - tips.csv.bz2
+    - items.jsonl
+    """
     test_s3_files = [
         ("tips#1.csv", tips_file),
         ("tips.csv", tips_file),
@@ -149,50 +193,10 @@ def s3_resource(s3_base, tips_file, jsonl_file, feather_file):
         ("items.jsonl", jsonl_file),
         ("simple_dataset.feather", feather_file),
     ]
-
-    def add_tips_files(bucket_name):
-        for s3_key, file_name in test_s3_files:
-            with open(file_name, "rb") as f:
-                cli.put_object(Bucket=bucket_name, Key=s3_key, Body=f)
-
-    bucket = "pandas-test"
-    conn = boto3.resource("s3", endpoint_url=s3_base)
-    cli = boto3.client("s3", endpoint_url=s3_base)
-
-    try:
-        cli.create_bucket(Bucket=bucket)
-    except Exception:
-        # OK is bucket already exists
-        pass
-    try:
-        cli.create_bucket(Bucket="cant_get_it", ACL="private")
-    except Exception:
-        # OK is bucket already exists
-        pass
-    timeout = 2
-    while not cli.list_buckets()["Buckets"] and timeout > 0:
-        time.sleep(0.1)
-        timeout -= 0.1
-
-    add_tips_files(bucket)
-    add_tips_files("cant_get_it")
-    s3fs.S3FileSystem.clear_instance_cache()
-    yield conn
-
-    s3 = s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_base})
-
-    try:
-        s3.rm(bucket, recursive=True)
-    except Exception:
-        pass
-    try:
-        s3.rm("cant_get_it", recursive=True)
-    except Exception:
-        pass
-    timeout = 2
-    while cli.list_buckets()["Buckets"] and timeout > 0:
-        time.sleep(0.1)
-        timeout -= 0.1
+    for s3_key, file_name in test_s3_files:
+        with open(file_name, "rb") as f:
+            s3_private_bucket.put_object(Key=s3_key, Body=f)
+    return s3_private_bucket
 
 
 _compression_formats_params = [
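
The scope switch on monkeysession and s3_base gives CI runs a fresh fixture per test while local runs keep one per session; the scope expression is evaluated once, at collection time. A sketch of the pattern in isolation, assuming is_ci_environment is the helper pandas imports from pandas.compat:

import pytest

from pandas.compat import is_ci_environment


# Per-test teardown on CI for isolation; one shared instance per
# session everywhere else for speed.
@pytest.fixture(scope="function" if is_ci_environment() else "session")
def monkeysession():
    with pytest.MonkeyPatch.context() as mp:
        yield mp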

pandas/tests/io/excel/test_readers.py

+8 -8

@@ -894,29 +894,29 @@ def test_read_from_http_url(self, read_ext):
 
     @td.skip_if_not_us_locale
     @pytest.mark.single_cpu
-    def test_read_from_s3_url(self, read_ext, s3_resource, s3so):
-        # Bucket "pandas-test" created in tests/io/conftest.py
+    def test_read_from_s3_url(self, read_ext, s3_public_bucket, s3so):
+        # Bucket created in tests/io/conftest.py
         with open("test1" + read_ext, "rb") as f:
-            s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f)
+            s3_public_bucket.put_object(Key="test1" + read_ext, Body=f)
 
-        url = "s3://pandas-test/test1" + read_ext
+        url = f"s3://{s3_public_bucket.name}/test1" + read_ext
 
         url_table = pd.read_excel(url, storage_options=s3so)
         local_table = pd.read_excel("test1" + read_ext)
         tm.assert_frame_equal(url_table, local_table)
 
     @pytest.mark.single_cpu
-    def test_read_from_s3_object(self, read_ext, s3_resource, s3so):
+    def test_read_from_s3_object(self, read_ext, s3_public_bucket, s3so):
         # GH 38788
-        # Bucket "pandas-test" created in tests/io/conftest.py
+        # Bucket created in tests/io/conftest.py
        with open("test1" + read_ext, "rb") as f:
-            s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f)
+            s3_public_bucket.put_object(Key="test1" + read_ext, Body=f)
 
         import s3fs
 
         s3 = s3fs.S3FileSystem(**s3so)
 
-        with s3.open("s3://pandas-test/test1" + read_ext) as f:
+        with s3.open(f"s3://{s3_public_bucket.name}/test1" + read_ext) as f:
             url_table = pd.read_excel(f)
 
         local_table = pd.read_excel("test1" + read_ext)
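
Both tests now take the bucket name from the fixture, either building an s3:// URL for pd.read_excel(..., storage_options=...) or opening the object through s3fs and passing the file handle. A standalone sketch of the handle path, with a placeholder endpoint and bucket standing in for the fixtures:

import pandas as pd
import s3fs

# Placeholder endpoint and bucket; the tests get these from s3so and
# s3_public_bucket instead.
s3so = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555"}}
fs = s3fs.S3FileSystem(**s3so)

with fs.open("s3://my-bucket/test1.xlsx") as f:
    df = pd.read_excel(f)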

pandas/tests/io/excel/test_style.py

+3 -5

@@ -274,18 +274,16 @@ def custom_converter(css):
 
 @pytest.mark.single_cpu
 @td.skip_if_not_us_locale
-def test_styler_to_s3(s3_resource, s3so):
+def test_styler_to_s3(s3_public_bucket, s3so):
     # GH#46381
 
-    mock_bucket_name, target_file = "pandas-test", "test.xlsx"
+    mock_bucket_name, target_file = s3_public_bucket.name, "test.xlsx"
     df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
     styler = df.style.set_sticky(axis="index")
     styler.to_excel(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so)
     timeout = 5
     while True:
-        if target_file in (
-            obj.key for obj in s3_resource.Bucket("pandas-test").objects.all()
-        ):
+        if target_file in (obj.key for obj in s3_public_bucket.objects.all()):
            break
         time.sleep(0.1)
         timeout -= 0.1
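
A write through the mocked endpoint may not be visible in the next listing, so the test polls objects.all() instead of asserting immediately; the hunk's context ends before the timeout is consulted. A hypothetical helper capturing the same wait-for-key pattern:

import time


def wait_for_key(bucket, key, timeout=5.0, interval=0.1):
    # Poll the boto3 Bucket listing until the key appears or time runs out.
    # Illustrative only; the tests inline this loop.
    while timeout > 0:
        if key in (obj.key for obj in bucket.objects.all()):
            return
        time.sleep(interval)
        timeout -= interval
    raise TimeoutError(f"{key} never appeared in bucket {bucket.name}")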

pandas/tests/io/json/test_compression.py

+6 -4

@@ -41,17 +41,19 @@ def test_read_zipped_json(datapath):
 
 @td.skip_if_not_us_locale
 @pytest.mark.single_cpu
-def test_with_s3_url(compression, s3_resource, s3so):
-    # Bucket "pandas-test" created in tests/io/conftest.py
+def test_with_s3_url(compression, s3_public_bucket, s3so):
+    # Bucket created in tests/io/conftest.py
     df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
 
     with tm.ensure_clean() as path:
         df.to_json(path, compression=compression)
         with open(path, "rb") as f:
-            s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f)
+            s3_public_bucket.put_object(Key="test-1", Body=f)
 
     roundtripped_df = pd.read_json(
-        "s3://pandas-test/test-1", compression=compression, storage_options=s3so
+        f"s3://{s3_public_bucket.name}/test-1",
+        compression=compression,
+        storage_options=s3so,
     )
     tm.assert_frame_equal(df, roundtripped_df)

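The roundtrip writes a compressed file locally, uploads the raw bytes, and reads them back through an f-string URL built from the bucket name, exercising pandas' compression handling over S3. A local-only sketch of the same roundtrip, dropping S3 and pinning gzip for concreteness:

from io import StringIO

import pandas as pd
import pandas._testing as tm

df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))

# Write compressed, read back, and compare, mirroring the S3 test shape.
with tm.ensure_clean() as path:
    df.to_json(path, compression="gzip")
    roundtripped_df = pd.read_json(path, compression="gzip")

tm.assert_frame_equal(df, roundtripped_df)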
pandas/tests/io/json/test_pandas.py

+7 -7

@@ -1267,11 +1267,13 @@ def test_read_inline_jsonl(self):
 
     @pytest.mark.single_cpu
     @td.skip_if_not_us_locale
-    def test_read_s3_jsonl(self, s3_resource, s3so):
+    def test_read_s3_jsonl(self, s3_public_bucket_with_data, s3so):
         # GH17200
 
         result = read_json(
-            "s3n://pandas-test/items.jsonl", lines=True, storage_options=s3so
+            f"s3n://{s3_public_bucket_with_data.name}/items.jsonl",
+            lines=True,
+            storage_options=s3so,
         )
         expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
         tm.assert_frame_equal(result, expected)
@@ -1843,16 +1845,14 @@ def test_json_multiindex(self, dataframe, expected):
         assert result == expected
 
     @pytest.mark.single_cpu
-    def test_to_s3(self, s3_resource, s3so):
+    def test_to_s3(self, s3_public_bucket, s3so):
         # GH 28375
-        mock_bucket_name, target_file = "pandas-test", "test.json"
+        mock_bucket_name, target_file = s3_public_bucket.name, "test.json"
         df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
         df.to_json(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so)
         timeout = 5
         while True:
-            if target_file in (
-                obj.key for obj in s3_resource.Bucket("pandas-test").objects.all()
-            ):
+            if target_file in (obj.key for obj in s3_public_bucket.objects.all()):
                 break
             time.sleep(0.1)
             timeout -= 0.1
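
test_read_s3_jsonl keeps the legacy s3n:// scheme from GH 17200 but takes the bucket from the data-bearing fixture, whose items.jsonl holds two identical records. A small local sketch of the same lines=True parse:

from io import StringIO

import pandas as pd

# Two newline-delimited records, mirroring the items.jsonl fixture file.
data = '{"a": 1, "b": 2}\n{"a": 1, "b": 2}\n'
df = pd.read_json(StringIO(data), lines=True)
# df equals DataFrame([[1, 2], [1, 2]], columns=["a", "b"])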
