Skip to content

Commit 3fdf728

Browse files
fix: use ml.p3 or ml.p2 GPU instances depending on availability in the region
1 parent e3398d9 commit 3fdf728

File tree

5 files changed

+68
-12
lines changed

5 files changed

+68
-12
lines changed

tests/conftest.py

+33-4
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,25 @@
3737
"me-south-1",
3838
]
3939

40+
NO_P3_REGIONS = [
41+
"af-south-1",
42+
"ap-east-1",
43+
"ap-southeast-1", # it has p3, but not enough
44+
"ap-southeast-2", # it has p3, but not enough
45+
"ca-central-1", # it has p3, but not enough
46+
"eu-central-1", # it has p3, but not enough
47+
"eu-north-1",
48+
"eu-west-2", # it has p3, but not enough
49+
"eu-west-3",
50+
"eu-south-1",
51+
"me-south-1",
52+
"sa-east-1",
53+
"us-west-1",
54+
"ap-northeast-1", # it has p3, but not enough
55+
"ap-south-1",
56+
"ap-northeast-2", # it has p3, but not enough
57+
]
58+
4059
NO_T2_REGIONS = ["eu-north-1", "ap-east-1", "me-south-1"]
4160

4261
FRAMEWORKS_FOR_GENERATED_VERSION_FIXTURES = (
@@ -361,9 +380,13 @@ def cpu_instance_type(sagemaker_session, request):
361380
return "ml.m4.xlarge"
362381

363382

364-
@pytest.fixture(scope="module")
365-
def gpu_instance_type(request):
366-
return "ml.p3.2xlarge"
383+
@pytest.fixture(scope="session")
384+
def gpu_instance_type(sagemaker_session, request):
385+
region = sagemaker_session.boto_session.region_name
386+
if region in NO_P3_REGIONS:
387+
return "ml.p2.xlarge"
388+
else:
389+
return "ml.p3.2xlarge"
367390

368391

369392
@pytest.fixture(scope="session")
@@ -405,10 +428,16 @@ def pytest_generate_tests(metafunc):
405428

406429
params = [cpu_instance_type]
407430
if not (
431+
region in tests.integ.HOSTING_NO_P3_REGIONS
432+
or region in tests.integ.TRAINING_NO_P3_REGIONS
433+
):
434+
params.append("ml.p3.2xlarge")
435+
elif not (
408436
region in tests.integ.HOSTING_NO_P2_REGIONS
409437
or region in tests.integ.TRAINING_NO_P2_REGIONS
410438
):
411-
params.append("ml.p3.2xlarge")
439+
params.append("ml.p2.xlarge")
440+
412441
metafunc.parametrize("instance_type", params, scope="session")
413442

414443
_generate_all_framework_version_fixtures(metafunc)

tests/integ/__init__.py

+18
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,24 @@
6666
"sa-east-1",
6767
"us-west-1",
6868
]
69+
TRAINING_NO_P3_REGIONS = [
70+
"af-south-1",
71+
"ap-east-1",
72+
"ap-southeast-1", # it has p3, but not enough
73+
"ap-southeast-2", # it has p3, but not enough
74+
"ca-central-1", # it has p3, but not enough
75+
"eu-central-1", # it has p3, but not enough
76+
"eu-north-1",
77+
"eu-west-2", # it has p3, but not enough
78+
"eu-west-3",
79+
"eu-south-1",
80+
"me-south-1",
81+
"sa-east-1",
82+
"us-west-1",
83+
"ap-northeast-1", # it has p3, but not enough
84+
"ap-south-1",
85+
"ap-northeast-2", # it has p3, but not enough
86+
]
6987

7088
# EI is currently only supported in the following regions
7189
# regions were derived from https://aws.amazon.com/machine-learning/elastic-inference/pricing/

tests/integ/test_horovod.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -47,19 +47,22 @@ def test_hvd_cpu(
4747

4848
@pytest.mark.release
4949
@pytest.mark.skipif(
50-
integ.test_region() in integ.TRAINING_NO_P2_REGIONS, reason="no ml.p2 instances in this region"
50+
integ.test_region() in integ.TRAINING_NO_P2_REGIONS
51+
and integ.test_region() in integ.TRAINING_NO_P3_REGIONS,
52+
reason="no ml.p2 or ml.p3 instances in this region",
5153
)
5254
def test_hvd_gpu(
5355
sagemaker_session,
5456
tensorflow_training_latest_version,
5557
tensorflow_training_latest_py_version,
58+
gpu_instance_type,
5659
tmpdir,
5760
):
5861
_create_and_fit_estimator(
5962
sagemaker_session,
6063
tensorflow_training_latest_version,
6164
tensorflow_training_latest_py_version,
62-
"ml.p3.2xlarge",
65+
gpu_instance_type,
6366
tmpdir,
6467
)
6568

tests/integ/test_horovod_mx.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,9 @@ def test_hvd_cpu(
4747

4848
@pytest.mark.release
4949
@pytest.mark.skipif(
50-
integ.test_region() in integ.TRAINING_NO_P2_REGIONS, reason="no ml.p2 instances in this region"
50+
integ.test_region() in integ.TRAINING_NO_P2_REGIONS
51+
and integ.test_region() in integ.TRAINING_NO_P3_REGIONS,
52+
reason="no ml.p2 or ml.p3 instances in this region",
5153
)
5254
def test_hvd_gpu(
5355
mxnet_training_latest_version,

tests/integ/test_huggingface.py

+9-5
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@
2828

2929
@pytest.mark.release
3030
@pytest.mark.skipif(
31-
integ.test_region() in integ.TRAINING_NO_P2_REGIONS,
32-
reason="no ml.p2 instances in this region",
31+
integ.test_region() in integ.TRAINING_NO_P2_REGIONS
32+
and integ.test_region() in integ.TRAINING_NO_P3_REGIONS,
33+
reason="no ml.p2 or ml.p3 instances in this region",
3334
)
3435
def test_framework_processing_job_with_deps(
3536
sagemaker_session,
@@ -63,8 +64,9 @@ def test_framework_processing_job_with_deps(
6364

6465
@pytest.mark.release
6566
@pytest.mark.skipif(
66-
integ.test_region() in integ.TRAINING_NO_P2_REGIONS,
67-
reason="no ml.p2 instances in this region",
67+
integ.test_region() in integ.TRAINING_NO_P2_REGIONS
68+
and integ.test_region() in integ.TRAINING_NO_P3_REGIONS,
69+
reason="no ml.p2 or ml.p3 instances in this region",
6870
)
6971
def test_huggingface_training(
7072
sagemaker_session,
@@ -108,7 +110,9 @@ def test_huggingface_training(
108110

109111
@pytest.mark.release
110112
@pytest.mark.skipif(
111-
integ.test_region() in integ.TRAINING_NO_P2_REGIONS, reason="no ml.p2 instances in this region"
113+
integ.test_region() in integ.TRAINING_NO_P2_REGIONS
114+
and integ.test_region() in integ.TRAINING_NO_P3_REGIONS,
115+
reason="no ml.p2 or ml.p3 instances in this region",
112116
)
113117
def test_huggingface_training_tf(
114118
sagemaker_session,

0 commit comments

Comments
 (0)