Skip to content

Commit 499a810

Browse files
authored
[test][ec2] Fix collision between parametrized keypair names (#875)
1 parent 62ef4b4 commit 499a810

File tree

9 files changed

+109
-49
lines changed

9 files changed

+109
-49
lines changed

test/dlc_tests/conftest.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -454,11 +454,13 @@ def pytest_generate_tests(metafunc):
454454
images_to_parametrize.append(image)
455455
elif "eia_only" in metafunc.fixturenames and "eia" in image:
456456
images_to_parametrize.append(image)
457-
elif ("cpu_only" not in metafunc.fixturenames and "gpu_only" not in metafunc.fixturenames
458-
and "eia_only" not in metafunc.fixturenames):
459-
images_to_parametrize.append(image)
460457
elif "neuron_only" in metafunc.fixturenames and "neuron" in image:
461458
images_to_parametrize.append(image)
459+
elif ("cpu_only" not in metafunc.fixturenames and
460+
"gpu_only" not in metafunc.fixturenames and
461+
"eia_only" not in metafunc.fixturenames and
462+
"neuron_only" not in metafunc.fixturenames):
463+
images_to_parametrize.append(image)
462464

463465
# Remove all images tagged as "py2" if py3_only is a fixture
464466
if images_to_parametrize and "py3_only" in metafunc.fixturenames:

test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import os
22
import pytest
33

4+
import test.test_utils.ec2 as ec2_utils
5+
46
from test import test_utils
57
from test.test_utils import CONTAINER_TESTS_PREFIX, get_framework_and_version_from_tag
68
from test.test_utils.ec2 import get_ec2_instance_type, execute_ec2_inference_test, get_ec2_accelerator_type
@@ -15,6 +17,13 @@
1517
MX_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.8xlarge", processor="gpu")
1618
MX_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.4xlarge", processor="cpu")
1719
MX_EC2_EIA_ACCELERATOR_TYPE = get_ec2_accelerator_type(default="eia1.large", processor="eia")
20+
MX_EC2_GPU_EIA_INSTANCE_TYPE = get_ec2_instance_type(
21+
default="g3.8xlarge", processor="gpu", filter_function=ec2_utils.filter_not_heavy_instance_types,
22+
)
23+
MX_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
24+
default="p3.2xlarge", processor="gpu", filter_function=ec2_utils.filter_only_single_gpu,
25+
)
26+
1827
MX_TELEMETRY_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "test_mx_dlc_telemetry_test")
1928

2029

@@ -51,9 +60,8 @@ def test_ec2_mxnet_resnet_inference_eia_cpu(mxnet_inference_eia, ec2_connection,
5160

5261
@pytest.mark.integration("elastic_inference")
5362
@pytest.mark.model(RESNET_EIA_MODEL)
54-
@pytest.mark.parametrize("ec2_instance_type", MX_EC2_GPU_INSTANCE_TYPE, indirect=True)
63+
@pytest.mark.parametrize("ec2_instance_type", MX_EC2_GPU_EIA_INSTANCE_TYPE, indirect=True)
5564
@pytest.mark.parametrize("ei_accelerator_type", MX_EC2_EIA_ACCELERATOR_TYPE, indirect=True)
56-
@pytest.mark.skipif(MX_EC2_GPU_INSTANCE_TYPE == ["p3dn.24xlarge"], reason="Skipping EIA test on p3dn instances")
5765
def test_ec2_mxnet_resnet_inferencei_eia_gpu(mxnet_inference_eia, ec2_connection, region, eia_only):
5866
model_name = RESNET_EIA_MODEL
5967
image_framework, image_framework_version = get_framework_and_version_from_tag(mxnet_inference_eia)
@@ -108,14 +116,14 @@ def run_ec2_mxnet_inference(image_uri, model_name, container_tag, ec2_connection
108116
@pytest.mark.flaky(reruns=3)
109117
@pytest.mark.integration("telemetry")
110118
@pytest.mark.model("N/A")
111-
@pytest.mark.parametrize("ec2_instance_type", ["p2.xlarge"], indirect=True)
119+
@pytest.mark.parametrize("ec2_instance_type", MX_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
112120
def test_mxnet_inference_telemetry_gpu(mxnet_inference, ec2_connection, gpu_only):
113121
execute_ec2_inference_test(ec2_connection, mxnet_inference, MX_TELEMETRY_CMD)
114122

115123

116124
@pytest.mark.flaky(reruns=3)
117125
@pytest.mark.integration("telemetry")
118126
@pytest.mark.model("N/A")
119-
@pytest.mark.parametrize("ec2_instance_type", ["c5.4xlarge"], indirect=True)
127+
@pytest.mark.parametrize("ec2_instance_type", MX_EC2_CPU_INSTANCE_TYPE, indirect=True)
120128
def test_mxnet_inference_telemetry_cpu(mxnet_inference, ec2_connection, cpu_only):
121129
execute_ec2_inference_test(ec2_connection, mxnet_inference, MX_TELEMETRY_CMD)

test/dlc_tests/ec2/mxnet/training/test_mxnet_training.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import pytest
44

5+
import test.test_utils.ec2 as ec2_utils
6+
57
from test.test_utils import CONTAINER_TESTS_PREFIX
68
from test.test_utils.ec2 import execute_ec2_training_test, get_ec2_instance_type
79

@@ -16,6 +18,9 @@
1618

1719
MX_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.8xlarge", processor="gpu")
1820
MX_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.4xlarge", processor="cpu")
21+
MX_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
22+
default="p3.2xlarge", processor="gpu", filter_function=ec2_utils.filter_only_single_gpu,
23+
)
1924

2025

2126
@pytest.mark.integration("mxnet_sanity_test")
@@ -105,14 +110,14 @@ def test_mxnet_with_horovod_cpu(mxnet_training, ec2_connection, cpu_only):
105110
@pytest.mark.flaky(reruns=3)
106111
@pytest.mark.integration("telemetry")
107112
@pytest.mark.model("N/A")
108-
@pytest.mark.parametrize("ec2_instance_type", ["p2.xlarge"], indirect=True)
113+
@pytest.mark.parametrize("ec2_instance_type", MX_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
109114
def test_mxnet_telemetry_gpu(mxnet_training, ec2_connection, gpu_only):
110115
execute_ec2_training_test(ec2_connection, mxnet_training, MX_TELEMETRY_CMD)
111116

112117

113118
@pytest.mark.flaky(reruns=3)
114119
@pytest.mark.integration("telemetry")
115120
@pytest.mark.model("N/A")
116-
@pytest.mark.parametrize("ec2_instance_type", ["c5.4xlarge"], indirect=True)
121+
@pytest.mark.parametrize("ec2_instance_type", MX_EC2_CPU_INSTANCE_TYPE, indirect=True)
117122
def test_mxnet_telemetry_cpu(mxnet_training, ec2_connection, cpu_only):
118123
execute_ec2_training_test(ec2_connection, mxnet_training, MX_TELEMETRY_CMD)

test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import pytest
44

5+
import test.test_utils.ec2 as ec2_utils
6+
57
from test import test_utils
68
from test.test_utils import CONTAINER_TESTS_PREFIX, get_framework_and_version_from_tag
79
from test.test_utils.ec2 import get_ec2_instance_type, execute_ec2_inference_test, get_ec2_accelerator_type
@@ -10,17 +12,25 @@
1012

1113
PT_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.8xlarge", processor="gpu")
1214
PT_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.9xlarge", processor="cpu")
15+
PT_EC2_GPU_EIA_INSTANCE_TYPE = get_ec2_instance_type(
16+
default="g3.8xlarge", processor="gpu", filter_function=ec2_utils.filter_not_heavy_instance_types,
17+
)
1318
PT_EC2_EIA_ACCELERATOR_TYPE = get_ec2_accelerator_type(default="eia1.large", processor="eia")
19+
PT_EC2_NEURON_INSTANCE_TYPE = get_ec2_instance_type(default="inf1.xlarge", processor="neuron")
20+
PT_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
21+
default="p3.2xlarge", processor="gpu", filter_function=ec2_utils.filter_only_single_gpu,
22+
)
23+
1424
PT_TELEMETRY_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "test_pt_dlc_telemetry_test")
15-
PT_EC2_NEURON_ACCELERATOR_TYPE = get_ec2_accelerator_type(default="inf1.xlarge", processor="neuron")
1625

1726

1827
@pytest.mark.model("resnet")
1928
@pytest.mark.parametrize("ec2_instance_ami", [test_utils.NEURON_AL2_DLAMI], indirect=True)
20-
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_NEURON_ACCELERATOR_TYPE, indirect=True)
29+
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_NEURON_INSTANCE_TYPE, indirect=True)
2130
def test_ec2_pytorch_inference_gpu(pytorch_inference, ec2_connection, region, gpu_only):
2231
ec2_pytorch_inference(pytorch_inference, "neuron", ec2_connection, region)
2332

33+
2434
@pytest.mark.model("densenet")
2535
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_INSTANCE_TYPE, indirect=True)
2636
def test_ec2_pytorch_inference_gpu(pytorch_inference, ec2_connection, region, gpu_only):
@@ -43,9 +53,8 @@ def test_ec2_pytorch_inference_eia_cpu(pytorch_inference_eia, ec2_connection, re
4353

4454
@pytest.mark.integration("elastic_inference")
4555
@pytest.mark.model("resnet")
46-
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_INSTANCE_TYPE, indirect=True)
56+
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_EIA_INSTANCE_TYPE, indirect=True)
4757
@pytest.mark.parametrize("ei_accelerator_type", PT_EC2_EIA_ACCELERATOR_TYPE, indirect=True)
48-
@pytest.mark.skipif(PT_EC2_GPU_INSTANCE_TYPE == ["p3dn.24xlarge"], reason="Skipping EIA test on p3dn instances")
4958
def test_ec2_pytorch_inference_eia_gpu(pytorch_inference_eia, ec2_connection, region, eia_only):
5059
ec2_pytorch_inference(pytorch_inference_eia, "eia", ec2_connection, region)
5160

@@ -96,13 +105,13 @@ def ec2_pytorch_inference(image_uri, processor, ec2_connection, region):
96105

97106
@pytest.mark.integration("telemetry")
98107
@pytest.mark.model("N/A")
99-
@pytest.mark.parametrize("ec2_instance_type", ["p2.xlarge"], indirect=True)
108+
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
100109
def test_pytorch_inference_telemetry_gpu(pytorch_inference, ec2_connection, gpu_only):
101110
execute_ec2_inference_test(ec2_connection, pytorch_inference, PT_TELEMETRY_CMD)
102111

103112

104113
@pytest.mark.integration("telemetry")
105114
@pytest.mark.model("N/A")
106-
@pytest.mark.parametrize("ec2_instance_type", ["c5.4xlarge"], indirect=True)
115+
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_CPU_INSTANCE_TYPE, indirect=True)
107116
def test_pytorch_inference_telemetry_cpu(pytorch_inference, ec2_connection, cpu_only):
108117
execute_ec2_inference_test(ec2_connection, pytorch_inference, PT_TELEMETRY_CMD)

test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
import pytest
66

7+
import test.test_utils.ec2 as ec2_utils
8+
79
from test.test_utils import CONTAINER_TESTS_PREFIX, get_framework_and_version_from_tag, get_cuda_version_from_tag
810
from test.test_utils.ec2 import execute_ec2_training_test, get_ec2_instance_type
911

@@ -19,6 +21,12 @@
1921

2022
PT_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.8xlarge", processor="gpu")
2123
PT_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.9xlarge", processor="cpu")
24+
PT_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
25+
default="p3.2xlarge", processor="gpu", filter_function=ec2_utils.filter_only_single_gpu,
26+
)
27+
PT_EC2_MULTI_GPU_INSTANCE_TYPE = get_ec2_instance_type(
28+
default="g3.8xlarge", processor="gpu", filter_function=ec2_utils.filter_only_multi_gpu,
29+
)
2230

2331

2432
@pytest.mark.integration("pytorch_sanity_test")
@@ -63,10 +71,6 @@ def test_pytorch_linear_regression_cpu(pytorch_training, ec2_connection, cpu_onl
6371
@pytest.mark.model("gcn")
6472
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_INSTANCE_TYPE, indirect=True)
6573
def test_pytorch_train_dgl_gpu(pytorch_training, ec2_connection, gpu_only, py3_only):
66-
_, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
67-
image_cuda_version = get_cuda_version_from_tag(pytorch_training)
68-
if Version(image_framework_version) == Version("1.6") and image_cuda_version == "cu110":
69-
pytest.skip("DGL does not suport CUDA 11 for PyTorch 1.6")
7074
execute_ec2_training_test(ec2_connection, pytorch_training, PT_DGL_CMD)
7175

7276

@@ -127,8 +131,7 @@ def test_nvapex(pytorch_training, ec2_connection, gpu_only):
127131

128132
@pytest.mark.integration("amp")
129133
@pytest.mark.model("resnet50")
130-
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_INSTANCE_TYPE, indirect=True)
131-
@pytest.mark.skipif(PT_EC2_GPU_INSTANCE_TYPE == ["g3.4xlarge"], reason="Skipping AMP DDP test on single gpu instance")
134+
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_MULTI_GPU_INSTANCE_TYPE, indirect=True)
132135
def test_pytorch_amp(pytorch_training, ec2_connection, gpu_only):
133136
_, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
134137
if Version(image_framework_version) < Version("1.6"):
@@ -138,13 +141,13 @@ def test_pytorch_amp(pytorch_training, ec2_connection, gpu_only):
138141

139142
@pytest.mark.integration("telemetry")
140143
@pytest.mark.model("N/A")
141-
@pytest.mark.parametrize("ec2_instance_type", ["p2.xlarge"], indirect=True)
144+
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
142145
def test_pytorch_telemetry_gpu(pytorch_training, ec2_connection, gpu_only):
143146
execute_ec2_training_test(ec2_connection, pytorch_training, PT_TELEMETRY_CMD)
144147

145148

146149
@pytest.mark.integration("telemetry")
147150
@pytest.mark.model("N/A")
148-
@pytest.mark.parametrize("ec2_instance_type", ["c5.4xlarge"], indirect=True)
151+
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_CPU_INSTANCE_TYPE, indirect=True)
149152
def test_pytorch_telemetry_cpu(pytorch_training, ec2_connection, cpu_only):
150153
execute_ec2_training_test(ec2_connection, pytorch_training, PT_TELEMETRY_CMD)

test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
import pytest
66

7+
import test.test_utils.ec2 as ec2_utils
8+
79
from test import test_utils
810
from test.test_utils.ec2 import get_ec2_instance_type, get_ec2_accelerator_type
911
from test.dlc_tests.conftest import LOGGER
@@ -15,7 +17,11 @@
1517
TF_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.8xlarge", processor="gpu")
1618
TF_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.4xlarge", processor="cpu")
1719
TF_EC2_EIA_ACCELERATOR_TYPE = get_ec2_accelerator_type(default="eia1.large", processor="eia")
18-
TF_EC2_NEURON_ACCELERATOR_TYPE = get_ec2_accelerator_type(default="inf1.xlarge", processor="neuron")
20+
TF_EC2_NEURON_ACCELERATOR_TYPE = get_ec2_instance_type(default="inf1.xlarge", processor="neuron")
21+
TF_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
22+
default="p3.2xlarge", processor="gpu", filter_function=ec2_utils.filter_only_single_gpu,
23+
)
24+
1925

2026
@pytest.mark.model("mnist")
2127
@pytest.mark.parametrize("ec2_instance_type", TF_EC2_NEURON_ACCELERATOR_TYPE, indirect=True)
@@ -24,6 +30,7 @@
2430
def test_ec2_tensorflow_inference_neuron(tensorflow_inference_neuron, ec2_connection, region, neuron_only):
2531
run_ec2_tensorflow_inference(tensorflow_inference_neuron, ec2_connection, "8500", region)
2632

33+
2734
@pytest.mark.model("mnist")
2835
@pytest.mark.parametrize("ec2_instance_type", TF_EC2_GPU_INSTANCE_TYPE, indirect=True)
2936
def test_ec2_tensorflow_inference_gpu(tensorflow_inference, ec2_connection, region, gpu_only):
@@ -53,13 +60,13 @@ def test_ec2_tensorflow_inference_eia_gpu(tensorflow_inference_eia, ec2_connecti
5360

5461

5562
@pytest.mark.model("mnist")
56-
@pytest.mark.parametrize("ec2_instance_type", ["p2.xlarge"], indirect=True)
63+
@pytest.mark.parametrize("ec2_instance_type", TF_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
5764
def test_ec2_tensorflow_inference_gpu_telemetry(tensorflow_inference, ec2_connection, region, gpu_only):
5865
run_ec2_tensorflow_inference(tensorflow_inference, ec2_connection, "8500", region, True)
5966

6067

6168
@pytest.mark.model("mnist")
62-
@pytest.mark.parametrize("ec2_instance_type", ["c5.4xlarge"], indirect=True)
69+
@pytest.mark.parametrize("ec2_instance_type", TF_EC2_CPU_INSTANCE_TYPE, indirect=True)
6370
def test_ec2_tensorflow_inference_cpu_telemetry(tensorflow_inference, ec2_connection, region, cpu_only):
6471
run_ec2_tensorflow_inference(tensorflow_inference, ec2_connection, "8500", region, True)
6572

@@ -76,10 +83,8 @@ def run_ec2_tensorflow_inference(image_uri, ec2_connection, grpc_port, region, t
7683
)
7784

7885
is_neuron = "neuron" in image_uri
79-
8086

8187
docker_cmd = "nvidia-docker" if "gpu" in image_uri else "docker"
82-
docker_run_cmd = ""
8388
if is_neuron:
8489
docker_run_cmd = (
8590
f"{docker_cmd} run -id --name {container_name} -p {grpc_port}:8500 "
@@ -155,10 +160,10 @@ def host_setup_for_tensorflow_inference(serving_folder_path, framework_version,
155160
LOGGER.info(f"Host Model path {neuron_model_file_path}")
156161
ec2_connection.run(f"mkdir -p {neuron_model_file_path}")
157162
model_file_path = f"https://aws-dlc-sample-models.s3.amazonaws.com/{model_name}_neuron/1/saved_model.pb"
158-
model_dwld = (
163+
model_download = (
159164
f"wget -O {neuron_model_file} {model_file_path} "
160165
)
161-
ec2_connection.run(model_dwld)
166+
ec2_connection.run(model_download)
162167
else:
163168
local_scripts_path = os.path.join("container_tests", "bin", "tensorflow_serving")
164169
ec2_connection.run(f"mkdir -p {serving_folder_path}")

test/dlc_tests/ec2/tensorflow/training/test_tensorflow_training.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import os
33
import pytest
44

5+
import test.test_utils.ec2 as ec2_utils
6+
57
from test.test_utils import CONTAINER_TESTS_PREFIX, LOGGER, is_tf_version
68
from test.test_utils.ec2 import execute_ec2_training_test, get_ec2_instance_type
79

@@ -20,7 +22,9 @@
2022
TF_DATASERVICE_TEST_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testDataservice")
2123
TF_DATASERVICE_DISTRIBUTE_TEST_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testDataserviceDistribute")
2224

23-
TF_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="p3.2xlarge", processor="gpu")
25+
TF_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
26+
default="p3.2xlarge", processor="gpu", filter_function=ec2_utils.filter_only_single_gpu,
27+
)
2428
TF_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.16xlarge", processor="gpu")
2529
TF_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c4.8xlarge", processor="cpu")
2630

@@ -199,6 +203,7 @@ def test_tensorflow_dataservice_cpu(tensorflow_training, ec2_connection, tf24_an
199203
def test_tensorflow_dataservice_gpu(tensorflow_training, ec2_connection, tf24_and_above_only, gpu_only):
200204
run_data_service_test(ec2_connection, tensorflow_training, TF_DATASERVICE_TEST_CMD)
201205

206+
202207
# Testing Data Service Distributed mode on only one CPU instance
203208
# Skip test for TF 2.3 and below
204209
@pytest.mark.integration('tensorflow-dataservice-distribute-test')

0 commit comments

Comments
 (0)