[test][ec2] Fix collision between parametrized keypair names (#875)

saimidu · web-flow · commit 499a81028ecd · 2021-02-23T16:26:32.000-08:00
diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py
@@ -454,11 +454,13 @@ def pytest_generate_tests(metafunc):
                             images_to_parametrize.append(image)
                         elif "eia_only" in metafunc.fixturenames and "eia" in image:
                             images_to_parametrize.append(image)
-                        elif ("cpu_only" not in metafunc.fixturenames and "gpu_only" not in metafunc.fixturenames
-                              and "eia_only" not in metafunc.fixturenames):
-                            images_to_parametrize.append(image)
                         elif "neuron_only" in metafunc.fixturenames and "neuron" in image:
                             images_to_parametrize.append(image)
+                        elif ("cpu_only" not in metafunc.fixturenames and
+                              "gpu_only" not in metafunc.fixturenames and
+                              "eia_only" not in metafunc.fixturenames and
+                              "neuron_only" not in metafunc.fixturenames):
+                            images_to_parametrize.append(image)
 
             # Remove all images tagged as "py2" if py3_only is a fixture
             if images_to_parametrize and "py3_only" in metafunc.fixturenames:
diff --git a/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py b/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py
@@ -1,6 +1,8 @@
 import os
 import pytest
 
+import test.test_utils.ec2 as ec2_utils
+
 from test import test_utils
 from test.test_utils import CONTAINER_TESTS_PREFIX, get_framework_and_version_from_tag
 from test.test_utils.ec2 import get_ec2_instance_type, execute_ec2_inference_test, get_ec2_accelerator_type
@@ -15,6 +17,13 @@
 MX_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.8xlarge", processor="gpu")
 MX_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.4xlarge", processor="cpu")
 MX_EC2_EIA_ACCELERATOR_TYPE = get_ec2_accelerator_type(default="eia1.large", processor="eia")
+MX_EC2_GPU_EIA_INSTANCE_TYPE = get_ec2_instance_type(
+    default="g3.8xlarge", processor="gpu", filter_function=ec2_utils.filter_not_heavy_instance_types,
+)
+MX_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
+    default="p3.2xlarge", processor="gpu", filter_function=ec2_utils.filter_only_single_gpu,
+)
+
 MX_TELEMETRY_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "test_mx_dlc_telemetry_test")
 
 
@@ -51,9 +60,8 @@ def test_ec2_mxnet_resnet_inference_eia_cpu(mxnet_inference_eia, ec2_connection,
 
 @pytest.mark.integration("elastic_inference")
 @pytest.mark.model(RESNET_EIA_MODEL)
-@pytest.mark.parametrize("ec2_instance_type", MX_EC2_GPU_INSTANCE_TYPE, indirect=True)
+@pytest.mark.parametrize("ec2_instance_type", MX_EC2_GPU_EIA_INSTANCE_TYPE, indirect=True)
 @pytest.mark.parametrize("ei_accelerator_type", MX_EC2_EIA_ACCELERATOR_TYPE, indirect=True)
-@pytest.mark.skipif(MX_EC2_GPU_INSTANCE_TYPE == ["p3dn.24xlarge"], reason="Skipping EIA test on p3dn instances")
 def test_ec2_mxnet_resnet_inferencei_eia_gpu(mxnet_inference_eia, ec2_connection, region, eia_only):
     model_name = RESNET_EIA_MODEL
     image_framework, image_framework_version = get_framework_and_version_from_tag(mxnet_inference_eia)
@@ -108,14 +116,14 @@ def run_ec2_mxnet_inference(image_uri, model_name, container_tag, ec2_connection
 @pytest.mark.flaky(reruns=3)
 @pytest.mark.integration("telemetry")
 @pytest.mark.model("N/A")
-@pytest.mark.parametrize("ec2_instance_type", ["p2.xlarge"], indirect=True)
+@pytest.mark.parametrize("ec2_instance_type", MX_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
 def test_mxnet_inference_telemetry_gpu(mxnet_inference, ec2_connection, gpu_only):
     execute_ec2_inference_test(ec2_connection, mxnet_inference, MX_TELEMETRY_CMD)
 
 
 @pytest.mark.flaky(reruns=3)
 @pytest.mark.integration("telemetry")
 @pytest.mark.model("N/A")
-@pytest.mark.parametrize("ec2_instance_type", ["c5.4xlarge"], indirect=True)
+@pytest.mark.parametrize("ec2_instance_type", MX_EC2_CPU_INSTANCE_TYPE, indirect=True)
 def test_mxnet_inference_telemetry_cpu(mxnet_inference, ec2_connection, cpu_only):
     execute_ec2_inference_test(ec2_connection, mxnet_inference, MX_TELEMETRY_CMD)
diff --git a/test/dlc_tests/ec2/mxnet/training/test_mxnet_training.py b/test/dlc_tests/ec2/mxnet/training/test_mxnet_training.py
@@ -2,6 +2,8 @@
 
 import pytest
 
+import test.test_utils.ec2 as ec2_utils
+
 from test.test_utils import CONTAINER_TESTS_PREFIX
 from test.test_utils.ec2 import execute_ec2_training_test, get_ec2_instance_type
 
@@ -16,6 +18,9 @@
 
 MX_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.8xlarge", processor="gpu")
 MX_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.4xlarge", processor="cpu")
+MX_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
+    default="p3.2xlarge", processor="gpu", filter_function=ec2_utils.filter_only_single_gpu,
+)
 
 
 @pytest.mark.integration("mxnet_sanity_test")
@@ -105,14 +110,14 @@ def test_mxnet_with_horovod_cpu(mxnet_training, ec2_connection, cpu_only):
 @pytest.mark.flaky(reruns=3)
 @pytest.mark.integration("telemetry")
 @pytest.mark.model("N/A")
-@pytest.mark.parametrize("ec2_instance_type", ["p2.xlarge"], indirect=True)
+@pytest.mark.parametrize("ec2_instance_type", MX_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
 def test_mxnet_telemetry_gpu(mxnet_training, ec2_connection, gpu_only):
     execute_ec2_training_test(ec2_connection, mxnet_training, MX_TELEMETRY_CMD)
 
 
 @pytest.mark.flaky(reruns=3)
 @pytest.mark.integration("telemetry")
 @pytest.mark.model("N/A")
-@pytest.mark.parametrize("ec2_instance_type", ["c5.4xlarge"], indirect=True)
+@pytest.mark.parametrize("ec2_instance_type", MX_EC2_CPU_INSTANCE_TYPE, indirect=True)
 def test_mxnet_telemetry_cpu(mxnet_training, ec2_connection, cpu_only):
     execute_ec2_training_test(ec2_connection, mxnet_training, MX_TELEMETRY_CMD)
diff --git a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py
@@ -2,6 +2,8 @@
 
 import pytest
 
+import test.test_utils.ec2 as ec2_utils
+
 from test import test_utils
 from test.test_utils import CONTAINER_TESTS_PREFIX, get_framework_and_version_from_tag
 from test.test_utils.ec2 import get_ec2_instance_type, execute_ec2_inference_test, get_ec2_accelerator_type
@@ -10,17 +12,25 @@
 
 PT_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.8xlarge", processor="gpu")
 PT_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.9xlarge", processor="cpu")
+PT_EC2_GPU_EIA_INSTANCE_TYPE = get_ec2_instance_type(
+    default="g3.8xlarge", processor="gpu", filter_function=ec2_utils.filter_not_heavy_instance_types,
+)
 PT_EC2_EIA_ACCELERATOR_TYPE = get_ec2_accelerator_type(default="eia1.large", processor="eia")
+PT_EC2_NEURON_INSTANCE_TYPE = get_ec2_instance_type(default="inf1.xlarge", processor="neuron")
+PT_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
+    default="p3.2xlarge", processor="gpu", filter_function=ec2_utils.filter_only_single_gpu,
+)
+
 PT_TELEMETRY_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "test_pt_dlc_telemetry_test")
-PT_EC2_NEURON_ACCELERATOR_TYPE = get_ec2_accelerator_type(default="inf1.xlarge", processor="neuron")
 
 
 @pytest.mark.model("resnet")
 @pytest.mark.parametrize("ec2_instance_ami", [test_utils.NEURON_AL2_DLAMI], indirect=True)
-@pytest.mark.parametrize("ec2_instance_type", PT_EC2_NEURON_ACCELERATOR_TYPE, indirect=True)
+@pytest.mark.parametrize("ec2_instance_type", PT_EC2_NEURON_INSTANCE_TYPE, indirect=True)
 def test_ec2_pytorch_inference_gpu(pytorch_inference, ec2_connection, region, gpu_only):
     ec2_pytorch_inference(pytorch_inference, "neuron", ec2_connection, region)
 
+
 @pytest.mark.model("densenet")
 @pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_INSTANCE_TYPE, indirect=True)
 def test_ec2_pytorch_inference_gpu(pytorch_inference, ec2_connection, region, gpu_only):
@@ -43,9 +53,8 @@ def test_ec2_pytorch_inference_eia_cpu(pytorch_inference_eia, ec2_connection, re
 
 @pytest.mark.integration("elastic_inference")
 @pytest.mark.model("resnet")
-@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_INSTANCE_TYPE, indirect=True)
+@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_EIA_INSTANCE_TYPE, indirect=True)
 @pytest.mark.parametrize("ei_accelerator_type", PT_EC2_EIA_ACCELERATOR_TYPE, indirect=True)
-@pytest.mark.skipif(PT_EC2_GPU_INSTANCE_TYPE == ["p3dn.24xlarge"], reason="Skipping EIA test on p3dn instances")
 def test_ec2_pytorch_inference_eia_gpu(pytorch_inference_eia, ec2_connection, region, eia_only):
     ec2_pytorch_inference(pytorch_inference_eia, "eia", ec2_connection, region)
 
@@ -96,13 +105,13 @@ def ec2_pytorch_inference(image_uri, processor, ec2_connection, region):
 
 @pytest.mark.integration("telemetry")
 @pytest.mark.model("N/A")
-@pytest.mark.parametrize("ec2_instance_type", ["p2.xlarge"], indirect=True)
+@pytest.mark.parametrize("ec2_instance_type", PT_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
 def test_pytorch_inference_telemetry_gpu(pytorch_inference, ec2_connection, gpu_only):
     execute_ec2_inference_test(ec2_connection, pytorch_inference, PT_TELEMETRY_CMD)
 
 
 @pytest.mark.integration("telemetry")
 @pytest.mark.model("N/A")
-@pytest.mark.parametrize("ec2_instance_type", ["c5.4xlarge"], indirect=True)
+@pytest.mark.parametrize("ec2_instance_type", PT_EC2_CPU_INSTANCE_TYPE, indirect=True)
 def test_pytorch_inference_telemetry_cpu(pytorch_inference, ec2_connection, cpu_only):
     execute_ec2_inference_test(ec2_connection, pytorch_inference, PT_TELEMETRY_CMD)
diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py
@@ -4,6 +4,8 @@
 
 import pytest
 
+import test.test_utils.ec2 as ec2_utils
+
 from test.test_utils import CONTAINER_TESTS_PREFIX, get_framework_and_version_from_tag, get_cuda_version_from_tag
 from test.test_utils.ec2 import execute_ec2_training_test, get_ec2_instance_type
 
@@ -19,6 +21,12 @@
 
 PT_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.8xlarge", processor="gpu")
 PT_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.9xlarge", processor="cpu")
+PT_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
+    default="p3.2xlarge", processor="gpu", filter_function=ec2_utils.filter_only_single_gpu,
+)
+PT_EC2_MULTI_GPU_INSTANCE_TYPE = get_ec2_instance_type(
+    default="g3.8xlarge", processor="gpu", filter_function=ec2_utils.filter_only_multi_gpu,
+)
 
 
 @pytest.mark.integration("pytorch_sanity_test")
@@ -63,10 +71,6 @@ def test_pytorch_linear_regression_cpu(pytorch_training, ec2_connection, cpu_onl
 @pytest.mark.model("gcn")
 @pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_INSTANCE_TYPE, indirect=True)
 def test_pytorch_train_dgl_gpu(pytorch_training, ec2_connection, gpu_only, py3_only):
-    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
-    image_cuda_version = get_cuda_version_from_tag(pytorch_training)
-    if Version(image_framework_version) == Version("1.6") and image_cuda_version == "cu110":
-        pytest.skip("DGL does not suport CUDA 11 for PyTorch 1.6")
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_DGL_CMD)
 
 
@@ -127,8 +131,7 @@ def test_nvapex(pytorch_training, ec2_connection, gpu_only):
 
 @pytest.mark.integration("amp")
 @pytest.mark.model("resnet50")
-@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_INSTANCE_TYPE, indirect=True)
-@pytest.mark.skipif(PT_EC2_GPU_INSTANCE_TYPE == ["g3.4xlarge"], reason="Skipping AMP DDP test on single gpu instance")
+@pytest.mark.parametrize("ec2_instance_type", PT_EC2_MULTI_GPU_INSTANCE_TYPE, indirect=True)
 def test_pytorch_amp(pytorch_training, ec2_connection, gpu_only):
     _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
     if Version(image_framework_version) < Version("1.6"):
@@ -138,13 +141,13 @@ def test_pytorch_amp(pytorch_training, ec2_connection, gpu_only):
 
 @pytest.mark.integration("telemetry")
 @pytest.mark.model("N/A")
-@pytest.mark.parametrize("ec2_instance_type", ["p2.xlarge"], indirect=True)
+@pytest.mark.parametrize("ec2_instance_type", PT_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
 def test_pytorch_telemetry_gpu(pytorch_training, ec2_connection, gpu_only):
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_TELEMETRY_CMD)
 
 
 @pytest.mark.integration("telemetry")
 @pytest.mark.model("N/A")
-@pytest.mark.parametrize("ec2_instance_type", ["c5.4xlarge"], indirect=True)
+@pytest.mark.parametrize("ec2_instance_type", PT_EC2_CPU_INSTANCE_TYPE, indirect=True)
 def test_pytorch_telemetry_cpu(pytorch_training, ec2_connection, cpu_only):
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_TELEMETRY_CMD)
diff --git a/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py b/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py
@@ -4,6 +4,8 @@
 
 import pytest
 
+import test.test_utils.ec2 as ec2_utils
+
 from test import test_utils
 from test.test_utils.ec2 import get_ec2_instance_type, get_ec2_accelerator_type
 from test.dlc_tests.conftest import LOGGER
@@ -15,7 +17,11 @@
 TF_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.8xlarge", processor="gpu")
 TF_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.4xlarge", processor="cpu")
 TF_EC2_EIA_ACCELERATOR_TYPE = get_ec2_accelerator_type(default="eia1.large", processor="eia")
-TF_EC2_NEURON_ACCELERATOR_TYPE = get_ec2_accelerator_type(default="inf1.xlarge", processor="neuron")
+TF_EC2_NEURON_ACCELERATOR_TYPE = get_ec2_instance_type(default="inf1.xlarge", processor="neuron")
+TF_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
+    default="p3.2xlarge", processor="gpu", filter_function=ec2_utils.filter_only_single_gpu,
+)
+
 
 @pytest.mark.model("mnist")
 @pytest.mark.parametrize("ec2_instance_type", TF_EC2_NEURON_ACCELERATOR_TYPE, indirect=True)
@@ -24,6 +30,7 @@
 def test_ec2_tensorflow_inference_neuron(tensorflow_inference_neuron, ec2_connection, region, neuron_only):
     run_ec2_tensorflow_inference(tensorflow_inference_neuron, ec2_connection, "8500", region)
 
+
 @pytest.mark.model("mnist")
 @pytest.mark.parametrize("ec2_instance_type", TF_EC2_GPU_INSTANCE_TYPE, indirect=True)
 def test_ec2_tensorflow_inference_gpu(tensorflow_inference, ec2_connection, region, gpu_only):
@@ -53,13 +60,13 @@ def test_ec2_tensorflow_inference_eia_gpu(tensorflow_inference_eia, ec2_connecti
 
 
 @pytest.mark.model("mnist")
-@pytest.mark.parametrize("ec2_instance_type", ["p2.xlarge"], indirect=True)
+@pytest.mark.parametrize("ec2_instance_type", TF_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
 def test_ec2_tensorflow_inference_gpu_telemetry(tensorflow_inference, ec2_connection, region, gpu_only):
     run_ec2_tensorflow_inference(tensorflow_inference, ec2_connection, "8500", region, True)
 
 
 @pytest.mark.model("mnist")
-@pytest.mark.parametrize("ec2_instance_type", ["c5.4xlarge"], indirect=True)
+@pytest.mark.parametrize("ec2_instance_type", TF_EC2_CPU_INSTANCE_TYPE, indirect=True)
 def test_ec2_tensorflow_inference_cpu_telemetry(tensorflow_inference, ec2_connection, region, cpu_only):
     run_ec2_tensorflow_inference(tensorflow_inference, ec2_connection, "8500", region, True)
 
@@ -76,10 +83,8 @@ def run_ec2_tensorflow_inference(image_uri, ec2_connection, grpc_port, region, t
     )
     
     is_neuron = "neuron" in image_uri
-         
 
     docker_cmd = "nvidia-docker" if "gpu" in image_uri else "docker"
-    docker_run_cmd = ""
     if is_neuron:
         docker_run_cmd = (
             f"{docker_cmd} run -id --name {container_name} -p {grpc_port}:8500 "
@@ -155,10 +160,10 @@ def host_setup_for_tensorflow_inference(serving_folder_path, framework_version,
             LOGGER.info(f"Host Model path {neuron_model_file_path}")
             ec2_connection.run(f"mkdir -p {neuron_model_file_path}")
             model_file_path = f"https://aws-dlc-sample-models.s3.amazonaws.com/{model_name}_neuron/1/saved_model.pb"
-            model_dwld = (
+            model_download = (
                 f"wget -O {neuron_model_file} {model_file_path} "
             )
-            ec2_connection.run(model_dwld)
+            ec2_connection.run(model_download)
     else:
         local_scripts_path = os.path.join("container_tests", "bin", "tensorflow_serving")
         ec2_connection.run(f"mkdir -p {serving_folder_path}")
diff --git a/test/dlc_tests/ec2/tensorflow/training/test_tensorflow_training.py b/test/dlc_tests/ec2/tensorflow/training/test_tensorflow_training.py
@@ -2,6 +2,8 @@
 import os
 import pytest
 
+import test.test_utils.ec2 as ec2_utils
+
 from test.test_utils import CONTAINER_TESTS_PREFIX, LOGGER, is_tf_version
 from test.test_utils.ec2 import execute_ec2_training_test, get_ec2_instance_type
 
@@ -20,7 +22,9 @@
 TF_DATASERVICE_TEST_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testDataservice")
 TF_DATASERVICE_DISTRIBUTE_TEST_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testDataserviceDistribute")
 
-TF_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="p3.2xlarge", processor="gpu")
+TF_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
+    default="p3.2xlarge", processor="gpu", filter_function=ec2_utils.filter_only_single_gpu,
+)
 TF_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.16xlarge", processor="gpu")
 TF_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c4.8xlarge", processor="cpu")
 
@@ -199,6 +203,7 @@ def test_tensorflow_dataservice_cpu(tensorflow_training, ec2_connection, tf24_an
 def test_tensorflow_dataservice_gpu(tensorflow_training, ec2_connection, tf24_and_above_only, gpu_only):
     run_data_service_test(ec2_connection, tensorflow_training, TF_DATASERVICE_TEST_CMD)
 
+
 # Testing Data Service Distributed mode on only one CPU instance
 # Skip test for TF 2.3 and below
 @pytest.mark.integration('tensorflow-dataservice-distribute-test')
diff --git a/test/test_utils/ec2.py b/test/test_utils/ec2.py
diff --git a/test/test_utils/test_reporting.py b/test/test_utils/test_reporting.py