
Commit 25f594d

Add local test to check that all GPU IDs are returned on multi-GPU host
1 parent f900670 commit 25f594d

12 files changed: +112 -32 lines


buildspec.yml

Lines changed: 1 addition & 1 deletion (the removed and added lines are identical in this view, so the change is whitespace-only)

@@ -110,4 +110,4 @@ phases:
           aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG;
         done
 
-      # - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_EIA_TAG
+      # - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_EIA_TAG
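The cleanup step above removes tagged images from ECR with the AWS CLI. For reference, the equivalent call through boto3 looks roughly like this; the region, repository name, and tag below are placeholders, not values from this commit:

import boto3

# Delete one tagged image from an ECR repository, mirroring the
# `aws ecr batch-delete-image` CLI call in buildspec.yml above.
ecr = boto3.client("ecr", region_name="us-west-2")  # placeholder region
ecr.batch_delete_image(
    repositoryName="my-ecr-repo",            # placeholder for $ECR_REPO
    imageIds=[{"imageTag": "dlc-eia-tag"}],   # placeholder for $DLC_EIA_TAG
)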

test/integration/__init__.py

Lines changed: 4 additions & 9 deletions

@@ -20,6 +20,7 @@
 mnist_path = os.path.join(resources_path, 'mnist')
 resnet18_path = os.path.join(resources_path, 'resnet18')
 mme_path = os.path.join(resources_path, 'mme')
+model_gpu_context_dir = os.path.join(resources_path, 'model_gpu_context')
 data_dir = os.path.join(mnist_path, 'data')
 training_dir = os.path.join(data_dir, 'training')
 cpu_sub_dir = 'model_cpu'
@@ -44,21 +45,17 @@
 
 model_cpu_1d_dir = os.path.join(model_cpu_dir, '1d')
 mnist_1d_script = os.path.join(model_cpu_1d_dir, code_sub_dir, 'mnist_1d.py')
-mnist_1d_requirements = os.path.join(model_cpu_1d_dir, code_sub_dir, 'requirements.txt')
 model_cpu_1d_tar = file_utils.make_tarfile(mnist_1d_script,
                                            os.path.join(model_cpu_1d_dir, "torch_model.pth"),
                                            model_cpu_1d_dir,
-                                           script_path="code",
-                                           requirements=mnist_1d_requirements)
+                                           script_path="code")
 
 model_gpu_dir = os.path.join(mnist_path, gpu_sub_dir)
 mnist_gpu_script = os.path.join(model_gpu_dir, code_sub_dir, 'mnist.py')
-mnist_gpu_requirements = os.path.join(model_gpu_dir, code_sub_dir, 'requirements.txt')
 model_gpu_tar = file_utils.make_tarfile(mnist_gpu_script,
                                         os.path.join(model_gpu_dir, "torch_model.pth"),
                                         model_gpu_dir,
-                                        script_path="code",
-                                        requirements=mnist_gpu_requirements)
+                                        script_path="code")
 
 model_eia_dir = os.path.join(mnist_path, eia_sub_dir)
 mnist_eia_script = os.path.join(model_eia_dir, 'mnist.py')
@@ -73,13 +70,11 @@
                                              model_inductor_dir)
 
 call_model_fn_once_script = os.path.join(model_cpu_dir, code_sub_dir, 'call_model_fn_once.py')
-call_model_fn_once_requirements = os.path.join(model_cpu_dir, code_sub_dir, 'requirements.txt')
 call_model_fn_once_tar = file_utils.make_tarfile(call_model_fn_once_script,
                                                  os.path.join(model_cpu_dir, "torch_model.pth"),
                                                  model_cpu_dir,
                                                  "model_call_model_fn_once.tar.gz",
-                                                 script_path="code",
-                                                 requirements=call_model_fn_once_requirements)
+                                                 script_path="code")
 
 default_model_dir = os.path.join(resnet18_path, default_sub_dir)
 default_model_script = os.path.join(default_model_dir, code_sub_dir, "resnet18.py")
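The diff above drops the requirements keyword from three file_utils.make_tarfile calls. The helper itself is not part of this commit; a minimal sketch consistent with the call sites, where the filename default and internal archive layout are assumptions:

import os
import tarfile


def make_tarfile(script, model, output_dir, filename="model.tar.gz", script_path=None):
    """Bundle an entry-point script and a serialized model into a tarball,
    the way the call sites above use it. Sketch only; the real helper is
    the repository's file_utils.make_tarfile."""
    output_path = os.path.join(output_dir, filename)
    with tarfile.open(output_path, "w:gz") as tar:
        if script_path:
            # Place the script under e.g. code/ inside the archive.
            tar.add(script, arcname=os.path.join(script_path, os.path.basename(script)))
        else:
            tar.add(script, arcname=os.path.basename(script))
        tar.add(model, arcname=os.path.basename(model))
    return output_path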
New file (path not captured in this view): Lines changed: 75 additions & 0 deletions

@@ -0,0 +1,75 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os
+import subprocess
+import sys
+import time
+
+import pytest
+import requests
+import torch
+
+from integration import model_gpu_context_dir
+
+BASE_URL = "http://0.0.0.0:8080/"
+PING_URL = BASE_URL + "ping"
+
+
+@pytest.fixture(scope="module", autouse=True)
+def container(image_uri):
+    try:
+        if 'cpu' in image_uri:
+            pytest.skip("Skipping because tests running on CPU instance")
+
+        command = (
+            "docker run --gpus=all "
+            "--name sagemaker-pytorch-inference-toolkit-context-test "
+            "-v {}:/opt/ml/model "
+            "{} serve"
+        ).format(model_gpu_context_dir, image_uri)
+
+        proc = subprocess.Popen(command.split(), stdout=sys.stdout, stderr=subprocess.STDOUT)
+
+        attempts = 0
+        while attempts < 10:
+            time.sleep(3)
+            try:
+                requests.get(PING_URL)
+                break
+            except Exception:
+                attempts += 1
+                pass
+        yield proc.pid
+
+    finally:
+        if 'cpu' in image_uri:
+            pytest.skip("Skipping because tests running on CPU instance")
+        subprocess.check_call("docker rm -f sagemaker-pytorch-inference-toolkit-context-test".split())
+
+
+def test_context_all_device_ids():
+    gpu_count = torch.cuda.device_count()
+
+    gpu_ids_expected = [i for i in range(gpu_count)]
+    gpu_ids_actual = []
+
+    with open(os.path.join(model_gpu_context_dir, 'code', 'device_info.txt'), 'r') as f:
+        for line in f:
+            gpu_ids_actual.append(int(line))
+
+    gpu_ids_actual = list(set(gpu_ids_actual))
+    gpu_ids_actual.sort()
+
+    assert gpu_ids_actual == gpu_ids_expected
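The container fixture above polls the ping endpoint up to ten times, three seconds apart, before yielding to the test. The same readiness check factored into a standalone helper might look like this; the function name and boolean return convention are illustrative, not part of the commit:

import time

import requests


def wait_for_ping(url, attempts=10, delay=3):
    """Poll a health-check URL until it answers or attempts run out."""
    for _ in range(attempts):
        time.sleep(delay)
        try:
            requests.get(url)
            return True  # endpoint answered; server is up
        except Exception:
            continue  # server not accepting connections yet
    return False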

test/integration/local/test_mme_local.py renamed to test/integration/local/test_multi_model_endpoint_local.py

Lines changed: 1 addition & 1 deletion

@@ -38,7 +38,7 @@ def container(image_uri, use_gpu):
     traced_resnet18_path = os.path.join(mme_path, 'traced_resnet18')
 
     command = (
-        "docker run -it --rm {} "
+        "docker run {} "
         "--name sagemaker-pytorch-inference-toolkit-mme-test "
         "-p 8080:8080 "
         "-v {}:/resnet18 "

test/resources/mnist/model_cpu/1d/code/mnist_1d.py

Lines changed: 1 addition & 7 deletions

@@ -11,12 +11,6 @@
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
-
-try:
-    import transformers
-except ImportError:
-    raise ImportError("The 'transformers' module was not found.'")
-
 import os
 import torch
 import torch.nn as nn
@@ -50,4 +44,4 @@ def model_fn(model_dir):
     model = torch.nn.DataParallel(Net())
     with open(os.path.join(model_dir, 'torch_model.pth'), 'rb') as f:
         model.load_state_dict(torch.load(f))
-    return model
+    return model

test/resources/mnist/model_cpu/1d/code/requirements.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

test/resources/mnist/model_cpu/code/call_model_fn_once.py

Lines changed: 2 additions & 6 deletions

@@ -14,11 +14,7 @@
 
 import os
 
-try:
-    import transformers
-except ImportError:
-    raise ImportError("The 'transformers' module was not found.'")
-
+
 def model_fn(model_dir):
     lock_file = os.path.join(model_dir, 'model_fn.lock.{}'.format(os.getpid()))
     if os.path.exists(lock_file):
@@ -38,4 +34,4 @@ def predict_fn(data, model):
 
 
 def output_fn(prediction, accept):
-    return prediction
+    return prediction

test/resources/mnist/model_gpu/code/mnist.py

Lines changed: 1 addition & 6 deletions

@@ -16,11 +16,6 @@
 import os
 import sys
 
-try:
-    import transformers
-except ImportError:
-    raise ImportError("The 'transformers' module was not found.'")
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -68,4 +63,4 @@ def model_fn(model_dir):
     # Move the model to the GPU
     device = torch.device("cuda")
     model = model.to(device)
-    return model
+    return model

test/resources/mnist/model_gpu/code/requirements.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

New file (path not captured in this view): Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os
+import torch
+
+def model_fn(model_dir, context):
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    file_path = os.path.join(script_dir, "device_info.txt")
+
+    device = torch.device("cuda:" + str(context.system_properties.get("gpu_id")))
+    device_str = str(device)[-1]
+    with open(file_path, "a") as file:
+        file.write(device_str + "\n")
+
+    return 'model'
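The model_fn above receives the serving context and records the gpu_id assigned to its worker; the test then checks that every device ID on the host shows up in device_info.txt. A minimal stand-in for exercising this handler outside the container, where FakeContext is a hypothetical test double rather than a toolkit class:

# Hypothetical test double for the serving context (not a toolkit class).
class FakeContext:
    def __init__(self, gpu_id):
        self.system_properties = {"gpu_id": gpu_id}


# Assuming the model_fn above is importable, each call mimics one model
# server worker loading the model with its assigned GPU ID, appending
# that ID to device_info.txt next to the handler script.
for gpu_id in range(2):
    model_fn("/opt/ml/model", FakeContext(gpu_id))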
