
Commit 25f594d

Add local test to check that all GPU IDs are returned on multi-GPU host
1 parent f900670 commit 25f594d

12 files changed: +112 -32 lines


buildspec.yml

Lines changed: 1 addition & 1 deletion (the removed and added lines are identical in this view, so the change is whitespace-only)

@@ -110,4 +110,4 @@ phases:
           aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG;
         done
 
-      # - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_EIA_TAG
+      # - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_EIA_TAG
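The cleanup step above removes tagged images from ECR with the AWS CLI. For reference, the equivalent call through boto3 looks roughly like this; the region, repository name, and tag below are placeholders, not values from this commit:

import boto3

# Delete one tagged image from an ECR repository, mirroring the
# `aws ecr batch-delete-image` CLI call in buildspec.yml above.
ecr = boto3.client("ecr", region_name="us-west-2")  # placeholder region
ecr.batch_delete_image(
    repositoryName="my-ecr-repo",            # placeholder for $ECR_REPO
    imageIds=[{"imageTag": "dlc-eia-tag"}],   # placeholder for $DLC_EIA_TAG
)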

test/integration/__init__.py

Lines changed: 4 additions & 9 deletions

@@ -20,6 +20,7 @@
 mnist_path = os.path.join(resources_path, 'mnist')
 resnet18_path = os.path.join(resources_path, 'resnet18')
 mme_path = os.path.join(resources_path, 'mme')
+model_gpu_context_dir = os.path.join(resources_path, 'model_gpu_context')
 data_dir = os.path.join(mnist_path, 'data')
 training_dir = os.path.join(data_dir, 'training')
 cpu_sub_dir = 'model_cpu'
@@ -44,21 +45,17 @@
 
 model_cpu_1d_dir = os.path.join(model_cpu_dir, '1d')
 mnist_1d_script = os.path.join(model_cpu_1d_dir, code_sub_dir, 'mnist_1d.py')
-mnist_1d_requirements = os.path.join(model_cpu_1d_dir, code_sub_dir, 'requirements.txt')
 model_cpu_1d_tar = file_utils.make_tarfile(mnist_1d_script,
                                            os.path.join(model_cpu_1d_dir, "torch_model.pth"),
                                            model_cpu_1d_dir,
-                                           script_path="code",
-                                           requirements=mnist_1d_requirements)
+                                           script_path="code")
 
 model_gpu_dir = os.path.join(mnist_path, gpu_sub_dir)
 mnist_gpu_script = os.path.join(model_gpu_dir, code_sub_dir, 'mnist.py')
-mnist_gpu_requirements = os.path.join(model_gpu_dir, code_sub_dir, 'requirements.txt')
 model_gpu_tar = file_utils.make_tarfile(mnist_gpu_script,
                                         os.path.join(model_gpu_dir, "torch_model.pth"),
                                         model_gpu_dir,
-                                        script_path="code",
-                                        requirements=mnist_gpu_requirements)
+                                        script_path="code")
 
 model_eia_dir = os.path.join(mnist_path, eia_sub_dir)
 mnist_eia_script = os.path.join(model_eia_dir, 'mnist.py')
@@ -73,13 +70,11 @@
                                              model_inductor_dir)
 
 call_model_fn_once_script = os.path.join(model_cpu_dir, code_sub_dir, 'call_model_fn_once.py')
-call_model_fn_once_requirements = os.path.join(model_cpu_dir, code_sub_dir, 'requirements.txt')
 call_model_fn_once_tar = file_utils.make_tarfile(call_model_fn_once_script,
                                                  os.path.join(model_cpu_dir, "torch_model.pth"),
                                                  model_cpu_dir,
                                                  "model_call_model_fn_once.tar.gz",
-                                                 script_path="code",
-                                                 requirements=call_model_fn_once_requirements)
+                                                 script_path="code")
 
 default_model_dir = os.path.join(resnet18_path, default_sub_dir)
 default_model_script = os.path.join(default_model_dir, code_sub_dir, "resnet18.py")
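The diff above drops the requirements keyword from three file_utils.make_tarfile calls. The helper itself is not part of this commit; a minimal sketch consistent with the call sites, where the filename default and internal archive layout are assumptions:

import os
import tarfile


def make_tarfile(script, model, output_dir, filename="model.tar.gz", script_path=None):
    """Bundle an entry-point script and a serialized model into a tarball,
    the way the call sites above use it. Sketch only; the real helper is
    the repository's file_utils.make_tarfile."""
    output_path = os.path.join(output_dir, filename)
    with tarfile.open(output_path, "w:gz") as tar:
        if script_path:
            # Place the script under e.g. code/ inside the archive.
            tar.add(script, arcname=os.path.join(script_path, os.path.basename(script)))
        else:
            tar.add(script, arcname=os.path.basename(script))
        tar.add(model, arcname=os.path.basename(model))
    return output_path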
New file (path not captured in this view): Lines changed: 75 additions & 0 deletions

@@ -0,0 +1,75 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os
+import subprocess
+import sys
+import time
+
+import pytest
+import requests
+import torch
+
+from integration import model_gpu_context_dir
+
+BASE_URL = "http://0.0.0.0:8080/"
+PING_URL = BASE_URL + "ping"
+
+
+@pytest.fixture(scope="module", autouse=True)
+def container(image_uri):
+    try:
+        if 'cpu' in image_uri:
+            pytest.skip("Skipping because tests running on CPU instance")
+
+        command = (
+            "docker run --gpus=all "
+            "--name sagemaker-pytorch-inference-toolkit-context-test "
+            "-v {}:/opt/ml/model "
+            "{} serve"
+        ).format(model_gpu_context_dir, image_uri)
+
+        proc = subprocess.Popen(command.split(), stdout=sys.stdout, stderr=subprocess.STDOUT)
+
+        attempts = 0
+        while attempts < 10:
+            time.sleep(3)
+            try:
+                requests.get(PING_URL)
+                break
+            except Exception:
+                attempts += 1
+                pass
+        yield proc.pid
+
+    finally:
+        if 'cpu' in image_uri:
+            pytest.skip("Skipping because tests running on CPU instance")
+        subprocess.check_call("docker rm -f sagemaker-pytorch-inference-toolkit-context-test".split())
+
+
+def test_context_all_device_ids():
+    gpu_count = torch.cuda.device_count()
+
+    gpu_ids_expected = [i for i in range(gpu_count)]
+    gpu_ids_actual = []
+
+    with open(os.path.join(model_gpu_context_dir, 'code', 'device_info.txt'), 'r') as f:
+        for line in f:
+            gpu_ids_actual.append(int(line))
+
+    gpu_ids_actual = list(set(gpu_ids_actual))
+    gpu_ids_actual.sort()
+
+    assert gpu_ids_actual == gpu_ids_expected
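The container fixture above polls the ping endpoint up to ten times, three seconds apart, before yielding to the test. The same readiness check factored into a standalone helper might look like this; the function name and boolean return convention are illustrative, not part of the commit:

import time

import requests


def wait_for_ping(url, attempts=10, delay=3):
    """Poll a health-check URL until it answers or attempts run out."""
    for _ in range(attempts):
        time.sleep(delay)
        try:
            requests.get(url)
            return True  # endpoint answered; server is up
        except Exception:
            continue  # server not accepting connections yet
    return False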

test/integration/local/test_mme_local.py renamed to test/integration/local/test_multi_model_endpoint_local.py

Lines changed: 1 addition & 1 deletion

@@ -38,7 +38,7 @@ def container(image_uri, use_gpu):
     traced_resnet18_path = os.path.join(mme_path, 'traced_resnet18')
 
     command = (
-        "docker run -it --rm {} "
+        "docker run {} "
         "--name sagemaker-pytorch-inference-toolkit-mme-test "
         "-p 8080:8080 "
         "-v {}:/resnet18 "

test/resources/mnist/model_cpu/1d/code/mnist_1d.py

Lines changed: 1 addition & 7 deletions

@@ -11,12 +11,6 @@
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
-
-try:
-    import transformers
-except ImportError:
-    raise ImportError("The 'transformers' module was not found.'")
-
 import os
 import torch
 import torch.nn as nn
@@ -50,4 +44,4 @@ def model_fn(model_dir):
     model = torch.nn.DataParallel(Net())
     with open(os.path.join(model_dir, 'torch_model.pth'), 'rb') as f:
         model.load_state_dict(torch.load(f))
-    return model
+    return model

test/resources/mnist/model_cpu/1d/code/requirements.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

test/resources/mnist/model_cpu/code/call_model_fn_once.py

Lines changed: 2 additions & 6 deletions

@@ -14,11 +14,7 @@
 
 import os
 
-try:
-    import transformers
-except ImportError:
-    raise ImportError("The 'transformers' module was not found.'")
-
+
 def model_fn(model_dir):
     lock_file = os.path.join(model_dir, 'model_fn.lock.{}'.format(os.getpid()))
     if os.path.exists(lock_file):
@@ -38,4 +34,4 @@ def predict_fn(data, model):
 
 
 def output_fn(prediction, accept):
-    return prediction
+    return prediction

test/resources/mnist/model_gpu/code/mnist.py

Lines changed: 1 addition & 6 deletions

@@ -16,11 +16,6 @@
 import os
 import sys
 
-try:
-    import transformers
-except ImportError:
-    raise ImportError("The 'transformers' module was not found.'")
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -68,4 +63,4 @@ def model_fn(model_dir):
     # Move the model to the GPU
     device = torch.device("cuda")
     model = model.to(device)
-    return model
+    return model

test/resources/mnist/model_gpu/code/requirements.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

New file (path not captured in this view): Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os
+import torch
+
+def model_fn(model_dir, context):
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    file_path = os.path.join(script_dir, "device_info.txt")
+
+    device = torch.device("cuda:" + str(context.system_properties.get("gpu_id")))
+    device_str = str(device)[-1]
+    with open(file_path, "a") as file:
+        file.write(device_str + "\n")
+
+    return 'model'
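The model_fn above receives the serving context and records the gpu_id assigned to its worker; the test then checks that every device ID on the host shows up in device_info.txt. A minimal stand-in for exercising this handler outside the container, where FakeContext is a hypothetical test double rather than a toolkit class:

# Hypothetical test double for the serving context (not a toolkit class).
class FakeContext:
    def __init__(self, gpu_id):
        self.system_properties = {"gpu_id": gpu_id}


# Assuming the model_fn above is importable, each call mimics one model
# server worker loading the model with its assigned GPU ID, appending
# that ID to device_info.txt next to the handler script.
for gpu_id in range(2):
    model_fn("/opt/ml/model", FakeContext(gpu_id))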
