
Commit 54db583

Feature: Support multiple inference.py files and a universal inference.py file, along with a universal requirements.txt file
1 parent 1a265db commit 54db583

File tree: 24 files changed, +412 −33 lines

docker/build_artifacts/sagemaker/python_service.py

Lines changed: 18 additions & 4 deletions
@@ -17,6 +17,7 @@
 import os
 import subprocess
 import grpc
+import sys
 
 import falcon
 import requests
@@ -143,6 +144,7 @@ def _handle_load_model_post(self, res, data):  # noqa: C901
         # validate model files are in the specified base_path
         if self.validate_model_dir(base_path):
             try:
+                self._import_custom_modules(model_name)
                 tfs_config = tfs_utils.create_tfs_config_individual_model(model_name, base_path)
                 tfs_config_file = "/sagemaker/tfs-config/{}/model-config.cfg".format(model_name)
                 log.info("tensorflow serving model config: \n%s\n", tfs_config)
@@ -221,6 +223,17 @@ def _handle_load_model_post(self, res, data):  # noqa: C901
                 }
             )
 
+    def _import_custom_modules(self, model_name):
+        inference_script_path = "/opt/ml/models/{}/model/code/inference.py".format(model_name)
+        python_lib_path = "/opt/ml/models/{}/model/code/lib".format(model_name)
+        if os.path.exists(python_lib_path):
+            log.info("add Python code library path")
+            sys.path.append(python_lib_path)
+        if os.path.exists(inference_script_path):
+            handler, input_handler, output_handler = self._import_handlers(inference_script_path)
+            model_handlers = self._make_handler(handler, input_handler, output_handler)
+            self.model_handlers[model_name] = model_handlers
+
     def _cleanup_config_file(self, config_file):
         if os.path.exists(config_file):
             os.remove(config_file)
@@ -264,8 +277,10 @@ def _handle_invocation_post(self, req, res, model_name=None):
 
         try:
             res.status = falcon.HTTP_200
-
-            res.body, res.content_type = self._handlers(data, context)
+            handlers = self._handlers
+            if SAGEMAKER_MULTI_MODEL_ENABLED and model_name in self.model_handlers:
+                handlers = self.model_handlers[model_name]
+            res.body, res.content_type = handlers(data, context)
         except Exception as e:  # pylint: disable=broad-except
             log.exception("exception handling request: {}".format(e))
             res.status = falcon.HTTP_500
@@ -276,8 +291,7 @@ def _setup_channel(self, grpc_port):
         log.info("Creating grpc channel for port: %s", grpc_port)
         self._channels[grpc_port] = grpc.insecure_channel("localhost:{}".format(grpc_port))
 
-    def _import_handlers(self):
-        inference_script = INFERENCE_SCRIPT_PATH
+    def _import_handlers(self, inference_script=INFERENCE_SCRIPT_PATH):
         spec = importlib.util.spec_from_file_location("inference", inference_script)
         inference = importlib.util.module_from_spec(spec)
         spec.loader.exec_module(inference)
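
For reference, the per-model script that _import_custom_modules loads lives at /opt/ml/models/<model_name>/model/code/inference.py. Below is a minimal sketch of such a script, assuming the container's existing pre/post-processing contract (an input_handler/output_handler pair, as implied by _import_handlers and _make_handler above); the body of the handlers is illustrative only and not part of this commit.

# illustrative per-model script: /opt/ml/models/half_plus_two/model/code/inference.py
import json


def input_handler(data, context):
    # turn the incoming request body into the JSON payload TFS expects
    if context.request_content_type == "application/json":
        return data.read().decode("utf-8")
    if context.request_content_type == "text/csv":
        values = [float(v) for v in data.read().decode("utf-8").strip().split(",")]
        return json.dumps({"instances": values})
    raise ValueError("unsupported content type {}".format(context.request_content_type))


def output_handler(response, context):
    # pass the TFS response through unchanged; return (body, content_type)
    if response.status_code != 200:
        raise ValueError(response.content.decode("utf-8"))
    return response.content, "application/json"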

docker/build_artifacts/sagemaker/serve.py

Lines changed: 24 additions & 3 deletions
@@ -134,7 +134,8 @@ def __init__(self):
         os.environ["TFS_REST_PORTS"] = self._tfs_rest_concat_ports
 
     def _need_python_service(self):
-        if os.path.exists(INFERENCE_PATH):
+        if (os.path.exists(INFERENCE_PATH) or os.path.exists(REQUIREMENTS_PATH)
+                or os.path.exists(PYTHON_LIB_PATH)):
             self._enable_python_service = True
         if os.environ.get("SAGEMAKER_MULTI_MODEL_UNIVERSAL_BUCKET") and os.environ.get(
             "SAGEMAKER_MULTI_MODEL_UNIVERSAL_PREFIX"
@@ -308,6 +309,14 @@ def _enable_per_process_gpu_memory_fraction(self):
 
         return False
 
+    def _get_number_of_gpu_on_host(self):
+        nvidia_smi_exist = os.path.exists("/usr/bin/nvidia-smi")
+        if nvidia_smi_exist:
+            return len(subprocess.check_output(['nvidia-smi', '-L'])
+                       .decode('utf-8').strip().split('\n'))
+
+        return 0
+
     def _calculate_per_process_gpu_memory_fraction(self):
         return round((1 - self._tfs_gpu_margin) / float(self._tfs_instance_count), 4)
 
@@ -420,8 +429,20 @@ def _start_single_tfs(self, instance_id):
             tfs_gpu_memory_fraction=self._calculate_per_process_gpu_memory_fraction(),
         )
         log.info("tensorflow serving command: {}".format(cmd))
-        p = subprocess.Popen(cmd.split())
-        log.info("started tensorflow serving (pid: %d)", p.pid)
+
+        num_gpus = self._get_number_of_gpu_on_host()
+        if num_gpus > 1:
+            # utilizing multi-gpu
+            worker_env = os.environ.copy()
+            worker_env["CUDA_VISIBLE_DEVICES"] = str(instance_id % num_gpus)
+            p = subprocess.Popen(cmd.split(), env=worker_env)
+            log.info("started tensorflow serving (pid: {}) on GPU {}"
+                     .format(p.pid, instance_id % num_gpus))
+        else:
+            # cpu and single gpu
+            p = subprocess.Popen(cmd.split())
+            log.info("started tensorflow serving (pid: {})".format(p.pid))
+
         return p
 
     def _monitor(self):
Lines changed: 158 additions & 0 deletions
@@ -0,0 +1,158 @@ (new file; every line below is an addition)
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import json
import os
import shutil
import subprocess
import sys
import time

import pytest

import requests

from multi_model_endpoint_test_utils import make_load_model_request, make_headers


PING_URL = "http://localhost:8080/ping"
INVOCATION_URL = "http://localhost:8080/models/{}/invoke"
MODEL_NAMES = ["half_plus_three","half_plus_two"]


@pytest.fixture(scope="session", autouse=True)
def volume():
    try:
        model_dir = os.path.abspath("test/resources/mme_multiple_inference_scripts")
        subprocess.check_call(
            "docker volume create --name model_volume_mme_multiple_inference_scripts --opt type=none "
            "--opt device={} --opt o=bind".format(model_dir).split())
        yield model_dir
    finally:
        subprocess.check_call("docker volume rm model_volume_mme_multiple_inference_scripts".split())


@pytest.fixture(scope="module", autouse=True)
def container(docker_base_name, tag, runtime_config):
    try:
        command = (
            "docker run {}--name sagemaker-tensorflow-serving-test -p 8080:8080"
            " --mount type=volume,source=model_volume_mme_multiple_inference_scripts,target=/opt/ml/models,readonly"
            " -e SAGEMAKER_TFS_NGINX_LOGLEVEL=info"
            " -e SAGEMAKER_BIND_TO_PORT=8080"
            " -e SAGEMAKER_SAFE_PORT_RANGE=9000-9999"
            " -e SAGEMAKER_MULTI_MODEL=True"
            " {}:{} serve"
        ).format(runtime_config, docker_base_name, tag)

        proc = subprocess.Popen(command.split(), stdout=sys.stdout, stderr=subprocess.STDOUT)

        attempts = 0
        while attempts < 40:
            time.sleep(3)
            try:
                res_code = requests.get("http://localhost:8080/ping").status_code
                if res_code == 200:
                    break
            except:
                attempts += 1
                pass

        yield proc.pid
    finally:
        subprocess.check_call("docker rm -f sagemaker-tensorflow-serving-test".split())


@pytest.fixture
def models():
    for MODEL_NAME in MODEL_NAMES:
        model_data = {
            "model_name": MODEL_NAME,
            "url": "/opt/ml/models/{}/model/{}".format(MODEL_NAME,MODEL_NAME)
        }
        make_load_model_request(json.dumps(model_data))
    return MODEL_NAMES


@pytest.mark.skip_gpu
def test_ping_service():
    response = requests.get(PING_URL)
    assert 200 == response.status_code


@pytest.mark.skip_gpu
def test_predict_json(models):
    headers = make_headers()
    data = "{\"instances\": [1.0, 2.0, 5.0]}"
    responses = []
    for model in models:
        response = requests.post(INVOCATION_URL.format(model), data=data, headers=headers).json()
        responses.append(response)
    assert responses[0] == {"predictions": [3.5, 4.0, 5.5]}
    assert responses[1] == {"predictions": [2.5, 3.0, 4.5]}


@pytest.mark.skip_gpu
def test_zero_content():
    headers = make_headers()
    x = ""
    for MODEL_NAME in MODEL_NAMES:
        response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=x, headers=headers)
        assert 500 == response.status_code
        assert "document is empty" in response.text


@pytest.mark.skip_gpu
def test_large_input():
    data_file = "test/resources/inputs/test-large.csv"

    with open(data_file, "r") as file:
        x = file.read()
        headers = make_headers(content_type="text/csv")
        for MODEL_NAME in MODEL_NAMES:
            response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=x, headers=headers).json()
            predictions = response["predictions"]
            assert len(predictions) == 753936


@pytest.mark.skip_gpu
def test_csv_input():
    headers = make_headers(content_type="text/csv")
    data = "1.0,2.0,5.0"
    responses = []
    for MODEL_NAME in MODEL_NAMES:
        response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=data, headers=headers).json()
        responses.append(response)
    assert responses[0] == {"predictions": [3.5, 4.0, 5.5]}
    assert responses[1] == {"predictions": [2.5, 3.0, 4.5]}

@pytest.mark.skip_gpu
def test_specific_versions():
    MODEL_NAME = MODEL_NAMES[0]
    for version in ("123", "124"):
        headers = make_headers(content_type="text/csv", version=version)
        data = "1.0,2.0,5.0"
        response = requests.post(
            INVOCATION_URL.format(MODEL_NAME), data=data, headers=headers
        ).json()
        assert response == {"predictions": [3.5, 4.0, 5.5]}


@pytest.mark.skip_gpu
def test_unsupported_content_type():
    headers = make_headers("unsupported-type", "predict")
    data = "aW1hZ2UgYnl0ZXM="
    for MODEL_NAME in MODEL_NAMES:
        response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=data, headers=headers)
        assert 500 == response.status_code
        assert "unsupported content type" in response.text

test/integration/local/test_pre_post_processing_mme.py renamed to test/integration/local/test_pre_post_processing_mme_universal_inference.py

Lines changed: 38 additions & 26 deletions
@@ -27,27 +27,27 @@
 
 PING_URL = "http://localhost:8080/ping"
 INVOCATION_URL = "http://localhost:8080/models/{}/invoke"
-MODEL_NAME = "half_plus_three"
+MODEL_NAMES = ["half_plus_three","half_plus_two"]
 
 
 @pytest.fixture(scope="session", autouse=True)
 def volume():
     try:
         model_dir = os.path.abspath("test/resources/mme_universal_script")
         subprocess.check_call(
-            "docker volume create --name model_volume_mme --opt type=none "
+            "docker volume create --name model_volume_mme_universal_script --opt type=none "
             "--opt device={} --opt o=bind".format(model_dir).split())
         yield model_dir
     finally:
-        subprocess.check_call("docker volume rm model_volume_mme".split())
+        subprocess.check_call("docker volume rm model_volume_mme_universal_script".split())
 
 
 @pytest.fixture(scope="module", autouse=True)
 def container(docker_base_name, tag, runtime_config):
     try:
         command = (
             "docker run {}--name sagemaker-tensorflow-serving-test -p 8080:8080"
-            " --mount type=volume,source=model_volume_mme,target=/opt/ml/models,readonly"
+            " --mount type=volume,source=model_volume_mme_universal_script,target=/opt/ml/models,readonly"
             " -e SAGEMAKER_TFS_NGINX_LOGLEVEL=info"
             " -e SAGEMAKER_BIND_TO_PORT=8080"
             " -e SAGEMAKER_SAFE_PORT_RANGE=9000-9999"
@@ -74,13 +74,14 @@ def container(docker_base_name, tag, runtime_config):
 
 
 @pytest.fixture
-def model():
-    model_data = {
-        "model_name": MODEL_NAME,
-        "url": "/opt/ml/models/half_plus_three/model/half_plus_three"
-    }
-    make_load_model_request(json.dumps(model_data))
-    return MODEL_NAME
+def models():
+    for MODEL_NAME in MODEL_NAMES:
+        model_data = {
+            "model_name": MODEL_NAME,
+            "url": "/opt/ml/models/{}/model/{}".format(MODEL_NAME,MODEL_NAME)
+        }
+        make_load_model_request(json.dumps(model_data))
+    return MODEL_NAMES
 
 
 @pytest.mark.skip_gpu
@@ -90,20 +91,25 @@ def test_ping_service():
 
 
 @pytest.mark.skip_gpu
-def test_predict_json(model):
+def test_predict_json(models):
     headers = make_headers()
     data = "{\"instances\": [1.0, 2.0, 5.0]}"
-    response = requests.post(INVOCATION_URL.format(model), data=data, headers=headers).json()
-    assert response == {"predictions": [3.5, 4.0, 5.5]}
+    responses = []
+    for model in models:
+        response = requests.post(INVOCATION_URL.format(model), data=data, headers=headers).json()
+        responses.append(response)
+    assert responses[0] == {"predictions": [3.5, 4.0, 5.5]}
+    assert responses[1] == {"predictions": [2.5, 3.0, 4.5]}
 
 
 @pytest.mark.skip_gpu
 def test_zero_content():
     headers = make_headers()
     x = ""
-    response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=x, headers=headers)
-    assert 500 == response.status_code
-    assert "document is empty" in response.text
+    for MODEL_NAME in MODEL_NAMES:
+        response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=x, headers=headers)
+        assert 500 == response.status_code
+        assert "document is empty" in response.text
 
 
 @pytest.mark.skip_gpu
@@ -113,21 +119,26 @@ def test_large_input():
     with open(data_file, "r") as file:
         x = file.read()
         headers = make_headers(content_type="text/csv")
-        response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=x, headers=headers).json()
-        predictions = response["predictions"]
-        assert len(predictions) == 753936
+        for MODEL_NAME in MODEL_NAMES:
+            response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=x, headers=headers).json()
+            predictions = response["predictions"]
+            assert len(predictions) == 753936
 
 
 @pytest.mark.skip_gpu
 def test_csv_input():
     headers = make_headers(content_type="text/csv")
     data = "1.0,2.0,5.0"
-    response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=data, headers=headers).json()
-    assert response == {"predictions": [3.5, 4.0, 5.5]}
-
+    responses = []
+    for MODEL_NAME in MODEL_NAMES:
+        response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=data, headers=headers).json()
+        responses.append(response)
+    assert responses[0] == {"predictions": [3.5, 4.0, 5.5]}
+    assert responses[1] == {"predictions": [2.5, 3.0, 4.5]}
 
 @pytest.mark.skip_gpu
 def test_specific_versions():
+    MODEL_NAME = MODEL_NAMES[0]
     for version in ("123", "124"):
         headers = make_headers(content_type="text/csv", version=version)
         data = "1.0,2.0,5.0"
@@ -141,6 +152,7 @@ def test_specific_versions():
 def test_unsupported_content_type():
     headers = make_headers("unsupported-type", "predict")
     data = "aW1hZ2UgYnl0ZXM="
-    response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=data, headers=headers)
-    assert 500 == response.status_code
-    assert "unsupported content type" in response.text
+    for MODEL_NAME in MODEL_NAMES:
+        response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=data, headers=headers)
+        assert 500 == response.status_code
+        assert "unsupported content type" in response.text
