aws
diff --git a/‎docker/build_artifacts/sagemaker/python_service.py
Lines changed: 18 additions & 4 deletions b/‎docker/build_artifacts/sagemaker/python_service.py
Lines changed: 18 additions & 4 deletions
diff --git a/‎docker/build_artifacts/sagemaker/serve.py
Lines changed: 24 additions & 3 deletions b/‎docker/build_artifacts/sagemaker/serve.py
Lines changed: 24 additions & 3 deletions
diff --git a/‎test/integration/local/test_pre_post_processing_mme.py renamed to ‎test/integration/local/test_pre_post_processing_mme_multiple_inference.py
Lines changed: 23 additions & 26 deletions b/‎test/integration/local/test_pre_post_processing_mme.py renamed to ‎test/integration/local/test_pre_post_processing_mme_multiple_inference.py
Lines changed: 23 additions & 26 deletions
diff --git a/‎test/integration/local/test_pre_post_processing_mme_universal_inference.py
Lines changed: 158 additions & 0 deletions b/‎test/integration/local/test_pre_post_processing_mme_universal_inference.py
Lines changed: 158 additions & 0 deletions
diff --git a/‎test/resources/mme_universal_script/half_plus_two/model/half_plus_two/00000123/saved_model.pb
9.12 KB b/‎test/resources/mme_universal_script/half_plus_two/model/half_plus_two/00000123/saved_model.pb
9.12 KB
diff --git a/‎test/resources/mme_universal_script/half_plus_two/model/half_plus_two/00000123/variables/variables.data-00000-of-00001
12 Bytes b/‎test/resources/mme_universal_script/half_plus_two/model/half_plus_two/00000123/variables/variables.data-00000-of-00001
12 Bytes
diff --git a/‎test/resources/mme_universal_script/half_plus_two/model/half_plus_two/00000123/variables/variables.index
151 Bytes b/‎test/resources/mme_universal_script/half_plus_two/model/half_plus_two/00000123/variables/variables.index
151 Bytes
@@ -17,6 +17,7 @@
 import os
 import subprocess
 import grpc
+import sys
 
 import falcon
 import requests
@@ -143,6 +144,7 @@ def _handle_load_model_post(self, res, data):  # noqa: C901
         # validate model files are in the specified base_path
         if self.validate_model_dir(base_path):
             try:
+                self._import_custom_modules(model_name)
                 tfs_config = tfs_utils.create_tfs_config_individual_model(model_name, base_path)
                 tfs_config_file = "/sagemaker/tfs-config/{}/model-config.cfg".format(model_name)
                 log.info("tensorflow serving model config: \n%s\n", tfs_config)
@@ -221,6 +223,17 @@ def _handle_load_model_post(self, res, data):  # noqa: C901
                 }
             )
 
+    def _import_custom_modules(self, model_name):
+        """Register the optional Python code bundled with a model.
+
+        When the model has a ``code/lib`` directory, that directory is added
+        to ``sys.path``. When it has a ``code/inference.py`` script, the
+        handlers returned by ``_import_handlers`` for that script are combined
+        by ``_make_handler`` and stored in ``self.model_handlers`` under
+        ``model_name``.
+
+        Args:
+            model_name: name of the model; used to build the paths under
+                ``/opt/ml/models``.
+        """
+        inference_script_path = "/opt/ml/models/{}/model/code/inference.py".format(model_name)
+        python_lib_path = "/opt/ml/models/{}/model/code/lib".format(model_name)
+        # Skip paths that are already registered so that loading the same
+        # model again does not pile up duplicate ``sys.path`` entries.
+        if os.path.exists(python_lib_path) and python_lib_path not in sys.path:
+            log.info("add Python code library path")
+            sys.path.append(python_lib_path)
+        if os.path.exists(inference_script_path):
+            handler, input_handler, output_handler = self._import_handlers(inference_script_path)
+            model_handlers = self._make_handler(handler, input_handler, output_handler)
+            self.model_handlers[model_name] = model_handlers
+
     def _cleanup_config_file(self, config_file):
         """Delete ``config_file`` if it exists; otherwise do nothing."""
         if not os.path.exists(config_file):
             return
         os.remove(config_file)
@@ -264,8 +277,10 @@ def _handle_invocation_post(self, req, res, model_name=None):
 
         try:
             res.status = falcon.HTTP_200
-
-            res.body, res.content_type = self._handlers(data, context)
+            handlers = self._handlers
+            if SAGEMAKER_MULTI_MODEL_ENABLED and model_name in self.model_handlers:
+                handlers = self.model_handlers[model_name]
+            res.body, res.content_type = handlers(data, context)
         except Exception as e:  # pylint: disable=broad-except
             log.exception("exception handling request: {}".format(e))
             res.status = falcon.HTTP_500
@@ -276,8 +291,7 @@ def _setup_channel(self, grpc_port):
             log.info("Creating grpc channel for port: %s", grpc_port)
             self._channels[grpc_port] = grpc.insecure_channel("localhost:{}".format(grpc_port))
 
-    def _import_handlers(self):
-        inference_script = INFERENCE_SCRIPT_PATH
+    def _import_handlers(self, inference_script=INFERENCE_SCRIPT_PATH):
         spec = importlib.util.spec_from_file_location("inference", inference_script)
         inference = importlib.util.module_from_spec(spec)
         spec.loader.exec_module(inference)
 
@@ -134,7 +134,8 @@ def __init__(self):
         os.environ["TFS_REST_PORTS"] = self._tfs_rest_concat_ports
 
     def _need_python_service(self):
-        if os.path.exists(INFERENCE_PATH):
+        if (os.path.exists(INFERENCE_PATH) or os.path.exists(REQUIREMENTS_PATH)
+                or os.path.exists(PYTHON_LIB_PATH)):
             self._enable_python_service = True
         if os.environ.get("SAGEMAKER_MULTI_MODEL_UNIVERSAL_BUCKET") and os.environ.get(
             "SAGEMAKER_MULTI_MODEL_UNIVERSAL_PREFIX"
@@ -308,6 +309,14 @@ def _enable_per_process_gpu_memory_fraction(self):
 
         return False
 
+    def _get_number_of_gpu_on_host(self):
+        """Return how many GPUs ``nvidia-smi -L`` lists on this host.
+
+        Returns:
+            int: the number of non-empty lines printed by ``nvidia-smi -L``,
+            or 0 when ``/usr/bin/nvidia-smi`` does not exist.
+
+        Raises:
+            subprocess.CalledProcessError: if ``nvidia-smi -L`` exits with a
+                non-zero status.
+        """
+        if not os.path.exists("/usr/bin/nvidia-smi"):
+            return 0
+
+        listing = subprocess.check_output(['nvidia-smi', '-L']).decode('utf-8')
+        # ``"".split("\n")`` is ``[""]``, so counting split parts would report
+        # one GPU for empty output; count only lines that carry text.
+        return len([line for line in listing.splitlines() if line.strip()])
+
     def _calculate_per_process_gpu_memory_fraction(self):
         """Return ``(1 - margin) / instance_count`` rounded to four decimals.
         Reads ``self._tfs_gpu_margin`` and ``self._tfs_instance_count``.
         """
         usable_fraction = 1 - self._tfs_gpu_margin
         per_instance = usable_fraction / float(self._tfs_instance_count)
         return round(per_instance, 4)
 
@@ -420,8 +429,20 @@ def _start_single_tfs(self, instance_id):
             tfs_gpu_memory_fraction=self._calculate_per_process_gpu_memory_fraction(),
         )
         log.info("tensorflow serving command: {}".format(cmd))
-        p = subprocess.Popen(cmd.split())
-        log.info("started tensorflow serving (pid: %d)", p.pid)
+
+        num_gpus = self._get_number_of_gpu_on_host()
+        if num_gpus > 1:
+            # utilizing multi-gpu
+            worker_env = os.environ.copy()
+            worker_env["CUDA_VISIBLE_DEVICES"] = str(instance_id % num_gpus)
+            p = subprocess.Popen(cmd.split(), env=worker_env)
+            log.info("started tensorflow serving (pid: {}) on GPU {}"
+                     .format(p.pid, instance_id % num_gpus))
+        else:
+            # cpu and single gpu
+            p = subprocess.Popen(cmd.split())
+            log.info("started tensorflow serving (pid: {})".format(p.pid))
+
         return p
 
     def _monitor(self):
 
@@ -30,30 +30,42 @@
 MODEL_NAME = "half_plus_three"
 
 
-@pytest.fixture(scope="session", autouse=True)
-def volume():
+@pytest.fixture(scope="module", autouse=True)
+def volume(tmpdir_factory, request):
+    """Create a Docker volume backed by a temporary copy of the test model.
+
+    Copies ``test/resources/models`` into a fresh temporary ``model``
+    directory and ``test/resources/examples/test1`` into its ``code``
+    sub-directory, binds a Docker volume to that directory, yields the volume
+    name, and removes the volume afterwards.
+    """
+    # Bound before the ``try`` so the ``finally`` clause can always refer to
+    # it, even when preparing the directory fails.
+    volume_name = "model_volume_1"
     try:
-        model_dir = os.path.abspath("test/resources/mme_universal_script")
+        print("tmpdir_factory: "+str(tmpdir_factory))
+        model_dir = os.path.join(tmpdir_factory.mktemp("test"), "model")
+        code_dir = os.path.join(model_dir, "code")
+        print("model_dir: "+model_dir)
+        print("code_dir: " + code_dir)
+        test_example = "test/resources/examples/test1"
+
+        model_src_dir = "test/resources/models"
+        shutil.copytree(model_src_dir, model_dir)
+        shutil.copytree(test_example, code_dir)
+
         subprocess.check_call(
-            "docker volume create --name model_volume_mme --opt type=none "
-            "--opt device={} --opt o=bind".format(model_dir).split())
-        yield model_dir
+            "docker volume create --name {} --opt type=none "
+            "--opt device={} --opt o=bind".format(volume_name, model_dir).split())
+        yield volume_name
     finally:
-        subprocess.check_call("docker volume rm model_volume_mme".split())
+        subprocess.check_call(f"docker volume rm {volume_name}".split())
 
 
 @pytest.fixture(scope="module", autouse=True)
-def container(docker_base_name, tag, runtime_config):
+def container(volume, docker_base_name, tag, runtime_config):
     try:
         command = (
             "docker run {}--name sagemaker-tensorflow-serving-test -p 8080:8080"
-            " --mount type=volume,source=model_volume_mme,target=/opt/ml/models,readonly"
+            " --mount type=volume,source={},target=/opt/ml/models/half_plus_three/model,readonly"
             " -e SAGEMAKER_TFS_NGINX_LOGLEVEL=info"
             " -e SAGEMAKER_BIND_TO_PORT=8080"
             " -e SAGEMAKER_SAFE_PORT_RANGE=9000-9999"
             " -e SAGEMAKER_MULTI_MODEL=True"
             " {}:{} serve"
-        ).format(runtime_config, docker_base_name, tag)
+        ).format(runtime_config, volume, docker_base_name, tag)
 
         proc = subprocess.Popen(command.split(), stdout=sys.stdout, stderr=subprocess.STDOUT)
 
@@ -81,51 +93,38 @@ def model():
     }
     make_load_model_request(json.dumps(model_data))
     return MODEL_NAME
-
-
 @pytest.mark.skip_gpu
 def test_ping_service():
     """The ping endpoint answers with HTTP 200."""
     ping = requests.get(PING_URL)
     assert ping.status_code == 200
-
-
 @pytest.mark.skip_gpu
 def test_predict_json(model):
     """A JSON request to the loaded model returns the expected predictions."""
     json_headers = make_headers()
     payload = "{\"instances\": [1.0, 2.0, 5.0]}"
     url = INVOCATION_URL.format(model)
     reply = requests.post(url, data=payload, headers=json_headers)
     assert reply.json() == {"predictions": [3.5, 4.0, 5.5]}
-
-
 @pytest.mark.skip_gpu
 def test_zero_content():
     """An empty request body yields HTTP 500 with a 'document is empty' message."""
     default_headers = make_headers()
     empty_body = ""
     url = INVOCATION_URL.format(MODEL_NAME)
     reply = requests.post(url, data=empty_body, headers=default_headers)
     assert reply.status_code == 500
     assert "document is empty" in reply.text
-
-
 @pytest.mark.skip_gpu
 def test_large_input():
     """Posting the large CSV fixture returns 753936 predictions."""
     csv_path = "test/resources/inputs/test-large.csv"
-
     with open(csv_path, "r") as csv_file:
         payload = csv_file.read()
         csv_headers = make_headers(content_type="text/csv")
         url = INVOCATION_URL.format(MODEL_NAME)
         reply = requests.post(url, data=payload, headers=csv_headers).json()
         assert len(reply["predictions"]) == 753936
-
-
 @pytest.mark.skip_gpu
 def test_csv_input():
     """A CSV request returns the expected predictions."""
     csv_headers = make_headers(content_type="text/csv")
     payload = "1.0,2.0,5.0"
     url = INVOCATION_URL.format(MODEL_NAME)
     reply = requests.post(url, data=payload, headers=csv_headers)
     assert reply.json() == {"predictions": [3.5, 4.0, 5.5]}
-
-
 @pytest.mark.skip_gpu
 def test_specific_versions():
     for version in ("123", "124"):
@@ -135,12 +134,10 @@ def test_specific_versions():
             INVOCATION_URL.format(MODEL_NAME), data=data, headers=headers
         ).json()
         assert response == {"predictions": [3.5, 4.0, 5.5]}
-
-
 @pytest.mark.skip_gpu
 def test_unsupported_content_type():
     """An unknown content type yields HTTP 500 with a matching message."""
     bad_headers = make_headers("unsupported-type", "predict")
     payload = "aW1hZ2UgYnl0ZXM="
     url = INVOCATION_URL.format(MODEL_NAME)
     response = requests.post(url, data=payload, headers=bad_headers)
     assert response.status_code == 500
-    assert "unsupported content type" in response.text
+    assert "unsupported content type" in response.text
@@ -0,0 +1,158 @@
+# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+
+import json
+import os
+import shutil
+import subprocess
+import sys
+import time
+
+import pytest
+
+import requests
+
+from multi_model_endpoint_test_utils import make_load_model_request, make_headers
+
+
+PING_URL = "http://localhost:8080/ping"
+INVOCATION_URL = "http://localhost:8080/models/{}/invoke"
+MODEL_NAMES = ["half_plus_three","half_plus_two"]
+
+
+@pytest.fixture(scope="session", autouse=True)
+def volume():
+    """Bind a Docker volume to the universal-script model directory.
+
+    Creates the ``model_volume_mme`` volume on top of
+    ``test/resources/mme_universal_script``, yields the absolute directory
+    path, and removes the volume afterwards.
+    """
+    create_template = (
+        "docker volume create --name model_volume_mme --opt type=none "
+        "--opt device={} --opt o=bind"
+    )
+    remove_command = "docker volume rm model_volume_mme"
+    try:
+        model_dir = os.path.abspath("test/resources/mme_universal_script")
+        subprocess.check_call(create_template.format(model_dir).split())
+        yield model_dir
+    finally:
+        subprocess.check_call(remove_command.split())
+
+
+@pytest.fixture(scope="module", autouse=True)
+def container(docker_base_name, tag, runtime_config):
+    """Run the serving container in multi-model mode for this module's tests.
+
+    Starts ``docker run`` with the ``model_volume_mme`` volume mounted at
+    ``/opt/ml/models``, polls the ping endpoint every 3 seconds until it
+    answers with HTTP 200 or 40 attempts have been made, yields the pid of
+    the ``docker run`` process, and finally force-removes the container.
+    """
+    try:
+        command = (
+            "docker run {}--name sagemaker-tensorflow-serving-test -p 8080:8080"
+            " --mount type=volume,source=model_volume_mme,target=/opt/ml/models,readonly"
+            " -e SAGEMAKER_TFS_NGINX_LOGLEVEL=info"
+            " -e SAGEMAKER_BIND_TO_PORT=8080"
+            " -e SAGEMAKER_SAFE_PORT_RANGE=9000-9999"
+            " -e SAGEMAKER_MULTI_MODEL=True"
+            " {}:{} serve"
+        ).format(runtime_config, docker_base_name, tag)
+
+        proc = subprocess.Popen(command.split(), stdout=sys.stdout, stderr=subprocess.STDOUT)
+
+        # Every poll counts as an attempt, so a server that keeps answering
+        # with a non-200 status cannot make this loop spin forever.
+        for _ in range(40):
+            time.sleep(3)
+            try:
+                if requests.get("http://localhost:8080/ping").status_code == 200:
+                    break
+            except requests.exceptions.RequestException:
+                # The container is not accepting connections yet; poll again.
+                continue
+
+        yield proc.pid
+    finally:
+        subprocess.check_call("docker rm -f sagemaker-tensorflow-serving-test".split())
+
+
+@pytest.fixture
+def models():
+    """Send a load-model request for every name in ``MODEL_NAMES`` and return the list."""
+    for name in MODEL_NAMES:
+        url = "/opt/ml/models/{}/model/{}".format(name, name)
+        payload = json.dumps({"model_name": name, "url": url})
+        make_load_model_request(payload)
+    return MODEL_NAMES
+
+
+@pytest.mark.skip_gpu
+def test_ping_service():
+    """The ping endpoint answers with HTTP 200."""
+    ping = requests.get(PING_URL)
+    assert ping.status_code == 200
+
+
+@pytest.mark.skip_gpu
+def test_predict_json(models):
+    """Each loaded model answers a JSON request with its own predictions."""
+    json_headers = make_headers()
+    payload = "{\"instances\": [1.0, 2.0, 5.0]}"
+    results = [
+        requests.post(INVOCATION_URL.format(name), data=payload, headers=json_headers).json()
+        for name in models
+    ]
+    assert results[0] == {"predictions": [3.5, 4.0, 5.5]}
+    assert results[1] == {"predictions": [2.5, 3.0, 4.5]}
+
+
+@pytest.mark.skip_gpu
+def test_zero_content():
+    """An empty request body yields HTTP 500 with a 'document is empty' message for every model."""
+    default_headers = make_headers()
+    empty_body = ""
+    for name in MODEL_NAMES:
+        url = INVOCATION_URL.format(name)
+        reply = requests.post(url, data=empty_body, headers=default_headers)
+        assert reply.status_code == 500
+        assert "document is empty" in reply.text
+
+
+@pytest.mark.skip_gpu
+def test_large_input():
+    """Posting the large CSV fixture returns 753936 predictions from every model."""
+    csv_path = "test/resources/inputs/test-large.csv"
+
+    with open(csv_path, "r") as csv_file:
+        payload = csv_file.read()
+        csv_headers = make_headers(content_type="text/csv")
+        for name in MODEL_NAMES:
+            url = INVOCATION_URL.format(name)
+            reply = requests.post(url, data=payload, headers=csv_headers).json()
+            assert len(reply["predictions"]) == 753936
+
+
+@pytest.mark.skip_gpu
+def test_csv_input():
+    """Each model answers a CSV request with its own predictions."""
+    csv_headers = make_headers(content_type="text/csv")
+    payload = "1.0,2.0,5.0"
+    results = [
+        requests.post(INVOCATION_URL.format(name), data=payload, headers=csv_headers).json()
+        for name in MODEL_NAMES
+    ]
+    assert results[0] == {"predictions": [3.5, 4.0, 5.5]}
+    assert results[1] == {"predictions": [2.5, 3.0, 4.5]}
+
+@pytest.mark.skip_gpu
+def test_specific_versions():
+    """Versions 123 and 124 of the first model return the same predictions."""
+    target = MODEL_NAMES[0]
+    for version in ("123", "124"):
+        versioned_headers = make_headers(content_type="text/csv", version=version)
+        payload = "1.0,2.0,5.0"
+        url = INVOCATION_URL.format(target)
+        reply = requests.post(url, data=payload, headers=versioned_headers)
+        assert reply.json() == {"predictions": [3.5, 4.0, 5.5]}
+
+
+@pytest.mark.skip_gpu
+def test_unsupported_content_type():
+    """An unknown content type yields HTTP 500 with a matching message for every model."""
+    bad_headers = make_headers("unsupported-type", "predict")
+    payload = "aW1hZ2UgYnl0ZXM="
+    for name in MODEL_NAMES:
+        url = INVOCATION_URL.format(name)
+        reply = requests.post(url, data=payload, headers=bad_headers)
+        assert reply.status_code == 500
+        assert "unsupported content type" in reply.text