This repository was archived by the owner on May 23, 2024. It is now read-only.

change: update MME Pre/Post-Processing model and script paths #153

Merged
13 commits merged on Jul 16, 2020
16 changes: 7 additions & 9 deletions README.md
@@ -645,23 +645,21 @@ Only 90% of the ports will be utilized and each loaded model will be allocated with 2 ports.
For example, if the ``SAGEMAKER_SAFE_PORT_RANGE`` is between 9000 and 9999, the maximum number of models that can be loaded to the endpoint at the same time would be 449 ((9999 - 9000) * 0.9 / 2).
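
A rough sketch of that arithmetic, assuming only the formula quoted above (the helper name is made up and is not part of the container):

```python
# Capacity estimate from the README formula: 90% of the safe port range is
# usable, and each loaded model consumes 2 ports.
def max_models_for_port_range(port_range):
    lower, upper = (int(p) for p in port_range.split('-'))
    return int((upper - lower) * 0.9 / 2)


print(max_models_for_port_range('9000-9999'))  # 449
```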

### Using Multi-Model Endpoint with Pre/Post-Processing
Multi-Model Endpoint can be used together with Pre/Post-Processing. However, please note that in Multi-Model mode, the path of ``inference.py`` is ``/opt/ml/models/code`` instead of ``/opt/ml/model/code``.
Also, all loaded models will share the same ``inference.py`` to handle invocation requests. An example of the directory structure of Multi-Model Endpoint and Pre/Post-Processing would look like this:
Multi-Model Endpoint can be used together with Pre/Post-Processing. Each model will need its own ``inference.py``; otherwise, default handlers will be used. An example of the directory structure of a Multi-Model Endpoint with Pre/Post-Processing would look like this:

model1
/opt/ml/models/model1/model
|--[model_version_number]
|--variables
|--saved_model.pb
model2
/opt/ml/models/model2/model
|--[model_version_number]
|--assets
|--variables
|--saved_model.pb
code
|--lib
|--external_module
|--inference.py
|--requirements.txt
code
|--lib
|--external_module
|--inference.py
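
For reference, a minimal ``inference.py`` that could sit in one of the per-model ``code`` directories above might look like the sketch below. The handler signatures follow the pre/post-processing interface described earlier in this README; the pass-through logic is only a placeholder.

```python
def input_handler(data, context):
    """Pre-process a request before it is sent to TensorFlow Serving."""
    if context.request_content_type == 'application/json':
        # placeholder: forward the JSON payload unchanged
        return data.read().decode('utf-8')
    raise ValueError('unsupported content type: {}'.format(
        context.request_content_type))


def output_handler(response, context):
    """Post-process the TensorFlow Serving response before it is returned."""
    if response.status_code != 200:
        raise ValueError(response.content.decode('utf-8'))
    # placeholder: return the raw prediction and its content type
    return response.content, 'application/json'
```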

## Contributing

Expand Down
70 changes: 53 additions & 17 deletions docker/build_artifacts/sagemaker/python_service.py
@@ -17,6 +17,7 @@
import os
import subprocess
import time
import sys

import falcon
import requests
Expand All @@ -27,10 +28,8 @@
import tfs_utils

SAGEMAKER_MULTI_MODEL_ENABLED = os.environ.get('SAGEMAKER_MULTI_MODEL', 'false').lower() == 'true'
INFERENCE_SCRIPT_PATH = '/opt/ml/{}/code/inference.py'.format('models'
if SAGEMAKER_MULTI_MODEL_ENABLED
else 'model')
PYTHON_PROCESSING_ENABLED = os.path.exists(INFERENCE_SCRIPT_PATH)
INFERENCE_SCRIPT_PATH = '/opt/ml/model/code/inference.py'

SAGEMAKER_BATCHING_ENABLED = os.environ.get('SAGEMAKER_TFS_ENABLE_BATCHING', 'false').lower()
MODEL_CONFIG_FILE_PATH = '/sagemaker/model-config.cfg'
TFS_GRPC_PORT = os.environ.get('TFS_GRPC_PORT')
@@ -64,21 +63,24 @@ def __init__(self):
self._model_tfs_grpc_port = {}
self._model_tfs_pid = {}
self._tfs_ports = self._parse_sagemaker_port_range(SAGEMAKER_TFS_PORT_RANGE)
# If Multi-Model mode is enabled, dependencies/handlers will be imported
# during the _handle_load_model_post()
self.model_handlers = {}
else:
self._tfs_grpc_port = TFS_GRPC_PORT
self._tfs_rest_port = TFS_REST_PORT

if os.path.exists(INFERENCE_SCRIPT_PATH):
self._handler, self._input_handler, self._output_handler = self._import_handlers()
self._handlers = self._make_handler(self._handler,
self._input_handler,
self._output_handler)
else:
self._handlers = default_handler

self._tfs_enable_batching = SAGEMAKER_BATCHING_ENABLED == 'true'
self._tfs_default_model_name = os.environ.get('TFS_DEFAULT_MODEL_NAME', "None")

if PYTHON_PROCESSING_ENABLED:
self._handler, self._input_handler, self._output_handler = self._import_handlers()
self._handlers = self._make_handler(self._handler,
self._input_handler,
self._output_handler)
else:
self._handlers = default_handler

def on_post(self, req, res, model_name=None):
log.info(req.uri)
if model_name or "invocations" in req.uri:
@@ -129,6 +131,9 @@ def _handle_load_model_post(self, res, data):  # noqa: C901
# validate model files are in the specified base_path
if self.validate_model_dir(base_path):
try:
# install custom dependencies, import handlers
self._import_custom_modules(model_name)

tfs_config = tfs_utils.create_tfs_config_individual_model(model_name, base_path)
tfs_config_file = '/sagemaker/tfs-config/{}/model-config.cfg'.format(model_name)
log.info('tensorflow serving model config: \n%s\n', tfs_config)
@@ -197,6 +202,31 @@ def _handle_load_model_post(self, res, data):  # noqa: C901
model_name)
})

def _import_custom_modules(self, model_name):
inference_script_path = "/opt/ml/models/{}/model/code/inference.py".format(model_name)
requirements_file_path = "/opt/ml/models/{}/model/code/requirements.txt".format(model_name)
python_lib_path = "/opt/ml/models/{}/model/code/lib".format(model_name)

if os.path.exists(requirements_file_path):
log.info("pip install dependencies from requirements.txt")
pip_install_cmd = "pip3 install -r {}".format(requirements_file_path)
try:
subprocess.check_call(pip_install_cmd.split())
Contributor: it doesn't matter here, but for future reference: https://docs.python.org/3.7/library/shlex.html#shlex.split
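
To illustrate the reviewer's point with a made-up path: a plain ``str.split()`` breaks arguments that contain spaces or quotes, while ``shlex.split()`` tokenizes the way a shell would.

```python
import shlex

cmd = 'pip3 install -r "/opt/ml/models/demo model/model/code/requirements.txt"'

print(cmd.split())
# ['pip3', 'install', '-r', '"/opt/ml/models/demo', 'model/model/code/requirements.txt"']

print(shlex.split(cmd))
# ['pip3', 'install', '-r', '/opt/ml/models/demo model/model/code/requirements.txt']
```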

except subprocess.CalledProcessError:
log.error('failed to install required packages, exiting.')
raise ChildProcessError('failed to install required packages.')

if os.path.exists(python_lib_path):
log.info("add Python code library path")
sys.path.append(python_lib_path)

if os.path.exists(inference_script_path):
handler, input_handler, output_handler = self._import_handlers(model_name)
model_handlers = self._make_handler(handler, input_handler, output_handler)
self.model_handlers[model_name] = model_handlers
else:
self.model_handlers[model_name] = default_handler

def _cleanup_config_file(self, config_file):
if os.path.exists(config_file):
os.remove(config_file)
@@ -249,16 +279,24 @@ def _handle_invocation_post(self, req, res, model_name=None):

try:
res.status = falcon.HTTP_200
res.body, res.content_type = self._handlers(data, context)
if SAGEMAKER_MULTI_MODEL_ENABLED:
with lock():
handlers = self.model_handlers[model_name]
res.body, res.content_type = handlers(data, context)
else:
res.body, res.content_type = self._handlers(data, context)
except Exception as e: # pylint: disable=broad-except
log.exception('exception handling request: {}'.format(e))
res.status = falcon.HTTP_500
res.body = json.dumps({
'error': str(e)
}).encode('utf-8') # pylint: disable=E1101

def _import_handlers(self):
spec = importlib.util.spec_from_file_location('inference', INFERENCE_SCRIPT_PATH)
def _import_handlers(self, model_name=None):
inference_script = INFERENCE_SCRIPT_PATH
if model_name:
inference_script = "/opt/ml/models/{}/model/code/inference.py".format(model_name)
Contributor: it might make more sense to pass this string to the function so that we don't have it duplicated across the code?
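
One way to act on this suggestion, sketched as a hypothetical refactor rather than what the PR does: keep the format string in one place and build per-model paths through a small helper.

```python
import os

# Hypothetical helper; _import_custom_modules() and _import_handlers() could both
# build their per-model paths through it instead of repeating the format string.
MODEL_CODE_DIR = '/opt/ml/models/{}/model/code'


def model_code_path(model_name, filename):
    return os.path.join(MODEL_CODE_DIR.format(model_name), filename)


# model_code_path('half_plus_three', 'inference.py')
# -> '/opt/ml/models/half_plus_three/model/code/inference.py'
```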

spec = importlib.util.spec_from_file_location('inference', inference_script)
Contributor: nit: use " instead of '

Contributor (author): Yeah, I have a separate PR to address the quotes issue: #149

inference = importlib.util.module_from_spec(spec)
spec.loader.exec_module(inference)

@@ -358,7 +396,6 @@ def validate_model_dir(self, model_path):
versions = []
for _, dirs, _ in os.walk(model_path):
for dirname in dirs:
log.info("dirname: {}".format(dirname))
if dirname.isdigit():
versions.append(dirname)
return self.validate_model_versions(versions)
@@ -383,7 +420,6 @@ def on_get(self, req, res):  # pylint: disable=W0613

class ServiceResources:
def __init__(self):
self._enable_python_processing = PYTHON_PROCESSING_ENABLED
self._enable_model_manager = SAGEMAKER_MULTI_MODEL_ENABLED
self._python_service_resource = PythonServiceResource()
self._ping_resource = PingResource()
13 changes: 7 additions & 6 deletions docker/build_artifacts/sagemaker/serve.py
@@ -83,6 +83,7 @@ def __init__(self):

def _create_tfs_config(self):
models = tfs_utils.find_models()

if not models:
raise ValueError('no SavedModel bundles found!')

@@ -255,12 +256,6 @@ def start(self):
self._state = 'starting'
signal.signal(signal.SIGTERM, self._stop)

self._create_nginx_config()

if self._tfs_enable_batching:
log.info('batching is enabled')
tfs_utils.create_batching_config(self._tfs_batching_config_path)

if self._tfs_enable_multi_model_endpoint:
log.info('multi-model endpoint is enabled, TFS model servers will be started later')
else:
@@ -271,6 +266,12 @@
self._create_tfs_config()
self._start_tfs()

self._create_nginx_config()

if self._tfs_enable_batching:
log.info('batching is enabled')
tfs_utils.create_batching_config(self._tfs_batching_config_path)

if self._use_gunicorn:
self._setup_gunicorn()
self._start_gunicorn()
2 changes: 0 additions & 2 deletions docker/build_artifacts/sagemaker/tfs_utils.py
@@ -111,8 +111,6 @@ def create_tfs_config(
config += ' }\n'
config += '}\n'

log.info('tensorflow serving model config: \n%s\n', config)

with open(tfs_config_path, 'w') as f:
f.write(config)

8 changes: 8 additions & 0 deletions test/integration/local/conftest.py
@@ -53,3 +53,11 @@ def tag(request, framework_version, processor):
if not image_tag:
image_tag = '{}-{}'.format(framework_version, processor)
return image_tag


@pytest.fixture(autouse=True)
def skip_by_device_type(request, processor):
is_gpu = processor == "gpu"
if (request.node.get_closest_marker('skip_gpu') and is_gpu) or \
(request.node.get_closest_marker('skip_cpu') and not is_gpu):
pytest.skip('Skipping because running on \'{}\' instance'.format(processor))
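
For context, a hedged example of how a test can opt out of CPU runs through this fixture; the test body is a placeholder, and the custom markers may also need to be registered (for example in pytest.ini) to avoid unknown-marker warnings.

```python
import pytest


@pytest.mark.skip_cpu
def test_gpu_only_behavior():
    # skipped on CPU instances by the skip_by_device_type fixture above
    assert True
```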
1 change: 0 additions & 1 deletion test/integration/local/test_container.py
@@ -45,7 +45,6 @@ def container(request, docker_base_name, tag, runtime_config):
command = (
'docker run {}--name sagemaker-tensorflow-serving-test -p 8080:8080'
' --mount type=volume,source=model_volume,target=/opt/ml/model,readonly'
' -e SAGEMAKER_TFS_DEFAULT_MODEL_NAME=half_plus_three'
' -e SAGEMAKER_TFS_NGINX_LOGLEVEL=info'
' -e SAGEMAKER_BIND_TO_PORT=8080'
' -e SAGEMAKER_SAFE_PORT_RANGE=9000-9999'
18 changes: 16 additions & 2 deletions test/integration/local/test_multi_model_endpoint.py
@@ -20,8 +20,12 @@
import pytest
import requests

from multi_model_endpoint_test_utils import make_invocation_request, make_list_model_request, \
make_get_model_request, make_load_model_request, make_unload_model_request
from multi_model_endpoint_test_utils import (
make_invocation_request,
make_list_model_request,
make_load_model_request,
make_unload_model_request,
)

PING_URL = 'http://localhost:8080/ping'

@@ -69,11 +73,13 @@ def container(request, docker_base_name, tag, runtime_config):
subprocess.check_call('docker rm -f sagemaker-tensorflow-serving-test'.split())


@pytest.mark.skip_gpu
def test_ping():
res = requests.get(PING_URL)
assert res.status_code == 200


@pytest.mark.skip_gpu
def test_container_start_invocation_fail():
x = {
'instances': [1.0, 2.0, 5.0]
@@ -84,13 +90,15 @@ def test_container_start_invocation_fail():
assert "Model half_plus_three is not loaded yet." in str(y)


@pytest.mark.skip_gpu
def test_list_models_empty():
code, res = make_list_model_request()
res = json.loads(res)
assert code == 200
assert len(res) == 0


@pytest.mark.skip_gpu
def test_delete_unloaded_model():
# unloads the given model/version, no-op if not loaded
model_name = 'non-existing-model'
@@ -99,6 +107,7 @@ def test_delete_unloaded_model():
assert 'Model {} is not loaded yet'.format(model_name) in res


@pytest.mark.skip_gpu
def test_delete_model():
model_name = 'half_plus_three'
model_data = {
@@ -125,6 +134,7 @@ def test_delete_model():
assert 'Model {} is not loaded yet.'.format(model_name) in str(y2)


@pytest.mark.skip_gpu
def test_load_two_models():
model_name_1 = 'half_plus_two'
model_data_1 = {
@@ -165,6 +175,7 @@ def test_load_two_models():
assert len(res3) == 2


@pytest.mark.skip_gpu
def test_load_one_model_two_times():
model_name = 'cifar'
model_data = {
@@ -180,6 +191,7 @@ def test_load_one_model_two_times():
assert'Model {} is already loaded'.format(model_name) in res2


@pytest.mark.skip_gpu
def test_load_non_existing_model():
model_name = 'non-existing'
base_path = '/opt/ml/models/non-existing'
@@ -192,6 +204,7 @@ def test_load_non_existing_model():
assert 'Could not find valid base path {} for servable {}'.format(base_path, model_name) in str(res)


@pytest.mark.skip_gpu
def test_bad_model_reqeust():
bad_model_data = {
'model_name': 'model_name',
@@ -201,6 +214,7 @@ def test_bad_model_reqeust():
assert code == 500


@pytest.mark.skip_gpu
def test_invalid_model_version():
model_name = 'invalid_version'
base_path = '/opt/ml/models/invalid_version'
1 change: 0 additions & 1 deletion test/integration/local/test_pre_post_processing.py
@@ -53,7 +53,6 @@ def container(volume, docker_base_name, tag, runtime_config):
command = (
'docker run {}--name sagemaker-tensorflow-serving-test -p 8080:8080'
' --mount type=volume,source={},target=/opt/ml/model,readonly'
' -e SAGEMAKER_TFS_DEFAULT_MODEL_NAME=half_plus_three'
' -e SAGEMAKER_TFS_NGINX_LOGLEVEL=info'
' -e SAGEMAKER_BIND_TO_PORT=8080'
' -e SAGEMAKER_SAFE_PORT_RANGE=9000-9999'