This repository was archived by the owner on May 23, 2024. It is now read-only.

change: update MME Pre/Post-Processing model and script paths #153

Merged
13 commits, merged on Jul 16, 2020
Changes from 7 commits
17 changes: 8 additions & 9 deletions README.md
@@ -645,23 +645,22 @@ Only 90% of the ports will be utilized and each loaded model will be allocated with 2 ports.
For example, if the ``SAGEMAKER_SAFE_PORT_RANGE`` is between 9000 and 9999, the maximum number of models that can be loaded to the endpoint at the same time would be 449 ((9999 - 9000) * 0.9 / 2).
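As a rough sketch of that arithmetic (the 90% utilization factor and the two ports per model come from the paragraph above; the helper name below is made up for illustration):

```python
# Minimal sketch of the capacity estimate described above: roughly 90% of the
# ports in SAGEMAKER_SAFE_PORT_RANGE are usable, and each loaded model is
# assumed to consume two ports (typically one gRPC and one REST).
def estimated_max_models(port_range):
    low, high = (int(p) for p in port_range.split("-"))
    return int((high - low) * 0.9 / 2)

print(estimated_max_models("9000-9999"))  # -> 449
```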

### Using Multi-Model Endpoint with Pre/Post-Processing
Multi-Model Endpoint can be used together with Pre/Post-Processing. However, please note that in Multi-Model mode, the path of ``inference.py`` is ``/opt/ml/models/code`` instead of ``/opt/ml/model/code``.
Also, all loaded models will share the same ``inference.py`` to handle invocation requests. An example of the directory structure of Multi-Model Endpoint and Pre/Post-Processing would look like this:
Multi-Model Endpoint can be used together with Pre/Post-Processing. Each model will need its own ``inference.py``; otherwise, default handlers will be used. An example of the directory structure for a Multi-Model Endpoint with Pre/Post-Processing would look like this:

model1
/opt/ml/models/model1/model
|--[model_version_number]
|--variables
|--saved_model.pb
model2
/opt/ml/models/model2/model
|--[model_version_number]
|--assets
|--variables
|--saved_model.pb
code
|--lib
|--external_module
|--inference.py
|--requirements.txt
code
|--lib
|--external_module
|--inference.py
|--requirements.txt
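Given the layout above, loading one of the models through the container's multi-model management API could look roughly like the sketch below (based on the integration-test helpers later in this PR; the localhost URL and exact request shape are assumptions):

```python
import json

import requests

# Hypothetical load request for "model1" from the directory layout above.
# The management endpoint is assumed to be http://localhost:8080/models,
# mirroring what make_load_model_request wraps in the integration tests.
model_data = {
    "model_name": "model1",
    "url": "/opt/ml/models/model1/model",
}
response = requests.post("http://localhost:8080/models", data=json.dumps(model_data))
print(response.status_code, response.text)
```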

## Contributing

72 changes: 55 additions & 17 deletions docker/build_artifacts/sagemaker/python_service.py
@@ -17,6 +17,7 @@
import os
import subprocess
import time
import sys

import falcon
import requests
@@ -27,10 +28,8 @@
import tfs_utils

SAGEMAKER_MULTI_MODEL_ENABLED = os.environ.get('SAGEMAKER_MULTI_MODEL', 'false').lower() == 'true'
INFERENCE_SCRIPT_PATH = '/opt/ml/{}/code/inference.py'.format('models'
if SAGEMAKER_MULTI_MODEL_ENABLED
else 'model')
PYTHON_PROCESSING_ENABLED = os.path.exists(INFERENCE_SCRIPT_PATH)
INFERENCE_SCRIPT_PATH = '/opt/ml/model/code/inference.py'

SAGEMAKER_BATCHING_ENABLED = os.environ.get('SAGEMAKER_TFS_ENABLE_BATCHING', 'false').lower()
MODEL_CONFIG_FILE_PATH = '/sagemaker/model-config.cfg'
TFS_GRPC_PORT = os.environ.get('TFS_GRPC_PORT')
@@ -64,21 +63,24 @@ def __init__(self):
self._model_tfs_grpc_port = {}
self._model_tfs_pid = {}
self._tfs_ports = self._parse_sagemaker_port_range(SAGEMAKER_TFS_PORT_RANGE)
# If Multi-Model mode is enabled, dependencies/handlers will be imported
# during the _handle_load_model_post()
self.model_handlers = {}
else:
self._tfs_grpc_port = TFS_GRPC_PORT
self._tfs_rest_port = TFS_REST_PORT

if os.path.exists(INFERENCE_SCRIPT_PATH):
self._handler, self._input_handler, self._output_handler = self._import_handlers()
self._handlers = self._make_handler(self._handler,
self._input_handler,
self._output_handler)
else:
self._handlers = default_handler

self._tfs_enable_batching = SAGEMAKER_BATCHING_ENABLED == 'true'
self._tfs_default_model_name = os.environ.get('TFS_DEFAULT_MODEL_NAME', "None")

if PYTHON_PROCESSING_ENABLED:
self._handler, self._input_handler, self._output_handler = self._import_handlers()
self._handlers = self._make_handler(self._handler,
self._input_handler,
self._output_handler)
else:
self._handlers = default_handler

def on_post(self, req, res, model_name=None):
log.info(req.uri)
if model_name or "invocations" in req.uri:
@@ -129,6 +131,9 @@ def _handle_load_model_post(self, res, data): # noqa: C901
# validate model files are in the specified base_path
if self.validate_model_dir(base_path):
try:
# install custom dependencies, import handlers
self._import_custom_modules(model_name)

tfs_config = tfs_utils.create_tfs_config_individual_model(model_name, base_path)
tfs_config_file = '/sagemaker/tfs-config/{}/model-config.cfg'.format(model_name)
log.info('tensorflow serving model config: \n%s\n', tfs_config)
@@ -197,6 +202,33 @@ def _handle_load_model_post(self, res, data): # noqa: C901
model_name)
})

def _import_custom_modules(self, model_name):
inference_script_path = "/opt/ml/models/{}/model/code/inference.py".format(model_name)
requirements_file_path = "/opt/ml/models/{}/model/code/requirements.txt".format(model_name)
python_lib_path = "/opt/ml/models/{}/model/code/lib".format(model_name)

if os.path.exists(requirements_file_path):
log.info("pip install dependencies from requirements.txt")
pip_install_cmd = "pip3 install -r {}".format(requirements_file_path)
try:
subprocess.check_call(pip_install_cmd.split())
Contributor comment: it doesn't matter here, but for future reference: https://docs.python.org/3.7/library/shlex.html#shlex.split
(A short illustration of shlex.split appears after the _import_custom_modules function below.)
except subprocess.CalledProcessError:
log.error('failed to install required packages, exiting.')
raise ChildProcessError('failed to install required packages.')

if os.path.exists(python_lib_path):
log.info("add Python code library path")
sys.path.append(python_lib_path)

if os.path.exists(inference_script_path):
handler, input_handler, output_handler = self._import_handlers(model_name)
model_handlers = self._make_handler(handler,
input_handler,
output_handler)
self.model_handlers[model_name] = model_handlers
else:
self.model_handlers[model_name] = default_handler
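Illustrating the reviewer's shlex note above: a plain str.split breaks on every space, while shlex.split honors shell-style quoting. A standalone sketch, not part of the diff (the quoted path is invented):

```python
import shlex

# A requirements path containing a space is mangled by a plain split but
# survives shlex.split, which honors shell-style quoting.
cmd = 'pip3 install -r "/opt/ml/models/my model/model/code/requirements.txt"'

print(cmd.split())
# ['pip3', 'install', '-r', '"/opt/ml/models/my', 'model/model/code/requirements.txt"']

print(shlex.split(cmd))
# ['pip3', 'install', '-r', '/opt/ml/models/my model/model/code/requirements.txt']
```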

def _cleanup_config_file(self, config_file):
if os.path.exists(config_file):
os.remove(config_file)
@@ -249,16 +281,24 @@ def _handle_invocation_post(self, req, res, model_name=None):

try:
res.status = falcon.HTTP_200
res.body, res.content_type = self._handlers(data, context)
if SAGEMAKER_MULTI_MODEL_ENABLED:
with lock():
handlers = self.model_handlers[model_name]
res.body, res.content_type = handlers(data, context)
else:
res.body, res.content_type = self._handlers(data, context)
except Exception as e: # pylint: disable=broad-except
log.exception('exception handling request: {}'.format(e))
res.status = falcon.HTTP_500
res.body = json.dumps({
'error': str(e)
}).encode('utf-8') # pylint: disable=E1101

def _import_handlers(self):
spec = importlib.util.spec_from_file_location('inference', INFERENCE_SCRIPT_PATH)
def _import_handlers(self, model_name=None):
inference_script = INFERENCE_SCRIPT_PATH
if model_name:
inference_script = "/opt/ml/models/{}/model/code/inference.py".format(model_name)
Contributor comment: it might make more sense to pass this string to the function so that we don't have it duplicated across the code?
(A sketch of that refactor follows a few lines below.)

spec = importlib.util.spec_from_file_location('inference', inference_script)
Contributor comment: nit: use " instead of '
Author reply: Yeah, I have a separate PR to address the quotes issue: #149

inference = importlib.util.module_from_spec(spec)
spec.loader.exec_module(inference)
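A minimal sketch of the consolidation the reviewer suggests, assuming a small helper (the name inference_script_path is made up) so the per-model path template lives in one place:

```python
# Hypothetical helper consolidating the per-model script path, so that
# _import_custom_modules and _import_handlers no longer format
# "/opt/ml/models/{}/model/code/inference.py" independently.
DEFAULT_INFERENCE_SCRIPT = "/opt/ml/model/code/inference.py"


def inference_script_path(model_name=None):
    if model_name:
        return "/opt/ml/models/{}/model/code/inference.py".format(model_name)
    return DEFAULT_INFERENCE_SCRIPT


print(inference_script_path("model1"))
# /opt/ml/models/model1/model/code/inference.py
```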

@@ -358,7 +398,6 @@ def validate_model_dir(self, model_path):
versions = []
for _, dirs, _ in os.walk(model_path):
for dirname in dirs:
log.info("dirname: {}".format(dirname))
if dirname.isdigit():
versions.append(dirname)
return self.validate_model_versions(versions)
@@ -383,7 +422,6 @@ def on_get(self, req, res): # pylint: disable=W0613

class ServiceResources:
def __init__(self):
self._enable_python_processing = PYTHON_PROCESSING_ENABLED
self._enable_model_manager = SAGEMAKER_MULTI_MODEL_ENABLED
self._python_service_resource = PythonServiceResource()
self._ping_resource = PingResource()
4 changes: 2 additions & 2 deletions test/integration/local/test_multi_model_endpoint.py
@@ -20,8 +20,8 @@
import pytest
import requests

from multi_model_endpoint_test_utils import make_invocation_request, make_list_model_request, \
make_get_model_request, make_load_model_request, make_unload_model_request
from multi_model_endpoint_test_utils import make_invocation_request, make_list_model_request,\
make_load_model_request, make_unload_model_request

PING_URL = 'http://localhost:8080/ping'

7 changes: 3 additions & 4 deletions test/integration/local/test_pre_post_processing_mme.py
@@ -22,8 +22,7 @@

import requests

from multi_model_endpoint_test_utils import make_invocation_request, make_list_model_request, \
make_get_model_request, make_load_model_request, make_unload_model_request, make_headers
from multi_model_endpoint_test_utils import make_load_model_request, make_headers


PING_URL = 'http://localhost:8080/ping'
@@ -57,7 +56,7 @@ def container(volume, docker_base_name, tag, runtime_config):
try:
command = (
'docker run {}--name sagemaker-tensorflow-serving-test -p 8080:8080'
' --mount type=volume,source={},target=/opt/ml/models,readonly'
' --mount type=volume,source={},target=/opt/ml/models/half_plus_three/model,readonly'
' -e SAGEMAKER_TFS_NGINX_LOGLEVEL=info'
' -e SAGEMAKER_BIND_TO_PORT=8080'
' -e SAGEMAKER_SAFE_PORT_RANGE=9000-9999'
@@ -87,7 +86,7 @@ def container(volume, docker_base_name, tag, runtime_config):
def model():
model_data = {
'model_name': MODEL_NAME,
'url': '/opt/ml/models/half_plus_three'
'url': '/opt/ml/models/half_plus_three/model/half_plus_three'
}
make_load_model_request(json.dumps(model_data))
return MODEL_NAME
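Once the fixture above has loaded half_plus_three, invocations go to the model-specific invoke path rather than a single /invocations endpoint. A sketch of the request the make_invocation_request helper presumably wraps (the URL pattern and payload shape are assumptions):

```python
import json

import requests

# Hypothetical direct invocation of the loaded model; multi-model requests
# are routed by model name.
invoke_url = "http://localhost:8080/models/half_plus_three/invoke"
payload = {"instances": [1.0, 2.0, 5.0]}

response = requests.post(
    invoke_url,
    data=json.dumps(payload),
    headers={"Content-Type": "application/json"},
)
print(response.json())  # half_plus_three computes x / 2 + 3, so expect [3.5, 4.0, 5.5]
```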