This repository was archived by the owner on May 23, 2024. It is now read-only.

Commit 275c8d9

Feature: Support multiple inference.py files and universal inference.py file along with universal requirements.txt file
1 parent 1a265db

76 files changed (+1002 −48 lines)

Note: large commits hide some content by default, so only a subset of the 76 changed files is shown below.

README.md

Lines changed: 14 additions & 3 deletions
@@ -164,8 +164,6 @@ For example:
 
 ## Pre/Post-Processing
 
-**NOTE: There is currently no support for pre-/post-processing with multi-model containers.**
-
 SageMaker TensorFlow Serving Container supports the following Content-Types for requests:
 
 * `application/json` (default)
@@ -672,7 +670,7 @@ Only 90% of the ports will be utilized and each loaded model will be allocated w
 For example, if the ``SAGEMAKER_SAFE_PORT_RANGE`` is between 9000 to 9999, the maximum number of models that can be loaded to the endpoint at the same time would be 499 ((9999 - 9000) * 0.9 / 2).
 
 ### Using Multi-Model Endpoint with Pre/Post-Processing
-Multi-Model Endpoint can be used together with Pre/Post-Processing. Each model will need its own ``inference.py`` otherwise default handlers will be used. An example of the directory structure of Multi-Model Endpoint and Pre/Post-Processing would look like this:
+Multi-Model Endpoint can be used together with Pre/Post-Processing. Each model can either have its own ``inference.py`` or use a universal ``inference.py``. If both model-specific and universal ``inference.py`` files are provided, then the model-specific ``inference.py`` file is used. If both files are absent, then the default handlers will be used. An example of the directory structure of Multi-Model Endpoint with a model-specific ``inference.py`` file would look like this:
 
 /opt/ml/models/model1/model
 |--[model_version_number]
@@ -687,7 +685,20 @@ Multi-Model Endpoint can be used together with Pre/Post-Processing. Each model w
 |--lib
 |--external_module
 |--inference.py
+Another example of the directory structure of Multi-Model Endpoint with a universal ``inference.py`` file is as follows:
 
+/opt/ml/models/model1/model
+|--[model_version_number]
+|--variables
+|--saved_model.pb
+/opt/ml/models/model2/model
+|--[model_version_number]
+|--assets
+|--variables
+|--saved_model.pb
+code
+|--requirements.txt
+|--inference.py
 ## Contributing
 
 Please read [CONTRIBUTING.md](https://github.com/aws/sagemaker-tensorflow-serving-container/blob/master/CONTRIBUTING.md)
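
For reference, both a model-specific and a universal ``inference.py`` implement the same pre/post-processing interface described earlier in this README (``input_handler``/``output_handler``, or a single ``handler``). Below is a minimal sketch of a universal script; the handler names follow the documented interface, while the body (a simple JSON passthrough) is illustrative only:

```python
# Minimal sketch of a universal inference.py at /opt/ml/code/inference.py
# (a model-specific copy would live at /opt/ml/models/<model>/model/code/inference.py).
# Handler names follow the container's documented interface; the body is illustrative.


def input_handler(data, context):
    """Pre-process the request before it is forwarded to TensorFlow Serving."""
    if context.request_content_type == "application/json":
        d = data.read().decode("utf-8")
        return d if len(d) else ""
    raise ValueError("Unsupported content type: {}".format(context.request_content_type))


def output_handler(response, context):
    """Post-process the TensorFlow Serving response before returning it to the client."""
    response_content_type = context.accept_header
    prediction = response.content
    return prediction, response_content_type
```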

docker/build_artifacts/sagemaker/python_service.py

Lines changed: 31 additions & 5 deletions
@@ -17,6 +17,7 @@
 import os
 import subprocess
 import grpc
+import sys
 
 import falcon
 import requests
@@ -26,7 +27,7 @@
 import tfs_utils
 
 SAGEMAKER_MULTI_MODEL_ENABLED = os.environ.get("SAGEMAKER_MULTI_MODEL", "false").lower() == "true"
-MODEL_DIR = "models" if SAGEMAKER_MULTI_MODEL_ENABLED else "model"
+MODEL_DIR = "" if SAGEMAKER_MULTI_MODEL_ENABLED else "model"
 INFERENCE_SCRIPT_PATH = f"/opt/ml/{MODEL_DIR}/code/inference.py"
 
 SAGEMAKER_BATCHING_ENABLED = os.environ.get("SAGEMAKER_TFS_ENABLE_BATCHING", "false").lower()
@@ -64,6 +65,7 @@ def __init__(self):
             self._model_tfs_grpc_port = {}
             self._model_tfs_pid = {}
             self._tfs_ports = self._parse_sagemaker_port_range_mme(SAGEMAKER_TFS_PORT_RANGE)
+            self._default_handlers_enabled = False
             # If Multi-Model mode is enabled, dependencies/handlers will be imported
             # during the _handle_load_model_post()
             self.model_handlers = {}
@@ -85,6 +87,7 @@ def __init__(self):
             )
         else:
             self._handlers = default_handler
+            self._default_handlers_enabled = True
 
         self._tfs_enable_batching = SAGEMAKER_BATCHING_ENABLED == "true"
         self._tfs_default_model_name = os.environ.get("TFS_DEFAULT_MODEL_NAME", "None")
@@ -143,6 +146,7 @@ def _handle_load_model_post(self, res, data): # noqa: C901
         # validate model files are in the specified base_path
         if self.validate_model_dir(base_path):
             try:
+                self._import_custom_modules(model_name)
                 tfs_config = tfs_utils.create_tfs_config_individual_model(model_name, base_path)
                 tfs_config_file = "/sagemaker/tfs-config/{}/model-config.cfg".format(model_name)
                 log.info("tensorflow serving model config: \n%s\n", tfs_config)
@@ -221,6 +225,17 @@ def _handle_load_model_post(self, res, data): # noqa: C901
                 }
             )
 
+    def _import_custom_modules(self, model_name):
+        inference_script_path = "/opt/ml/models/{}/model/code/inference.py".format(model_name)
+        python_lib_path = "/opt/ml/models/{}/model/code/lib".format(model_name)
+        if os.path.exists(python_lib_path):
+            log.info("add Python code library path")
+            sys.path.append(python_lib_path)
+        if os.path.exists(inference_script_path):
+            handler, input_handler, output_handler = self._import_handlers(inference_script_path)
+            model_handlers = self._make_handler(handler, input_handler, output_handler)
+            self.model_handlers[model_name] = model_handlers
+
     def _cleanup_config_file(self, config_file):
         if os.path.exists(config_file):
             os.remove(config_file)
@@ -264,8 +279,20 @@ def _handle_invocation_post(self, req, res, model_name=None):
 
         try:
             res.status = falcon.HTTP_200
-
-            res.body, res.content_type = self._handlers(data, context)
+            handlers = self._handlers
+            if SAGEMAKER_MULTI_MODEL_ENABLED and model_name in self.model_handlers:
+                inference_script_path = "/opt/ml/models/{}/model/code/" \
+                                        "inference.py".format(model_name)
+                log.info("Inference script found at path {}.".format(inference_script_path))
+                log.info("Inference script exists, importing handlers.")
+                handlers = self.model_handlers[model_name]
+            elif not self._default_handlers_enabled:
+                log.info("Universal inference script found at path "
+                         "{}.".format(INFERENCE_SCRIPT_PATH))
+                log.info("Universal inference script exists, importing handlers.")
+            else:
+                log.info("Inference script does not exist, using default handlers.")
+            res.body, res.content_type = handlers(data, context)
         except Exception as e:  # pylint: disable=broad-except
             log.exception("exception handling request: {}".format(e))
             res.status = falcon.HTTP_500
@@ -276,8 +303,7 @@ def _setup_channel(self, grpc_port):
         log.info("Creating grpc channel for port: %s", grpc_port)
         self._channels[grpc_port] = grpc.insecure_channel("localhost:{}".format(grpc_port))
 
-    def _import_handlers(self):
-        inference_script = INFERENCE_SCRIPT_PATH
+    def _import_handlers(self, inference_script=INFERENCE_SCRIPT_PATH):
         spec = importlib.util.spec_from_file_location("inference", inference_script)
         inference = importlib.util.module_from_spec(spec)
         spec.loader.exec_module(inference)
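
Taken together, the changes to ``python_service.py`` give each invocation a three-level handler precedence: a model-specific ``inference.py`` imported when the model is loaded, else the universal ``inference.py`` imported at container start-up, else the default pass-through handlers. A standalone sketch of that precedence follows; the names (``resolve_handlers``, ``per_model``, ``universal``, ``default``) are illustrative and not part of the module:

```python
from typing import Callable, Dict, Optional, Tuple

# A handler takes (data, context) and returns (body, content_type), as in python_service.py.
Handler = Callable[[object, object], Tuple[bytes, str]]


def resolve_handlers(model_name: str,
                     per_model: Dict[str, Handler],
                     universal: Optional[Handler],
                     default: Handler) -> Handler:
    """Handler precedence implied by the diff: model-specific > universal > default."""
    if model_name in per_model:
        # imported by _import_custom_modules() when the model was loaded
        return per_model[model_name]
    if universal is not None:
        # universal /opt/ml/code/inference.py imported at start-up
        return universal
    # no inference.py anywhere: fall back to the default pass-through handlers
    return default
```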

docker/build_artifacts/sagemaker/serve.py

Lines changed: 5 additions & 5 deletions
@@ -28,9 +28,8 @@
 JS_INVOCATIONS = "js_content tensorflowServing.invocations"
 GUNICORN_PING = "proxy_pass http://gunicorn_upstream/ping"
 GUNICORN_INVOCATIONS = "proxy_pass http://gunicorn_upstream/invocations"
-MULTI_MODEL = "s" if os.environ.get("SAGEMAKER_MULTI_MODEL", "False").lower() == "true" else ""
-MODEL_DIR = f"model{MULTI_MODEL}"
-CODE_DIR = "/opt/ml/{}/code".format(MODEL_DIR)
+MODEL_DIR = "" if os.environ.get("SAGEMAKER_MULTI_MODEL", "False").lower() == "true" else "model"
+CODE_DIR = f"/opt/ml/{MODEL_DIR}/code"
 PYTHON_LIB_PATH = os.path.join(CODE_DIR, "lib")
 REQUIREMENTS_PATH = os.path.join(CODE_DIR, "requirements.txt")
 INFERENCE_PATH = os.path.join(CODE_DIR, "inference.py")
@@ -134,7 +133,8 @@ def __init__(self):
         os.environ["TFS_REST_PORTS"] = self._tfs_rest_concat_ports
 
     def _need_python_service(self):
-        if os.path.exists(INFERENCE_PATH):
+        if (os.path.exists(INFERENCE_PATH) or os.path.exists(REQUIREMENTS_PATH)
+                or os.path.exists(PYTHON_LIB_PATH)):
             self._enable_python_service = True
         if os.environ.get("SAGEMAKER_MULTI_MODEL_UNIVERSAL_BUCKET") and os.environ.get(
             "SAGEMAKER_MULTI_MODEL_UNIVERSAL_PREFIX"
@@ -256,7 +256,7 @@ def _download_scripts(self, bucket, prefix):
         paginator = client.get_paginator("list_objects")
         for result in paginator.paginate(Bucket=bucket, Delimiter="/", Prefix=prefix):
             for file in result.get("Contents", []):
-                destination = os.path.join(CODE_DIR, file.get("Key"))
+                destination = os.path.join(CODE_DIR, file.get("Key").split("/")[-1])
                 if not os.path.exists(os.path.dirname(destination)):
                     os.makedirs(os.path.dirname(destination))
                 resource.meta.client.download_file(bucket, file.get("Key"), destination)
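
With ``MODEL_DIR`` set to an empty string in multi-model mode, the universal code location collapses to ``/opt/ml//code`` (equivalent to ``/opt/ml/code`` on POSIX paths), and ``_download_scripts`` now flattens S3 keys into that directory. A small sketch of the resulting paths, derived from the constants above and using a made-up S3 key:

```python
import os

# Values follow the constants in serve.py above for multi-model mode (SAGEMAKER_MULTI_MODEL=true)
MODEL_DIR = ""
CODE_DIR = f"/opt/ml/{MODEL_DIR}/code"            # "/opt/ml//code", i.e. /opt/ml/code on POSIX
INFERENCE_PATH = os.path.join(CODE_DIR, "inference.py")
REQUIREMENTS_PATH = os.path.join(CODE_DIR, "requirements.txt")

# _download_scripts keeps only the key's base name, so a universal script stored under a
# hypothetical key "universal/inference.py" lands directly in CODE_DIR rather than in a subfolder.
key = "universal/inference.py"
destination = os.path.join(CODE_DIR, key.split("/")[-1])
print(INFERENCE_PATH)   # /opt/ml//code/inference.py
print(destination)      # /opt/ml//code/inference.py
```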

test/integration/local/test_pre_post_processing_mme.py renamed to test/integration/local/test_pre_post_processing_mme1.py

Lines changed: 52 additions & 35 deletions
@@ -11,9 +11,11 @@
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
 
+# In this test, only a universal inference.py file is provided. It is expected that the handlers
+# from the universal inference.py file will be used by both models.
+
 import json
 import os
-import shutil
 import subprocess
 import sys
 import time
@@ -27,27 +29,27 @@
 
 PING_URL = "http://localhost:8080/ping"
 INVOCATION_URL = "http://localhost:8080/models/{}/invoke"
-MODEL_NAME = "half_plus_three"
+MODEL_NAMES = ["half_plus_three", "half_plus_two"]
 
 
 @pytest.fixture(scope="session", autouse=True)
 def volume():
     try:
-        model_dir = os.path.abspath("test/resources/mme_universal_script")
+        model_dir = os.path.abspath("test/resources/mme1")
         subprocess.check_call(
-            "docker volume create --name model_volume_mme --opt type=none "
+            "docker volume create --name model_volume_mme1 --opt type=none "
             "--opt device={} --opt o=bind".format(model_dir).split())
         yield model_dir
     finally:
-        subprocess.check_call("docker volume rm model_volume_mme".split())
+        subprocess.check_call("docker volume rm model_volume_mme1".split())
 
 
 @pytest.fixture(scope="module", autouse=True)
 def container(docker_base_name, tag, runtime_config):
     try:
         command = (
             "docker run {}--name sagemaker-tensorflow-serving-test -p 8080:8080"
-            " --mount type=volume,source=model_volume_mme,target=/opt/ml/models,readonly"
+            " --mount type=volume,source=model_volume_mme1,target=/opt/ml/models,readonly"
             " -e SAGEMAKER_TFS_NGINX_LOGLEVEL=info"
             " -e SAGEMAKER_BIND_TO_PORT=8080"
             " -e SAGEMAKER_SAFE_PORT_RANGE=9000-9999"
@@ -74,13 +76,14 @@ def container(docker_base_name, tag, runtime_config):
 
 
 @pytest.fixture
-def model():
-    model_data = {
-        "model_name": MODEL_NAME,
-        "url": "/opt/ml/models/half_plus_three/model/half_plus_three"
-    }
-    make_load_model_request(json.dumps(model_data))
-    return MODEL_NAME
+def models():
+    for MODEL_NAME in MODEL_NAMES:
+        model_data = {
+            "model_name": MODEL_NAME,
+            "url": "/opt/ml/models/{}/model/{}".format(MODEL_NAME, MODEL_NAME)
+        }
+        make_load_model_request(json.dumps(model_data))
+    return MODEL_NAMES
 
 
 @pytest.mark.skip_gpu
@@ -90,20 +93,25 @@ def test_ping_service():
 
 
 @pytest.mark.skip_gpu
-def test_predict_json(model):
+def test_predict_json(models):
     headers = make_headers()
     data = "{\"instances\": [1.0, 2.0, 5.0]}"
-    response = requests.post(INVOCATION_URL.format(model), data=data, headers=headers).json()
-    assert response == {"predictions": [3.5, 4.0, 5.5]}
+    responses = []
+    for model in models:
+        response = requests.post(INVOCATION_URL.format(model), data=data, headers=headers).json()
+        responses.append(response)
+    assert responses[0] == {"predictions": [3.5, 4.0, 5.5]}
+    assert responses[1] == {"predictions": [2.5, 3.0, 4.5]}
 
 
 @pytest.mark.skip_gpu
 def test_zero_content():
     headers = make_headers()
     x = ""
-    response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=x, headers=headers)
-    assert 500 == response.status_code
-    assert "document is empty" in response.text
+    for MODEL_NAME in MODEL_NAMES:
+        response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=x, headers=headers)
+        assert 500 == response.status_code
+        assert "document is empty" in response.text
 
 
 @pytest.mark.skip_gpu
@@ -113,34 +121,43 @@ def test_large_input():
     with open(data_file, "r") as file:
         x = file.read()
     headers = make_headers(content_type="text/csv")
-    response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=x, headers=headers).json()
-    predictions = response["predictions"]
-    assert len(predictions) == 753936
+    for MODEL_NAME in MODEL_NAMES:
+        response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=x, headers=headers).json()
+        predictions = response["predictions"]
+        assert len(predictions) == 753936
 
 
 @pytest.mark.skip_gpu
 def test_csv_input():
     headers = make_headers(content_type="text/csv")
     data = "1.0,2.0,5.0"
-    response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=data, headers=headers).json()
-    assert response == {"predictions": [3.5, 4.0, 5.5]}
-
+    responses = []
+    for MODEL_NAME in MODEL_NAMES:
+        response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=data, headers=headers).json()
+        responses.append(response)
+    assert responses[0] == {"predictions": [3.5, 4.0, 5.5]}
+    assert responses[1] == {"predictions": [2.5, 3.0, 4.5]}
 
 @pytest.mark.skip_gpu
 def test_specific_versions():
-    for version in ("123", "124"):
-        headers = make_headers(content_type="text/csv", version=version)
-        data = "1.0,2.0,5.0"
-        response = requests.post(
-            INVOCATION_URL.format(MODEL_NAME), data=data, headers=headers
-        ).json()
-        assert response == {"predictions": [3.5, 4.0, 5.5]}
+    for MODEL_NAME in MODEL_NAMES:
+        for version in ("123", "124"):
+            headers = make_headers(content_type="text/csv", version=version)
+            data = "1.0,2.0,5.0"
+            response = requests.post(
+                INVOCATION_URL.format(MODEL_NAME), data=data, headers=headers
+            ).json()
+            if MODEL_NAME == "half_plus_three":
+                assert response == {"predictions": [3.5, 4.0, 5.5]}
+            else:
+                assert response == {"predictions": [2.5, 3.0, 4.5]}
 
 
 @pytest.mark.skip_gpu
 def test_unsupported_content_type():
     headers = make_headers("unsupported-type", "predict")
     data = "aW1hZ2UgYnl0ZXM="
-    response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=data, headers=headers)
-    assert 500 == response.status_code
-    assert "unsupported content type" in response.text
+    for MODEL_NAME in MODEL_NAMES:
+        response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=data, headers=headers)
+        assert 500 == response.status_code
+        assert "unsupported content type" in response.text

0 commit comments
