Add context to handler functions #103
Merged
Commits (7):
- fb84927: Add context to handler functions
- d4372c9: Add context to validate_and_initialize_user_module unit tests
- 1e54f98: Fix formatting issues
- ad06a7f: Update comments in handler_service.py
- 778408f: Include unit tests for handler service with and without context
- 129fa2f (sachanub): Add logic to check for custom handler functions
- 4c5911f (sachanub): Modify logic to identify extra arguments in initialize stage
tests/resources/model_input_predict_output_fn/code/inference.py
8 changes: 4 additions & 4 deletions

@@ -1,14 +1,14 @@
-def model_fn(model_dir):
+def model_fn(model_dir, context=None):
     return "model"


-def input_fn(data, content_type):
+def input_fn(data, content_type, context=None):
     return "data"


-def predict_fn(data, model):
+def predict_fn(data, model, context=None):
     return "output"


-def output_fn(prediction, accept):
+def output_fn(prediction, accept, context=None):
     return prediction
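The key property of the diff above is that the new `context` parameter defaults to `None`, so updated handlers stay callable with the old argument list. A minimal illustration (hypothetical, not code from the PR):

```python
# Because `context` defaults to None, a handler written against the new
# signature still accepts the legacy two-argument call unchanged.

def predict_fn(data, model, context=None):
    # Handlers that ignore `context` behave exactly as before.
    return "output"

old_style = predict_fn("data", "model")         # legacy call, context is None
new_style = predict_fn("data", "model", "ctx")  # new call with a context object
```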
tests/resources/model_transform_fn/code/inference_tranform_fn.py
4 changes: 2 additions & 2 deletions

@@ -1,9 +1,9 @@
 import os


-def model_fn(model_dir):
+def model_fn(model_dir, context=None):
     return f"Loading {os.path.basename(__file__)}"


-def transform_fn(a, b, c, d):
+def transform_fn(a, b, c, d, context=None):
     return f"output {b}"
philschmid: Is that needed? What overhead does it add to a request/function call? Why don't we just pass self.context in the handle method, in sagemaker-huggingface-inference-toolkit/src/sagemaker_huggingface_inference_toolkit/handler_service.py (line 234 in 44e3dec)?
sachanub: Hi @philschmid. Thank you for your review. Maybe I am not understanding your point correctly, but I don't think we can directly pass self.context in the handle method. If we do that, don't we risk breaking existing customers who are not using context as an input argument? With the above-mentioned run_handler_function, we should be able to support both customers who do not want to use context and customers who do. Please correct me if I misunderstood. Thanks!
philschmid: Why would it break? You mean if they have a custom inference.py that defines a transform_fn method?
sachanub: Yes, we might affect those who define a custom transform_fn, right?
In handler_service.py, we try to import user-defined functions from inference.py. If they exist, they are used to overwrite self.load, self.preprocess, self.predict, self.postprocess and self.transform_fn.

In this PR, we introduce an additional parameter, context, for the default handlers. However, existing user scripts do not take this parameter in their customized handler functions, so we have to be careful when we call them. That's why @sachanub is adding a helper function to determine when to call functions with the new parameter.
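The helper described above could be sketched roughly as follows; this is an assumed illustration based on the discussion (the PR's actual helper is named run_handler_function, but this exact body is not taken from it):

```python
import inspect

def run_handler_function(func, *argv, context=None):
    # Sketch: pass `context` only when the handler's signature has room
    # for one argument more than the positional arguments supplied.
    num_params = len(inspect.signature(func).parameters)
    if context is not None and num_params == len(argv) + 1:
        # New-style handler: forward the context as the extra argument.
        return func(*argv, context)
    # Legacy handler: call it exactly as before.
    return func(*argv)

def legacy_input_fn(data, content_type):             # pre-existing user script
    return data

def new_input_fn(data, content_type, context=None):  # updated user script
    return (data, context)
```

With this dispatch, both legacy_input_fn and new_input_fn can be invoked through the same call site, which is what lets the PR support both kinds of customer scripts.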
philschmid: Is that something we could deprecate and remove with a v3 of the toolkit? It feels very error-prone, and a big overhead, to parse and add args like this for every incoming request.

What do you think of adding a check to see if there is an inference.py provided, and if not, using self.transform directly? Most customers deploy models using the "zero-code" deployment feature, where you provide a MODEL_ID and a TASK and don't need an inference.py script.