feature: add serverless inference image_uri support #3035

Merged: 1 commit, Apr 7, 2022

Changes from all commits
17 changes: 8 additions & 9 deletions doc/overview.rst
@@ -1226,28 +1226,28 @@ to configure or manage the underlying infrastructure. After you trained a model,
Serverless endpoint and then invoke the endpoint with the model to get inference results back. More information about
SageMaker Serverless Inference can be found in the `AWS documentation <https://docs.aws.amazon.com/sagemaker/latest/dg/serverless-endpoints.html>`__.

For using SageMaker Serverless Inference, you can use either a SageMaker-provided container or a Bring Your Own Container model.
A step-by-step example of using Serverless Inference with an MXNet image:

First, create the MXNet model:

.. code:: python

    from sagemaker.mxnet import MXNetModel
    from sagemaker.serverless import ServerlessInferenceConfig
    import sagemaker

    role = sagemaker.get_execution_role()

    # create the MXNet Model Class
    model = MXNetModel(
        model_data="s3://my_bucket/pretrained_model/model.tar.gz",  # path to your trained SageMaker model
        role=role,  # IAM role with permissions to create an endpoint
        entry_point="inference.py",
        py_version="py3",  # Python version
        framework_version="1.6.0",  # MXNet framework version
    )

After creating the model, you can then follow the steps below to create a serverless endpoint.

To deploy a serverless endpoint, you will need to create a ``ServerlessInferenceConfig``.
If you create a ``ServerlessInferenceConfig`` without specifying its arguments, the default ``MemorySizeInMB`` will be **2048** and
the default ``MaxConcurrency`` will be **5**:
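The configuration snippet itself is collapsed in this view; a minimal sketch of creating one (the argument values are illustrative, not the defaults):

.. code:: python

    from sagemaker.serverless import ServerlessInferenceConfig

    # Both arguments are optional; omitting them gives the defaults noted
    # above (MemorySizeInMB=2048, MaxConcurrency=5).
    serverless_config = ServerlessInferenceConfig(
        memory_size_in_mb=4096,  # must be 1024-6144, in 1 GB increments
        max_concurrency=10,
    )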
@@ -1283,7 +1283,6 @@ Or directly using model's ``deploy()`` method to deploy a serverless endpoint:
# Deploys the model to a SageMaker serverless endpoint
serverless_predictor = model.deploy(serverless_inference_config=serverless_config)


After deployment is complete, you can use predictor's ``predict()`` method to invoke the serverless endpoint just like
real-time endpoints:
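The invocation snippet is collapsed in this view as well; a minimal sketch, reusing ``serverless_predictor`` from the previous step (the payload shape depends on your ``inference.py`` handler):

.. code:: python

    # Invoke the serverless endpoint just like a real-time one; cold starts
    # may add latency on the first request.
    response = serverless_predictor.predict(data=[[1.0, 2.0, 3.0]])
    print(response)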

33 changes: 22 additions & 11 deletions src/sagemaker/chainer/model.py
@@ -99,13 +99,10 @@ def __init__(
file which should be executed as the entry point to model
hosting. If ``source_dir`` is specified, then ``entry_point``
must point to a file located at the root of ``source_dir``.
image_uri (str): A Docker image URI (default: None). If not specified,
a default image for Chainer will be used.
If ``framework_version`` or ``py_version``
are ``None``, then ``image_uri`` is required. If ``image_uri`` is also ``None``,
then a ``ValueError`` will be raised.
framework_version (str): Chainer version you want to use for
executing your model training code. Defaults to ``None``. Required
@@ -143,7 +140,9 @@ def __init__(

self.model_server_workers = model_server_workers

def prepare_container_def(
    self, instance_type=None, accelerator_type=None, serverless_inference_config=None
):
"""Return a container definition with framework configuration set in model environment.

Args:
@@ -152,21 +151,27 @@ def prepare_container_def(self, instance_type=None, accelerator_type=None):
accelerator_type (str): The Elastic Inference accelerator type to
deploy to the instance for loading and making inferences to the
model. For example, 'ml.eia1.medium'.
serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
Specifies configuration related to the serverless endpoint. Because the instance
type is not provided in serverless inference, it is used to find image URIs.

Returns:
dict[str, str]: A container definition object usable with the
CreateModel API.
"""
deploy_image = self.image_uri
if not deploy_image:
if instance_type is None and serverless_inference_config is None:
raise ValueError(
"Must supply either an instance type (for choosing CPU vs GPU) or an image URI."
)

region_name = self.sagemaker_session.boto_session.region_name
deploy_image = self.serving_image_uri(
region_name,
instance_type,
accelerator_type=accelerator_type,
serverless_inference_config=serverless_inference_config,
)

deploy_key_prefix = model_code_key_prefix(self.key_prefix, self.name, deploy_image)
@@ -178,13 +183,18 @@ def prepare_container_def(self, instance_type=None, accelerator_type=None):
deploy_env[MODEL_SERVER_WORKERS_PARAM_NAME.upper()] = str(self.model_server_workers)
return sagemaker.container_def(deploy_image, self.model_data, deploy_env)

def serving_image_uri(
    self, region_name, instance_type, accelerator_type=None, serverless_inference_config=None
):
"""Create a URI for the serving image.

Args:
region_name (str): AWS region where the image is uploaded.
instance_type (str): SageMaker instance type. Used to determine device type
(cpu/gpu/family-specific optimized).
serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
Specifies configuration related to the serverless endpoint. Because the instance
type is not provided in serverless inference, it is used to determine the device type.

Returns:
str: The appropriate image URI based on the given parameters.
Expand All @@ -198,4 +208,5 @@ def serving_image_uri(self, region_name, instance_type, accelerator_type=None):
instance_type=instance_type,
accelerator_type=accelerator_type,
image_scope="inference",
serverless_inference_config=serverless_inference_config,
)
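Taken together, these changes let a Chainer model be deployed serverlessly without an explicit ``image_uri``. A hypothetical end-to-end sketch (the model path, role ARN, and version pins are illustrative, not from this PR):

.. code:: python

    from sagemaker.chainer import ChainerModel
    from sagemaker.serverless import ServerlessInferenceConfig

    chainer_model = ChainerModel(
        model_data="s3://my_bucket/model.tar.gz",  # hypothetical S3 path
        role="arn:aws:iam::123456789012:role/SageMakerRole",  # hypothetical role
        entry_point="inference.py",
        framework_version="5.0.0",
        py_version="py3",
    )

    # No instance_type: prepare_container_def() now resolves a default CPU
    # image from the serverless config instead of raising a ValueError.
    predictor = chainer_model.deploy(
        serverless_inference_config=ServerlessInferenceConfig()
    )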
34 changes: 24 additions & 10 deletions src/sagemaker/huggingface/model.py
@@ -133,11 +133,7 @@ def __init__(
py_version (str): Python version you want to use for executing your
model training code. Defaults to ``None``. Required unless
``image_uri`` is provided.
image_uri (str): A Docker image URI. Defaults to None. If not specified, a
default image for PyTorch will be used. If ``framework_version``
or ``py_version`` are ``None``, then ``image_uri`` is required. If
also ``None``, then a ``ValueError`` will be raised.
@@ -272,7 +268,7 @@ def deploy(
is not None. Otherwise, return None.
"""

if not self.image_uri and instance_type is not None and instance_type.startswith("ml.inf"):
self.image_uri = self.serving_image_uri(
region_name=self.sagemaker_session.boto_session.region_name,
instance_type=instance_type,
@@ -365,7 +361,9 @@ def register(
drift_check_baselines=drift_check_baselines,
)

def prepare_container_def(
    self, instance_type=None, accelerator_type=None, serverless_inference_config=None
):
"""A container definition with framework configuration set in model environment variables.

Args:
@@ -374,21 +372,27 @@ def prepare_container_def(self, instance_type=None, accelerator_type=None):
accelerator_type (str): The Elastic Inference accelerator type to
deploy to the instance for loading and making inferences to the
model.
serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
Specifies configuration related to the serverless endpoint. Because the instance
type is not provided in serverless inference, it is used to find image URIs.

Returns:
dict[str, str]: A container definition object usable with the
CreateModel API.
"""
deploy_image = self.image_uri
if not deploy_image:
if instance_type is None and serverless_inference_config is None:
raise ValueError(
"Must supply either an instance type (for choosing CPU vs GPU) or an image URI."
)

region_name = self.sagemaker_session.boto_session.region_name
deploy_image = self.serving_image_uri(
region_name,
instance_type,
accelerator_type=accelerator_type,
serverless_inference_config=serverless_inference_config,
)

deploy_key_prefix = model_code_key_prefix(self.key_prefix, self.name, deploy_image)
@@ -402,7 +406,13 @@ def prepare_container_def(self, instance_type=None, accelerator_type=None):
deploy_image, self.repacked_model_data or self.model_data, deploy_env
)

def serving_image_uri(
    self,
    region_name,
    instance_type=None,
    accelerator_type=None,
    serverless_inference_config=None,
):
"""Create a URI for the serving image.

Args:
@@ -412,6 +422,9 @@ def serving_image_uri(self, region_name, instance_type, accelerator_type=None):
accelerator_type (str): The Elastic Inference accelerator type to
deploy to the instance for loading and making inferences to the
model.
serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
Specifies configuration related to the serverless endpoint. Because the instance
type is not provided in serverless inference, it is used to determine the device type.

Returns:
str: The appropriate image URI based on the given parameters.
@@ -432,4 +445,5 @@ def serving_image_uri(self, region_name, instance_type, accelerator_type=None):
accelerator_type=accelerator_type,
image_scope="inference",
base_framework_version=base_framework_version,
serverless_inference_config=serverless_inference_config,
)
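The same pattern applies to Hugging Face models; a hypothetical sketch (the model path, role ARN, and version pins are illustrative and must be combinations the SDK actually supports):

.. code:: python

    from sagemaker.huggingface import HuggingFaceModel
    from sagemaker.serverless import ServerlessInferenceConfig

    huggingface_model = HuggingFaceModel(
        model_data="s3://my_bucket/hf_model.tar.gz",  # hypothetical S3 path
        role="arn:aws:iam::123456789012:role/SageMakerRole",  # hypothetical role
        transformers_version="4.17",  # illustrative version pin
        pytorch_version="1.10",  # illustrative version pin
        py_version="py38",
    )

    # Without an instance type, serving_image_uri() uses the serverless
    # config to pick a CPU inference image.
    predictor = huggingface_model.deploy(
        serverless_inference_config=ServerlessInferenceConfig()
    )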
18 changes: 15 additions & 3 deletions src/sagemaker/image_uris.py
@@ -48,6 +48,7 @@ def retrieve(
tolerate_deprecated_model=False,
sdk_version=None,
inference_tool=None,
serverless_inference_config=None,
) -> str:
"""Retrieves the ECR URI for the Docker image matching the given arguments.

@@ -94,6 +95,9 @@ def retrieve(
inference_tool (str): the tool that will be used to aid in the inference.
Valid values: "neuron, None"
(default: None).
serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
Specifies configuration related to the serverless endpoint. Because the instance
type is not provided in serverless inference, it is used to determine the processor type.

Returns:
str: The ECR URI for the corresponding SageMaker Docker image.
@@ -159,7 +163,9 @@ def retrieve(
repo = version_config["repository"]

processor = _processor(
instance_type,
config.get("processors") or version_config.get("processors"),
serverless_inference_config,
)

# if container version is available in .json file, utilize that
@@ -202,7 +208,9 @@

tag = _format_tag(tag_prefix, processor, py_version, container_version, inference_tool)

if instance_type is not None and _should_auto_select_container_version(
    instance_type, distribution
):
container_versions = {
"tensorflow-2.3-gpu-py37": "cu110-ubuntu18.04-v3",
"tensorflow-2.3.1-gpu-py37": "cu110-ubuntu18.04",
@@ -327,7 +335,7 @@ def _registry_from_region(region, registry_dict):
return registry_dict[region]


def _processor(instance_type, available_processors, serverless_inference_config=None):
"""Returns the processor type for the given instance type."""
if not available_processors:
logger.info("Ignoring unnecessary instance type: %s.", instance_type)
@@ -337,6 +345,10 @@ def _processor(instance_type, available_processors):
logger.info("Defaulting to only supported image scope: %s.", available_processors[0])
return available_processors[0]

if serverless_inference_config is not None:
logger.info("Defaulting to CPU type when using serverless inference")
return "cpu"

if not instance_type:
raise ValueError(
"Empty SageMaker instance type. For options, see: "
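A hypothetical sketch of the resulting ``retrieve()`` behavior (the framework and version values are illustrative): with no ``instance_type``, ``_processor()`` now falls back to ``"cpu"`` when a serverless config is passed.

.. code:: python

    from sagemaker import image_uris
    from sagemaker.serverless import ServerlessInferenceConfig

    # No instance_type is given; the serverless config makes _processor()
    # default to "cpu", so a CPU inference image URI is returned.
    uri = image_uris.retrieve(
        framework="mxnet",
        region="us-west-2",
        version="1.6.0",
        py_version="py3",
        image_scope="inference",
        serverless_inference_config=ServerlessInferenceConfig(),
    )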
25 changes: 21 additions & 4 deletions src/sagemaker/model.py
@@ -383,7 +383,10 @@ def _init_sagemaker_session_if_does_not_exist(self, instance_type=None):
self.sagemaker_session = session.Session()

def prepare_container_def(
self,
instance_type=None,
accelerator_type=None,
serverless_inference_config=None,
): # pylint: disable=unused-argument
"""Return a dict created by ``sagemaker.container_def()``.

@@ -398,6 +401,9 @@ def prepare_container_def(
accelerator_type (str): The Elastic Inference accelerator type to
deploy to the instance for loading and making inferences to the
model. For example, 'ml.eia1.medium'.
serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
Specifies configuration related to the serverless endpoint. Because the instance
type is not provided in serverless inference, it is used to find image URIs.

Returns:
dict: A container definition object usable with the CreateModel API.
@@ -498,7 +504,9 @@ def enable_network_isolation(self):
"""
return self._enable_network_isolation

def _create_sagemaker_model(
    self, instance_type=None, accelerator_type=None, tags=None, serverless_inference_config=None
):
"""Create a SageMaker Model Entity

Args:
@@ -514,8 +522,15 @@ def _create_sagemaker_model(self, instance_type=None, accelerator_type=None, tags=None):
'tagvalue'}] For more information about tags, see
https://boto3.amazonaws.com/v1/documentation
/api/latest/reference/services/sagemaker.html#SageMaker.Client.add_tags
serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
Specifies configuration related to the serverless endpoint. Because the instance
type is not provided in serverless inference, it is used to find image URIs.
"""
container_def = self.prepare_container_def(
    instance_type,
    accelerator_type=accelerator_type,
    serverless_inference_config=serverless_inference_config,
)

self._ensure_base_name_if_needed(
image_uri=container_def["Image"], script_uri=self.source_dir, model_uri=self.model_data
@@ -983,7 +998,9 @@ def deploy(
if self._base_name is not None:
self._base_name = "-".join((self._base_name, compiled_model_suffix))

self._create_sagemaker_model(
    instance_type, accelerator_type, tags, serverless_inference_config
)

serverless_inference_config_dict = (
serverless_inference_config._to_request_dict() if is_serverless else None