aws
diff --git a/‎setup.py
+1-1 b/‎setup.py
+1-1
diff --git a/‎src/sagemaker/estimator.py
+28-8 b/‎src/sagemaker/estimator.py
+28-8
diff --git a/‎src/sagemaker/model.py
+39-12 b/‎src/sagemaker/model.py
+39-12
diff --git a/‎src/sagemaker/serverless/__init__.py
+3 b/‎src/sagemaker/serverless/__init__.py
+3
diff --git a/‎src/sagemaker/serverless/serverless_inference_config.py
+54 b/‎src/sagemaker/serverless/serverless_inference_config.py
+54
diff --git a/‎src/sagemaker/session.py
+13-4 b/‎src/sagemaker/session.py
+13-4
diff --git a/‎src/sagemaker/tensorflow/model.py
+4-2 b/‎src/sagemaker/tensorflow/model.py
+4-2
@@ -34,7 +34,7 @@ def read_version():
 # Declare minimal set for installation
 required_packages = [
     "attrs",
-    "boto3>=1.20.18",
+    "boto3>=1.20.21",
     "google-pasta",
     "numpy>=1.9.0",
     "protobuf>=3.1",
 
@@ -852,8 +852,8 @@ def logs(self):
 
     def deploy(
         self,
-        initial_instance_count,
-        instance_type,
+        initial_instance_count=None,
+        instance_type=None,
         serializer=None,
         deserializer=None,
         accelerator_type=None,
@@ -864,6 +864,7 @@ def deploy(
         kms_key=None,
         data_capture_config=None,
         tags=None,
+        serverless_inference_config=None,
         **kwargs,
     ):
         """Deploy the trained model to an Amazon SageMaker endpoint.
@@ -874,10 +875,14 @@ def deploy(
         http://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-training.html
 
         Args:
-            initial_instance_count (int): Minimum number of EC2 instances to
-                deploy to an endpoint for prediction.
-            instance_type (str): Type of EC2 instance to deploy to an endpoint
-                for prediction, for example, 'ml.c4.xlarge'.
+            initial_instance_count (int): The initial number of instances to run
+                in the ``Endpoint`` created from this ``Model``. If not using
+                serverless inference, then it needs to be a number greater than or
+                equal to 1 (default: None).
+            instance_type (str): The EC2 instance type to deploy this Model to.
+                For example, 'ml.p2.xlarge', or 'local' for local mode. If not using
+                serverless inference, then it is required to deploy a model.
+                (default: None)
             serializer (:class:`~sagemaker.serializers.BaseSerializer`): A
                 serializer object, used to encode data for an inference endpoint
                 (default: None). If ``serializer`` is not None, then
@@ -910,6 +915,11 @@ def deploy(
             data_capture_config (sagemaker.model_monitor.DataCaptureConfig): Specifies
                 configuration related to Endpoint data capture for use with
                 Amazon SageMaker Model Monitoring. Default: None.
+            serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
+                Specifies the configuration of a serverless endpoint. Use this configuration
+                when creating a serverless endpoint for serverless inference. If an empty
+                config object is passed, the default configuration is used to deploy the
+                serverless endpoint (default: None).
             tags(List[dict[str, str]]): Optional. The list of tags to attach to this specific
                 endpoint. Example:
                 >>> tags = [{'Key': 'tagname', 'Value': 'tagvalue'}]
@@ -920,21 +930,30 @@ def deploy(
                 Implementations may customize ``create_model()`` to accept
                 ``**kwargs`` to customize model creation during deploy.
                 For more, see the implementation docs.
-
+        Raises:
+            ValueError: If serverless inference config is not specified and instance type
+                and instance count are also not specified.
         Returns:
             sagemaker.predictor.Predictor: A predictor that provides a ``predict()`` method,
                 which can be used to send requests to the Amazon SageMaker
                 endpoint and obtain inferences.
         """
         removed_kwargs("update_endpoint", kwargs)
+
+        is_serverless = bool(serverless_inference_config)
+        if not is_serverless and not (instance_type and initial_instance_count):
+            raise ValueError(
+                "Must specify instance type and instance count unless using serverless inference"
+            )
+
         self._ensure_latest_training_job()
         self._ensure_base_job_name()
         default_name = name_from_base(self.base_job_name)
         endpoint_name = endpoint_name or default_name
         model_name = model_name or default_name
 
         self.deploy_instance_type = instance_type
-        if use_compiled_model:
+        if use_compiled_model and not is_serverless:
             family = "_".join(instance_type.split(".")[:-1])
             if family not in self._compiled_models:
                 raise ValueError(
@@ -959,6 +978,7 @@ def deploy(
             wait=wait,
             kms_key=kms_key,
             data_capture_config=data_capture_config,
+            serverless_inference_config=serverless_inference_config,
         )
 
     def register(
 
@@ -209,7 +209,7 @@ def register(
             model_package_arn=model_package.get("ModelPackageArn"),
         )
 
-    def _init_sagemaker_session_if_does_not_exist(self, instance_type):
+    def _init_sagemaker_session_if_does_not_exist(self, instance_type=None):
         """Set ``self.sagemaker_session`` to ``LocalSession`` or ``Session`` if it's not already.
 
         The type of session object is determined by the instance type.
@@ -688,8 +688,8 @@ def compile(
 
     def deploy(
         self,
-        initial_instance_count,
-        instance_type,
+        initial_instance_count=None,
+        instance_type=None,
         serializer=None,
         deserializer=None,
         accelerator_type=None,
@@ -698,6 +698,7 @@ def deploy(
         kms_key=None,
         wait=True,
         data_capture_config=None,
+        serverless_inference_config=None,
         **kwargs,
     ):
         """Deploy this ``Model`` to an ``Endpoint`` and optionally return a ``Predictor``.
@@ -715,9 +716,13 @@ def deploy(
 
         Args:
             initial_instance_count (int): The initial number of instances to run
-                in the ``Endpoint`` created from this ``Model``.
+                in the ``Endpoint`` created from this ``Model``. If not using
+                serverless inference, then it needs to be a number greater than or
+                equal to 1 (default: None).
             instance_type (str): The EC2 instance type to deploy this Model to.
-                For example, 'ml.p2.xlarge', or 'local' for local mode.
+                For example, 'ml.p2.xlarge', or 'local' for local mode. If not using
+                serverless inference, then it is required to deploy a model.
+                (default: None)
             serializer (:class:`~sagemaker.serializers.BaseSerializer`): A
                 serializer object, used to encode data for an inference endpoint
                 (default: None). If ``serializer`` is not None, then
@@ -746,7 +751,14 @@ def deploy(
             data_capture_config (sagemaker.model_monitor.DataCaptureConfig): Specifies
                 configuration related to Endpoint data capture for use with
                 Amazon SageMaker Model Monitoring. Default: None.
-
+            serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
+                Specifies the configuration of a serverless endpoint. Use this configuration
+                when creating a serverless endpoint for serverless inference. If an empty
+                config object is passed, the default configuration is used to deploy the
+                serverless endpoint (default: None).
+        Raises:
+            ValueError: If no role is specified or if serverless inference config is not
+                specified and instance type and instance count are also not specified.
         Returns:
             callable[string, sagemaker.session.Session] or None: Invocation of
                 ``self.predictor_cls`` on the created endpoint name, if ``self.predictor_cls``
@@ -757,28 +769,43 @@ def deploy(
 
         if self.role is None:
             raise ValueError("Role can not be null for deploying a model")
+        is_serverless = bool(serverless_inference_config)
+        if not is_serverless and not (instance_type and initial_instance_count):
+            raise ValueError(
+                "Must specify instance type and instance count unless using serverless inference"
+            )
 
-        if instance_type.startswith("ml.inf") and not self._is_compiled_model:
+        if instance_type and instance_type.startswith("ml.inf") and not self._is_compiled_model:
             LOGGER.warning(
                 "Your model is not compiled. Please compile your model before using Inferentia."
             )
 
-        compiled_model_suffix = "-".join(instance_type.split(".")[:-1])
-        if self._is_compiled_model:
+        if self._is_compiled_model and not is_serverless:
+            compiled_model_suffix = "-".join(instance_type.split(".")[:-1])
             self._ensure_base_name_if_needed(self.image_uri)
             if self._base_name is not None:
                 self._base_name = "-".join((self._base_name, compiled_model_suffix))
 
         self._create_sagemaker_model(instance_type, accelerator_type, tags)
+
+        serverless_inference_config_dict = (
+            serverless_inference_config._to_request_dict() if is_serverless else None
+        )
         production_variant = sagemaker.production_variant(
-            self.name, instance_type, initial_instance_count, accelerator_type=accelerator_type
+            self.name,
+            instance_type,
+            initial_instance_count,
+            accelerator_type=accelerator_type,
+            serverless_inference_config=serverless_inference_config_dict,
         )
         if endpoint_name:
             self.endpoint_name = endpoint_name
         else:
             base_endpoint_name = self._base_name or utils.base_from_name(self.name)
-            if self._is_compiled_model and not base_endpoint_name.endswith(compiled_model_suffix):
-                base_endpoint_name = "-".join((base_endpoint_name, compiled_model_suffix))
+            if self._is_compiled_model and not is_serverless:
+                compiled_model_suffix = "-".join(instance_type.split(".")[:-1])
+                if not base_endpoint_name.endswith(compiled_model_suffix):
+                    base_endpoint_name = "-".join((base_endpoint_name, compiled_model_suffix))
             self.endpoint_name = utils.name_from_base(base_endpoint_name)
 
         data_capture_config_dict = None
 
@@ -13,3 +13,6 @@
 """Classes for performing machine learning on serverless compute."""
 from sagemaker.serverless.model import LambdaModel  # noqa: F401
 from sagemaker.serverless.predictor import LambdaPredictor  # noqa: F401
+from sagemaker.serverless.serverless_inference_config import (  # noqa: F401
+    ServerlessInferenceConfig,
+)
@@ -0,0 +1,54 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+"""This module contains code related to the ServerlessInferenceConfig class.
+
+This code is used for configuring serverless inference endpoints. Use it when
+deploying a model to such an endpoint.
+"""
+from __future__ import print_function, absolute_import
+
+
+class ServerlessInferenceConfig(object):
+    """Configuration object passed in when deploying models to Amazon SageMaker Endpoints.
+
+    This object specifies the configuration of a serverless endpoint. Use this configuration
+    when creating a serverless endpoint for serverless inference.
+    """
+
+    def __init__(
+        self,
+        memory_size_in_mb=2048,
+        max_concurrency=5,
+    ):
+        """Initialize a ServerlessInferenceConfig object for serverless inference configuration.
+
+        Args:
+            memory_size_in_mb (int): Optional. The memory size of your serverless endpoint.
+                Valid values are in 1 GB increments: 1024 MB, 2048 MB, 3072 MB, 4096 MB,
+                5120 MB, or 6144 MB. If no value is provided, Amazon SageMaker will choose
+                the default value for you. (Default: 2048)
+            max_concurrency (int): Optional. The maximum number of concurrent invocations
+                your serverless endpoint can process. If no value is provided, Amazon
+                SageMaker will choose the default value for you. (Default: 5)
+        """
+        self.memory_size_in_mb = memory_size_in_mb
+        self.max_concurrency = max_concurrency
+
+    def _to_request_dict(self):
+        """Generates a request dictionary using the parameters provided to the class."""
+        request_dict = {
+            "MemorySizeInMB": self.memory_size_in_mb,
+            "MaxConcurrency": self.max_concurrency,
+        }
+
+        return request_dict
@@ -4377,11 +4377,12 @@ def pipeline_container_def(models, instance_type=None):
 
 def production_variant(
     model_name,
-    instance_type,
-    initial_instance_count=1,
+    instance_type=None,
+    initial_instance_count=None,
     variant_name="AllTraffic",
     initial_weight=1,
     accelerator_type=None,
+    serverless_inference_config=None,
 ):
     """Create a production variant description suitable for use in a ``ProductionVariant`` list.
 
@@ -4400,21 +4401,29 @@ def production_variant(
         accelerator_type (str): Type of Elastic Inference accelerator for this production variant.
             For example, 'ml.eia1.medium'.
             For more information: https://docs.aws.amazon.com/sagemaker/latest/dg/ei.html
+        serverless_inference_config (dict): Specifies configuration dict related to serverless
+            endpoint. The dict is converted from a sagemaker.serverless.ServerlessInferenceConfig
+            object (default: None)
 
     Returns:
         dict[str, str]: An SageMaker ``ProductionVariant`` description
     """
     production_variant_configuration = {
         "ModelName": model_name,
-        "InstanceType": instance_type,
-        "InitialInstanceCount": initial_instance_count,
         "VariantName": variant_name,
         "InitialVariantWeight": initial_weight,
     }
 
     if accelerator_type:
         production_variant_configuration["AcceleratorType"] = accelerator_type
 
+    if serverless_inference_config:
+        production_variant_configuration["ServerlessConfig"] = serverless_inference_config
+    else:
+        initial_instance_count = initial_instance_count or 1
+        production_variant_configuration["InitialInstanceCount"] = initial_instance_count
+        production_variant_configuration["InstanceType"] = instance_type
+
     return production_variant_configuration
 
 
 
@@ -258,8 +258,8 @@ def register(
 
     def deploy(
         self,
-        initial_instance_count,
-        instance_type,
+        initial_instance_count=None,
+        instance_type=None,
         serializer=None,
         deserializer=None,
         accelerator_type=None,
@@ -269,6 +269,7 @@ def deploy(
         wait=True,
         data_capture_config=None,
         update_endpoint=None,
+        serverless_inference_config=None,
     ):
         """Deploy a Tensorflow ``Model`` to a SageMaker ``Endpoint``."""
 
@@ -287,6 +288,7 @@ def deploy(
             kms_key=kms_key,
             wait=wait,
             data_capture_config=data_capture_config,
+            serverless_inference_config=serverless_inference_config,
             update_endpoint=update_endpoint,
         )