
Commit d5fb328

feature: add support for async inference

1 parent 70308b1 commit d5fb328

23 files changed: +1640 −6 lines

doc/api/inference/async_inference.rst

+19
@@ -0,0 +1,19 @@
Async Inference
-----------------

This module contains classes related to Amazon SageMaker Async Inference.

.. automodule:: sagemaker.async_inference.async_inference_config
    :members:
    :undoc-members:
    :show-inheritance:

.. automodule:: sagemaker.async_inference.async_inference_response
    :members:
    :undoc-members:
    :show-inheritance:

.. automodule:: sagemaker.async_inference.waiter_config
    :members:
    :undoc-members:
    :show-inheritance:

doc/api/inference/predictor_async.rst

+9
@@ -0,0 +1,9 @@
AsyncPredictor
--------------------

Make async predictions against SageMaker endpoints with Python objects.

.. autoclass:: sagemaker.predictor_async.AsyncPredictor
    :members:
    :undoc-members:
    :show-inheritance:

doc/overview.rst

+90
@@ -684,6 +684,96 @@ For more detailed explanations of the classes that this library provides for aut
- `API docs for HyperparameterTuner and parameter range classes <https://sagemaker.readthedocs.io/en/stable/tuner.html>`__
- `API docs for analytics classes <https://sagemaker.readthedocs.io/en/stable/analytics.html>`__

**********************************
SageMaker Asynchronous Inference
**********************************
Amazon SageMaker Asynchronous Inference is a new capability in SageMaker that queues incoming requests and processes them asynchronously.
This option is ideal for requests with large payload sizes (up to 1GB), long processing times, and near-real-time latency requirements.
Asynchronous Inference also enables you to save on costs by autoscaling the instance count to zero when there are no requests to process,
so you only pay when your endpoint is processing requests. More information about
SageMaker Asynchronous Inference can be found in the `AWS documentation <https://docs.aws.amazon.com/sagemaker/latest/dg/async-inference.html>`__.

To deploy an asynchronous endpoint, you will need to create an ``AsyncInferenceConfig`` object.
If you create an ``AsyncInferenceConfig`` without specifying any of its arguments, the default ``S3OutputPath`` will
be ``s3://sagemaker-{REGION}-{ACCOUNTID}/async-output/{UNIQUE-JOB-NAME}`` (example shown below):

.. code:: python

    from sagemaker.async_inference import AsyncInferenceConfig

    # Create an empty AsyncInferenceConfig object to use default values
    async_config = AsyncInferenceConfig()

Or you can specify configurations in ``AsyncInferenceConfig`` as you like (example shown below):

.. code:: python

    # Specify S3OutputPath, MaxConcurrentInvocationsPerInstance
    # and NotificationConfig in the async config object
    async_config = AsyncInferenceConfig(
        output_path="s3://{s3_bucket}/{bucket_prefix}/output",
        max_concurrent_invocations_per_instance=10,
        notification_config={
            "SuccessTopic": "arn:aws:sns:aws-region:account-id:topic-name",
            "ErrorTopic": "arn:aws:sns:aws-region:account-id:topic-name",
        },
    )

Then use the ``AsyncInferenceConfig`` in the estimator's ``deploy()`` method to deploy an asynchronous endpoint:

.. code:: python

    # Deploys the model that was generated by fit() to a SageMaker asynchronous endpoint
    async_predictor = estimator.deploy(async_inference_config=async_config)
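
``deploy()`` also accepts the usual instance arguments. A minimal sketch, assuming a hypothetical instance count and type:

.. code:: python

    # Instance count and type below are placeholder values
    async_predictor = estimator.deploy(
        initial_instance_count=1,
        instance_type="ml.m5.xlarge",
        async_inference_config=async_config,
    )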

After deployment is complete, ``deploy()`` will return an ``AsyncPredictor``. You can use it to perform asynchronous inference
with ``predict_async()`` and then get the result later. For input data, you can upload the data to an S3 bucket
and pass its path:

.. code:: python

    # Upload data to an S3 bucket, then use its path as input
    async_response = async_predictor.predict_async(input_path=input_s3_path)

Or you can serialize the data and pass it directly, just like real-time inference. With this option, the Amazon SageMaker SDK
uploads the data to an Amazon S3 bucket under ``s3://sagemaker-{REGION}-{ACCOUNTID}/async-input/``:

.. code:: python

    # Serializes data and makes a prediction request to the SageMaker asynchronous endpoint
    async_response = async_predictor.predict_async(data=data)

Then you can switch to other work while the inference completes. Once it has completed, you can check
the result:

.. code:: python

    # Switch back to check the result
    result = async_response.get_result()
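
``get_result()`` also accepts a ``WaiterConfig`` (added in this commit, see ``waiter_config.py`` below) that controls how often and how long it polls the output path. A minimal sketch:

.. code:: python

    from sagemaker.async_inference import WaiterConfig

    # Poll every 10 seconds, giving up after 30 attempts
    waiter = WaiterConfig(max_attempts=30, delay=10)
    result = async_response.get_result(waiter_config=waiter)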

If you want to wait for the result right away, you can use the ``predict()`` method instead. It checks the output
Amazon S3 path periodically and returns the result once it appears:

.. code:: python

    # Use predict() to wait for the result
    response = async_predictor.predict(data=data)

    # Or use an Amazon S3 input path
    response = async_predictor.predict(input_path=input_s3_path)

Clean up the endpoint and model if needed after inference:

.. code:: python

    # Tears down the SageMaker endpoint and endpoint configuration
    async_predictor.delete_endpoint()

    # Deletes the SageMaker model
    async_predictor.delete_model()

For more details about Asynchronous Inference,
see the API docs for `Asynchronous Inference <https://sagemaker.readthedocs.io/en/stable/api/inference/async_inference.html>`__.

*******************************
SageMaker Serverless Inference
*******************************
src/sagemaker/async_inference/__init__.py

+19
@@ -0,0 +1,19 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""Imports the classes in this module to simplify customer imports"""

from __future__ import absolute_import

from sagemaker.async_inference.async_inference_config import AsyncInferenceConfig  # noqa: F401
from sagemaker.async_inference.waiter_config import WaiterConfig  # noqa: F401
from sagemaker.async_inference.async_inference_response import AsyncInferenceResponse  # noqa: F401
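
With these re-exports, both of the following import styles resolve to the same class:

.. code:: python

    # Package-root import enabled by this __init__ module
    from sagemaker.async_inference import AsyncInferenceConfig

    # Equivalent submodule import
    from sagemaker.async_inference.async_inference_config import AsyncInferenceConfig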
src/sagemaker/async_inference/async_inference_config.py

@@ -0,0 +1,81 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""A class for AsyncInferenceConfig

This config is used when configuring an async inference endpoint. Use it when
deploying the model to an endpoint.
"""
from __future__ import print_function, absolute_import


class AsyncInferenceConfig(object):
    """Configuration object passed in when deploying models to Amazon SageMaker Endpoints.

    This object specifies configuration related to async endpoints. Use this configuration
    when creating an async endpoint and making async inference requests.
    """

    def __init__(
        self,
        output_path=None,
        max_concurrent_invocations_per_instance=None,
        kms_key_id=None,
        notification_config=None,
    ):
        """Initialize an AsyncInferenceConfig object for async inference configuration.

        Args:
            output_path (str): Optional. The Amazon S3 location that endpoints upload
                inference responses to. If no value is provided, Amazon SageMaker will
                use the default Amazon S3 Async Inference output path. (Default: None)
            max_concurrent_invocations_per_instance (int): Optional. The maximum number of
                concurrent requests sent by the SageMaker client to the model container. If
                no value is provided, Amazon SageMaker will choose an optimal value for you.
                (Default: None)
            kms_key_id (str): Optional. The Amazon Web Services Key Management Service
                (Amazon Web Services KMS) key that Amazon SageMaker uses to encrypt the
                asynchronous inference output in Amazon S3. (Default: None)
            notification_config (dict): Optional. Specifies the configuration for
                notifications of inference results for asynchronous inference.
                (Default: None)

                * success_topic (str): Amazon SNS topic to post a notification to when an
                  inference completes successfully. If no topic is provided, no notification
                  is sent on success. The key in notification_config is 'SuccessTopic'.
                * error_topic (str): Amazon SNS topic to post a notification to when an
                  inference fails. If no topic is provided, no notification is sent on
                  failure. The key in notification_config is 'ErrorTopic'.
        """
        self.output_path = output_path
        self.max_concurrent_invocations_per_instance = max_concurrent_invocations_per_instance
        self.kms_key_id = kms_key_id
        self.notification_config = notification_config

    def _to_request_dict(self):
        """Generates a request dictionary using the parameters provided to the class."""
        request_dict = {
            "OutputConfig": {
                "S3OutputPath": self.output_path,
            },
        }

        if self.max_concurrent_invocations_per_instance:
            request_dict["ClientConfig"] = {
                "MaxConcurrentInvocationsPerInstance": self.max_concurrent_invocations_per_instance
            }

        if self.kms_key_id:
            request_dict["OutputConfig"]["KmsKeyId"] = self.kms_key_id

        if self.notification_config:
            request_dict["OutputConfig"]["NotificationConfig"] = self.notification_config

        return request_dict
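
A minimal sketch of the request dictionary this method produces, using hypothetical values:

.. code:: python

    config = AsyncInferenceConfig(
        output_path="s3://my-bucket/output",  # hypothetical bucket
        max_concurrent_invocations_per_instance=10,
    )
    # config._to_request_dict() returns:
    # {
    #     "OutputConfig": {"S3OutputPath": "s3://my-bucket/output"},
    #     "ClientConfig": {"MaxConcurrentInvocationsPerInstance": 10},
    # }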
src/sagemaker/async_inference/async_inference_response.py

@@ -0,0 +1,98 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""A class for AsyncInferenceResponse"""

from __future__ import print_function, absolute_import

from botocore.exceptions import ClientError
from sagemaker.s3 import parse_s3_url
from sagemaker.async_inference import WaiterConfig
from sagemaker.exceptions import ObjectNotExistedError, UnexpectedClientError


class AsyncInferenceResponse(object):
    """Response from an Async Inference endpoint.

    This response object provides a method to check the async Amazon S3
    output path. If the result object exists in that path, decode and return
    the result.
    """

    def __init__(
        self,
        predictor_async,
        output_path,
    ):
        """Initialize an AsyncInferenceResponse object.

        AsyncInferenceResponse helps users get the async inference result
        from the Amazon S3 output path.

        Args:
            predictor_async (sagemaker.predictor.AsyncPredictor): The ``AsyncPredictor``
                that returned this response.
            output_path (str): The Amazon S3 location that endpoints upload inference
                responses to.
        """
        self.predictor_async = predictor_async
        self.output_path = output_path
        self._result = None

    def get_result(
        self,
        waiter_config=None,
    ):
        """Get the result from the async Amazon S3 output path.

        Args:
            waiter_config (sagemaker.async_inference.waiter_config.WaiterConfig): Configuration
                for the waiter. The pre-defined delay between polls is 15 seconds
                and the default maximum number of attempts is 60.
        Raises:
            ValueError: If a wrong type of object is provided as ``waiter_config``.
        Returns:
            object: Inference result in the given Amazon S3 output path. If a deserializer was
                specified when creating the AsyncPredictor, the result of the deserializer is
                returned. Otherwise the response is returned as a sequence of bytes.
        """
        if waiter_config is not None and not isinstance(waiter_config, WaiterConfig):
            raise ValueError("waiter_config should be a WaiterConfig object")

        if self._result is None:
            if waiter_config is None:
                self._result = self._get_result_from_s3(self.output_path)
            else:
                self._result = self.predictor_async._wait_for_output(
                    self.output_path, waiter_config
                )
        return self._result

    def _get_result_from_s3(
        self,
        output_path,
    ):
        """Get the inference result from the output Amazon S3 path."""
        bucket, key = parse_s3_url(output_path)
        try:
            response = self.predictor_async.s3_client.get_object(Bucket=bucket, Key=key)
            return self.predictor_async.predictor._handle_response(response)
        except ClientError as ex:
            if ex.response["Error"]["Code"] == "NoSuchKey":
                raise ObjectNotExistedError(
                    message="Inference could still be running",
                    output_path=output_path,
                )
            raise UnexpectedClientError(
                message=ex.response["Error"]["Message"],
            )
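
A short sketch of how a caller might poll with this error model, assuming ``async_response`` came from ``predict_async()`` as in the overview docs:

.. code:: python

    import time

    from sagemaker.exceptions import ObjectNotExistedError

    # Poll get_result() manually instead of passing a WaiterConfig;
    # ObjectNotExistedError means the output object is not in S3 yet.
    result = None
    while result is None:
        try:
            result = async_response.get_result()
        except ObjectNotExistedError:
            time.sleep(15)  # inference could still be running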
src/sagemaker/async_inference/waiter_config.py

@@ -0,0 +1,46 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""A class for WaiterConfig used in async inference

Use it when making async inference requests and waiting for the result.
"""

from __future__ import absolute_import


class WaiterConfig(object):
    """Configuration object passed in when using async inference and waiting for the result."""

    def __init__(
        self,
        max_attempts=60,
        delay=15,
    ):
        """Initialize a WaiterConfig object that provides parameters to control waiting behavior.

        Args:
            max_attempts (int): The maximum number of attempts to be made. (Default: 60)
            delay (int): The amount of time in seconds to wait between attempts. (Default: 15)
        """
        self.max_attempts = max_attempts
        self.delay = delay

    def _to_request_dict(self):
        """Generates a dictionary using the parameters provided to the class."""
        waiter_dict = {
            "Delay": self.delay,
            "MaxAttempts": self.max_attempts,
        }

        return waiter_dict
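
For illustration, the dictionary this method produces for a custom configuration (values here are arbitrary examples):

.. code:: python

    from sagemaker.async_inference import WaiterConfig

    # Poll every 5 seconds, up to 120 attempts (~10 minutes total)
    config = WaiterConfig(max_attempts=120, delay=5)
    assert config._to_request_dict() == {"Delay": 5, "MaxAttempts": 120}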
