From dde8d003e778ad0db1a9f347da1097560495f035 Mon Sep 17 00:00:00 2001 From: Payton Staub Date: Wed, 8 Dec 2021 15:34:50 -0800 Subject: [PATCH 01/11] =?UTF-8?q?fix:=20Set=20ProcessingStep=20upload=20lo?= =?UTF-8?q?cations=20deterministically=20to=20avoid=20c=E2=80=A6=20(#2790)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sagemaker/workflow/steps.py | 32 ++++ src/sagemaker/workflow/utilities.py | 21 +++ tests/unit/sagemaker/workflow/test_steps.py | 195 +++++++++++++++++--- 3 files changed, 222 insertions(+), 26 deletions(-) diff --git a/src/sagemaker/workflow/steps.py b/src/sagemaker/workflow/steps.py index 6975c6ca97..dd81553a02 100644 --- a/src/sagemaker/workflow/steps.py +++ b/src/sagemaker/workflow/steps.py @@ -14,8 +14,10 @@ from __future__ import absolute_import import abc +import warnings from enum import Enum from typing import Dict, List, Union +from urllib.parse import urlparse import attr @@ -270,6 +272,16 @@ def __init__( ) self.cache_config = cache_config + if self.cache_config is not None and not self.estimator.disable_profiler: + msg = ( + "Profiling is enabled on the provided estimator. " + "The default profiler rule includes a timestamp " + "which will change each time the pipeline is " + "upserted, causing cache misses. If profiling " + "is not needed, set disable_profiler to True on the estimator." + ) + warnings.warn(msg) + @property def arguments(self) -> RequestType: """The arguments dict that is used to call `create_training_job`. @@ -498,6 +510,7 @@ def __init__( self.job_arguments = job_arguments self.code = code self.property_files = property_files + self.job_name = None # Examine why run method in sagemaker.processing.Processor mutates the processor instance # by setting the instance's arguments attribute. Refactor Processor.run, if possible. @@ -508,6 +521,17 @@ def __init__( ) self.cache_config = cache_config + if code: + code_url = urlparse(code) + if code_url.scheme == "" or code_url.scheme == "file": + # By default, Processor will upload the local code to an S3 path + # containing a timestamp. This causes cache misses whenever a + # pipeline is updated, even if the underlying script hasn't changed. + # To avoid this, hash the contents of the script and include it + # in the job_name passed to the Processor, which will be used + # instead of the timestamped path. + self.job_name = self._generate_code_upload_path() + @property def arguments(self) -> RequestType: """The arguments dict that is used to call `create_processing_job`. @@ -516,6 +540,7 @@ def arguments(self) -> RequestType: ProcessingJobName and ExperimentConfig cannot be included in the arguments. 
""" normalized_inputs, normalized_outputs = self.processor._normalize_args( + job_name=self.job_name, arguments=self.job_arguments, inputs=self.inputs, outputs=self.outputs, @@ -546,6 +571,13 @@ def to_request(self) -> RequestType: ] return request_dict + def _generate_code_upload_path(self) -> str: + """Generate an upload path for local processing scripts based on its contents""" + from sagemaker.workflow.utilities import hash_file + + code_hash = hash_file(self.code) + return f"{self.name}-{code_hash}"[:1024] + class TuningStep(ConfigurableRetryStep): """Tuning step for workflow.""" diff --git a/src/sagemaker/workflow/utilities.py b/src/sagemaker/workflow/utilities.py index 069894d761..3e77465ff6 100644 --- a/src/sagemaker/workflow/utilities.py +++ b/src/sagemaker/workflow/utilities.py @@ -14,6 +14,7 @@ from __future__ import absolute_import from typing import List, Sequence, Union +import hashlib from sagemaker.workflow.entities import ( Entity, @@ -37,3 +38,23 @@ def list_to_request(entities: Sequence[Union[Entity, StepCollection]]) -> List[R elif isinstance(entity, StepCollection): request_dicts.extend(entity.request_dicts()) return request_dicts + + +def hash_file(path: str) -> str: + """Get the MD5 hash of a file. + + Args: + path (str): The local path for the file. + Returns: + str: The MD5 hash of the file. + """ + BUF_SIZE = 65536 # read in 64KiB chunks + md5 = hashlib.md5() + with open(path, "rb") as f: + while True: + data = f.read(BUF_SIZE) + if not data: + break + md5.update(data) + + return md5.hexdigest() diff --git a/tests/unit/sagemaker/workflow/test_steps.py b/tests/unit/sagemaker/workflow/test_steps.py index 42c3bed7b6..3c2adc7bd9 100644 --- a/tests/unit/sagemaker/workflow/test_steps.py +++ b/tests/unit/sagemaker/workflow/test_steps.py @@ -16,6 +16,7 @@ import pytest import sagemaker import os +import warnings from mock import ( Mock, @@ -63,8 +64,7 @@ ) from tests.unit import DATA_DIR -SCRIPT_FILE = "dummy_script.py" -SCRIPT_PATH = os.path.join(DATA_DIR, SCRIPT_FILE) +DUMMY_SCRIPT_PATH = os.path.join(DATA_DIR, "dummy_script.py") REGION = "us-west-2" BUCKET = "my-bucket" @@ -129,6 +129,31 @@ def sagemaker_session(boto_session, client): ) +@pytest.fixture +def script_processor(sagemaker_session): + return ScriptProcessor( + role=ROLE, + image_uri="012345678901.dkr.ecr.us-west-2.amazonaws.com/my-custom-image-uri", + command=["python3"], + instance_type="ml.m4.xlarge", + instance_count=1, + volume_size_in_gb=100, + volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key", + output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key", + max_runtime_in_seconds=3600, + base_job_name="my_sklearn_processor", + env={"my_env_variable": "my_env_variable_value"}, + tags=[{"Key": "my-tag", "Value": "my-tag-value"}], + network_config=NetworkConfig( + subnets=["my_subnet_id"], + security_group_ids=["my_security_group_id"], + enable_network_isolation=True, + encrypt_inter_container_traffic=True, + ), + sagemaker_session=sagemaker_session, + ) + + def test_custom_step(): step = CustomStep( name="MyStep", display_name="CustomStepDisplayName", description="CustomStepDescription" @@ -326,7 +351,7 @@ def test_training_step_tensorflow(sagemaker_session): training_epochs_parameter = ParameterInteger(name="TrainingEpochs", default_value=5) training_batch_size_parameter = ParameterInteger(name="TrainingBatchSize", default_value=500) estimator = TensorFlow( - entry_point=os.path.join(DATA_DIR, SCRIPT_FILE), + entry_point=DUMMY_SCRIPT_PATH, role=ROLE, model_dir=False, 
image_uri=IMAGE_URI, @@ -403,6 +428,75 @@ def test_training_step_tensorflow(sagemaker_session): assert step.properties.TrainingJobName.expr == {"Get": "Steps.MyTrainingStep.TrainingJobName"} +def test_training_step_profiler_warning(sagemaker_session): + estimator = TensorFlow( + entry_point=DUMMY_SCRIPT_PATH, + role=ROLE, + model_dir=False, + image_uri=IMAGE_URI, + source_dir="s3://mybucket/source", + framework_version="2.4.1", + py_version="py37", + disable_profiler=False, + instance_count=1, + instance_type="ml.p3.16xlarge", + sagemaker_session=sagemaker_session, + hyperparameters={ + "batch-size": 500, + "epochs": 5, + }, + debugger_hook_config=False, + distribution={"smdistributed": {"dataparallel": {"enabled": True}}}, + ) + + inputs = TrainingInput(s3_data=f"s3://{BUCKET}/train_manifest") + cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") + with warnings.catch_warnings(record=True) as w: + TrainingStep( + name="MyTrainingStep", estimator=estimator, inputs=inputs, cache_config=cache_config + ) + assert len(w) == 1 + assert issubclass(w[-1].category, UserWarning) + assert "Profiling is enabled on the provided estimator" in str(w[-1].message) + + +def test_training_step_no_profiler_warning(sagemaker_session): + estimator = TensorFlow( + entry_point=DUMMY_SCRIPT_PATH, + role=ROLE, + model_dir=False, + image_uri=IMAGE_URI, + source_dir="s3://mybucket/source", + framework_version="2.4.1", + py_version="py37", + disable_profiler=True, + instance_count=1, + instance_type="ml.p3.16xlarge", + sagemaker_session=sagemaker_session, + hyperparameters={ + "batch-size": 500, + "epochs": 5, + }, + debugger_hook_config=False, + distribution={"smdistributed": {"dataparallel": {"enabled": True}}}, + ) + + inputs = TrainingInput(s3_data=f"s3://{BUCKET}/train_manifest") + cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") + with warnings.catch_warnings(record=True) as w: + # profiler disabled, cache config not None + TrainingStep( + name="MyTrainingStep", estimator=estimator, inputs=inputs, cache_config=cache_config + ) + assert len(w) == 0 + + with warnings.catch_warnings(record=True) as w: + # profiler enabled, cache config is None + estimator.disable_profiler = False + TrainingStep(name="MyTrainingStep", estimator=estimator, inputs=inputs, cache_config=None) + assert len(w) == 0 + + def test_processing_step(sagemaker_session): processing_input_data_uri_parameter = ParameterString( name="ProcessingInputDataUri", default_value=f"s3://{BUCKET}/processing_manifest" @@ -473,28 +567,42 @@ def test_processing_step(sagemaker_session): @patch("sagemaker.processing.ScriptProcessor._normalize_args") -def test_processing_step_normalizes_args(mock_normalize_args, sagemaker_session): - processor = ScriptProcessor( - role=ROLE, - image_uri="012345678901.dkr.ecr.us-west-2.amazonaws.com/my-custom-image-uri", - command=["python3"], - instance_type="ml.m4.xlarge", - instance_count=1, - volume_size_in_gb=100, - volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key", - output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key", - max_runtime_in_seconds=3600, - base_job_name="my_sklearn_processor", - env={"my_env_variable": "my_env_variable_value"}, - tags=[{"Key": "my-tag", "Value": "my-tag-value"}], - network_config=NetworkConfig( - subnets=["my_subnet_id"], - security_group_ids=["my_security_group_id"], - enable_network_isolation=True, - encrypt_inter_container_traffic=True, - ), - sagemaker_session=sagemaker_session, +def 
test_processing_step_normalizes_args_with_local_code(mock_normalize_args, script_processor): + cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") + inputs = [ + ProcessingInput( + source=f"s3://{BUCKET}/processing_manifest", + destination="processing_manifest", + ) + ] + outputs = [ + ProcessingOutput( + source=f"s3://{BUCKET}/processing_manifest", + destination="processing_manifest", + ) + ] + step = ProcessingStep( + name="MyProcessingStep", + processor=script_processor, + code=DUMMY_SCRIPT_PATH, + inputs=inputs, + outputs=outputs, + job_arguments=["arg1", "arg2"], + cache_config=cache_config, ) + mock_normalize_args.return_value = [step.inputs, step.outputs] + step.to_request() + mock_normalize_args.assert_called_with( + job_name="MyProcessingStep-3e89f0c7e101c356cbedf27d9d27e9db", + arguments=step.job_arguments, + inputs=step.inputs, + outputs=step.outputs, + code=step.code, + ) + + +@patch("sagemaker.processing.ScriptProcessor._normalize_args") +def test_processing_step_normalizes_args_with_s3_code(mock_normalize_args, script_processor): cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") inputs = [ ProcessingInput( @@ -510,8 +618,8 @@ def test_processing_step_normalizes_args(mock_normalize_args, sagemaker_session) ] step = ProcessingStep( name="MyProcessingStep", - processor=processor, - code="foo.py", + processor=script_processor, + code="s3://foo", inputs=inputs, outputs=outputs, job_arguments=["arg1", "arg2"], @@ -520,6 +628,7 @@ def test_processing_step_normalizes_args(mock_normalize_args, sagemaker_session) mock_normalize_args.return_value = [step.inputs, step.outputs] step.to_request() mock_normalize_args.assert_called_with( + job_name=None, arguments=step.job_arguments, inputs=step.inputs, outputs=step.outputs, @@ -527,6 +636,40 @@ def test_processing_step_normalizes_args(mock_normalize_args, sagemaker_session) ) +@patch("sagemaker.processing.ScriptProcessor._normalize_args") +def test_processing_step_normalizes_args_with_no_code(mock_normalize_args, script_processor): + cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") + inputs = [ + ProcessingInput( + source=f"s3://{BUCKET}/processing_manifest", + destination="processing_manifest", + ) + ] + outputs = [ + ProcessingOutput( + source=f"s3://{BUCKET}/processing_manifest", + destination="processing_manifest", + ) + ] + step = ProcessingStep( + name="MyProcessingStep", + processor=script_processor, + inputs=inputs, + outputs=outputs, + job_arguments=["arg1", "arg2"], + cache_config=cache_config, + ) + mock_normalize_args.return_value = [step.inputs, step.outputs] + step.to_request() + mock_normalize_args.assert_called_with( + job_name=None, + arguments=step.job_arguments, + inputs=step.inputs, + outputs=step.outputs, + code=None, + ) + + def test_create_model_step(sagemaker_session): model = Model( image_uri=IMAGE_URI, From 0f72907d852f8018427e358d81924b29596101f6 Mon Sep 17 00:00:00 2001 From: Payton Staub Date: Thu, 9 Dec 2021 09:56:17 -0800 Subject: [PATCH 02/11] fix: Prevent repack_model script from referencing nonexistent directories (#2755) Co-authored-by: Payton Staub Co-authored-by: Ahsan Khan --- src/sagemaker/workflow/_repack_model.py | 27 ++++++++++--------- .../workflow/test_repack_model_script.py | 4 +-- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/sagemaker/workflow/_repack_model.py b/src/sagemaker/workflow/_repack_model.py index 60b74d66c7..6ce7e41831 100644 --- a/src/sagemaker/workflow/_repack_model.py +++ 
b/src/sagemaker/workflow/_repack_model.py @@ -62,15 +62,15 @@ def repack(inference_script, model_archive, dependencies=None, source_dir=None): with tarfile.open(name=local_path, mode="r:gz") as tf: tf.extractall(path=src_dir) - # copy the custom inference script to code/ - entry_point = os.path.join("/opt/ml/code", inference_script) - shutil.copy2(entry_point, os.path.join(src_dir, "code", inference_script)) - - # copy source_dir to code/ if source_dir: + # copy /opt/ml/code to code/ if os.path.exists(code_dir): shutil.rmtree(code_dir) - shutil.copytree(source_dir, code_dir) + shutil.copytree("/opt/ml/code", code_dir) + else: + # copy the custom inference script to code/ + entry_point = os.path.join("/opt/ml/code", inference_script) + shutil.copy2(entry_point, os.path.join(code_dir, inference_script)) # copy any dependencies to code/lib/ if dependencies: @@ -79,13 +79,16 @@ def repack(inference_script, model_archive, dependencies=None, source_dir=None): lib_dir = os.path.join(code_dir, "lib") if not os.path.exists(lib_dir): os.mkdir(lib_dir) - if os.path.isdir(actual_dependency_path): - shutil.copytree( - actual_dependency_path, - os.path.join(lib_dir, os.path.basename(actual_dependency_path)), - ) - else: + if os.path.isfile(actual_dependency_path): shutil.copy2(actual_dependency_path, lib_dir) + else: + if os.path.exists(lib_dir): + shutil.rmtree(lib_dir) + # a directory is in the dependencies. we have to copy + # all of /opt/ml/code into the lib dir because the original directory + # was flattened by the SDK training job upload.. + shutil.copytree("/opt/ml/code", lib_dir) + break # copy the "src" dir, which includes the previous training job's model and the # custom inference script, to the output of this training job diff --git a/tests/unit/sagemaker/workflow/test_repack_model_script.py b/tests/unit/sagemaker/workflow/test_repack_model_script.py index 67c8231dcc..69c9e7b740 100644 --- a/tests/unit/sagemaker/workflow/test_repack_model_script.py +++ b/tests/unit/sagemaker/workflow/test_repack_model_script.py @@ -94,7 +94,7 @@ def test_repack_with_dependencies(tmp): _repack_model.repack( inference_script="inference.py", model_archive=model_tar_name, - dependencies=["dependencies/a", "bb", "dependencies/some/dir"], + dependencies="dependencies/a bb dependencies/some/dir", ) # /opt/ml/model should now have the original model and the inference script @@ -145,7 +145,7 @@ def test_repack_with_source_dir_and_dependencies(tmp): _repack_model.repack( inference_script="inference.py", model_archive=model_tar_name, - dependencies=["dependencies/a", "bb", "dependencies/some/dir"], + dependencies="dependencies/a bb dependencies/some/dir", source_dir="sourcedir", ) From 0bae07139566718bde98058786636a7bcdfa8905 Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala <89424143+mufaddal-rohawala@users.noreply.github.com> Date: Wed, 15 Dec 2021 10:04:06 -0800 Subject: [PATCH 03/11] fix: S3Input - add support for instance attributes (#2754) --- src/sagemaker/dataset_definition/inputs.py | 236 +++++++++++++-------- tests/integ/test_processing.py | 19 ++ 2 files changed, 171 insertions(+), 84 deletions(-) diff --git a/src/sagemaker/dataset_definition/inputs.py b/src/sagemaker/dataset_definition/inputs.py index 34289beb30..90a272c4d7 100644 --- a/src/sagemaker/dataset_definition/inputs.py +++ b/src/sagemaker/dataset_definition/inputs.py @@ -26,94 +26,147 @@ class RedshiftDatasetDefinition(ApiObject): """DatasetDefinition for Redshift. 
With this input, SQL queries will be executed using Redshift to generate datasets to S3. - - Parameters: - cluster_id (str): The Redshift cluster Identifier. - database (str): The name of the Redshift database used in Redshift query execution. - db_user (str): The database user name used in Redshift query execution. - query_string (str): The SQL query statements to be executed. - cluster_role_arn (str): The IAM role attached to your Redshift cluster that - Amazon SageMaker uses to generate datasets. - output_s3_uri (str): The location in Amazon S3 where the Redshift query - results are stored. - kms_key_id (str): The AWS Key Management Service (AWS KMS) key that Amazon - SageMaker uses to encrypt data from a Redshift execution. - output_format (str): The data storage format for Redshift query results. - Valid options are "PARQUET", "CSV" - output_compression (str): The compression used for Redshift query results. - Valid options are "None", "GZIP", "SNAPPY", "ZSTD", "BZIP2" """ - cluster_id = None - database = None - db_user = None - query_string = None - cluster_role_arn = None - output_s3_uri = None - kms_key_id = None - output_format = None - output_compression = None + def __init__( + self, + cluster_id=None, + database=None, + db_user=None, + query_string=None, + cluster_role_arn=None, + output_s3_uri=None, + kms_key_id=None, + output_format=None, + output_compression=None, + ): + """Initialize RedshiftDatasetDefinition. + + Args: + cluster_id (str, default=None): The Redshift cluster Identifier. + database (str, default=None): + The name of the Redshift database used in Redshift query execution. + db_user (str, default=None): The database user name used in Redshift query execution. + query_string (str, default=None): The SQL query statements to be executed. + cluster_role_arn (str, default=None): The IAM role attached to your Redshift cluster + that Amazon SageMaker uses to generate datasets. + output_s3_uri (str, default=None): The location in Amazon S3 where the Redshift query + results are stored. + kms_key_id (str, default=None): The AWS Key Management Service (AWS KMS) key that Amazon + SageMaker uses to encrypt data from a Redshift execution. + output_format (str, default=None): The data storage format for Redshift query results. + Valid options are "PARQUET", "CSV" + output_compression (str, default=None): The compression used for Redshift query results. + Valid options are "None", "GZIP", "SNAPPY", "ZSTD", "BZIP2" + """ + super(RedshiftDatasetDefinition, self).__init__( + cluster_id=cluster_id, + database=database, + db_user=db_user, + query_string=query_string, + cluster_role_arn=cluster_role_arn, + output_s3_uri=output_s3_uri, + kms_key_id=kms_key_id, + output_format=output_format, + output_compression=output_compression, + ) class AthenaDatasetDefinition(ApiObject): """DatasetDefinition for Athena. With this input, SQL queries will be executed using Athena to generate datasets to S3. - - Parameters: - catalog (str): The name of the data catalog used in Athena query execution. - database (str): The name of the database used in the Athena query execution. - query_string (str): The SQL query statements, to be executed. - output_s3_uri (str): The location in Amazon S3 where Athena query results are stored. - work_group (str): The name of the workgroup in which the Athena query is being started. - kms_key_id (str): The AWS Key Management Service (AWS KMS) key that Amazon - SageMaker uses to encrypt data generated from an Athena query execution. 
- output_format (str): The data storage format for Athena query results. - Valid options are "PARQUET", "ORC", "AVRO", "JSON", "TEXTFILE" - output_compression (str): The compression used for Athena query results. - Valid options are "GZIP", "SNAPPY", "ZLIB" """ - catalog = None - database = None - query_string = None - output_s3_uri = None - work_group = None - kms_key_id = None - output_format = None - output_compression = None + def __init__( + self, + catalog=None, + database=None, + query_string=None, + output_s3_uri=None, + work_group=None, + kms_key_id=None, + output_format=None, + output_compression=None, + ): + """Initialize AthenaDatasetDefinition. + + Args: + catalog (str, default=None): The name of the data catalog used in Athena query + execution. + database (str, default=None): The name of the database used in the Athena query + execution. + query_string (str, default=None): The SQL query statements, to be executed. + output_s3_uri (str, default=None): + The location in Amazon S3 where Athena query results are stored. + work_group (str, default=None): + The name of the workgroup in which the Athena query is being started. + kms_key_id (str, default=None): The AWS Key Management Service (AWS KMS) key that Amazon + SageMaker uses to encrypt data generated from an Athena query execution. + output_format (str, default=None): The data storage format for Athena query results. + Valid options are "PARQUET", "ORC", "AVRO", "JSON", "TEXTFILE" + output_compression (str, default=None): The compression used for Athena query results. + Valid options are "GZIP", "SNAPPY", "ZLIB" + """ + super(AthenaDatasetDefinition, self).__init__( + catalog=catalog, + database=database, + query_string=query_string, + output_s3_uri=output_s3_uri, + work_group=work_group, + kms_key_id=kms_key_id, + output_format=output_format, + output_compression=output_compression, + ) class DatasetDefinition(ApiObject): - """DatasetDefinition input. - - Parameters: - data_distribution_type (str): Whether the generated dataset is FullyReplicated or - ShardedByS3Key (default). - input_mode (str): Whether to use File or Pipe input mode. In File (default) mode, Amazon - SageMaker copies the data from the input source onto the local Amazon Elastic Block - Store (Amazon EBS) volumes before starting your training algorithm. This is the most - commonly used input mode. In Pipe mode, Amazon SageMaker streams input data from the - source directly to your algorithm without using the EBS volume. - local_path (str): The local path where you want Amazon SageMaker to download the Dataset - Definition inputs to run a processing job. LocalPath is an absolute path to the input - data. This is a required parameter when `AppManaged` is False (default). - redshift_dataset_definition (:class:`~sagemaker.dataset_definition.inputs.RedshiftDatasetDefinition`): - Configuration for Redshift Dataset Definition input. - athena_dataset_definition (:class:`~sagemaker.dataset_definition.inputs.AthenaDatasetDefinition`): - Configuration for Athena Dataset Definition input. 
- """ + """DatasetDefinition input.""" _custom_boto_types = { "redshift_dataset_definition": (RedshiftDatasetDefinition, True), "athena_dataset_definition": (AthenaDatasetDefinition, True), } - data_distribution_type = "ShardedByS3Key" - input_mode = "File" - local_path = None - redshift_dataset_definition = None - athena_dataset_definition = None + def __init__( + self, + data_distribution_type="ShardedByS3Key", + input_mode="File", + local_path=None, + redshift_dataset_definition=None, + athena_dataset_definition=None, + ): + """Initialize DatasetDefinition. + + Parameters: + data_distribution_type (str, default="ShardedByS3Key"): + Whether the generated dataset is FullyReplicated or ShardedByS3Key (default). + input_mode (str, default="File"): + Whether to use File or Pipe input mode. In File (default) mode, Amazon + SageMaker copies the data from the input source onto the local Amazon Elastic Block + Store (Amazon EBS) volumes before starting your training algorithm. This is the most + commonly used input mode. In Pipe mode, Amazon SageMaker streams input data from the + source directly to your algorithm without using the EBS volume. + local_path (str, default=None): + The local path where you want Amazon SageMaker to download the Dataset + Definition inputs to run a processing job. LocalPath is an absolute path to the + input data. This is a required parameter when `AppManaged` is False (default). + redshift_dataset_definition + (:class:`~sagemaker.dataset_definition.inputs.RedshiftDatasetDefinition`, + default=None): + Configuration for Redshift Dataset Definition input. + athena_dataset_definition + (:class:`~sagemaker.dataset_definition.inputs.AthenaDatasetDefinition`, + default=None): + Configuration for Athena Dataset Definition input. + """ + super(DatasetDefinition, self).__init__( + data_distribution_type=data_distribution_type, + input_mode=input_mode, + local_path=local_path, + redshift_dataset_definition=redshift_dataset_definition, + athena_dataset_definition=athena_dataset_definition, + ) class S3Input(ApiObject): @@ -124,20 +177,35 @@ class S3Input(ApiObject): Note: Strong consistency is not guaranteed if S3Prefix is provided here. S3 list operations are not strongly consistent. Use ManifestFile if strong consistency is required. - - Parameters: - s3_uri (str): the path to a specific S3 object or a S3 prefix - local_path (str): the path to a local directory. If not provided, skips data download - by SageMaker platform. - s3_data_type (str): Valid options are "ManifestFile" or "S3Prefix". - s3_input_mode (str): Valid options are "Pipe" or "File". - s3_data_distribution_type (str): Valid options are "FullyReplicated" or "ShardedByS3Key". - s3_compression_type (str): Valid options are "None" or "Gzip". """ - s3_uri = None - local_path = None - s3_data_type = "S3Prefix" - s3_input_mode = "File" - s3_data_distribution_type = "FullyReplicated" - s3_compression_type = None + def __init__( + self, + s3_uri=None, + local_path=None, + s3_data_type="S3Prefix", + s3_input_mode="File", + s3_data_distribution_type="FullyReplicated", + s3_compression_type=None, + ): + """Initialize S3Input. + + Parameters: + s3_uri (str, default=None): the path to a specific S3 object or a S3 prefix + local_path (str, default=None): + the path to a local directory. If not provided, skips data download + by SageMaker platform. + s3_data_type (str, default="S3Prefix"): Valid options are "ManifestFile" or "S3Prefix". + s3_input_mode (str, default="File"): Valid options are "Pipe" or "File". 
+ s3_data_distribution_type (str, default="FullyReplicated"): + Valid options are "FullyReplicated" or "ShardedByS3Key". + s3_compression_type (str, default=None): Valid options are "None" or "Gzip". + """ + super(S3Input, self).__init__( + s3_uri=s3_uri, + local_path=local_path, + s3_data_type=s3_data_type, + s3_input_mode=s3_input_mode, + s3_data_distribution_type=s3_data_distribution_type, + s3_compression_type=s3_compression_type, + ) diff --git a/tests/integ/test_processing.py b/tests/integ/test_processing.py index 337d88af59..8ceb3f2195 100644 --- a/tests/integ/test_processing.py +++ b/tests/integ/test_processing.py @@ -747,6 +747,14 @@ def _get_processing_inputs_with_all_parameters(bucket): destination="/opt/ml/processing/input/data/", input_name="my_dataset", ), + ProcessingInput( + input_name="s3_input_wo_defaults", + s3_input=S3Input( + s3_uri=f"s3://{bucket}", + local_path="/opt/ml/processing/input/s3_input_wo_defaults", + s3_data_type="S3Prefix", + ), + ), ProcessingInput( input_name="s3_input", s3_input=S3Input( @@ -822,6 +830,17 @@ def _get_processing_job_inputs_and_outputs(bucket, output_kms_key): "S3CompressionType": "None", }, }, + { + "InputName": "s3_input_wo_defaults", + "AppManaged": False, + "S3Input": { + "S3Uri": f"s3://{bucket}", + "LocalPath": "/opt/ml/processing/input/s3_input_wo_defaults", + "S3DataType": "S3Prefix", + "S3InputMode": "File", + "S3DataDistributionType": "FullyReplicated", + }, + }, { "InputName": "s3_input", "AppManaged": False, From 17fe93e457b3053014725e93f3bf1aac986f4d21 Mon Sep 17 00:00:00 2001 From: Mohamed Ali Jamaoui Date: Thu, 16 Dec 2021 20:13:10 +0000 Subject: [PATCH 04/11] fix: typos and broken link (#2765) Co-authored-by: Shreya Pandit --- doc/frameworks/pytorch/using_pytorch.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/frameworks/pytorch/using_pytorch.rst b/doc/frameworks/pytorch/using_pytorch.rst index f5a4a732b8..9d4a4de3de 100644 --- a/doc/frameworks/pytorch/using_pytorch.rst +++ b/doc/frameworks/pytorch/using_pytorch.rst @@ -80,7 +80,7 @@ with the following: # ... load from args.train and args.test, train a model, write model to args.model_dir. -Because the SageMaker imports your training script, you should put your training code in a main guard +Because SageMaker imports your training script, you should put your training code in a main guard (``if __name__=='__main__':``) if you are using the same script to host your model, so that SageMaker does not inadvertently run your training code at the wrong point in execution. @@ -177,7 +177,7 @@ fit Required Arguments case, the S3 objects rooted at the ``my-training-data`` prefix will be available in the default ``train`` channel. A dict from string channel names to S3 URIs. In this case, the objects rooted at - each S3 prefix will available as files in each channel directory. + each S3 prefix will be available as files in each channel directory. For example: @@ -391,7 +391,7 @@ If you are using PyTorch Elastic Inference 1.5.1, you should provide ``model_fn` The client-side Elastic Inference framework is CPU-only, even though inference still happens in a CUDA context on the server. Thus, the default ``model_fn`` for Elastic Inference loads the model to CPU. Tracing models may lead to tensor creation on a specific device, which may cause device-related errors when loading a model onto a different device. Providing an explicit ``map_location=torch.device('cpu')`` argument forces all tensors to CPU. 
For more information on the default inference handler functions, please refer to: -`SageMaker PyTorch Default Inference Handler `_. +`SageMaker PyTorch Default Inference Handler `_. Serve a PyTorch Model --------------------- From 0a8ca6a19181459b7eaa16795dd43aec93e31afa Mon Sep 17 00:00:00 2001 From: sreedes <70613743+sreedes@users.noreply.github.com> Date: Sat, 18 Dec 2021 04:39:09 +0530 Subject: [PATCH 05/11] fix: Model Registration with BYO scripts (#2797) Co-authored-by: Basil Beirouti Co-authored-by: Payton Staub Co-authored-by: Ahsan Khan Co-authored-by: Mufaddal Rohawala <89424143+mufaddal-rohawala@users.noreply.github.com> Co-authored-by: Basil Beirouti Co-authored-by: Payton Staub Co-authored-by: Shreya Pandit --- src/sagemaker/model.py | 23 +++++++++++-------- tests/integ/test_mxnet.py | 48 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 9 deletions(-) diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index 4461345fa0..5af5539a96 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -178,21 +178,26 @@ def register( """ if self.model_data is None: raise ValueError("SageMaker Model Package cannot be created without model data.") + if image_uri is not None: + self.image_uri = image_uri + if model_package_group_name is not None: + container_def = self.prepare_container_def() + else: + container_def = {"Image": self.image_uri, "ModelDataUrl": self.model_data} model_pkg_args = sagemaker.get_model_package_args( content_types, response_types, inference_instances, transform_instances, - model_package_name, - model_package_group_name, - self.model_data, - image_uri or self.image_uri, - model_metrics, - metadata_properties, - marketplace_cert, - approval_status, - description, + model_package_name=model_package_name, + model_package_group_name=model_package_group_name, + model_metrics=model_metrics, + metadata_properties=metadata_properties, + marketplace_cert=marketplace_cert, + approval_status=approval_status, + description=description, + container_def_list=[container_def], drift_check_baselines=drift_check_baselines, ) model_package = self.sagemaker_session.create_model_package_from_containers( diff --git a/tests/integ/test_mxnet.py b/tests/integ/test_mxnet.py index 65c89c5876..d13108d471 100644 --- a/tests/integ/test_mxnet.py +++ b/tests/integ/test_mxnet.py @@ -231,6 +231,54 @@ def test_register_model_package( sagemaker_session.sagemaker_client.delete_model_package(ModelPackageName=model_package_name) +def test_register_model_package_versioned( + mxnet_training_job, + sagemaker_session, + mxnet_inference_latest_version, + mxnet_inference_latest_py_version, + cpu_instance_type, +): + endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp()) + + with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): + desc = sagemaker_session.sagemaker_client.describe_training_job( + TrainingJobName=mxnet_training_job + ) + model_package_group_name = "register-model-package-{}".format(sagemaker_timestamp()) + sagemaker_session.sagemaker_client.create_model_package_group( + ModelPackageGroupName=model_package_group_name + ) + model_data = desc["ModelArtifacts"]["S3ModelArtifacts"] + script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist.py") + model = MXNetModel( + model_data, + "SageMakerRole", + entry_point=script_path, + py_version=mxnet_inference_latest_py_version, + sagemaker_session=sagemaker_session, + framework_version=mxnet_inference_latest_version, + ) + model_pkg = model.register( + 
content_types=["application/json"], + response_types=["application/json"], + inference_instances=["ml.m5.large"], + transform_instances=["ml.m5.large"], + model_package_group_name=model_package_group_name, + approval_status="Approved", + ) + assert isinstance(model_pkg, ModelPackage) + predictor = model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name) + data = numpy.zeros(shape=(1, 1, 28, 28)) + result = predictor.predict(data) + assert result is not None + sagemaker_session.sagemaker_client.delete_model_package( + ModelPackageName=model_pkg.model_package_arn + ) + sagemaker_session.sagemaker_client.delete_model_package_group( + ModelPackageGroupName=model_package_group_name + ) + + def test_deploy_model_with_tags_and_kms( mxnet_training_job, sagemaker_session, From 940b1f17ad5182f31aefcb2d0e8171e3318d3e6b Mon Sep 17 00:00:00 2001 From: Xiaoguang Chen Date: Fri, 15 Oct 2021 02:31:03 +0000 Subject: [PATCH 06/11] change: Add label_headers option for Clarify ModelExplainabilityMonitor The option has been added to SageMakerClarifyProcessor API by PR 2446, this commit adds the same option to ModelExplainabilityMonitor. --- src/sagemaker/clarify.py | 14 ++++--- .../model_monitor/clarify_model_monitoring.py | 31 +++++++++++---- tests/integ/test_clarify_model_monitor.py | 3 +- .../monitor/test_clarify_model_monitor.py | 38 ++++++++++++++++--- 4 files changed, 68 insertions(+), 18 deletions(-) diff --git a/src/sagemaker/clarify.py b/src/sagemaker/clarify.py index 0829e25f4b..006cc4846c 100644 --- a/src/sagemaker/clarify.py +++ b/src/sagemaker/clarify.py @@ -290,11 +290,15 @@ def __init__( probability_threshold (float): An optional value for binary prediction tasks in which the model returns a probability, to indicate the threshold to convert the prediction to a boolean value. Default is 0.5. - label_headers (list): List of label values - one for each score of the ``probability``. + label_headers (list[str]): List of headers, each for a predicted score in model output. + For bias analysis, it is used to extract the label value with the highest score as + predicted label. For explainability job, It is used to beautify the analysis report + by replacing placeholders like "label0". """ self.label = label self.probability = probability self.probability_threshold = probability_threshold + self.label_headers = label_headers if probability_threshold is not None: try: float(probability_threshold) @@ -1060,10 +1064,10 @@ def run_explainability( explainability_config (:class:`~sagemaker.clarify.ExplainabilityConfig` or list): Config of the specific explainability method or a list of ExplainabilityConfig objects. Currently, SHAP and PDP are the two methods supported. - model_scores(str|int|ModelPredictedLabelConfig): Index or JSONPath location in the - model output for the predicted scores to be explained. This is not required if the - model output is a single score. Alternatively, an instance of - ModelPredictedLabelConfig can be provided. + model_scores (int or str or :class:`~sagemaker.clarify.ModelPredictedLabelConfig`): + Index or JSONPath to locate the predicted scores in the model output. This is not + required if the model output is a single score. Alternatively, it can be an instance + of ModelPredictedLabelConfig to provide more parameters like label_headers. wait (bool): Whether the call should wait until the job completes (default: True). logs (bool): Whether to show the logs produced by the job. Only meaningful when ``wait`` is True (default: True). 
diff --git a/src/sagemaker/model_monitor/clarify_model_monitoring.py b/src/sagemaker/model_monitor/clarify_model_monitoring.py index 10da0bf6c9..09de7b5c05 100644 --- a/src/sagemaker/model_monitor/clarify_model_monitoring.py +++ b/src/sagemaker/model_monitor/clarify_model_monitoring.py @@ -26,7 +26,7 @@ from sagemaker import image_uris, s3 from sagemaker.session import Session from sagemaker.utils import name_from_base -from sagemaker.clarify import SageMakerClarifyProcessor +from sagemaker.clarify import SageMakerClarifyProcessor, ModelPredictedLabelConfig _LOGGER = logging.getLogger(__name__) @@ -833,9 +833,10 @@ def suggest_baseline( specific explainability method. Currently, only SHAP is supported. model_config (:class:`~sagemaker.clarify.ModelConfig`): Config of the model and its endpoint to be created. - model_scores (int or str): Index or JSONPath location in the model output for the - predicted scores to be explained. This is not required if the model output is - a single score. + model_scores (int or str or :class:`~sagemaker.clarify.ModelPredictedLabelConfig`): + Index or JSONPath to locate the predicted scores in the model output. This is not + required if the model output is a single score. Alternatively, it can be an instance + of ModelPredictedLabelConfig to provide more parameters like label_headers. wait (bool): Whether the call should wait until the job completes (default: False). logs (bool): Whether to show the logs produced by the job. Only meaningful when wait is True (default: False). @@ -865,14 +866,24 @@ def suggest_baseline( headers = copy.deepcopy(data_config.headers) if headers and data_config.label in headers: headers.remove(data_config.label) + if model_scores is None: + inference_attribute = None + label_headers = None + elif isinstance(model_scores, ModelPredictedLabelConfig): + inference_attribute = str(model_scores.label) + label_headers = model_scores.label_headers + else: + inference_attribute = str(model_scores) + label_headers = None self.latest_baselining_job_config = ClarifyBaseliningConfig( analysis_config=ExplainabilityAnalysisConfig( explainability_config=explainability_config, model_config=model_config, headers=headers, + label_headers=label_headers, ), features_attribute=data_config.features, - inference_attribute=model_scores if model_scores is None else str(model_scores), + inference_attribute=inference_attribute, ) self.latest_baselining_job_name = baselining_job_name self.latest_baselining_job = ClarifyBaseliningJob( @@ -1166,7 +1177,7 @@ def attach(cls, monitor_schedule_name, sagemaker_session=None): class ExplainabilityAnalysisConfig: """Analysis configuration for ModelExplainabilityMonitor.""" - def __init__(self, explainability_config, model_config, headers=None): + def __init__(self, explainability_config, model_config, headers=None, label_headers=None): """Creates an analysis config dictionary. Args: @@ -1175,13 +1186,19 @@ def __init__(self, explainability_config, model_config, headers=None): model_config (sagemaker.clarify.ModelConfig): Config object related to bias configurations. headers (list[str]): A list of feature names (without label) of model/endpint input. + label_headers (list[str]): List of headers, each for a predicted score in model output. + It is used to beautify the analysis report by replacing placeholders like "label0". 
+ """ + predictor_config = model_config.get_predictor_config() self.analysis_config = { "methods": explainability_config.get_explainability_config(), - "predictor": model_config.get_predictor_config(), + "predictor": predictor_config, } if headers is not None: self.analysis_config["headers"] = headers + if label_headers is not None: + predictor_config["label_headers"] = label_headers def _to_dict(self): """Generates a request dictionary using the parameters provided to the class.""" diff --git a/tests/integ/test_clarify_model_monitor.py b/tests/integ/test_clarify_model_monitor.py index 6891082285..3f48fa1032 100644 --- a/tests/integ/test_clarify_model_monitor.py +++ b/tests/integ/test_clarify_model_monitor.py @@ -53,6 +53,7 @@ HEADER_OF_LABEL = "Label" HEADERS_OF_FEATURES = ["F1", "F2", "F3", "F4", "F5", "F6", "F7"] ALL_HEADERS = [*HEADERS_OF_FEATURES, HEADER_OF_LABEL] +HEADER_OF_PREDICTION = "Decision" DATASET_TYPE = "text/csv" CONTENT_TYPE = DATASET_TYPE ACCEPT_TYPE = DATASET_TYPE @@ -325,7 +326,7 @@ def scheduled_explainability_monitor( ): monitor_schedule_name = utils.unique_name_from_base("explainability-monitor") analysis_config = ExplainabilityAnalysisConfig( - shap_config, model_config, headers=HEADERS_OF_FEATURES + shap_config, model_config, headers=HEADERS_OF_FEATURES, label_headers=[HEADER_OF_PREDICTION] ) s3_uri_monitoring_output = os.path.join( "s3://", diff --git a/tests/unit/sagemaker/monitor/test_clarify_model_monitor.py b/tests/unit/sagemaker/monitor/test_clarify_model_monitor.py index e13755f208..7c1d497d64 100644 --- a/tests/unit/sagemaker/monitor/test_clarify_model_monitor.py +++ b/tests/unit/sagemaker/monitor/test_clarify_model_monitor.py @@ -279,6 +279,7 @@ # for bias ANALYSIS_CONFIG_LABEL = "Label" ANALYSIS_CONFIG_HEADERS_OF_FEATURES = ["F1", "F2", "F3"] +ANALYSIS_CONFIG_LABEL_HEADERS = ["Decision"] ANALYSIS_CONFIG_ALL_HEADERS = [*ANALYSIS_CONFIG_HEADERS_OF_FEATURES, ANALYSIS_CONFIG_LABEL] ANALYSIS_CONFIG_LABEL_VALUES = [1] ANALYSIS_CONFIG_FACET_NAME = "F1" @@ -330,6 +331,11 @@ "content_type": CONTENT_TYPE, }, } +EXPLAINABILITY_ANALYSIS_CONFIG_WITH_LABEL_HEADERS = copy.deepcopy(EXPLAINABILITY_ANALYSIS_CONFIG) +# noinspection PyTypeChecker +EXPLAINABILITY_ANALYSIS_CONFIG_WITH_LABEL_HEADERS["predictor"][ + "label_headers" +] = ANALYSIS_CONFIG_LABEL_HEADERS @pytest.fixture() @@ -1048,12 +1054,31 @@ def test_explainability_analysis_config(shap_config, model_config): explainability_config=shap_config, model_config=model_config, headers=ANALYSIS_CONFIG_HEADERS_OF_FEATURES, + label_headers=ANALYSIS_CONFIG_LABEL_HEADERS, ) - assert EXPLAINABILITY_ANALYSIS_CONFIG == config._to_dict() + assert EXPLAINABILITY_ANALYSIS_CONFIG_WITH_LABEL_HEADERS == config._to_dict() +@pytest.mark.parametrize( + "model_scores,explainability_analysis_config", + [ + (INFERENCE_ATTRIBUTE, EXPLAINABILITY_ANALYSIS_CONFIG), + ( + ModelPredictedLabelConfig( + label=INFERENCE_ATTRIBUTE, label_headers=ANALYSIS_CONFIG_LABEL_HEADERS + ), + EXPLAINABILITY_ANALYSIS_CONFIG_WITH_LABEL_HEADERS, + ), + ], +) def test_model_explainability_monitor_suggest_baseline( - model_explainability_monitor, sagemaker_session, data_config, shap_config, model_config + model_explainability_monitor, + sagemaker_session, + data_config, + shap_config, + model_config, + model_scores, + explainability_analysis_config, ): clarify_model_monitor = model_explainability_monitor # suggest baseline @@ -1061,12 +1086,12 @@ def test_model_explainability_monitor_suggest_baseline( data_config=data_config, explainability_config=shap_config, 
model_config=model_config, - model_scores=INFERENCE_ATTRIBUTE, + model_scores=model_scores, job_name=BASELINING_JOB_NAME, ) assert isinstance(clarify_model_monitor.latest_baselining_job, ClarifyBaseliningJob) assert ( - EXPLAINABILITY_ANALYSIS_CONFIG + explainability_analysis_config == clarify_model_monitor.latest_baselining_job_config.analysis_config._to_dict() ) clarify_baselining_job = clarify_model_monitor.latest_baselining_job @@ -1081,6 +1106,7 @@ def test_model_explainability_monitor_suggest_baseline( analysis_config=None, # will pick up config from baselining job baseline_job_name=BASELINING_JOB_NAME, endpoint_input=ENDPOINT_NAME, + explainability_analysis_config=explainability_analysis_config, # will pick up attributes from baselining job ) @@ -1133,6 +1159,7 @@ def test_model_explainability_monitor_created_with_config( sagemaker_session=sagemaker_session, analysis_config=analysis_config, constraints=CONSTRAINTS, + explainability_analysis_config=EXPLAINABILITY_ANALYSIS_CONFIG, ) # update schedule @@ -1263,6 +1290,7 @@ def _test_model_explainability_monitor_create_schedule( features_attribute=FEATURES_ATTRIBUTE, inference_attribute=str(INFERENCE_ATTRIBUTE), ), + explainability_analysis_config=None, ): # create schedule with patch( @@ -1278,7 +1306,7 @@ def _test_model_explainability_monitor_create_schedule( ) if not isinstance(analysis_config, str): upload.assert_called_once() - assert json.loads(upload.call_args[0][0]) == EXPLAINABILITY_ANALYSIS_CONFIG + assert json.loads(upload.call_args[0][0]) == explainability_analysis_config # validation expected_arguments = { From 185b4b0aa21df567f3992ae642945ede06a72016 Mon Sep 17 00:00:00 2001 From: Navin Soni Date: Mon, 27 Dec 2021 23:01:24 +0000 Subject: [PATCH 07/11] fix: Add ContentType in test_auto_ml_describe --- tests/integ/test_auto_ml.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/integ/test_auto_ml.py b/tests/integ/test_auto_ml.py index 1d1e656144..617ebae539 100644 --- a/tests/integ/test_auto_ml.py +++ b/tests/integ/test_auto_ml.py @@ -15,12 +15,12 @@ import os import pytest -import tests.integ -from sagemaker import AutoML, CandidateEstimator, AutoMLInput - from botocore.exceptions import ClientError + +import tests.integ +from sagemaker import AutoML, AutoMLInput, CandidateEstimator from sagemaker.utils import unique_name_from_base -from tests.integ import DATA_DIR, AUTO_ML_DEFAULT_TIMEMOUT_MINUTES, auto_ml_utils +from tests.integ import AUTO_ML_DEFAULT_TIMEMOUT_MINUTES, DATA_DIR, auto_ml_utils from tests.integ.timeout import timeout ROLE = "SageMakerRole" @@ -169,6 +169,7 @@ def test_auto_ml_describe_auto_ml_job(sagemaker_session): } }, "TargetAttributeName": TARGET_ATTRIBUTE_NAME, + "ContentType": "text/csv;header=present", } ] expected_default_output_config = { @@ -205,6 +206,7 @@ def test_auto_ml_attach(sagemaker_session): } }, "TargetAttributeName": TARGET_ATTRIBUTE_NAME, + "ContentType": "text/csv;header=present", } ] expected_default_output_config = { From 5ec2ff40d8444185072432e7b2355068a8094576 Mon Sep 17 00:00:00 2001 From: Payton Staub Date: Mon, 27 Dec 2021 15:33:34 -0600 Subject: [PATCH 08/11] fix: Re-deploy static integ test endpoint if it is not found --- tests/integ/sagemaker/lineage/conftest.py | 27 +++++++++++++++++------ 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/tests/integ/sagemaker/lineage/conftest.py b/tests/integ/sagemaker/lineage/conftest.py index dfc1ce585a..5b814bab5b 100644 --- a/tests/integ/sagemaker/lineage/conftest.py +++ 
b/tests/integ/sagemaker/lineage/conftest.py @@ -36,8 +36,8 @@ from tests.integ.sagemaker.lineage.helpers import name, names SLEEP_TIME_SECONDS = 1 -STATIC_PIPELINE_NAME = "SdkIntegTestStaticPipeline14" -STATIC_ENDPOINT_NAME = "SdkIntegTestStaticEndpoint14" +STATIC_PIPELINE_NAME = "SdkIntegTestStaticPipeline15" +STATIC_ENDPOINT_NAME = "SdkIntegTestStaticEndpoint15" @pytest.fixture @@ -518,6 +518,13 @@ def _get_static_pipeline_execution_arn(sagemaker_session): def static_endpoint_context(sagemaker_session, static_pipeline_execution_arn): endpoint_arn = get_endpoint_arn_from_static_pipeline(sagemaker_session) + if endpoint_arn is None: + _deploy_static_endpoint( + execution_arn=static_pipeline_execution_arn, + sagemaker_session=sagemaker_session, + ) + endpoint_arn = get_endpoint_arn_from_static_pipeline(sagemaker_session) + contexts = sagemaker_session.sagemaker_client.list_contexts(SourceUri=endpoint_arn)[ "ContextSummaries" ] @@ -584,11 +591,17 @@ def static_dataset_artifact(static_model_artifact, sagemaker_session): def get_endpoint_arn_from_static_pipeline(sagemaker_session): - endpoint_arn = sagemaker_session.sagemaker_client.describe_endpoint( - EndpointName=STATIC_ENDPOINT_NAME - )["EndpointArn"] + try: + endpoint_arn = sagemaker_session.sagemaker_client.describe_endpoint( + EndpointName=STATIC_ENDPOINT_NAME + )["EndpointArn"] - return endpoint_arn + return endpoint_arn + except ClientError as e: + error = e.response["Error"] + if error["Code"] == "ValidationException": + return None + raise e def get_model_package_arn_from_static_pipeline(pipeline_execution_arn, sagemaker_session): @@ -654,7 +667,7 @@ def _deploy_static_endpoint(execution_arn, sagemaker_session): sagemaker_session=sagemaker_session, ) model_package.deploy(1, "ml.t2.medium", endpoint_name=STATIC_ENDPOINT_NAME) - time.sleep(60) + time.sleep(120) except ClientError as e: if e.response["Error"]["Code"] == "ValidationException": print(f"Endpoint {STATIC_ENDPOINT_NAME} already exists. 
Continuing.") From 1312575f6a0de46c8e2cb99b9a6938f972e633a0 Mon Sep 17 00:00:00 2001 From: Miyoung Date: Thu, 30 Dec 2021 07:30:29 -0800 Subject: [PATCH 09/11] documentation :SageMaker model parallel library 1.6.0 API doc (#2814) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update smdmp change log, archive api doc for 1.4.0 and 1.5.0 * add no-index flags * finish api doc archive * fix: Set ProcessingStep upload locations deterministically to avoid c… (#2790) * fix: Prevent repack_model script from referencing nonexistent directories (#2755) Co-authored-by: Payton Staub Co-authored-by: Ahsan Khan * fix: S3Input - add support for instance attributes (#2754) * fix: typos and broken link (#2765) Co-authored-by: Shreya Pandit * add all api docs * add appendix, fix links * structural changes, fix links * incorporate feedback * prepare release v2.72.1 * update development version to v2.72.2.dev0 Co-authored-by: Payton Staub Co-authored-by: Payton Staub Co-authored-by: Ahsan Khan Co-authored-by: Mufaddal Rohawala <89424143+mufaddal-rohawala@users.noreply.github.com> Co-authored-by: Mohamed Ali Jamaoui Co-authored-by: Shreya Pandit Co-authored-by: ci Co-authored-by: Jeniya Tabassum --- CHANGELOG.md | 9 + VERSION | 2 +- doc/api/training/smd_data_parallel.rst | 6 +- doc/api/training/smd_model_parallel.rst | 64 +- .../training/smd_model_parallel_general.rst | 683 +++++++------- .../smd_model_parallel_change_log.rst | 75 +- doc/api/training/smp_versions/archives.rst | 10 + doc/api/training/smp_versions/latest.rst | 26 +- .../latest/smd_model_parallel_common_api.rst | 100 ++- .../latest/smd_model_parallel_pytorch.rst | 125 ++- ...model_parallel_pytorch_tensor_parallel.rst | 835 ++++++++++++++++++ .../latest/smd_model_parallel_tensorflow.rst | 9 +- .../v1.4.0/smd_model_parallel_common_api.rst | 488 ++++++++++ .../v1.4.0/smd_model_parallel_pytorch.rst | 572 ++++++++++++ .../v1.4.0/smd_model_parallel_tensorflow.rst | 172 ++++ .../v1.5.0/smd_model_parallel_common_api.rst | 488 ++++++++++ .../v1.5.0/smd_model_parallel_pytorch.rst | 572 ++++++++++++ .../v1.5.0/smd_model_parallel_tensorflow.rst | 172 ++++ doc/api/training/smp_versions/v1_4_0.rst | 12 + doc/api/training/smp_versions/v1_5_0.rst | 12 + 20 files changed, 3989 insertions(+), 443 deletions(-) create mode 100644 doc/api/training/smp_versions/archives.rst create mode 100644 doc/api/training/smp_versions/latest/smd_model_parallel_pytorch_tensor_parallel.rst create mode 100644 doc/api/training/smp_versions/v1.4.0/smd_model_parallel_common_api.rst create mode 100644 doc/api/training/smp_versions/v1.4.0/smd_model_parallel_pytorch.rst create mode 100644 doc/api/training/smp_versions/v1.4.0/smd_model_parallel_tensorflow.rst create mode 100644 doc/api/training/smp_versions/v1.5.0/smd_model_parallel_common_api.rst create mode 100644 doc/api/training/smp_versions/v1.5.0/smd_model_parallel_pytorch.rst create mode 100644 doc/api/training/smp_versions/v1.5.0/smd_model_parallel_tensorflow.rst create mode 100644 doc/api/training/smp_versions/v1_4_0.rst create mode 100644 doc/api/training/smp_versions/v1_5_0.rst diff --git a/CHANGELOG.md b/CHANGELOG.md index 15694498f5..3d4ef865a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## v2.72.1 (2021-12-20) + +### Bug Fixes and Other Changes + + * typos and broken link + * S3Input - add support for instance attributes + * Prevent repack_model script from referencing nonexistent directories + * Set ProcessingStep upload locations deterministically to 
avoid c… + ## v2.72.0 (2021-12-13) ### Features diff --git a/VERSION b/VERSION index ce195e8ed0..cf7d16b307 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.72.1.dev0 +2.72.2.dev0 diff --git a/doc/api/training/smd_data_parallel.rst b/doc/api/training/smd_data_parallel.rst index 27c0e5dea7..14f70a777f 100644 --- a/doc/api/training/smd_data_parallel.rst +++ b/doc/api/training/smd_data_parallel.rst @@ -1,6 +1,6 @@ -########################## -Distributed data parallel -########################## +############################################### +The SageMaker Distributed Data Parallel Library +############################################### SageMaker's distributed data parallel library extends SageMaker’s training capabilities on deep learning models with near-linear scaling efficiency, diff --git a/doc/api/training/smd_model_parallel.rst b/doc/api/training/smd_model_parallel.rst index 47a0af6775..c40bc258fb 100644 --- a/doc/api/training/smd_model_parallel.rst +++ b/doc/api/training/smd_model_parallel.rst @@ -1,5 +1,5 @@ -Distributed model parallel --------------------------- +The SageMaker Distributed Model Parallel Library +------------------------------------------------ The Amazon SageMaker distributed model parallel library is a model parallelism library for training large deep learning models that were previously difficult to train due to GPU memory limitations. @@ -9,49 +9,35 @@ allowing you to increase prediction accuracy by creating larger models with more You can use the library to automatically partition your existing TensorFlow and PyTorch workloads across multiple GPUs with minimal code changes. The library's API can be accessed through the Amazon SageMaker SDK. -Use the following sections to learn more about the model parallelism and the library. - -Use with the SageMaker Python SDK -================================= - -Use the following page to learn how to configure and enable distributed model parallel -when you configure an Amazon SageMaker Python SDK `Estimator`. +See the following sections to learn more about the SageMaker model parallel library APIs. .. toctree:: - :maxdepth: 1 + :maxdepth: 3 + smp_versions/latest smd_model_parallel_general -API Documentation -================= - -The library contains a Common API that is shared across frameworks, as well as APIs -that are specific to supported frameworks, TensorFlow and PyTorch. - -Select a version to see the API documentation for version. To use the library, reference the -**Common API** documentation alongside the framework specific API documentation. - -.. toctree:: - :maxdepth: 1 - - smp_versions/latest.rst - smp_versions/v1_3_0.rst - smp_versions/v1_2_0.rst - smp_versions/v1_1_0.rst - -It is recommended to use this documentation alongside `SageMaker Distributed Model Parallel -`__ in the Amazon SageMaker -developer guide. This developer guide documentation includes: - - An overview of model parallelism and the library - `core features `__ - - Instructions on how to modify `TensorFlow - `__ - and `PyTorch - `__ - training scripts - - `Configuration tips and pitfalls - `__ +.. tip:: + + We recommended using this API documentation with the conceptual guide at + `SageMaker's Distributed Model Parallel + `_ + in the *Amazon SageMaker developer guide*. This developer guide documentation includes: + + - An overview of model parallelism, and the library's + `core features `_, + and `extended features for PyTorch `_. + - Instructions on how to modify `TensorFlow + `_ + and `PyTorch + `_ + training scripts. 
+ - Instructions on how to `run a distributed training job using the SageMaker Python SDK + and the SageMaker model parallel library + `_. + - `Configuration tips and pitfalls + `_. .. important:: diff --git a/doc/api/training/smd_model_parallel_general.rst b/doc/api/training/smd_model_parallel_general.rst index 03c9c0078a..71f9115580 100644 --- a/doc/api/training/smd_model_parallel_general.rst +++ b/doc/api/training/smd_model_parallel_general.rst @@ -1,338 +1,212 @@ -.. admonition:: Contents +################################# +Use with the SageMaker Python SDK +################################# - - :ref:`sm-sdk-modelparallel-params` - - :ref:`ranking-basics` +Walk through the following pages to learn about the SageMaker model parallel library's APIs +to configure and enable distributed model parallelism +through an Amazon SageMaker estimator. .. _sm-sdk-modelparallel-params: -Required SageMaker Python SDK parameters -======================================== - -The TensorFlow and PyTorch ``Estimator`` objects contains a ``distribution`` parameter, -which is used to enable and specify parameters for the -initialization of the SageMaker distributed model parallel library. The library internally uses MPI, -so in order to use model parallelism, MPI must also be enabled using the ``distribution`` parameter. - -The following is an example of how you can launch a new PyTorch training job with the library. - -.. code-block:: python3 - - sagemaker_session = sagemaker.session.Session(boto_session=session) - - mpi_options = { - "enabled" : True, - "processes_per_host" : 8, - "custom_mpi_options" : "--mca btl_vader_single_copy_mechanism none " - } - - smp_options = { - "enabled":True, - "parameters": { - "microbatches": 4, - "placement_strategy": "spread", - "pipeline": "interleaved", - "optimize": "speed", - "partitions": 2, - "ddp": True, - } - } - - smd_mp_estimator = PyTorch( - entry_point="training-script.py", # Pick your train script - source_dir='utils', - role=role, - instance_type='ml.p3.16xlarge', - sagemaker_session=sagemaker_session, - framework_version='1.6.0', - py_version='py3', - instance_count=1, - distribution={ - "smdistributed": {"modelparallel": smp_options}, - "mpi": mpi_options - }, - base_job_name="SMD-MP-demo", - ) - - smd_mp_estimator.fit('s3://my_bucket/my_training_data/') - -``smdistributed`` Parameters -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -You can use the following parameters to initialize the library using the ``parameters`` -in the ``smdistributed`` of ``distribution``. - -Note: ``partitions`` is required in ``parameters`` of ``smp_options``. All other parameters in the following -table are optional. - -.. table:: - :widths: 10 20 10 60 +Configuration Parameters for ``distribution`` +============================================= - +---------------------------+-------------------------+-------------------+-----------------------+ - | **Parameter** | **Type / Valid values** | **Default** | **Description** | - | | | | | - +---------------------------+-------------------------+-------------------+-----------------------+ - | ``partitions`` (required) | int | - | The number of | - | | | | partitions to | - | | | | split the model | - | | | | into. | - +---------------------------+-------------------------+-------------------+-----------------------+ - | ``microbatches`` | int | 1 | The number of | - | | | | microbatches to | - | | | | perform | - | | | | pipelining | - | | | | over. 1 means | - | | | | no pipelining. 
| - | | | | Batch size must | - | | | | be divisible by | - | | | | the number of | - | | | | microbatches. | - +---------------------------+-------------------------+-------------------+-----------------------+ - | ``pipeline`` | ``"interleaved"`` | ``"interleaved"`` | The pipeline | - | | or ``"simple"`` | | schedule. | - | | | | | - +---------------------------+-------------------------+-------------------+-----------------------+ - | ``optimize`` | ``"memory"`` or | ``"memory"`` | Whether the library | - | | ``"speed"`` | | should optimize | - | | | | for speed or | - | | | | memory during | - | | | | partitioning | - | | | | decision and | - | | | | pipeline | - | | | | execution. | - | | | | | - | | | | | - | | | | **speed** | - | | | | When the library is | - | | | | configured to | - | | | | optimize speed, | - | | | | it attempts to | - | | | | balance the | - | | | | number of | - | | | | operations | - | | | | executed in | - | | | | each device, | - | | | | and executes a | - | | | | less strict | - | | | | pipeline | - | | | | schedule in | - | | | | which a | - | | | | microbatch can | - | | | | start executing | - | | | | before the | - | | | | previous | - | | | | microbatch is | - | | | | completely | - | | | | finished on | - | | | | that device. | - | | | | | - | | | | | - | | | | **memory** | - | | | | When the library | - | | | | optimizes | - | | | | memory, it | - | | | | attempts to | - | | | | balance the | - | | | | total number of | - | | | | stored | - | | | | trainable | - | | | | parameters and | - | | | | activations on | - | | | | each device and | - | | | | imposes a | - | | | | strict pipeline | - | | | | schedule on the | - | | | | backend. | - +---------------------------+-------------------------+-------------------+-----------------------+ - | ``placement_strategy`` | ``"cluster"`` or | ``"cluster"`` | When hybrid | - | | ``"spread"`` | | model/data | - | | | | parallelism is | - | | | | used, | - | | | | cluster | - | | | | places a single | - | | | | model replica | - | | | | in neighboring | - | | | | device IDs, | - | | | | whereas | - | | | | spread | - | | | | places them as | - | | | | far as | - | | | | possible. | - | | | | | - | | | | Example: | - | | | | - 8 GPUs: [0, | - | | | | 1, 2, 3, 4, 5, | - | | | | 6, 7], 4-way | - | | | | model | - | | | | parallelism, | - | | | | 2-way data | - | | | | parallelism. | - | | | | Two model | - | | | | replicas, each | - | | | | partitioned | - | | | | across 4 GPUs. | - | | | | | - | | | | | - | | | | **spread** | - | | | | places | - | | | | the two model | - | | | | replicas in [0, | - | | | | 2, 4, 6] and | - | | | | [1, 3, 5, 7]. | - | | | | | - | | | | | - | | | | **cluster** | - | | | | places the two | - | | | | model replicas | - | | | | in [0, 1, 2, 3] | - | | | | and [4, 5, 6, 7]. | - | | | | | - | | | | This can be | - | | | | useful, for | - | | | | instance, for | - | | | | performing | - | | | | model | - | | | | parallelism | - | | | | across | - | | | | instances, and | - | | | | leaving the | - | | | | intra-node | - | | | | high-bandwidth | - | | | | NVLinks for | - | | | | data | - | | | | parallelism. | - +---------------------------+-------------------------+-------------------+-----------------------+ - | ``auto_partition`` | bool | ``True`` | Enable | - | | | | auto-partitioning. | - | | | | If disabled, | - | | | | ``default_partition`` | - | | | | parameter | - | | | | must be | - | | | | provided. 
| - +---------------------------+-------------------------+-------------------+-----------------------+ - | ``default_partition`` | int | ``0`` | The partition | - | | | | ID to place | - | (required if | | | operations/modules | - | auto_partition if false) | | | that are not | - | | | | placed in any | - | | | | ``smp.partition`` | - | | | | contexts. | - +---------------------------+-------------------------+-------------------+-----------------------+ - - -.. rubric:: TensorFlow-specific parameters - -.. table:: - :widths: 10 20 10 60 +Amazon SageMaker's TensorFlow and PyTorch estimator objects contain a ``distribution`` parameter, +which you can use to enable and specify parameters for SageMaker distributed training. +The SageMaker model parallel library internally uses MPI. +To use model parallelism, both ``smdistributed`` and MPI must be enabled +through the ``distribution`` parameter. + +.. tip:: + + This page provides you a complete list of parameters you can use + when you construct a SageMaker estimator and configure for distributed training. + + To find examples of how to construct a SageMaker estimator with the distributed training parameters, see + `Launch a SageMaker Distributed Model Parallel Training Job `_ + in the `SageMaker's Distributed Model Parallel developer guide `_. + +.. contents:: Table of Contents + :depth: 3 + :local: + +Parameters for ``smdistributed`` +---------------------------------- - +----------------+-------------------------+-------------+-----------------+ - | **Parameter** | **Type / Valid values** | **Default** | **Description** | - | | | | | - +----------------+-------------------------+-------------+-----------------+ - | ``contiguous`` | bool | ``True`` | Whether the | - | | | | model | - | | | | partitions | - | | | | should be | - | | | | contiguous. If | - | | | | true, each | - | | | | partition forms | - | | | | a connected | - | | | | component in | - | | | | the | - | | | | computational | - | | | | graph, unless | - | | | | the graph | - | | | | itself is not | - | | | | connected. | - +----------------+-------------------------+-------------+-----------------+ - | ``horovod`` | bool | ``False`` | Must be set to | - | | | | ``True`` if | - | | | | hybrid | - | | | | model/data | - | | | | parallelism is | - | | | | used and the | - | | | | data | - | | | | parallelism | - | | | | (DP) framework | - | | | | is Horovod. | - +----------------+-------------------------+-------------+-----------------+ - -.. rubric:: PyTorch-specific parameters - -.. table:: +You can use the following parameters to initialize the library +configuring a dictionary for ``modelparallel``, which goes +into the ``smdistributed`` option for the ``distribution`` parameter. + +.. note:: + + ``partitions`` for TensorFlow and ``pipeline_parallel_degree`` for PyTorch are required parameters. + All other parameters in the following + table are optional. + +Common Parameters +~~~~~~~~~~~~~~~~~ + +.. list-table:: + :widths: 10 20 10 60 + :header-rows: 1 + + * - Parameter + - Type / Valid values + - Default + - Description + * - ``partitions`` for TensorFlow and PyTorch with smdistributed-modelparallel=v1.6) + - int + - + - **Required.** The number of partitions to split the model into. + In case of ``pipeline_parallel_degree`` for PyTorch, this is the number of devices + over which pipeline parallelism will be performed. + * - ``microbatches`` + - int + - 1 + - The number of microbatches to perform pipelining over. 1 means no pipelining. 
+       Batch size must be divisible by the number of microbatches.
+   * - ``pipeline``
+     - ``"interleaved"`` or ``"simple"``
+     - ``"interleaved"``
+     - The pipeline schedule.
+   * - ``optimize``
+     - ``"memory"`` or ``"speed"``
+     - ``"memory"``
+     - Determines the distribution mechanism of transformer layers.
+       If optimizing for ``speed``, there will be less communication across tensor-parallel ranks
+       and layer normalization will not be distributed; however, duplicate activations will be
+       stored across tensor-parallel ranks.
+       If optimizing for ``memory``, there will be no redundant activations stored,
+       but this results in more communication overhead across tensor-parallel ranks.
+   * - ``placement_strategy``
+     - ``"cluster"``, ``"spread"``, or a permutation of the letters ``D``, ``P``, and ``T``
+     - ``"cluster"``
+     - Determines the mapping of model partitions onto physical devices.
+       When hybrid model/data parallelism is used, ``cluster`` places a single model replica in
+       neighboring device IDs, whereas ``spread`` places the model replicas as far apart as possible.
+       For more information, see :ref:`ranking-basics`.
+
+       In a permutation string, ``D`` stands for reduced-data parallelism,
+       ``P`` stands for pipeline parallelism,
+       and ``T`` stands for tensor parallelism.
+       ``spread`` is equivalent to ``"TPD"``, and ``cluster`` is equivalent to ``"DPT"``.
+       For more information, see :ref:`ranking-basics-tensor-parallelism`.
+
+       Note: For TensorFlow, tensor parallelism is not implemented, and the
+       available values are only ``"spread"`` and ``"cluster"``.
+   * - ``auto_partition``
+     - bool
+     - ``True``
+     - Enable auto-partitioning. If disabled, the ``default_partition`` parameter must be provided.
+   * - ``default_partition``
+     - int
+     - ``0``
+     - **Required** if ``auto_partition`` is false. The partition ID to place operations/modules
+       that are not placed in any ``smp.partition`` contexts.
+
+TensorFlow-specific Parameters
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. list-table::
+   :widths: 10 20 10 60
+   :header-rows: 1
+
+   * - Parameter
+     - Type / Valid values
+     - Default
+     - Description
+   * - ``contiguous``
+     - bool
+     - ``True``
+     - Whether the model partitions should be contiguous. If true, each partition forms a connected component in the computational graph, unless the graph itself is not connected.
+   * - ``horovod``
+     - bool
+     - ``False``
+     - Must be set to ``True`` if hybrid model/data parallelism is used and the data parallelism (DP) framework is Horovod.
+
+
+PyTorch-specific Parameters
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. list-table::
+   :widths: 10 20 10 60
+   :header-rows: 1
+
+   * - Parameter
+     - Type / Valid values
+     - Default
+     - Description
+   * - ``memory_weight``
+     - float [0.0, 1.0]
+     - ``0.2`` if ``optimize`` is ``"speed"``, else ``0.8``
+     - The weight of memory balancing in the auto-partitioning objective, as opposed to balancing computational load. If 0.0, the library only tries to balance computation; if 1.0, the library only tries to balance memory use. Any value in between interpolates between these extremes.
+   * - ``ddp``
+     - bool
+     - ``False``
+     - Must be set to ``True`` if hybrid model/data parallelism is used with ``DistributedDataParallel``. ``DistributedDataParallel`` is used with the NCCL backend, and uses the ``MASTER_PORT`` provided by SageMaker.
+   * - ``active_microbatches`` (**smdistributed-modelparallel**>=v1.3)
+     - int
+     - ``partitions`` + 2
+     - This is the maximum number of microbatches that are simultaneously in execution during pipelining.
Jointly scaling batch size and number of microbatches can often mitigate the pipeline bubble overhead, but that can lead to increased memory usage if too many microbatches are simultaneously in execution. In such cases setting the number of active microbatches to a lower number can help control memory usage. By default this is set to two plus the number of partitions of the model. + * - ``deterministic_server`` (**smdistributed-modelparallel**>=v1.3) + - bool + - ``False`` + - Setting this to true ensures that the execution server for pipelining executes requests in the same order across all data parallel ranks. + * - ``offload_activations`` (**smdistributed-modelparallel**>=v1.6) + - bool + - False + - Enables activation + offloading. To improve GPU memory usage, use activation offloading + only when (1) the ``microbatches`` and ``active_microbatches`` are + greater than 1, and (2) activation checkpointing is enabled for at + least one module in the model. + * - ``activation_loading_horizon`` (**smdistributed-modelparallel**>=v1.6) + - int + - 4 + - Specify the number + of pipeline tasks. This determines how early the activations should + be loaded back to the GPU, expressed in number of pipeline tasks. + Smaller value indicates that activations are loaded closer in time to + when they are needed for backward pass. Setting this value too small + might improve memory usage, but might potentially cause throughput + loss and GPU bottlenecks during the CPU-to-GPU data transfer. + * - ``tensor_parallel_degree`` (**smdistributed-modelparallel**>=v1.6) + - int + - 1 + - The number of devices over which the tensor parallel modules will be distributed. + If ``tensor_parallel_degree`` is greater than 1, then ``ddp`` must be set to ``True``. + * - ``fp16_params`` (**smdistributed-modelparallel**>=v1.6) + - bool + - ``False`` + - If ``True``, the parameters of the distributed modules will be initialized in FP16. + * - ``shard_optimizer_state`` (**smdistributed-modelparallel**>=v1.6) + - bool + - ``False`` + - If ``True``, the library shards the optimizer state of all parameters across + the data parallel processes which hold the same parameter. + This optimizer state sharding happens in a balanced manner. + Note that when sharding optimizer state, full optimizer saving is not currently supported. + Please save partial optimizer state. For more information about saving and loading checkpoints with + optimizer state sharding, see `Instructions for Checkpointing with Tensor Parallelism `_. + * - ``prescaled_batch`` (**smdistributed-modelparallel**>=v1.6) + - bool + - ``False`` + - If ``True`` and when ``smp.nn.DistributedTransformerLMHead`` is used + (this is typically used for GPT-2 or GPT-3 models), + the library assumes that the devices in the same tensor parallelism group + receive the same input data. Otherwise, it is assumed that they receive + different examples. To learn more, see :ref:`prescaled-batch`. + * - ``skip_tracing`` (**smdistributed-modelparallel**>=v1.6) + - bool + - False + - Skips the initial tracing step. This can be useful in very large models + where even model tracing at the CPU is not possible due to memory constraints. 
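+
+For illustration only, the following sketch shows how the ``smdistributed``
+parameters described above and the ``mpi`` options described in the next
+section fit together in the ``distribution`` argument of a SageMaker
+``PyTorch`` estimator. The entry point, IAM role, S3 paths, and
+framework/Python versions are placeholders rather than tested values; see the
+developer guide linked in the tip above for a complete, maintained example.
+
+.. code:: python
+
+    from sagemaker.pytorch import PyTorch
+
+    smp_options = {
+        "enabled": True,
+        "parameters": {
+            "pipeline_parallel_degree": 2,  # required for PyTorch (v1.6 and later)
+            "microbatches": 4,
+            "placement_strategy": "cluster",
+            "tensor_parallel_degree": 2,    # requires "ddp": True when greater than 1
+            "ddp": True,
+        },
+    }
+
+    mpi_options = {
+        "enabled": True,                    # MPI must also be enabled
+        "processes_per_host": 8,
+    }
+
+    smd_mp_estimator = PyTorch(
+        entry_point="train.py",             # placeholder training script
+        role="<your-iam-role-arn>",         # placeholder IAM role
+        instance_type="ml.p3.16xlarge",
+        instance_count=1,
+        framework_version="1.8.1",          # illustrative; use a supported version
+        py_version="py36",
+        distribution={
+            "smdistributed": {"modelparallel": smp_options},
+            "mpi": mpi_options,
+        },
+    )
+
+    smd_mp_estimator.fit("s3://<your-bucket>/<training-data-prefix>/")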
+ + +Parameters for ``mpi`` +---------------------- - +--------------------------+-------------------------+--------------------+--------------------------------------+ - | **Parameter** | **Type / Valid values** | **Default** | **Description** | - | | | | | - +--------------------------+-------------------------+--------------------+--------------------------------------+ - | ``memory_weight`` | float (between | 0.2 if | The weight of | - | | 0.0 and 1.0) | ``optimize`` is | memory | - | | | ``"speed"``, | balancing in | - | | | else 0.8 | the | - | | | | auto-partitioni | - | | | | ng | - | | | | objective, as | - | | | | opposed to | - | | | | balancing | - | | | | computational | - | | | | load. If 0.0, | - | | | | the library only tries | - | | | | to balance | - | | | | computation; if | - | | | | 1.0 the library only | - | | | | tries to | - | | | | balance the | - | | | | memory use. Any | - | | | | value in | - | | | | between | - | | | | interpolates | - | | | | between these | - | | | | extremes. | - +--------------------------+-------------------------+--------------------+--------------------------------------+ - | ``ddp`` | bool | ``False`` | Must be set to | - | | | | ``True`` if | - | | | | hybrid | - | | | | model/data | - | | | | parallelism is | - | | | | used | - | | | | with ``DistributedDataParallel``. | - | | | | ``DistributedDataParallel`` | - | | | | is used with | - | | | | NCCL backend, | - | | | | and uses the | - | | | | ``MASTER_PORT`` | - | | | | provided by | - | | | | SageMaker. | - +--------------------------+-------------------------+--------------------+--------------------------------------+ - | ``active_microbatches`` | int | ``partitions`` + 2 | This is the maximum number of | - | (Only >= v1.3) | | | microbatches that are simultaneously | - | | | | in execution during pipelining. | - | | | | Jointly scaling batch | - | | | | size and number of microbatches | - | | | | can often mitigate the pipeline | - | | | | bubble overhead, but that can | - | | | | lead to increased memory usage | - | | | | if too many microbatches are | - | | | | simultaneously in execution. | - | | | | In such cases setting the | - | | | | number of active | - | | | | microbatches to a lower number | - | | | | can help control memory usage. | - | | | | By default this is set to two | - | | | | plus the number of | - | | | | partitions of the model. | - +--------------------------+-------------------------+--------------------+--------------------------------------+ - | ``deterministic_server`` | bool | ``False`` | Setting this to true | - | (Only >= v1.3) | | | ensures that the execution | - | | | | server for pipelining | - | | | | executes requests in the | - | | | | same order across all | - | | | | data parallel ranks. | - +--------------------------+-------------------------+--------------------+--------------------------------------+ - - -``mpi`` Parameters -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ For the ``"mpi"`` key, a dict must be passed which contains: * ``"enabled"``: Set to ``True`` to launch the training job with MPI. @@ -369,8 +243,8 @@ For the ``"mpi"`` key, a dict must be passed which contains: .. _ranking-basics: -Ranking Basics -============== +Ranking Basics without Tensor Parallelism +========================================= The library maintains a one-to-one mapping between processes and available GPUs: for each GPU, there is a corresponding CPU process. Each CPU process @@ -387,27 +261,136 @@ launched in the instance. 
For instance, in the preceding example, ``local_rank``\ s of the processes
will range from 0 to 7, since there are 8 GPUs in a ``p3dn.24xlarge``
instance.

-When the library is used together with data parallelism (Horovod for TensorFlow
+When model parallelism is used together with data parallelism (Horovod for TensorFlow
 and DDP for PyTorch), the library partitions the set of processes into
 disjoint \ ``mp_group``\ s. An ``mp_group`` is a subset of all processes
-that together hold a single, partitioned model replica. For instance, if
-a single node job is launched with 8 local processes, and
-``partitions`` is 2 (meaning the model will be split into 2), there are
+that together hold a single, partitioned model replica.
+
+For instance, if
+a single-node job is launched with 8 local processes and
+``partitions=2`` (meaning the model will be split into 2), there are
 four \ ``mp_group``\ s. The specific sets of processes that form the
-``mp_group``\ s can be adjusted by the ``placement_strategy`` option. In
-this example, if ``placement_strategy`` is ``spread``, then the four
-``mp_group``\ s are ``[0, 4], [1, 5], [2, 6], [3, 7]``. An
-``mp_rank`` is the rank of a process within its own ``mp_group``. In the
-previous example, the ``mp_rank`` of process 1 is 0, and ``mp_rank`` of
-process 6 is 1.
-
-Analogously, the library defines ``dp_group``\ s as the sets of processes that
-all hold the same model partition, and perform data parallelism among
-each other. In the example above, there are two ``dp_group``\ s,
-``[0, 1, 2, 3]`` and ``[4, 5, 6, 7]``,
-
-since each process within the ``dp_group`` holds the same partition of
-the model, and makes allreduce calls among themselves. Allreduce for
-data parallelism does not take place *across* ``dp_group``\ s.
-``dp_rank`` is defined as the rank of a process within its ``dp_group``.
-In the preceding example, the \ ``dp_rank`` of process 6 is 2.
+``mp_group``\ s can be adjusted by the ``placement_strategy`` option.
+
+- If ``placement_strategy`` is ``spread``, then the four
+  ``mp_group``\ s are ``[0, 4], [1, 5], [2, 6], [3, 7]``. The
+  ``mp_rank`` is the rank of a process within its ``mp_group``. For example,
+  the ``mp_rank`` is 0 for processes 0, 1, 2, and 3, and the ``mp_rank`` is 1 for
+  processes 4, 5, 6, and 7.
+
+  Analogously, the library defines ``dp_group``\ s as sets of processes that
+  all hold the same model partition, and perform data parallelism among
+  each other. If ``placement_strategy`` is ``spread``, there are two ``dp_group``\ s:
+  ``[0, 1, 2, 3]`` and ``[4, 5, 6, 7]``.
+
+  Each process within a ``dp_group`` holds the same partition of
+  the model and makes allreduce calls with the other processes in its group.
+  Allreduce for data parallelism does not take place *across* ``dp_group``\ s.
+  ``dp_rank`` is defined as the rank of a process within its ``dp_group``.
+  In the preceding example, the \ ``dp_rank`` of process 6 is 2.
+
+- If ``placement_strategy`` is ``cluster``, the four ``mp_group``\ s
+  become ``[0, 1], [2, 3], [4, 5], [6, 7]``, and the two ``dp_group``\ s become
+  ``[0, 2, 4, 6]`` and ``[1, 3, 5, 7]``.
+
+.. _ranking-basics-tensor-parallelism:
+
+Placement Strategy with Tensor Parallelism
+==========================================
+
+In addition to the two placement strategies introduced in the previous section,
+the library provides additional placement strategies for the extended tensor parallelism features
+for PyTorch.
The additional placement strategies (parallelism types) are denoted as follows:
+
+- ``D`` stands for (reduced) data parallelism.
+- ``P`` stands for pipeline parallelism.
+- ``T`` stands for tensor parallelism.
+
+Given a permutation of the three letters, the library performs the parallelism
+type represented by the right-most letter over global ranks in ascending order,
+that is, over neighboring ranks.
+Conversely, the parallelism type represented by the left-most letter is performed
+over ranks that are as far apart as possible.
+
+- **Example:** Given 8 devices with ``tp_size() == 2``,
+  ``pp_size() == 2``, ``rdp_size() == 2``
+
+  - ``placement_strategy: "DPT"`` gives
+
+    ==== ======== ======= =======
+    rank rdp_rank pp_rank tp_rank
+    ==== ======== ======= =======
+    0    0        0       0
+    1    0        0       1
+    2    0        1       0
+    3    0        1       1
+    4    1        0       0
+    5    1        0       1
+    6    1        1       0
+    7    1        1       1
+    ==== ======== ======= =======
+
+  - ``placement_strategy: "PTD"`` gives
+
+    ==== ======== ======= =======
+    rank rdp_rank pp_rank tp_rank
+    ==== ======== ======= =======
+    0    0        0       0
+    1    1        0       0
+    2    0        0       1
+    3    1        0       1
+    4    0        1       0
+    5    1        1       0
+    6    0        1       1
+    7    1        1       1
+    ==== ======== ======= =======
+
+Because neighboring ranks are placed on the same instance with
+high-bandwidth NVLinks, it is recommended to place the
+parallelism type that has higher bandwidth requirements for your model
+on the right-most position in the ``placement_strategy`` string. Because
+tensor parallelism often requires frequent communication, placing
+``T`` in the right-most position is recommended (as in the default
+``"cluster"`` strategy). In many large models, keeping the default of
+``"cluster"`` results in the best performance.
+
+
+.. _prescaled-batch:
+
+Prescaled Batch
+===============
+
+``prescaled_batch`` is a configuration parameter that can be useful for
+``DistributedTransformerLMHead``, which is used for GPT-2 and GPT-3.
+
+When a module is distributed with tensor parallelism,
+the inputs to the distributed module in different ``tp_rank``\ s get
+shuffled around so that they are sliced along the hidden dimension and
+scaled along the batch dimension. For example, if the tensor parallel degree is
+8, the inputs to ``DistributedTransformer`` (a tensor with shape
+``[B, S, H]`` where ``B``\ =batch size, ``S``\ =sequence length,
+``H``\ =hidden width) in different ``tp_rank``\ s will be communicated
+around, and the shapes will become ``[8B, S, H/8]``. Each ``tp_rank``
+has the batch from all the peer ``tp_rank``\ s, but only the slice that
+interacts with its local partition of the module.
+
+By default, the library assumes that each ``tp_rank`` gets assigned a
+different batch, and performs the communication described above. If
+``prescaled_batch`` is true, then the library assumes that the input
+batch is already scaled (and is the same across the ``tp_rank``\ s), and
+only does the slicing. In the example above, the library assumes that the
+input tensor has shape ``[8B, S, H]``, and only converts it into
+``[8B, S, H/8]``. So if ``prescaled_batch`` is true, it is the user’s
+responsibility to feed the same batch to the ``tp_rank``\ s in the same
+``TP_GROUP``. This can be done by doing the data sharding based on
+``smp.rdp_size()`` and ``smp.rdp_rank()``, instead of ``smp.dp_size()``
+and ``smp.dp_rank()``. When ``prescaled_batch`` is true, the global
+batch size is ``smp.rdp_size()`` multiplied by the per-``MP_GROUP``
+batch size. When ``prescaled_batch`` is false, the global batch size is
+``smp.dp_size()`` multiplied by the per-``PP_GROUP`` batch size.
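+
+As an illustration of this sharding choice, the following is a minimal sketch.
+It assumes the usual ``import smdistributed.modelparallel.torch as smp`` from a
+training script that has already initialized the library, plus a PyTorch
+``Dataset`` named ``dataset``, a batch size variable ``per_group_batch_size``,
+and a boolean ``prescaled_batch`` read from your own configuration; these names
+are placeholders, not part of the library's API.
+
+.. code:: python
+
+    import smdistributed.modelparallel.torch as smp
+    from torch.utils.data import DataLoader
+    from torch.utils.data.distributed import DistributedSampler
+
+    if prescaled_batch:
+        # Every tp_rank in a TP_GROUP must receive the same batch:
+        # shard the data across reduced-data-parallel ranks.
+        sampler = DistributedSampler(
+            dataset, num_replicas=smp.rdp_size(), rank=smp.rdp_rank()
+        )
+    else:
+        # Each tp_rank receives a different batch:
+        # shard the data across data-parallel ranks.
+        sampler = DistributedSampler(
+            dataset, num_replicas=smp.dp_size(), rank=smp.dp_rank()
+        )
+
+    train_loader = DataLoader(
+        dataset, batch_size=per_group_batch_size, sampler=sampler
+    )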
+ +If you use pipeline parallelism degree 1, then you can keep +``prescaled_batch`` false (the default option). If you use a pipeline +parallellism degree more than 1, it is recommended to use +``prescaled_batch`` true, so that you can increase per-``MP_GROUP`` +batch size for efficient pipelining, without running into out-of-memory +issues. diff --git a/doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst b/doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst index 249a38573e..feed17a101 100644 --- a/doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst +++ b/doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst @@ -1,6 +1,67 @@ -Sagemaker Distributed Model Parallel 1.4.0 Release Notes +Sagemaker Distributed Model Parallel 1.6.0 Release Notes ======================================================== +*Date: December. 20. 2021* + +**New Features** + +- **PyTorch** + + - Added extended memory-saving features for PyTorch 1.8.1: + + - Tensor parallelism + - Optimizer state sharding + - Activation checkpointing + - Activation offloading + + For more information, see the following documentation: + + - `SageMaker distributed model parallel developer guide `_ + - `SageMaker distributed model parallel API documentation for v1.6.0 `_ + +**Migration to AWS Deep Learning Containers** + +This version passed benchmark testing and is migrated to the following +AWS Deep Learning Container(s): + +- Deep Learning Container for PyTorch 1.8.1: + + .. code:: + + 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.8.1-gpu-py36-cu111-ubuntu18.04 + +---- + +Release History +=============== + +Sagemaker Distributed Model Parallel 1.5.0 Release Notes +-------------------------------------------------------- + +*Date: November. 03. 2021* + +**New Features** + +- **PyTorch** + + - Currency update for PyTorch 1.10.0 + +**Migration to AWS Deep Learning Containers** + +This version passed benchmark testing and is migrated to the following +AWS Deep Learning Containers: + +- Deep Learning Container for PyTorch 1.10.0: + + .. code:: + + 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.10.0-gpu-py38-cu113-ubuntu20.04-sagemaker + +---- + +Sagemaker Distributed Model Parallel 1.4.0 Release Notes +-------------------------------------------------------- + *Date: June. 29. 2021* **New Features** @@ -15,17 +76,19 @@ Sagemaker Distributed Model Parallel 1.4.0 Release Notes This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers: -- TensorFlow 2.5.0 DLC release: `v1.0-tf-2.5.0-tr-py37 - `__ +- Deep Learning Container for TensorFlow 2.5.0: .. code:: 763104351884.dkr.ecr..amazonaws.com/tensorflow-training:2.5.0-gpu-py37-cu112-ubuntu18.04-v1.0 ----- +- Deep Learning Container for PyTorch 1.9.1: -Release History -=============== + .. code:: + + 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.9.1-gpu-py38-cu111-ubuntu20.04 + +---- Sagemaker Distributed Model Parallel 1.3.1 Release Notes -------------------------------------------------------- diff --git a/doc/api/training/smp_versions/archives.rst b/doc/api/training/smp_versions/archives.rst new file mode 100644 index 0000000000..c1b3d55491 --- /dev/null +++ b/doc/api/training/smp_versions/archives.rst @@ -0,0 +1,10 @@ +.. _smdmp-pt-version-archive: + +.. 
toctree:: + :maxdepth: 1 + + v1_5_0.rst + v1_4_0.rst + v1_3_0.rst + v1_2_0.rst + v1_1_0.rst diff --git a/doc/api/training/smp_versions/latest.rst b/doc/api/training/smp_versions/latest.rst index c99975cd27..336fe7df87 100644 --- a/doc/api/training/smp_versions/latest.rst +++ b/doc/api/training/smp_versions/latest.rst @@ -1,5 +1,16 @@ +############################################### +Use the Library's API to Adapt Training Scripts +############################################### -Version 1.4.0 (Latest) +The library provides Common APIs that you can use across frameworks, +as well as framework-specific APIs for TensorFlow and PyTorch. + +Select the latest or one of the previous versions of the API documentation +depending on which version of the library you need to use. +To use the library, reference the +**Common API** documentation alongside the framework specific API documentation. + +Version 1.6.0 (Latest) ====================== To use the library, reference the Common API documentation alongside the framework specific API documentation. @@ -9,4 +20,17 @@ To use the library, reference the Common API documentation alongside the framewo latest/smd_model_parallel_common_api latest/smd_model_parallel_pytorch + latest/smd_model_parallel_pytorch_tensor_parallel latest/smd_model_parallel_tensorflow + +To find archived API documentation for the previous versions of the library, +see the following link: + + +Documentation Archive +===================== + +.. toctree:: + :maxdepth: 1 + + archives diff --git a/doc/api/training/smp_versions/latest/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/latest/smd_model_parallel_common_api.rst index 82ef6c6df0..d1f6b4d45b 100644 --- a/doc/api/training/smp_versions/latest/smd_model_parallel_common_api.rst +++ b/doc/api/training/smp_versions/latest/smd_model_parallel_common_api.rst @@ -1,14 +1,16 @@ -.. admonition:: Contents - - - :ref:`communication_api` - - :ref:`mpi_basics` - Common API ========== The following SageMaker distribute model parallel APIs are common across all frameworks. -**Important**: This API document assumes you use the following import statement in your training scripts. +.. contents:: Table of Contents + :depth: 3 + :local: + +The Library's Core APIs +----------------------- + +This API document assumes you use the following import statement in your training scripts. **TensorFlow** @@ -254,30 +256,78 @@ The following SageMaker distribute model parallel APIs are common across all fra .. _mpi_basics: MPI Basics -^^^^^^^^^^ +---------- The library exposes the following basic MPI primitives to its Python API: -- ``smp.rank()``: The rank of the current process. -- ``smp.size()``: The total number of processes. -- ``smp.mp_rank()``: The rank of the process among the processes that - hold the current model replica. -- ``smp.dp_rank()``: The rank of the process among the processes that - hold different replicas of the same model partition. -- ``smp.dp_size()``: The total number of model replicas. -- ``smp.local_rank()``: The rank among the processes on the current - instance. -- ``smp.local_size()``: The total number of processes on the current - instance. -- ``smp.get_mp_group()``: The list of ranks over which the current - model replica is partitioned. -- ``smp.get_dp_group()``: The list of ranks that hold different - replicas of the same model partition. - - .. _communication_api: +**Global** + +- ``smp.rank()`` : The global rank of the current process. +- ``smp.size()`` : The total number of processes. 
+- ``smp.get_world_process_group()`` : + ``torch.distributed.ProcessGroup`` that contains all processes. +- ``smp.CommGroup.WORLD``: The communication group corresponding to all processes. +- ``smp.local_rank()``: The rank among the processes on the current instance. +- ``smp.local_size()``: The total number of processes on the current instance. +- ``smp.get_mp_group()``: The list of ranks over which the current model replica is partitioned. +- ``smp.get_dp_group()``: The list of ranks that hold different replicas of the same model partition. + +**Tensor Parallelism** + +- ``smp.tp_rank()`` : The rank of the process within its + tensor-parallelism group. +- ``smp.tp_size()`` : The size of the tensor-parallelism group. +- ``smp.get_tp_process_group()`` : Equivalent to + ``torch.distributed.ProcessGroup`` that contains the processes in the + current tensor-parallelism group. +- ``smp.CommGroup.TP_GROUP`` : The communication group corresponding to + the current tensor parallelism group. + +**Pipeline Parallelism** + +- ``smp.pp_rank()`` : The rank of the process within its + pipeline-parallelism group. +- ``smp.pp_size()`` : The size of the pipeline-parallelism group. +- ``smp.get_pp_process_group()`` : ``torch.distributed.ProcessGroup`` + that contains the processes in the current pipeline-parallelism group. +- ``smp.CommGroup.PP_GROUP`` : The communication group corresponding to + the current pipeline parallelism group. + +**Reduced-Data Parallelism** + +- ``smp.rdp_rank()`` : The rank of the process within its + reduced-data-parallelism group. +- ``smp.rdp_size()`` : The size of the reduced-data-parallelism group. +- ``smp.get_rdp_process_group()`` : ``torch.distributed.ProcessGroup`` + that contains the processes in the current reduced data parallelism + group. +- ``smp.CommGroup.RDP_GROUP`` : The communication group corresponding + to the current reduced data parallelism group. + +**Model Parallelism** + +- ``smp.mp_rank()`` : The rank of the process within its model-parallelism + group. +- ``smp.mp_size()`` : The size of the model-parallelism group. +- ``smp.get_mp_process_group()`` : ``torch.distributed.ProcessGroup`` + that contains the processes in the current model-parallelism group. +- ``smp.CommGroup.MP_GROUP`` : The communication group corresponding to + the current model parallelism group. + +**Data Parallelism** + +- ``smp.dp_rank()`` : The rank of the process within its data-parallelism + group. +- ``smp.dp_size()`` : The size of the data-parallelism group. +- ``smp.get_dp_process_group()`` : ``torch.distributed.ProcessGroup`` + that contains the processes in the current data-parallelism group. +- ``smp.CommGroup.DP_GROUP`` : The communication group corresponding to + the current data-parallelism group. + +.. _communication_api: Communication API -^^^^^^^^^^^^^^^^^ +----------------- The library provides a few communication primitives which can be helpful while developing the training script. These primitives use the following diff --git a/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst index 6e98e7fc66..3ca65c17cb 100644 --- a/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst +++ b/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst @@ -1,14 +1,8 @@ -.. 
admonition:: Contents - - - :ref:`pytorch_saving_loading` - - :ref:`pytorch_saving_loading_instructions` - PyTorch API =========== -**Supported versions: 1.7.1, 1.8.1** - -This API document assumes you use the following import statements in your training scripts. +To use the PyTorch-specific APIs for SageMaker distributed model parallism, +you need to add the following import statement at the top of your training script. .. code:: python @@ -19,10 +13,10 @@ This API document assumes you use the following import statements in your traini Refer to `Modify a PyTorch Training Script - `_ + `_ to learn how to use the following API in your PyTorch training script. -.. class:: smp.DistributedModel +.. py:class:: smp.DistributedModel() A sub-class of ``torch.nn.Module`` which specifies the model to be partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is @@ -42,7 +36,6 @@ This API document assumes you use the following import statements in your traini is \ ``model``) can only be made inside a ``smp.step``-decorated function. - Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can be performed by calling the \ ``DistributedModel`` object on the input tensors. @@ -56,7 +49,6 @@ This API document assumes you use the following import statements in your traini arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` or ``torch.autograd.backward``. - The API for ``model.backward`` is very similar to ``torch.autograd.backward``. For example, the following ``backward`` calls: @@ -90,7 +82,7 @@ This API document assumes you use the following import statements in your traini **Using DDP** - If DDP is enabled, do not not place a PyTorch + If DDP is enabled with the SageMaker model parallel library, do not not place a PyTorch ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because the ``DistributedModel`` wrapper will also handle data parallelism. @@ -284,6 +276,113 @@ This API document assumes you use the following import statements in your traini `register_comm_hook `__ in the PyTorch documentation. + **Behavior of** ``smp.DistributedModel`` **with Tensor Parallelism** + + When a model is wrapped by ``smp.DistributedModel``, the library + immediately traverses the modules of the model object, and replaces the + modules that are supported for tensor parallelism with their distributed + counterparts. This replacement happens in place. If there are no other + references to the original modules in the script, they are + garbage-collected. The module attributes that previously referred to the + original submodules now refer to the distributed versions of those + submodules. + + **Example:** + + .. code:: python + + # register DistributedSubmodule as the distributed version of Submodule + # (note this is a hypothetical example, smp.nn.DistributedSubmodule does not exist) + smp.tp_register_with_module(Submodule, smp.nn.DistributedSubmodule) + + class MyModule(nn.Module): + def __init__(self): + ... + + self.submodule = Submodule() + ... 
+ + # enabling tensor parallelism for the entire model + with smp.tensor_parallelism(): + model = MyModule() + + # here model.submodule is still a Submodule object + assert isinstance(model.submodule, Submodule) + + model = smp.DistributedModel(model) + + # now model.submodule is replaced with an equivalent instance + # of smp.nn.DistributedSubmodule + assert isinstance(model.module.submodule, smp.nn.DistributedSubmodule) + + If ``pipeline_parallel_degree`` (equivalently, ``partitions``) is 1, the + placement of model partitions into GPUs and the initial broadcast of + model parameters and buffers across data-parallel ranks take place + immediately. This is because it does not need to wait for the model + partition when ``smp.DistributedModel`` wrapper is called. For other + cases with ``pipeline_parallel_degree`` greater than 1, the broadcast + and device placement will be deferred until the first call of an + ``smp.step``-decorated function happens. This is because the first + ``smp.step``-decorated function call is when the model partitioning + happens if pipeline parallelism is enabled. + + Because of the module replacement during the ``smp.DistributedModel`` + call, any ``load_state_dict`` calls on the model, as well as any direct + access to model parameters, such as during the optimizer creation, + should be done **after** the ``smp.DistributedModel`` call. + + Since the broadcast of the model parameters and buffers happens + immediately during ``smp.DistributedModel`` call when the degree of + pipeline parallelism is 1, using ``@smp.step`` decorators is not + required when tensor parallelism is used by itself (without pipeline + parallelism). + + For more information about the library's tensor parallelism APIs for PyTorch, + see :ref:`smdmp-pytorch-tensor-parallel`. + + **Additional Methods of** ``smp.DistributedModel`` **for Tensor Parallelism** + + The following are the new methods of ``smp.DistributedModel``, in + addition to the ones listed in the + `documentation `__. + + .. function:: distributed_modules() + + - An iterator that runs over the set of distributed + (tensor-parallelized) modules in the model + + .. function:: is_distributed_parameter(param) + + - Returns ``True`` if the given ``nn.Parameter`` is distributed over + tensor-parallel ranks. + + .. function:: is_distributed_buffer(buf) + + - Returns ``True`` if the given buffer is distributed over + tensor-parallel ranks. + + .. function:: is_scaled_batch_parameter(param) + + - Returns ``True`` if the given ``nn.Parameter`` is operates on the + scaled batch (batch over the entire ``TP_GROUP``, and not only the + local batch). + + .. function:: is_scaled_batch_buffer(buf) + + - Returns ``True`` if the parameter corresponding to the given + buffer operates on the scaled batch (batch over the entire + ``TP_GROUP``, and not only the local batch). + + .. function:: default_reducer_named_parameters() + + - Returns an iterator that runs over ``(name, param)`` tuples, for + ``param`` that is allreduced over the ``DP_GROUP``. + + .. function:: scaled_batch_reducer_named_parameters() + + - Returns an iterator that runs over ``(name, param)`` tuples, for + ``param`` that is allreduced over the ``RDP_GROUP``. + .. 
class:: smp.DistributedOptimizer diff --git a/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch_tensor_parallel.rst b/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch_tensor_parallel.rst new file mode 100644 index 0000000000..413fc7cc46 --- /dev/null +++ b/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch_tensor_parallel.rst @@ -0,0 +1,835 @@ +.. _smdmp-pytorch-tensor-parallel: + +PyTorch API for Tensor Parallelism +================================== + +SageMaker distributed tensor parallelism works by replacing specific submodules +in the model with their distributed implementations. The distributed modules +have their parameters and optimizer states partitioned across tensor-parallel +ranks. This is to compute the same output as it would have been computed by +the original modules. Since tensor parallelism occurs across data-parallel +ranks, a rank might collect slices of the activations corresponding to the +data shards on other devices that are part of the same tensor parallelism group. + +You can enable or disable tensor parallelism for specific parts of the model. +Within the enabled parts, the replacements with distributed modules will take +place on a best-effort basis for those module supported for tensor parallelism. +Alternatively, you can directly import and use the library’s distributed +modules in the model definition. + +Some of the supported modules (such as ``smp.nn.Transformer``) are high-level +blocks that contain many operations. Because custom implementations +(as opposed to the built-in PyTorch modules) are typically used for these +high-level blocks, the library offers an API that you can use to register +specific distributed versions with such custom modules (provided that they +are functionally equivalent). This allows the library to automatically replace +the occurrences of such PyTorch modules with their distributed counterparts +provided by the library. +For more information, see the following topics. + +.. contents:: Topics + :depth: 3 + :local: + +.. _registering-tp-modules: + +Registering Tensor Parallelism Distributed Modules +-------------------------------------------------- + +Although PyTorch natively provides some of the commonly used (and +tensor-parallelizable) building blocks such as Transformer, users often +use custom implementations for such higher-level modules. To distribute +such modules with tensor parallelism, you need to register the +distributed modules to the custom module implementation in your class, +so that the library knows how to distribute the custom module. When you +register the distributed modules, make sure the custom module that you +use is functionally equivalent to the distributed module. You can verify +this by taking a look at the equivalent reference implementations in the +:ref:`smdmp-tp-appendix`. +These implementations are functionally equivalent to their distributed +versions in ``smp.nn`` module. + +.. decorator:: @smp.tp_register(dist_module, init_hook=None, forward_hook=None, return_hook=None) + + - A class decorator that registers the ``dist_module`` class with + the module class that it is attached to. The hooks can be used to + adapt to different interfaces used with ``__init__`` and + ``forward`` methods. + - **Arguments:** + + - ``dist_module``: A subclass of ``smp.nn.DistributedModule`` + that implements the distributed version of the module class the + decorator is attached to. Any distributed module class defined + in ``smp.nn`` module can be used. 
+ - ``init_hook``: A callable that translates the arguments of the + original module ``__init__`` method to an ``(args, kwargs)`` + tuple compatible with the arguments of the corresponding + distributed module ``__init__`` method. Must return a tuple, + whose first element is an iterable representing the positional + arguments, and second element is a ``dict`` representing the + keyword arguments. The input signature of the ``init_hook`` + must **exactly** match the signature of the original + ``__init__`` method (including argument order and default + values), except it must exclude ``self``. + - ``forward_hook``: A callable that translates the arguments of + the original module ``forward`` method to an ``(args, kwargs)`` + tuple compatible with the arguments of the corresponding + distributed module ``forward`` method. Must return a tuple, + whose first element is an iterable representing the positional + arguments, and second element is a ``dict`` representing the + keyword arguments. The input signature of the ``init_hook`` + must **exactly** match the signature of the original + ``forward`` method (including argument order and default + values), except it must exclude ``self``. + - ``return_hook``: A callable that translates the object returned + from the distributed module to the return object expected of + the original module. + + - **Example:** + + .. code:: python + + init_hook = lambda config: ((), config.to_dict()) + + # register smp.nn.DistributedTransformer + # as the distributed version of MyTransformer + @smp.tp_register(smp.nn.DistributedTransformer, init_hook=init_hook) + class MyTransformer(nn.Module): + def __init__(self, config): + ... + + def forward(self, hidden_states, attention_mask): + ... + +.. function:: smp.tp_register_with_module(module_cls, dist_module, init_hook=None, forward_hook=None, return_hook=None) + + - When you do not have direct access to model definition code, you + can use this API to similarly register a distributed module with + an existing module class. + + - **Arguments:** + + - ``module_cls``: The existing module class that will be + distributed. + - ``dist_module``: A subclass of ``smp.nn.DistributedModule`` + that implements the distributed version of the module class the + decorator is attached to. Any distributed module class defined + in ``smp.nn`` module can be used. + - ``init_hook``: A callable that translates the arguments of the + original module ``__init__`` method to an ``(args, kwargs)`` + tuple compatible with the arguments of the corresponding + distributed module ``__init__`` method. Must return a tuple, + whose first element is an iterable representing the positional + arguments, and second element is a ``dict`` representing the + keyword arguments. The input signature of the ``init_hook`` + must **exactly** match the signature of the original + ``__init__`` method (including argument order and default + values), except it must exclude ``self``. + - ``forward_hook``: A callable that translates the arguments of + the original module ``forward`` method to an ``(args, kwargs)`` + tuple compatible with the arguments of the corresponding + distributed module ``forward`` method. Must return a tuple, + whose first element is an iterable representing the positional + arguments, and second element is a ``dict`` representing the + keyword arguments. 
The input signature of the ``init_hook`` + must **exactly** match the signature of the original + ``forward`` method (including argument order and default + values), except it must exclude ``self``. + - ``return_hook``: A callable that translates the object returned + from the distributed module to the return object expected of + the original module. + + - **Example:** + + .. code:: python + + from somelibrary import MyTransformer + + init_hook = lambda config: ((), config.to_dict()) + + # register smp.nn.DistributedTransformer as the distributed version of MyTransformer + smp.tp_register_with_module(MyTransformer, + smp.nn.DistributedTransformer, + init_hook=init_hook) + +.. _smdmp-supported-modules-for-tp: + +Supported Modules for Tensor Parallelism +---------------------------------------- + +The following modules are supported for tensor +parallelism. + +- ``smp.nn.DistributedLinear`` (implements ``nn.Linear``) +- ``smp.nn.DistributedTransformerLMHead`` +- ``smp.nn.DistributedTransformer`` +- ``smp.nn.DistributedTransformerLayer`` +- ``smp.nn.DistributedAttentionLayer`` +- ``smp.nn.DistributedTransformerOutputLayer`` +- ``smp.nn.DistributedEmbedding`` + +.. contents:: Topics + :depth: 3 + :local: + +.. _tp-module-api: + +Tensor Parallelism Module APIs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. class:: smp.nn.DistributedLinear(in_features, out_features) + + - Tensor-parallel implementation of the ``nn.Linear`` class. + Functionally equivalent to an ``nn.Linear`` module with the same + ``in_features`` and ``out_features``. In other words, + ``in_features`` and ``out_features`` are the number of *global* + channels across tensor-parallel ranks. + - **Arguments:** + + - ``in_features``: The total number of input channels for the + linear layer across all tensor-parallel ranks. + - ``out_features``: The total number of output channels for the + linear layer across all tensor-parallel ranks. + +.. class:: smp.nn.DistributedTransformerLMHead(num_layers=12, num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, vocab_size=30522, num_positions=1024, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, num_token_types=0, causal_mask_size=None, add_cross_attention=False, add_lm_head=True, initializer_range=0.02, use_normal_initialization=False, pre_layernorm=False, post_layernorm=True) + + - Constructs a distributed transformer model, including embeddings + and a single LM head. A word embedding of size + ``(vocab_size, hidden_size)`` is created, as well as a positional + embedding of size ``(num_positions, hidden_size)``, and the + embeddings are added together. If ``num_token_types`` is larger + than 0, a separate embedding of size + ``(num_token_types, hidden_size)`` is created, and further added + on top. + - The embeddings are fed through a ``DistributedTransformer``, and + if ``add_lm_head`` is ``True``, the output passes through a single + LM head, which is a linear module without bias whose weight is + tied to the word embeddings. + - See ``DistributedTransformerLayer`` for a description of the rest + of the arguments. + - **Methods:** + + - ``forward(self, inputs)`` + + - If ``add_cross_attention`` is ``True``, ``inputs`` must be a + tuple + ``(input_ids, attention_mask, token_type_ids, position_ids, cross_states, cross_states, cross_mask, labels)``. + - Otherwise, ``inputs`` must be a tuple + ``(input_ids, attention_mask, token_type_ids, position_ids, labels)``. 
+ - If ``token_type_ids`` is ``None``, token type embedding will + not be used. + - ``input_ids`` is assumed to be of shape ``[N, S]``, where + ``N`` is the batch size and ``S`` is sequence length. + - ``attention_mask`` is assumed to be a 0-1 tensor of shape + ``[N, S]``, where 1 represents a masked position. + +.. class:: smp.nn.DistributedTransformer(num_layers=12, num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, causal_mask_size=None, add_cross_attention=False, pre_layernorm=False, post_layernorm=True) + + - A sequence of ``smp.nn.DistributedTransformerLayer``\ s, whose + number is given by ``num_layers`` argument. For the other + arguments and methods, refer to + ``smp.nn.DistributedTransformerLayer``. + - If both ``pre_layernorm`` and ``post_layernorm`` are ``True``, + layer normalization is applied to both the input and the output of + the ``DistributedTransformer``, in addition to the intermediate + attention and transformer-output layers. + +.. class:: smp.nn.DistributedTransformerLayer(num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, causal_mask_size=None, add_cross_attention=False, pre_layernorm=False, post_layernorm=True) + + - Tensor-parallel implementation of a single transformer layer. + Number of attention heads, hidden size, and intermediate size + refer to the global quantities across all tensor-parallel ranks. + - **Arguments:** + + - ``num_attention_heads``: The total number of attention heads + across tensor-parallel ranks + - ``attention_head_size``: The number of channels of a single + attention head. + - ``hidden_size``: The hidden dimension of the transformer. The + input tensor ``hidden_states`` is assumed to have its last + dimension size equal to ``hidden_size``. + - ``intermediate_size``: The number of output channels in the + first linear transformation of the transformer output layer. + ``DistributedTransformerOutputLayer`` first maps + ``hidden_size`` dimensions of its input tensor into + ``intermediate_size`` dimensions, and then maps it back into + ``hidden_size`` dimensions. + - ``attention_dropout_prob``: The dropout probability applied to + the attention probabilities. + - ``hidden_dropout_prob``: The dropout probability used in + dropout layers other than the one applied to the attention + probabilities. + - ``activation``: Choice of activation function to use at the + output layer. Must be ``"gelu"`` or ``"relu"``. + - ``layernorm_epsilon``: The epsilon added to the denominator of + layer normalization for numerical stability. + - ``initializer_range``: If ``use_normal_initialization`` is + ``True``, the standard deviation of the normal random variable + to initialize the weights with. + - ``use_normal_initialization``: If ``True``, the weights are + initialized with normal distribution with standard deviation + given by ``initializer_range``. Otherwise, default PyTorch + initialization is used. + - ``causal_mask_size``: If ``None``, no causal mask is used on + attentions. Otherwise, should be set to maximum sequence length + to apply a causal mask to the attention scores. This is used, + for instance, in GPT-2. 
+ - ``add_cross_attention``: If ``True``, a cross-attention layer + will be added after the self-attention block. The + cross-attention layer computes the attention keys and values + based on the ``cross_states`` input (instead of + ``hidden_states`` input, as in self-attention. This is used in + the decoder block of encoder-decoder architectures. For + encoder-only architectures that only use self-attention, this + should be kept ``False``. + - ``pre_layernorm``: If ``True``, inserts layer normalization at + the input. At least one of ``pre_layernorm`` and + ``post_layernorm`` must be ``True``. + - ``post_layernorm``: If ``True``, inserts layer normalization at + the output. At least one of ``pre_layernorm`` and + ``post_layernorm`` must be ``True``. + + - **Methods:** + + - ``forward(self, inputs)``: Forward pass for the transformer + layer. + + - **Arguments:** + + - If ``add_cross_attention=False``, ``inputs`` must be a + tuple ``(hidden_states, attention_mask)``, where + ``hidden_states`` is assumed to be a tensor of dimensions + ``[N, S, H]``, where ``N`` is batch size, ``S`` is + sequence length, and ``H`` is ``hidden_size``. + ``attention_mask`` is assumed to be a tensor of + dimensions ``[N, 1, 1, S]``, where ``N`` is the batch + size, and ``S`` is the sequence length. + - If ``add_cross_attention=True``, ``inputs`` must be a + tuple + ``(hidden_states, cross_states, attention_mask, cross_mask)``, + where ``hidden_states`` is assumed to be a tensor of + dimensions ``[N, S_1, H]``, where ``N`` is batch size, + ``S_1`` is sequence length, and ``H`` is ``hidden_size``. + ``cross_states`` is assumed to be a tensor of size + ``[N, S_2, H]``, similarly interpreted. + ``attention_mask`` is assumed to be a tensor of + dimensions ``[N, 1, 1, S_1]``, where ``N`` is the batch + size, and ``S_1`` is the sequence length, and + ``cross_mask`` is assumed to be a tensor of size + ``[N, 1, 1, S_2]``. Keys and values for the attention + heads in the cross-attention layer (but not the + self-attention layer) are computed using + ``cross_states``, and ``cross_mask`` is applied as the + attention mask in the cross-attention layer (but not the + self-attention layer). + + - **Returns:** + + - If ``add_cross_attention=False``, a tuple + ``(hidden_states, attention_mask)``, where + ``hidden_states`` is the output of the transformer, and + ``attention_mask`` is the same the ``attention_mask`` + argument. + - If ``add_cross_attention=True``, a tuple + ``(hidden_states, cross_states, attention_mask, cross_mask)``, + where ``hidden_states`` is the output of the transformer, + and the next three tensors are the same as the input + arguments. + +.. class:: smp.nn.DistributedAttentionLayer(num_attention_heads=32, attention_head_size=32, hidden_size=1024, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, cross_attention=False, causal_mask_size=None, pre_layernorm=False, post_layernorm=True) + + - A distributed implementation for the attention block. Includes the + computation of the self- or cross-attention (context layer), + followed by a linear mapping and dropout, which is optionally + followed by the residual-connection and layer normalization. + - **Arguments:** + + - See ``DistributedTransformerLayer`` for a description of the + arguments. + - If ``cross_attention`` is ``True``, computes the attentions + with respect to the ``cross_states`` tensor of the ``forward`` + method input tuple. 
+ + - **Methods:** + + - ``forward(self, inputs)``: Forward pass for the attention + layer. + + - **Arguments:** + + - If ``cross_attention=False``, ``inputs`` must be a tuple + ``(hidden_states, attention_mask)``, where + ``hidden_states`` is assumed to be a tensor of dimensions + ``[N, S, H]``, where ``N`` is batch size, ``S`` is + sequence length, and ``H`` is ``hidden_size``. + ``attention_mask`` is assumed to be a tensor of + dimensions ``[N, 1, 1, S]``, \***\* where ``N`` is the + batch size, and ``S`` is the sequence length. + - If ``cross_attention=True``, ``inputs`` must be a tuple + ``(hidden_states, cross_states, attention_mask)``, where + ``hidden_states`` is assumed to be a tensor of dimensions + ``[N, S_1, H]``, where ``N`` is batch size, ``S_1`` is + sequence length, and ``H`` is ``hidden_size``. + ``cross_states`` is assumed to be a tensor of size + ``[N, S_2, H]``, similarly interpreted. + ``attention_mask`` is assumed to be a tensor of + dimensions ``[N, 1, 1, S_2]``, where ``N`` is the batch + size, and ``S_2`` is the sequence length. Keys and values + for the attention heads are computed using + ``cross_states``. + + - **Returns:** + + - A single tensor that is the output of the attention + layer. + +.. class:: smp.nn.DistributedTransformerOutputLayer(hidden_size=1024, intermediate_size=4096, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, pre_layernorm=False, post_layernorm=True) + + - Distributed implementation of a single transformer output layer. A + single ``DistributedTransformerLayer`` with + ``add_cross_attention=False`` consists of a single + ``DistributedAttentionLayer`` immediately followed by a single + ``DistributedTransformerOutputLayer``. The latter linearly maps + the last channel of the input tensor from ``hidden_size`` to + ``intermediate_size``, and then maps it back to ``hidden_size``. + - **Arguments:** + + - See ``DistributedTransformerLayer`` for a description of the + arguments. + +.. class:: smp.nn.DistributedEmbedding(num_embeddings,embedding_dim, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False, _weight=None, initializer_range=0.02, _skip_allgather=False,_skip_scatter_and_merge=False,) + + - Distributed implementation of a single Embedding Layer. Currently + only supports splitting across the embedding_dim. + - **Arguments:** + + - See ``DistributedEmbedding`` for a description of the + arguments. + +.. _enabling-tp: + +Enabling Tensor Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There are two ways tensor parallelism can be enabled. + +First, you can use +the distributed module implementations in ``smp.nn`` module directly in +your model definition. See :ref:`smdmp-supported-modules-for-tp` +for a complete list of built-in distributed modules. Here is an example +of how this can be done: + +.. 
code:: python + + import torch.nn as nn + import smdistributed.modelparallel.torch as smp + + class TransformerModel: + def __init__(self): + self.embedding = nn.Embedding(vocab_size, hidden_size) + + # directly instantiate smp.nn.DistributedTransformer and use it + self.encoder = smp.nn.DistributedTransformer(num_layers, hidden_size, **kwargs) + + self.pooler = nn.Linear(hidden_size, hidden_size) + + def forward(self, hidden_states): + emb_out = self.embedding(hidden_states) + enc_out = self.encoder(emb_out) + return self.pooler(enc_out) + +Second, you can enable tensor parallelism for specific modules or blocks +of code, which will automatically enable tensor parallelism for the +supported modules within that scope. To do this, you can use the +following API: + +.. decorator:: smp.tensor_parallelism(enabled=True, **kwargs) + + - A context manager that enables or disables tensor parallelism for + any supported module that is created inside. If there are nested + contexts, the innermost will override the rest. If there are + multiple supported modules created within the context, where one + is the submodule of the other, only the outermost module will be + distributed. If a supported module shares weights with another + (supported or unsupported) module, or if its hyperparameters do + not support distribution (e.g., not divisible by the tensor + parallelism degree), tensor parallelism will **not** be enabled + for this module even if this API is used. + + **Example:** + + .. code:: python + + with smp.tensor_parallelism(): + self.m0 = nn.Linear(20, 20) # will be distributed + with smp.tensor_parallelism(enabled=False): + self.m1 = nn.Linear(20, 20) # will not be distributed + + - Keyword arguments `kwargs` can be used to modify the configurations of the distributed modules created inside the context. If a keyword argument provided here matches any `__init__` method arguments of a `DistributedModule` that substitutes a module created inside the `smp.tensor_parallelism` context, this keyword will override the value defined in the `init_hook`. + +.. function:: smp.set_tensor_parallelism(module, enabled=True, **kwargs) + + - Enables or disables tensor parallelism for the supported + submodules of ``module``. If enabling, the outermost supported + modules will be distributed. If disabling, tensor parallelism will + be disabled for the entire module subtree of ``module``. Unlike + the context manager, this API can be used after the model creation + (but before wrapping with :class:`smp.DistributedModel`), so direct + access to model definition code is not required. If a supported + module shares weights with another (supported or unsupported) + module, or if its hyperparameters do not support distribution + (e.g., not divisible by the tensor parallelism degree), tensor + parallelism will **not** be enabled for this module. + - Keyword arguments ``kwargs`` can be used to modify the + configurations of the distributed modules created inside the + context. If a keyword argument provided here matches any + ``__init__`` method arguments of a :class:`smp.DistributedModel` that + substitutes a module created inside the ``smp.tensor_parallelism`` + context, this keyword will override the value defined in the + ``init_hook``. + - **Example:** + + .. 
code:: python
+
+      model = MyModel()
+      smp.set_tensor_parallelism(model.encoder, True)
+      smp.set_tensor_parallelism(model.encoder.embedding, False)
+
+      # outermost supported submodules in model.encoder will be distributed, except for
+      # model.encoder.embedding
+      model = smp.DistributedModel(model)
+      optimizer = smp.DistributedOptimizer(optimizer)
+
+.. _activation-checkpointing-api:
+
+Activation Checkpointing APIs
+-----------------------------
+
+``smdistributed.modelparallel`` provides three APIs to enable
+activation checkpointing: one for checkpointing modules,
+one for checkpointing sequential modules, and
+one for checkpointing pretrained models.
+
+For a conceptual guide and examples, see
+`Activation Checkpointing `_
+in the *SageMaker Distributed Model Parallel developer guide*.
+
+.. class:: smdistributed.modelparallel.torch.patches.checkpoint.checkpoint(module, *args, preserve_rng_state=True)
+
+   - Checkpoints the module passed. Throws an error during manual
+     partitioning if not all children of the module are on the same rank
+     as the module itself, i.e., if the module tree is split across
+     multiple partitions. During auto-partitioning, if the module is
+     split across multiple partitions, then this call is ignored (with a
+     warning). Note that this call applies to the module instance only,
+     not to the module class.
+
+   - **Arguments:**
+
+      - ``module (Instance of nn.Module)``: The module to be
+        checkpointed. Note that unlike native checkpointing in
+        PyTorch, activation checkpointing in
+        ``smdistributed.modelparallel`` is at the granularity of a
+        module. A generic function cannot be passed here.
+      - ``args``: Tuple containing inputs to the module.
+      - ``preserve_rng_state (bool, default=True)``: Set to ``False``
+        to omit stashing and restoring the RNG state during each
+        checkpoint.
+
+.. class:: smdistributed.modelparallel.torch.patches.checkpoint.checkpoint_sequential(sequential_module, input, strategy="each", preserve_rng_state=True, pack_args_as_tuple=False)
+
+   - Checkpoints the modules inside
+     `nn.Sequential `__.
+     This can be used even if different layers that are part of the
+     sequential container lie on different partitions. Each layer of the
+     sequential module that is checkpointed must lie completely within
+     one partition. If this is not the case during manual partitioning,
+     then an error will be thrown. If this is not the case during auto
+     partitioning, a warning will be raised and this module will be run
+     without checkpointing.
+
+   - **Arguments**
+
+      - ``sequential_module (nn.Sequential)``: the sequential module to
+        be checkpointed.
+      - ``input (torch.Tensor or a tuple of torch.Tensors)``: input to
+        the module, which can be a tensor or a tuple of tensors. If a
+        tuple is passed, then ``pack_args_as_tuple`` should be set to
+        ``True``.
+      - ``strategy (string, default=“each”)``: Determines how many
+        layers of the sequential module are grouped together for one
+        checkpointing call. This determines how much memory can be
+        reduced. It can take the following values:
+
+         - ``each``: The default is to checkpoint each module inside
+           the sequential separately.
+         - ``contiguous``: Groups consecutive layers on the same
+           partition together. For example, if a sequential consists of
+           ``[a, b, c, d]`` where ``a, b`` are on ``pp_rank0`` and
+           ``c, d`` are on ``pp_rank1``, then this strategy checkpoints
+           ``a, b`` together and then ``c, d`` together. This
+           effectively means that the inputs of ``a``, the outputs of
+           ``b``, the inputs of ``c``, and the outputs of ``d`` stay in
+           memory; the remaining activations are recomputed.
+         - ``group_2, group_3, group_4, etc.``: More generally,
+           ``group_x`` where x is an integer. This strategy provides
+           more flexibility in how many layers to group together.
+           ``group_x`` groups x layers together on a best-effort basis:
+           it can group x layers together if there are x layers
+           consecutively on the same partition. For example,
+           ``[a, b, c, d, e]`` where ``a, b`` are on ``pp_rank0`` and
+           ``c, d, e`` are on ``pp_rank1``. If the strategy is
+           ``group_3``, then ``a, b`` are checkpointed together on
+           ``pp_rank0`` and ``c, d, e`` are checkpointed together on
+           ``pp_rank1``.
+
+      - ``preserve_rng_state (bool, default=True)``: Set to ``False``
+        to omit stashing and restoring the RNG state during each
+        checkpoint.
+      - ``pack_args_as_tuple (bool, default=False)``: To ensure that
+        backward works correctly, the autograd function has to unpack
+        any tuples received. If the checkpointed layer takes a tuple as
+        input, then this needs to be set to
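 ``True``.
+
+   - **Example:** The following is a minimal sketch (not taken from the
+     library documentation) of how ``checkpoint_sequential`` might be
+     called inside a module's ``forward`` method, assuming the module
+     path shown in the directive above is importable as-is and that the
+     module owns an ``nn.Sequential`` attribute named ``self.seq``:
+
+   .. code:: python
+
+      import torch.nn as nn
+      from smdistributed.modelparallel.torch.patches.checkpoint import checkpoint_sequential
+
+      class MyModule(nn.Module):
+          def __init__(self):
+              super().__init__()
+              self.seq = nn.Sequential(nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 32))
+
+          def forward(self, x):
+              # Instead of calling self.seq(x) directly, checkpoint the whole
+              # nn.Sequential; consecutive layers that land on the same partition
+              # are grouped into a single checkpointing call.
+              return checkpoint_sequential(self.seq, x, strategy="contiguous")
+
+.. 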
class:: smp.set_activation_checkpointing(module, preserve_rng_state=True, pack_args_as_tuple=False, strategy="each")
+
+   - This API is recommended when importing pretrained models from
+     libraries, such as PyTorch and Hugging Face Transformers. This is
+     particularly useful when you don’t have access to the model
+     definition code and are not able to replace a module call with
+     ``checkpoint``.
+
+   - **Arguments**:
+
+      - ``module (Instance of nn.Module or nn.Sequential)``: The module
+        to checkpoint.
+      - ``preserve_rng_state (bool, default=True)``: Set to ``False``
+        to omit stashing and restoring the RNG state during each
+        checkpoint.
+      - ``pack_args_as_tuple (bool, default=False)``: *Can only be
+        passed when module is a sequential module.* To ensure that
+        backward works correctly, the autograd function has to unpack
+        any tuples received. If the checkpointed layer takes a tuple as
+        input, then this needs to be set to ``True``.
+      - ``strategy (string, default=“each”)``: *Can only be passed when
+        module is a sequential module.* Determines how many layers of
+        the sequential module are grouped together for one
+        checkpointing call. This determines how much memory can be
+        reduced. It can take the following values:
+
+         - ``each``: The default is to checkpoint each module inside
+           the sequential separately.
+         - ``contiguous``: Groups consecutive layers on the same
+           partition together. For example, if a sequential consists of
+           ``[a, b, c, d]`` where ``a, b`` are on ``pp_rank0`` and
+           ``c, d`` are on ``pp_rank1``, then this strategy checkpoints
+           ``a, b`` together and then ``c, d`` together. This
+           effectively means that the inputs of ``a``, the outputs of
+           ``b``, the inputs of ``c``, and the outputs of ``d`` stay in
+           memory, and the rest of the activations are recomputed.
+         - ``group_2, group_3, group_4, etc.``: More generally,
+           ``group_x`` where x is an integer. This strategy provides
+           more flexibility in how many layers to group together.
+           ``group_x`` groups x layers together on a best-effort basis
+           if there are x layers consecutively in the same partition.
+           **Example**: Assume a module with layers ``[a, b, c, d, e]``.
+           The layers ``a`` and ``b`` are on ``pp_rank0``, and ``c``,
+           ``d``, and ``e`` are on ``pp_rank1``. If the strategy is
+           ``group_3``, then ``a``, ``b`` are checkpointed together on
+           ``pp_rank0``, and ``c``, ``d``, ``e`` are checkpointed
+           together on
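 ``pp_rank1``.
+
+   - **Example:** A minimal, hypothetical sketch of applying this API to
+     a pretrained model whose definition code you cannot edit. The
+     attribute name ``model.seq_layers`` is assumed for illustration and
+     is not part of the API:
+
+   .. code:: python
+
+      import smdistributed.modelparallel.torch as smp
+
+      # model.seq_layers stands in for an nn.Sequential inside an imported,
+      # pretrained model. Consecutive layers placed on the same partition are
+      # grouped into a single checkpointing call.
+      smp.set_activation_checkpointing(model.seq_layers, strategy="contiguous")
+
+.. 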
_smdmp-tp-appendix: + +Appendix: Reference Implementations for Modules +----------------------------------------------- + +The following are reference implementations for transformer-related +modules. Note that this is not the actual ``smdistributed`` source code, +but the distributed implementations provided in the library are the +distributed versions of these reference implementations, and can be used +to determine whether the distributed modules perform the same operations +as the custom modules in your script. + +To keep the implementations simple, we only assume keyword arguments, +and assume the existence of a method ``parse_args(kwargs)``, which +parses the arguments to ``__init__`` methods and sets the relevant +attributes of the module, such as ``hidden_size`` and +``num_attention_heads``. + +``smp.nn.DistributedTransformer`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: python + + class Transformer(nn.Module): + def __init__(self, **kwargs): + super(Transformer, self).__init__() + self.parse_args(kwargs) + + self.layers = [] + for l in range(self.num_layers): + self.layers.append(TransformerLayer(**kwargs)) + + self.seq_layers = nn.Sequential(*self.layers) + + def forward(self, inp): + return self.seq_layers(inp) + +``smp.nn.DistributedTransformerLayer`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: python + + class TransformerLayer(nn.Module): + def __init__(self, **kwargs): + super(TransformerLayer, self).__init__() + self.parse_args(kwargs) + + self.attention = AttentionLayer(**kwargs) + self.output = TransformerOutputLayer(**kwargs) + + if self.add_cross_attention: + self.cross_attention = AttentionLayer(cross_attention=True, **kwargs) + + def forward(self, inp): + if self.add_cross_attention: + hidden_states, cross_states, attention_mask, cross_mask = inp + else: + hidden_states, attention_mask = inp + + attention_output = self.attention((hidden_states, attention_mask)) + if self.add_cross_attention: + attention_output = self.cross_attention((attention_output, + cross_states, + cross_mask)) + + output = self.output(attention_output) + + if self.add_cross_attention: + return output, cross_states, attention_mask, cross_mask + else: + return output, attention_mask + +``smp.nn.DistributedAttentionLayer`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code:: python + + class AttentionLayer(nn.Module): + def __init__(self, **kwargs): + super(AttentionLayer, self).__init__() + self.parse_args(kwargs) + self.attention_head_size = self.hidden_size // self.num_attention_heads + + self.query = nn.Linear(self.hidden_size, self.hidden_size) + self.key = nn.Linear(self.hidden_size, self.hidden_size) + self.value = nn.Linear(self.hidden_size, self.hidden_size) + self.dense = nn.Linear(self.hidden_size, self.hidden_size) + + self.dropout1 = nn.Dropout(self.attention_dropout_prob) + self.dropout2 = nn.Dropout(self.hidden_dropout_prob) + + if self.pre_layernorm: + self.pre_layernorm = nn.LayerNorm(self.hidden_size, + eps=self.layernorm_epsilon) + + if self.post_layernorm: + self.layernorm = nn.LayerNorm(self.hidden_size, + eps=self.layernorm_epsilon) + + def transpose(self, tensor, key=False): + shape = tensor.size()[:-1] + + (self.num_attention_heads, self.attention_head_size) + tensor = torch.reshape(tensor, shape) + if key: + return tensor.permute(0, 2, 3, 1) + else: + return tensor.permute(0, 2, 1, 3) + + def forward(self, inp): + if self.cross_attention: + hidden_states, cross_states, attention_mask = inp + else: + hidden_states, attention_mask = inp + + if self.pre_layernorm: + norm_states = self.pre_layernorm(hidden_states) + else: + norm_states = hidden_states + + query_layer = self.query(norm_states) + + if self.cross_attention: + key_layer = self.key(cross_states) + value_layer = self.value(cross_states) + else: + key_layer = self.key(norm_states) + value_layer = self.value(norm_states) + + query_layer = self.transpose(query_layer) + key_layer = self.transpose(key_layer, key=True) + value_layer = self.transpose(value_layer) + + attention_scores = torch.matmul(query_layer, key_layer) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if not self.cross_attention and self.causal_mask is not None: + attention_scores = self.apply_causal_mask(attention_scores) + + attention_scores = attention_scores + attention_mask + + attention_probs = F.softmax(attention_scores, dim=-1) + attention_probs = self.dropout1(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3) + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.local_attention_size,) + context_layer = torch.reshape(context_layer, new_context_layer_shape) + + self_attention = self.dense(context_layer) + self_attention = self.dropout2(self_attention) + + if self.post_layernorm: + return self.layernorm(self_attention + hidden_states) + else: + return self_attention + +``smp.nn.DistributedTransformerOutputLayer`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code:: python + + class TransformerOutputLayer(nn.Module): + def __init__(self, **kwargs): + super(TransformerOutputLayer, self).__init__() + self.parse_args(kwargs) + + self.dense1 = nn.Linear(self.hidden_size, self.intermediate_size) + self.dense2 = nn.Linear(self.intermediate_size, self.hidden_size) + + self.dropout = nn.Dropout(self.attention_dropout_prob) + + if self.pre_layernorm: + self.pre_layernorm = nn.LayerNorm(self.hidden_size, + eps=self.layernorm_epsilon) + + if self.post_layernorm: + self.layernorm = nn.LayerNorm(self.hidden_size, + eps=self.layernorm_epsilon) + + def forward(self, inp): + if self.pre_layernorm: + norm_inp = self.pre_layernorm(inp) + else: + norm_inp = inp + + dense1_output = self.dense1(norm_inp) + if self.activation == "gelu": + act_output = F.gelu(dense1_output) + else: + act_output = F.relu(dense1_output) + + dense2_output = self.dense2(act_output) + output = self.dropout(dense2_output) + + if self.post_layernorm: + return self.layernorm(inp + output) + else: + return output diff --git a/doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst index 6eefe5cad8..7f21f7a557 100644 --- a/doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst +++ b/doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst @@ -1,9 +1,8 @@ TensorFlow API ============== -**Supported version: 2.3.1, 2.4.1, 2.5.0** - -**Important**: This API document assumes you use the following import statement in your training scripts. +To use the TensorFlow-specific APIs for SageMaker distributed model parallism, +you need to add the following import statement at the top of your training script. .. code:: python @@ -13,8 +12,8 @@ TensorFlow API Refer to `Modify a TensorFlow Training Script - `_ - to learn how to use the following API in your TensorFlow training script. + `_ + to learn how to use the following APIs in your TensorFlow training script. .. class:: smp.DistributedModel :noindex: diff --git a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_common_api.rst new file mode 100644 index 0000000000..625a7fcbf1 --- /dev/null +++ b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_common_api.rst @@ -0,0 +1,488 @@ +.. admonition:: Contents + + - :ref:`communication_api` + - :ref:`mpi_basics` + +Common API +========== + +The following SageMaker distribute model parallel APIs are common across all frameworks. + +**Important**: This API document assumes you use the following import statement in your training scripts. + +**TensorFlow** + +.. code:: python + + import smdistributed.modelparallel.tensorflow as smp + +**PyTorch** + +.. code:: python + + import smdistributed.modelparallel.torch as smp + + +.. function:: smp.init( ) + :noindex: + + Initialize the library. Must be called at the beginning of training script. + +.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) + :noindex: + + A decorator that must be placed over a function that represents a single + forward and backward pass (for training use cases), or a single forward + pass (for evaluation use cases). Any computation that is defined inside + the ``smp.step``-decorated function is executed in a pipelined manner. + + By default, every tensor input to the function is split across its batch + dimension into a number of microbatches specified while launching the + training job. 
This behavior can be customized through the arguments to
+   ``smp.step``, described below. The library then orchestrates the execution of
+   each microbatch across all partitions, based on the chosen pipeline
+   type.
+
+   In a typical use case, the forward pass and back-propagation are executed
+   inside an \ ``smp.step``-decorated function, and gradients, loss, and
+   other relevant metrics (such as accuracy) are returned from the
+   ``smp.step``-decorated function.
+
+   Any gradient post-processing operation, such as gradient clipping and
+   allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or
+   ``optimizer.step`` (for PT) should be applied on the gradients returned
+   from the ``smp.step`` function, and not inside the ``smp.step``
+   function. This is because every operation inside ``smp.step`` is
+   executed once per microbatch, so having these operations inside
+   ``smp.step`` can either be inefficient (in the case of allreduce), or
+   lead to wrong results (in the case of ``apply_gradients`` /
+   ``optimizer.step``).
+
+   If the objects returned from the ``smp.step``-decorated function contain
+   ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to
+   ``StepOutput`` objects. A ``StepOutput`` object encapsulates all
+   versions of the tensor across different microbatches
+   (see the ``StepOutput`` entry for more information).
+
+   An argument to the ``smp.step``-decorated function must be a tensor or
+   an instance of ``list``, ``tuple``, ``dict``, or ``set`` for it to be
+   split across microbatches. If your object does not fall into this
+   category, you can make the library split your object by implementing
+   an ``smp_slice`` method.
+
+   Below is an example of how to use it with PyTorch.
+
+   .. code:: python
+
+      class CustomType:
+          def __init__(self, tensor):
+              self.data = tensor
+
+          # The library calls this method to slice the object, passing in the total
+          # number of microbatches (num_mb) and the current microbatch index (mb).
+          def smp_slice(self, num_mb, mb, axis):
+              dim_size = list(self.data.size())[axis]
+
+              split_size = dim_size // num_mb
+              sliced_tensor = self.data.narrow(axis, mb * split_size, split_size)
+              return CustomType(sliced_tensor)
+
+      custom_obj = CustomType(torch.ones(4,))
+
+      @smp.step()
+      def step(custom_obj):
+          loss = model(custom_obj)
+          model.backward(loss)
+          return loss
+
+
+   **Important:** ``smp.step`` splits the batch into microbatches, and
+   executes everything inside the decorated function once per microbatch.
+   This might affect the behavior of batch normalization, any operation
+   that explicitly uses the batch size information, or any other Python
+   code that is expected to run once.
+
+   **TensorFlow-specific behavior**
+
+   ``smp.step`` is a wrapper that
+   inherits from and extends the behavior of ``tf.function``, and as such,
+   all the caveats that apply to the use of ``tf.function``\ s also apply
+   to ``smp.step``. In particular, any operation that is inside
+   ``smp.step`` executes in graph mode, and not eager mode.
+
+   ``smp.step`` traces the wrapped function on the first call, and re-traces
+   it every time one of the tensor arguments changes its shape or dtype, or
+   for every new value of a Python argument, if there is one. Tracing is
+   expensive, so such scenarios should be avoided as much as possible or,
+   alternatively, an ``input_signature`` argument must be provided. 
For + more information on the usage of ``tf.function``, refer to the + TensorFlow documentation: + + - https://www.tensorflow.org/api_docs/python/tf/function\ + - https://www.tensorflow.org/guide/function\ + + Each ``smp.step`` decorated function must have a return value that depends on the + output of ``smp.DistributedModel``. + + **Common parameters** + + - ``non_split_inputs`` (``list``): The list of arguments to the decorated function + that should not be split along the batch dimension. Should be used + for all input tensors that do not have a batch dimension. Should be a + list of argument names as ``str``, as they appear in the signature of + the ``smp.step``-decorated function. By default it is considered an + empty list. + + - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch + axis. The keys should be the argument names as ``str``, as they + appear in the signature of the ``smp.step``-decorated function.  By + default all batch axes are assumed to be the 0-axis. + + **TensorFlow-only parameters** + + - All arguments of ``tf.function``. Note: + The \ ``experimental_compile`` argument of ``tf.function`` may not + work as expected with ``smp.step``, since it interferes with + pipelining and model partitioning. To enable XLA with the library, you can + instead use \ ``tf.config.optimizer.set_jit(True)``. + + **PyTorch-only parameters** + + - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on + all returned ``torch.Tensor`` outputs. Setting it to ``False`` + increases memory consumption, unless ``detach()`` is manually called + on the returned tensors, because the model graph is not cleared from + memory after the training step. Set to \ ``True`` by default. + + **Returns** + + - The same object(s) returned from the decorated function. All + returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or + ``torch.Tensor`` objects (for PT) are wrapped inside + a \ ``StepOutput`` object, even when they are inside a Python + ``list``, ``tuple``, or ``dict``. + + + +.. class:: StepOutput + :noindex: + + + A class that encapsulates all versions of a ``tf.Tensor`` + or \ ``torch.Tensor`` across all microbatches. + + When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside + ``smp.step``, different versions of the tensor are computed for each + microbatch. + + When this tensor is returned from ``smp.step`` and is accessed outside + of the decorated function, it appears as a ``StepOutput`` object, which + contains all such versions. For example, + + - In the case of Tensorflow, the gradient for a particular + ``tf.Variable`` is computed on each microbatch individually, and if + this gradient is returned from ``smp.step``, all gradients for this + ``tf.Variable`` become part of the same ``StepOutput`` object. The + ``StepOutput`` class offers the following API for commonly-used + post-processing operations on such tensors. + - In the case of PyTorch, the loss for each microbatch is computed + individually and all the ``torch.Tensor``\ s that represent the loss + for different microbatches become part of same ``StepOutput`` object, + if loss is returned from the ``smp.step`` function. + + + The ``StepOutput`` class offers the following API for commonly-used + post-processing operations on tensors. + + .. data:: StepOutput.outputs + :noindex: + + Returns a list of the underlying tensors, indexed by microbatch. + + .. 
function:: StepOutput.reduce_mean( )
+      :noindex:
+
+      Returns a ``tf.Tensor`` / ``torch.Tensor`` that averages the
+      constituent ``tf.Tensor``\ s / ``torch.Tensor``\ s. This is commonly
+      used for averaging loss and gradients across microbatches.
+
+   .. function:: StepOutput.reduce_sum( )
+      :noindex:
+
+      Returns a ``tf.Tensor`` /
+      ``torch.Tensor`` that sums the constituent
+      ``tf.Tensor``\ s/\ ``torch.Tensor``\ s.
+
+   .. function:: StepOutput.concat( )
+      :noindex:
+
+      Returns a
+      ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the
+      batch dimension using ``tf.concat`` / ``torch.cat``.
+
+   .. function:: StepOutput.stack( )
+      :noindex:
+
+      Applies the ``tf.stack`` / ``torch.stack``
+      operation to the list of constituent ``tf.Tensor``\ s /
+      ``torch.Tensor``\ s.
+
+   **TensorFlow-only methods**
+
+   .. function:: StepOutput.merge( )
+      :noindex:
+
+      Returns a ``tf.Tensor`` that
+      concatenates the constituent ``tf.Tensor``\ s along the batch
+      dimension. This is commonly used for merging the model predictions
+      across microbatches.
+
+   .. function:: StepOutput.accumulate(method="variable", var=None)
+      :noindex:
+
+      Functionally the same as ``StepOutput.reduce_mean()``. However, it is
+      more memory-efficient, especially for large numbers of microbatches,
+      since it does not wait for all constituent \ ``tf.Tensor``\ s to be
+      ready to start averaging them, thereby saving memory.
+
+      In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end
+      up being more memory-efficient than ``StepOutput.accumulate()``.
+
+      **Parameters**
+
+      - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``):
+        If ``"add_n"`` or ``"accumulate_n"``, the library uses
+        ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement
+        accumulation. If ``"variable"``, the library uses an internal
+        ``tf.Variable`` into which to accumulate the tensors. Default is
+        \ ``"variable"``. Note: Memory usage behavior of these choices can
+        depend on the model and implementation.
+
+      - ``var``: A ``tf.Variable`` into which, if provided, the library
+        accumulates the tensors. If \ ``None``, the library internally
+        creates a variable. If ``method`` is not ``"variable"``, this
+        argument is ignored.
+
+.. _mpi_basics:
+   :noindex:
+
+MPI Basics
+^^^^^^^^^^
+
+The library exposes the following basic MPI primitives to its Python API:
+
+- ``smp.rank()``: The rank of the current process.
+- ``smp.size()``: The total number of processes.
+- ``smp.mp_rank()``: The rank of the process among the processes that
+  hold the current model replica.
+- ``smp.dp_rank()``: The rank of the process among the processes that
+  hold different replicas of the same model partition.
+- ``smp.dp_size()``: The total number of model replicas.
+- ``smp.local_rank()``: The rank among the processes on the current
+  instance.
+- ``smp.local_size()``: The total number of processes on the current
+  instance.
+- ``smp.get_mp_group()``: The list of ranks over which the current
+  model replica is partitioned.
+- ``smp.get_dp_group()``: The list of ranks that hold different
+  replicas of the same model partition.
+
+.. _communication_api:
+   :noindex:
+
+Communication API
+^^^^^^^^^^^^^^^^^
+
+The library provides a few communication primitives that can be helpful while
+developing the training script. These primitives use the following
+``enum``\ s as arguments to specify which processes the communication
+should
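 involve.
+
+For example, a minimal sketch (not taken from the library documentation) of a
+point-to-point exchange built from the primitives and ``enum``\ s described
+below, assuming at least two processes:
+
+.. code:: python
+
+   if smp.rank() == 0:
+       # send an arbitrary picklable object to world rank 1
+       smp.send({"step": 0}, 1, smp.RankType.WORLD_RANK)
+   elif smp.rank() == 1:
+       obj = smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK)
+
+**Helper structures**
+
+.. 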
data:: smp.CommGroup + :noindex: + + An ``enum`` that takes the values + ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. + These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, + and ``smp.DP_GROUP`` respectively. + + - ``CommGroup.WORLD``: Represents the entire group of processes used in + training + - ``CommGroup.MP_GROUP``: Represents the group of processes that hold + the same model replica as the current process. The processes in a + single ``MP_GROUP`` collectively store an entire replica of the + model. + - ``CommGroup.DP_GROUP``: Represents the group of processes that hold + the same model partition as the current process. The processes in a + single ``DP_GROUP`` perform data parallelism/allreduce among + themselves. + +.. data:: smp.RankType + :noindex: + + An ``enum`` that takes the values + ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. + + - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as + the rank of the process across all processes used in training. + - ``RankType.MP_RANK``: The associated rank is to be interpreted as the + rank of the process within the ``MP_GROUP``. + - ``RankType.DP_RANK``: The associated rank is to be interpreted as the + rank of the process within the ``DP_GROUP``. + + +**Communication primitives:** + +.. function:: smp.broadcast(obj, group) + :noindex: + + Sends the object to all processes in the + group. The receiving process must call ``smp.recv_from`` to receive the + sent object. + + **Inputs** + + - ``obj``: An arbitrary picklable Python object that will be broadcast. + + - ``group``: A ``CommGroup`` argument that represents to which group of + processes the object will be sent. + + **Notes** + + - When you use ``broadcast`` on the sender process, there needs + to be an accompanying ``smp.recv_from()`` call on the receiver + processes. + + - This is a synchronous call; the ``broadcast`` statement + returns only after all ranks participating in the call have made a + matching ``recv_from`` call. + + **Example** + + .. code:: python + + if smp.rank() == 0: +     smp.broadcast(something, group=smp.CommGroup.WORLD) + else: +     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) + +.. function:: smp.send(obj, dest_rank, rank_type) + :noindex: + + Sends the object ``obj`` to + ``dest_rank``, which is of a type specified by ``rank_type``. + + **Inputs** + + - ``obj``: An arbitrary picklable Python object that will be sent. + + - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. + + - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how + ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 + and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process + with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current + process. + + **Notes** + + - Note: \ This is a synchronous call; the ``send`` statement returns + only after the destination rank has made a matching + ``recv_from`` call. + +.. function:: smp.recv_from(src_rank, rank_type) + :noindex: + + Receive an object from a peer process. Can be used with a matching + ``smp.send`` or a ``smp.broadcast`` call. + + **Inputs** + + - ``src_rank`` (``int``): An integer denoting rank of the sending process. + + - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how + ``dest_rank`` is to be interpreted. 
For example if ``src_rank`` is 1 + and ``rank_type`` is ``MP_RANK``, then the object is received from + the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the + current process. + + **Returns** + + Returns the python object that is sent by the peer process. + + **Notes** + + - Note: This is a synchronous call; the ``recv_from`` statement returns + only after the source rank has made a matching ``send`` or + ``broadcast`` call, and the object is received. + +.. function:: smp.allgather(obj, group) + :noindex: + + A collective call that gathers all the + submitted objects across all ranks in the specified ``group``. Returns a + list whose ``i``\ th index contains the object submitted by the + ``i``\ th rank in ``group``. + + **Inputs** + + - ``obj``: An arbitrary picklable Python object that will be + allgathered. + + - ``group`` : A ``CommGroup`` argument that represents which group of + processes participate in ``allgather``. + + **Notes** + + - Note: This is a synchronous call; the ``allgather`` statement returns + only after all ranks participating in the call have made a matching + ``allgather`` call, and all the objects are received at the current + rank. + + **Examples** + + .. code:: python + + # assuming mp_size() == 2 + + if smp.mp_rank() == 0: +     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] + else: +     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] + +.. function:: smp.barrier(group=smp.WORLD) + :noindex: + + A statement that hangs until all + processes in the specified group reach the barrier statement, similar to + ``MPI_Barrier()``. + + **Inputs** + + - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of + processes participating in the barrier call. Defaults to + ``smp.WORLD``. + + **Examples** + + - Assume there are 8 processes and 2 model partitions, and + therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If + the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its + group argument, then each process only waits until the other process + of its own ``mp_group`` reaches that point. It does not wait for + processes outside that ``mp_group``. + +.. function:: smp.dp_barrier() + :noindex: + + Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. + Waits for the processes in the same \ ``dp_group`` as + the current process to reach the same point in execution. + +.. function:: smp.mp_barrier() + :noindex: + + Same as passing ``smp.MP_GROUP`` to + ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as + the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_pytorch.rst new file mode 100644 index 0000000000..d2fcb95954 --- /dev/null +++ b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_pytorch.rst @@ -0,0 +1,572 @@ +.. admonition:: Contents + + - :ref:`pytorch_saving_loading` + - :ref:`pytorch_saving_loading_instructions` + +PyTorch API +=========== + +**Supported versions: 1.7.1, 1.8.1** + +This API document assumes you use the following import statements in your training scripts. + +.. code:: python + + import smdistributed.modelparallel.torch as smp + + +.. tip:: + + Refer to + `Modify a PyTorch Training Script + `_ + to learn how to use the following API in your PyTorch training script. + +.. 
class:: smp.DistributedModel
+   :noindex:
+
+   A sub-class of ``torch.nn.Module`` which specifies the model to be
+   partitioned. Accepts a ``torch.nn.Module`` object ``module``, which is
+   the model to be partitioned. The returned ``DistributedModel`` object
+   internally manages model parallelism and data parallelism. Only one
+   model in the training script can be wrapped with
+   ``smp.DistributedModel``.
+
+   **Example:**
+
+   .. code:: python
+
+      model = smp.DistributedModel(model)
+
+   **Important**: The ``__call__`` and ``backward`` method calls on the
+   ``smp.DistributedModel`` object (in the following example, the object
+   is \ ``model``) can only be made inside a ``smp.step``-decorated
+   function.
+
+
+   Since ``DistributedModel`` is a ``torch.nn.Module``, a forward pass can
+   be performed by calling the \ ``DistributedModel`` object on the input
+   tensors.
+
+   .. code:: python
+
+      predictions = model(inputs)   # model is a smp.DistributedModel object
+
+   For a backward pass, one needs to call the backward function on
+   the \ ``DistributedModel`` object, with tensors and gradients as
+   arguments, replacing the PyTorch operations \ ``torch.Tensor.backward``
+   or ``torch.autograd.backward``.
+
+
+   The API for ``model.backward`` is very similar to
+   ``torch.autograd.backward``. For example, the following
+   ``backward`` calls:
+
+   .. code:: python
+
+      torch.autograd.backward(loss) or loss.backward()
+
+   should be replaced with:
+
+   .. code:: python
+
+      model.backward(loss) # loss is a tensor with only one element as its data
+
+   Similarly, for non-scalar tensors, replace the following
+   ``backward`` call containing incoming gradient arguments:
+
+   .. code:: python
+
+      torch.autograd.backward(outputs, out_grads)
+
+   with the following line:
+
+   .. code:: python
+
+      model.backward(outputs, out_grads)
+
+   In these examples, all ``__call__`` and ``backward`` method calls on
+   the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside
+   a ``smp.step``-decorated function.
+
+   **Using DDP**
+
+   If DDP is enabled, do not place a PyTorch
+   ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because
+   the ``DistributedModel`` wrapper will also handle data parallelism.
+
+   Unlike the original DDP wrapper, when you use ``DistributedModel``,
+   model parameters and buffers are not immediately broadcast across
+   processes when the wrapper is called. Instead, the broadcast is deferred
+   to the first call of the ``smp.step``-decorated function, when the
+   partition is done.
+
+   **Parameters**
+
+   - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism).
+
+   - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``):
+     Whether to perform the tracing step on the GPU or CPU. The tracing step gathers
+     information on the order of execution of modules, the shapes of
+     intermediate outputs, and execution times, to be used by the
+     partitioning algorithm. If ``trace_device`` is set to GPU, accurate
+     module execution times can be gathered during tracing for potentially
+     improved partitioning decisions. However, if the model is too large to
+     fit in a single GPU, then ``trace_device`` should be set to CPU.
+
+   - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``,
+     the library profiles the execution time of each module during tracing, and uses
+     it in the partitioning decision. This improves the partitioning
+     decision, but it might make the tracing
 slower. It may also introduce
+     some degree of non-determinism in partitioning results, because of the
+     inherent randomness in module execution times. Must be ``False`` if
+     ``trace_device`` is ``"cpu"``.
+
+   - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only
+     applicable for hybrid data parallelism/model parallelism use cases (when
+     ``ddp`` is set to ``True`` while launching training). The library uses this flag
+     to decide whether to do overlapping allreduce whenever parameter
+     gradients are ready. This leads to overlapping of communication and
+     computation and can improve performance. If this is set to ``False``,
+     allreduce is performed at the end of the step.
+
+   - ``backward_passes_per_step`` (``int``) (default: 1): This is only
+     applicable for hybrid data parallelism/model parallelism use cases (when
+     ``ddp`` is set to ``True`` in config). This parameter indicates the
+     number of backward passes to perform before calling allreduce on DDP.
+     This allows accumulating updates over multiple mini-batches before
+     reducing and applying them.
+
+   - ``average_grads_across_microbatches`` (``bool``) (default: ``True``):
+     Whether or not the computed gradients should be averaged across
+     microbatches. If ``False``, the computed gradients will be summed across
+     microbatches, but not divided by the number of microbatches. In the
+     typical use case where the computed loss is averaged over the mini-batch,
+     this should be left as ``True``. If you use a loss function that only
+     sums the per-sample loss across the batch (and does not divide by the
+     batch size), then this must be set to ``False`` for correctness.
+
+   - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets
+     parameters into multiple buckets so that gradient reduction of each
+     bucket can potentially overlap with backward
+     computation. ``bucket_cap_mb`` controls the bucket size in MegaBytes
+     (MB).
+
+   - ``trace_memory_usage`` (default: False): When set to True, the library attempts
+     to measure memory usage per module during tracing. If this is disabled,
+     memory usage will be estimated through the sizes of tensors returned from
+     the module.
+
+   - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``.
+     This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper.
+     Please see: `broadcast_buffer `__.
+
+   - ``gradient_as_bucket_view`` (default: False): To be
+     used with ``ddp=True``. This parameter is forwarded to the underlying
+     ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__.
+
+   **Properties**
+
+   - ``partitioned``: Is ``True`` if the model is partitioned, ``False``
+     otherwise. Initialized to ``False`` when ``DistributedModel`` is first
+     created. It becomes ``True`` during the first call
+     to the ``smp.step``-decorated function. Once the model is partitioned, the
+     local parameters or local ``state_dict`` can be fetched using the
+     following methods.
+
+   **Methods**
+
+   .. function:: backward(tensors, grad_tensors)
+      :noindex:
+
+      Triggers a distributed backward
+      pass across model partitions. Example usage is provided in the previous
+      section. The API is very similar
+      to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward.
+      ``retain_grad`` and ``create_graph`` flags are not supported.
+
+   .. function:: local_buffers( )
+      :noindex:
+
+      Returns an iterator over buffers for the modules in
+      the partitioned model that have been assigned to the current process.
+
+   .. 
function:: local_named_buffers( ) + :noindex: + + Returns an iterator over buffers for the + modules in the partitioned model that have been assigned to the current + process. This yields both the name of the buffer as well as the buffer + itself. + + .. function:: local_parameters( ) + :noindex: + + Returns an iterator over parameters for the + modules in the partitioned model that have been assigned to the current + process. + + .. function:: local_named_parameters( ) + :noindex: + + Returns an iterator over parameters for + the modules in the partitioned model that have been assigned to the + current process. This yields both the name of the parameter as well as + the parameter itself. + + .. function:: local_modules( ) + :noindex: + + Returns an iterator over the modules in the + partitioned model that have been assigned to the current process. + + .. function:: local_named_modules( ) + :noindex: + + Returns an iterator over the modules in the + partitioned model that have been assigned to the current process. This + yields both the name of the module as well as the module itself. + + .. function:: local_state_dict( ) + :noindex: + + Returns the ``state_dict`` that contains local + parameters that belong to the current \ ``mp_rank``. This ``state_dict`` + contains a key \ ``_smp_is_partial`` to indicate this is a + partial \ ``state_dict``, which indicates whether the + ``state_dict`` contains elements corresponding to only the current + partition, or to the entire model. + + .. function:: state_dict( ) + :noindex: + + Returns the ``state_dict`` that contains parameters + for the entire model. It first collects the \ ``local_state_dict``  and + gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to + create a full ``state_dict``. Please note that this needs to be called on all ranks with + ``dp_rank()==0`` to ensure the gather happens properly. + If it is only called on all such ranks, it can hang. + + .. function:: load_state_dict( ) + :noindex: + + Same as the ``torch.module.load_state_dict()`` , + except: It first gathers and merges the ``state_dict``\ s across + ``mp_rank``\ s, if they are partial. The actual loading happens after the + model partition so that each rank knows its local parameters. + + .. function:: register_post_partition_hook(hook) + :noindex: + + Registers a callable ``hook`` to + be executed after the model is partitioned. This is useful in situations + where an operation needs to be executed after the model partition during + the first call to ``smp.step``, but before the actual execution of the + first forward pass. Returns a ``RemovableHandle`` object ``handle``, + which can be used to remove the hook by calling ``handle.remove()``. + + .. function:: cpu( ) + :noindex: + + Allgathers parameters and buffers across all ``mp_rank``\ s and moves them + to the CPU. + + .. function:: join( ) + :noindex: + + A context manager to be used in conjunction with an instance of + ``smp.DistributedModel`` to be able to train with uneven inputs across + participating processes. This is only supported when ``ddp=True``. This will use the join with the wrapped + ``DistributedDataParallel`` instance. For more information, see: + `join `__ + in the PyTorch documentation. + + .. 
function:: register_comm_hook( state, callable )
+      :noindex:
+
+      **Available for PyTorch 1.8.1 only**
+      Registers a communication hook, which is a flexible ``callable``
+      through which users can specify how gradients are aggregated across
+      multiple workers. This method will be called on the wrapped
+      ``DistributedDataParallel`` instance.
+
+      Please note that when you register a comm hook you have full control of how the gradients are processed.
+      When using only data parallelism with Torch DDP you are expected to average grads across data parallel replicas within the hook.
+      Similarly, when using ``DistributedModel`` you have to average grads across data parallel replicas within the hook.
+      In addition to that, you also have to average grads across microbatches within the hook, unless you explicitly do not want to average them based on your loss function.
+      See ``average_grads_across_microbatches`` for more information about averaging grads across microbatches.
+
+      This is only supported when ``ddp=True`` and ``overlapping_allreduce=True`` (default).
+      For more information, see:
+      `register_comm_hook `__
+      in the PyTorch documentation.
+
+
+
+.. class:: smp.DistributedOptimizer
+   :noindex:
+
+   An optimizer wrapper for saving/loading optimizer states. This wrapper
+   returns ``optimizer`` with the following methods overridden:
+
+   **Parameters**
+
+   - ``optimizer``: The optimizer object to be wrapped.
+
+   .. function:: state_dict( )
+      :noindex:
+
+      Returns the ``state_dict`` that contains optimizer state for the entire model.
+      It first collects the ``local_state_dict`` and gathers and merges
+      the ``local_state_dict`` from all ``mp_rank``\ s to create a full
+      ``state_dict``.
+
+   .. function:: load_state_dict( )
+      :noindex:
+
+      Same as ``torch.optimizer.load_state_dict()``, except:
+
+      - It first gathers and merges the local ``state_dict``\ s if they are
+        partial.
+      - The actual loading happens after the model partition so that each
+        rank knows its local parameters.
+
+   .. function:: local_state_dict( )
+      :noindex:
+
+      Returns the ``state_dict`` that contains the
+      local optimizer state that belongs to the current \ ``mp_rank``. This
+      ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is
+      a partial \ ``state_dict``, which indicates whether the
+      ``state_dict`` contains elements corresponding to only the current
+      partition, or to the entire model.
+
+
+.. function:: smp.partition(index)
+   :noindex:
+
+   **Inputs**
+
+   - ``index`` (``int``): The index of the partition.
+
+   A context manager which places all modules defined inside into the
+   partition with ID ``index``. The ``index`` argument must be less than
+   the number of partitions.
+
+   Use ``smp.partition`` to implement manual partitioning.
+   If ``"auto_partition"`` is ``True``, then the
+   ``smp.partition`` contexts are ignored. Any module that is not placed in
+   any ``smp.partition`` context is placed in the
+   ``default_partition`` defined through the SageMaker Python SDK.
+
+   When ``smp.partition`` contexts are nested, the innermost context
+   overrides the rest (see the following example). In PyTorch, manual
+   partitioning should be done inside the module \ ``__init__``, and the
+   partition assignment applies to the modules that are *created* inside
+   the ``smp.partition`` context.
+
+   Example:
+
+   .. 
code:: python + + class Model(torch.nn.Module): +     def __init__(self): +         with smp.partition(1): +             self.child0 = Child0()            # child0 on partition 1 +             with smp.partition(2): +                 self.child1 = Child1()        # child1 on partition 2 +             self.child2 = Child2()            # child2 on partition 1 +         self.child3 = Child3()                # child3 on default_partition + +.. function:: smp.get_world_process_group( ) + :noindex: + + Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all + processes, which can be used with the ``torch.distributed`` API. + Requires ``"ddp": True`` in SageMaker Python SDK parameters. + +.. function:: smp.get_mp_process_group( ) + :noindex: + + Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the + processes in the ``MP_GROUP`` which contains the current process, which + can be used with the \ ``torch.distributed`` API. Requires + ``"ddp": True`` in SageMaker Python SDK parameters. + +.. function:: smp.get_dp_process_group( ) + :noindex: + + Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the + processes in the ``DP_GROUP`` which contains the current process, which + can be used with the \ ``torch.distributed`` API. Requires + ``"ddp": True`` in SageMaker Python SDK parameters. + +.. function:: smp.is_initialized( ) + :noindex: + + Returns ``True`` if ``smp.init`` has already been called for the + process, and ``False`` otherwise. + +.. function::smp.is_tracing( ) + :noindex: + + Returns ``True`` if the current process is running the tracing step, and + ``False`` otherwise. + +.. data:: smp.nn.FusedLayerNorm + :noindex: + + `Apex Fused Layer Norm `__ is currently not + supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` + ``FusedLayerNorm`` and provides the same functionality. This requires + ``apex`` to be installed on the system. + +.. data:: smp.optimizers.FusedNovoGrad + :noindex: + + + `Fused Novo Grad optimizer `__ is + currently not supported by the library. ``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` + optimizer and provides the same functionality. This requires ``apex`` to + be installed on the system. + +.. data:: smp.optimizers.FusedLamb + :noindex: + + + `FusedLamb optimizer `__ + currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces + ``apex`` ``FusedLamb`` optimizer and provides the same functionality. + This requires ``apex`` to be installed on the system. + +.. data:: smp.amp.GradScaler + :noindex: + + `Torch AMP Gradscaler `__ + currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces + ``torch.amp.GradScaler`` and provides the same functionality. + +.. _pytorch_saving_loading: + :noindex: + +APIs for Saving and Loading +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. function:: smp.save( ) + :noindex: + + Saves an object. This operation is similar to ``torch.save()``, except + it has an additional keyword argument, ``partial``, and accepts only + string type for the argument ``f`` (file). If ``partial=True``, each + ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` + index to your saved file. + + **Parameters** + + - ``obj`` (dict): A saved object. + - ``f`` (str): A string containing a file name. + - ``partial`` (bool, default= ``True``):  When set to ``True``, each + ``mp_rank`` saves a separate checkpoint file and the library adds an + ``mp_rank`` index to the saved file. 
If you want to be able to load
+     and further train a model that you save with ``smp.save()``, you must
+     set ``partial=True``.
+   - ``pickle_module`` (pickle module, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``):
+     A module used for pickling metadata and objects.
+   - ``pickle_protocol`` (int, default=2): Can be specified to
+     override the default protocol.
+
+.. function:: smp.load( )
+   :noindex:
+
+   Loads an object saved with ``smp.save()`` from a file.
+
+   Similar to `torch.load() `__,
+   except it has an additional keyword argument, ``partial``, and accepts
+   only string type for the argument ``f`` (file). If ``partial=True``,
+   then each ``mp_rank`` loads a separate checkpoint file.
+
+   **Parameters**
+
+   - ``f`` (string): A string containing a file name.
+   - ``map_location`` (function): A function, a
+     `torch.device `__,
+     a string, or a dict specifying how to remap storage locations.
+   - ``pickle_module`` (pickle module): A module used for unpickling
+     metadata and objects (has to match the ``pickle_module`` used to
+     serialize the file).
+   - ``pickle_load_args`` (Python 3 only): Optional keyword arguments
+     passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``.
+   - ``partial`` (bool, default= ``True``): When set to ``True``, each
+     ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``.
+     Should be used when loading a model trained with the library.
+
+.. _pytorch_saving_loading_instructions:
+   :noindex:
+
+General Instruction For Saving and Loading
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The library can save partial or full checkpoints.
+
+- For partial checkpoints, each ``mp_rank`` saves its own checkpoint
+  file with only the parameters that belong to that rank.
+- For full checkpoints, the library saves a single checkpoint that contains
+  the entire model's parameters.
+
+When **saving** using ``smp.save()``, each rank only holds its own
+parameters. If you want to save the full model, there will be some
+communication between the ranks to create the full model. If you save
+checkpoints often, you should save partial checkpoints for best
+performance.
+
+When **loading** using ``smp.load()``, the library can load either partial
+checkpoints or full checkpoints, including full checkpoints saved without
+model parallelism. If you want to resume training without model parallelism
+or run inference, you need a full checkpoint.
+
+The following is an example of how you can save and load a checkpoint:
+
+.. code:: python
+
+   # Original model and optimizer
+   model = MyModel(...)
+   optimizer = MyOpt(...)
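+
+   # Note: MyModel and MyOpt are placeholders for your own model and optimizer
+   # classes. The flags used below (partial, save_partial_model, save_full_model,
+   # partial_checkpoint, full_checkpoint) are user-defined booleans that select
+   # which type of checkpoint to write or read.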
+ + # model parallel wrapper + model = smp.DistributedModel(model) + optimizer = smp.DistributedOptimizer(optimizer) + + # To save, always save on dp_rank 0 to avoid data racing + if partial: +     # To save the partial model on each mp rank +     # the library will create `checkpoint.pt_{mprank}` for each mp rank +     if save_partial_model: +         if smp.dp_rank() == 0: +             model_dict = model.local_state_dict() # save the partial model +             opt_dict = optimizer.local_state_dict() # save the partial optimizer state +             smp.save( +                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, +                 f"/checkpoint.pt", +                 partial=True, +             ) + +     # To save the full model +     if save_full_model: +         if smp.dp_rank() == 0: +             model_dict = model.state_dict() # save the full model +             opt_dict = optimizer.state_dict() # save the full optimizer state +             smp.save( +                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, +                 "/checkpoint.pt", +                 partial=False, +             ) + + # To load, load on all ranks. + # The only difference for partial/full loading is the partial flag in smp.load + # Load partial checkpoint + if partial_checkpoint: +    checkpoint = smp.load("/checkpoint.pt", partial=True) +    model.load_state_dict(checkpoint["model_state_dict"]) +    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) + # Load full checkpoint + if full_checkpoint: +    checkpoint = smp.load("/checkpoint.pt", partial=False) +    model.load_state_dict(checkpoint["model_state_dict"]) +    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_tensorflow.rst new file mode 100644 index 0000000000..131fc327ac --- /dev/null +++ b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_tensorflow.rst @@ -0,0 +1,172 @@ +TensorFlow API +============== + +**Supported version: 2.3.1, 2.4.1, 2.5.0** + +**Important**: This API document assumes you use the following import statement in your training scripts. + +.. code:: python + + import smdistributed.modelparallel.tensorflow as smp + +.. tip:: + + Refer to + `Modify a TensorFlow Training Script + `_ + to learn how to use the following API in your TensorFlow training script. + +.. class:: smp.DistributedModel + :noindex: + + A sub-class of the Keras \ ``Model`` class, which defines the model to + be partitioned. Model definition is done by sub-classing + ``smp.DistributedModel`` class, and implementing the ``call()`` method, + in the same way as the Keras model sub-classing API. Any operation that + is part of the \ ``smp.DistributedModel.call()`` method is subject to + partitioning, meaning that every operation placed inside executes in + exactly one of the devices (the operations outside run on all devices). + + + Similar to the regular Keras API, the forward pass is done by directly + calling the model object on the input tensors. For example: + + .. code:: python + + predictions = model(inputs)   # model is a smp.DistributedModel object + + However, ``model()`` calls can only be made inside a + ``smp.step``-decorated function. + + The outputs from a ``smp.DistributedModel`` are available in all ranks, + regardless of which rank computed the last operation. + + **Methods:** + + .. 
function:: save_model(save_path="/opt/ml/model") + :noindex: + + **Inputs** + - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. + + Saves the entire, + unpartitioned model with the latest trained weights to ``save_path`` in + TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which + SageMaker monitors to upload the model artifacts to Amazon S3. + +.. function:: smp.partition(index) + :noindex: + + **Inputs** + + - ``index`` (``int``): The index of the partition. + + A context manager which places all operations defined inside into the + partition whose ID is equal to ``index``. When + ``smp.partition`` contexts are nested, the innermost context overrides + the rest. The ``index`` argument must be smaller than the number of + partitions. + + ``smp.partition`` is used in the manual partitioning API; + if \ ``"auto_partition"`` parameter is set to ``True`` while launching + training, then ``smp.partition`` contexts are ignored. Any operation + that is not placed in any ``smp.partition`` context is placed in the + ``default_partition``, as shown in the following example: + + .. code:: python + + # auto_partition: False + # default_partition: 0 + smp.init() + [...] + x = tf.constant(1.2)                     # placed in partition 0 + with smp.partition(1): +     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 +     with smp.partition(3): +         z = tf.reduce_sum(y)             # placed in partition 3 + + +.. function:: register_post_partition_hook(hook) + :noindex: + + Registers a callable ``hook`` to + be executed after the model is partitioned. This is useful in situations + where an operation needs to be executed after the model partition during + the first call to ``smp.step``, but before the actual execution of the + first forward pass. + + .. code:: python + + @smp.register_post_partition_hook + def test_eager(): + # All statements here will be executed right after partition but before the first forward pass + tf.print("Entered hook through eager context") + +.. class:: smp.CheckpointManager + :noindex: + + + A subclass of TensorFlow + `CheckpointManager `__, + which is used to manage checkpoints. The usage is similar to TensorFlow + ``CheckpointManager``. + + The following returns a ``CheckpointManager`` object. + + .. code:: python + + smp.CheckpointManager(checkpoint, +                       directory="/opt/ml/checkpoints", +                       max_to_keep=None, +                       checkpoint_name="ckpt") + + **Parameters** + + - ``checkpoint``: A `tf.train.Checkpoint + `__ instance + that represents a model checkpoint. + + - ``directory``: (``str``) The path to a directory in which to write + checkpoints. A file named "checkpoint" is also written to this + directory (in a human-readable text format) which contains the state + of the ``CheckpointManager``. Defaults to + ``"/opt/ml/checkpoints"``, which is the directory that SageMaker + monitors for uploading the checkpoints to Amazon S3. + - ``max_to_keep`` (``int``): The number of checkpoints to keep. If + ``None``, all checkpoints are kept. + - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. + Defaults to ``"ckpt"``. + + + **Methods:** + + .. function:: save( ) + :noindex: + + Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. + + .. function:: restore( ) + :noindex: + + Restores the latest checkpoint in the specified directory. 
+ Internally uses ``tf.train.CheckpointManager.restore()``. + + + **Examples:** + + .. code:: python + + checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) + ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints + + for inputs in train_ds: +     loss = train_step(inputs) +     # [...] +     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints + + .. code:: python + + for step, inputs in enumerate(train_ds): +     if step == 0: +         ckpt_manager.restore() +     loss = train_step(inputs) diff --git a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_common_api.rst new file mode 100644 index 0000000000..625a7fcbf1 --- /dev/null +++ b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_common_api.rst @@ -0,0 +1,488 @@ +.. admonition:: Contents + + - :ref:`communication_api` + - :ref:`mpi_basics` + +Common API +========== + +The following SageMaker distribute model parallel APIs are common across all frameworks. + +**Important**: This API document assumes you use the following import statement in your training scripts. + +**TensorFlow** + +.. code:: python + + import smdistributed.modelparallel.tensorflow as smp + +**PyTorch** + +.. code:: python + + import smdistributed.modelparallel.torch as smp + + +.. function:: smp.init( ) + :noindex: + + Initialize the library. Must be called at the beginning of training script. + +.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) + :noindex: + + A decorator that must be placed over a function that represents a single + forward and backward pass (for training use cases), or a single forward + pass (for evaluation use cases). Any computation that is defined inside + the ``smp.step``-decorated function is executed in a pipelined manner. + + By default, every tensor input to the function is split across its batch + dimension into a number of microbatches specified while launching the + training job. This behavior can be customized through the arguments to + ``smp.step``, described below. The library then orchestrates the execution of + each microbatch across all partitions, based on the chosen pipeline + type. + + In a typical use case, forward pass and back-propagation are executed + inside an \ ``smp.step``-decorated function and gradients, loss, and + other relevant metrics (such as accuracy, etc.) are returned from + ``smp.step``-decorated function. + + Any gradient post-processing operation, such as gradient clipping and + allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or + ``optimizer.step`` (for PT) should be applied on the gradients returned + from the ``smp.step`` function, and not inside the ``smp.step`` + function. This is because every operation inside ``smp.step`` is + executed once per microbatch, so having these operations inside + ``smp.step`` can either be inefficient (in the case of allreduce), or + lead to wrong results (in the case of ``apply_gradients`` / + ``optimizer.step``). + + If the objects returned from the ``smp.step``-decorated function contain + ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to + ``StepOutput`` objects. A ``StepOutput`` object encapsulates all + versions of the tensor across different microbatches + (see ``StepOutput`` entry for more information). 
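+
+   For example, the following is a minimal PyTorch sketch of this pattern. It
+   assumes ``model`` and ``optimizer`` have already been wrapped with
+   ``smp.DistributedModel`` and ``smp.DistributedOptimizer``, and the data
+   loader and loss function are placeholders for your own:
+
+   .. code:: python
+
+      import torch.nn.functional as F
+
+      @smp.step()
+      def train_step(model, data, target):
+          output = model(data)
+          loss = F.nll_loss(output, target)
+          model.backward(loss)   # replaces loss.backward() inside smp.step
+          return loss
+
+      for data, target in train_loader:
+          optimizer.zero_grad()
+          loss_mb = train_step(model, data, target)  # a StepOutput with one loss per microbatch
+          loss = loss_mb.reduce_mean()               # average the loss across microbatches
+          optimizer.step()                           # apply gradients outside smp.step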
+ + The argument to ``smp.step`` decorated function should either be a tensor + or an instance of list, tuple, dict or set for it to be split across + microbatches. If your object doesn't fall into this category, you can make + the library split your object, by implementing ``smp_slice`` method. + + Below is an example of how to use it with PyTorch. + + .. code:: python + + class CustomType: + def __init__(self, tensor): + self.data = tensor + + # The library will call this to invoke slicing on the object passing in total microbatches (num_mb) + # and the current microbatch index (mb). + def smp_slice(self, num_mb, mb, axis): + dim_size = list(self.data.size())[axis] + + split_size = dim_size // num_mb + sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) + return CustomType(sliced_tensor, self.other) + + custom_obj = CustomType(torch.ones(4,)) + + @smp.step() + def step(custom_obj): + loss = model(custom_obj) + model.backward(loss) + return loss + + + **Important:** ``smp.step`` splits the batch into microbatches, and + executes everything inside the decorated function once per microbatch. + This might affect the behavior of batch normalization, any operation + that explicitly uses the batch size information, or any other Python + code that is expected to run once. + + **TensorFlow-specific behavior** + + ``smp.step`` is a wrapper that + inherits from and extends the behavior of ``tf.function``, and as such, + all the caveats that apply to the use of ``tf.function``\ s also apply + to ``smp.step``. In particular, any operation that is inside + ``smp.step`` executes in graph mode, and not eager mode. + + In the first call, ``smp.step`` performs tracing of the wrapped function every time + one of the tensor arguments changes their shape or dtype, or for every + new value of a Python argument, if there is one. Tracing is expensive, + so such scenarios should be avoided as much as possible or, + alternatively, an ``input_signature`` argument must be provided. For + more information on the usage of ``tf.function``, refer to the + TensorFlow documentation: + + - https://www.tensorflow.org/api_docs/python/tf/function\ + - https://www.tensorflow.org/guide/function\ + + Each ``smp.step`` decorated function must have a return value that depends on the + output of ``smp.DistributedModel``. + + **Common parameters** + + - ``non_split_inputs`` (``list``): The list of arguments to the decorated function + that should not be split along the batch dimension. Should be used + for all input tensors that do not have a batch dimension. Should be a + list of argument names as ``str``, as they appear in the signature of + the ``smp.step``-decorated function. By default it is considered an + empty list. + + - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch + axis. The keys should be the argument names as ``str``, as they + appear in the signature of the ``smp.step``-decorated function.  By + default all batch axes are assumed to be the 0-axis. + + **TensorFlow-only parameters** + + - All arguments of ``tf.function``. Note: + The \ ``experimental_compile`` argument of ``tf.function`` may not + work as expected with ``smp.step``, since it interferes with + pipelining and model partitioning. To enable XLA with the library, you can + instead use \ ``tf.config.optimizer.set_jit(True)``. + + **PyTorch-only parameters** + + - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on + all returned ``torch.Tensor`` outputs. 
Setting it to ``False`` + increases memory consumption, unless ``detach()`` is manually called + on the returned tensors, because the model graph is not cleared from + memory after the training step. Set to \ ``True`` by default. + + **Returns** + + - The same object(s) returned from the decorated function. All + returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or + ``torch.Tensor`` objects (for PT) are wrapped inside + a \ ``StepOutput`` object, even when they are inside a Python + ``list``, ``tuple``, or ``dict``. + + + +.. class:: StepOutput + :noindex: + + + A class that encapsulates all versions of a ``tf.Tensor`` + or \ ``torch.Tensor`` across all microbatches. + + When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside + ``smp.step``, different versions of the tensor are computed for each + microbatch. + + When this tensor is returned from ``smp.step`` and is accessed outside + of the decorated function, it appears as a ``StepOutput`` object, which + contains all such versions. For example, + + - In the case of Tensorflow, the gradient for a particular + ``tf.Variable`` is computed on each microbatch individually, and if + this gradient is returned from ``smp.step``, all gradients for this + ``tf.Variable`` become part of the same ``StepOutput`` object. The + ``StepOutput`` class offers the following API for commonly-used + post-processing operations on such tensors. + - In the case of PyTorch, the loss for each microbatch is computed + individually and all the ``torch.Tensor``\ s that represent the loss + for different microbatches become part of same ``StepOutput`` object, + if loss is returned from the ``smp.step`` function. + + + The ``StepOutput`` class offers the following API for commonly-used + post-processing operations on tensors. + + .. data:: StepOutput.outputs + :noindex: + + Returns a list of the underlying tensors, indexed by microbatch. + + .. function:: StepOutput.reduce_mean( ) + :noindex: + + Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s + ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. + + .. function:: StepOutput.reduce_sum( ) + :noindex: + + Returns a ``tf.Tensor`` / + ``torch.Tensor`` that sums the constituent + ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. + + .. function:: StepOutput.concat( ) + :noindex: + + Returns a + ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the + batch dimension using ``tf.concat`` / ``torch.cat``. + + .. function:: StepOutput.stack( ) + :noindex: + + Applies ``tf.stack`` / ``torch.stack`` + operation to the list of constituent ``tf.Tensor``\ s / + ``torch.Tensor``\ s. + + **TensorFlow-only methods** + + .. function:: StepOutput.merge( ) + :noindex: + + Returns a ``tf.Tensor`` that + concatenates the constituent ``tf.Tensor``\ s along the batch + dimension. This is commonly used for merging the model predictions + across microbatches. + + .. function:: StepOutput.accumulate(method="variable", var=None) + :noindex: + + Functionally the same as ``StepOutput.reduce_mean()``. However, it is + more memory-efficient, especially for large numbers of microbatches, + since it does not wait for all constituent \ ``tf.Tensor``\ s to be + ready to start averaging them, thereby saving memory. + + In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end + up being more memory-efficient than ``StepOutput.accumulate()``. 
+ + **Parameters** + + - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): + If ``"add_n"`` or ``"accumulate_n"``, the library uses + ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement + accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` + into which to accumulate the tensors. Default is \ ``"variable"``. + Note: Memory usage behavior of these choices can depend on the model + and implementation. + + - ``var``: A ``tf.Variable`` into which, if provided, the library uses to + accumulate the tensors. If \ ``None``, the library internally creates a + variable. If ``method`` is not ``"variable"``, this argument is + ignored. + +.. _mpi_basics: + :noindex: + +MPI Basics +^^^^^^^^^^ + +The library exposes the following basic MPI primitives to its Python API: + +- ``smp.rank()``: The rank of the current process. +- ``smp.size()``: The total number of processes. +- ``smp.mp_rank()``: The rank of the process among the processes that + hold the current model replica. +- ``smp.dp_rank()``: The rank of the process among the processes that + hold different replicas of the same model partition. +- ``smp.dp_size()``: The total number of model replicas. +- ``smp.local_rank()``: The rank among the processes on the current + instance. +- ``smp.local_size()``: The total number of processes on the current + instance. +- ``smp.get_mp_group()``: The list of ranks over which the current + model replica is partitioned. +- ``smp.get_dp_group()``: The list of ranks that hold different + replicas of the same model partition. + + .. _communication_api: + :noindex: + +Communication API +^^^^^^^^^^^^^^^^^ + +The library provides a few communication primitives which can be helpful while +developing the training script. These primitives use the following +``enum`` s as arguments to specify which processes the communication +should involve. +​ + +**Helper structures** + +.. data:: smp.CommGroup + :noindex: + + An ``enum`` that takes the values + ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. + These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, + and ``smp.DP_GROUP`` respectively. + + - ``CommGroup.WORLD``: Represents the entire group of processes used in + training + - ``CommGroup.MP_GROUP``: Represents the group of processes that hold + the same model replica as the current process. The processes in a + single ``MP_GROUP`` collectively store an entire replica of the + model. + - ``CommGroup.DP_GROUP``: Represents the group of processes that hold + the same model partition as the current process. The processes in a + single ``DP_GROUP`` perform data parallelism/allreduce among + themselves. + +.. data:: smp.RankType + :noindex: + + An ``enum`` that takes the values + ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. + + - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as + the rank of the process across all processes used in training. + - ``RankType.MP_RANK``: The associated rank is to be interpreted as the + rank of the process within the ``MP_GROUP``. + - ``RankType.DP_RANK``: The associated rank is to be interpreted as the + rank of the process within the ``DP_GROUP``. + + +**Communication primitives:** + +.. function:: smp.broadcast(obj, group) + :noindex: + + Sends the object to all processes in the + group. The receiving process must call ``smp.recv_from`` to receive the + sent object. + + **Inputs** + + - ``obj``: An arbitrary picklable Python object that will be broadcast. 
+ + - ``group``: A ``CommGroup`` argument that represents to which group of + processes the object will be sent. + + **Notes** + + - When you use ``broadcast`` on the sender process, there needs + to be an accompanying ``smp.recv_from()`` call on the receiver + processes. + + - This is a synchronous call; the ``broadcast`` statement + returns only after all ranks participating in the call have made a + matching ``recv_from`` call. + + **Example** + + .. code:: python + + if smp.rank() == 0: +     smp.broadcast(something, group=smp.CommGroup.WORLD) + else: +     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) + +.. function:: smp.send(obj, dest_rank, rank_type) + :noindex: + + Sends the object ``obj`` to + ``dest_rank``, which is of a type specified by ``rank_type``. + + **Inputs** + + - ``obj``: An arbitrary picklable Python object that will be sent. + + - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. + + - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how + ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 + and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process + with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current + process. + + **Notes** + + - Note: \ This is a synchronous call; the ``send`` statement returns + only after the destination rank has made a matching + ``recv_from`` call. + +.. function:: smp.recv_from(src_rank, rank_type) + :noindex: + + Receive an object from a peer process. Can be used with a matching + ``smp.send`` or a ``smp.broadcast`` call. + + **Inputs** + + - ``src_rank`` (``int``): An integer denoting rank of the sending process. + + - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how + ``dest_rank`` is to be interpreted. For example if ``src_rank`` is 1 + and ``rank_type`` is ``MP_RANK``, then the object is received from + the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the + current process. + + **Returns** + + Returns the python object that is sent by the peer process. + + **Notes** + + - Note: This is a synchronous call; the ``recv_from`` statement returns + only after the source rank has made a matching ``send`` or + ``broadcast`` call, and the object is received. + +.. function:: smp.allgather(obj, group) + :noindex: + + A collective call that gathers all the + submitted objects across all ranks in the specified ``group``. Returns a + list whose ``i``\ th index contains the object submitted by the + ``i``\ th rank in ``group``. + + **Inputs** + + - ``obj``: An arbitrary picklable Python object that will be + allgathered. + + - ``group`` : A ``CommGroup`` argument that represents which group of + processes participate in ``allgather``. + + **Notes** + + - Note: This is a synchronous call; the ``allgather`` statement returns + only after all ranks participating in the call have made a matching + ``allgather`` call, and all the objects are received at the current + rank. + + **Examples** + + .. code:: python + + # assuming mp_size() == 2 + + if smp.mp_rank() == 0: +     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] + else: +     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] + +.. function:: smp.barrier(group=smp.WORLD) + :noindex: + + A statement that hangs until all + processes in the specified group reach the barrier statement, similar to + ``MPI_Barrier()``. 
+ + **Inputs** + + - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of + processes participating in the barrier call. Defaults to + ``smp.WORLD``. + + **Examples** + + - Assume there are 8 processes and 2 model partitions, and + therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If + the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its + group argument, then each process only waits until the other process + of its own ``mp_group`` reaches that point. It does not wait for + processes outside that ``mp_group``. + +.. function:: smp.dp_barrier() + :noindex: + + Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. + Waits for the processes in the same \ ``dp_group`` as + the current process to reach the same point in execution. + +.. function:: smp.mp_barrier() + :noindex: + + Same as passing ``smp.MP_GROUP`` to + ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as + the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_pytorch.rst new file mode 100644 index 0000000000..d2fcb95954 --- /dev/null +++ b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_pytorch.rst @@ -0,0 +1,572 @@ +.. admonition:: Contents + + - :ref:`pytorch_saving_loading` + - :ref:`pytorch_saving_loading_instructions` + +PyTorch API +=========== + +**Supported versions: 1.7.1, 1.8.1** + +This API document assumes you use the following import statements in your training scripts. + +.. code:: python + + import smdistributed.modelparallel.torch as smp + + +.. tip:: + + Refer to + `Modify a PyTorch Training Script + `_ + to learn how to use the following API in your PyTorch training script. + +.. class:: smp.DistributedModel + :noindex: + + A sub-class of ``torch.nn.Module`` which specifies the model to be + partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is + the model to be partitioned. The returned ``DistributedModel`` object + internally manages model parallelism and data parallelism. Only one + model in the training script can be wrapped with + ``smp.DistributedModel``. + + **Example:** + + .. code:: python + + model = smp.DistributedModel(model) + + **Important**: The ``__call__`` and  ``backward`` method calls on the + ``smp.DistributedModel`` object (in the following example, the object + is \ ``model``) can only be made inside a ``smp.step``-decorated + function. + + + Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can + be performed by calling the \ ``DistributedModel`` object on the input + tensors. + + .. code:: python + + predictions = model(inputs)   # model is a smp.DistributedModel object + + For a backward pass, one needs to call the backward function on + the \ ``DistributedModel`` object, with tensors and gradients as + arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` + or ``torch.autograd.backward``. + + + The API for ``model.backward`` is very similar to + ``torch.autograd.backward``. For example, the following + ``backward`` calls: + + .. code:: python + + torch.autograd.backward(loss) or loss.backward() + + should be replaced with: + + .. code:: python + + model.backward(loss) # loss is a tensor with only one element as its data + + Similarly, for non-scalar tensors, replace the following + ``backward`` call containing incoming gradient arguments: + + .. 
code:: python + + torch.autograd.backward(outputs, out_grads) + + with the following line: + + .. code:: python + + model.backward(outputs, out_grads) + + In these examples, all ``__call__``  and ``backward`` method calls on + the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside + a ``smp.step``-decorated function. + + **Using DDP** + + If DDP is enabled, do not not place a PyTorch + ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because + the ``DistributedModel`` wrapper will also handle data parallelism. + + Unlike the original DDP wrapper, when you use ``DistributedModel``, + model parameters and buffers are not immediately broadcast across + processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the + ``smp.step``-decorated function when the partition is done. + + **Parameters** + + - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). + + - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) + Whether to perform the tracing step on the GPU or CPU. The tracing step gathers + information on the order of execution of modules, the shapes of + intermediate outputs, and execution times, to be used by the + partitioning algorithm. If ``trace_device`` is set to GPU, accurate + module execution times can be gathered during tracing for potentially + improved partitioning decision. However, if the model is too large to + fit in a single GPU, then ``trace_device`` should be set to CPU. + + - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, + the library profiles the execution time of each module during tracing, and uses + it in the partitioning decision. This improves the partitioning + decision, but it might make the tracing slower. It may also introduce + some degree of non-determinism in partitioning results, because of the + inherent randomness in module execution times. Must be ``False`` if + ``trace_device`` is ``"cpu"``. + + - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only + applicable for hybrid data parallelism/model parallelism use cases (when + ``ddp`` is set to ``True`` while launching training). The library uses this flag + to decide whether to do overlapping allreduce whenever a parameter + gradients are ready. This leads to overlapping of communication and + computation and can improve performance. If this is set to ``False`` , + allreduce is performed at the end of the step. + + - ``backward_passes_per_step`` (``int``) (default: 1): This is only + applicable for hybrid data parallelism/model parallelism use cases (when + ``ddp`` is set to ``True`` in config). This parameter indicates the + number of backward passes to perform before calling allreduce on DDP. + This allows accumulating updates over multiple mini-batches before + reducing and applying them. + + - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): + Whether or not the computed gradients should be averaged across + microbatches. If ``False``, the computed gradients will be summed across + microbatches, but not divided by the number of microbatches. In typical + use case where the computed loss is averaged over the mini-batch, this + should be left as ``True``. If you use a loss function that only sums + the per-sample loss across the batch (and not divide by the batch size), + then this must be set to ``False`` for correctness. 
+ + - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets + parameters into multiple buckets so that gradient reduction of each + bucket can potentially overlap with backward + computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes + (MB). + + - ``trace_memory_usage`` (default: False): When set to True, the library attempts + to measure memory usage per module during tracing. If this is disabled, + memory usage will be estimated through the sizes of tensors returned from + the module. + + - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``. + This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper. + Please see: `broadcast_buffer `__. + + - ``gradient_as_bucket_view`` (default: False): To be + used with ``ddp=True``. This parameter is forwarded to the underlying + ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__. + + **Properties** + + - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` + otherwise. Initialized to ``False`` when ``DistributedModel`` is first + created. It becomes be ``True`` during the first call + to ``smp.step``-decorated function. Once the model is partitioned, the + local parameters or local ``state_dict`` can be fetched using the + following methods. + + **Methods** + + .. function:: backward(tensors, grad_tensors) + :noindex: + + Triggers a distributed backward + pass across model partitions. Example usage provided in the previous + section. The API is very similar + to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. + ``retain_grad`` and ``create_graph``  flags are not supported. + + .. function:: local_buffers( ) + :noindex: + + Returns an iterator over buffers for the modules in + the partitioned model that have been assigned to the current process. + + .. function:: local_named_buffers( ) + :noindex: + + Returns an iterator over buffers for the + modules in the partitioned model that have been assigned to the current + process. This yields both the name of the buffer as well as the buffer + itself. + + .. function:: local_parameters( ) + :noindex: + + Returns an iterator over parameters for the + modules in the partitioned model that have been assigned to the current + process. + + .. function:: local_named_parameters( ) + :noindex: + + Returns an iterator over parameters for + the modules in the partitioned model that have been assigned to the + current process. This yields both the name of the parameter as well as + the parameter itself. + + .. function:: local_modules( ) + :noindex: + + Returns an iterator over the modules in the + partitioned model that have been assigned to the current process. + + .. function:: local_named_modules( ) + :noindex: + + Returns an iterator over the modules in the + partitioned model that have been assigned to the current process. This + yields both the name of the module as well as the module itself. + + .. function:: local_state_dict( ) + :noindex: + + Returns the ``state_dict`` that contains local + parameters that belong to the current \ ``mp_rank``. This ``state_dict`` + contains a key \ ``_smp_is_partial`` to indicate this is a + partial \ ``state_dict``, which indicates whether the + ``state_dict`` contains elements corresponding to only the current + partition, or to the entire model. + + .. function:: state_dict( ) + :noindex: + + Returns the ``state_dict`` that contains parameters + for the entire model. 
It first collects the \ ``local_state_dict``  and + gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to + create a full ``state_dict``. Please note that this needs to be called on all ranks with + ``dp_rank()==0`` to ensure the gather happens properly. + If it is only called on all such ranks, it can hang. + + .. function:: load_state_dict( ) + :noindex: + + Same as the ``torch.module.load_state_dict()`` , + except: It first gathers and merges the ``state_dict``\ s across + ``mp_rank``\ s, if they are partial. The actual loading happens after the + model partition so that each rank knows its local parameters. + + .. function:: register_post_partition_hook(hook) + :noindex: + + Registers a callable ``hook`` to + be executed after the model is partitioned. This is useful in situations + where an operation needs to be executed after the model partition during + the first call to ``smp.step``, but before the actual execution of the + first forward pass. Returns a ``RemovableHandle`` object ``handle``, + which can be used to remove the hook by calling ``handle.remove()``. + + .. function:: cpu( ) + :noindex: + + Allgathers parameters and buffers across all ``mp_rank``\ s and moves them + to the CPU. + + .. function:: join( ) + :noindex: + + A context manager to be used in conjunction with an instance of + ``smp.DistributedModel`` to be able to train with uneven inputs across + participating processes. This is only supported when ``ddp=True``. This will use the join with the wrapped + ``DistributedDataParallel`` instance. For more information, see: + `join `__ + in the PyTorch documentation. + + .. function:: register_comm_hook( state, callable ) + :noindex: + + **Available for PyTorch 1.8.1 only** + Registers a communication hook which is an enhancement that provides + a flexible hook ``callable`` to users where they can specify how + gradients are aggregated across multiple workers. This method will be called on the wrapped ``DistributedDataParallel`` instance. + + Please note that when you register a comm hook you have full control of how the gradients are processed. + When using only data parallelism with Torch DDP you are expected to average grads across data parallel replicas within the hook. + Similarly, when using DistributedModel you have to averaging grads across data parallel replicas within the hook. + In addition to that, you also have to average grads across microbatches within the hook unless you explicitly desire to not average based on your loss function. + See ``average_grads_across_microbatches`` for more information about averaging grads across microbatches. + + This is only supported when ``ddp=True`` and ``overlapping_allreduce=True`` (default). + For more information, see: + `register_comm_hook `__ + in the PyTorch documentation. + + + +.. class:: smp.DistributedOptimizer + :noindex: + + **Parameters** + - ``optimizer`` + + An optimizer wrapper for saving/loading optimizer states. This wrapper + returns ``optimizer`` with the following methods overridden: + + .. function:: state_dict( ) + :noindex: + + Returns the ``state_dict`` that contains optimizer state for the entire model. + It first collects the ``local_state_dict`` and gathers and merges + the ``local_state_dict`` from all ``mp_rank``s to create a full + ``state_dict``. + + .. function:: load_state_dict( ) + :noindex: + + Same as the ``torch.optimizer.load_state_dict()`` , except: + + - It first gathers and merges the local ``state_dict``\ s if they are + partial. 
+ - The actual loading happens after the model partition so that each + rank knows its local parameters. + + .. function:: local_state_dict( ) + :noindex: + + Returns the ``state_dict`` that contains the + local optimizer state that belongs to the current \ ``mp_rank``. This + ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is + a partial \ ``state_dict``, which indicates whether the + ``state_dict`` contains elements corresponding to only the current + partition, or to the entire model. + + ​ +.. function:: smp.partition(index) + :noindex: + + **Inputs** + + - ``index`` (int) - The index of the partition. + + A context manager which places all modules defined inside into the + partition with ID ``index``.  The ``index`` argument must be less than + the number of partitions. + + Use ``smp.partition`` to implement manual partitioning. + If ``"auto_partition"`` is ``True``, then the + ``smp.partition`` contexts are ignored. Any module that is not placed in + any ``smp.partition`` context is placed in the + ``default_partition`` defined through the SageMaker Python SDK. + + When ``smp.partition`` contexts are nested, the innermost context + overrides the rest (see the following example). In PyTorch, manual + partitioning should be done inside the module \ ``__init__``, and the + partition assignment applies to the modules that are *created* inside + the ``smp.partition`` context. + + Example: + + .. code:: python + + class Model(torch.nn.Module): +     def __init__(self): +         with smp.partition(1): +             self.child0 = Child0()            # child0 on partition 1 +             with smp.partition(2): +                 self.child1 = Child1()        # child1 on partition 2 +             self.child2 = Child2()            # child2 on partition 1 +         self.child3 = Child3()                # child3 on default_partition + +.. function:: smp.get_world_process_group( ) + :noindex: + + Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all + processes, which can be used with the ``torch.distributed`` API. + Requires ``"ddp": True`` in SageMaker Python SDK parameters. + +.. function:: smp.get_mp_process_group( ) + :noindex: + + Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the + processes in the ``MP_GROUP`` which contains the current process, which + can be used with the \ ``torch.distributed`` API. Requires + ``"ddp": True`` in SageMaker Python SDK parameters. + +.. function:: smp.get_dp_process_group( ) + :noindex: + + Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the + processes in the ``DP_GROUP`` which contains the current process, which + can be used with the \ ``torch.distributed`` API. Requires + ``"ddp": True`` in SageMaker Python SDK parameters. + +.. function:: smp.is_initialized( ) + :noindex: + + Returns ``True`` if ``smp.init`` has already been called for the + process, and ``False`` otherwise. + +.. function::smp.is_tracing( ) + :noindex: + + Returns ``True`` if the current process is running the tracing step, and + ``False`` otherwise. + +.. data:: smp.nn.FusedLayerNorm + :noindex: + + `Apex Fused Layer Norm `__ is currently not + supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` + ``FusedLayerNorm`` and provides the same functionality. This requires + ``apex`` to be installed on the system. + +.. data:: smp.optimizers.FusedNovoGrad + :noindex: + + + `Fused Novo Grad optimizer `__ is + currently not supported by the library. 
``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` + optimizer and provides the same functionality. This requires ``apex`` to + be installed on the system. + +.. data:: smp.optimizers.FusedLamb + :noindex: + + + `FusedLamb optimizer `__ + currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces + ``apex`` ``FusedLamb`` optimizer and provides the same functionality. + This requires ``apex`` to be installed on the system. + +.. data:: smp.amp.GradScaler + :noindex: + + `Torch AMP Gradscaler `__ + currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces + ``torch.amp.GradScaler`` and provides the same functionality. + +.. _pytorch_saving_loading: + :noindex: + +APIs for Saving and Loading +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. function:: smp.save( ) + :noindex: + + Saves an object. This operation is similar to ``torch.save()``, except + it has an additional keyword argument, ``partial``, and accepts only + string type for the argument ``f`` (file). If ``partial=True``, each + ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` + index to your saved file. + + **Parameters** + + - ``obj`` (dict): A saved object. + - ``f`` (str): A string containing a file name. + - ``partial`` (bool, default= ``True``):  When set to ``True``, each + ``mp_rank`` saves a separate checkpoint file and the library adds an + ``mp_rank`` index to the saved file. If you want to be able to load + and further train a model that you save with ``smp.save()``, you must + set ``partial=True``. + - ``pickle_module`` (picklemodule, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``): + A module used for pickling metadata and objects. + - ``pickle_protocol``  (int, default=2): Can be specified to + override the defaultprotocol. + +.. function:: smp.load( ) + :noindex: + + Loads an object saved with ``smp.save()`` from a file. + + Similar to, `torch.load() `__, + except it has an additional keyword argument, ``partial``, and accepts + only string type for the argument ``f`` (file). If \ ``partial=True``, + then each ``mp_rank`` loads a separate checkpoint file. + + **Parameters** + + - ``f`` (string): A string containing a file name. + - ``map_location`` (function): A function + `torch.device `__, + a string, or a dict specifying how to remap storage locations. + - ``pickle_module`` (pickle module): A module used for unpickling + metadata and objects (has to match the \ ``pickle_module``\ used to + serialize file). + - ``pickle_load_args`` (Python 3 only): Optional keyword arguments + passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``. + - ``partial`` (bool, default= ``True``): When set to ``True``, each + ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``. + Should be used when loading a model trained with the library. + +.. _pytorch_saving_loading_instructions: + :noindex: + +General Instruction For Saving and Loading +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The library can save partial or full checkpoints. + +- For partial checkpoints, each ``mp_rank`` saves its own checkpoint + file with only the parameters that belong to that rank. +- For full checkpoints, the library saves a single checkpoint that contains + entire model parameters. + +When **saving** using ``smp.save()``, each rank only holds its own +parameters. If you want to save the full model, there will be some +communication between the ranks to create the full model. 
If you save +checkpoints often, you should save partial checkpoints for best +performance. + +When **loading** using ``smp.load()``, the library can load either partial or | +full checkpoints or full checkpoints saved by a non-model-parallel model. If you +want to resume training with a non-model-parallel model or do inference, you need +a full checkpoint. + +The following is an example of how you can save and load a checkpoint: + +.. code:: python + + # Original model and optimizer + model = MyModel(...) + optimizer = MyOpt(...) + + # model parallel wrapper + model = smp.DistributedModel(model) + optimizer = smp.DistributedOptimizer(optimizer) + + # To save, always save on dp_rank 0 to avoid data racing + if partial: +     # To save the partial model on each mp rank +     # the library will create `checkpoint.pt_{mprank}` for each mp rank +     if save_partial_model: +         if smp.dp_rank() == 0: +             model_dict = model.local_state_dict() # save the partial model +             opt_dict = optimizer.local_state_dict() # save the partial optimizer state +             smp.save( +                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, +                 f"/checkpoint.pt", +                 partial=True, +             ) + +     # To save the full model +     if save_full_model: +         if smp.dp_rank() == 0: +             model_dict = model.state_dict() # save the full model +             opt_dict = optimizer.state_dict() # save the full optimizer state +             smp.save( +                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, +                 "/checkpoint.pt", +                 partial=False, +             ) + + # To load, load on all ranks. + # The only difference for partial/full loading is the partial flag in smp.load + # Load partial checkpoint + if partial_checkpoint: +    checkpoint = smp.load("/checkpoint.pt", partial=True) +    model.load_state_dict(checkpoint["model_state_dict"]) +    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) + # Load full checkpoint + if full_checkpoint: +    checkpoint = smp.load("/checkpoint.pt", partial=False) +    model.load_state_dict(checkpoint["model_state_dict"]) +    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_tensorflow.rst new file mode 100644 index 0000000000..131fc327ac --- /dev/null +++ b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_tensorflow.rst @@ -0,0 +1,172 @@ +TensorFlow API +============== + +**Supported version: 2.3.1, 2.4.1, 2.5.0** + +**Important**: This API document assumes you use the following import statement in your training scripts. + +.. code:: python + + import smdistributed.modelparallel.tensorflow as smp + +.. tip:: + + Refer to + `Modify a TensorFlow Training Script + `_ + to learn how to use the following API in your TensorFlow training script. + +.. class:: smp.DistributedModel + :noindex: + + A sub-class of the Keras \ ``Model`` class, which defines the model to + be partitioned. Model definition is done by sub-classing + ``smp.DistributedModel`` class, and implementing the ``call()`` method, + in the same way as the Keras model sub-classing API. 
Any operation that + is part of the \ ``smp.DistributedModel.call()`` method is subject to + partitioning, meaning that every operation placed inside executes in + exactly one of the devices (the operations outside run on all devices). + + + Similar to the regular Keras API, the forward pass is done by directly + calling the model object on the input tensors. For example: + + .. code:: python + + predictions = model(inputs)   # model is a smp.DistributedModel object + + However, ``model()`` calls can only be made inside a + ``smp.step``-decorated function. + + The outputs from a ``smp.DistributedModel`` are available in all ranks, + regardless of which rank computed the last operation. + + **Methods:** + + .. function:: save_model(save_path="/opt/ml/model") + :noindex: + + **Inputs** + - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. + + Saves the entire, + unpartitioned model with the latest trained weights to ``save_path`` in + TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which + SageMaker monitors to upload the model artifacts to Amazon S3. + +.. function:: smp.partition(index) + :noindex: + + **Inputs** + + - ``index`` (``int``): The index of the partition. + + A context manager which places all operations defined inside into the + partition whose ID is equal to ``index``. When + ``smp.partition`` contexts are nested, the innermost context overrides + the rest. The ``index`` argument must be smaller than the number of + partitions. + + ``smp.partition`` is used in the manual partitioning API; + if \ ``"auto_partition"`` parameter is set to ``True`` while launching + training, then ``smp.partition`` contexts are ignored. Any operation + that is not placed in any ``smp.partition`` context is placed in the + ``default_partition``, as shown in the following example: + + .. code:: python + + # auto_partition: False + # default_partition: 0 + smp.init() + [...] + x = tf.constant(1.2)                     # placed in partition 0 + with smp.partition(1): +     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 +     with smp.partition(3): +         z = tf.reduce_sum(y)             # placed in partition 3 + + +.. function:: register_post_partition_hook(hook) + :noindex: + + Registers a callable ``hook`` to + be executed after the model is partitioned. This is useful in situations + where an operation needs to be executed after the model partition during + the first call to ``smp.step``, but before the actual execution of the + first forward pass. + + .. code:: python + + @smp.register_post_partition_hook + def test_eager(): + # All statements here will be executed right after partition but before the first forward pass + tf.print("Entered hook through eager context") + +.. class:: smp.CheckpointManager + :noindex: + + + A subclass of TensorFlow + `CheckpointManager `__, + which is used to manage checkpoints. The usage is similar to TensorFlow + ``CheckpointManager``. + + The following returns a ``CheckpointManager`` object. + + .. code:: python + + smp.CheckpointManager(checkpoint, +                       directory="/opt/ml/checkpoints", +                       max_to_keep=None, +                       checkpoint_name="ckpt") + + **Parameters** + + - ``checkpoint``: A `tf.train.Checkpoint + `__ instance + that represents a model checkpoint. + + - ``directory``: (``str``) The path to a directory in which to write + checkpoints. 
A file named "checkpoint" is also written to this + directory (in a human-readable text format) which contains the state + of the ``CheckpointManager``. Defaults to + ``"/opt/ml/checkpoints"``, which is the directory that SageMaker + monitors for uploading the checkpoints to Amazon S3. + - ``max_to_keep`` (``int``): The number of checkpoints to keep. If + ``None``, all checkpoints are kept. + - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. + Defaults to ``"ckpt"``. + + + **Methods:** + + .. function:: save( ) + :noindex: + + Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. + + .. function:: restore( ) + :noindex: + + Restores the latest checkpoint in the specified directory. + Internally uses ``tf.train.CheckpointManager.restore()``. + + + **Examples:** + + .. code:: python + + checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) + ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints + + for inputs in train_ds: +     loss = train_step(inputs) +     # [...] +     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints + + .. code:: python + + for step, inputs in enumerate(train_ds): +     if step == 0: +         ckpt_manager.restore() +     loss = train_step(inputs) diff --git a/doc/api/training/smp_versions/v1_4_0.rst b/doc/api/training/smp_versions/v1_4_0.rst new file mode 100644 index 0000000000..4485ae6a40 --- /dev/null +++ b/doc/api/training/smp_versions/v1_4_0.rst @@ -0,0 +1,12 @@ + +Version 1.4.x +============= + +To use the library, reference the Common API documentation alongside the framework specific API documentation. + +.. toctree:: + :maxdepth: 1 + + v1.4.0/smd_model_parallel_common_api + v1.4.0/smd_model_parallel_pytorch + v1.4.0/smd_model_parallel_tensorflow diff --git a/doc/api/training/smp_versions/v1_5_0.rst b/doc/api/training/smp_versions/v1_5_0.rst new file mode 100644 index 0000000000..c93761efa4 --- /dev/null +++ b/doc/api/training/smp_versions/v1_5_0.rst @@ -0,0 +1,12 @@ + +Version 1.5.x +============= + +To use the library, reference the Common API documentation alongside the framework specific API documentation. + +.. 
toctree:: + :maxdepth: 1 + + v1.5.0/smd_model_parallel_common_api + v1.5.0/smd_model_parallel_pytorch + v1.5.0/smd_model_parallel_tensorflow From 6319bced27046f309a4ff0b018e20d72984bb9ff Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala <89424143+mufaddal-rohawala@users.noreply.github.com> Date: Fri, 31 Dec 2021 13:00:05 -0800 Subject: [PATCH 10/11] fix: fix kmeans test deletion sequence, increment lineage statics (#2815) --- src/sagemaker/session.py | 1 + tests/integ/sagemaker/lineage/conftest.py | 4 ++-- tests/integ/test_kmeans.py | 9 ++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index 828371c6dc..189c9cb308 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -3565,6 +3565,7 @@ def endpoint_from_production_variants( if data_capture_config_dict is not None: config_options["DataCaptureConfig"] = data_capture_config_dict + LOGGER.info("Creating endpoint-config with name %s", name) self.sagemaker_client.create_endpoint_config(**config_options) return self.create_endpoint(endpoint_name=name, config_name=name, tags=tags, wait=wait) diff --git a/tests/integ/sagemaker/lineage/conftest.py b/tests/integ/sagemaker/lineage/conftest.py index 5b814bab5b..bb051b9634 100644 --- a/tests/integ/sagemaker/lineage/conftest.py +++ b/tests/integ/sagemaker/lineage/conftest.py @@ -36,8 +36,8 @@ from tests.integ.sagemaker.lineage.helpers import name, names SLEEP_TIME_SECONDS = 1 -STATIC_PIPELINE_NAME = "SdkIntegTestStaticPipeline15" -STATIC_ENDPOINT_NAME = "SdkIntegTestStaticEndpoint15" +STATIC_PIPELINE_NAME = "SdkIntegTestStaticPipeline16" +STATIC_ENDPOINT_NAME = "SdkIntegTestStaticEndpoint16" @pytest.fixture diff --git a/tests/integ/test_kmeans.py b/tests/integ/test_kmeans.py index c4def3b439..056b068f3b 100644 --- a/tests/integ/test_kmeans.py +++ b/tests/integ/test_kmeans.py @@ -76,11 +76,10 @@ def test_kmeans(sagemaker_session, cpu_instance_type, training_set): for record in result: assert record.label["closest_cluster"] is not None assert record.label["distance_to_cluster"] is not None - - predictor.delete_model() - with pytest.raises(Exception) as exception: - sagemaker_session.sagemaker_client.describe_model(ModelName=model.name) - assert "Could not find model" in str(exception.value) + predictor.delete_model() + with pytest.raises(Exception) as exception: + sagemaker_session.sagemaker_client.describe_model(ModelName=model.name) + assert "Could not find model" in str(exception.value) def test_async_kmeans(sagemaker_session, cpu_instance_type, training_set): From beeabbda1b970b81d79a9f3f11410603b1f33f93 Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala <89424143+mufaddal-rohawala@users.noreply.github.com> Date: Mon, 3 Jan 2022 15:42:03 -0800 Subject: [PATCH 11/11] fix: Increment static lineage pipeline (#2817) --- tests/integ/sagemaker/lineage/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integ/sagemaker/lineage/conftest.py b/tests/integ/sagemaker/lineage/conftest.py index bb051b9634..b6cebdcb61 100644 --- a/tests/integ/sagemaker/lineage/conftest.py +++ b/tests/integ/sagemaker/lineage/conftest.py @@ -36,8 +36,8 @@ from tests.integ.sagemaker.lineage.helpers import name, names SLEEP_TIME_SECONDS = 1 -STATIC_PIPELINE_NAME = "SdkIntegTestStaticPipeline16" -STATIC_ENDPOINT_NAME = "SdkIntegTestStaticEndpoint16" +STATIC_PIPELINE_NAME = "SdkIntegTestStaticPipeline17" +STATIC_ENDPOINT_NAME = "SdkIntegTestStaticEndpoint17" @pytest.fixture