
Commit 0f715e0

feature: Add support for torch_distributed distribution strategy for Trainium instances
1 parent 0914f17 commit 0f715e0

File tree: 7 files changed (+499 −1 lines changed)


doc/frameworks/pytorch/using_pytorch.rst (+97)
@@ -292,7 +292,104 @@ using two ``ml.p4d.24xlarge`` instances:

    pt_estimator.fit("s3://bucket/path/to/training/data")

.. _distributed-pytorch-training-on-trainium:

Distributed PyTorch Training on Trainium
=============================================

SageMaker Training on Trainium instances now supports the ``xla``
package through ``torchrun``. With this, you do not need to manually pass ``RANK``,
``WORLD_SIZE``, ``MASTER_ADDR``, and ``MASTER_PORT``. You can launch the training job using the
:class:`sagemaker.pytorch.estimator.PyTorch` estimator class
with the ``torch_distributed`` option as the distribution strategy.

.. note::

    This ``torch_distributed`` support is available
    in the SageMaker Trainium (trn1) PyTorch Deep Learning Containers starting with v1.11.0.
    To find a complete list of supported versions of PyTorch Neuron, see
    `Neuron Containers <https://github.com/aws/deep-learning-containers/blob/master/available_images.md#neuron-containers>`_
    in the *AWS Deep Learning Containers GitHub repository*.

    SageMaker Debugger and Profiler are currently not supported with Trainium instances.
Adapt Your Training Script
--------------------------

To initialize distributed training in your script, call
`torch.distributed.init_process_group
<https://pytorch.org/docs/master/distributed.html#torch.distributed.init_process_group>`_
with the ``xla`` backend as shown below.

.. code:: python

    import torch.distributed as dist

    dist.init_process_group('xla')

SageMaker takes care of ``'MASTER_ADDR'`` and ``'MASTER_PORT'`` for you via ``torchrun``.

For detailed documentation about modifying your training script for Trainium, see `Multi-worker data-parallel MLP training using torchrun <https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/tutorials/training/mlp.html?highlight=torchrun#multi-worker-data-parallel-mlp-training-using-torchrun>`_ in the *AWS Neuron Documentation*.
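For orientation only, the following is a minimal, hypothetical sketch of such a training script (the model, data, and hyperparameters are placeholders, and it assumes the ``torch_xla`` package shipped in the PyTorch Neuron container):

.. code:: python

    import torch
    import torch.distributed as dist
    import torch.nn as nn
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.xla_backend  # registers the 'xla' process-group backend

    # torchrun (launched for you by SageMaker) sets RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT.
    dist.init_process_group("xla")

    device = xm.xla_device()  # the NeuronCore assigned to this worker
    model = nn.Linear(16, 1).to(device)
    ddp_model = torch.nn.parallel.DistributedDataParallel(model, gradient_as_bucket_view=True)

    optimizer = torch.optim.SGD(ddp_model.parameters(), lr=0.01)
    loss_fn = nn.MSELoss()

    for step in range(10):
        optimizer.zero_grad()
        inputs = torch.randn(8, 16).to(device)
        targets = torch.randn(8, 1).to(device)
        loss = loss_fn(ddp_model(inputs), targets)
        loss.backward()
        optimizer.step()
        xm.mark_step()  # flush the lazily built XLA graph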
**Currently supported backends:**

- ``xla`` for Trainium (Trn1) instances

For up-to-date information on supported backends for Trainium instances, see the `AWS Neuron Documentation <https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html>`_.
Launching a Distributed Training Job
------------------------------------

You can run multi-node distributed PyTorch training jobs on Trainium instances using the
:class:`sagemaker.pytorch.estimator.PyTorch` estimator class.
With ``instance_count=1``, the estimator submits a
single-node training job to SageMaker; with ``instance_count`` greater
than one, a multi-node training job is launched.

With the ``torch_distributed`` option, the SageMaker PyTorch estimator runs a SageMaker
training container for PyTorch Neuron, sets up the environment, and launches
the training job using the ``torchrun`` command on each worker with the given information.

.. note::

    The following examples show how to run a PyTorch training job with ``torch_distributed`` in SageMaker,
    first on one ``ml.trn1.2xlarge`` instance and then on two ``ml.trn1.32xlarge`` instances:
.. code:: python

    from sagemaker.pytorch import PyTorch

    pt_estimator = PyTorch(
        entry_point="train_ptddp.py",
        role="SageMakerRole",
        framework_version="1.11.0",
        py_version="py38",
        instance_count=1,
        instance_type="ml.trn1.2xlarge",
        distribution={
            "torch_distributed": {
                "enabled": True
            }
        }
    )

    pt_estimator.fit("s3://bucket/path/to/training/data")
.. code:: python

    from sagemaker.pytorch import PyTorch

    pt_estimator = PyTorch(
        entry_point="train_ptddp.py",
        role="SageMakerRole",
        framework_version="1.11.0",
        py_version="py38",
        instance_count=2,
        instance_type="ml.trn1.32xlarge",
        distribution={
            "torch_distributed": {
                "enabled": True
            }
        }
    )

    pt_estimator.fit("s3://bucket/path/to/training/data")

*********************
Deploy PyTorch Models

src/sagemaker/fw_utils.py (+147 −1)
@@ -134,6 +134,15 @@

    "1.12.0",
]

TORCH_DISTRIBUTED_SUPPORTED_FRAMEWORK_VERSIONS = [
    "1.11",
    "1.11.0",
]

TRAINIUM_SUPPORTED_DISTRIBUTION_STRATEGIES = [
    "torch_distributed",
]

SMDISTRIBUTED_SUPPORTED_STRATEGIES = ["dataparallel", "modelparallel"]
@@ -701,7 +710,7 @@ def _validate_smdataparallel_args(


def validate_distribution(
-    distribution, instance_groups, framework_name, framework_version, py_version, image_uri, kwargs
+    distribution, instance_groups, framework_name, framework_version, py_version, image_uri, entry_point, kwargs
):
    """Check if distribution strategy is correctly invoked by the user.
@@ -726,6 +735,8 @@ def validate_distribution(

        framework_version (str): A string representing the framework version selected.
        py_version (str): A string representing the python version selected.
        image_uri (str): A string representing a Docker image URI.
        entry_point (str or PipelineVariable): Path (absolute or relative) to the
            Python source file which should be executed as the entry point to training.
        kwargs(dict): Additional kwargs passed to this function

    Returns:
767778
f"Invalid training instance group {train_instance_group.instance_group_name} !"
768779
)
769780
instance_type = train_instance_group.instance_type
781+
validate_distribution_for_instance_type(
782+
instance_type=instance_type,
783+
distribution=distribution,
784+
)
770785
validate_smdistributed(
771786
instance_type=instance_type,
772787
framework_name=framework_name,
@@ -782,6 +797,15 @@ def validate_distribution(
782797
py_version=py_version,
783798
image_uri=image_uri,
784799
)
800+
validate_torch_distributed_distribution(
801+
instance_type=instance_type,
802+
distribution=distribution,
803+
framework_name=framework_name,
804+
framework_version=framework_version,
805+
py_version=py_version,
806+
image_uri=image_uri,
807+
entry_point=entry_point,
808+
)
785809
warn_if_parameter_server_with_multi_gpu(
786810
training_instance_type=instance_type, distribution=distribution
787811
)
@@ -793,6 +817,10 @@ def validate_distribution(
793817
instance_type = renamed_kwargs(
794818
"train_instance_type", "instance_type", kwargs.get("instance_type"), kwargs
795819
)
820+
validate_distribution_for_instance_type(
821+
instance_type=instance_type,
822+
distribution=distribution,
823+
)
796824
validate_smdistributed(
797825
instance_type=instance_type,
798826
framework_name=framework_name,
@@ -808,11 +836,53 @@ def validate_distribution(
808836
py_version=py_version,
809837
image_uri=image_uri,
810838
)
839+
validate_torch_distributed_distribution(
840+
instance_type=instance_type,
841+
distribution=distribution,
842+
framework_name=framework_name,
843+
framework_version=framework_version,
844+
py_version=py_version,
845+
image_uri=image_uri,
846+
entry_point=entry_point,
847+
)
811848
warn_if_parameter_server_with_multi_gpu(
812849
training_instance_type=instance_type, distribution=distribution
813850
)
814851
return distribution
815852

853+
def validate_distribution_for_instance_type(
854+
instance_type, distribution
855+
):
856+
"""Check if the provided distribution strategy is supported for the instance_type
857+
858+
Args:
859+
instance_type (str): A string representing the type of training instance selected.
860+
distribution (dict): A dictionary with information to enable distributed training.
861+
"""
862+
match = re.match(r"^ml[\._]([a-z\d]+)\.?\w*$", instance_type)
863+
err_msg = ""
864+
if match and match[1].startswith("trn"):
865+
keys = distribution.keys()
866+
if len(keys) == 0:
867+
return
868+
elif len(keys) == 1:
869+
distribution_strategy = keys[0]
870+
if distribution_strategy != "torch_distributed":
871+
err_msg += (
872+
f"Provided distribution strategy {distribution_strategy} is not supported for"
873+
" Trainium instances.\n"
874+
"Please specify one of the following supported distribution strategies:"
875+
f" {TRAINIUM_SUPPORTED_DISTRIBUTION_STRATEGIES} \n"
876+
)
877+
elif len(keys) > 1:
878+
err_msg += (
879+
f"Multiple distribution strategies are not supported for Trainium instances.\n"
880+
"Please specify one of the following supported distribution strategies:"
881+
f" {TRAINIUM_SUPPORTED_DISTRIBUTION_STRATEGIES} "
882+
)
883+
884+
if err_msg:
885+
raise ValueError(err_msg)
816886
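For orientation only (this snippet is not part of the commit), the new per-instance-type check could be exercised directly like this; it accepts torch_distributed on a trn1 instance, rejects any other strategy there, and leaves non-Trainium instance types untouched:

    from sagemaker.fw_utils import validate_distribution_for_instance_type

    # Passes: torch_distributed is the only strategy allowed on Trainium (trn1) instances.
    validate_distribution_for_instance_type(
        instance_type="ml.trn1.32xlarge",
        distribution={"torch_distributed": {"enabled": True}},
    )

    # Raises ValueError: pytorchddp is not in TRAINIUM_SUPPORTED_DISTRIBUTION_STRATEGIES.
    validate_distribution_for_instance_type(
        instance_type="ml.trn1.32xlarge",
        distribution={"pytorchddp": {"enabled": True}},
    )

    # No-op: the check only constrains trn* instance types.
    validate_distribution_for_instance_type(
        instance_type="ml.p4d.24xlarge",
        distribution={"pytorchddp": {"enabled": True}},
    )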

def validate_pytorch_distribution(
    distribution, framework_name, framework_version, py_version, image_uri
@@ -870,6 +940,82 @@ def validate_pytorch_distribution(

    if err_msg:
        raise ValueError(err_msg)


def validate_torch_distributed_distribution(
    instance_type,
    distribution,
    framework_name,
    framework_version,
    py_version,
    image_uri,
    entry_point,
):
    """Check if torch_distributed distribution strategy is correctly invoked by the user.

    Args:
        instance_type (str): A string representing the type of training instance selected.
        distribution (dict): A dictionary with information to enable distributed training.
            (Defaults to None if distributed training is not enabled.) For example:

            .. code:: python

                {
                    "torch_distributed": {
                        "enabled": True
                    }
                }
        framework_name (str): A string representing the name of framework selected.
        framework_version (str): A string representing the framework version selected.
        py_version (str): A string representing the python version selected.
        image_uri (str): A string representing a Docker image URI.
        entry_point (str): Path (absolute or relative) to the
            Python source file which should be executed as the entry point to training.

    Raises:
        ValueError: if
            `py_version` is not python3 or
            `framework_version` is not in TORCH_DISTRIBUTED_SUPPORTED_FRAMEWORK_VERSIONS
    """
    if framework_name and framework_name != "pytorch":
        # We need to validate only for the PyTorch framework
        return

    torch_distributed_enabled = False
    if "torch_distributed" in distribution:
        torch_distributed_enabled = distribution.get("torch_distributed").get("enabled", False)
    if not torch_distributed_enabled:
        # A distribution strategy other than torch_distributed is selected
        return

    err_msg = ""
    if not image_uri:
        # ignore framework_version and py_version if image_uri is set;
        # if image_uri is not set, then both are mandatory
        if framework_version not in TORCH_DISTRIBUTED_SUPPORTED_FRAMEWORK_VERSIONS:
            err_msg += (
                f"Provided framework_version {framework_version} is not supported by"
                " torch_distributed.\n"
                "Please specify one of the supported framework versions:"
                f" {TORCH_DISTRIBUTED_SUPPORTED_FRAMEWORK_VERSIONS} \n"
            )
        if "py3" not in py_version:
            err_msg += (
                f"Provided py_version {py_version} is not supported by torch_distributed.\n"
                "Please specify py_version>=py3"
            )

    # Check instance compatibility
    match = re.match(r"^ml[\._]([a-z\d]+)\.?\w*$", instance_type)
    if match and match[1].startswith("trn"):
        return
    else:
        err_msg += (
            "torch_distributed is currently supported only for Trainium instances."
            " Please refer to https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training"
            " for information regarding distributed training on non-Trainium instances.\n"
        )

    # Check entry point type
    if not entry_point.endswith(".py"):
        err_msg += (
            "Unsupported entry point type for torch_distributed.\n"
            "Only python programs (*.py) are supported."
        )

    if err_msg:
        raise ValueError(err_msg)
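To make the control flow above concrete, here is a hypothetical call sequence (not part of the diff); the first call returns without error, while the second raises because the instance type is not a Trainium instance:

    from sagemaker.fw_utils import validate_torch_distributed_distribution

    # Valid: Trainium instance, supported framework/py versions, python entry point.
    validate_torch_distributed_distribution(
        instance_type="ml.trn1.32xlarge",
        distribution={"torch_distributed": {"enabled": True}},
        framework_name="pytorch",
        framework_version="1.11.0",
        py_version="py38",
        image_uri=None,
        entry_point="train_ptddp.py",
    )

    # Raises ValueError: torch_distributed is currently supported only for Trainium instances.
    validate_torch_distributed_distribution(
        instance_type="ml.p3.2xlarge",
        distribution={"torch_distributed": {"enabled": True}},
        framework_name="pytorch",
        framework_version="1.11.0",
        py_version="py38",
        image_uri=None,
        entry_point="train_ptddp.py",
    )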

def python_deprecation_warning(framework, latest_supported_version):
    """Placeholder docstring"""

src/sagemaker/pytorch/estimator.py (+21)
@@ -39,6 +39,7 @@ class PyTorch(Framework):

    _framework_name = "pytorch"
    LAUNCH_PYTORCH_DDP_ENV_NAME = "sagemaker_pytorch_ddp_enabled"
    LAUNCH_TORCH_DISTRIBUTED_ENV_NAME = "sagemaker_torch_distributed_enabled"
    INSTANCE_TYPE_ENV_NAME = "sagemaker_instance_type"

    def __init__(
@@ -167,6 +168,17 @@ def __init__(

                To learn more, see `Distributed PyTorch Training
                <https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training>`_.

                **To enable Torch Distributed (Trainium Instances):**

                .. code:: python

                    {
                        "torch_distributed": {
                            "enabled": True
                        }
                    }

                To learn more, see `Distributed PyTorch Training on Trainium
                <https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training-on-trainium>`_.

                **To enable MPI:**
@@ -227,6 +239,7 @@ def __init__(

            framework_version,
            py_version,
            image_uri,
            entry_point,
            kwargs,
        )
@@ -242,13 +255,21 @@ def _pytorch_distribution_configuration(self, distribution):

        """
        distribution_config = {}
        pytorch_ddp_enabled = False
        torch_distributed_enabled = False

        if "pytorchddp" in distribution:
            pytorch_ddp_enabled = distribution.get("pytorchddp").get("enabled", False)
        elif "torch_distributed" in distribution:
            torch_distributed_enabled = distribution.get("torch_distributed").get("enabled", False)

        if pytorch_ddp_enabled:
            distribution_config[self.LAUNCH_PYTORCH_DDP_ENV_NAME] = pytorch_ddp_enabled
            if self.instance_type is not None:
                distribution_config[self.INSTANCE_TYPE_ENV_NAME] = self.instance_type
        elif torch_distributed_enabled:
            distribution_config[self.LAUNCH_TORCH_DISTRIBUTED_ENV_NAME] = torch_distributed_enabled
            if self.instance_type is not None:
                distribution_config[self.INSTANCE_TYPE_ENV_NAME] = self.instance_type
        else:
            distribution_config = self._distribution_configuration(distribution=distribution)
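As an illustration only (not part of the diff), with the changes above an estimator configured with distribution={"torch_distributed": {"enabled": True}} and instance_type="ml.trn1.32xlarge" would yield a distribution configuration along the lines of:

    {
        "sagemaker_torch_distributed_enabled": True,
        "sagemaker_instance_type": "ml.trn1.32xlarge",
    }

which the estimator folds into the training job's hyperparameters so that the container launches the training script through torchrun.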

tests/conftest.py (+11)
@@ -447,6 +447,17 @@ def pytorch_ddp_framework_version(request):

    return request.param


@pytest.fixture(scope="module")
def torch_distributed_py_version():
    return "py3"


@pytest.fixture(scope="module", params=["1.11.0"])
def torch_distributed_framework_version(request):
    return request.param
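A sketch (not part of this commit) of how a test might consume these fixtures; the test name, entry point, and assertion here are purely illustrative:

    def test_torch_distributed_estimator_config(
        sagemaker_session, torch_distributed_framework_version, torch_distributed_py_version
    ):
        from sagemaker.pytorch import PyTorch

        estimator = PyTorch(
            entry_point="train_ptddp.py",  # placeholder script path
            role="SageMakerRole",
            framework_version=torch_distributed_framework_version,
            py_version=torch_distributed_py_version,
            instance_count=1,
            instance_type="ml.trn1.2xlarge",
            sagemaker_session=sagemaker_session,
            distribution={"torch_distributed": {"enabled": True}},
        )

        assert estimator.distribution == {"torch_distributed": {"enabled": True}}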
@pytest.fixture(scope="session")
def cpu_instance_type(sagemaker_session, request):
    region = sagemaker_session.boto_session.region_name
