
Commit 1ad75c9

beniericpintaoz-aws authored and committed Dec 4, 2024
Simplify Config Class Names and DistributedRunner structures (#1573)
1 parent ce55d45 commit 1ad75c9


22 files changed: +693, -413 lines changed

 

‎.gitignore

Lines changed: 2 additions & 2 deletions
@@ -33,8 +33,8 @@ env/
3333
*.html
3434
**/_repack_script_launcher.sh
3535
src/sagemaker/modules/train/container_drivers/sm_train.sh
36-
src/sagemaker/modules/train/container_drivers/sourcecodeconfig.json
37-
src/sagemaker/modules/train/container_drivers/distribution.json
36+
src/sagemaker/modules/train/container_drivers/sourcecode.json
37+
src/sagemaker/modules/train/container_drivers/distributed_runner.json
3838
tests/data/**/_repack_model.py
3939
tests/data/experiment/sagemaker-dev-1.0.tar.gz
4040
src/sagemaker/serve/tmp_workspace

‎src/sagemaker/modules/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -16,3 +16,7 @@
1616
from sagemaker_core.main.utils import logger as sagemaker_core_logger
1717

1818
logger = sagemaker_core_logger
19+
20+
from sagemaker.modules.train.model_trainer import ( # noqa: F401 E402 # pylint: disable=C0413
21+
ModelTrainer,
22+
)

‎src/sagemaker/modules/configs.py

Lines changed: 15 additions & 111 deletions
@@ -21,7 +21,7 @@
2121

2222
from __future__ import absolute_import
2323

24-
from typing import Optional, Union, Dict, Any, List
24+
from typing import Optional, Union
2525
from pydantic import BaseModel, model_validator
2626

2727
import sagemaker_core.shapes as shapes
@@ -54,15 +54,10 @@
5454
CheckpointConfig,
5555
)
5656

57-
from sagemaker.modules import logger
5857
from sagemaker.modules.utils import convert_unassigned_to_none
5958

6059
__all__ = [
61-
"SourceCodeConfig",
62-
"TorchDistributionConfig",
63-
"MPIDistributionConfig",
64-
"SMDistributedSettings",
65-
"DistributionConfig",
60+
"SourceCode",
6661
"StoppingCondition",
6762
"RetryStrategy",
6863
"OutputDataConfig",
@@ -87,107 +82,16 @@
8782
"InstanceGroup",
8883
"TensorBoardOutputConfig",
8984
"CheckpointConfig",
90-
"ComputeConfig",
91-
"NetworkingConfig",
85+
"Compute",
86+
"Networking",
9287
"InputData",
9388
]
9489

9590

96-
class SMDistributedSettings(BaseModel):
97-
"""SMDistributedSettings.
91+
class SourceCode(BaseModel):
92+
"""SourceCode.
9893
99-
The SMDistributedSettings is used to configure distributed training when
100-
using the smdistributed library.
101-
102-
Attributes:
103-
enable_dataparallel (Optional[bool]):
104-
Whether to enable data parallelism.
105-
enable_modelparallel (Optional[bool]):
106-
Whether to enable model parallelism.
107-
modelparallel_parameters (Optional[Dict[str, Any]]):
108-
The parameters for model parallelism.
109-
"""
110-
111-
enable_dataparallel: Optional[bool] = False
112-
enable_modelparallel: Optional[bool] = False
113-
modelparallel_parameters: Optional[Dict[str, Any]] = None
114-
115-
116-
class DistributionConfig(BaseModel):
117-
"""Base class for distribution configurations."""
118-
119-
_distribution_type: str
120-
121-
122-
class TorchDistributionConfig(DistributionConfig):
123-
"""TorchDistributionConfig.
124-
125-
The TorchDistributionConfig uses `torchrun` or `torch.distributed.launch` in the backend to
126-
launch distributed training.
127-
128-
SMDistributed Library Information:
129-
- `TorchDistributionConfig` can be used for SMModelParallel V2.
130-
- For SMDataParallel or SMModelParallel V1, it is recommended to use the
131-
`MPIDistributionConfig.`
132-
133-
134-
Attributes:
135-
smdistributed_settings (Optional[SMDistributedSettings]):
136-
The settings for smdistributed library.
137-
process_count_per_node (int):
138-
The number of processes to run on each node in the training job.
139-
Will default to the number of CPUs or GPUs available in the container.
140-
"""
141-
142-
_distribution_type: str = "torch_distributed"
143-
144-
smdistributed_settings: Optional[SMDistributedSettings] = None
145-
process_count_per_node: Optional[int] = None
146-
147-
@model_validator(mode="after")
148-
def _validate_model(cls, model): # pylint: disable=E0213
149-
"""Validate the model."""
150-
if (
151-
getattr(model, "smddistributed_settings", None)
152-
and model.smddistributed_settings.enable_dataparallel
153-
):
154-
logger.warning(
155-
"For smdistributed data parallelism, it is recommended to use "
156-
+ "MPIDistributionConfig."
157-
)
158-
return model
159-
160-
161-
class MPIDistributionConfig(DistributionConfig):
162-
"""MPIDistributionConfig.
163-
164-
The MPIDistributionConfig uses `mpirun` in the backend to launch distributed training.
165-
166-
SMDistributed Library Information:
167-
- `MPIDistributionConfig` can be used for SMDataParallel and SMModelParallel V1.
168-
- For SMModelParallel V2, it is recommended to use the `TorchDistributionConfig`.
169-
170-
Attributes:
171-
smdistributed_settings (Optional[SMDistributedSettings]):
172-
The settings for smdistributed library.
173-
process_count_per_node (int):
174-
The number of processes to run on each node in the training job.
175-
Will default to the number of CPUs or GPUs available in the container.
176-
mpi_additional_options (Optional[str]):
177-
The custom MPI options to use for the training job.
178-
"""
179-
180-
_distribution_type: str = "mpi"
181-
182-
smdistributed_settings: Optional[SMDistributedSettings] = None
183-
process_count_per_node: Optional[int] = None
184-
mpi_additional_options: Optional[List[str]] = None
185-
186-
187-
class SourceCodeConfig(BaseModel):
188-
"""SourceCodeConfig.
189-
190-
This config allows the user to specify the source code location, dependencies,
94+
The SourceCode class allows the user to specify the source code location, dependencies,
19195
entry script, or commands to be executed in the training job container.
19296
19397
Attributes:
@@ -210,10 +114,10 @@ class SourceCodeConfig(BaseModel):
210114
command: Optional[str] = None
211115

212116

213-
class ComputeConfig(shapes.ResourceConfig):
214-
"""ComputeConfig.
117+
class Compute(shapes.ResourceConfig):
118+
"""Compute.
215119
216-
The ComputeConfig is a subclass of `sagemaker_core.shapes.ResourceConfig`
120+
The Compute class is a subclass of `sagemaker_core.shapes.ResourceConfig`
217121
and allows the user to specify the compute resources for the training job.
218122
219123
Attributes:
@@ -245,7 +149,7 @@ class ComputeConfig(shapes.ResourceConfig):
245149
enable_managed_spot_training: Optional[bool] = None
246150

247151
@model_validator(mode="after")
248-
def _model_validator(self) -> "ComputeConfig":
152+
def _model_validator(self) -> "Compute":
249153
"""Convert Unassigned values to None."""
250154
return convert_unassigned_to_none(self)
251155

@@ -259,10 +163,10 @@ def _to_resource_config(self) -> shapes.ResourceConfig:
259163
return shapes.ResourceConfig(**filtered_dict)
260164

261165

262-
class NetworkingConfig(shapes.VpcConfig):
263-
"""NetworkingConfig.
166+
class Networking(shapes.VpcConfig):
167+
"""Networking.
264168
265-
The NetworkingConifg is a subclass of `sagemaker_core.shapes.VpcConfig ` and
169+
The Networking class is a subclass of `sagemaker_core.shapes.VpcConfig ` and
266170
allows the user to specify the networking configuration for the training job.
267171
268172
Attributes:
@@ -290,7 +194,7 @@ class NetworkingConfig(shapes.VpcConfig):
290194
enable_inter_container_traffic_encryption: Optional[bool] = None
291195

292196
@model_validator(mode="after")
293-
def _model_validator(self) -> "NetworkingConfig":
197+
def _model_validator(self) -> "Networking":
294198
"""Convert Unassigned values to None."""
295199
return convert_unassigned_to_none(self)
296200
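For context on the rename (SourceCodeConfig -> SourceCode, ComputeConfig -> Compute, NetworkingConfig -> Networking), a minimal construction sketch using the new class names; the field values are illustrative and mirror the example notebook later in this commit, and the objects are passed to ModelTrainer via the renamed `source_code` and `compute` parameters.

    # Minimal sketch using the renamed config classes (values are illustrative).
    from sagemaker.modules.configs import SourceCode, Compute

    source_code = SourceCode(
        source_dir="basic-script-mode",      # directory containing the training code
        requirements="requirements.txt",     # optional pip requirements to install
        entry_script="custom_script.py",     # script executed inside the container
    )

    compute = Compute(
        instance_type="ml.p4d.24xlarge",
        instance_count=2,
        volume_size_in_gb=96,
        keep_alive_period_in_seconds=3600,   # warm-pool keep-alive, as in the notebook
    )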

‎src/sagemaker/modules/constants.py

Lines changed: 2 additions & 2 deletions
@@ -25,8 +25,8 @@
2525
os.path.dirname(os.path.abspath(__file__)), "train/container_drivers"
2626
)
2727

28-
SOURCE_CODE_CONFIG_JSON = "sourcecodeconfig.json"
29-
DISTRIBUTION_JSON = "distribution.json"
28+
SOURCE_CODE_JSON = "sourcecode.json"
29+
DISTRIBUTED_RUNNER_JSON = "distributed_runner.json"
3030
TRAIN_SCRIPT = "sm_train.sh"
3131

3232
DEFAULT_CONTAINER_ENTRYPOINT = ["/bin/bash"]

‎src/sagemaker/modules/distributed.py

Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
"""Distributed module."""
14+
from __future__ import absolute_import
15+
16+
from typing import Optional, Dict, Any, List
17+
from pydantic import BaseModel, PrivateAttr
18+
19+
20+
class DistributedRunner(BaseModel):
21+
"""Base class for DistributedRunner Class"""
22+
23+
_type: str = PrivateAttr()
24+
25+
def model_dump(self, *args, **kwargs):
26+
"""Dump the model to a dictionary."""
27+
result = super().model_dump(*args, **kwargs)
28+
result["_type"] = self._type
29+
return result
30+
31+
32+
class Torchrun(DistributedRunner):
33+
"""TorchDistribution.
34+
35+
The TorchDistribution runner uses `torchrun` or `torch.distributed.launch` in the backend to
36+
launch distributed training.
37+
38+
Attributes:
39+
process_count_per_node (int):
40+
The number of processes to run on each node in the training job.
41+
Will default to the number of GPUs available in the container.
42+
"""
43+
44+
_type: str = PrivateAttr(default="torchrun")
45+
46+
process_count_per_node: Optional[int] = None
47+
48+
49+
class TorchrunSMP(DistributedRunner):
50+
"""TorchrunSMP.
51+
52+
The TorchrunSMP runner uses `torchrun` or `torch.distributed.launch` in the backend
53+
to launch distributed training. This strategy is used for a PyTorch job using the SageMaker
54+
Model Parallelism library v2. For more information on the model parallelism parameters, see:
55+
https://docs.aws.amazon.com/sagemaker/latest/dg/distributed-model-parallel-v2-reference.html#distributed-model-parallel-v2-reference-init-config
56+
57+
Attributes:
58+
process_count_per_node (int):
59+
The number of processes to run on each node in the training job.
60+
Will default to the number of GPUs available in the container.
61+
hybrid_shard_degree (Optional[int]):
62+
Specifies a sharded parallelism degree for the model.
63+
sm_activation_offloading (Optional[bool]):
64+
Specifies whether to enable the SMP activation offloading implementation.
65+
activation_loading_horizon (Optional[int]):
66+
An integer specifying the activation offloading horizon type for FSDP. This is the
67+
maximum number of checkpointed or offloaded layers whose inputs can be in the GPU
68+
memory simultaneously.
69+
fsdp_cache_flush_warnings (Optional[bool]):
70+
Detects and warns if cache flushes happen in the PyTorch memory manager, because they
71+
can degrade computational performance.
72+
allow_empty_shards (Optional[bool]):
73+
Whether to allow empty shards when sharding tensors if tensor is not divisible. This is
74+
an experimental fix for crash during checkpointing in certain scenarios. Disabling this
75+
falls back to the original PyTorch behavior.
76+
tensor_parallel_degree (Optional[int]):
77+
Specifies a tensor parallelism degree. The value must be between 1 and world_size.
78+
context_parallel_degree (Optional[int]):
79+
Specifies the context parallelism degree. The value must be between 1 and world_size ,
80+
and must be <= hybrid_shard_degree.
81+
expert_parallel_degree (Optional[int]):
82+
Specifies a expert parallelism degree. The value must be between 1 and world_size.
83+
random_seed (Optional[int]):
84+
A seed number for the random operations in distributed modules by SMP tensor
85+
parallelism or expert parallelism.
86+
"""
87+
88+
_type: str = PrivateAttr(default="torchrun")
89+
90+
process_count_per_node: Optional[int] = None
91+
hybrid_shard_degree: Optional[int] = None
92+
sm_activation_offloading: Optional[bool] = None
93+
activation_loading_horizon: Optional[int] = None
94+
fsdp_cache_flush_warnings: Optional[bool] = None
95+
allow_empty_shards: Optional[bool] = None
96+
tensor_parallel_degree: Optional[int] = None
97+
context_parallel_degree: Optional[int] = None
98+
expert_parallel_degree: Optional[int] = None
99+
random_seed: Optional[int] = None
100+
101+
def _to_mp_parameters_dict(self) -> Dict[str, Any]:
102+
"""Convert to a dictionary of MP parameters."""
103+
mp_parameters = self.model_dump(exclude_none=True)
104+
mp_parameters.pop("_type")
105+
if mp_parameters.get("process_count_per_node") is not None:
106+
mp_parameters.pop("process_count_per_node")
107+
return mp_parameters
108+
109+
110+
class MPI(DistributedRunner):
111+
"""MPI.
112+
113+
The MPI runner uses `mpirun` in the backend to launch distributed training.
114+
115+
Attributes:
116+
process_count_per_node (int):
117+
The number of processes to run on each node in the training job.
118+
Will default to the number of GPUs available in the container.
119+
mpi_additional_options (Optional[str]):
120+
The custom MPI options to use for the training job.
121+
"""
122+
123+
_type: str = PrivateAttr(default="mpi")
124+
125+
process_count_per_node: Optional[int] = None
126+
mpi_additional_options: Optional[List[str]] = None
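A hedged usage sketch of the new DistributedRunner subclasses defined above; the option values are illustrative, and the shape of model_dump() follows from the override in the base class.

    from sagemaker.modules.distributed import Torchrun, TorchrunSMP, MPI

    # Plain torchrun-based launch.
    torchrun = Torchrun(process_count_per_node=8)

    # torchrun with SageMaker Model Parallelism v2 options.
    smp = TorchrunSMP(
        hybrid_shard_degree=3,
        sm_activation_offloading=True,
        tensor_parallel_degree=5,
    )

    # mpirun-based launch with extra mpirun flags.
    mpi = MPI(
        process_count_per_node=8,
        mpi_additional_options=["-x", "MASTER_ADDR=algo-1", "-x", "MASTER_PORT=7777"],
    )

    # model_dump() appends the private _type marker, which is what ends up in the
    # distributed_runner.json consumed by the container drivers.
    print(torchrun.model_dump(exclude_none=True))
    # -> {'process_count_per_node': 8, '_type': 'torchrun'}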

‎src/sagemaker/modules/templates.py

Lines changed: 8 additions & 8 deletions
@@ -19,13 +19,13 @@
1919
eval $CMD
2020
"""
2121

22-
EXECUTE_PYTORCH_DRIVER = """
23-
echo "Running PyTorch training driver"
24-
$SM_PYTHON_CMD /opt/ml/input/data/sm_drivers/pytorch_driver.py
22+
EXEUCTE_TORCHRUN_DRIVER = """
23+
echo "Running Torchrun driver"
24+
$SM_PYTHON_CMD /opt/ml/input/data/sm_drivers/torchrun_driver.py
2525
"""
2626

2727
EXECUTE_MPI_DRIVER = """
28-
echo "Running MPI training driver"
28+
echo "Running MPI driver"
2929
$SM_PYTHON_CMD /opt/ml/input/data/sm_drivers/mpi_driver.py
3030
"""
3131

@@ -73,12 +73,12 @@
7373
cat /opt/ml/input/config/inputdataconfig.json
7474
echo
7575
76-
echo "/opt/ml/input/data/sm_drivers/sourcecodeconfig.json"
77-
cat /opt/ml/input/data/sm_drivers/sourcecodeconfig.json
76+
echo "/opt/ml/input/data/sm_drivers/sourcecode.json"
77+
cat /opt/ml/input/data/sm_drivers/sourcecode.json
7878
echo
7979
80-
echo "/opt/ml/input/data/sm_drivers/distribution.json"
81-
cat /opt/ml/input/data/sm_drivers/distribution.json
80+
echo "/opt/ml/input/data/sm_drivers/distributed_runner.json"
81+
cat /opt/ml/input/data/sm_drivers/distributed_runner.json
8282
echo
8383
8484
echo "Setting up environment variables"

‎src/sagemaker/modules/testing_notebooks/base_model_trainer.ipynb

Lines changed: 41 additions & 22 deletions
@@ -27,16 +27,16 @@
2727
"outputs": [],
2828
"source": [
2929
"from sagemaker.modules.train import ModelTrainer\n",
30-
"from sagemaker.modules.configs import SourceCodeConfig\n",
30+
"from sagemaker.modules.configs import SourceCode\n",
3131
"\n",
3232
"pytorch_image = \"763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py310\"\n",
3333
"\n",
34-
"source_code_config = SourceCodeConfig(\n",
34+
"source_code = SourceCode(\n",
3535
" command=\"echo 'Hello World' && env\",\n",
3636
")\n",
3737
"model_trainer = ModelTrainer(\n",
3838
" training_image=pytorch_image,\n",
39-
" source_code_config=source_code_config,\n",
39+
" source_code=source_code,\n",
4040
")"
4141
]
4242
},
@@ -70,11 +70,11 @@
7070
"outputs": [],
7171
"source": [
7272
"from sagemaker.modules.train import ModelTrainer\n",
73-
"from sagemaker.modules.configs import SourceCodeConfig\n",
73+
"from sagemaker.modules.configs import SourceCode\n",
7474
"\n",
7575
"pytorch_image = \"763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py310\"\n",
7676
"\n",
77-
"source_code_config = SourceCodeConfig(\n",
77+
"source_code = SourceCode(\n",
7878
" source_dir=\"basic-script-mode\",\n",
7979
" command=\"python custom_script.py\",\n",
8080
")\n",
@@ -89,7 +89,7 @@
8989
"\n",
9090
"model_trainer = ModelTrainer(\n",
9191
" training_image=pytorch_image,\n",
92-
" source_code_config=source_code_config,\n",
92+
" source_code=source_code,\n",
9393
" hyperparameters=hyperparameters,\n",
9494
" environment=env_vars,\n",
9595
")\n",
@@ -117,17 +117,17 @@
117117
"metadata": {},
118118
"outputs": [],
119119
"source": [
120-
"from sagemaker.modules.configs import SourceCodeConfig\n",
120+
"from sagemaker.modules.configs import SourceCode\n",
121121
"\n",
122-
"source_code_config = SourceCodeConfig(\n",
122+
"source_code = SourceCode(\n",
123123
" source_dir=\"basic-script-mode\",\n",
124124
" requirements=\"requirements.txt\",\n",
125125
" entry_script=\"custom_script.py\",\n",
126126
")\n",
127127
"\n",
128128
"model_trainer = ModelTrainer(\n",
129129
" training_image=pytorch_image,\n",
130-
" source_code_config=source_code_config,\n",
130+
" source_code=source_code,\n",
131131
")"
132132
]
133133
},
@@ -296,7 +296,7 @@
296296
"outputs": [],
297297
"source": [
298298
"from sagemaker.modules.train import ModelTrainer\n",
299-
"from sagemaker.modules.configs import ComputeConfig, SourceCodeConfig, InputData\n",
299+
"from sagemaker.modules.configs import Compute, SourceCode, InputData\n",
300300
"\n",
301301
"env = {}\n",
302302
"env[\"FI_PROVIDER\"] = \"efa\"\n",
@@ -307,10 +307,11 @@
307307
"env[\"FI_EFA_USE_DEVICE_RDMA\"] = \"1\"\n",
308308
"env[\"RDMAV_FORK_SAFE\"] = \"1\"\n",
309309
"\n",
310-
"compute_config = ComputeConfig(\n",
310+
"compute = Compute(\n",
311311
" instance_count=2,\n",
312312
" instance_type=\"ml.p4d.24xlarge\",\n",
313313
" volume_size_in_gb=96,\n",
314+
" keep_alive_period_in_seconds=3600\n",
314315
")\n",
315316
"\n",
316317
"hugging_face_image = \"763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04\"\n",
@@ -335,7 +336,7 @@
335336
"metadata": {},
336337
"outputs": [],
337338
"source": [
338-
"source_code_config = SourceCodeConfig(\n",
339+
"source_code = SourceCode(\n",
339340
" source_dir=\"distributed-training/scripts\",\n",
340341
" requirements=\"requirements.txt\",\n",
341342
" command=\"torchrun --nnodes 2 \\\n",
@@ -348,10 +349,10 @@
348349
"\n",
349350
"model_trainer = ModelTrainer(\n",
350351
" training_image=hugging_face_image,\n",
351-
" compute_config=compute_config,\n",
352+
" compute=compute,\n",
352353
" environment=env,\n",
353354
" hyperparameters=hyperparameters,\n",
354-
" source_code_config=source_code_config,\n",
355+
" source_code=source_code,\n",
355356
")"
356357
]
357358
},
@@ -365,7 +366,7 @@
365366
" channel_name=\"dataset\",\n",
366367
" data_source=training_input_path,\n",
367368
")\n",
368-
"model_trainer.train(input_data_config=[test_data])"
369+
"model_trainer.train(input_data_config=[test_data], wait=False)"
369370
]
370371
},
371372
{
@@ -383,13 +384,18 @@
383384
"source": [
384385
"from sagemaker.modules.train import ModelTrainer\n",
385386
"from sagemaker.modules.configs import (\n",
386-
" ComputeConfig, SourceCodeConfig, TorchDistributionConfig, InputData\n",
387+
" Compute, SourceCode, InputData\n",
388+
")\n",
389+
"from sagemaker.modules.distributed import (\n",
390+
" Torchrun,\n",
391+
" MPI\n",
387392
")\n",
388393
"\n",
389-
"compute_config = ComputeConfig(\n",
394+
"compute = Compute(\n",
390395
" instance_count=2,\n",
391396
" instance_type=\"ml.p4d.24xlarge\",\n",
392397
" volume_size_in_gb=96,\n",
398+
" keep_alive_period_in_seconds=3600\n",
393399
")\n",
394400
"\n",
395401
"hugging_face_image = \"763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04\"\n",
@@ -414,18 +420,31 @@
414420
"metadata": {},
415421
"outputs": [],
416422
"source": [
417-
"source_code_config = SourceCodeConfig(\n",
423+
"source_code = SourceCode(\n",
418424
" source_dir=\"distributed-training/scripts\",\n",
419425
" requirements=\"requirements.txt\",\n",
420426
" entry_script=\"run_clm_no_trainer.py\",\n",
421427
")\n",
422428
"\n",
429+
"# Run using Torchrun\n",
430+
"torchrun = Torchrun()\n",
431+
"\n",
432+
"# Run using MPI\n",
433+
"mpi = MPI(\n",
434+
" mpi_additional_options=[\n",
435+
" \"-x\",\n",
436+
" \"MASTER_ADDR=algo-1\",\n",
437+
" \"-x\",\n",
438+
" \"MASTER_PORT=7777\",\n",
439+
" ]\n",
440+
")\n",
441+
"\n",
423442
"model_trainer = ModelTrainer(\n",
424443
" training_image=hugging_face_image,\n",
425-
" compute_config=compute_config,\n",
444+
" compute=compute,\n",
426445
" hyperparameters=hyperparameters,\n",
427-
" source_code_config=source_code_config,\n",
428-
" distribution_config=TorchDistributionConfig(),\n",
446+
" source_code=source_code,\n",
447+
" distributed_runner=mpi,\n",
429448
")"
430449
]
431450
},
@@ -439,7 +458,7 @@
439458
" channel_name=\"dataset\",\n",
440459
" data_source=training_input_path,\n",
441460
")\n",
442-
"model_trainer.train(input_data_config=[test_data])"
461+
"model_trainer.train(input_data_config=[test_data], wait=False)"
443462
]
444463
},
445464
{
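As a compact summary of the renamed ModelTrainer arguments exercised in the notebook cells above (compute_config -> compute, source_code_config -> source_code, distribution_config -> distributed_runner), a hedged sketch; the source paths are illustrative.

    from sagemaker.modules.train import ModelTrainer
    from sagemaker.modules.configs import Compute, SourceCode
    from sagemaker.modules.distributed import Torchrun

    model_trainer = ModelTrainer(
        training_image="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py310",
        compute=Compute(instance_count=2, instance_type="ml.p4d.24xlarge"),
        source_code=SourceCode(source_dir="scripts", entry_script="train.py"),  # illustrative paths
        distributed_runner=Torchrun(),
    )
    # wait=False returns without streaming logs, as in the notebook cells above:
    # model_trainer.train(input_data_config=[...], wait=False)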

‎src/sagemaker/modules/train/container_drivers/mpi_driver.py

Lines changed: 9 additions & 10 deletions
@@ -14,12 +14,13 @@
1414
from __future__ import absolute_import
1515

1616
import os
17+
import sys
1718
import json
1819

1920
from utils import (
2021
logger,
21-
read_source_code_config_json,
22-
read_distribution_json,
22+
read_source_code_json,
23+
read_distributed_runner_json,
2324
get_process_count,
2425
execute_commands,
2526
write_failure_file,
@@ -55,9 +56,8 @@ def main():
5556
5. Exit
5657
5758
"""
58-
source_code_config = read_source_code_config_json()
59-
distribution = read_distribution_json()
60-
sm_distributed_settings = distribution.get("smdistributed_settings", {})
59+
source_code = read_source_code_json()
60+
distribution = read_distributed_runner_json()
6161

6262
sm_current_host = os.environ["SM_CURRENT_HOST"]
6363
sm_hosts = json.loads(os.environ["SM_HOSTS"])
@@ -83,18 +83,17 @@ def main():
8383
host_count=host_count,
8484
host_list=host_list,
8585
num_processes=process_count,
86-
smdataparallel_enabled=sm_distributed_settings.get("enable_dataparallel", False),
87-
smmodelparallel_enabled=sm_distributed_settings.get("enable_modelparallel", False),
8886
additional_options=distribution.get("mpi_additional_options", []),
89-
entry_script_path=os.path.join(USER_CODE_PATH, source_code_config["entry_script"]),
87+
entry_script_path=os.path.join(USER_CODE_PATH, source_code["entry_script"]),
9088
)
9189

9290
logger.info(f"Executing command: {mpi_command}")
9391
exit_code, error_traceback = execute_commands(mpi_command)
92+
write_status_file_to_workers(worker_hosts)
93+
9494
if exit_code != 0:
9595
write_failure_file(error_traceback)
96-
97-
write_status_file_to_workers(worker_hosts)
96+
sys.exit(exit_code)
9897

9998

10099
if __name__ == "__main__":

‎src/sagemaker/modules/train/container_drivers/mpi_utils.py

Lines changed: 1 addition & 35 deletions
@@ -16,7 +16,6 @@
1616
import os
1717
import time
1818
import subprocess
19-
import json
2019

2120
from typing import List
2221

@@ -29,7 +28,7 @@
2928
def _write_status_file(host: str, status_file: str) -> bool:
3029
"""Write the status file to the provided host."""
3130
try:
32-
logger.info(f"Start writing mpirun finished status to {host}")
31+
logger.info("Writing finished status file (%s) to %s", status_file, host)
3332
subprocess.run(
3433
["ssh", host, "touch", f"{status_file}"],
3534
capture_output=True,
@@ -188,8 +187,6 @@ def get_mpirun_command(
188187
host_count: int,
189188
host_list: List[str],
190189
num_processes: int,
191-
smdataparallel_enabled: bool,
192-
smmodelparallel_enabled: bool,
193190
additional_options: List[str],
194191
entry_script_path: str,
195192
):
@@ -258,37 +255,6 @@ def get_mpirun_command(
258255
if credential in os.environ:
259256
mpirun_command.extend(["-x", credential])
260257

261-
if smdataparallel_enabled:
262-
if host_count == 1:
263-
smdataparallel_flag = "SMDATAPARALLEL_USE_HOMOGENEOUS=1"
264-
mpirun_command.extend(["-x", smdataparallel_flag])
265-
else:
266-
smdataparallel_flag = "SMDATAPARALLEL_USE_SINGLENODE=1"
267-
smdataparallel_server_port = 7592
268-
smdataparallel_server_addr = "algo-1"
269-
270-
mpirun_command.extend(["-x", smdataparallel_flag])
271-
mpirun_command.extend(
272-
[
273-
"-x",
274-
f"SMDATAPARALLEL_SERVER_ADDR={smdataparallel_server_addr}",
275-
"-x",
276-
f"SMDATAPARALLEL_SERVER_PORT={smdataparallel_server_port}",
277-
"-x",
278-
f"SAGEMAKER_INSTANCE_TYPE={instance_type}",
279-
]
280-
)
281-
282-
if validate_smddprun():
283-
mpirun_command.extend(["smddprun"])
284-
285-
if smmodelparallel_enabled:
286-
mp_parameters = json.loads(os.environ.get("SM_HP_MP_PARAMETERS", "{}"))
287-
ddp_dist_backend = mp_parameters.get("ddp_dist_backend", "auto")
288-
if ddp_dist_backend == "auto":
289-
if validate_smddpmprun():
290-
mpirun_command.extend(["smddpmprun", "-i", instance_type, "--allow-bypass"])
291-
292258
mpirun_command.extend([get_python_executable()])
293259
mpirun_command.extend(["-m", "mpi4py", entry_script_path])
294260
return mpirun_command

‎src/sagemaker/modules/train/container_drivers/pytorch_driver.py renamed to ‎src/sagemaker/modules/train/container_drivers/torchrun_driver.py

Lines changed: 8 additions & 6 deletions
@@ -10,17 +10,18 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13-
"""This module is the entry point for the PyTorch driver script."""
13+
"""This module is the entry point for the Torchrun driver script."""
1414
from __future__ import absolute_import
1515

1616
import os
17+
import sys
1718

1819
from typing import List, Tuple
1920

2021
from utils import (
2122
logger,
22-
read_source_code_config_json,
23-
read_distribution_json,
23+
read_source_code_json,
24+
read_distributed_runner_json,
2425
get_process_count,
2526
get_python_executable,
2627
SM_EFA_NCCL_INSTANCES,
@@ -62,8 +63,8 @@ def setup_env():
6263

6364
def create_commands():
6465
"""Create the Torch Distributed command to execute"""
65-
source_code_config = read_source_code_config_json()
66-
distribution = read_distribution_json()
66+
source_code = read_source_code_json()
67+
distribution = read_distributed_runner_json()
6768

6869
process_count = get_process_count(distribution)
6970
host_count = int(os.environ["SM_HOST_COUNT"])
@@ -90,7 +91,7 @@ def create_commands():
9091
]
9192
)
9293

93-
torch_cmd.extend([os.path.join(USER_CODE_PATH, source_code_config["entry_script"])])
94+
torch_cmd.extend([os.path.join(USER_CODE_PATH, source_code["entry_script"])])
9495
return torch_cmd
9596

9697

@@ -113,6 +114,7 @@ def main():
113114
exit_code, traceback = execute_commands(torch_cmd)
114115
if exit_code != 0:
115116
write_failure_file(traceback)
117+
sys.exit(exit_code)
116118

117119

118120
if __name__ == "__main__":

‎src/sagemaker/modules/train/container_drivers/utils.py

Lines changed: 20 additions & 13 deletions
@@ -37,8 +37,9 @@
3737
"""
3838

3939
USER_CODE_PATH = "/opt/ml/input/data/sm_code"
40-
SOURCE_CODE_CONFIG_JSON = "/opt/ml/input/data/sm_drivers/sourcecodeconfig.json"
41-
DISTRIBUTION_JSON = "/opt/ml/input/data/sm_drivers/distribution.json"
40+
SOURCE_CODE_JSON = "/opt/ml/input/data/sm_drivers/sourcecode.json"
41+
DISTRIBUTED_RUNNER_JSON = "/opt/ml/input/data/sm_drivers/distributed_runner.json"
42+
4243

4344
SM_EFA_NCCL_INSTANCES = [
4445
"ml.g4dn.8xlarge",
@@ -65,24 +66,30 @@ def write_failure_file(message: str = DEFAULT_FAILURE_MESSAGE):
6566
f.write(message)
6667

6768

68-
def read_source_code_config_json(source_code_config_file: Dict[str, Any] = SOURCE_CODE_CONFIG_JSON):
69+
def read_source_code_json(source_code_json: Dict[str, Any] = SOURCE_CODE_JSON):
6970
"""Read the source code config json file."""
70-
with open(source_code_config_file, "r") as f:
71-
source_code_config_json = json.load(f)
72-
return source_code_config_json
71+
try:
72+
with open(source_code_json, "r") as f:
73+
source_code_dict = json.load(f) or {}
74+
except FileNotFoundError:
75+
source_code_dict = {}
76+
return source_code_dict
7377

7478

75-
def read_distribution_json(distribution_file: Dict[str, Any] = DISTRIBUTION_JSON):
76-
"""Read the distribution json file."""
77-
with open(distribution_file, "r") as f:
78-
distribution_json = json.load(f)
79-
return distribution_json
79+
def read_distributed_runner_json(distributed_json: Dict[str, Any] = DISTRIBUTED_RUNNER_JSON):
80+
"""Read the distribution config json file."""
81+
try:
82+
with open(distributed_json, "r") as f:
83+
distributed_runner_dict = json.load(f) or {}
84+
except FileNotFoundError:
85+
distributed_runner_dict = {}
86+
return distributed_runner_dict
8087

8188

82-
def get_process_count(distribution: Dict[str, Any]) -> int:
89+
def get_process_count(distributed_runner_dict: Dict[str, Any]) -> int:
8390
"""Get the number of processes to run on each node in the training job."""
8491
return (
85-
int(distribution.get("process_count_per_node", 0))
92+
int(distributed_runner_dict.get("process_count_per_node", 0))
8693
or int(os.environ.get("SM_NUM_GPUS", 0))
8794
or int(os.environ.get("SM_NUM_NEURONS", 0))
8895
or 1
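A self-contained illustration of the process-count precedence implemented above (the function body is copied from this diff; the environment values exist only for the example):

    import os

    def get_process_count(distributed_runner_dict):
        """Explicit process_count_per_node wins, then SM_NUM_GPUS, then SM_NUM_NEURONS, then 1."""
        return (
            int(distributed_runner_dict.get("process_count_per_node", 0))
            or int(os.environ.get("SM_NUM_GPUS", 0))
            or int(os.environ.get("SM_NUM_NEURONS", 0))
            or 1
        )

    os.environ["SM_NUM_GPUS"] = "4"
    assert get_process_count({}) == 4                             # falls back to GPU count
    assert get_process_count({"process_count_per_node": 2}) == 2  # explicit setting wins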

‎src/sagemaker/modules/train/model_trainer.py

Lines changed: 99 additions & 130 deletions
Large diffs are not rendered by default.
Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
1+
# flake8: noqa
2+
import argparse
3+
import numpy as np
4+
import os
5+
import sys
6+
import logging
7+
import json
8+
import shutil
9+
import torch
10+
import torch.nn as nn
11+
from torch.utils.data import DataLoader, TensorDataset
12+
from pytorch_model_def import get_model
13+
14+
logger = logging.getLogger(__name__)
15+
logger.setLevel(logging.DEBUG)
16+
logger.addHandler(logging.StreamHandler(sys.stdout))
17+
current_dir = os.path.dirname(os.path.abspath(__file__))
18+
19+
20+
def get_train_data(train_dir):
21+
"""
22+
Get the training data and convert to tensors
23+
"""
24+
25+
x_train = np.load(os.path.join(train_dir, "x_train.npy"))
26+
y_train = np.load(os.path.join(train_dir, "y_train.npy"))
27+
logger.info(f"x train: {x_train.shape}, y train: {y_train.shape}")
28+
29+
return torch.from_numpy(x_train), torch.from_numpy(y_train)
30+
31+
32+
def get_test_data(test_dir):
33+
"""
34+
Get the testing data and convert to tensors
35+
"""
36+
37+
x_test = np.load(os.path.join(test_dir, "x_test.npy"))
38+
y_test = np.load(os.path.join(test_dir, "y_test.npy"))
39+
logger.info(f"x test: {x_test.shape}, y test: {y_test.shape}")
40+
41+
return torch.from_numpy(x_test), torch.from_numpy(y_test)
42+
43+
44+
def model_fn(model_dir):
45+
"""
46+
Load the model for inference
47+
"""
48+
49+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
50+
model = get_model()
51+
model.load_state_dict(torch.load(model_dir + "/model.pth"))
52+
model.eval()
53+
return model.to(device)
54+
55+
56+
def input_fn(request_body, request_content_type):
57+
"""
58+
Deserialize and prepare the prediction input
59+
"""
60+
61+
if request_content_type == "application/json":
62+
request = json.loads(request_body)
63+
train_inputs = torch.tensor(request)
64+
return train_inputs
65+
66+
67+
def predict_fn(input_data, model):
68+
"""
69+
Apply model to the incoming request
70+
"""
71+
72+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
73+
model.to(device)
74+
model.eval()
75+
with torch.no_grad():
76+
return model(input_data.float()).numpy()[0]
77+
78+
79+
def train():
80+
"""
81+
Train the PyTorch model
82+
"""
83+
# Directories: train, test and model
84+
train_dir = os.path.join(current_dir, "data/train")
85+
test_dir = os.path.join(current_dir, "data/test")
86+
model_dir = os.environ.get("SM_MODEL_DIR", os.path.join(current_dir, "data/model"))
87+
88+
# Load the training and testing data
89+
x_train, y_train = get_train_data(train_dir)
90+
x_test, y_test = get_test_data(test_dir)
91+
train_ds = TensorDataset(x_train, y_train)
92+
93+
# Training parameters - used to configure the training loop
94+
batch_size = 64
95+
epochs = 1
96+
learning_rate = 0.1
97+
logger.info(
98+
"batch_size = {}, epochs = {}, learning rate = {}".format(batch_size, epochs, learning_rate)
99+
)
100+
101+
train_dl = DataLoader(train_ds, batch_size, shuffle=True)
102+
103+
# Define the model, loss function and optimizer
104+
model = get_model()
105+
model = model.to(device)
106+
criterion = nn.MSELoss()
107+
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
108+
109+
# Train the model
110+
for epoch in range(epochs):
111+
for x_train_batch, y_train_batch in train_dl:
112+
y = model(x_train_batch.float())
113+
loss = criterion(y.flatten(), y_train_batch.float())
114+
optimizer.zero_grad()
115+
loss.backward()
116+
optimizer.step()
117+
epoch += 1
118+
logger.info(f"epoch: {epoch} -> loss: {loss}")
119+
120+
# Test the model
121+
with torch.no_grad():
122+
y = model(x_test.float()).flatten()
123+
mse = ((y - y_test) ** 2).sum() / y_test.shape[0]
124+
print("\nTest MSE:", mse.numpy())
125+
126+
# Save the model
127+
os.makedirs(model_dir, exist_ok=True)
128+
torch.save(model.state_dict(), model_dir + "/model.pth")
129+
inference_code_path = model_dir + "/code/"
130+
131+
if not os.path.exists(inference_code_path):
132+
os.mkdir(inference_code_path)
133+
logger.info("Created a folder at {}!".format(inference_code_path))
134+
135+
code_dir = os.environ.get("SM_CHANNEL_CODE", current_dir)
136+
shutil.copy(os.path.join(code_dir, "custom_script.py"), inference_code_path)
137+
shutil.copy(os.path.join(code_dir, "pytorch_model_def.py"), inference_code_path)
138+
logger.info("Saving models files to {}".format(inference_code_path))
139+
140+
141+
if __name__ == "__main__":
142+
print("Running the training job ...\n")
143+
144+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
145+
146+
train()
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
1+
# flake8: noqa
2+
import torch
3+
import torch.nn as nn
4+
5+
6+
class NeuralNet(nn.Module):
7+
def __init__(self):
8+
super().__init__()
9+
self.fc1 = nn.Linear(8, 8)
10+
self.fc2 = nn.Linear(8, 6)
11+
self.fc3 = nn.Linear(6, 1)
12+
13+
def forward(self, x):
14+
x = torch.tanh(self.fc1(x))
15+
x = torch.sigmoid(self.fc2(x))
16+
x = self.fc3(x)
17+
return x
18+
19+
20+
def get_model():
21+
22+
model = NeuralNet()
23+
return model
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
1+
numpy
2+
-f https://download.pytorch.org/whl/torch_stable.html
3+
torch==2.0.1+cpu

‎tests/unit/sagemaker/modules/train/container_drivers/test_mpi_driver.py

Lines changed: 25 additions & 16 deletions
@@ -30,25 +30,28 @@
3030
"algo-1,algo-2",
3131
"-np",
3232
"2",
33+
"--verbose",
34+
"-x",
35+
"ENV_VAR1",
3336
"python",
3437
"-m",
3538
"mpi4py",
3639
"-m",
3740
"script.py",
3841
]
3942

40-
DUMMY_SOURCE_CODE_CONFIG = {
43+
DUMMY_SOURCE_CODE = {
44+
"source_code": "source_code",
4145
"entry_script": "script.py",
42-
"distribution": {
43-
"process_count_per_node": 2,
44-
"sm_distributed_settings": {
45-
"enable_dataparallel": True,
46-
},
47-
"mpi_additional_options": [
48-
"-x",
49-
"AWS_REGION",
50-
],
51-
},
46+
}
47+
DUMMY_DISTRIBUTED_RUNNER = {
48+
"_type": "mpi",
49+
"process_count_per_node": 2,
50+
"mpi_additional_options": [
51+
"--verbose",
52+
"-x",
53+
"ENV_VAR1",
54+
],
5255
}
5356

5457

@@ -61,7 +64,8 @@
6164
"SM_HOST_COUNT": "2",
6265
},
6366
)
64-
@patch("sagemaker.modules.train.container_drivers.mpi_driver.read_source_code_config_json")
67+
@patch("sagemaker.modules.train.container_drivers.mpi_driver.read_distributed_runner_json")
68+
@patch("sagemaker.modules.train.container_drivers.mpi_driver.read_source_code_json")
6569
@patch("sagemaker.modules.train.container_drivers.mpi_driver.write_env_vars_to_file")
6670
@patch("sagemaker.modules.train.container_drivers.mpi_driver.start_sshd_daemon")
6771
@patch("sagemaker.modules.train.container_drivers.mpi_driver.bootstrap_master_node")
@@ -75,9 +79,11 @@ def test_mpi_driver_worker(
7579
mock_bootstrap_master_node,
7680
mock_start_sshd_daemon,
7781
mock_write_env_vars_to_file,
78-
mock_read_source_code_config_json,
82+
mock_read_source_code_json,
83+
mock_read_distributed_runner_json,
7984
):
80-
mock_read_source_code_config_json.return_value = DUMMY_SOURCE_CODE_CONFIG
85+
mock_read_source_code_json.return_value = DUMMY_SOURCE_CODE
86+
mock_read_distributed_runner_json.return_value = DUMMY_DISTRIBUTED_RUNNER
8187

8288
mpi_driver.main()
8389

@@ -99,7 +105,8 @@ def test_mpi_driver_worker(
99105
"SM_HOST_COUNT": "2",
100106
},
101107
)
102-
@patch("sagemaker.modules.train.container_drivers.mpi_driver.read_source_code_config_json")
108+
@patch("sagemaker.modules.train.container_drivers.mpi_driver.read_distributed_runner_json")
109+
@patch("sagemaker.modules.train.container_drivers.mpi_driver.read_source_code_json")
103110
@patch("sagemaker.modules.train.container_drivers.mpi_driver.write_env_vars_to_file")
104111
@patch("sagemaker.modules.train.container_drivers.mpi_driver.start_sshd_daemon")
105112
@patch("sagemaker.modules.train.container_drivers.mpi_driver.bootstrap_master_node")
@@ -118,8 +125,10 @@ def test_mpi_driver_master(
118125
mock_start_sshd_daemon,
119126
mock_write_env_vars_to_file,
120127
mock_read_source_code_config_json,
128+
mock_read_distributed_runner_json,
121129
):
122-
mock_read_source_code_config_json.return_value = DUMMY_SOURCE_CODE_CONFIG
130+
mock_read_source_code_config_json.return_value = DUMMY_SOURCE_CODE
131+
mock_read_distributed_runner_json.return_value = DUMMY_DISTRIBUTED_RUNNER
123132
mock_get_mpirun_command.return_value = DUMMY_MPI_COMMAND
124133
mock_get_process_count.return_value = 2
125134
mock_execute_commands.return_value = (0, "")

‎tests/unit/sagemaker/modules/train/container_drivers/test_pytorch_driver.py renamed to ‎tests/unit/sagemaker/modules/train/container_drivers/test_torchrun_driver.py

Lines changed: 42 additions & 35 deletions
@@ -10,7 +10,7 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13-
"""Pytorch Driver Unit Tests."""
13+
"""Torchrun Driver Unit Tests."""
1414
from __future__ import absolute_import
1515

1616
import os
@@ -20,45 +20,38 @@
2020

2121
sys.modules["utils"] = MagicMock()
2222

23-
from sagemaker.modules.train.container_drivers import pytorch_driver # noqa: E402
23+
from sagemaker.modules.train.container_drivers import torchrun_driver # noqa: E402
2424

25-
DUMMY_SOURCE_CODE_CONFIG = {
25+
DUMMY_SOURCE_CODE = {
26+
"source_code": "source_code",
2627
"entry_script": "script.py",
27-
"distribution": {
28-
"process_count_per_node": 2,
29-
"sm_distributed_settings": {
30-
"enable_dataparallel": True,
31-
},
32-
"mpi_additional_options": [
33-
"-x",
34-
"AWS_REGION",
35-
],
36-
},
3728
}
3829

30+
DUMMY_DISTRIBUTED_RUNNER = {"_type": "torchrun", "process_count_per_node": 2}
31+
3932

4033
@patch(
41-
"sagemaker.modules.train.container_drivers.pytorch_driver.get_python_executable",
34+
"sagemaker.modules.train.container_drivers.torchrun_driver.get_python_executable",
4235
return_value="python3",
4336
)
4437
@patch(
45-
"sagemaker.modules.train.container_drivers.pytorch_driver.pytorch_version", return_value=(2, 0)
38+
"sagemaker.modules.train.container_drivers.torchrun_driver.pytorch_version", return_value=(2, 0)
4639
)
4740
def test_get_base_pytorch_command_torchrun(mock_pytorch_version, mock_get_python_executable):
48-
assert pytorch_driver.get_base_pytorch_command() == ["torchrun"]
41+
assert torchrun_driver.get_base_pytorch_command() == ["torchrun"]
4942

5043

5144
@patch(
52-
"sagemaker.modules.train.container_drivers.pytorch_driver.get_python_executable",
45+
"sagemaker.modules.train.container_drivers.torchrun_driver.get_python_executable",
5346
return_value="python3",
5447
)
5548
@patch(
56-
"sagemaker.modules.train.container_drivers.pytorch_driver.pytorch_version", return_value=(1, 8)
49+
"sagemaker.modules.train.container_drivers.torchrun_driver.pytorch_version", return_value=(1, 8)
5750
)
5851
def test_get_base_pytorch_command_torch_distributed_launch(
5952
mock_pytorch_version, mock_get_python_executable
6053
):
61-
assert pytorch_driver.get_base_pytorch_command() == (
54+
assert torchrun_driver.get_base_pytorch_command() == (
6255
["python3", "-m", "torch.distributed.launch"]
6356
)
6457

@@ -72,23 +65,30 @@ def test_get_base_pytorch_command_torch_distributed_launch(
7265
},
7366
)
7467
@patch(
75-
"sagemaker.modules.train.container_drivers.pytorch_driver.USER_CODE_PATH",
68+
"sagemaker.modules.train.container_drivers.torchrun_driver.USER_CODE_PATH",
7669
"/opt/ml/input/data/code",
7770
)
78-
@patch("sagemaker.modules.train.container_drivers.pytorch_driver.get_process_count", return_value=2)
7971
@patch(
80-
"sagemaker.modules.train.container_drivers.pytorch_driver.pytorch_version", return_value=(2, 0)
72+
"sagemaker.modules.train.container_drivers.torchrun_driver.get_process_count", return_value=2
8173
)
8274
@patch(
83-
"sagemaker.modules.train.container_drivers.pytorch_driver.get_base_pytorch_command",
75+
"sagemaker.modules.train.container_drivers.torchrun_driver.pytorch_version", return_value=(2, 0)
76+
)
77+
@patch(
78+
"sagemaker.modules.train.container_drivers.torchrun_driver.get_base_pytorch_command",
8479
return_value=["torchrun"],
8580
)
8681
@patch(
87-
"sagemaker.modules.train.container_drivers.pytorch_driver.read_source_code_config_json",
88-
return_value=DUMMY_SOURCE_CODE_CONFIG,
82+
"sagemaker.modules.train.container_drivers.torchrun_driver.read_source_code_json",
83+
return_value=DUMMY_SOURCE_CODE,
84+
)
85+
@patch(
86+
"sagemaker.modules.train.container_drivers.torchrun_driver.read_distributed_runner_json",
87+
return_value=DUMMY_DISTRIBUTED_RUNNER,
8988
)
9089
def test_create_commands_single_node(
91-
mock_read_source_code_config_json,
90+
mock_read_distributed_runner_json,
91+
mock_read_source_code_json,
9292
mock_get_base_pytorch_command,
9393
mock_pytorch_version,
9494
mock_get_process_count,
@@ -100,7 +100,7 @@ def test_create_commands_single_node(
100100
"/opt/ml/input/data/code/script.py",
101101
]
102102

103-
command = pytorch_driver.create_commands()
103+
command = torchrun_driver.create_commands()
104104
assert command == expected_command
105105

106106

@@ -116,23 +116,30 @@ def test_create_commands_single_node(
116116
},
117117
)
118118
@patch(
119-
"sagemaker.modules.train.container_drivers.pytorch_driver.USER_CODE_PATH",
119+
"sagemaker.modules.train.container_drivers.torchrun_driver.USER_CODE_PATH",
120120
"/opt/ml/input/data/code",
121121
)
122-
@patch("sagemaker.modules.train.container_drivers.pytorch_driver.get_process_count", return_value=2)
123122
@patch(
124-
"sagemaker.modules.train.container_drivers.pytorch_driver.pytorch_version", return_value=(2, 0)
123+
"sagemaker.modules.train.container_drivers.torchrun_driver.get_process_count", return_value=2
124+
)
125+
@patch(
126+
"sagemaker.modules.train.container_drivers.torchrun_driver.pytorch_version", return_value=(2, 0)
125127
)
126128
@patch(
127-
"sagemaker.modules.train.container_drivers.pytorch_driver.get_base_pytorch_command",
129+
"sagemaker.modules.train.container_drivers.torchrun_driver.get_base_pytorch_command",
128130
return_value=["torchrun"],
129131
)
130132
@patch(
131-
"sagemaker.modules.train.container_drivers.pytorch_driver.read_source_code_config_json",
132-
return_value=DUMMY_SOURCE_CODE_CONFIG,
133+
"sagemaker.modules.train.container_drivers.torchrun_driver.read_source_code_json",
134+
return_value=DUMMY_SOURCE_CODE,
135+
)
136+
@patch(
137+
"sagemaker.modules.train.container_drivers.torchrun_driver.read_distributed_runner_json",
138+
return_value=DUMMY_DISTRIBUTED_RUNNER,
133139
)
134140
def test_create_commands_multi_node(
135-
mock_read_source_code_config_json,
141+
mock_read_distributed_runner_json,
142+
mock_read_source_code_json,
136143
mock_get_base_pytorch_command,
137144
mock_pytorch_version,
138145
mock_get_process_count,
@@ -147,5 +154,5 @@ def test_create_commands_multi_node(
147154
"/opt/ml/input/data/code/script.py",
148155
]
149156

150-
command = pytorch_driver.create_commands()
157+
command = torchrun_driver.create_commands()
151158
assert command == expected_command

‎tests/unit/sagemaker/modules/train/test_model_trainer.py

Lines changed: 119 additions & 23 deletions
@@ -13,18 +13,26 @@
1313
"""ModelTrainer Tests."""
1414
from __future__ import absolute_import
1515

16+
import json
17+
import os
1618
import pytest
1719
from unittest.mock import patch, MagicMock
1820

1921
from sagemaker.session import Session
2022
from sagemaker.modules.train.model_trainer import ModelTrainer
21-
from sagemaker.modules.constants import DEFAULT_INSTANCE_TYPE
23+
from sagemaker.modules.constants import (
24+
DEFAULT_INSTANCE_TYPE,
25+
SM_DRIVERS_LOCAL_PATH,
26+
DISTRIBUTED_RUNNER_JSON,
27+
SOURCE_CODE_JSON,
28+
TRAIN_SCRIPT,
29+
)
2230
from sagemaker.modules.configs import (
23-
ComputeConfig,
31+
Compute,
2432
StoppingCondition,
2533
RetryStrategy,
2634
OutputDataConfig,
27-
SourceCodeConfig,
35+
SourceCode,
2836
S3DataSource,
2937
FileSystemDataSource,
3038
MetricDefinition,
@@ -39,13 +47,15 @@
3947
SessionChainingConfig,
4048
InputData,
4149
)
50+
from sagemaker.modules.distributed import Torchrun, TorchrunSMP, MPI
51+
from sagemaker.modules.templates import EXEUCTE_TORCHRUN_DRIVER, EXECUTE_MPI_DRIVER
4252
from tests.unit import DATA_DIR
4353

4454
DEFAULT_BASE_NAME = "dummy-image-job"
4555
DEFAULT_IMAGE = "000000000000.dkr.ecr.us-west-2.amazonaws.com/dummy-image:latest"
4656
DEFAULT_BUCKET = "sagemaker-us-west-2-000000000000"
4757
DEFAULT_ROLE = "arn:aws:iam::000000000000:role/test-role"
48-
DEFAULT_COMPUTE_CONFIG = ComputeConfig(instance_type=DEFAULT_INSTANCE_TYPE, instance_count=1)
58+
DEFAULT_COMPUTE_CONFIG = Compute(instance_type=DEFAULT_INSTANCE_TYPE, instance_count=1)
4959
DEFAULT_OUTPUT_DATA_CONFIG = OutputDataConfig(
5060
s3_output_path=f"s3://{DEFAULT_BUCKET}/{DEFAULT_BASE_NAME}",
5161
compression_type="GZIP",
@@ -56,11 +66,11 @@
5666
max_pending_time_in_seconds=None,
5767
max_wait_time_in_seconds=None,
5868
)
59-
DEFAULT_SOURCE_CODE_CONFIG = SourceCodeConfig(
60-
source_dir="test-data",
61-
entry_point="train.py",
69+
DEFAULT_SOURCE_CODE = SourceCode(
70+
source_dir=f"{DATA_DIR}/modules/script_mode",
71+
entry_script="custom_script.py",
6272
)
63-
UNSUPPORTED_SOURCE_CODE_CONFIG = SourceCodeConfig(
73+
UNSUPPORTED_SOURCE_CODE = SourceCode(
6474
entry_script="train.py",
6575
)
6676

@@ -80,7 +90,7 @@ def model_trainer():
8090
trainer = ModelTrainer(
8191
training_image=DEFAULT_IMAGE,
8292
role=DEFAULT_ROLE,
83-
compute_config=DEFAULT_COMPUTE_CONFIG,
93+
compute=DEFAULT_COMPUTE_CONFIG,
8494
stopping_condition=DEFAULT_STOPPING_CONDITION,
8595
output_data_config=DEFAULT_OUTPUT_DATA_CONFIG,
8696
)
@@ -110,14 +120,14 @@ def model_trainer():
110120
{
111121
"init_params": {
112122
"training_image": DEFAULT_IMAGE,
113-
"source_code_config": UNSUPPORTED_SOURCE_CODE_CONFIG,
123+
"source_code": UNSUPPORTED_SOURCE_CODE,
114124
},
115125
"should_throw": True,
116126
},
117127
{
118128
"init_params": {
119129
"training_image": DEFAULT_IMAGE,
120-
"source_code_config": DEFAULT_SOURCE_CODE_CONFIG,
130+
"source_code": DEFAULT_SOURCE_CODE,
121131
},
122132
"should_throw": False,
123133
},
@@ -126,8 +136,8 @@ def model_trainer():
126136
"no_params",
127137
"training_image_and_algorithm_name",
128138
"only_training_image",
129-
"unsupported_source_code_config",
130-
"supported_source_code_config",
139+
"unsupported_source_code",
140+
"supported_source_code",
131141
],
132142
)
133143
def test_model_trainer_param_validation(test_case, modules_session):
@@ -138,7 +148,7 @@ def test_model_trainer_param_validation(test_case, modules_session):
138148
trainer = ModelTrainer(**test_case["init_params"], session=modules_session)
139149
assert trainer is not None
140150
assert trainer.training_image == DEFAULT_IMAGE
141-
assert trainer.compute_config == DEFAULT_COMPUTE_CONFIG
151+
assert trainer.compute == DEFAULT_COMPUTE_CONFIG
142152
assert trainer.output_data_config == DEFAULT_OUTPUT_DATA_CONFIG
143153
assert trainer.stopping_condition == DEFAULT_STOPPING_CONDITION
144154
assert trainer.base_job_name == DEFAULT_BASE_NAME
@@ -282,9 +292,6 @@ def test_debugger_settings(mock_training_job, modules_session):
282292
rule_evaluator_image=image_uri,
283293
rule_parameters={"parameter": "value"},
284294
)
285-
remote_debug_config = RemoteDebugConfig(
286-
enable_remote_debug=True,
287-
)
288295
profiler_config = ProfilerConfig(s3_output_path="s3://dummy-bucket/dummy-prefix")
289296
profiler_rule_config = ProfilerRuleConfiguration(
290297
rule_configuration_name="rule-name",
@@ -301,15 +308,13 @@ def test_debugger_settings(mock_training_job, modules_session):
301308
).with_debugger_settings(
302309
debug_hook_config=debug_hook_config,
303310
debug_rule_configurations=debug_rule_config,
304-
remote_debug_config=remote_debug_config,
305311
profiler_config=profiler_config,
306312
profiler_rule_configurations=profiler_rule_config,
307313
tensor_board_output_config=tensor_board_output_config,
308314
)
309315

310316
assert model_trainer._debug_hook_config == debug_hook_config
311317
assert model_trainer._debug_rule_configurations == debug_rule_config
312-
assert model_trainer._remote_debug_config == remote_debug_config
313318
assert model_trainer._profiler_config == profiler_config
314319
assert model_trainer._profiler_rule_configurations == profiler_rule_config
315320
assert model_trainer._tensor_board_output_config == tensor_board_output_config
@@ -324,9 +329,6 @@ def test_debugger_settings(mock_training_job, modules_session):
324329
mock_training_job.create.call_args.kwargs["debug_rule_configurations"]
325330
== debug_rule_config
326331
)
327-
assert (
328-
mock_training_job.create.call_args.kwargs["remote_debug_config"] == remote_debug_config
329-
)
330332
assert mock_training_job.create.call_args.kwargs["profiler_config"] == profiler_config
331333
assert (
332334
mock_training_job.create.call_args.kwargs["profiler_rule_configurations"]
@@ -346,7 +348,9 @@ def test_additional_settings(mock_training_job, modules_session):
346348
retry_strategy = RetryStrategy(
347349
maximum_retry_attempts=3,
348350
)
349-
351+
remote_debug_config = RemoteDebugConfig(
352+
enable_remote_debug=True,
353+
)
350354
experiment_config = ExperimentConfig(
351355
experiment_name="experiment-name",
352356
trial_name="trial-name",
@@ -364,6 +368,7 @@ def test_additional_settings(mock_training_job, modules_session):
364368
).with_additional_settings(
365369
retry_strategy=retry_strategy,
366370
experiment_config=experiment_config,
371+
remote_debug_config=remote_debug_config,
367372
infra_check_config=infra_check_config,
368373
session_chaining_config=session_chaining_config,
369374
)
@@ -372,6 +377,7 @@ def test_additional_settings(mock_training_job, modules_session):
372377
assert model_trainer._experiment_config == experiment_config
373378
assert model_trainer._infra_check_config == infra_check_config
374379
assert model_trainer._session_chaining_config == session_chaining_config
380+
assert model_trainer._remote_debug_config == remote_debug_config
375381

376382
with patch("sagemaker.modules.train.model_trainer.Session.upload_data") as mock_upload_data:
377383
mock_upload_data.return_value = "s3://dummy-bucket/dummy-prefix"
@@ -386,3 +392,93 @@ def test_additional_settings(mock_training_job, modules_session):
386392
mock_training_job.create.call_args.kwargs["session_chaining_config"]
387393
== session_chaining_config
388394
)
395+
assert (
396+
mock_training_job.create.call_args.kwargs["remote_debug_config"] == remote_debug_config
397+
)
398+
399+
400+
@pytest.mark.parametrize(
401+
"test_case",
402+
[
403+
{
404+
"source_code": DEFAULT_SOURCE_CODE,
405+
"distributed_runner": Torchrun(),
406+
"expected_template": EXEUCTE_TORCHRUN_DRIVER,
407+
"expected_hyperparameters": {},
408+
},
409+
{
410+
"source_code": DEFAULT_SOURCE_CODE,
411+
"distributed_runner": TorchrunSMP(
412+
hybrid_shard_degree=3,
413+
sm_activation_offloading=True,
414+
allow_empty_shards=True,
415+
tensor_parallel_degree=5,
416+
),
417+
"expected_template": EXEUCTE_TORCHRUN_DRIVER,
418+
"expected_hyperparameters": {
419+
"mp_parameters": json.dumps(
420+
{
421+
"hybrid_shard_degree": 3,
422+
"sm_activation_offloading": True,
423+
"allow_empty_shards": True,
424+
"tensor_parallel_degree": 5,
425+
}
426+
),
427+
},
428+
},
429+
{
430+
"source_code": DEFAULT_SOURCE_CODE,
431+
"distributed_runner": MPI(
432+
custom_mpi_options=["-x", "VAR1", "-x", "VAR2"],
433+
),
434+
"expected_template": EXECUTE_MPI_DRIVER,
435+
"expected_hyperparameters": {},
436+
},
437+
],
438+
ids=[
439+
"torchrun",
440+
"torchrun_smp",
441+
"mpi",
442+
],
443+
)
444+
@patch("sagemaker.modules.train.model_trainer.TrainingJob")
445+
def test_train_with_distributed_runner(mock_training_job, test_case, modules_session):
446+
modules_session.upload_data.return_value = (
447+
f"s3://{DEFAULT_BUCKET}/{DEFAULT_BASE_NAME}-job/input/test"
448+
)
449+
450+
expected_train_script_path = f"{SM_DRIVERS_LOCAL_PATH}/{TRAIN_SCRIPT}"
451+
expected_runner_json_path = f"{SM_DRIVERS_LOCAL_PATH}/{DISTRIBUTED_RUNNER_JSON}"
452+
expected_source_code_json_path = f"{SM_DRIVERS_LOCAL_PATH}/{SOURCE_CODE_JSON}"
453+
454+
model_trainer = ModelTrainer(
455+
session=modules_session,
456+
training_image=DEFAULT_IMAGE,
457+
source_code=test_case["source_code"],
458+
distributed_runner=test_case["distributed_runner"],
459+
)
460+
461+
model_trainer.train()
462+
mock_training_job.create.assert_called_once()
463+
assert mock_training_job.create.call_args.kwargs["hyper_parameters"] == (
464+
test_case["expected_hyperparameters"]
465+
)
466+
467+
assert os.path.exists(expected_train_script_path)
468+
with open(expected_train_script_path, "r") as f:
469+
train_script_content = f.read()
470+
assert test_case["expected_template"] in train_script_content
471+
472+
assert os.path.exists(expected_runner_json_path)
473+
with open(expected_runner_json_path, "r") as f:
474+
runner_json_content = f.read()
475+
assert test_case["distributed_runner"].model_dump(exclude_none=True) == (
476+
json.loads(runner_json_content)
477+
)
478+
479+
assert os.path.exists(expected_source_code_json_path)
480+
with open(expected_source_code_json_path, "r") as f:
481+
source_code_json_content = f.read()
482+
assert test_case["source_code"].model_dump(exclude_none=True) == (
483+
json.loads(source_code_json_content)
484+
)
