
Commit f28f814

benieric authored and nargokul committed
Add Support for Training Recipes (#1565)
Co-authored-by: Gokul Anantha Narayanan <[email protected]>
1 parent a99ae84 commit f28f814

File tree

13 files changed: +645 -65 lines changed

MANIFEST.in (+1)

@@ -3,6 +3,7 @@ recursive-include src/sagemaker *.py
 include src/sagemaker/image_uri_config/*.json
 include src/sagemaker/serve/schema/*.json
 include src/sagemaker/serve/requirements.txt
+include src/sagemaker/modules/train/sm_recipes/training_recipes.json
 recursive-include requirements *
 
 include VERSION

pyproject.toml (+1)

@@ -39,6 +39,7 @@ dependencies = [
     "importlib-metadata>=1.4.0,<7.0",
     "jsonschema",
     "numpy>=1.9.0,<2.0",
+    "omegaconf>=2.2,<2.3",
     "packaging>=20.0",
     "pandas",
     "pathos",

src/sagemaker/modules/distributed.py (+43, -46)

@@ -15,49 +15,17 @@
 
 from typing import Optional, Dict, Any, List
 from pydantic import BaseModel, PrivateAttr
+from sagemaker.modules.utils import safe_serialize
 
 
-class DistributedRunner(BaseModel):
-    """Base class for DistributedRunner Class"""
-
-    _type: str = PrivateAttr()
-
-    def model_dump(self, *args, **kwargs):
-        """Dump the model to a dictionary."""
-        result = super().model_dump(*args, **kwargs)
-        result["_type"] = self._type
-        return result
-
-
-class Torchrun(DistributedRunner):
-    """TorchDistributed.
-
-    The Torchrun distributed runner uses `torchrun` or `torch.distributed.launch` in the backend to
-    launch distributed training.
+class SMP(BaseModel):
+    """SMP.
 
-    Attributes:
-        process_count_per_node (int):
-            The number of processes to run on each node in the training job.
-            Will default to the number of GPUs available in the container.
-    """
-
-    _type: str = PrivateAttr(default="torchrun")
-
-    process_count_per_node: Optional[int] = None
-
-
-class TorchrunSMP(DistributedRunner):
-    """TorchrunSMP.
-
-    The TorchrunSMP runner uses `torchrun` or `torch.distributed.launch` in the backend
-    to launch distributed training. This strategy is used for a PyTorch job using the SageMaker
-    Model Parallelism library v2. For more information on the model parallelism parameters, see:
+    This class is used for configuring the SageMaker Model Parallelism v2 parameters.
+    For more information on the model parallelism parameters, see:
     https://docs.aws.amazon.com/sagemaker/latest/dg/distributed-model-parallel-v2-reference.html#distributed-model-parallel-v2-reference-init-config
 
     Attributes:
-        process_count_per_node (int):
-            The number of processes to run on each node in the training job.
-            Will default to the number of GPUs available in the container.
         hybrid_shard_degree (Optional[int]):
            Specifies a sharded parallelism degree for the model.
         sm_activation_offloading (Optional[bool]):
@@ -85,9 +53,6 @@ class TorchrunSMP(DistributedRunner):
            parallelism or expert parallelism.
     """
 
-    _type: str = PrivateAttr(default="torchrun")
-
-    process_count_per_node: Optional[int] = None
     hybrid_shard_degree: Optional[int] = None
     sm_activation_offloading: Optional[bool] = None
     activation_loading_horizon: Optional[int] = None
@@ -98,13 +63,45 @@ class TorchrunSMP(DistributedRunner):
     expert_parallel_degree: Optional[int] = None
     random_seed: Optional[int] = None
 
-    def _to_mp_parameters_dict(self) -> Dict[str, Any]:
-        """Convert to a dictionary of MP parameters."""
+    def _to_mp_hyperparameters(self) -> Dict[str, Any]:
+        """Converts to the hyperparameters format for the SageMaker Model Parallelism v2."""
         mp_parameters = self.model_dump(exclude_none=True)
-        mp_parameters.pop("_type")
-        if mp_parameters.get("process_count_per_node") is not None:
-            mp_parameters.pop("process_count_per_node")
-        return mp_parameters
+        hyperparameters = {
+            "mp_parameters": safe_serialize(mp_parameters),
+        }
+        return hyperparameters
+
+
+class DistributedRunner(BaseModel):
+    """Base class for DistributedRunner Class"""
+
+    _type: str = PrivateAttr()
+
+    def model_dump(self, *args, **kwargs):
+        """Dump the model to a dictionary."""
+        result = super().model_dump(*args, **kwargs)
+        result["_type"] = self._type
+        return result
+
+
+class Torchrun(DistributedRunner):
+    """TorchDistributed.
+
+    The Torchrun runner uses `torchrun` or `torch.distributed.launch` in the backend to
+    launch distributed training.
+
+    Attributes:
+        process_count_per_node (int):
+            The number of processes to run on each node in the training job.
+            Will default to the number of GPUs available in the container.
+        smp (Optional[SMP]):
+            The SageMaker Model Parallelism v2 parameters.
+    """
+
+    _type: str = PrivateAttr(default="torchrun")
+
+    process_count_per_node: Optional[int] = None
+    smp: Optional["SMP"] = None
 
 
 class MPI(DistributedRunner):
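
With this refactor, SMP settings are nested under Torchrun via the new smp field instead of the removed TorchrunSMP runner. A hedged usage sketch (field values are illustrative, not taken from this commit):

    from sagemaker.modules.distributed import Torchrun, SMP

    distributed_runner = Torchrun(
        process_count_per_node=8,  # defaults to the number of GPUs in the container if omitted
        smp=SMP(hybrid_shard_degree=4, random_seed=12345),
    )
    # Passed as ModelTrainer(..., distributed_runner=distributed_runner), train() would then
    # serialize the SMP fields into the "mp_parameters" hyperparameter via _to_mp_hyperparameters().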

src/sagemaker/modules/testing_notebooks/base_model_trainer.ipynb (+77, -4)

@@ -117,11 +117,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from sagemaker.modules.train import ModelTrainer\n",
     "from sagemaker.modules.configs import SourceCode\n",
     "\n",
-    "pytorch_image = \"763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py310\"\n",
-    "\n",
     "source_code = SourceCode(\n",
     "    source_dir=\"basic-script-mode\",\n",
     "    requirements=\"requirements.txt\",\n",
@@ -163,7 +160,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install \"datasets[s3]\""
+    "!pip install \"datasets[s3]\" \"requests<2.32.0\""
    ]
   },
   {
@@ -463,6 +460,82 @@
     ")\n",
     "model_trainer.train(input_data_config=[test_data], wait=False)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ModelTrainer Recipes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### SageMaker GPU Recipe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sagemaker.modules.train import ModelTrainer\n",
+    "from sagemaker.modules.configs import Compute, InputData\n",
+    "\n",
+    "recipe_overrides = {\n",
+    "    \"run\": {\n",
+    "        \"results_dir\": \"/opt/ml/model\",\n",
+    "    },\n",
+    "    \"exp_manager\": {\n",
+    "        \"exp_dir\": \"\",\n",
+    "        \"explicit_log_dir\": \"/opt/ml/output/tensorboard\",\n",
+    "        \"checkpoint_dir\": \"/opt/ml/checkpoints\",\n",
+    "        \"export_full_model\": {\n",
+    "            \"save_last\": False\n",
+    "        }\n",
+    "    },\n",
+    "    \"model\": {\n",
+    "        \"data\": {\n",
+    "            \"train_dir\": \"/opt/ml/input/data/train\",\n",
+    "            \"val_dir\": \"/opt/ml/input/data/val\",\n",
+    "            \"use_synthetic_data\": True,\n",
+    "        },\n",
+    "        \"train_batch_size\": 1,\n",
+    "        \"num_hidden_layers\": 4,\n",
+    "        \"fp8\": False,\n",
+    "    },\n",
+    "    \"trainer\": {\n",
+    "        \"num_nodes\": 1\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "training_image = \"059094755717.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu\"\n",
+    "\n",
+    "model_trainer = ModelTrainer.from_recipe(\n",
+    "    training_recipe=\"training/llama/hf_llama3_8b_seq8192_gpu\",\n",
+    "    training_image=training_image,\n",
+    "    recipe_overrides=recipe_overrides,\n",
+    "    compute=Compute(instance_type=\"ml.p4d.24xlarge\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Successful Run - https://tiny.amazon.com/14jxjrndx/IsenLink"
+   ]
   }
  ],
  "metadata": {

src/sagemaker/modules/train/container_drivers/scripts/environment.py (+2, -1)

@@ -152,7 +152,8 @@ def set_env(
     # Hyperparameters
     env_vars["SM_HPS"] = hyperparameters_config
     for key, value in hyperparameters_config.items():
-        env_vars[f"SM_HP_{key.upper()}"] = safe_serialize(value)
+        key_upper = key.replace("-", "_").upper()
+        env_vars[f"SM_HP_{key_upper}"] = safe_serialize(value)
 
     # Host Variables
     current_host = resource_config["current_host"]
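
The added line maps hyperparameter names containing dashes to valid environment variable names. A small standalone illustration with hypothetical hyperparameters (safe_serialize omitted for brevity):

    hyperparameters = {"learning-rate": 0.01, "max_steps": 100}
    for key, value in hyperparameters.items():
        key_upper = key.replace("-", "_").upper()
        print(f"SM_HP_{key_upper}={value}")
    # SM_HP_LEARNING_RATE=0.01
    # SM_HP_MAX_STEPS=100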

src/sagemaker/modules/train/model_trainer.py (+92, -4)

@@ -55,7 +55,7 @@
 
 from sagemaker.modules.distributed import (
     DistributedRunner,
-    TorchrunSMP,
+    Torchrun,
 )
 from sagemaker.modules.utils import (
     _get_repo_name_from_image,
@@ -85,6 +85,7 @@
     EXECUTE_BASIC_SCRIPT_DRIVER,
 )
 from sagemaker.modules import logger
+from sagemaker.modules.train.sm_recipes.utils import get_args_from_recipe, _determine_device_type
 
 
 class ModelTrainer(BaseModel):
@@ -213,6 +214,14 @@ class ModelTrainer(BaseModel):
     _session_chaining_config: Optional[SessionChainingConfig] = PrivateAttr(default=None)
     _remote_debug_config: Optional[RemoteDebugConfig] = PrivateAttr(default=None)
 
+    _temp_recipe_train_dir: Optional[TemporaryDirectory] = PrivateAttr(default=None)
+
+    def __del__(self):
+        """Destructor method to clean up the temporary directory."""
+        # Clean up the temporary directory if it exists
+        if self._temp_recipe_train_dir is not None:
+            self._temp_recipe_train_dir.cleanup()
+
     def _validate_training_image_and_algorithm_name(
         self, training_image: Optional[str], algorithm_name: Optional[str]
     ):
@@ -383,9 +392,9 @@ def train(
            distributed_runner=self.distributed_runner,
        )
 
-        if isinstance(self.distributed_runner, TorchrunSMP):
-            mp_parameters = self.distributed_runner._to_mp_parameters_dict()
-            string_hyper_parameters["mp_parameters"] = safe_serialize(mp_parameters)
+        if isinstance(self.distributed_runner, Torchrun) and self.distributed_runner.smp:
+            mp_parameters = self.distributed_runner.smp._to_mp_hyperparameters()
+            string_hyper_parameters.update(mp_parameters)
 
         self._write_source_code_json(tmp_dir=drivers_dir, source_code=self.source_code)
         self._write_distributed_runner_json(
@@ -455,6 +464,11 @@ def train(
            session_chaining_config=self._session_chaining_config,
        )
         self._latest_training_job = training_job
+
+        # Clean up the temporary directory if it exists
+        if self._temp_recipe_train_dir is not None:
+            self._temp_recipe_train_dir.cleanup()
+
         if wait:
             training_job.wait(logs=logs)
 
@@ -748,3 +762,77 @@ def with_additional_settings(
         self._session_chaining_config = session_chaining_config
         self._remote_debug_config = remote_debug_config
         return self
+
+    @classmethod
+    def from_recipe(
+        cls,
+        training_recipe: str,
+        compute: Compute,
+        recipe_overrides: Optional[Dict[str, Any]] = None,
+        training_image: Optional[str] = None,
+        session: Optional[Session] = None,
+        role: Optional[str] = None,
+        base_job_name: Optional[str] = None,
+        **kwargs,
+    ) -> "ModelTrainer":
+        """Create a ModelTrainer from a training recipe.
+
+        Args:
+            training_recipe (str):
+                The training recipe to use for training the model. This must be the name of
+                a sagemaker training recipe or a path to a local training recipe .yaml file.
+            compute (Compute):
+                The compute configuration. This is used to specify the compute resources for
+                the training job. If not specified, will default to 1 instance of ml.m5.xlarge.
+            recipe_overrides (Optional[Dict[str, Any]]):
+                The recipe overrides. This is used to override the default recipe parameters.
+            training_image (Optional[str]):
+                The training image URI to use for the training job container. If not specified,
+                the training image will be determined from the recipe.
+            session (Optional[Session]):
+                The SageMaker session.
+                If not specified, a new session will be created.
+            role (Optional[str]):
+                The IAM role ARN for the training job.
+                If not specified, the default SageMaker execution role will be used.
+            base_job_name (Optional[str]):
+                The base name for the training job.
+                If not specified, a default name will be generated using the algorithm name
+                or training image.
+            kwargs:
+                Additional keyword arguments to pass to the ModelTrainer constructor.
+
+        """
+        if compute.instance_type is None:
+            raise ValueError(
+                "Must set `instance_type` in compute_config when using training recipes."
+            )
+        device_type = _determine_device_type(compute.instance_type)
+        if device_type == "cpu":
+            raise ValueError(
+                "Training recipes are not supported for CPU instances. "
+                + "Please provide a GPU or Trainium instance type."
+            )
+
+        if session is None:
+            session = Session()
+            logger.warning("Session not provided. Using default Session.")
+        if role is None:
+            role = get_execution_role()
+            logger.warning(f"Role not provided. Using default role:\n{role}")
+
+        model_trainer_args, recipe_train_dir = get_args_from_recipe(
+            training_recipe=training_recipe,
+            recipe_overrides=recipe_overrides,
+            compute=compute,
+            session=session,
+        )
+        if training_image is not None:
+            model_trainer_args["training_image"] = training_image
+
+        model_trainer = cls(
+            session=session, role=role, base_job_name=base_job_name, **model_trainer_args, **kwargs
+        )
+
+        model_trainer._temp_recipe_train_dir = recipe_train_dir
+        return model_trainer
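
Besides the named SageMaker recipe shown in the notebook diff above, the docstring also allows a local .yaml recipe path. A minimal sketch under that assumption (the path, overrides, and instance type below are placeholders, not values from this commit):

    from sagemaker.modules.train import ModelTrainer
    from sagemaker.modules.configs import Compute

    model_trainer = ModelTrainer.from_recipe(
        training_recipe="./my_recipe.yaml",  # hypothetical local recipe file
        compute=Compute(instance_type="ml.p4d.24xlarge"),
        recipe_overrides={"trainer": {"num_nodes": 1}},
    )
    model_trainer.train(wait=False)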

src/sagemaker/modules/train/sm_recipes/__init__.py

Whitespace-only changes.

src/sagemaker/modules/train/sm_recipes/training_recipes.json (+11)

@@ -0,0 +1,11 @@
+{
+    "adapter_repo": "[email protected]:aws/private-sagemaker-training-adapter-for-nemo-staging.git",
+    "launcher_repo": "[email protected]:aws/private-sagemaker-training-launcher-staging.git",
+    "neuron_dist_repo": "https://github.com/aws-neuron/neuronx-distributed-training.git",
+    "gpu_image" : {
+        "framework": "pytorch-smp",
+        "version": "2.3.1",
+        "additional_args": {}
+    },
+    "neuron_image": "855988369404.dkr.ecr.us-west-2.amazonaws.com/chinmayee-dev:neuron_sept26_v1"
+}
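
This config appears to map device types to image definitions and source repositories for recipe-based training. A hedged sketch of how a caller might consume it (not the SDK's actual loader; the selection logic is assumed):

    import json

    with open("src/sagemaker/modules/train/sm_recipes/training_recipes.json") as f:
        recipes_cfg = json.load(f)

    device_type = "gpu"  # illustrative; the other branch would cover Neuron/Trainium
    if device_type == "gpu":
        image_cfg = recipes_cfg["gpu_image"]     # {"framework": "pytorch-smp", "version": "2.3.1", ...}
    else:
        image_cfg = recipes_cfg["neuron_image"]  # full image URI string
    print(image_cfg)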
