
Commit f28f814

benieric authored and nargokul committed
Add Support for Training Recipes (#1565)
Co-authored-by: Gokul Anantha Narayanan <[email protected]>
1 parent a99ae84 commit f28f814

File tree

13 files changed: +645 -65 lines changed

MANIFEST.in (+1)

@@ -3,6 +3,7 @@ recursive-include src/sagemaker *.py
 include src/sagemaker/image_uri_config/*.json
 include src/sagemaker/serve/schema/*.json
 include src/sagemaker/serve/requirements.txt
+include src/sagemaker/modules/train/sm_recipes/training_recipes.json
 recursive-include requirements *
 
 include VERSION

pyproject.toml (+1)

@@ -39,6 +39,7 @@ dependencies = [
     "importlib-metadata>=1.4.0,<7.0",
     "jsonschema",
     "numpy>=1.9.0,<2.0",
+    "omegaconf>=2.2,<2.3",
     "packaging>=20.0",
     "pandas",
     "pathos",

src/sagemaker/modules/distributed.py (+43, -46)

@@ -15,49 +15,17 @@
 
 from typing import Optional, Dict, Any, List
 from pydantic import BaseModel, PrivateAttr
+from sagemaker.modules.utils import safe_serialize
 
 
-class DistributedRunner(BaseModel):
-    """Base class for DistributedRunner Class"""
-
-    _type: str = PrivateAttr()
-
-    def model_dump(self, *args, **kwargs):
-        """Dump the model to a dictionary."""
-        result = super().model_dump(*args, **kwargs)
-        result["_type"] = self._type
-        return result
-
-
-class Torchrun(DistributedRunner):
-    """TorchDistributed.
-
-    The Torchrun distributed runner uses `torchrun` or `torch.distributed.launch` in the backend to
-    launch distributed training.
+class SMP(BaseModel):
+    """SMP.
 
-    Attributes:
-        process_count_per_node (int):
-            The number of processes to run on each node in the training job.
-            Will default to the number of GPUs available in the container.
-    """
-
-    _type: str = PrivateAttr(default="torchrun")
-
-    process_count_per_node: Optional[int] = None
-
-
-class TorchrunSMP(DistributedRunner):
-    """TorchrunSMP.
-
-    The TorchrunSMP runner uses `torchrun` or `torch.distributed.launch` in the backend
-    to launch distributed training. This strategy is used for a PyTorch job using the SageMaker
-    Model Parallelism library v2. For more information on the model parallelism parameters, see:
+    This class is used for configuring the SageMaker Model Parallelism v2 parameters.
+    For more information on the model parallelism parameters, see:
     https://docs.aws.amazon.com/sagemaker/latest/dg/distributed-model-parallel-v2-reference.html#distributed-model-parallel-v2-reference-init-config
 
     Attributes:
-        process_count_per_node (int):
-            The number of processes to run on each node in the training job.
-            Will default to the number of GPUs available in the container.
         hybrid_shard_degree (Optional[int]):
            Specifies a sharded parallelism degree for the model.
         sm_activation_offloading (Optional[bool]):
@@ -85,9 +53,6 @@ class TorchrunSMP(DistributedRunner):
            parallelism or expert parallelism.
     """
 
-    _type: str = PrivateAttr(default="torchrun")
-
-    process_count_per_node: Optional[int] = None
     hybrid_shard_degree: Optional[int] = None
     sm_activation_offloading: Optional[bool] = None
     activation_loading_horizon: Optional[int] = None
@@ -98,13 +63,45 @@ class TorchrunSMP(DistributedRunner):
     expert_parallel_degree: Optional[int] = None
     random_seed: Optional[int] = None
 
-    def _to_mp_parameters_dict(self) -> Dict[str, Any]:
-        """Convert to a dictionary of MP parameters."""
+    def _to_mp_hyperparameters(self) -> Dict[str, Any]:
+        """Converts to the hyperparameters format for the SageMaker Model Parallelism v2."""
         mp_parameters = self.model_dump(exclude_none=True)
-        mp_parameters.pop("_type")
-        if mp_parameters.get("process_count_per_node") is not None:
-            mp_parameters.pop("process_count_per_node")
-        return mp_parameters
+        hyperparameters = {
+            "mp_parameters": safe_serialize(mp_parameters),
+        }
+        return hyperparameters
+
+
+class DistributedRunner(BaseModel):
+    """Base class for DistributedRunner Class"""
+
+    _type: str = PrivateAttr()
+
+    def model_dump(self, *args, **kwargs):
+        """Dump the model to a dictionary."""
+        result = super().model_dump(*args, **kwargs)
+        result["_type"] = self._type
+        return result
+
+
+class Torchrun(DistributedRunner):
+    """TorchDistributed.
+
+    The Torchrun runner uses `torchrun` or `torch.distributed.launch` in the backend to
+    launch distributed training.
+
+    Attributes:
+        process_count_per_node (int):
+            The number of processes to run on each node in the training job.
+            Will default to the number of GPUs available in the container.
+        smp (Optional[SMP]):
+            The SageMaker Model Parallelism v2 parameters.
+    """
+
+    _type: str = PrivateAttr(default="torchrun")
+
+    process_count_per_node: Optional[int] = None
+    smp: Optional["SMP"] = None
 
 
 class MPI(DistributedRunner):
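
With this refactor, SMP settings are nested under Torchrun via the new smp field instead of the removed TorchrunSMP runner. A hedged usage sketch (field values are illustrative, not taken from this commit):

    from sagemaker.modules.distributed import Torchrun, SMP

    distributed_runner = Torchrun(
        process_count_per_node=8,  # defaults to the number of GPUs in the container if omitted
        smp=SMP(hybrid_shard_degree=4, random_seed=12345),
    )
    # Passed as ModelTrainer(..., distributed_runner=distributed_runner), train() would then
    # serialize the SMP fields into the "mp_parameters" hyperparameter via _to_mp_hyperparameters().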

src/sagemaker/modules/testing_notebooks/base_model_trainer.ipynb (+77, -4)

@@ -117,11 +117,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from sagemaker.modules.train import ModelTrainer\n",
     "from sagemaker.modules.configs import SourceCode\n",
     "\n",
-    "pytorch_image = \"763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py310\"\n",
-    "\n",
     "source_code = SourceCode(\n",
     "    source_dir=\"basic-script-mode\",\n",
     "    requirements=\"requirements.txt\",\n",
@@ -163,7 +160,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install \"datasets[s3]\""
+    "!pip install \"datasets[s3]\" \"requests<2.32.0\""
    ]
   },
   {
@@ -463,6 +460,82 @@
     ")\n",
     "model_trainer.train(input_data_config=[test_data], wait=False)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ModelTrainer Recipes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### SageMaker GPU Recipe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sagemaker.modules.train import ModelTrainer\n",
+    "from sagemaker.modules.configs import Compute, InputData\n",
+    "\n",
+    "recipe_overrides = {\n",
+    "    \"run\": {\n",
+    "        \"results_dir\": \"/opt/ml/model\",\n",
+    "    },\n",
+    "    \"exp_manager\": {\n",
+    "        \"exp_dir\": \"\",\n",
+    "        \"explicit_log_dir\": \"/opt/ml/output/tensorboard\",\n",
+    "        \"checkpoint_dir\": \"/opt/ml/checkpoints\",\n",
+    "        \"export_full_model\": {\n",
+    "            \"save_last\": False\n",
+    "        }\n",
+    "    },\n",
+    "    \"model\": {\n",
+    "        \"data\": {\n",
+    "            \"train_dir\": \"/opt/ml/input/data/train\",\n",
+    "            \"val_dir\": \"/opt/ml/input/data/val\",\n",
+    "            \"use_synthetic_data\": True,\n",
+    "        },\n",
+    "        \"train_batch_size\": 1,\n",
+    "        \"num_hidden_layers\": 4,\n",
+    "        \"fp8\": False,\n",
+    "    },\n",
+    "    \"trainer\": {\n",
+    "        \"num_nodes\": 1\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "training_image = \"059094755717.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu\"\n",
+    "\n",
+    "model_trainer = ModelTrainer.from_recipe(\n",
+    "    training_recipe=\"training/llama/hf_llama3_8b_seq8192_gpu\",\n",
+    "    training_image=training_image,\n",
+    "    recipe_overrides=recipe_overrides,\n",
+    "    compute=Compute(instance_type=\"ml.p4d.24xlarge\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Successful Run - https://tiny.amazon.com/14jxjrndx/IsenLink"
+   ]
   }
  ],
  "metadata": {

src/sagemaker/modules/train/container_drivers/scripts/environment.py (+2, -1)

@@ -152,7 +152,8 @@ def set_env(
     # Hyperparameters
     env_vars["SM_HPS"] = hyperparameters_config
     for key, value in hyperparameters_config.items():
-        env_vars[f"SM_HP_{key.upper()}"] = safe_serialize(value)
+        key_upper = key.replace("-", "_").upper()
+        env_vars[f"SM_HP_{key_upper}"] = safe_serialize(value)
 
     # Host Variables
     current_host = resource_config["current_host"]
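
The added line maps hyperparameter names containing dashes to valid environment variable names. A small standalone illustration with hypothetical hyperparameters (safe_serialize omitted for brevity):

    hyperparameters = {"learning-rate": 0.01, "max_steps": 100}
    for key, value in hyperparameters.items():
        key_upper = key.replace("-", "_").upper()
        print(f"SM_HP_{key_upper}={value}")
    # SM_HP_LEARNING_RATE=0.01
    # SM_HP_MAX_STEPS=100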

src/sagemaker/modules/train/model_trainer.py (+92, -4)

@@ -55,7 +55,7 @@
 
 from sagemaker.modules.distributed import (
     DistributedRunner,
-    TorchrunSMP,
+    Torchrun,
 )
 from sagemaker.modules.utils import (
     _get_repo_name_from_image,
@@ -85,6 +85,7 @@
     EXECUTE_BASIC_SCRIPT_DRIVER,
 )
 from sagemaker.modules import logger
+from sagemaker.modules.train.sm_recipes.utils import get_args_from_recipe, _determine_device_type
 
 
 class ModelTrainer(BaseModel):
@@ -213,6 +214,14 @@ class ModelTrainer(BaseModel):
     _session_chaining_config: Optional[SessionChainingConfig] = PrivateAttr(default=None)
     _remote_debug_config: Optional[RemoteDebugConfig] = PrivateAttr(default=None)
 
+    _temp_recipe_train_dir: Optional[TemporaryDirectory] = PrivateAttr(default=None)
+
+    def __del__(self):
+        """Destructor method to clean up the temporary directory."""
+        # Clean up the temporary directory if it exists
+        if self._temp_recipe_train_dir is not None:
+            self._temp_recipe_train_dir.cleanup()
+
     def _validate_training_image_and_algorithm_name(
         self, training_image: Optional[str], algorithm_name: Optional[str]
     ):
@@ -383,9 +392,9 @@ def train(
            distributed_runner=self.distributed_runner,
        )
 
-        if isinstance(self.distributed_runner, TorchrunSMP):
-            mp_parameters = self.distributed_runner._to_mp_parameters_dict()
-            string_hyper_parameters["mp_parameters"] = safe_serialize(mp_parameters)
+        if isinstance(self.distributed_runner, Torchrun) and self.distributed_runner.smp:
+            mp_parameters = self.distributed_runner.smp._to_mp_hyperparameters()
+            string_hyper_parameters.update(mp_parameters)
 
         self._write_source_code_json(tmp_dir=drivers_dir, source_code=self.source_code)
         self._write_distributed_runner_json(
@@ -455,6 +464,11 @@ def train(
            session_chaining_config=self._session_chaining_config,
        )
         self._latest_training_job = training_job
+
+        # Clean up the temporary directory if it exists
+        if self._temp_recipe_train_dir is not None:
+            self._temp_recipe_train_dir.cleanup()
+
         if wait:
             training_job.wait(logs=logs)
 
@@ -748,3 +762,77 @@ def with_additional_settings(
         self._session_chaining_config = session_chaining_config
         self._remote_debug_config = remote_debug_config
         return self
+
+    @classmethod
+    def from_recipe(
+        cls,
+        training_recipe: str,
+        compute: Compute,
+        recipe_overrides: Optional[Dict[str, Any]] = None,
+        training_image: Optional[str] = None,
+        session: Optional[Session] = None,
+        role: Optional[str] = None,
+        base_job_name: Optional[str] = None,
+        **kwargs,
+    ) -> "ModelTrainer":
+        """Create a ModelTrainer from a training recipe.
+
+        Args:
+            training_recipe (str):
+                The training recipe to use for training the model. This must be the name of
+                a sagemaker training recipe or a path to a local training recipe .yaml file.
+            compute (Compute):
+                The compute configuration. This is used to specify the compute resources for
+                the training job. If not specified, will default to 1 instance of ml.m5.xlarge.
+            recipe_overrides (Optional[Dict[str, Any]]):
+                The recipe overrides. This is used to override the default recipe parameters.
+            training_image (Optional[str]):
+                The training image URI to use for the training job container. If not specified,
+                the training image will be determined from the recipe.
+            session (Optional[Session]):
+                The SageMaker session.
+                If not specified, a new session will be created.
+            role (Optional[str]):
+                The IAM role ARN for the training job.
+                If not specified, the default SageMaker execution role will be used.
+            base_job_name (Optional[str]):
+                The base name for the training job.
+                If not specified, a default name will be generated using the algorithm name
+                or training image.
+            kwargs:
+                Additional keyword arguments to pass to the ModelTrainer constructor.
+
+        """
+        if compute.instance_type is None:
+            raise ValueError(
+                "Must set `instance_type` in compute_config when using training recipes."
+            )
+        device_type = _determine_device_type(compute.instance_type)
+        if device_type == "cpu":
+            raise ValueError(
+                "Training recipes are not supported for CPU instances. "
+                + "Please provide a GPU or Trainium instance type."
+            )
+
+        if session is None:
+            session = Session()
+            logger.warning("Session not provided. Using default Session.")
+        if role is None:
+            role = get_execution_role()
+            logger.warning(f"Role not provided. Using default role:\n{role}")
+
+        model_trainer_args, recipe_train_dir = get_args_from_recipe(
+            training_recipe=training_recipe,
+            recipe_overrides=recipe_overrides,
+            compute=compute,
+            session=session,
+        )
+        if training_image is not None:
+            model_trainer_args["training_image"] = training_image
+
+        model_trainer = cls(
+            session=session, role=role, base_job_name=base_job_name, **model_trainer_args, **kwargs
+        )
+
+        model_trainer._temp_recipe_train_dir = recipe_train_dir
+        return model_trainer
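
Besides the named SageMaker recipe shown in the notebook diff above, the docstring also allows a local .yaml recipe path. A minimal sketch under that assumption (the path, overrides, and instance type below are placeholders, not values from this commit):

    from sagemaker.modules.train import ModelTrainer
    from sagemaker.modules.configs import Compute

    model_trainer = ModelTrainer.from_recipe(
        training_recipe="./my_recipe.yaml",  # hypothetical local recipe file
        compute=Compute(instance_type="ml.p4d.24xlarge"),
        recipe_overrides={"trainer": {"num_nodes": 1}},
    )
    model_trainer.train(wait=False)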

src/sagemaker/modules/train/sm_recipes/__init__.py

Whitespace-only changes.

src/sagemaker/modules/train/sm_recipes/training_recipes.json (+11)

@@ -0,0 +1,11 @@
+{
+    "adapter_repo": "[email protected]:aws/private-sagemaker-training-adapter-for-nemo-staging.git",
+    "launcher_repo": "[email protected]:aws/private-sagemaker-training-launcher-staging.git",
+    "neuron_dist_repo": "https://github.com/aws-neuron/neuronx-distributed-training.git",
+    "gpu_image" : {
+        "framework": "pytorch-smp",
+        "version": "2.3.1",
+        "additional_args": {}
+    },
+    "neuron_image": "855988369404.dkr.ecr.us-west-2.amazonaws.com/chinmayee-dev:neuron_sept26_v1"
+}
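
This config appears to map device types to image definitions and source repositories for recipe-based training. A hedged sketch of how a caller might consume it (not the SDK's actual loader; the selection logic is assumed):

    import json

    with open("src/sagemaker/modules/train/sm_recipes/training_recipes.json") as f:
        recipes_cfg = json.load(f)

    device_type = "gpu"  # illustrative; the other branch would cover Neuron/Trainium
    if device_type == "gpu":
        image_cfg = recipes_cfg["gpu_image"]     # {"framework": "pytorch-smp", "version": "2.3.1", ...}
    else:
        image_cfg = recipes_cfg["neuron_image"]  # full image URI string
    print(image_cfg)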
