Commit 6b90f89

beniericpintaoz-aws authored and committed
Add recipes examples (#1582)
1 parent 70ae24f commit 6b90f89

File tree

3 files changed: +251 -17 lines


src/sagemaker/modules/testing_notebooks/base_model_trainer.ipynb

+140 -15
@@ -79,19 +79,9 @@
 "    command=\"python custom_script.py\",\n",
 ")\n",
 "\n",
-"hyperparameters = {\n",
-"    \"secret_token\": \"123456\",\n",
-"}\n",
-"\n",
-"env_vars = {\n",
-"    \"PASSWORD\": \"123456\"\n",
-"}\n",
-"\n",
 "model_trainer = ModelTrainer(\n",
 "    training_image=pytorch_image,\n",
 "    source_code=source_code,\n",
-"    hyperparameters=hyperparameters,\n",
-"    environment=env_vars,\n",
 ")\n",
 "\n",
 "model_trainer.train(wait=False)"
@@ -386,10 +376,6 @@
 "from sagemaker.modules.configs import (\n",
 "    Compute, SourceCode, InputData\n",
 ")\n",
-"from sagemaker.modules.distributed import (\n",
-"    Torchrun,\n",
-"    MPI\n",
-")\n",
 "\n",
 "compute = Compute(\n",
 "    instance_count=2,\n",
@@ -420,6 +406,12 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"from sagemaker.modules.distributed import (\n",
+"    Torchrun,\n",
+"    MPI,\n",
+"    SMP\n",
+")\n",
+"\n",
 "source_code = SourceCode(\n",
 "    source_dir=\"distributed-training/scripts\",\n",
 "    requirements=\"requirements.txt\",\n",
@@ -429,6 +421,14 @@
 "# Run using Torchrun\n",
 "torchrun = Torchrun()\n",
 "\n",
+"# Run using Torchrun with SMP\n",
+"torchrun_smp = Torchrun(\n",
+"    smp=SMP(\n",
+"        sm_activation_offloading=True,\n",
+"        activation_loading_horizon=2,\n",
+"    )\n",
+")\n",
+"\n",
 "# Run using MPI\n",
 "mpi = MPI(\n",
 "    mpi_additional_options=[\n",
@@ -482,7 +482,7 @@
 "outputs": [],
 "source": [
 "from sagemaker.modules.train import ModelTrainer\n",
-"from sagemaker.modules.configs import Compute, InputData\n",
+"from sagemaker.modules.configs import Compute\n",
 "\n",
 "recipe_overrides = {\n",
 "    \"run\": {\n",
@@ -536,6 +536,131 @@
 "source": [
 "Successful Run - https://tiny.amazon.com/14jxjrndx/IsenLink"
 ]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"### Custom Recipe"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"from sagemaker.modules.train import ModelTrainer\n",
+"from sagemaker.modules.configs import Compute\n",
+"\n",
+"training_image = \"059094755717.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu\"\n",
+"\n",
+"model_trainer = ModelTrainer.from_recipe(\n",
+"    training_recipe=\"recipes/custom-recipe.yaml\",\n",
+"    training_image=training_image,\n",
+"    compute=Compute(instance_type=\"ml.p4d.24xlarge\")\n",
+")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"model_trainer.train()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Successful Run - https://tiny.amazon.com/dimbimx1/IsenLink"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"### Trainium Recipe"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"from sagemaker import session\n",
+"\n",
+"session = session.Session()\n",
+"base_job_name = \"trn-llama\"\n",
+"compiler_cache_bucket = f\"s3://{session.default_bucket()}/{base_job_name}/compiler-cache\"\n",
+"print(f\"Compiler cache: {compiler_cache_bucket}\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"from sagemaker.modules.train import ModelTrainer\n",
+"from sagemaker.modules.configs import Compute, InputData, StoppingCondition\n",
+"\n",
+"recipe_overrides = {\n",
+"    \"data\": {\n",
+"        \"train_dir\": \"/opt/ml/input/data/train\",\n",
+"    },\n",
+"    \"model\": {\n",
+"        \"model_config\": \"/opt/ml/input/data/train/config.json\",\n",
+"    },\n",
+"    \"trainer\": {\n",
+"        \"max_epochs\": 1,\n",
+"    },\n",
+"    \"compiler_cache_url\": compiler_cache_bucket,\n",
+"}\n",
+"env = {\n",
+"    \"FI_EFA_FORK_SAFE\": \"1\"\n",
+"}\n",
+"\n",
+"training_image = \"059094755717.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-neuron\"\n",
+"\n",
+"model_trainer = ModelTrainer.from_recipe(\n",
+"    training_recipe=\"https://raw.githubusercontent.com/aws-neuron/neuronx-distributed-training/refs/heads/main/examples/conf/hf_llama3_8B_config.yaml\",\n",
+"    recipe_overrides=recipe_overrides,\n",
+"    training_image=training_image,\n",
+"    compute=Compute(\n",
+"        instance_type=\"ml.trn1.32xlarge\",\n",
+"        instance_count=2,\n",
+"    ),\n",
+"    stopping_condition=StoppingCondition(\n",
+"        max_runtime_in_seconds=86400\n",
+"    ),\n",
+"    environment=env\n",
+")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"train = InputData(\n",
+"    channel_name=\"train\",\n",
+"    data_source=\"s3://sagemaker-recipes-059094755717-data/data_llama3/\",\n",
+")\n",
+"\n",
+"model_trainer.train(input_data_config=[train], wait=False)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Successful Run - https://tiny.amazon.com/125zldym8/IsenLink"
+]
 }
 ],
 "metadata": {
src/sagemaker/modules/testing_notebooks/recipes/custom-recipe.yaml (new file; the custom recipe referenced by the notebook cell above)

+110 -0
@@ -0,0 +1,110 @@
+run:
+  name: llama-8b
+  results_dir: /opt/ml/model
+  time_limit: 6-00:00:00
+  model_type: hf
+trainer:
+  devices: 8
+  num_nodes: 1
+  accelerator: gpu
+  precision: bf16
+  max_steps: 50
+  log_every_n_steps: 1
+  val_check_interval: 1
+  limit_val_batches: 0
+exp_manager:
+  exp_dir: ''
+  name: experiment
+  create_tensorboard_logger: true
+  create_checkpoint_callback: true
+  checkpoint_callback_params:
+    save_top_k: 0
+    every_n_train_steps: 10
+    monitor: step
+    mode: max
+    save_last: true
+  checkpoint_dir: /opt/ml/checkpoints
+  resume_from_checkpoint: null
+  auto_checkpoint:
+    enabled: false
+  export_full_model:
+    every_n_train_steps: 0
+    save_last: false
+  explicit_log_dir: /opt/ml/output/tensorboard
+use_smp_model: false
+distributed_backend: nccl
+model:
+  model_type: llama_v3
+  train_batch_size: 1
+  seed: 12345
+  grad_clip: 1.0
+  log_reduced_training_loss: true
+  context_parallel_degree: 1
+  moe: false
+  activation_checkpointing: true
+  activation_loading_horizon: 2
+  delayed_param: false
+  offload_activations: false
+  fsdp: true
+  sharding_strategy: hybrid_shard
+  forward_prefetch: true
+  shard_degree: 8
+  backward_fetch_policy: backward_pre
+  auto_wrap_policy: transformer_auto_wrap_policy
+  limit_all_gathers: false
+  use_orig_param: false
+  fp8: false
+  max_context_width: 8192
+  max_position_embeddings: 8192
+  num_hidden_layers: 32
+  hidden_size: 4096
+  num_attention_heads: 32
+  intermediate_size: 14336
+  initializer_range: 0.02
+  layernorm_epsilon: 1.0e-05
+  vocab_size: 128256
+  num_key_value_heads: null
+  use_flash_attention: true
+  rope_theta: 500000.0
+  rope_scaling:
+    rope_type: llama3
+    factor: 8.0
+    high_freq_factor: 4.0
+    low_freq_factor: 1.0
+    original_max_position_embeddings: 8192
+  do_finetune: true
+  hf_model_name_or_path: meta-llama/Llama-3.1-8B
+  hf_access_token: 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
+  peft:
+    peft_type: lora
+    rank: 32
+    alpha: 16
+    dropout: 0.1
+  precision: bf16
+  lr_decay_iters: 50
+  optim:
+    name: adamw
+    lr: 0.0001
+    weight_decay: 0.01
+    betas:
+    - 0.9
+    - 0.95
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 0
+      constant_steps: 0
+      min_lr: 1.0e-06
+data:
+  train_dir: /opt/ml/input/data/train
+  val_dir: /opt/ml/input/data/val
+  dataset_type: hf
+  use_synthetic_data: true
+nsys_profile:
+  enabled: false
+  start_step: 10
+  end_step: 10
+  ranks:
+  - 0
+  gen_shape: false
+viztracer:
+  enabled: false
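The keys in this recipe map one-to-one onto the recipe_overrides dictionaries used in the notebook. As a minimal sketch, assuming the override semantics shown in the Trainium cell above (nested keys in recipe_overrides shadow the same paths in the recipe YAML), overriding fields of this recipe would look like:

from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import Compute

# Hypothetical overrides for illustration; the keys mirror the YAML above.
recipe_overrides = {
    "trainer": {
        "max_steps": 100,  # would replace trainer.max_steps: 50
    },
    "data": {
        "use_synthetic_data": False,  # would replace data.use_synthetic_data: true
    },
}

model_trainer = ModelTrainer.from_recipe(
    training_recipe="recipes/custom-recipe.yaml",
    recipe_overrides=recipe_overrides,
    training_image="059094755717.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu",
    compute=Compute(instance_type="ml.p4d.24xlarge")
)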

src/sagemaker/modules/train/model_trainer.py

+1 -2
@@ -191,7 +191,7 @@ class ModelTrainer(BaseModel):
     tags: Optional[List[Tag]] = None
 
     # Created Artifacts
-    _latest_training_job: Optional[resources.TrainingJob] = None
+    _latest_training_job: Optional[resources.TrainingJob] = PrivateAttr(default=None)
 
     # Metrics settings
     _enable_sage_maker_metrics_time_series: Optional[bool] = PrivateAttr(default=False)
@@ -200,7 +200,6 @@
     # Debugger settings
     _debug_hook_config: Optional[DebugHookConfig] = PrivateAttr(default=None)
     _debug_rule_configurations: Optional[List[DebugRuleConfiguration]] = PrivateAttr(default=None)
-    _remote_debug_config: Optional[RemoteDebugConfig] = PrivateAttr(default=None)
     _profiler_config: Optional[ProfilerConfig] = PrivateAttr(default=None)
     _profiler_rule_configurations: Optional[List[ProfilerRuleConfiguration]] = PrivateAttr(
         default=None
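The `PrivateAttr` change matters because ModelTrainer subclasses pydantic's BaseModel, where underscore-prefixed attributes are treated as private attributes. A minimal sketch of the behavior, assuming pydantic v2 semantics (the `Example` class is hypothetical, for illustration only):

from typing import Optional
from pydantic import BaseModel, PrivateAttr

class Example(BaseModel):
    # Declaring the default through PrivateAttr makes it explicit that this
    # is private state: excluded from validation, serialization, and the
    # model's public fields, but still initialized to the given default.
    _latest_job: Optional[str] = PrivateAttr(default=None)

e = Example()
assert e._latest_job is None                      # default applied by PrivateAttr
assert "_latest_job" not in Example.model_fields  # not a public pydantic field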
