|
79 | 79 | " command=\"python custom_script.py\",\n",
|
80 | 80 | ")\n",
|
81 | 81 | "\n",
|
82 |
| - "hyperparameters = {\n", |
83 |
| - " \"secret_token\": \"123456\",\n", |
84 |
| - "}\n", |
85 |
| - "\n", |
86 |
| - "env_vars = {\n", |
87 |
| - " \"PASSWORD\": \"123456\"\n", |
88 |
| - "}\n", |
89 |
| - "\n", |
90 | 82 | "model_trainer = ModelTrainer(\n",
|
91 | 83 | " training_image=pytorch_image,\n",
|
92 | 84 | " source_code=source_code,\n",
|
93 |
| - " hyperparameters=hyperparameters,\n", |
94 |
| - " environment=env_vars,\n", |
95 | 85 | ")\n",
|
96 | 86 | "\n",
|
97 | 87 | "model_trainer.train(wait=False)"
|
|
386 | 376 | "from sagemaker.modules.configs import (\n",
|
387 | 377 | " Compute, SourceCode, InputData\n",
|
388 | 378 | ")\n",
|
389 |
| - "from sagemaker.modules.distributed import (\n", |
390 |
| - " Torchrun,\n", |
391 |
| - " MPI\n", |
392 |
| - ")\n", |
393 | 379 | "\n",
|
394 | 380 | "compute = Compute(\n",
|
395 | 381 | " instance_count=2,\n",
|
|
420 | 406 | "metadata": {},
|
421 | 407 | "outputs": [],
|
422 | 408 | "source": [
|
| 409 | + "from sagemaker.modules.distributed import (\n", |
| 410 | + " Torchrun,\n", |
| 411 | + " MPI,\n", |
| 412 | + " SMP\n", |
| 413 | + ")\n", |
| 414 | + "\n", |
423 | 415 | "source_code = SourceCode(\n",
|
424 | 416 | " source_dir=\"distributed-training/scripts\",\n",
|
425 | 417 | " requirements=\"requirements.txt\",\n",
|
|
429 | 421 | "# Run using Torchrun\n",
|
430 | 422 | "torchrun = Torchrun()\n",
|
431 | 423 | "\n",
|
| 424 | + "# Run using Torchrun with SMP\n", |
| 425 | + "torchrun_smp = Torchrun(\n", |
| 426 | + " smp=SMP(\n", |
| 427 | + " sm_activation_offloading=True,\n", |
| 428 | + " activation_loading_horizon=2,\n", |
| 429 | + " )\n", |
| 430 | + ")\n", |
| 431 | + "\n", |
432 | 432 | "# Run using MPI\n",
|
433 | 433 | "mpi = MPI(\n",
|
434 | 434 | " mpi_additional_options=[\n",
|
|
482 | 482 | "outputs": [],
|
483 | 483 | "source": [
|
484 | 484 | "from sagemaker.modules.train import ModelTrainer\n",
|
485 |
| - "from sagemaker.modules.configs import Compute, InputData\n", |
| 485 | + "from sagemaker.modules.configs import Compute\n", |
486 | 486 | "\n",
|
487 | 487 | "recipe_overrides = {\n",
|
488 | 488 | " \"run\": {\n",
|
|
536 | 536 | "source": [
|
537 | 537 | "Successful Run - https://tiny.amazon.com/14jxjrndx/IsenLink"
|
538 | 538 | ]
|
| 539 | + }, |
| 540 | + { |
| 541 | + "cell_type": "markdown", |
| 542 | + "metadata": {}, |
| 543 | + "source": [ |
| 544 | + "### Custom Recipe" |
| 545 | + ] |
| 546 | + }, |
| 547 | + { |
| 548 | + "cell_type": "code", |
| 549 | + "execution_count": null, |
| 550 | + "metadata": {}, |
| 551 | + "outputs": [], |
| 552 | + "source": [ |
| 553 | + "from sagemaker.modules.train import ModelTrainer\n", |
| 554 | + "from sagemaker.modules.configs import Compute\n", |
| 555 | + "\n", |
| 556 | + "training_image = \"059094755717.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-gpu\"\n", |
| 557 | + "\n", |
| 558 | + "model_trainer = ModelTrainer.from_recipe(\n", |
| 559 | + " training_recipe=\"recipes/custom-recipe.yaml\",\n", |
| 560 | + " training_image=training_image,\n", |
| 561 | + " compute=Compute(instance_type=\"ml.p4d.24xlarge\")\n", |
| 562 | + ")" |
| 563 | + ] |
| 564 | + }, |
| 565 | + { |
| 566 | + "cell_type": "code", |
| 567 | + "execution_count": null, |
| 568 | + "metadata": {}, |
| 569 | + "outputs": [], |
| 570 | + "source": [ |
| 571 | + "model_trainer.train()" |
| 572 | + ] |
| 573 | + }, |
| 574 | + { |
| 575 | + "cell_type": "markdown", |
| 576 | + "metadata": {}, |
| 577 | + "source": [ |
| 578 | + "Successful Run - https://tiny.amazon.com/dimbimx1/IsenLink" |
| 579 | + ] |
| 580 | + }, |
| 581 | + { |
| 582 | + "cell_type": "markdown", |
| 583 | + "metadata": {}, |
| 584 | + "source": [ |
| 585 | + "### Trainium Recipe" |
| 586 | + ] |
| 587 | + }, |
| 588 | + { |
| 589 | + "cell_type": "code", |
| 590 | + "execution_count": null, |
| 591 | + "metadata": {}, |
| 592 | + "outputs": [], |
| 593 | + "source": [ |
| 594 | + "from sagemaker import session\n", |
| 595 | + "\n", |
| 596 | + "session = session.Session()\n", |
| 597 | + "base_job_name = \"trn-llama\"\n", |
| 598 | + "compiler_cache_bucket = f\"s3://{session.default_bucket()}/{base_job_name}/compiler-cache\"\n", |
| 599 | + "print(f\"Compiler cache: {compiler_cache_bucket}\")" |
| 600 | + ] |
| 601 | + }, |
| 602 | + { |
| 603 | + "cell_type": "code", |
| 604 | + "execution_count": null, |
| 605 | + "metadata": {}, |
| 606 | + "outputs": [], |
| 607 | + "source": [ |
| 608 | + "from sagemaker.modules.train import ModelTrainer\n", |
| 609 | + "from sagemaker.modules.configs import Compute, InputData, StoppingCondition\n", |
| 610 | + "\n", |
| 611 | + "recipe_overrides = {\n", |
| 612 | + " \"data\": {\n", |
| 613 | + " \"train_dir\": \"/opt/ml/input/data/train\",\n", |
| 614 | + " },\n", |
| 615 | + " \"model\": {\n", |
| 616 | + " \"model_config\": \"/opt/ml/input/data/train/config.json\",\n", |
| 617 | + " },\n", |
| 618 | + " \"trainer\": {\n", |
| 619 | + " \"max_epochs\": 1,\n", |
| 620 | + " },\n", |
| 621 | + " \"compiler_cache_url\": compiler_cache_bucket,\n", |
| 622 | + "}\n", |
| 623 | + "env = {\n", |
| 624 | + " \"FI_EFA_FORK_SAFE\": \"1\"\n", |
| 625 | + "}\n", |
| 626 | + "\n", |
| 627 | + "training_image = \"059094755717.dkr.ecr.us-west-2.amazonaws.com/sagemaker-recipes-neuron\"\n", |
| 628 | + "\n", |
| 629 | + "model_trainer = ModelTrainer.from_recipe(\n", |
| 630 | + " training_recipe=\"https://raw.githubusercontent.com/aws-neuron/neuronx-distributed-training/refs/heads/main/examples/conf/hf_llama3_8B_config.yaml\",\n", |
| 631 | + " recipe_overrides=recipe_overrides,\n", |
| 632 | + " training_image=training_image,\n", |
| 633 | + " compute=Compute(\n", |
| 634 | + " instance_type=\"ml.trn1.32xlarge\",\n", |
| 635 | + " instance_count=2,\n", |
| 636 | + " ),\n", |
| 637 | + " stopping_condition=StoppingCondition(\n", |
| 638 | + " max_runtime_in_seconds=86400\n", |
| 639 | + " ),\n", |
| 640 | + " environment=env\n", |
| 641 | + ")" |
| 642 | + ] |
| 643 | + }, |
| 644 | + { |
| 645 | + "cell_type": "code", |
| 646 | + "execution_count": null, |
| 647 | + "metadata": {}, |
| 648 | + "outputs": [], |
| 649 | + "source": [ |
| 650 | + "train = InputData(\n", |
| 651 | + " channel_name=\"train\",\n", |
| 652 | + " data_source=\"s3://sagemaker-recipes-059094755717-data/data_llama3/\",\n", |
| 653 | + ")\n", |
| 654 | + "\n", |
| 655 | + "model_trainer.train(input_data_config=[train], wait=False)" |
| 656 | + ] |
| 657 | + }, |
| 658 | + { |
| 659 | + "cell_type": "markdown", |
| 660 | + "metadata": {}, |
| 661 | + "source": [ |
| 662 | + "Successful Run - https://tiny.amazon.com/125zldym8/IsenLink" |
| 663 | + ] |
539 | 664 | }
|
540 | 665 | ],
|
541 | 666 | "metadata": {
|
|
0 commit comments