Commit d74894f

Merge branch 'master' into 1p-arm64
2 parents 72a60af + b958503

4 files changed: +11 −109 lines


doc/frameworks/pytorch/using_pytorch.rst

Lines changed: 0 additions & 101 deletions
@@ -293,107 +293,6 @@ using two ``ml.p4d.24xlarge`` instances:
 
     pt_estimator.fit("s3://bucket/path/to/training/data")
 
-.. _distributed-pytorch-training-on-trainium:
-
-Distributed PyTorch Training on Trainium
-========================================
-
-SageMaker Training on Trainium instances now supports the ``xla``
-package through ``torchrun``. With this, you do not need to manually pass RANK,
-WORLD_SIZE, MASTER_ADDR, and MASTER_PORT. You can launch the training job using the
-:class:`sagemaker.pytorch.estimator.PyTorch` estimator class
-with the ``torch_distributed`` option as the distribution strategy.
-
-.. note::
-
-    This ``torch_distributed`` support is available
-    in the SageMaker Trainium (trn1) PyTorch Deep Learning Containers starting v1.11.0.
-    To find a complete list of supported versions of PyTorch Neuron, see `Neuron Containers <https://github.com/aws/deep-learning-containers/blob/master/available_images.md#neuron-containers>`_ in the *AWS Deep Learning Containers GitHub repository*.
-
-    SageMaker Debugger and Profiler are currently not supported with Trainium instances.
-
-Adapt Your Training Script to Initialize with the XLA backend
--------------------------------------------------------------
-
-To initialize distributed training in your script, call
-`torch.distributed.init_process_group
-<https://pytorch.org/docs/master/distributed.html#torch.distributed.init_process_group>`_
-with the ``xla`` backend as shown below.
-
-.. code:: python
-
-    import torch.distributed as dist
-
-    dist.init_process_group('xla')
-
-SageMaker takes care of ``'MASTER_ADDR'`` and ``'MASTER_PORT'`` for you via ``torchrun``
-
-For detailed documentation about modifying your training script for Trainium, see `Multi-worker data-parallel MLP training using torchrun <https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/tutorials/training/mlp.html?highlight=torchrun#multi-worker-data-parallel-mlp-training-using-torchrun>`_ in the *AWS Neuron Documentation*.
-
-**Currently Supported backends:**
-
-- ``xla`` for Trainium (Trn1) instances
-
-For up-to-date information on supported backends for Trainium instances, see `AWS Neuron Documentation <https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html>`_.
-
-Launching a Distributed Training Job on Trainium
-------------------------------------------------
-
-You can run multi-node distributed PyTorch training jobs on Trainium instances using the
-:class:`sagemaker.pytorch.estimator.PyTorch` estimator class.
-With ``instance_count=1``, the estimator submits a
-single-node training job to SageMaker; with ``instance_count`` greater
-than one, a multi-node training job is launched.
-
-With the ``torch_distributed`` option, the SageMaker PyTorch estimator runs a SageMaker
-training container for PyTorch Neuron, sets up the environment, and launches
-the training job using the ``torchrun`` command on each worker with the given information.
-
-**Examples**
-
-The following examples show how to run a PyTorch training using ``torch_distributed`` in SageMaker
-on one ``ml.trn1.2xlarge`` instance and two ``ml.trn1.32xlarge`` instances:
-
-.. code:: python
-
-    from sagemaker.pytorch import PyTorch
-
-    pt_estimator = PyTorch(
-        entry_point="train_ptddp.py",
-        role="SageMakerRole",
-        framework_version="1.11.0",
-        py_version="py38",
-        instance_count=1,
-        instance_type="ml.trn1.2xlarge",
-        distribution={
-            "torch_distributed": {
-                "enabled": True
-            }
-        }
-    )
-
-    pt_estimator.fit("s3://bucket/path/to/training/data")
-
-.. code:: python
-
-    from sagemaker.pytorch import PyTorch
-
-    pt_estimator = PyTorch(
-        entry_point="train_ptddp.py",
-        role="SageMakerRole",
-        framework_version="1.11.0",
-        py_version="py38",
-        instance_count=2,
-        instance_type="ml.trn1.32xlarge",
-        distribution={
-            "torch_distributed": {
-                "enabled": True
-            }
-        }
-    )
-
-    pt_estimator.fit("s3://bucket/path/to/training/data")
-
 *********************
 Deploy PyTorch Models
 *********************

src/sagemaker/model.py

Lines changed: 2 additions & 2 deletions
@@ -569,8 +569,8 @@ def _upload_code(self, key_prefix: str, repack: bool = False) -> None:
                     )
                     return
                 self.sagemaker_session.context.need_runtime_repack.add(id(self))
-                self.sagemaker_session.context.runtime_repack_output_prefix = "s3://{}/{}".format(
-                    bucket, key_prefix
+                self.sagemaker_session.context.runtime_repack_output_prefix = s3.s3_path_join(
+                    "s3://", bucket, key_prefix
                 )
                 # Add the uploaded_code and repacked_model_data to update the container env
                 self.repacked_model_data = self.model_data
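The switch from ``str.format`` to ``s3.s3_path_join`` in both call sites appears to be about slash handling: a ``key_prefix`` that already ends with ``/`` leaves a stray separator in the formatted URI, while a join helper can normalize the separators. A minimal sketch of that difference, using a hypothetical ``join_s3_path`` helper (not the SDK's actual ``s3.s3_path_join`` implementation):

# Hypothetical illustration only -- not the SDK's s3.s3_path_join implementation.
def join_s3_path(scheme: str, *parts: str) -> str:
    """Join S3 URI components, dropping redundant '/' separators between parts."""
    return scheme + "/".join(p.strip("/") for p in parts if p)

bucket = "my-bucket"          # hypothetical bucket
key_prefix = "code-output/"   # note the trailing slash

print("s3://{}/{}".format(bucket, key_prefix))    # s3://my-bucket/code-output/
print(join_s3_path("s3://", bucket, key_prefix))  # s3://my-bucket/code-output

Appending a further segment (for example a model name) to the normalized form then produces a single ``/`` rather than ``//``.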

src/sagemaker/tensorflow/model.py

Lines changed: 2 additions & 2 deletions
@@ -396,8 +396,8 @@ def prepare_container_def(
             # model is not yet there, defer repacking to later during pipeline execution
             if isinstance(self.sagemaker_session, PipelineSession):
                 self.sagemaker_session.context.need_runtime_repack.add(id(self))
-                self.sagemaker_session.context.runtime_repack_output_prefix = "s3://{}/{}".format(
-                    bucket, key_prefix
+                self.sagemaker_session.context.runtime_repack_output_prefix = s3.s3_path_join(
+                    "s3://", bucket, key_prefix
                 )
             else:
                 logging.warning(

tests/unit/sagemaker/workflow/test_model_step.py

Lines changed: 7 additions & 4 deletions
@@ -70,6 +70,7 @@
 _TENSORFLOW_PATH = os.path.join(DATA_DIR, "tfs/tfs-test-entrypoint-and-dependencies")
 _REPACK_OUTPUT_KEY_PREFIX = "code-output"
 _MODEL_CODE_LOCATION = f"s3://{_BUCKET}/{_REPACK_OUTPUT_KEY_PREFIX}"
+_MODEL_CODE_LOCATION_TRAILING_SLASH = _MODEL_CODE_LOCATION + "/"
 
 
 @pytest.fixture
@@ -701,7 +702,7 @@ def test_conditional_model_create_and_regis(
                 entry_point=f"{DATA_DIR}/{_SCRIPT_NAME}",
                 role=_ROLE,
                 enable_network_isolation=True,
-                code_location=_MODEL_CODE_LOCATION,
+                code_location=_MODEL_CODE_LOCATION_TRAILING_SLASH,
             ),
             2,
         ),
@@ -725,7 +726,7 @@ def test_conditional_model_create_and_regis(
                 entry_point=f"{DATA_DIR}/{_SCRIPT_NAME}",
                 role=_ROLE,
                 framework_version="1.5.0",
-                code_location=_MODEL_CODE_LOCATION,
+                code_location=_MODEL_CODE_LOCATION_TRAILING_SLASH,
             ),
             2,
         ),
@@ -757,7 +758,7 @@ def test_conditional_model_create_and_regis(
                 image_uri=_IMAGE_URI,
                 entry_point=f"{DATA_DIR}/{_SCRIPT_NAME}",
                 role=_ROLE,
-                code_location=_MODEL_CODE_LOCATION,
+                code_location=_MODEL_CODE_LOCATION_TRAILING_SLASH,
             ),
             2,
         ),
@@ -780,7 +781,9 @@ def assert_test_result(steps: list):
         assert len(steps) == expected_step_num
         if expected_step_num == 2:
             assert steps[0]["Type"] == "Training"
-            if model.key_prefix == _REPACK_OUTPUT_KEY_PREFIX:
+            if model.key_prefix is not None and model.key_prefix.startswith(
+                _REPACK_OUTPUT_KEY_PREFIX
+            ):
                 assert steps[0]["Arguments"]["OutputDataConfig"]["S3OutputPath"] == (
                     f"{_MODEL_CODE_LOCATION}/{model.name}"
                 )
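The relaxed assertion above accounts for the new trailing-slash fixture: when ``code_location`` ends with ``/``, the key prefix recorded on the model can carry extra characters or segments, so an exact comparison with ``_REPACK_OUTPUT_KEY_PREFIX`` no longer holds. A small sketch of the check the test now performs, with hypothetical values:

# Hypothetical values -- illustrating the relaxed check, not the SDK's internals.
repack_output_key_prefix = "code-output"    # _REPACK_OUTPUT_KEY_PREFIX in the test
derived_key_prefix = "code-output/mymodel"  # e.g. a prefix derived from a trailing-slash code_location

assert derived_key_prefix != repack_output_key_prefix           # old equality check would fail
assert derived_key_prefix.startswith(repack_output_key_prefix)  # new startswith check passes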
