Commit d74894f

Merge branch 'master' into 1p-arm64
2 parents 72a60af + b958503

4 files changed: +11 −109 lines


doc/frameworks/pytorch/using_pytorch.rst

Lines changed: 0 additions & 101 deletions
@@ -293,107 +293,6 @@ using two ``ml.p4d.24xlarge`` instances:
 
     pt_estimator.fit("s3://bucket/path/to/training/data")
 
-.. _distributed-pytorch-training-on-trainium:
-
-Distributed PyTorch Training on Trainium
-========================================
-
-SageMaker Training on Trainium instances now supports the ``xla``
-package through ``torchrun``. With this, you do not need to manually pass RANK,
-WORLD_SIZE, MASTER_ADDR, and MASTER_PORT. You can launch the training job using the
-:class:`sagemaker.pytorch.estimator.PyTorch` estimator class
-with the ``torch_distributed`` option as the distribution strategy.
-
-.. note::
-
-    This ``torch_distributed`` support is available
-    in the SageMaker Trainium (trn1) PyTorch Deep Learning Containers starting v1.11.0.
-    To find a complete list of supported versions of PyTorch Neuron, see `Neuron Containers <https://github.com/aws/deep-learning-containers/blob/master/available_images.md#neuron-containers>`_ in the *AWS Deep Learning Containers GitHub repository*.
-
-    SageMaker Debugger and Profiler are currently not supported with Trainium instances.
-
-Adapt Your Training Script to Initialize with the XLA backend
--------------------------------------------------------------
-
-To initialize distributed training in your script, call
-`torch.distributed.init_process_group
-<https://pytorch.org/docs/master/distributed.html#torch.distributed.init_process_group>`_
-with the ``xla`` backend as shown below.
-
-.. code:: python
-
-    import torch.distributed as dist
-
-    dist.init_process_group('xla')
-
-SageMaker takes care of ``'MASTER_ADDR'`` and ``'MASTER_PORT'`` for you via ``torchrun``
-
-For detailed documentation about modifying your training script for Trainium, see `Multi-worker data-parallel MLP training using torchrun <https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/tutorials/training/mlp.html?highlight=torchrun#multi-worker-data-parallel-mlp-training-using-torchrun>`_ in the *AWS Neuron Documentation*.
-
-**Currently Supported backends:**
-
-- ``xla`` for Trainium (Trn1) instances
-
-For up-to-date information on supported backends for Trainium instances, see `AWS Neuron Documentation <https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html>`_.
-
-Launching a Distributed Training Job on Trainium
-------------------------------------------------
-
-You can run multi-node distributed PyTorch training jobs on Trainium instances using the
-:class:`sagemaker.pytorch.estimator.PyTorch` estimator class.
-With ``instance_count=1``, the estimator submits a
-single-node training job to SageMaker; with ``instance_count`` greater
-than one, a multi-node training job is launched.
-
-With the ``torch_distributed`` option, the SageMaker PyTorch estimator runs a SageMaker
-training container for PyTorch Neuron, sets up the environment, and launches
-the training job using the ``torchrun`` command on each worker with the given information.
-
-**Examples**
-
-The following examples show how to run a PyTorch training using ``torch_distributed`` in SageMaker
-on one ``ml.trn1.2xlarge`` instance and two ``ml.trn1.32xlarge`` instances:
-
-.. code:: python
-
-    from sagemaker.pytorch import PyTorch
-
-    pt_estimator = PyTorch(
-        entry_point="train_ptddp.py",
-        role="SageMakerRole",
-        framework_version="1.11.0",
-        py_version="py38",
-        instance_count=1,
-        instance_type="ml.trn1.2xlarge",
-        distribution={
-            "torch_distributed": {
-                "enabled": True
-            }
-        }
-    )
-
-    pt_estimator.fit("s3://bucket/path/to/training/data")
-
-.. code:: python
-
-    from sagemaker.pytorch import PyTorch
-
-    pt_estimator = PyTorch(
-        entry_point="train_ptddp.py",
-        role="SageMakerRole",
-        framework_version="1.11.0",
-        py_version="py38",
-        instance_count=2,
-        instance_type="ml.trn1.32xlarge",
-        distribution={
-            "torch_distributed": {
-                "enabled": True
-            }
-        }
-    )
-
-    pt_estimator.fit("s3://bucket/path/to/training/data")
-
 *********************
 Deploy PyTorch Models
 *********************

src/sagemaker/model.py

Lines changed: 2 additions & 2 deletions
@@ -569,8 +569,8 @@ def _upload_code(self, key_prefix: str, repack: bool = False) -> None:
                     )
                     return
                 self.sagemaker_session.context.need_runtime_repack.add(id(self))
-                self.sagemaker_session.context.runtime_repack_output_prefix = "s3://{}/{}".format(
-                    bucket, key_prefix
+                self.sagemaker_session.context.runtime_repack_output_prefix = s3.s3_path_join(
+                    "s3://", bucket, key_prefix
                 )
                 # Add the uploaded_code and repacked_model_data to update the container env
                 self.repacked_model_data = self.model_data
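The switch from ``str.format`` to ``s3.s3_path_join`` in both call sites appears to be about slash handling: a ``key_prefix`` that already ends with ``/`` leaves a stray separator in the formatted URI, while a join helper can normalize the separators. A minimal sketch of that difference, using a hypothetical ``join_s3_path`` helper (not the SDK's actual ``s3.s3_path_join`` implementation):

# Hypothetical illustration only -- not the SDK's s3.s3_path_join implementation.
def join_s3_path(scheme: str, *parts: str) -> str:
    """Join S3 URI components, dropping redundant '/' separators between parts."""
    return scheme + "/".join(p.strip("/") for p in parts if p)

bucket = "my-bucket"          # hypothetical bucket
key_prefix = "code-output/"   # note the trailing slash

print("s3://{}/{}".format(bucket, key_prefix))    # s3://my-bucket/code-output/
print(join_s3_path("s3://", bucket, key_prefix))  # s3://my-bucket/code-output

Appending a further segment (for example a model name) to the normalized form then produces a single ``/`` rather than ``//``.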

src/sagemaker/tensorflow/model.py

Lines changed: 2 additions & 2 deletions
@@ -396,8 +396,8 @@ def prepare_container_def(
             # model is not yet there, defer repacking to later during pipeline execution
             if isinstance(self.sagemaker_session, PipelineSession):
                 self.sagemaker_session.context.need_runtime_repack.add(id(self))
-                self.sagemaker_session.context.runtime_repack_output_prefix = "s3://{}/{}".format(
-                    bucket, key_prefix
+                self.sagemaker_session.context.runtime_repack_output_prefix = s3.s3_path_join(
+                    "s3://", bucket, key_prefix
                 )
             else:
                 logging.warning(

tests/unit/sagemaker/workflow/test_model_step.py

Lines changed: 7 additions & 4 deletions
@@ -70,6 +70,7 @@
 _TENSORFLOW_PATH = os.path.join(DATA_DIR, "tfs/tfs-test-entrypoint-and-dependencies")
 _REPACK_OUTPUT_KEY_PREFIX = "code-output"
 _MODEL_CODE_LOCATION = f"s3://{_BUCKET}/{_REPACK_OUTPUT_KEY_PREFIX}"
+_MODEL_CODE_LOCATION_TRAILING_SLASH = _MODEL_CODE_LOCATION + "/"
 
 
 @pytest.fixture
@@ -701,7 +702,7 @@ def test_conditional_model_create_and_regis(
                 entry_point=f"{DATA_DIR}/{_SCRIPT_NAME}",
                 role=_ROLE,
                 enable_network_isolation=True,
-                code_location=_MODEL_CODE_LOCATION,
+                code_location=_MODEL_CODE_LOCATION_TRAILING_SLASH,
             ),
             2,
         ),
@@ -725,7 +726,7 @@ def test_conditional_model_create_and_regis(
                 entry_point=f"{DATA_DIR}/{_SCRIPT_NAME}",
                 role=_ROLE,
                 framework_version="1.5.0",
-                code_location=_MODEL_CODE_LOCATION,
+                code_location=_MODEL_CODE_LOCATION_TRAILING_SLASH,
             ),
             2,
         ),
@@ -757,7 +758,7 @@ def test_conditional_model_create_and_regis(
                 image_uri=_IMAGE_URI,
                 entry_point=f"{DATA_DIR}/{_SCRIPT_NAME}",
                 role=_ROLE,
-                code_location=_MODEL_CODE_LOCATION,
+                code_location=_MODEL_CODE_LOCATION_TRAILING_SLASH,
             ),
             2,
         ),
@@ -780,7 +781,9 @@ def assert_test_result(steps: list):
         assert len(steps) == expected_step_num
         if expected_step_num == 2:
             assert steps[0]["Type"] == "Training"
-            if model.key_prefix == _REPACK_OUTPUT_KEY_PREFIX:
+            if model.key_prefix is not None and model.key_prefix.startswith(
+                _REPACK_OUTPUT_KEY_PREFIX
+            ):
                 assert steps[0]["Arguments"]["OutputDataConfig"]["S3OutputPath"] == (
                     f"{_MODEL_CODE_LOCATION}/{model.name}"
                 )
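The relaxed assertion above accounts for the new trailing-slash fixture: when ``code_location`` ends with ``/``, the key prefix recorded on the model can carry extra characters or segments, so an exact comparison with ``_REPACK_OUTPUT_KEY_PREFIX`` no longer holds. A small sketch of the check the test now performs, with hypothetical values:

# Hypothetical values -- illustrating the relaxed check, not the SDK's internals.
repack_output_key_prefix = "code-output"    # _REPACK_OUTPUT_KEY_PREFIX in the test
derived_key_prefix = "code-output/mymodel"  # e.g. a prefix derived from a trailing-slash code_location

assert derived_key_prefix != repack_output_key_prefix           # old equality check would fail
assert derived_key_prefix.startswith(repack_output_key_prefix)  # new startswith check passes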
