
Commit c56e1e1

knikuremchoi8739 authored and committed
feature: Trainium Neuron support for PyTorch (aws#3423)
Co-authored-by: Miyoung Choi <[email protected]>
1 parent 995da32 commit c56e1e1

File tree

6 files changed: +284 -2 lines changed


doc/frameworks/pytorch/using_pytorch.rst

+115
@@ -293,6 +293,121 @@ using two ``ml.p4d.24xlarge`` instances:
    pt_estimator.fit("s3://bucket/path/to/training/data")

.. _distributed-pytorch-training-on-trainium:

Distributed Training with PyTorch Neuron on Trn1 instances
==========================================================

SageMaker Training supports Amazon EC2 Trn1 instances powered by
`AWS Trainium <https://aws.amazon.com/machine-learning/trainium/>`_ devices,
the second-generation purpose-built machine learning accelerator from AWS.
Each Trn1 instance consists of up to 16 Trainium devices, and each
Trainium device consists of two `NeuronCores
<https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trn1-arch.html#trainium-architecture>`_,
as described in the *AWS Neuron Documentation*.

You can run a distributed training job on Trn1 instances.
SageMaker supports the ``xla`` package through ``torchrun``.
With this, you do not need to manually pass ``RANK``,
``WORLD_SIZE``, ``MASTER_ADDR``, and ``MASTER_PORT``.
You can launch the training job using the
:class:`sagemaker.pytorch.estimator.PyTorch` estimator class
with the ``torch_distributed`` option as the distribution strategy.

.. note::

    This ``torch_distributed`` support is available
    in the AWS Deep Learning Containers for PyTorch Neuron starting v1.11.0.
    To find a complete list of supported versions of PyTorch Neuron, see
    `Neuron Containers <https://github.com/aws/deep-learning-containers/blob/master/available_images.md#neuron-containers>`_
    in the *AWS Deep Learning Containers GitHub repository*.

.. note::

    SageMaker Debugger is currently not supported with Trn1 instances.

Adapt Your Training Script to Initialize with the XLA backend
--------------------------------------------------------------

To initialize distributed training in your script, call
`torch.distributed.init_process_group
<https://pytorch.org/docs/master/distributed.html#torch.distributed.init_process_group>`_
with the ``xla`` backend as shown below.

.. code:: python

    import torch.distributed as dist

    dist.init_process_group('xla')

SageMaker takes care of ``'MASTER_ADDR'`` and ``'MASTER_PORT'`` for you via ``torchrun``.

For detailed documentation about modifying your training script for Trainium, see `Multi-worker data-parallel MLP training using torchrun <https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/tutorials/training/mlp.html?highlight=torchrun#multi-worker-data-parallel-mlp-training-using-torchrun>`_ in the *AWS Neuron Documentation*.

**Currently supported backends:**

- ``xla`` for Trainium (Trn1) instances

For up-to-date information on supported backends for Trn1 instances, see the `AWS Neuron Documentation <https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html>`_.

Launching a Distributed Training Job on Trainium
------------------------------------------------

You can run multi-node distributed PyTorch training jobs on Trn1 instances using the
:class:`sagemaker.pytorch.estimator.PyTorch` estimator class.
With ``instance_count=1``, the estimator submits a
single-node training job to SageMaker; with ``instance_count`` greater
than one, a multi-node training job is launched.

With the ``torch_distributed`` option, the SageMaker PyTorch estimator runs a SageMaker
training container for PyTorch Neuron, sets up the environment, and launches
the training job using the ``torchrun`` command on each worker with the given information.

**Examples**

The following examples show how to run a PyTorch training job using ``torch_distributed`` in SageMaker
on one ``ml.trn1.2xlarge`` instance and on two ``ml.trn1.32xlarge`` instances:

.. code:: python

    from sagemaker.pytorch import PyTorch

    pt_estimator = PyTorch(
        entry_point="train_torch_distributed.py",
        role="SageMakerRole",
        framework_version="1.11.0",
        py_version="py38",
        instance_count=1,
        instance_type="ml.trn1.2xlarge",
        distribution={
            "torch_distributed": {
                "enabled": True
            }
        }
    )

    pt_estimator.fit("s3://bucket/path/to/training/data")

.. code:: python

    from sagemaker.pytorch import PyTorch

    pt_estimator = PyTorch(
        entry_point="train_torch_distributed.py",
        role="SageMakerRole",
        framework_version="1.11.0",
        py_version="py38",
        instance_count=2,
        instance_type="ml.trn1.32xlarge",
        distribution={
            "torch_distributed": {
                "enabled": True
            }
        }
    )

    pt_estimator.fit("s3://bucket/path/to/training/data")

*********************
Deploy PyTorch Models
*********************
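
The examples above pass ``entry_point="train_torch_distributed.py"``, a script that is not part of this change. A minimal sketch of what such an entry point could look like, assuming the ``torch_xla`` package that ships in the PyTorch Neuron training container (the model and training loop are placeholders):

.. code:: python

    import torch
    import torch.distributed as dist
    import torch_xla.core.xla_model as xm


    def main():
        # torchrun (launched via the torch_distributed option) sets RANK, WORLD_SIZE,
        # MASTER_ADDR, and MASTER_PORT, so init_process_group needs no extra arguments.
        dist.init_process_group("xla")

        # Place a toy model on the XLA (Trainium) device and run a few steps.
        device = xm.xla_device()
        model = torch.nn.Linear(10, 1).to(device)
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

        for _ in range(10):
            optimizer.zero_grad()
            inputs = torch.randn(8, 10).to(device)
            loss = model(inputs).sum()
            loss.backward()
            # xm.optimizer_step performs the gradient synchronization XLA devices require.
            xm.optimizer_step(optimizer)


    if __name__ == "__main__":
        main()
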
@@ -0,0 +1,41 @@
{
    "training": {
        "processors": ["trn"],
        "version_aliases": {"1.11": "1.11.0"},
        "versions": {
            "1.11.0": {
                "py_versions": ["py38"],
                "repository": "pytorch-training-neuron",
                "registries": {
                    "af-south-1": "626614931356",
                    "ap-east-1": "871362719292",
                    "ap-northeast-1": "763104351884",
                    "ap-northeast-2": "763104351884",
                    "ap-northeast-3": "364406365360",
                    "ap-south-1": "763104351884",
                    "ap-southeast-1": "763104351884",
                    "ap-southeast-2": "763104351884",
                    "ca-central-1": "763104351884",
                    "cn-north-1": "727897471807",
                    "cn-northwest-1": "727897471807",
                    "eu-central-1": "763104351884",
                    "eu-north-1": "763104351884",
                    "eu-west-1": "763104351884",
                    "eu-west-2": "763104351884",
                    "eu-west-3": "763104351884",
                    "eu-south-1": "692866216735",
                    "me-south-1": "217643126080",
                    "sa-east-1": "763104351884",
                    "us-east-1": "763104351884",
                    "us-east-2": "763104351884",
                    "us-gov-west-1": "442386744353",
                    "us-iso-east-1": "886529160074",
                    "us-west-1": "763104351884",
                    "us-west-2": "763104351884"
                },
                "container_version": {"trn": "ubuntu20.04"},
                "sdk_versions": ["sdk2.4.0"]
            }
        }
    }
}

src/sagemaker/image_uris.py

+31 -2
@@ -33,6 +33,7 @@
 HUGGING_FACE_FRAMEWORK = "huggingface"
 XGBOOST_FRAMEWORK = "xgboost"
 SKLEARN_FRAMEWORK = "sklearn"
+TRAINIUM_ALLOWED_FRAMEWORKS = "pytorch"


 @override_pipeline_parameter_var

@@ -150,11 +151,12 @@ def retrieve(
         )
     else:
         _framework = framework
-        if framework == HUGGING_FACE_FRAMEWORK:
+        if framework == HUGGING_FACE_FRAMEWORK or framework in TRAINIUM_ALLOWED_FRAMEWORKS:
             inference_tool = _get_inference_tool(inference_tool, instance_type)
             if inference_tool == "neuron":
                 _framework = f"{framework}-{inference_tool}"
         final_image_scope = _get_final_image_scope(framework, instance_type, image_scope)
+        _validate_for_suppported_frameworks_and_instance_type(framework, instance_type)
         config = _config_for_framework_and_scope(_framework, final_image_scope, accelerator_type)

     original_version = version

@@ -186,6 +188,12 @@ def retrieve(
     if version_config.get("container_version"):
         container_version = version_config["container_version"][processor]

+    # Append sdk version in case of trainium instances
+    if repo in ["pytorch-training-neuron"]:
+        if not sdk_version:
+            sdk_version = _get_latest_versions(version_config["sdk_versions"])
+        container_version = sdk_version + "-" + container_version
+
     if framework == HUGGING_FACE_FRAMEWORK:
         pt_or_tf_version = (
             re.compile("^(pytorch|tensorflow)(.*)$").match(base_framework_version).group(2)

@@ -344,6 +352,16 @@ def _config_for_framework_and_scope(framework, image_scope, accelerator_type=None):
     return config if "scope" in config else config[image_scope]


+def _validate_for_suppported_frameworks_and_instance_type(framework, instace_type):
+    """Validate if framework is supported for the instance_type"""
+    if (
+        instace_type is not None
+        and "trn" in instace_type
+        and framework not in TRAINIUM_ALLOWED_FRAMEWORKS
+    ):
+        _validate_framework(framework, TRAINIUM_ALLOWED_FRAMEWORKS, "framework")
+
+
 def config_for_framework(framework):
     """Loads the JSON config for the given framework."""
     fname = os.path.join(os.path.dirname(__file__), "image_uri_config", "{}.json".format(framework))

@@ -371,7 +389,7 @@ def _get_inference_tool(inference_tool, instance_type):
     """Extract the inference tool name from instance type."""
     if not inference_tool:
         instance_type_family = _get_instance_type_family(instance_type)
-        if instance_type_family.startswith("inf"):
+        if instance_type_family.startswith("inf") or instance_type_family.startswith("trn"):
             return "neuron"
     return inference_tool

@@ -460,6 +478,8 @@ def _processor(instance_type, available_processors, serverless_inference_config=None):
             processor = family
         elif family.startswith("inf"):
             processor = "inf"
+        elif family.startswith("trn"):
+            processor = "trn"
         elif family[0] in ("g", "p"):
             processor = "gpu"
         else:

@@ -523,6 +543,15 @@ def _validate_arg(arg, available_options, arg_name):
         )


+def _validate_framework(framework, allowed_frameworks, arg_name):
+    """Checks if the framework is in the allowed frameworks, and raises a ``ValueError`` if not."""
+    if framework not in allowed_frameworks:
+        raise ValueError(
+            f"Unsupported {arg_name}: {framework}. "
+            f"Supported {arg_name}(s) for trainium instances: {allowed_frameworks}."
+        )
+
+
 def _format_tag(tag_prefix, processor, py_version, container_version, inference_tool=None):
     """Creates a tag for the image URI."""
     if inference_tool:
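
Together with the new ``pytorch-training-neuron`` config, these changes let ``image_uris.retrieve`` resolve a Trainium training image directly from an ``ml.trn1.*`` instance type. A sketch of the expected call; the URI shown is inferred from the registry accounts above and the tag format asserted by the new unit test, not captured from a run:

.. code:: python

    from sagemaker import image_uris

    # For trn1 instance types the processor resolves to "trn", the inference tool
    # to "neuron", and the latest SDK version from the config is prepended to the
    # container version when the image tag is built.
    uri = image_uris.retrieve(
        framework="pytorch",
        region="us-west-2",
        version="1.11",
        py_version="py38",
        instance_type="ml.trn1.2xlarge",
        image_scope="training",
    )
    print(uri)
    # 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training-neuron:1.11.0-neuron-py38-sdk2.4.0-ubuntu20.04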

tests/conftest.py

+5
@@ -358,6 +358,11 @@ def huggingface_neuron_latest_inference_py_version():
    return "py37"


@pytest.fixture(scope="module")
def pytorch_neuron_version():
    return "1.11"


@pytest.fixture(scope="module")
def pytorch_eia_py_version():
    return "py3"

tests/unit/sagemaker/image_uris/expected_uris.py

+18
@@ -30,6 +30,24 @@ def framework_uri(repo, fw_version, account, py_version=None, processor="cpu", r
    return IMAGE_URI_FORMAT.format(account, region, domain, repo, tag)


def neuron_framework_uri(
    repo,
    fw_version,
    account,
    py_version=None,
    inference_tool="neuron",
    region=REGION,
    sdk_version="sdk2.4.0",
    container_version="ubuntu20.04",
):
    domain = ALTERNATE_DOMAINS.get(region, DOMAIN)
    tag = "-".join(
        x for x in (fw_version, inference_tool, py_version, sdk_version, container_version) if x
    )

    return IMAGE_URI_FORMAT.format(account, region, domain, repo, tag)


def algo_uri(algo, account, region, version=1):
    domain = ALTERNATE_DOMAINS.get(region, DOMAIN)
    return IMAGE_URI_FORMAT.format(account, region, domain, algo, version)
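
A quick usage sketch of the new helper, using its defaults for ``sdk_version`` and ``container_version``; the tag it composes is what the Trainium test below compares against:

.. code:: python

    from tests.unit.sagemaker.image_uris import expected_uris

    uri = expected_uris.neuron_framework_uri(
        repo="pytorch-training-neuron",
        fw_version="1.11.0",
        account="763104351884",
        py_version="py38",
        region="us-west-2",
    )
    # Joins the non-empty parts into the tag
    # "1.11.0-neuron-py38-sdk2.4.0-ubuntu20.04" on the pytorch-training-neuron repository.
    print(uri)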
@@ -0,0 +1,74 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import

from sagemaker import image_uris
from tests.unit.sagemaker.image_uris import expected_uris

ACCOUNTS = {
    "af-south-1": "626614931356",
    "ap-east-1": "871362719292",
    "ap-northeast-1": "763104351884",
    "ap-northeast-2": "763104351884",
    "ap-northeast-3": "364406365360",
    "ap-south-1": "763104351884",
    "ap-southeast-1": "763104351884",
    "ap-southeast-2": "763104351884",
    "ca-central-1": "763104351884",
    "cn-north-1": "727897471807",
    "cn-northwest-1": "727897471807",
    "eu-central-1": "763104351884",
    "eu-north-1": "763104351884",
    "eu-west-1": "763104351884",
    "eu-west-2": "763104351884",
    "eu-west-3": "763104351884",
    "eu-south-1": "692866216735",
    "me-south-1": "217643126080",
    "sa-east-1": "763104351884",
    "us-east-1": "763104351884",
    "us-east-2": "763104351884",
    "us-gov-west-1": "442386744353",
    "us-iso-east-1": "886529160074",
    "us-west-1": "763104351884",
    "us-west-2": "763104351884",
}

TRAINIUM_REGIONS = ACCOUNTS.keys()


def _expected_trainium_framework_uri(
    framework, version, region="us-west-2", inference_tool="neuron"
):
    return expected_uris.neuron_framework_uri(
        "{}-neuron".format(framework),
        fw_version=version,
        py_version="py38",
        account=ACCOUNTS[region],
        region=region,
        inference_tool=inference_tool,
    )


def _test_trainium_framework_uris(framework, version):
    for region in TRAINIUM_REGIONS:
        uri = image_uris.retrieve(
            framework, region, instance_type="ml.trn1.xlarge", version=version
        )
        expected = _expected_trainium_framework_uri(
            "{}-training".format(framework), version, region=region, inference_tool="neuron"
        )
        assert expected == uri


def test_trainium_pytorch(pytorch_neuron_version):
    _test_trainium_framework_uris("pytorch", pytorch_neuron_version)

0 commit comments
