
Commit 5f4ffa7

Merge branch 'master' into mm-batch-support-on-demand
2 parents e3bf28f + d47d966 commit 5f4ffa7

File tree

13 files changed: +504 -79 lines changed


CHANGELOG.md

Lines changed: 14 additions & 0 deletions
@@ -1,5 +1,19 @@
 # Changelog
 
+## v2.107.0 (2022-08-29)
+
+### Features
+
+ * support python 3.10, update airflow dependency
+
+### Bug Fixes and Other Changes
+
+ * Add retry in session.py to check if training is finished
+
+### Documentation Changes
+
+ * remove Other tab in Built-in algorithms section and mi…
+
 ## v2.106.0 (2022-08-24)
 
 ### Features

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.106.1.dev0
+2.107.1.dev0

src/sagemaker/estimator.py

Lines changed: 82 additions & 9 deletions
@@ -100,6 +100,7 @@ class EstimatorBase(with_metaclass(ABCMeta, object)):  # pylint: disable=too-man
             instance.
     """
 
+    LAUNCH_PT_XLA_ENV_NAME = "sagemaker_pytorch_xla_multi_worker_enabled"
     LAUNCH_PS_ENV_NAME = "sagemaker_parameter_server_enabled"
     LAUNCH_MPI_ENV_NAME = "sagemaker_mpi_enabled"
     LAUNCH_SM_DDP_ENV_NAME = "sagemaker_distributed_dataparallel_enabled"
@@ -166,10 +167,44 @@ def __init__(
             instance_type (str): Type of EC2 instance to use for training,
                 for example, ``'ml.c4.xlarge'``. Required if instance_groups is
                 not set.
-            volume_size (int): Size in GB of the EBS volume to use for
-                storing input data during training (default: 30). Must be large
-                enough to store training data if File Mode is used (which is the
-                default).
+            volume_size (int): Size in GB of the storage volume to use for
+                storing input and output data during training (default: 30).
+
+                Must be large enough to store training data if File mode is
+                used, which is the default mode.
+
+                When you use an ML instance with the EBS-only storage option
+                such as ``ml.c5`` and ``ml.p2``,
+                you must define the size of the EBS
+                volume through the ``volume_size`` parameter in the estimator class.
+
+                .. note::
+
+                    When you use an ML instance with `NVMe SSD volumes
+                    <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ssd-instance-store.html#nvme-ssd-volumes>`_
+                    such as ``ml.p4d``, ``ml.g4dn``, and ``ml.g5``,
+                    do not include this parameter in the estimator configuration.
+                    If you use one of those ML instance types,
+                    SageMaker doesn't provision Amazon EBS General Purpose SSD
+                    (gp2) storage, nor does it use this parameter to adjust the NVMe instance storage.
+                    Available storage is fixed to the NVMe instance storage
+                    capacity. SageMaker configures storage paths for training
+                    datasets, checkpoints, model artifacts, and outputs to use the
+                    entire capacity of the instance storage.
+
+                    Note that if you include this parameter and specify a number that
+                    exceeds the size of the NVMe volume attached to the instance type,
+                    SageMaker returns an ``Invalid VolumeSizeInGB`` error.
+
+                To look up instance types and their instance storage types
+                and volumes, see `Amazon EC2 Instance Types
+                <http://aws.amazon.com/ec2/instance-types/>`_.
+
+                To find the default local paths defined by the SageMaker
+                training platform, see `Amazon SageMaker Training Storage
+                Folders for Training Datasets, Checkpoints, Model Artifacts,
+                and Outputs
+                <https://docs.aws.amazon.com/sagemaker/latest/dg/model-train-storage.html>`_.
             volume_kms_key (str): Optional. KMS key ID for encrypting EBS
                 volume attached to the training instance (default: None).
             max_run (int): Timeout in seconds for training (default: 24 * 60 * 60).
@@ -2232,12 +2267,46 @@ def __init__(
             instance_count (int): Number of Amazon EC2 instances to use
                 for training. Required if instance_groups is not set.
             instance_type (str): Type of EC2 instance to use for training,
-                for example, 'ml.c4.xlarge'. Required if instance_groups is
+                for example, ``'ml.c4.xlarge'``. Required if instance_groups is
                 not set.
-            volume_size (int): Size in GB of the EBS volume to use for
-                storing input data during training (default: 30). Must be large
-                enough to store training data if File Mode is used (which is the
-                default).
+            volume_size (int): Size in GB of the storage volume to use for
+                storing input and output data during training (default: 30).
+
+                Must be large enough to store training data if File mode is
+                used, which is the default mode.
+
+                When you use an ML instance with the EBS-only storage option
+                such as ``ml.c5`` and ``ml.p2``,
+                you must define the size of the EBS
+                volume through the ``volume_size`` parameter in the estimator class.
+
+                .. note::
+
+                    When you use an ML instance with `NVMe SSD volumes
+                    <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ssd-instance-store.html#nvme-ssd-volumes>`_
+                    such as ``ml.p4d``, ``ml.g4dn``, and ``ml.g5``,
+                    do not include this parameter in the estimator configuration.
+                    If you use one of those ML instance types,
+                    SageMaker doesn't provision Amazon EBS General Purpose SSD
+                    (gp2) storage, nor does it use this parameter to adjust the NVMe instance storage.
+                    Available storage is fixed to the NVMe instance storage
+                    capacity. SageMaker configures storage paths for training
+                    datasets, checkpoints, model artifacts, and outputs to use the
+                    entire capacity of the instance storage.
+
+                    Note that if you include this parameter and specify a number that
+                    exceeds the size of the NVMe volume attached to the instance type,
+                    SageMaker returns an ``Invalid VolumeSizeInGB`` error.
+
+                To look up instance types and their instance storage types
+                and volumes, see `Amazon EC2 Instance Types
+                <http://aws.amazon.com/ec2/instance-types/>`_.
+
+                To find the default local paths defined by the SageMaker
+                training platform, see `Amazon SageMaker Training Storage
+                Folders for Training Datasets, Checkpoints, Model Artifacts,
+                and Outputs
+                <https://docs.aws.amazon.com/sagemaker/latest/dg/model-train-storage.html>`_.
             volume_kms_key (str): Optional. KMS key ID for encrypting EBS
                 volume attached to the training instance (default: None).
             max_run (int): Timeout in seconds for training (default: 24 * 60 * 60).
@@ -3248,6 +3317,10 @@ def _distribution_configuration(self, distribution):
                 "instance_groups"
             ]
 
+        if "pytorchxla" in distribution:
+            pt_xla_enabled = distribution.get("pytorchxla").get("enabled", False)
+            distribution_config[self.LAUNCH_PT_XLA_ENV_NAME] = pt_xla_enabled
+
         if "parameter_server" in distribution:
            ps_enabled = distribution.get("parameter_server").get("enabled", False)
            distribution_config[self.LAUNCH_PS_ENV_NAME] = ps_enabled
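
The new branch in ``_distribution_configuration`` translates the user-facing ``distribution`` dict into the ``sagemaker_pytorch_xla_multi_worker_enabled`` hyperparameter, the same way the existing parameter-server and MPI keys are handled. A minimal self-contained sketch of that mapping (``distribution_to_config`` is an illustrative standalone helper, not SDK API):

LAUNCH_PT_XLA_ENV_NAME = "sagemaker_pytorch_xla_multi_worker_enabled"

def distribution_to_config(distribution):
    """Mirror the new branch above: map the user dict to a launcher flag."""
    distribution_config = {}
    if "pytorchxla" in distribution:
        pt_xla_enabled = distribution.get("pytorchxla").get("enabled", False)
        distribution_config[LAUNCH_PT_XLA_ENV_NAME] = pt_xla_enabled
    return distribution_config

print(distribution_to_config({"pytorchxla": {"enabled": True}}))
# {'sagemaker_pytorch_xla_multi_worker_enabled': True}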

src/sagemaker/huggingface/estimator.py

Lines changed: 29 additions & 0 deletions
@@ -141,6 +141,28 @@ def __init__(
                         }
                     }
                 }
+
+            To enable distributed training with
+            `SageMaker Training Compiler <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_
+            for Hugging Face Transformers with PyTorch:
+
+            .. code:: python
+
+                {
+                    "pytorchxla": {
+                        "enabled": True
+                    }
+                }
+
+            To learn more, see `SageMaker Training Compiler
+            <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_
+            in the *Amazon SageMaker Developer Guide*.
+
+            .. note::
+
+                When you use this PyTorch XLA option as the distributed training
+                strategy, you must add the ``compiler_config`` parameter and
+                activate SageMaker Training Compiler.
             compiler_config (:class:`~sagemaker.huggingface.TrainingCompilerConfig`):
                 Configures SageMaker Training Compiler to accelerate training.
 
@@ -204,6 +226,13 @@ def __init__(
                 raise ValueError(error_string)
         if compiler_config:
             compiler_config.validate(self)
+        elif distribution is not None and "pytorchxla" in distribution:
+            raise ValueError(
+                "Distributed training through PyTorch XLA is currently only supported "
+                "when SageMaker Training Compiler is enabled. To learn more, "
+                "see Enable SageMaker Training Compiler at "
+                "https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler-enable.html."
+            )
         self.compiler_config = compiler_config
 
     def _validate_args(self, image_uri):
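
Taken together with the new docstring and validation, enabling ``pytorchxla`` requires an accompanying ``compiler_config``. A sketch of the intended usage — the role ARN and entry point are placeholders; the version, py_version, and instance-type choices follow the image config and EFA list added in this commit:

from sagemaker.huggingface import HuggingFace, TrainingCompilerConfig

hf_estimator = HuggingFace(
    entry_point="train.py",  # hypothetical training script
    role="arn:aws:iam::111122223333:role/SageMakerRole",  # placeholder ARN
    instance_type="ml.p4d.24xlarge",  # EFA-capable, per SUPPORTED_INSTANCE_TYPES_WITH_EFA
    instance_count=2,
    transformers_version="4.21.1",
    pytorch_version="1.11.0",
    py_version="py38",
    distribution={"pytorchxla": {"enabled": True}},
    compiler_config=TrainingCompilerConfig(),  # required when "pytorchxla" is set
)

Passing ``distribution={"pytorchxla": ...}`` without ``compiler_config`` now raises the ValueError added above.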

src/sagemaker/huggingface/training_compiler/config.py

Lines changed: 54 additions & 2 deletions
@@ -14,6 +14,8 @@
 from __future__ import absolute_import
 import logging
 from typing import Union
+from packaging.specifiers import SpecifierSet
+from packaging.version import Version
 
 from sagemaker.training_compiler.config import TrainingCompilerConfig as BaseConfig
 from sagemaker.workflow.entities import PipelineVariable
@@ -24,7 +26,14 @@
 class TrainingCompilerConfig(BaseConfig):
     """The SageMaker Training Compiler configuration class."""
 
-    SUPPORTED_INSTANCE_CLASS_PREFIXES = ["p3", "g4dn", "p4"]
+    SUPPORTED_INSTANCE_CLASS_PREFIXES = ["p3", "g4dn", "p4d", "g5"]
+    SUPPORTED_INSTANCE_TYPES_WITH_EFA = [
+        "ml.g4dn.8xlarge",
+        "ml.g4dn.12xlarge",
+        "ml.g5.48xlarge",
+        "ml.p3dn.24xlarge",
+        "ml.p4d.24xlarge",
+    ]
 
     def __init__(
         self,
@@ -85,7 +94,7 @@ def validate(
         """Checks if SageMaker Training Compiler is configured correctly.
 
         Args:
-            estimator (str): A estimator object
+            estimator (:class:`sagemaker.huggingface.HuggingFace`): An estimator object.
                 If SageMaker Training Compiler is enabled, it will validate whether
                 the estimator is configured to be compatible with Training Compiler.
 
@@ -105,3 +114,46 @@ def validate(
                 "transformer_version, tensorflow_version or pytorch_version, and compiler_config."
             )
             raise ValueError(error_helper_string)
+
+        if estimator.distribution:
+            pt_xla_present = "pytorchxla" in estimator.distribution
+            pt_xla_enabled = estimator.distribution.get("pytorchxla", {}).get("enabled", False)
+            if pt_xla_enabled:
+                if estimator.tensorflow_version:
+                    error_helper_string = (
+                        "Distribution mechanism 'pytorchxla' is currently only supported for "
+                        "PyTorch >= 1.11 when SageMaker Training Compiler is enabled. Received "
+                        "tensorflow_version={} which is unsupported."
+                    )
+                    raise ValueError(error_helper_string.format(estimator.tensorflow_version))
+                if estimator.pytorch_version:
+                    if Version(estimator.pytorch_version) in SpecifierSet("< 1.11"):
+                        error_helper_string = (
+                            "Distribution mechanism 'pytorchxla' is currently only supported for "
+                            "PyTorch >= 1.11 when SageMaker Training Compiler is enabled."
+                            " Received pytorch_version={} which is unsupported."
+                        )
+                        raise ValueError(error_helper_string.format(estimator.pytorch_version))
+                if estimator.instance_type not in cls.SUPPORTED_INSTANCE_TYPES_WITH_EFA:
+                    logger.warning(
+                        "Consider using instances with EFA support when "
+                        "training with PyTorch >= 1.11 and SageMaker Training Compiler "
+                        "enabled. SageMaker Training Compiler leverages EFA to provide better "
+                        "performance for distributed training."
+                    )
+            if not pt_xla_present:
+                if estimator.pytorch_version:
+                    if Version(estimator.pytorch_version) in SpecifierSet(">= 1.11"):
+                        error_helper_string = (
+                            "'pytorchxla' is the only distribution mechanism currently supported "
+                            "for PyTorch >= 1.11 when SageMaker Training Compiler is enabled."
+                            " Received distribution={} which is unsupported."
+                        )
+                        raise ValueError(error_helper_string.format(estimator.distribution))
+        elif estimator.instance_count and estimator.instance_count > 1:
+            if estimator.pytorch_version:
+                if Version(estimator.pytorch_version) in SpecifierSet(">= 1.11"):
+                    logger.warning(
+                        "Consider setting 'distribution' to 'pytorchxla' for distributed "
+                        "training with PyTorch >= 1.11 and SageMaker Training Compiler enabled."
+                    )

src/sagemaker/image_uri_config/huggingface-training-compiler.json

Lines changed: 36 additions & 1 deletion
@@ -3,7 +3,8 @@
         "processors": ["gpu"],
         "version_aliases": {
             "4.11": "4.11.0",
-            "4.17": "4.17.0"
+            "4.17": "4.17.0",
+            "4.21": "4.21.1"
         },
         "versions": {
             "4.11.0": {
@@ -97,6 +98,40 @@
                     "repository": "huggingface-tensorflow-trcomp-training",
                     "container_version": {"gpu":"cu112-ubuntu20.04"}
                 }
+            },
+            "4.21.1": {
+                "version_aliases": {
+                    "pytorch1.11": "pytorch1.11.0"
+                },
+                "pytorch1.11.0": {
+                    "py_versions": ["py38"],
+                    "registries": {
+                        "af-south-1": "626614931356",
+                        "ap-east-1": "871362719292",
+                        "ap-northeast-1": "763104351884",
+                        "ap-northeast-2": "763104351884",
+                        "ap-northeast-3": "364406365360",
+                        "ap-south-1": "763104351884",
+                        "ap-southeast-1": "763104351884",
+                        "ap-southeast-2": "763104351884",
+                        "ap-southeast-3": "907027046896",
+                        "ca-central-1": "763104351884",
+                        "eu-central-1": "763104351884",
+                        "eu-north-1": "763104351884",
+                        "eu-south-1": "692866216735",
+                        "eu-west-1": "763104351884",
+                        "eu-west-2": "763104351884",
+                        "eu-west-3": "763104351884",
+                        "me-south-1": "217643126080",
+                        "sa-east-1": "763104351884",
+                        "us-east-1": "763104351884",
+                        "us-east-2": "763104351884",
+                        "us-west-1": "763104351884",
+                        "us-west-2": "763104351884"
+                    },
+                    "repository": "huggingface-pytorch-trcomp-training",
+                    "container_version": {"gpu":"cu113-ubuntu20.04"}
+                }
             }
         }
     }
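
With the new ``4.21.1`` entry in place, the matching container URI should be resolvable through the SDK's image URI lookup. A sketch, assuming the standard keyword usage of ``sagemaker.image_uris.retrieve`` for this framework key; the region, account, and repository values come from the JSON above:

from sagemaker import image_uris

uri = image_uris.retrieve(
    framework="huggingface-training-compiler",
    region="us-west-2",
    version="4.21.1",
    base_framework_version="pytorch1.11.0",
    py_version="py38",
    instance_type="ml.p4d.24xlarge",
)
print(uri)
# Expected shape, from the registry map and repository fields above:
# 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-trcomp-training:…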
