
Commit 5f4ffa7

Merge branch 'master' into mm-batch-support-on-demand
2 parents e3bf28f + d47d966 commit 5f4ffa7

File tree

13 files changed: +504 -79 lines changed


CHANGELOG.md

Lines changed: 14 additions & 0 deletions
@@ -1,5 +1,19 @@
 # Changelog
 
+## v2.107.0 (2022-08-29)
+
+### Features
+
+ * support python 3.10, update airflow dependency
+
+### Bug Fixes and Other Changes
+
+ * Add retry in session.py to check if training is finished
+
+### Documentation Changes
+
+ * remove Other tab in Built-in algorithms section and mi…
+
 ## v2.106.0 (2022-08-24)
 
 ### Features

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.106.1.dev0
+2.107.1.dev0

src/sagemaker/estimator.py

Lines changed: 82 additions & 9 deletions
@@ -100,6 +100,7 @@ class EstimatorBase(with_metaclass(ABCMeta, object)):  # pylint: disable=too-man
             instance.
     """
 
+    LAUNCH_PT_XLA_ENV_NAME = "sagemaker_pytorch_xla_multi_worker_enabled"
     LAUNCH_PS_ENV_NAME = "sagemaker_parameter_server_enabled"
     LAUNCH_MPI_ENV_NAME = "sagemaker_mpi_enabled"
     LAUNCH_SM_DDP_ENV_NAME = "sagemaker_distributed_dataparallel_enabled"
@@ -166,10 +167,44 @@ def __init__(
             instance_type (str): Type of EC2 instance to use for training,
                 for example, ``'ml.c4.xlarge'``. Required if instance_groups is
                 not set.
-            volume_size (int): Size in GB of the EBS volume to use for
-                storing input data during training (default: 30). Must be large
-                enough to store training data if File Mode is used (which is the
-                default).
+            volume_size (int): Size in GB of the storage volume to use for
+                storing input and output data during training (default: 30).
+
+                Must be large enough to store training data if File mode is
+                used, which is the default mode.
+
+                When you use an ML instance with the EBS-only storage option
+                such as ``ml.c5`` and ``ml.p2``,
+                you must define the size of the EBS
+                volume through the ``volume_size`` parameter in the estimator class.
+
+                .. note::
+
+                    When you use an ML instance with `NVMe SSD volumes
+                    <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ssd-instance-store.html#nvme-ssd-volumes>`_
+                    such as ``ml.p4d``, ``ml.g4dn``, and ``ml.g5``,
+                    do not include this parameter in the estimator configuration.
+                    If you use one of those ML instance types,
+                    SageMaker doesn't provision Amazon EBS General Purpose SSD
+                    (gp2) storage, nor does it use this parameter to adjust the NVMe instance storage.
+                    Available storage is fixed to the NVMe instance storage
+                    capacity. SageMaker configures storage paths for training
+                    datasets, checkpoints, model artifacts, and outputs to use the
+                    entire capacity of the instance storage.
+
+                    Note that if you include this parameter and specify a number that
+                    exceeds the size of the NVMe volume attached to the instance type,
+                    SageMaker returns an ``Invalid VolumeSizeInGB`` error.
+
+                To look up instance types and their instance storage types
+                and volumes, see `Amazon EC2 Instance Types
+                <http://aws.amazon.com/ec2/instance-types/>`_.
+
+                To find the default local paths defined by the SageMaker
+                training platform, see `Amazon SageMaker Training Storage
+                Folders for Training Datasets, Checkpoints, Model Artifacts,
+                and Outputs
+                <https://docs.aws.amazon.com/sagemaker/latest/dg/model-train-storage.html>`_.
             volume_kms_key (str): Optional. KMS key ID for encrypting EBS
                 volume attached to the training instance (default: None).
             max_run (int): Timeout in seconds for training (default: 24 * 60 * 60).
@@ -2232,12 +2267,46 @@ def __init__(
             instance_count (int): Number of Amazon EC2 instances to use
                 for training. Required if instance_groups is not set.
             instance_type (str): Type of EC2 instance to use for training,
-                for example, 'ml.c4.xlarge'. Required if instance_groups is
+                for example, ``'ml.c4.xlarge'``. Required if instance_groups is
                 not set.
-            volume_size (int): Size in GB of the EBS volume to use for
-                storing input data during training (default: 30). Must be large
-                enough to store training data if File Mode is used (which is the
-                default).
+            volume_size (int): Size in GB of the storage volume to use for
+                storing input and output data during training (default: 30).
+
+                Must be large enough to store training data if File mode is
+                used, which is the default mode.
+
+                When you use an ML instance with the EBS-only storage option
+                such as ``ml.c5`` and ``ml.p2``,
+                you must define the size of the EBS
+                volume through the ``volume_size`` parameter in the estimator class.
+
+                .. note::
+
+                    When you use an ML instance with `NVMe SSD volumes
+                    <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ssd-instance-store.html#nvme-ssd-volumes>`_
+                    such as ``ml.p4d``, ``ml.g4dn``, and ``ml.g5``,
+                    do not include this parameter in the estimator configuration.
+                    If you use one of those ML instance types,
+                    SageMaker doesn't provision Amazon EBS General Purpose SSD
+                    (gp2) storage, nor does it use this parameter to adjust the NVMe instance storage.
+                    Available storage is fixed to the NVMe instance storage
+                    capacity. SageMaker configures storage paths for training
+                    datasets, checkpoints, model artifacts, and outputs to use the
+                    entire capacity of the instance storage.
+
+                    Note that if you include this parameter and specify a number that
+                    exceeds the size of the NVMe volume attached to the instance type,
+                    SageMaker returns an ``Invalid VolumeSizeInGB`` error.
+
+                To look up instance types and their instance storage types
+                and volumes, see `Amazon EC2 Instance Types
+                <http://aws.amazon.com/ec2/instance-types/>`_.
+
+                To find the default local paths defined by the SageMaker
+                training platform, see `Amazon SageMaker Training Storage
+                Folders for Training Datasets, Checkpoints, Model Artifacts,
+                and Outputs
+                <https://docs.aws.amazon.com/sagemaker/latest/dg/model-train-storage.html>`_.
             volume_kms_key (str): Optional. KMS key ID for encrypting EBS
                 volume attached to the training instance (default: None).
             max_run (int): Timeout in seconds for training (default: 24 * 60 * 60).
@@ -3248,6 +3317,10 @@ def _distribution_configuration(self, distribution):
                 "instance_groups"
             ]
 
+        if "pytorchxla" in distribution:
+            pt_xla_enabled = distribution.get("pytorchxla").get("enabled", False)
+            distribution_config[self.LAUNCH_PT_XLA_ENV_NAME] = pt_xla_enabled
+
         if "parameter_server" in distribution:
            ps_enabled = distribution.get("parameter_server").get("enabled", False)
            distribution_config[self.LAUNCH_PS_ENV_NAME] = ps_enabled
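
The new branch in ``_distribution_configuration`` translates the user-facing ``distribution`` dict into the ``sagemaker_pytorch_xla_multi_worker_enabled`` hyperparameter, the same way the existing parameter-server and MPI keys are handled. A minimal self-contained sketch of that mapping (``distribution_to_config`` is an illustrative standalone helper, not SDK API):

LAUNCH_PT_XLA_ENV_NAME = "sagemaker_pytorch_xla_multi_worker_enabled"

def distribution_to_config(distribution):
    """Mirror the new branch above: map the user dict to a launcher flag."""
    distribution_config = {}
    if "pytorchxla" in distribution:
        pt_xla_enabled = distribution.get("pytorchxla").get("enabled", False)
        distribution_config[LAUNCH_PT_XLA_ENV_NAME] = pt_xla_enabled
    return distribution_config

print(distribution_to_config({"pytorchxla": {"enabled": True}}))
# {'sagemaker_pytorch_xla_multi_worker_enabled': True}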

src/sagemaker/huggingface/estimator.py

Lines changed: 29 additions & 0 deletions
@@ -141,6 +141,28 @@ def __init__(
                         }
                     }
                 }
+
+            To enable distributed training with
+            `SageMaker Training Compiler <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_
+            for Hugging Face Transformers with PyTorch:
+
+            .. code:: python
+
+                {
+                    "pytorchxla": {
+                        "enabled": True
+                    }
+                }
+
+            To learn more, see `SageMaker Training Compiler
+            <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_
+            in the *Amazon SageMaker Developer Guide*.
+
+            .. note::
+
+                When you use this PyTorch XLA option as the distributed training
+                strategy, you must add the ``compiler_config`` parameter and
+                activate SageMaker Training Compiler.
             compiler_config (:class:`~sagemaker.huggingface.TrainingCompilerConfig`):
                 Configures SageMaker Training Compiler to accelerate training.
 
@@ -204,6 +226,13 @@ def __init__(
                 raise ValueError(error_string)
         if compiler_config:
             compiler_config.validate(self)
+        elif distribution is not None and "pytorchxla" in distribution:
+            raise ValueError(
+                "Distributed training through PyTorch XLA is currently only supported "
+                "when SageMaker Training Compiler is enabled. To learn more, "
+                "see Enable SageMaker Training Compiler at "
+                "https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler-enable.html."
+            )
         self.compiler_config = compiler_config
 
     def _validate_args(self, image_uri):
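
Taken together with the new docstring and validation, enabling ``pytorchxla`` requires an accompanying ``compiler_config``. A sketch of the intended usage — the role ARN and entry point are placeholders; the version, py_version, and instance-type choices follow the image config and EFA list added in this commit:

from sagemaker.huggingface import HuggingFace, TrainingCompilerConfig

hf_estimator = HuggingFace(
    entry_point="train.py",  # hypothetical training script
    role="arn:aws:iam::111122223333:role/SageMakerRole",  # placeholder ARN
    instance_type="ml.p4d.24xlarge",  # EFA-capable, per SUPPORTED_INSTANCE_TYPES_WITH_EFA
    instance_count=2,
    transformers_version="4.21.1",
    pytorch_version="1.11.0",
    py_version="py38",
    distribution={"pytorchxla": {"enabled": True}},
    compiler_config=TrainingCompilerConfig(),  # required when "pytorchxla" is set
)

Passing ``distribution={"pytorchxla": ...}`` without ``compiler_config`` now raises the ValueError added above.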

src/sagemaker/huggingface/training_compiler/config.py

Lines changed: 54 additions & 2 deletions
@@ -14,6 +14,8 @@
 from __future__ import absolute_import
 import logging
 from typing import Union
+from packaging.specifiers import SpecifierSet
+from packaging.version import Version
 
 from sagemaker.training_compiler.config import TrainingCompilerConfig as BaseConfig
 from sagemaker.workflow.entities import PipelineVariable
@@ -24,7 +26,14 @@
 class TrainingCompilerConfig(BaseConfig):
     """The SageMaker Training Compiler configuration class."""
 
-    SUPPORTED_INSTANCE_CLASS_PREFIXES = ["p3", "g4dn", "p4"]
+    SUPPORTED_INSTANCE_CLASS_PREFIXES = ["p3", "g4dn", "p4d", "g5"]
+    SUPPORTED_INSTANCE_TYPES_WITH_EFA = [
+        "ml.g4dn.8xlarge",
+        "ml.g4dn.12xlarge",
+        "ml.g5.48xlarge",
+        "ml.p3dn.24xlarge",
+        "ml.p4d.24xlarge",
+    ]
 
     def __init__(
         self,
@@ -85,7 +94,7 @@ def validate(
         """Checks if SageMaker Training Compiler is configured correctly.
 
         Args:
-            estimator (str): A estimator object
+            estimator (:class:`sagemaker.huggingface.HuggingFace`): An estimator object.
                 If SageMaker Training Compiler is enabled, it will validate whether
                 the estimator is configured to be compatible with Training Compiler.
 
@@ -105,3 +114,46 @@ def validate(
                 "transformer_version, tensorflow_version or pytorch_version, and compiler_config."
             )
             raise ValueError(error_helper_string)
+
+        if estimator.distribution:
+            pt_xla_present = "pytorchxla" in estimator.distribution
+            pt_xla_enabled = estimator.distribution.get("pytorchxla", {}).get("enabled", False)
+            if pt_xla_enabled:
+                if estimator.tensorflow_version:
+                    error_helper_string = (
+                        "Distribution mechanism 'pytorchxla' is currently only supported for "
+                        "PyTorch >= 1.11 when SageMaker Training Compiler is enabled. Received "
+                        "tensorflow_version={} which is unsupported."
+                    )
+                    raise ValueError(error_helper_string.format(estimator.tensorflow_version))
+                if estimator.pytorch_version:
+                    if Version(estimator.pytorch_version) in SpecifierSet("< 1.11"):
+                        error_helper_string = (
+                            "Distribution mechanism 'pytorchxla' is currently only supported for "
+                            "PyTorch >= 1.11 when SageMaker Training Compiler is enabled."
+                            " Received pytorch_version={} which is unsupported."
+                        )
+                        raise ValueError(error_helper_string.format(estimator.pytorch_version))
+                if estimator.instance_type not in cls.SUPPORTED_INSTANCE_TYPES_WITH_EFA:
+                    logger.warning(
+                        "Consider using instances with EFA support when "
+                        "training with PyTorch >= 1.11 and SageMaker Training Compiler "
+                        "enabled. SageMaker Training Compiler leverages EFA to provide better "
+                        "performance for distributed training."
+                    )
+            if not pt_xla_present:
+                if estimator.pytorch_version:
+                    if Version(estimator.pytorch_version) in SpecifierSet(">= 1.11"):
+                        error_helper_string = (
+                            "'pytorchxla' is the only distribution mechanism currently supported "
+                            "for PyTorch >= 1.11 when SageMaker Training Compiler is enabled."
+                            " Received distribution={} which is unsupported."
+                        )
+                        raise ValueError(error_helper_string.format(estimator.distribution))
+        elif estimator.instance_count and estimator.instance_count > 1:
+            if estimator.pytorch_version:
+                if Version(estimator.pytorch_version) in SpecifierSet(">= 1.11"):
+                    logger.warning(
+                        "Consider setting 'distribution' to 'pytorchxla' for distributed "
+                        "training with PyTorch >= 1.11 and SageMaker Training Compiler enabled."
+                    )

src/sagemaker/image_uri_config/huggingface-training-compiler.json

Lines changed: 36 additions & 1 deletion
@@ -3,7 +3,8 @@
         "processors": ["gpu"],
         "version_aliases": {
             "4.11": "4.11.0",
-            "4.17": "4.17.0"
+            "4.17": "4.17.0",
+            "4.21": "4.21.1"
         },
         "versions": {
             "4.11.0": {
@@ -97,6 +98,40 @@
                     "repository": "huggingface-tensorflow-trcomp-training",
                     "container_version": {"gpu":"cu112-ubuntu20.04"}
                 }
+            },
+            "4.21.1": {
+                "version_aliases": {
+                    "pytorch1.11": "pytorch1.11.0"
+                },
+                "pytorch1.11.0": {
+                    "py_versions": ["py38"],
+                    "registries": {
+                        "af-south-1": "626614931356",
+                        "ap-east-1": "871362719292",
+                        "ap-northeast-1": "763104351884",
+                        "ap-northeast-2": "763104351884",
+                        "ap-northeast-3": "364406365360",
+                        "ap-south-1": "763104351884",
+                        "ap-southeast-1": "763104351884",
+                        "ap-southeast-2": "763104351884",
+                        "ap-southeast-3": "907027046896",
+                        "ca-central-1": "763104351884",
+                        "eu-central-1": "763104351884",
+                        "eu-north-1": "763104351884",
+                        "eu-south-1": "692866216735",
+                        "eu-west-1": "763104351884",
+                        "eu-west-2": "763104351884",
+                        "eu-west-3": "763104351884",
+                        "me-south-1": "217643126080",
+                        "sa-east-1": "763104351884",
+                        "us-east-1": "763104351884",
+                        "us-east-2": "763104351884",
+                        "us-west-1": "763104351884",
+                        "us-west-2": "763104351884"
+                    },
+                    "repository": "huggingface-pytorch-trcomp-training",
+                    "container_version": {"gpu":"cu113-ubuntu20.04"}
+                }
             }
         }
     }
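
With the new ``4.21.1`` entry in place, the matching container URI should be resolvable through the SDK's image URI lookup. A sketch, assuming the standard keyword usage of ``sagemaker.image_uris.retrieve`` for this framework key; the region, account, and repository values come from the JSON above:

from sagemaker import image_uris

uri = image_uris.retrieve(
    framework="huggingface-training-compiler",
    region="us-west-2",
    version="4.21.1",
    base_framework_version="pytorch1.11.0",
    py_version="py38",
    instance_type="ml.p4d.24xlarge",
)
print(uri)
# Expected shape, from the registry map and repository fields above:
# 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-trcomp-training:…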
