Skip to content

Commit 585b8df

Browse files
author
Brock Wade
committed
Merge branch 'master' into runproc-bug-fix
2 parents 7732b9e + 5bffb04 commit 585b8df

File tree

18 files changed

+1518
-35
lines changed

18 files changed

+1518
-35
lines changed

CHANGELOG.md

+13
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,18 @@
11
# Changelog
22

3+
## v2.120.0 (2022-12-07)
4+
5+
### Features
6+
7+
* Add Neo image uri config for Pytorch 1.12
8+
* Add support for SageMaker Training Compiler in the PyTorch estimator, starting with PyTorch 1.12
9+
* Update registries with new region account number mappings.
10+
* Add DXB region to frameworks by DLC
11+
12+
### Bug Fixes and Other Changes
13+
14+
* Support idempotency for framework and Spark processors
15+
316
## v2.119.0 (2022-12-03)
417

518
### Features

VERSION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.119.1.dev0
1+
2.120.1.dev0

src/sagemaker/fw_utils.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -493,7 +493,7 @@ def framework_name_from_image(image_uri):
493493
# We must support both the legacy and current image name format.
494494
name_pattern = re.compile(
495495
r"""^(?:sagemaker(?:-rl)?-)?
496-
(tensorflow|mxnet|chainer|pytorch|scikit-learn|xgboost
496+
(tensorflow|mxnet|chainer|pytorch|pytorch-trcomp|scikit-learn|xgboost
497497
|huggingface-tensorflow|huggingface-pytorch
498498
|huggingface-tensorflow-trcomp|huggingface-pytorch-trcomp)(?:-)?
499499
(scriptmode|training)?

src/sagemaker/image_uri_config/neo-pytorch.json

+35-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
"1.7.0": "1.7",
1212
"1.7.1": "1.7",
1313
"1.8.0": "1.8",
14-
"1.8.1": "1.8"
14+
"1.8.1": "1.8",
15+
"1.12.0": "1.12",
16+
"1.12.1": "1.12"
1517
},
1618
"versions": {
1719
"1.4": {
@@ -173,6 +175,38 @@
173175
"us-west-2": "301217895009"
174176
},
175177
"repository": "sagemaker-inference-pytorch"
178+
},
179+
"1.12": {
180+
"py_versions": ["py3"],
181+
"registries": {
182+
"af-south-1": "774647643957",
183+
"ap-east-1": "110948597952",
184+
"ap-northeast-1": "941853720454",
185+
"ap-northeast-2": "151534178276",
186+
"ap-northeast-3": "925152966179",
187+
"ap-south-1": "763008648453",
188+
"ap-southeast-1": "324986816169",
189+
"ap-southeast-2": "355873309152",
190+
"ca-central-1": "464438896020",
191+
"cn-north-1": "472730292857",
192+
"cn-northwest-1": "474822919863",
193+
"eu-central-1": "746233611703",
194+
"eu-north-1": "601324751636",
195+
"eu-south-1": "966458181534",
196+
"eu-west-1": "802834080501",
197+
"eu-west-2": "205493899709",
198+
"eu-west-3": "254080097072",
199+
"me-south-1": "836785723513",
200+
"sa-east-1": "756306329178",
201+
"us-east-1": "785573368785",
202+
"us-east-2": "007439368137",
203+
"us-gov-west-1": "263933020539",
204+
"us-iso-east-1": "167761179201",
205+
"us-isob-east-1": "406031935815",
206+
"us-west-1": "710691900526",
207+
"us-west-2": "301217895009"
208+
},
209+
"repository": "sagemaker-inference-pytorch"
176210
}
177211
}
178212
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
{
2+
"training": {
3+
"processors": [
4+
"gpu"
5+
],
6+
"version_aliases": {
7+
"1.12": "1.12.0"
8+
},
9+
"versions": {
10+
"1.12.0": {
11+
"py_versions": [
12+
"py38"
13+
],
14+
"registries": {
15+
"af-south-1": "626614931356",
16+
"ap-east-1": "871362719292",
17+
"ap-northeast-1": "763104351884",
18+
"ap-northeast-2": "763104351884",
19+
"ap-northeast-3": "364406365360",
20+
"ap-south-1": "763104351884",
21+
"ap-southeast-1": "763104351884",
22+
"ap-southeast-2": "763104351884",
23+
"ca-central-1": "763104351884",
24+
"eu-central-1": "763104351884",
25+
"eu-north-1": "763104351884",
26+
"eu-west-1": "763104351884",
27+
"eu-west-2": "763104351884",
28+
"eu-west-3": "763104351884",
29+
"eu-south-1": "692866216735",
30+
"me-south-1": "217643126080",
31+
"sa-east-1": "763104351884",
32+
"us-east-1": "763104351884",
33+
"us-east-2": "763104351884",
34+
"us-west-1": "763104351884",
35+
"us-west-2": "763104351884"
36+
},
37+
"repository": "pytorch-training"
38+
}
39+
}
40+
}
41+
}

src/sagemaker/image_uris.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ def retrieve(
146146
tolerate_deprecated_model,
147147
)
148148

149-
if training_compiler_config and (framework == HUGGING_FACE_FRAMEWORK):
149+
if training_compiler_config and (framework in [HUGGING_FACE_FRAMEWORK, "pytorch"]):
150150
final_image_scope = image_scope
151151
config = _config_for_framework_and_scope(
152152
framework + "-training-compiler", final_image_scope, accelerator_type

src/sagemaker/pytorch/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,5 @@
1616
from sagemaker.pytorch.estimator import PyTorch # noqa: F401
1717
from sagemaker.pytorch.model import PyTorchModel, PyTorchPredictor # noqa: F401
1818
from sagemaker.pytorch.processing import PyTorchProcessor # noqa: F401
19+
20+
from sagemaker.pytorch.training_compiler.config import TrainingCompilerConfig # noqa: F401

src/sagemaker/pytorch/estimator.py

+57-3
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
)
2929
from sagemaker.pytorch import defaults
3030
from sagemaker.pytorch.model import PyTorchModel
31+
from sagemaker.pytorch.training_compiler.config import TrainingCompilerConfig
3132
from sagemaker.vpc_utils import VPC_CONFIG_DEFAULT
3233
from sagemaker.workflow.entities import PipelineVariable
3334

@@ -51,7 +52,8 @@ def __init__(
5152
hyperparameters: Optional[Dict[str, Union[str, PipelineVariable]]] = None,
5253
image_uri: Optional[Union[str, PipelineVariable]] = None,
5354
distribution: Optional[Dict] = None,
54-
**kwargs
55+
compiler_config: Optional[TrainingCompilerConfig] = None,
56+
**kwargs,
5557
):
5658
"""This ``Estimator`` executes a PyTorch script in a managed PyTorch execution environment.
5759
@@ -208,6 +210,31 @@ def __init__(
208210
To learn more, see `Training with parameter servers
209211
<https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/using_tf.html#training-with-parameter-servers>`_.
210212
213+
**To enable distributed training with
214+
`SageMaker Training Compiler <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_
215+
for PyTorch:**
216+
217+
.. code:: python
218+
219+
{
220+
"pytorchxla": {
221+
"enabled": True
222+
}
223+
}
224+
225+
To learn more, see `SageMaker Training Compiler
226+
<https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_
227+
in the *Amazon SageMaker Developer Guide*.
228+
229+
.. note::
230+
231+
When you use this PyTorch XLA option for distributed training strategy,
232+
you must add the ``compiler_config`` parameter and activate SageMaker
233+
Training Compiler.
234+
235+
compiler_config (:class:`~sagemaker.pytorch.TrainingCompilerConfig`):
236+
Configures SageMaker Training Compiler to accelerate training.
237+
211238
**kwargs: Additional kwargs passed to the :class:`~sagemaker.estimator.Framework`
212239
constructor.
213240
@@ -250,6 +277,25 @@ def __init__(
250277

251278
self.distribution = distribution or {}
252279

280+
if compiler_config is not None:
281+
if not isinstance(compiler_config, TrainingCompilerConfig):
282+
error_string = (
283+
f"Expected instance of type {TrainingCompilerConfig}"
284+
f"for argument compiler_config. "
285+
f"Instead got {type(compiler_config)}"
286+
)
287+
raise ValueError(error_string)
288+
if compiler_config:
289+
compiler_config.validate(self)
290+
elif distribution is not None and "pytorchxla" in distribution:
291+
raise ValueError(
292+
"Distributed training through PyTorch XLA is currently only supported "
293+
"when SageMaker Training Compiler is enabled. To learn more, "
294+
"see Enable SageMaker Training Compiler at "
295+
"https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler-enable.html."
296+
)
297+
self.compiler_config = compiler_config
298+
253299
def _pytorch_distribution_configuration(self, distribution):
254300
"""Returns a dict of distribution config for PyTorch training
255301
@@ -289,6 +335,12 @@ def hyperparameters(self):
289335
hyperparameters.update(
290336
EstimatorBase._json_encode_hyperparameters(additional_hyperparameters)
291337
)
338+
if self.compiler_config:
339+
training_compiler_hyperparameters = self.compiler_config._to_hyperparameter_dict()
340+
hyperparameters.update(
341+
EstimatorBase._json_encode_hyperparameters(training_compiler_hyperparameters)
342+
)
343+
292344
return hyperparameters
293345

294346
def create_model(
@@ -299,7 +351,7 @@ def create_model(
299351
entry_point=None,
300352
source_dir=None,
301353
dependencies=None,
302-
**kwargs
354+
**kwargs,
303355
):
304356
"""Create a SageMaker ``PyTorchModel`` object that can be deployed to an ``Endpoint``.
305357
@@ -350,7 +402,7 @@ def create_model(
350402
sagemaker_session=self.sagemaker_session,
351403
vpc_config=self.get_vpc_config(vpc_config_override),
352404
dependencies=(dependencies or self.dependencies),
353-
**kwargs
405+
**kwargs,
354406
)
355407

356408
@classmethod
@@ -371,6 +423,8 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na
371423
)
372424
image_uri = init_params.pop("image_uri")
373425
framework, py_version, tag, _ = framework_name_from_image(image_uri)
426+
if framework:
427+
framework = framework.split("-")[0]
374428

375429
if tag is None:
376430
framework_version = None

src/sagemaker/pytorch/training_compiler/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)