Skip to content

Commit 81b0567

Browse files
authored
Merge branch 'master' into feat/enhance-bucket-override-support
2 parents 49342ed + cdd8cb5 commit 81b0567

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+2289
-276
lines changed

doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,31 @@ Release Notes
55
New features, bug fixes, and improvements are regularly made to the SageMaker
66
distributed model parallel library.
77

8-
SageMaker Distributed Model Parallel 1.9.0 Release Notes
9-
========================================================
8+
SageMaker Distributed Model Parallel 1.10.0 Release Notes
9+
=========================================================
1010

11-
*Date: May. 3. 2022*
11+
*Date: July. 19. 2022*
1212

13-
**Currency Updates**
13+
**New Features**
1414

15-
* Added support for PyTorch 1.11.0
15+
The following new features are added for PyTorch.
16+
17+
* Added support for FP16 training by implementing an ``smdistributed.modelparallel``
18+
modification of Apex FP16_Module and FP16_Optimizer. To learn more, see
19+
`FP16 Training with Model Parallelism
20+
<https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-fp16.html>`_.
21+
* New checkpoint APIs for CPU memory usage optimization. To learn more, see
22+
`Checkpointing Distributed Models and Optimizer States
23+
<https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-checkpoint.html>`_.
24+
25+
**Improvements**
26+
27+
* The SageMaker distributed model parallel library manages and optimizes CPU
28+
memory by garbage-collecting non-local parameters in general and during checkpointing.
29+
* Changes in the `GPT-2 translate functions
30+
<https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-hugging-face.html>`_
31+
(``smdistributed.modelparallel.torch.nn.huggingface.gpt2``)
32+
to save memory by not maintaining two copies of weights at the same time.
1633

1734
**Migration to AWS Deep Learning Containers**
1835

@@ -28,7 +45,7 @@ Binary file of this version of the library for custom container users:
2845

2946
.. code::
3047
31-
https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.11.0/build-artifacts/2022-04-20-17-05/smdistributed_modelparallel-1.9.0-cp38-cp38-linux_x86_64.whl
48+
https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.11.0/build-artifacts/2022-07-11-19-23/smdistributed_modelparallel-1.10.0-cp38-cp38-linux_x86_64.whl
3249
3350
3451
@@ -37,6 +54,33 @@ Binary file of this version of the library for custom container users:
3754
Release History
3855
===============
3956

57+
SageMaker Distributed Model Parallel 1.9.0 Release Notes
58+
--------------------------------------------------------
59+
60+
*Date: May. 3. 2022*
61+
62+
**Currency Updates**
63+
64+
* Added support for PyTorch 1.11.0
65+
66+
**Migration to AWS Deep Learning Containers**
67+
68+
This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers (DLC):
69+
70+
- PyTorch 1.11.0 DLC
71+
72+
.. code::
73+
74+
763104351884.dkr.ecr.<region>.amazonaws.com/pytorch-training:1.11.0-gpu-py38-cu113-ubuntu20.04-sagemaker
75+
76+
Binary file of this version of the library for custom container users:
77+
78+
.. code::
79+
80+
https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.11.0/build-artifacts/2022-04-20-17-05/smdistributed_modelparallel-1.9.0-cp38-cp38-linux_x86_64.whl
81+
82+
83+
4084
SageMaker Distributed Model Parallel 1.8.1 Release Notes
4185
--------------------------------------------------------
4286

src/sagemaker/amazon/hyperparameter.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
import json
1717

18+
from sagemaker.workflow import is_pipeline_variable
19+
1820

1921
class Hyperparameter(object):
2022
"""An algorithm hyperparameter with optional validation.
@@ -98,8 +100,14 @@ def serialize_all(obj):
98100
"""
99101
if "_hyperparameters" not in dir(obj):
100102
return {}
101-
return {
102-
k: json.dumps(v) if isinstance(v, list) else str(v)
103-
for k, v in obj._hyperparameters.items()
104-
if v is not None
105-
}
103+
hps = {}
104+
for k, v in obj._hyperparameters.items():
105+
if v is not None:
106+
if isinstance(v, list):
107+
v = json.dumps(v)
108+
elif is_pipeline_variable(v):
109+
v = v.to_string()
110+
else:
111+
v = str(v)
112+
hps[k] = v
113+
return hps

src/sagemaker/chainer/estimator.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from __future__ import absolute_import
1515

1616
import logging
17+
from typing import Union, Optional
1718

1819
from sagemaker.estimator import Framework, EstimatorBase
1920
from sagemaker.fw_utils import (
@@ -25,6 +26,7 @@
2526
from sagemaker.chainer import defaults
2627
from sagemaker.chainer.model import ChainerModel
2728
from sagemaker.vpc_utils import VPC_CONFIG_DEFAULT
29+
from sagemaker.workflow.entities import PipelineVariable
2830

2931
logger = logging.getLogger("sagemaker")
3032

@@ -42,12 +44,12 @@ class Chainer(Framework):
4244

4345
def __init__(
4446
self,
45-
entry_point,
47+
entry_point: Union[str, PipelineVariable],
4648
use_mpi=None,
4749
num_processes=None,
4850
process_slots_per_host=None,
4951
additional_mpi_options=None,
50-
source_dir=None,
52+
source_dir: Optional[Union[str, PipelineVariable]] = None,
5153
hyperparameters=None,
5254
framework_version=None,
5355
py_version=None,

src/sagemaker/chainer/model.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,108 @@ def __init__(
140140

141141
self.model_server_workers = model_server_workers
142142

143+
def register(
    self,
    content_types,
    response_types,
    inference_instances,
    transform_instances,
    model_package_name=None,
    model_package_group_name=None,
    image_uri=None,
    model_metrics=None,
    metadata_properties=None,
    marketplace_cert=False,
    approval_status=None,
    description=None,
    drift_check_baselines=None,
    customer_metadata_properties=None,
    domain=None,
    sample_payload_url=None,
    task=None,
    framework=None,
    framework_version=None,
    nearest_model_name=None,
    data_input_configuration=None,
):
    """Creates a model package for creating SageMaker models or listing on Marketplace.

    The first entry of ``inference_instances`` (when there is one) is used to
    initialize the SageMaker session and, if no image URI is available yet, to
    resolve the serving image. The registration itself is delegated to the
    parent class.

    Args:
        content_types (list): The supported MIME types for the input data.
        response_types (list): The supported MIME types for the output data.
        inference_instances (list): A list of the instance types that are used to
            generate inferences in real-time. May be empty or None, in which case
            no instance type is used for session setup or image lookup.
        transform_instances (list): A list of the instance types on which a transformation
            job can be run or on which an endpoint can be deployed.
        model_package_name (str): Model Package name, exclusive to `model_package_group_name`,
            using `model_package_name` makes the Model Package un-versioned (default: None).
        model_package_group_name (str): Model Package Group name, exclusive to
            `model_package_name`, using `model_package_group_name` makes the Model Package
            versioned (default: None).
        image_uri (str): Inference image uri for the container. When given, it replaces
            ``self.image_uri``; when neither is set, the serving image for this
            model is looked up (default: None).
        model_metrics (ModelMetrics): ModelMetrics object (default: None).
        metadata_properties (MetadataProperties): MetadataProperties (default: None).
        marketplace_cert (bool): A boolean value indicating if the Model Package is certified
            for AWS Marketplace (default: False).
        approval_status (str): Model Approval Status, values can be "Approved", "Rejected",
            or "PendingManualApproval" (default: None; presumably resolved to
            "PendingManualApproval" downstream -- TODO confirm).
        description (str): Model Package description (default: None).
        drift_check_baselines (DriftCheckBaselines): DriftCheckBaselines object (default: None).
        customer_metadata_properties (dict[str, str]): A dictionary of key-value paired
            metadata properties (default: None).
        domain (str): Domain values can be "COMPUTER_VISION", "NATURAL_LANGUAGE_PROCESSING",
            "MACHINE_LEARNING" (default: None).
        sample_payload_url (str): The S3 path where the sample payload is stored
            (default: None).
        task (str): Task values which are supported by Inference Recommender are "FILL_MASK",
            "IMAGE_CLASSIFICATION", "OBJECT_DETECTION", "TEXT_GENERATION", "IMAGE_SEGMENTATION",
            "CLASSIFICATION", "REGRESSION", "OTHER" (default: None).
        framework (str): Machine learning framework of the model package container image.
            Falls back to ``self._framework_name``; the value is upper-cased before
            being passed on (default: None).
        framework_version (str): Framework version of the Model Package Container Image.
            Falls back to ``self.framework_version`` (default: None).
        nearest_model_name (str): Name of a pre-trained machine learning model benchmarked by
            Amazon SageMaker Inference Recommender (default: None).
        data_input_configuration (str): Input object for the model (default: None).

    Returns:
        str: A string of SageMaker Model Package ARN.
    """
    # Guard the lookup: indexing None or an empty list would raise before the
    # caller-supplied image_uri is even considered.
    # NOTE(review): assumes the session/image helpers below accept
    # instance_type=None -- confirm against their definitions.
    instance_type = inference_instances[0] if inference_instances else None
    self._init_sagemaker_session_if_does_not_exist(instance_type)

    # An explicit image wins; otherwise keep the current one or resolve a default.
    if image_uri:
        self.image_uri = image_uri
    if not self.image_uri:
        self.image_uri = self.serving_image_uri(
            region_name=self.sagemaker_session.boto_session.region_name,
            instance_type=instance_type,
        )

    # The caller-supplied image_uri (possibly None) is forwarded unchanged; the
    # resolved value is kept on self.image_uri.
    return super(ChainerModel, self).register(
        content_types,
        response_types,
        inference_instances,
        transform_instances,
        model_package_name,
        model_package_group_name,
        image_uri,
        model_metrics,
        metadata_properties,
        marketplace_cert,
        approval_status,
        description,
        drift_check_baselines=drift_check_baselines,
        customer_metadata_properties=customer_metadata_properties,
        domain=domain,
        sample_payload_url=sample_payload_url,
        task=task,
        framework=(framework or self._framework_name).upper(),
        framework_version=framework_version or self.framework_version,
        nearest_model_name=nearest_model_name,
        data_input_configuration=data_input_configuration,
    )
244+
143245
def prepare_container_def(
144246
self, instance_type=None, accelerator_type=None, serverless_inference_config=None
145247
):

0 commit comments

Comments
 (0)