
Commit a727149

Merge branch 'master' into mm-batch-support-on-demand

2 parents 5f4ffa7 + 3b45257, commit a727149

23 files changed: +2953 / -238 lines

CHANGELOG.md

Lines changed: 16 additions & 0 deletions

@@ -1,5 +1,21 @@
 # Changelog
 
+## v2.108.0 (2022-09-02)
+
+### Features
+
+ * Adding support in HuggingFace estimator for Training Compiler enhanced PyTorch 1.11
+
+### Bug Fixes and Other Changes
+
+ * add sagemaker clarify image account for cgk region
+ * set PYTHONHASHSEED env variable to fixed value to fix intermittent failures in release pipeline
+ * trcomp fixtures to override default fixtures for integ tests
+
+### Documentation Changes
+
+ * add more info about volume_size
+
 ## v2.107.0 (2022-08-29)
 
 ### Features
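
The headline feature above, Training Compiler support for PyTorch 1.11 in the HuggingFace estimator, is enabled through the SDK's TrainingCompilerConfig. The sketch below is illustrative only and is not part of this commit; the script name, role, instance type, and version strings are assumptions, so check the SDK documentation for the combinations Training Compiler actually supports.

    # Illustrative sketch, not part of this commit: enabling SageMaker Training
    # Compiler on a HuggingFace estimator. The script name, role, instance type,
    # and version strings below are assumptions.
    from sagemaker.huggingface import HuggingFace, TrainingCompilerConfig

    estimator = HuggingFace(
        entry_point="train.py",                    # hypothetical training script
        role="MySageMakerExecutionRole",           # placeholder IAM role
        instance_type="ml.p3.2xlarge",
        instance_count=1,
        transformers_version="4.21",               # assumed pairing for PyTorch 1.11
        pytorch_version="1.11",
        py_version="py38",
        compiler_config=TrainingCompilerConfig(),  # turns on Training Compiler
    )
    # estimator.fit({"train": "s3://my-bucket/train"})  # hypothetical input channel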

VERSION

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-2.107.1.dev0
+2.108.1.dev0

doc/_static/kendrasearchtools.js

Lines changed: 16 additions & 8 deletions

@@ -533,11 +533,19 @@ var KendraSearch = {
   _pulse_status : -1,
 
   init : function() {
+    var filters = {};
     var params = $.getQueryParameters();
     if (params.q) {
       var query = params.q[0];
       $('input[name="q"]')[0].value = query;
-      this.performSearch(query);
+
+      Object.keys(params).forEach(function(key) {
+        if(key.startsWith("filter")){
+          filters[key] = true;
+          $('input[name="' + key + '"]')[0].checked = true;
+        }
+      });
+      this.performSearch(query, filters=filters);
     }
   },

@@ -577,16 +585,16 @@ var KendraSearch = {
   /**
    * execute search (requires search index to be loaded)
    */
-  query : function(query, pageNumber, pageSize=10) {
-    var url = " https://9cs56celvj.execute-api.us-west-2.amazonaws.com/prod"
+  query : function(query, pageNumber, pageSize=10, filters={}) {
+    var url = "https://9cs56celvj.execute-api.us-west-2.amazonaws.com/prod"
 
     $('#search-progress').empty();
 
     query = KendraSearch.sanitize(query);
 
     fetch(url, {
       method: 'post',
-      body: JSON.stringify({ "queryText": query , "pageNumber": pageNumber, "pageSize": pageSize, "host": window.location.host}),
+      body: JSON.stringify({ "queryText": query , "pageNumber": pageNumber, "pageSize": pageSize, "filters": filters, "host": window.location.host}),
     }).then(response => response.json())
       .then(function(data) {
         var docs = data["ResultItems"];

@@ -602,7 +610,7 @@ var KendraSearch = {
         if(doc_url.includes("sagemaker-examples.readthedocs.io")){
           type_badge_html = '<span class="example-badge">Example</span>'
         }else if(doc_url.includes("docs.aws.amazon.com")){
-          type_badge_html = '<span class="aws-doc-badge">AWS Dev Guide</span>'
+          type_badge_html = '<span class="aws-doc-badge">Dev Guide</span>'
         }else if(doc_url.includes("sagemaker.readthedocs.io") || doc_url.includes("sagemaker-debugger.readthedocs.io")){
           type_badge_html = '<span class="sdk-doc-badge">SDK Guide</span>'
         }

@@ -656,7 +664,7 @@ var KendraSearch = {
       $(element).on('click', function() {
         KendraSearch.output.empty();
         paginationItem.remove();
-        KendraSearch.query(query, parseInt($(element).attr('id').split("-")[1]));
+        KendraSearch.query(query, parseInt($(element).attr('id').split("-")[1]), pageSize, filters);
       });
     });
   }

@@ -670,7 +678,7 @@ var KendraSearch = {
   /**
    * perform a search for something (or wait until index is loaded)
    */
-  performSearch : function(query) {
+  performSearch : function(query, filters) {
     // create the required interface elements
     this.out = $('#search-results');
     this.title = $('<h2>' + _('Searching...') + '</h2>').appendTo(this.out);

@@ -682,7 +690,7 @@ var KendraSearch = {
     $('#search-progress').text(_('Preparing search...'));
     this.startPulse();
 
-    this.query(query, 1)
+    this.query(query, 1, pageSize=10, filters=filters)
   },
 
 };

doc/_templates/searchbox.html

Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+<div role="search">
+  <form id ="rtd-search-form" class="wy-form" action="{{ pathto('search') }}" method="get">
+    <input type="text" name="q" placeholder="{% trans %}ex. train object detection model, pd.concat{% endtrans %}" title="{% trans %}Type search term here{% endtrans %}" />
+    <br />
+    <br />
+    <div style="text-align: left;">
+      <div style="font-size: 0.85rem;">Filters: </div>
+      <div style="display: inline-block;"><label style="color: white;" for="filterExample"><input type="checkbox" id="filterExample" name="filterExample">Example</label></div>
+      <div style="display: inline-block;"><label style="color: white;" for="filterAWSDevGuide"><input type="checkbox" id="filterAWSDevGuide" name="filterAWSDevGuide">Dev Guide</label></div>
+      <div style="display: inline-block;"><label style="color: white;" for="filterSDKGuide"><input type="checkbox" id="filterSDKGuide" name="filterSDKGuide">SDK Guide</label></div>
+    </div>
+
+  </form>
+</div>

doc/api/training/smd_model_parallel_general.rst

Lines changed: 57 additions & 0 deletions

@@ -19,6 +19,35 @@ The SageMaker model parallel library internally uses MPI.
 To use model parallelism, both ``smdistributed`` and MPI must be enabled
 through the ``distribution`` parameter.
 
+The following code example is a template of setting up model parallelism for a PyTorch estimator.
+
+.. code:: python
+
+    import sagemaker
+    from sagemaker.pytorch import PyTorch
+
+    smp_options = {
+        "enabled":True,
+        "parameters": {
+            ...
+        }
+    }
+
+    mpi_options = {
+        "enabled" : True,
+        ...
+    }
+
+    smdmp_estimator = PyTorch(
+        ...
+        distribution={
+            "smdistributed": {"modelparallel": smp_options},
+            "mpi": mpi_options
+        }
+    )
+
+    smdmp_estimator.fit()
+
 .. tip::
 
    This page provides you a complete list of parameters you can use

@@ -214,6 +243,34 @@ PyTorch-specific Parameters
     - False
     - Skips the initial tracing step. This can be useful in very large models
       where even model tracing at the CPU is not possible due to memory constraints.
+   * - ``sharded_data_parallel_degree`` (**smdistributed-modelparallel**>=v1.11)
+     - int
+     - 1
+     - To run a training job using sharded data parallelism, add this parameter and specify a number greater than 1.
+       Sharded data parallelism is a memory-saving distributed training technique that splits the training state of a model (model parameters, gradients, and optimizer states) across GPUs in a data parallel group.
+       For more information, see `Sharded Data Parallelism
+       <https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-sharded-data-parallelism.html>`_.
+   * - ``sdp_reduce_bucket_size`` (**smdistributed-modelparallel**>=v1.11)
+     - int
+     - 5e8
+     - Configuration parameter for sharded data parallelism (for ``sharded_data_parallel_degree > 2``).
+       Specifies the size of PyTorch DDP gradient buckets in number of elements of the default dtype.
+   * - ``sdp_param_persistence_threshold`` (**smdistributed-modelparallel**>=v1.11)
+     - int
+     - 1e6
+     - Specifies the size of a parameter tensor in number of elements that can persist at each GPU. Sharded data parallelism splits each parameter tensor across GPUs of a data parallel group. If the number of elements in the parameter tensor is smaller than this threshold, the parameter tensor is not split; this helps reduce communication overhead because the parameter tensor is replicated across data-parallel GPUs.
+   * - ``sdp_max_live_parameters`` (**smdistributed-modelparallel**>=v1.11)
+     - int
+     - 1e9
+     - Specifies the maximum number of parameters that can simultaneously be in a recombined training state during the forward and backward pass. Parameter fetching with the AllGather operation pauses when the number of active parameters reaches the given threshold. Note that increasing this parameter increases the memory footprint.
+   * - ``sdp_hierarchical_allgather`` (**smdistributed-modelparallel**>=v1.11)
+     - bool
+     - True
+     - If set to True, the AllGather operation runs hierarchically: it runs within each node first, and then runs across nodes. For multi-node distributed training jobs, the hierarchical AllGather operation is automatically activated.
+   * - ``sdp_gradient_clipping`` (**smdistributed-modelparallel**>=v1.11)
+     - float
+     - 1.0
+     - Specifies a threshold for gradient clipping the L2 norm of the gradients before propagating them backward through the model parameters. When sharded data parallelism is activated, gradient clipping is also activated. The default threshold is 1.0. Adjust this parameter if you have the exploding gradients problem.
 
 
 Parameters for ``mpi``
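
The ``...`` placeholder inside ``smp_options["parameters"]`` in the template added above is where these PyTorch-specific options go. The sketch below is illustrative rather than part of this commit: it assumes smdistributed-modelparallel >= v1.11, the ``sdp_*`` values simply repeat the defaults listed in the table, and the ``processes_per_host`` setting is a hypothetical placeholder.

    # Illustrative: activating sharded data parallelism by filling in the
    # smp_options template with the parameters documented in the table above.
    # Only sharded_data_parallel_degree needs a value greater than 1; the sdp_*
    # entries below repeat the documented defaults.
    smp_options = {
        "enabled": True,
        "parameters": {
            "sharded_data_parallel_degree": 8,            # > 1 turns on sharded data parallelism
            "sdp_reduce_bucket_size": int(5e8),           # DDP gradient bucket size, in elements
            "sdp_param_persistence_threshold": int(1e6),  # tensors below this stay replicated
            "sdp_max_live_parameters": int(1e9),          # cap on simultaneously recombined params
            "sdp_hierarchical_allgather": True,           # intra-node AllGather first, then inter-node
            "sdp_gradient_clipping": 1.0,                 # L2-norm gradient clipping threshold
        },
    }

    mpi_options = {
        "enabled": True,
        "processes_per_host": 8,  # hypothetical; match the number of GPUs per instance
    }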

doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst

Lines changed: 76 additions & 5 deletions

@@ -5,9 +5,84 @@ Release Notes
 New features, bug fixes, and improvements are regularly made to the SageMaker
 distributed model parallel library.
 
-SageMaker Distributed Model Parallel 1.10.0 Release Notes
+
+SageMaker Distributed Model Parallel 1.11.0 Release Notes
 =========================================================
 
+*Date: August. 17. 2022*
+
+**New Features**
+
+The following new features are added for PyTorch.
+
+* The library implements sharded data parallelism, which is a memory-saving
+  distributed training technique that splits the training state of a model
+  (model parameters, gradients, and optimizer states) across data parallel groups.
+  With sharded data parallelism, you can reduce the per-GPU memory footprint of
+  a model by sharding the training state over multiple GPUs. To learn more,
+  see `Sharded Data Parallelism
+  <https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch-sharded-data-parallelism.html>`_
+  in the *Amazon SageMaker Developer Guide*.
+
+**Migration to AWS Deep Learning Containers**
+
+This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers (DLC):
+
+- DLC for PyTorch 1.12.0
+
+  .. code::
+
+    763104351884.dkr.ecr.<region>.amazonaws.com/pytorch-training:1.12.0-gpu-py38-cu113-ubuntu20.04-sagemaker
+
+Binary file of this version of the library for `custom container
+<https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-sm-sdk.html#model-parallel-bring-your-own-container>`_ users:
+
+- For PyTorch 1.12.0
+
+  .. code::
+
+    https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.12.0/build-artifacts/2022-08-12-16-58/smdistributed_modelparallel-1.11.0-cp38-cp38-linux_x86_64.whl
+
+----
+
+Release History
+===============
+
+SageMaker Distributed Model Parallel 1.10.1 Release Notes
+---------------------------------------------------------
+
+*Date: August. 8. 2022*
+
+**Currency Updates**
+
+* Added support for Transformers v4.21.
+
+
+**Migration to AWS Deep Learning Containers**
+
+This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers (DLC):
+
+- DLC for PyTorch 1.11.0
+
+  .. code::
+
+    763104351884.dkr.ecr.<region>.amazonaws.com/pytorch-training:1.11.0-gpu-py38-cu113-ubuntu20.04-sagemaker
+
+
+Binary file of this version of the library for `custom container
+<https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-sm-sdk.html#model-parallel-bring-your-own-container>`_ users:
+
+- For PyTorch 1.11.0
+
+  .. code::
+
+    https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.11.0/build-artifacts/2022-07-28-23-07/smdistributed_modelparallel-1.10.1-cp38-cp38-linux_x86_64.whl
+
+
+
+SageMaker Distributed Model Parallel 1.10.0 Release Notes
+---------------------------------------------------------
+
 *Date: July. 19. 2022*
 
 **New Features**

@@ -62,10 +137,6 @@ Binary file of this version of the library for `custom container
 
    https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.12.0/build-artifacts/2022-07-11-19-23/smdistributed_modelparallel-1.10.0-cp38-cp38-linux_x86_64.whl
 
-----
-
-Release History
-===============
 
 SageMaker Distributed Model Parallel 1.9.0 Release Notes
 --------------------------------------------------------
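
For custom-container or image-pinning workflows, a DLC URI like the one listed in the 1.11.0 notes can be passed to the estimator directly through ``image_uri``. The following is a minimal sketch rather than anything from this commit: the region, role, script, instance settings, and the ``smp_options``/``mpi_options`` dictionaries are all placeholders.

    # Sketch only: pinning a PyTorch estimator to the PyTorch 1.12.0 DLC listed
    # in the 1.11.0 release notes. Region, role, script, and instance settings
    # are placeholders; smp_options/mpi_options mirror the model-parallel template.
    from sagemaker.pytorch import PyTorch

    region = "us-west-2"  # placeholder; use the region your training job runs in
    image_uri = (
        f"763104351884.dkr.ecr.{region}.amazonaws.com/"
        "pytorch-training:1.12.0-gpu-py38-cu113-ubuntu20.04-sagemaker"
    )

    smp_options = {"enabled": True, "parameters": {"sharded_data_parallel_degree": 8}}  # illustrative
    mpi_options = {"enabled": True, "processes_per_host": 8}                            # illustrative

    estimator = PyTorch(
        entry_point="train.py",           # hypothetical training script
        role="MySageMakerExecutionRole",  # placeholder IAM role
        image_uri=image_uri,              # use the DLC directly instead of framework_version
        instance_type="ml.p4d.24xlarge",
        instance_count=2,
        distribution={
            "smdistributed": {"modelparallel": smp_options},
            "mpi": mpi_options,
        },
    )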

doc/api/training/smp_versions/archives.rst

Lines changed: 1 addition & 0 deletions

@@ -3,6 +3,7 @@
 .. toctree::
    :maxdepth: 1
 
+   v1_10_0.rst
    v1_9_0.rst
    v1_6_0.rst
    v1_5_0.rst

doc/api/training/smp_versions/latest.rst

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@ depending on which version of the library you need to use.
 To use the library, reference the
 **Common API** documentation alongside the framework specific API documentation.
 
-Version 1.10.0 (Latest)
+Version 1.11.0 (Latest)
 ===========================================
 
 To use the library, reference the Common API documentation alongside the framework specific API documentation.
