Commit 551566b

Merge branch 'master' into fix-cloudwatch-logs-local
2 parents d1efc85 + 7fec6c1 commit 551566b

18 files changed with 232 additions and 568 deletions.

CHANGELOG.md

+16
@@ -1,5 +1,21 @@
 # Changelog

+## v2.33.0 (2021-04-05)
+
+### Features
+
+* Add environment variable support for SageMaker training jobs
+
+### Bug Fixes and Other Changes
+
+* add version length mismatch validation for HuggingFace
+* Disable debugger when checkpointing is enabled with distributed training
+* map user context in list associations response
+
+### Testing and Release Infrastructure
+
+* disable_profiler on mx-horovod test
+
 ## v2.32.1 (2021-04-01)

 ### Bug Fixes and Other Changes

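For the environment-variable feature listed in v2.33.0 above, a minimal usage sketch, assuming the feature is exposed as an `environment` argument on the estimator; the image URI, role, and variable values are placeholders:

    from sagemaker.estimator import Estimator

    # Sketch: forward environment variables to the training container
    # (per the v2.33.0 feature above; all concrete values are placeholders).
    estimator = Estimator(
        image_uri="<training-image-uri>",
        role="<execution-role-arn>",
        instance_count=1,
        instance_type="ml.m5.xlarge",
        environment={"MY_ENV_VAR": "my-value"},  # assumed parameter name
    )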
VERSION

+1-1
@@ -1 +1 @@
-2.32.2.dev0
+2.33.1.dev0

doc/api/training/sdp_versions/latest.rst

+1-1
@@ -1,5 +1,5 @@
 
-Version 1.1.0 (Latest)
+Version 1.1.1 (Latest)
 ======================

 .. toctree::

doc/api/training/sdp_versions/latest/smd_data_parallel_pytorch.rst

+2-2
@@ -153,9 +153,9 @@ you will have for distributed training with the distributed data parallel library
 PyTorch API
 ===========

-**Supported versions:**
+.. rubric:: Supported versions

-- PyTorch 1.6.0, 1.8.0
+**PyTorch 1.7.1, 1.8.0**


 .. function:: smdistributed.dataparallel.torch.distributed.is_available()

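For context on the API referenced in the hunk above, a minimal sketch of how `is_available()` is typically paired with initialization in an SDP PyTorch script; `init_process_group()` and `get_local_rank()` are assumed companions from the same module, not part of this diff:

    import smdistributed.dataparallel.torch.distributed as dist

    if dist.is_available():           # function documented in the hunk above
        dist.init_process_group()     # assumed initializer from the same module
        rank = dist.get_local_rank()  # assumed helper, often used to pin the GPU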
doc/api/training/sdp_versions/latest/smd_data_parallel_tensorflow.rst

+7-4
@@ -16,8 +16,9 @@ The following steps show you how to convert a TensorFlow 2.x training
 script to utilize the distributed data parallel library.

 The distributed data parallel library APIs are designed to be close to Horovod APIs.
-See `SageMaker distributed data parallel TensorFlow examples <https://sagemaker-examples.readthedocs.io/en/latest/training/distributed_training/index.html#tensorflow-distributed>`__ for additional details on how to implement the data parallel library
-API offered for TensorFlow.
+See `SageMaker distributed data parallel TensorFlow examples
+<https://sagemaker-examples.readthedocs.io/en/latest/training/distributed_training/index.html#tensorflow-distributed>`__
+for additional details on how to implement the data parallel library.

 - First import the distributed data parallel library’s TensorFlow client and initialize it:

@@ -156,8 +157,10 @@ TensorFlow API

 .. rubric:: Supported versions

-- TensorFlow 2.x - 2.3.1
-
+TensorFlow is supported in version 1.0.0 of ``smdistributed.dataparallel``.
+Reference version 1.0.0 `TensorFlow API documentation
+<https://sagemaker.readthedocs.io/en/stable/api/training/sdp_versions/latest/smd_data_parallel_tensorflow.html#tensorflow-sdp-api>`_
+for supported TensorFlow versions.

 .. function:: smdistributed.dataparallel.tensorflow.init()

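The hunk above points TensorFlow users at the SDP v1.0.0 API; as a sketch, the import-and-initialize step it describes looks like the following (only `init()` is named in the diff; the module alias is illustrative):

    import smdistributed.dataparallel.tensorflow as sdp  # alias is illustrative

    # Entry point documented above: smdistributed.dataparallel.tensorflow.init()
    sdp.init()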
doc/api/training/sdp_versions/v1.0.0/smd_data_parallel_pytorch.rst

+6-8
@@ -4,11 +4,10 @@ PyTorch Guide to SageMaker's distributed data parallel library
 
 .. admonition:: Contents

-- :ref:`pytorch-sdp-modify`
-- :ref:`pytorch-sdp-api`
+- :ref:`pytorch-sdp-modify-1.0.0`
+- :ref:`pytorch-sdp-api-1.0.0`

-.. _pytorch-sdp-modify:
-   :noindex:
+.. _pytorch-sdp-modify-1.0.0:

 Modify a PyTorch training script to use SageMaker data parallel
 ======================================================================
@@ -149,15 +148,14 @@ you will have for distributed training with the distributed data parallel library
     main()


-.. _pytorch-sdp-api:
-   :noindex:
+.. _pytorch-sdp-api-1.0.0:

 PyTorch API
 ===========

-**Supported versions:**
+.. rubric:: Supported versions

-- PyTorch 1.6.0
+**PyTorch 1.6.0, 1.7.1**


 .. function:: smdistributed.dataparallel.torch.distributed.is_available()

doc/api/training/sdp_versions/v1.0.0/smd_data_parallel_tensorflow.rst

+5-7
@@ -4,11 +4,10 @@ TensorFlow Guide to SageMaker's distributed data parallel library
 
 .. admonition:: Contents

-- :ref:`tensorflow-sdp-modify`
-- :ref:`tensorflow-sdp-api`
+- :ref:`tensorflow-sdp-modify-1.0.0`
+- :ref:`tensorflow-sdp-api-1.0.0`

-.. _tensorflow-sdp-modify:
-   :noindex:
+.. _tensorflow-sdp-modify-1.0.0:

 Modify a TensorFlow 2.x training script to use SageMaker data parallel
 ======================================================================
@@ -150,15 +149,14 @@ script you will have for distributed training with the library.
     checkpoint.save(checkpoint_dir)


-.. _tensorflow-sdp-api:
-   :noindex:
+.. _tensorflow-sdp-api-1.0.0:

 TensorFlow API
 ==============

 .. rubric:: Supported versions

-- TensorFlow 2.x - 2.3.1
+**TensorFlow 2.3.x - 2.4.1**


 .. function:: smdistributed.dataparallel.tensorflow.init()

doc/api/training/smd_data_parallel_release_notes/smd_data_parallel_change_log.md

+22-4
@@ -1,23 +1,41 @@
+# Sagemaker Distributed Data Parallel 1.1.1 Release Notes
+
+* New Features
+* Bug Fixes
+* Known Issues
+
+*New Features:*
+
+* Adds support for PyTorch 1.8.1
+
+*Bug Fixes:*
+
+* Fixes a bug that was causing gradients from one of the worker nodes to be added twice, resulting in incorrect `all_reduce` results under some conditions.
+
+*Known Issues:*
+
+* SageMaker distributed data parallel still is not efficient when run using a single node. For the best performance, use multi-node distributed training with `smdistributed.dataparallel`. Use a single node only for experimental runs while preparing your training pipeline.
+
 # Sagemaker Distributed Data Parallel 1.1.0 Release Notes

 * New Features
 * Bug Fixes
 * Improvements
 * Known Issues

-New Features:
+*New Features:*

 * Adds support for PyTorch 1.8.0 with CUDA 11.1 and CUDNN 8

-Bug Fixes:
+*Bug Fixes:*

 * Fixes crash issue when importing `smdataparallel` before PyTorch

-Improvements:
+*Improvements:*

 * Update `smdataparallel` name in python packages, descriptions, and log outputs

-Known Issues:
+*Known Issues:*

 * SageMaker DataParallel is not efficient when run using a single node. For the best performance, use multi-node distributed training with `smdataparallel`. Use a single node only for experimental runs while preparing your training pipeline.

doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst

+1-1
@@ -6,7 +6,7 @@
 PyTorch API
 ===========

-**Supported versions: 1.7.1, 1.8.0**
+**Supported versions: 1.6.0, 1.7.1, 1.8.0**

 This API document assumes you use the following import statements in your training scripts.

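The import statement that the last context line refers to is not shown in this hunk; as a sketch, the model parallel library's PyTorch docs of this era use the following pattern (both the import alias and the `smp.init()` call are assumptions here, not part of the diff):

    import smdistributed.modelparallel.torch as smp  # import the API doc assumes

    smp.init()  # assumed initialization before using the model parallel API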
doc/frameworks/huggingface/index.rst

+1
@@ -9,3 +9,4 @@ For general information about using the SageMaker Python SDK, see :ref:`overview
    :maxdepth: 2

    sagemaker.huggingface
+   Use Hugging Face with the SageMaker Python SDK <https://huggingface.co/transformers/sagemaker.html>

doc/requirements.txt

+1
@@ -1,2 +1,3 @@
 sphinx==3.1.1
 sphinx-rtd-theme==0.5.0
+docutils==0.15.2

src/sagemaker/clarify.py

+6-3
@@ -123,6 +123,7 @@ def __init__(
         content_type=None,
         content_template=None,
         custom_attributes=None,
+        accelerator_type=None,
     ):
         """Initializes a configuration of a model and the endpoint to be created for it.

@@ -151,6 +152,9 @@
                 Section 3.3.6. Field Value Components (
                 https://tools.ietf.org/html/rfc7230#section-3.2.6) of the Hypertext Transfer
                 Protocol (HTTP/1.1).
+            accelerator_type (str): The Elastic Inference accelerator type to deploy to the model
+                endpoint instance for making inferences to the model, see
+                https://docs.aws.amazon.com/sagemaker/latest/dg/ei.html.
         """
         self.predictor_config = {
             "model_name": model_name,
@@ -178,9 +182,8 @@
                 f" Please include a placeholder $features."
             )
             self.predictor_config["content_template"] = content_template
-
-        if custom_attributes is not None:
-            self.predictor_config["custom_attributes"] = custom_attributes
+        _set(custom_attributes, "custom_attributes", self.predictor_config)
+        _set(accelerator_type, "accelerator_type", self.predictor_config)

     def get_predictor_config(self):
         """Returns part of the predictor dictionary of the analysis config."""

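A sketch of how the new `accelerator_type` argument would be passed, assuming this `__init__` belongs to `sagemaker.clarify.ModelConfig` (the parameter list shown matches that class); all concrete values are placeholders:

    from sagemaker.clarify import ModelConfig

    # Sketch: the new accelerator_type flows into predictor_config via _set().
    model_config = ModelConfig(
        model_name="my-model",              # placeholder
        instance_type="ml.c5.xlarge",       # placeholder
        instance_count=1,
        accelerator_type="ml.eia2.medium",  # parameter added in this commit
    )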
tests/conftest.py

+12-7
@@ -190,7 +190,7 @@ def pytorch_inference_py_version(pytorch_inference_version, request):
     return "py3"


-def _huggingface_pytorch_version(huggingface_vesion):
+def _huggingface_base_fm_version(huggingface_vesion, base_fw):
     config = image_uris.config_for_framework("huggingface")
     training_config = config.get("training")
     original_version = huggingface_vesion
@@ -200,21 +200,26 @@
     )
     version_config = training_config.get("versions").get(huggingface_vesion)
     for key in list(version_config.keys()):
-        if key.startswith("pytorch"):
-            pt_version = key[7:]
+        if key.startswith(base_fw):
+            base_fw_version = key[len(base_fw) :]
             if len(original_version.split(".")) == 2:
-                pt_version = ".".join(pt_version.split(".")[:-1])
-            return pt_version
+                base_fw_version = ".".join(base_fw_version.split(".")[:-1])
+            return base_fw_version


 @pytest.fixture(scope="module")
 def huggingface_pytorch_version(huggingface_training_version):
-    return _huggingface_pytorch_version(huggingface_training_version)
+    return _huggingface_base_fm_version(huggingface_training_version, "pytorch")


 @pytest.fixture(scope="module")
 def huggingface_pytorch_latest_version(huggingface_training_latest_version):
-    return _huggingface_pytorch_version(huggingface_training_latest_version)
+    return _huggingface_base_fm_version(huggingface_training_latest_version, "pytorch")
+
+
+@pytest.fixture(scope="module")
+def huggingface_tensorflow_latest_version(huggingface_training_latest_version):
+    return _huggingface_base_fm_version(huggingface_training_latest_version, "tensorflow")


 @pytest.fixture(scope="module")

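To make the renamed helper's behavior concrete: it strips a framework-name prefix from image-config keys rather than hard-coding the length of "pytorch". A standalone sketch with illustrative keys, not the repo's actual config:

    # Illustrative sketch of the prefix stripping in _huggingface_base_fm_version.
    def strip_base_fw(key, base_fw):
        # e.g. "pytorch1.7.1" with base_fw="pytorch" -> "1.7.1" (hypothetical values)
        return key[len(base_fw):] if key.startswith(base_fw) else None

    print(strip_base_fw("pytorch1.7.1", "pytorch"))        # -> 1.7.1
    print(strip_base_fw("tensorflow2.4.1", "tensorflow"))  # -> 2.4.1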