
Commit 7b8c470

Merge branch 'dev' into dev
2 parents: 64b5f51 + 169dffd

File tree: 5 files changed (+97, -39 lines)


doc/api/training/sdp_versions/latest/smd_data_parallel_tensorflow.rst (+37, -18)

@@ -243,16 +243,25 @@ TensorFlow API

 .. function:: smdistributed.dataparallel.tensorflow.allreduce(tensor, param_index, num_params, compression=Compression.none, op=ReduceOp.AVERAGE)

-   Performs an all-reduce operation on a tensor (``tf.Tensor``).
+   Performs an ``allreduce`` operation on a tensor (``tf.Tensor``).
+
+   The ``smdistributed.dataparallel`` package provides the AllReduce API for
+   TensorFlow to allreduce gradient tensors. By default,
+   ``smdistributed.dataparallel`` allreduce averages the gradient tensors
+   across the participating workers.
+
+   .. note::
+
+      :class:`smdistributed.dataparallel.tensorflow.allreduce()` should
+      only be used to allreduce gradient tensors.
+      For other (non-gradient) tensors, you must use
+      :class:`smdistributed.dataparallel.tensorflow.oob_allreduce()`.
+      If you use :class:`smdistributed.dataparallel.tensorflow.allreduce()`
+      for non-gradient tensors,
+      the distributed training job might stall or stop.

-   ``smdistributed.dataparallel`` AllReduce API can be used for all
-   reducing gradient tensors or any other tensors. By
-   default, ``smdistributed.dataparallel`` AllReduce averages the
-   tensors across the participating workers.

   **Inputs:**

-   - ``tensor (tf.Tensor)(required)``: The tensor to be all-reduced. The shape of the input must be identical across all ranks.
+   - ``tensor (tf.Tensor)(required)``: The tensor to be allreduced. The shape of the input must be identical across all ranks.
   - ``param_index (int)(required)``: 0 if you are reducing a single tensor. Index of the tensor if you are reducing a list of tensors.
   - ``num_params (int)(required)``: len(tensor).
   - ``compression (smdistributed.dataparallel.tensorflow.Compression)(optional)``: Compression algorithm used to reduce the amount of data sent and received by each worker node. Defaults to not using compression.

@@ -306,9 +315,9 @@ TensorFlow API

 .. function:: smdistributed.dataparallel.tensorflow.oob_allreduce(tensor, compression=Compression.none, op=ReduceOp.AVERAGE)

-   OutOfBand (oob) AllReduce is simplified AllReduce function for use cases
+   Out-of-band (oob) AllReduce is a simplified AllReduce function for use cases
    such as calculating total loss across all the GPUs in the training.
-   oob_allreduce average the tensors, as reduction operation, across the
+   ``oob_allreduce`` averages the tensors, as the reduction operation, across the
    worker nodes.

   **Inputs:**

@@ -326,15 +335,25 @@ TensorFlow API

   - ``None``

-   .. rubric:: Notes
-
-   ``smdistributed.dataparallel.tensorflow.oob_allreduce``, in most
-   cases, is ~2x slower
-   than ``smdistributed.dataparallel.tensorflow.allreduce`` so it is not
-   recommended to be used for performing gradient reduction during the
-   training
-   process. ``smdistributed.dataparallel.tensorflow.oob_allreduce`` internally
-   uses NCCL AllReduce with ``ncclSum`` as the reduction operation.
+   .. note::
+
+      In most cases, the :class:`smdistributed.dataparallel.tensorflow.oob_allreduce()`
+      function is ~2x slower
+      than :class:`smdistributed.dataparallel.tensorflow.allreduce()`. It is not
+      recommended to use the :class:`smdistributed.dataparallel.tensorflow.oob_allreduce()`
+      function to perform gradient
+      reduction during the training process.
+      ``smdistributed.dataparallel.tensorflow.oob_allreduce`` internally
+      uses NCCL AllReduce with ``ncclSum`` as the reduction operation.
+
+   .. note::
+
+      :class:`smdistributed.dataparallel.tensorflow.oob_allreduce()` should
+      only be used to allreduce non-gradient tensors.
+      If you use :class:`smdistributed.dataparallel.tensorflow.allreduce()`
+      for non-gradient tensors,
+      the distributed training job might stall or stop.
+      To allreduce gradients, use :class:`smdistributed.dataparallel.tensorflow.allreduce()`.


 .. function:: smdistributed.dataparallel.tensorflow.overlap(tensor)
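
The gradient/non-gradient split that the new notes draw is easier to see in code. The following is a minimal sketch, not part of the commit: it assumes a SageMaker data-parallel training script, and the loss and gradient values are placeholders standing in for a real training step. The calls follow the signatures documented above:

    import tensorflow as tf
    import smdistributed.dataparallel.tensorflow as sdp

    sdp.init()  # set up the data-parallel workers

    # Placeholder values standing in for a real training step.
    grads = [tf.ones([2, 2]), tf.ones([3])]   # gradient tensors
    step_loss = tf.constant(0.42)             # a non-gradient scalar

    # Gradient tensors go through allreduce(), following the documented
    # signature allreduce(tensor, param_index, num_params, ...).
    averaged_grads = [
        sdp.allreduce(g, param_index=i, num_params=len(grads))
        for i, g in enumerate(grads)
    ]

    # Non-gradient tensors (for example, a loss averaged for logging) go
    # through oob_allreduce(); per the note above, using allreduce() here
    # could stall or stop the training job.
    mean_loss = sdp.oob_allreduce(step_loss)

This is a sketch of the calling convention only, not a full training loop; it needs multiple workers in the SageMaker data parallel environment to do anything useful.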

src/sagemaker/huggingface/estimator.py (+7, -6)

@@ -50,14 +50,15 @@ def __init__(
         compiler_config=None,
         **kwargs,
     ):
-        """This ``Estimator`` executes a HuggingFace script in a managed execution environment.
+        """This estimator runs a Hugging Face training script in a SageMaker training environment.

-        The managed HuggingFace environment is an Amazon-built Docker container that executes
-        functions defined in the supplied ``entry_point`` Python script within a SageMaker
-        Training Job.
+        The estimator initiates the SageMaker-managed Hugging Face environment
+        by using the pre-built Hugging Face Docker container and runs
+        the Hugging Face training script that the user provides through
+        the ``entry_point`` argument.

-        Training is started by calling
-        :meth:`~sagemaker.amazon.estimator.Framework.fit` on this Estimator.
+        After configuring the estimator class, use the class method
+        :meth:`~sagemaker.amazon.estimator.Framework.fit()` to start a training job.

         Args:
             py_version (str): Python version you want to use for executing your model training
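
Since the revised docstring describes the ``entry_point``/``fit()`` flow, a short usage sketch may help. The script name, instance settings, role ARN, and framework versions below are illustrative assumptions, not values from this commit:

    from sagemaker.huggingface import HuggingFace

    huggingface_estimator = HuggingFace(
        entry_point="train.py",          # the user-provided training script
        instance_type="ml.p3.2xlarge",
        instance_count=1,
        role="arn:aws:iam::111122223333:role/SageMakerRole",
        transformers_version="4.6",
        pytorch_version="1.7",
        py_version="py36",
    )

    # As the revised docstring says, fit() starts the training job in the
    # pre-built Hugging Face container.
    huggingface_estimator.fit({"train": "s3://my-bucket/train"})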

src/sagemaker/model.py (+6, -3)

@@ -467,7 +467,7 @@ def _upload_code(self, key_prefix: str, repack: bool = False) -> None:
         )

     def _script_mode_env_vars(self):
-        """Placeholder docstring"""
+        """Returns a mapping of environment variables for script mode execution"""
         script_name = None
         dir_name = None
         if self.uploaded_code:

@@ -479,8 +479,11 @@ def _script_mode_env_vars(self):
         elif self.entry_point is not None:
             script_name = self.entry_point
             if self.source_dir is not None:
-                dir_name = "file://" + self.source_dir
-
+                dir_name = (
+                    self.source_dir
+                    if self.source_dir.startswith("s3://")
+                    else "file://" + self.source_dir
+                )
         return {
             SCRIPT_PARAM_NAME.upper(): script_name or str(),
             DIR_PARAM_NAME.upper(): dir_name or str(),
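
The effect of this change is that an ``s3://`` source directory is now passed through to the submit-directory environment variable unchanged, while local paths still receive the ``file://`` scheme. A standalone sketch of the new branch (the helper name ``resolve_dir_name`` is hypothetical, for illustration only):

    def resolve_dir_name(source_dir: str) -> str:
        """Mirror the new branch: keep S3 URIs as-is, prefix local paths with file://."""
        return source_dir if source_dir.startswith("s3://") else "file://" + source_dir

    # An S3 source dir now passes through unchanged ...
    assert resolve_dir_name("s3://codebucket/someprefix/sourcedir.tar.gz") == (
        "s3://codebucket/someprefix/sourcedir.tar.gz"
    )
    # ... while a local path still gets the file:// scheme, as before.
    assert resolve_dir_name("/opt/ml/code") == "file:///opt/ml/code"

The new unit test at the bottom of this commit asserts exactly this pass-through behavior via ``SAGEMAKER_SUBMIT_DIRECTORY``.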

src/sagemaker/training_compiler/config.py (+28, -11)

@@ -18,11 +18,7 @@


 class TrainingCompilerConfig(object):
-    """The configuration class for accelerating SageMaker training jobs through compilation.
-
-    SageMaker Training Compiler speeds up training by optimizing the model execution graph.
-
-    """
+    """The SageMaker Training Compiler configuration class."""

     DEBUG_PATH = "/opt/ml/output/data/compiler/"
     SUPPORTED_INSTANCE_CLASS_PREFIXES = ["p3", "g4dn", "p4"]

@@ -37,9 +33,15 @@ def __init__(
     ):
         """This class initializes a ``TrainingCompilerConfig`` instance.

-        Pass the output of it to the ``compiler_config``
+        `Amazon SageMaker Training Compiler
+        <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_
+        is a feature of SageMaker Training
+        that speeds up training jobs by optimizing model execution graphs.
+
+        You can compile Hugging Face models
+        by passing an object of this configuration class to the ``compiler_config``
         parameter of the :class:`~sagemaker.huggingface.HuggingFace`
-        class.
+        estimator.

         Args:
             enabled (bool): Optional. Switch to enable SageMaker Training Compiler.

@@ -48,13 +50,28 @@
             This comes with a potential performance slowdown.
             The default is ``False``.

-        **Example**: The following example shows the basic ``compiler_config``
-        parameter configuration, enabling compilation with default parameter values.
+        **Example**: The following code shows the basic usage of the
+        :class:`sagemaker.huggingface.TrainingCompilerConfig()` class
+        to run a HuggingFace training job with the compiler.

         .. code-block:: python

-            from sagemaker.huggingface import TrainingCompilerConfig
-            compiler_config = TrainingCompilerConfig()
+            from sagemaker.huggingface import HuggingFace, TrainingCompilerConfig
+
+            huggingface_estimator = HuggingFace(
+                ...
+                compiler_config=TrainingCompilerConfig()
+            )
+
+        .. seealso::
+
+           For more information about how to enable SageMaker Training Compiler
+           for various training settings such as using TensorFlow-based models,
+           PyTorch-based models, and distributed training,
+           see `Enable SageMaker Training Compiler
+           <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler-enable.html>`_
+           in the `Amazon SageMaker Training Compiler developer guide
+           <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_.

         """
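
The docstring example elides the other estimator arguments with ``...``; a fuller sketch of the same pattern follows, with the documented ``enabled`` switch spelled out. The script name, role ARN, instance type, and framework versions are illustrative assumptions (the class itself only checks the instance class prefixes ``p3``, ``g4dn``, and ``p4`` listed above):

    from sagemaker.huggingface import HuggingFace, TrainingCompilerConfig

    # The Args section above documents `enabled` as an optional switch;
    # it is passed explicitly here for clarity.
    compiler_config = TrainingCompilerConfig(enabled=True)

    huggingface_estimator = HuggingFace(
        entry_point="train.py",          # illustrative script name
        instance_type="ml.p3.2xlarge",   # p3 is one of the supported instance class prefixes
        instance_count=1,
        role="arn:aws:iam::111122223333:role/SageMakerRole",  # illustrative role ARN
        transformers_version="4.11",     # illustrative versions; see the linked
        pytorch_version="1.9",           # developer guide for supported combinations
        py_version="py38",
        compiler_config=compiler_config,
    )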

tests/unit/sagemaker/model/test_model.py (+19, -1)

@@ -26,6 +26,8 @@
 from sagemaker.sklearn.model import SKLearnModel
 from sagemaker.tensorflow.model import TensorFlowModel
 from sagemaker.xgboost.model import XGBoostModel
+from sagemaker.workflow.properties import Properties
+

 MODEL_DATA = "s3://bucket/model.tar.gz"
 MODEL_IMAGE = "mi"

@@ -42,7 +44,6 @@
 BRANCH = "test-branch-git-config"
 COMMIT = "ae15c9d7d5b97ea95ea451e4662ee43da3401d73"
 ENTRY_POINT_INFERENCE = "inference.py"
-
 SCRIPT_URI = "s3://codebucket/someprefix/sourcedir.tar.gz"
 IMAGE_URI = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference:1.9.0-gpu-py38"

@@ -71,6 +72,23 @@ def sagemaker_session():
     return sms


+@patch("shutil.rmtree", MagicMock())
+@patch("tarfile.open", MagicMock())
+@patch("os.listdir", MagicMock(return_value=[ENTRY_POINT_INFERENCE]))
+def test_prepare_container_def_with_model_src_s3_returns_correct_url(sagemaker_session):
+    model = Model(
+        entry_point=ENTRY_POINT_INFERENCE,
+        role=ROLE,
+        sagemaker_session=sagemaker_session,
+        source_dir=SCRIPT_URI,
+        image_uri=MODEL_IMAGE,
+        model_data=Properties("Steps.MyStep"),
+    )
+    container_def = model.prepare_container_def(INSTANCE_TYPE, "ml.eia.medium")
+
+    assert container_def["Environment"]["SAGEMAKER_SUBMIT_DIRECTORY"] == SCRIPT_URI
+
+
 def test_prepare_container_def_with_model_data():
     model = Model(MODEL_IMAGE)
     container_def = model.prepare_container_def(INSTANCE_TYPE, "ml.eia.medium")
