
Commit 37e954a

Merge branch 'master' into loc-config-file
2 parents a90ac58 + 5dc9e58 commit 37e954a


76 files changed: +669 −415 lines changed


CHANGELOG.md

Lines changed: 55 additions & 0 deletions
@@ -1,5 +1,60 @@
 # Changelog

+## v2.25.1 (2021-02-20)
+
+### Bug Fixes and Other Changes
+
+ * Add tests for visualizer to improve test coverage
+
+### Documentation Changes
+
+ * specify correct return type
+
+### Testing and Release Infrastructure
+
+ * rename canary_quick pytest mark to release
+
+## v2.25.0 (2021-02-19)
+
+### Features
+
+ * Enable step caching
+ * Add other Neo supported regions for Inferentia inference images
+
+### Bug Fixes and Other Changes
+
+ * remove FailStep from pipelines
+ * use sagemaker_session in workflow tests
+ * use ECR public for multidatamodel tests
+ * add the mapping from py3 to cuda11 images
+ * Add 30s cap time for tag tests
+ * add build spec for slow tests
+ * mark top 10 slow tests
+ * remove slow test_run_xxx_monitor_baseline tests
+ * pin astroid to 2.4.2
+
+### Testing and Release Infrastructure
+
+ * unmark more flaky integ tests
+ * remove canary_quick pytest mark from flaky/unnecessary tests
+ * remove python3.8 from buildspec
+ * remove py38 tox env
+ * fix release buildspec typo
+ * unblock regional release builds
+ * lower test TPS for experiment analytics
+ * move package preparation and publishing to the deploy step
+
+## v2.24.5 (2021-02-12)
+
+### Bug Fixes and Other Changes
+
+ * test_tag/test_tags method assert fix in association tests
+
+### Documentation Changes
+
+ * removing mention of TF 2.4 from SM distributed model parallel docs
+ * adding details about mpi options, other small updates
+
 ## v2.24.4 (2021-02-09)

 ### Bug Fixes and Other Changes

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.24.5.dev0
+2.25.2.dev0

buildspec-deploy.yml

Lines changed: 10 additions & 1 deletion
@@ -3,7 +3,16 @@ version: 0.2
 phases:
   build:
     commands:
-      - PACKAGE_FILE="$CODEBUILD_SRC_DIR_ARTIFACT_1/sagemaker-*.tar.gz"
+      # prepare the release (update versions, changelog etc.)
+      - git-release --prepare
+
+      # generate the distribution package
+      - python3 setup.py sdist
+
+      # publish the release to github
+      - git-release --publish
+
+      - PACKAGE_FILE="dist/sagemaker-*.tar.gz"
       - PYPI_USER=$(aws secretsmanager get-secret-value --secret-id /codebuild/pypi/user --query SecretString --output text)
       - PYPI_PASSWORD=$(aws secretsmanager get-secret-value --secret-id /codebuild/pypi/password --query SecretString --output text)
       - GPG_PRIVATE_KEY=$(aws secretsmanager get-secret-value --secret-id /codebuild/gpg/private_key --query SecretString --output text)

buildspec-localmodetests.yml

Lines changed: 2 additions & 2 deletions
@@ -11,5 +11,5 @@ phases:

       # local mode tests
       - start_time=`date +%s`
-      - execute-command-if-has-matching-changes "tox -e py38 -- tests/integ -m local_mode --durations 50" "tests/integ" "tests/data" "tests/conftest.py" "tests/__init__.py" "src/*.py" "setup.py" "setup.cfg" "buildspec-localmodetests.yml"
-      - ./ci-scripts/displaytime.sh 'py38 local mode' $start_time
+      - execute-command-if-has-matching-changes "tox -e py37 -- tests/integ -m local_mode --durations 50" "tests/integ" "tests/data" "tests/conftest.py" "tests/__init__.py" "src/*.py" "setup.py" "setup.cfg" "buildspec-localmodetests.yml"
+      - ./ci-scripts/displaytime.sh 'py37 local mode' $start_time

buildspec-release.yml

Lines changed: 2 additions & 17 deletions
@@ -3,9 +3,6 @@ version: 0.2
 phases:
   build:
     commands:
-      # prepare the release (update versions, changelog etc.)
-      - git-release --prepare
-
       # run linters
       - tox -e flake8,pylint

@@ -18,19 +15,7 @@ phases:
       # run unit tests
       - AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN=
         AWS_CONTAINER_CREDENTIALS_RELATIVE_URI= AWS_DEFAULT_REGION=
-        tox -e py36,py37,py38 -- tests/unit
+        tox -e py36,py37 -- tests/unit

       # run a subset of the integration tests
-      - IGNORE_COVERAGE=- tox -e py36 -- tests/integ -m canary_quick -n 64 --boxed --reruns 2
-
-      # generate the distribution package
-      - python3 setup.py sdist
-
-      # publish the release to github
-      - git-release --publish
-
-artifacts:
-  files:
-    - dist/sagemaker-*.tar.gz
-  name: ARTIFACT_1
-  discard-paths: yes
+      - IGNORE_COVERAGE=- tox -e py36 -- tests/integ -m "not (local_mode or slow_test)" -n 32 --boxed --reruns 2

buildspec-slowtests.yml

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+version: 0.2
+
+phases:
+  pre_build:
+    commands:
+      - start-dockerd
+
+  build:
+    commands:
+      - IGNORE_COVERAGE=-
+
+      # slow tests
+      - start_time=`date +%s`
+      - execute-command-if-has-matching-changes "tox -e py37 -- tests/integ -m slow_test -n 16 --durations 0" "tests/integ" "tests/data" "tests/conftest.py" "tests/__init__.py" "src/*.py" "setup.py" "setup.cfg" "buildspec-slowtests.yml"
+      - ./ci-scripts/displaytime.sh 'py37 slow tests' $start_time

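The new buildspec above selects integration tests by the `slow_test` pytest mark, which buildspec-release.yml and buildspec.yml now exclude from their runs. As a hedged illustration of how a test acquires that mark (the test name below is hypothetical, not taken from this commit):

    import time

    import pytest


    @pytest.mark.slow_test  # custom mark; CI selects these tests with `-m slow_test`
    def test_hypothetical_end_to_end_training_flow():
        """Stand-in for an integration test that takes several minutes."""
        time.sleep(1)  # placeholder for the slow work
        assert True

To keep pytest from warning about an unknown mark, the `slow_test` mark would normally also be registered, for example under `markers` in setup.cfg or pytest.ini.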
buildspec-unittests.yml

Lines changed: 2 additions & 2 deletions
@@ -18,5 +18,5 @@ phases:
       - start_time=`date +%s`
       - AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN=
         AWS_CONTAINER_CREDENTIALS_RELATIVE_URI= AWS_DEFAULT_REGION=
-        tox -e py36,py37,py38 --parallel all -- tests/unit
-      - ./ci-scripts/displaytime.sh 'py36,py37,py38 unit' $start_time
+        tox -e py36,py37 --parallel all -- tests/unit
+      - ./ci-scripts/displaytime.sh 'py36,py37 unit' $start_time

buildspec.yml

Lines changed: 3 additions & 3 deletions
@@ -11,13 +11,13 @@ phases:

       # run integration tests
       - start_time=`date +%s`
-      - execute-command-if-has-matching-changes "python3.8 -u ci-scripts/queue_build.py" "tests/integ" "tests/scripts" "tests/data" "tests/conftest.py" "tests/__init__.py" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
+      - execute-command-if-has-matching-changes "python3.7 -u ci-scripts/queue_build.py" "tests/integ" "tests/scripts" "tests/data" "tests/conftest.py" "tests/__init__.py" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
       - ./ci-scripts/displaytime.sh 'build queue' $start_time

       - start_time=`date +%s`
       - |
-        execute-command-if-has-matching-changes "env -u AWS_DEFAULT_REGION tox -e py38 -- tests/integ -m \"not local_mode and not cron\" -n 384 --reruns 3 --reruns-delay 15 --durations 50 --boto-config '{\"region_name\": \"us-east-2\"}'" "tests/integ" "tests/scripts" "tests/data" "tests/conftest.py" "tests/__init__.py" "src/*.py" "src/sagemaker/image_uri_config/*.json" "setup.py" "setup.cfg" "buildspec.yml"
-      - ./ci-scripts/displaytime.sh 'py38 tests/integ' $start_time
+        execute-command-if-has-matching-changes "env -u AWS_DEFAULT_REGION tox -e py37 -- tests/integ -m \"not local_mode and not cron and not slow_test\" -n 384 --reruns 3 --reruns-delay 15 --durations 50 --boto-config '{\"region_name\": \"us-east-2\"}'" "tests/integ" "tests/scripts" "tests/data" "tests/conftest.py" "tests/__init__.py" "src/*.py" "src/sagemaker/image_uri_config/*.json" "setup.py" "setup.cfg" "buildspec.yml"
+      - ./ci-scripts/displaytime.sh 'py37 tests/integ' $start_time

   post_build:
     finally:

doc/api/training/smd_model_parallel_general.rst

Lines changed: 41 additions & 3 deletions
@@ -5,13 +5,13 @@

 .. _sm-sdk-modelparallel-params:

-SageMaker Python SDK ``modelparallel`` parameters
-=================================================
+Required SageMaker Python SDK parameters
+========================================

 The TensorFlow and PyTorch ``Estimator`` objects contains a ``distribution`` parameter,
 which is used to enable and specify parameters for the
 initialization of the SageMaker distributed model parallel library. The library internally uses MPI,
-so in order to use model parallelism, MPI must be enabled using the ``distribution`` parameter.
+so in order to use model parallelism, MPI must also be enabled using the ``distribution`` parameter.

 The following is an example of how you can launch a new PyTorch training job with the library.

@@ -55,6 +55,9 @@ The following is an example of how you can launch a new PyTorch training job wit

     smd_mp_estimator.fit('s3://my_bucket/my_training_data/')

+``smdistributed`` Parameters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
 You can use the following parameters to initialize the library using the ``parameters``
 in the ``smdistributed`` of ``distribution``.

@@ -302,6 +305,41 @@ table are optional.
 |                   |                         |                 | SageMaker.                        |
 +-------------------+-------------------------+-----------------+-----------------------------------+

+``mpi`` Parameters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+For the ``"mpi"`` key, a dict must be passed which contains:
+
+* ``"enabled"``: Set to ``True`` to launch the training job with MPI.
+
+* ``"processes_per_host"``: Specifies the number of processes MPI should launch on each host.
+  In SageMaker a host is a single Amazon EC2 ml instance. The SageMaker distributed model parallel library maintains
+  a one-to-one mapping between processes and GPUs across model and data parallelism.
+  This means that SageMaker schedules each process on a single, separate GPU and no GPU contains more than one process.
+  If you are using PyTorch, you must restrict each process to its own device using
+  ``torch.cuda.set_device(smp.local_rank())``. To learn more, see
+  `Modify a PyTorch Training Script
+  <https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-customize-training-script.html#model-parallel-customize-training-script-pt-16>`_.
+
+  .. important::
+    ``process_per_host`` must be less than or equal to the number of GPUs per instance, and typically will be equal to
+    the number of GPUs per instance.
+
+  For example, if you use one instance with 4-way model parallelism and 2-way data parallelism,
+  then processes_per_host should be 2 x 4 = 8. Therefore, you must choose an instance that has at least 8 GPUs,
+  such as an ml.p3.16xlarge.
+
+  The following image illustrates how 2-way data parallelism and 4-way model parallelism is distributed across 8 GPUs:
+  the model is partitioned across 4 GPUs, and each partition is added to 2 GPUs.
+
+  .. image:: smp_versions/model-data-parallel.png
+     :width: 650
+     :alt: 2-way data parallelism and 4-way model parallelism distributed across 8 GPUs
+
+
+* ``"custom_mpi_options"``: Use this key to pass any custom MPI options you might need.
+  To avoid Docker warnings from contaminating your training logs, we recommend the following flag.
+  ```--mca btl_vader_single_copy_mechanism none```
+

 .. _ranking-basics:


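The documentation added above describes the ``smdistributed`` and ``mpi`` keys of the estimator's ``distribution`` argument. As a minimal sketch of the configuration it describes (the script, role, versions, and parameter values below are placeholders, not taken from this commit):

    from sagemaker.pytorch import PyTorch

    # Hypothetical values matching the doc's example of 4-way model parallelism
    # and 2-way data parallelism: 8 processes on one ml.p3.16xlarge (8 GPUs).
    smd_mp_estimator = PyTorch(
        entry_point="train.py",  # placeholder training script
        role="arn:aws:iam::111122223333:role/SageMakerRole",  # placeholder role
        instance_type="ml.p3.16xlarge",
        instance_count=1,
        framework_version="1.6.0",
        py_version="py3",
        distribution={
            "smdistributed": {
                "modelparallel": {
                    "enabled": True,
                    "parameters": {"partitions": 4, "ddp": True},
                }
            },
            "mpi": {
                "enabled": True,
                "processes_per_host": 8,  # 4-way model x 2-way data parallelism
                "custom_mpi_options": "--mca btl_vader_single_copy_mechanism none",
            },
        },
    )

    smd_mp_estimator.fit("s3://my_bucket/my_training_data/")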
doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.md

Lines changed: 0 additions & 4 deletions
@@ -17,10 +17,6 @@

 - Adds support for `_register_comm_hook` (PyTorch 1.7 only) which will register the callable as a communication hook for DDP. NOTE: Like in DDP, this is an experimental API and subject to change.

-### Tensorflow
-
-- Adds support for Tensorflow 2.4
-
 ## Bug Fixes

 ### PyTorch

doc/api/training/smp_versions/v1.2.0/smd_model_parallel_common_api.rst

Lines changed: 3 additions & 0 deletions
@@ -118,6 +118,9 @@ The following SageMaker distribute model parallel APIs are common across all fra
   - https://www.tensorflow.org/api_docs/python/tf/function\
   - https://www.tensorflow.org/guide/function\

+  Each ``smp.step`` decorated function must have a return value that depends on the
+  output of ``smp.DistributedModel``.
+
   **Common parameters**

   - ``non_split_inputs`` (``list``): The list of arguments to the decorated function

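The added sentence above requires the return value of every ``smp.step``-decorated function to depend on the output of ``smp.DistributedModel``. A minimal PyTorch-flavored sketch of such a function (the names are illustrative, not code from this commit):

    import smdistributed.modelparallel.torch as smp
    import torch.nn.functional as F


    @smp.step
    def train_step(model, data, target):
        # `model` is assumed to be an smp.DistributedModel; the returned values
        # depend on its output, as the added note requires.
        output = model(data)
        loss = F.nll_loss(output, target, reduction="mean")
        model.backward(loss)  # backward is called on the model inside smp.step
        return output, loss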
doc/api/training/smp_versions/v1.2.0/smd_model_parallel_pytorch.rst

Lines changed: 16 additions & 3 deletions
@@ -31,7 +31,6 @@ This API document assumes you use the following import statements in your traini
    model in the training script can be wrapped with
    ``smp.DistributedModel``.

-
    **Example:**

    .. code:: python
@@ -89,6 +88,17 @@ This API document assumes you use the following import statements in your traini
    the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside
    a ``smp.step``-decorated function.

+   **Using DDP**
+
+   If DDP is enabled, do not not place a PyTorch
+   ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because
+   the ``DistributedModel`` wrapper will also handle data parallelism.
+
+   Unlike the original DDP wrapper, when you use ``DistributedModel``,
+   model parameters and buffers are not immediately broadcast across
+   processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the
+   ``smp.step``-decorated function when the partition is done.
+
    **Parameters**

    - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism).
@@ -248,11 +258,14 @@ This API document assumes you use the following import statements in your traini
 .. function:: join( )

    **Available for PyTorch 1.7 only**
+
    A context manager to be used in conjunction with an instance of
-   ``smp.DistributedModel``to be able to train with uneven inputs across
+   ``smp.DistributedModel`` to be able to train with uneven inputs across
    participating processes. This is only supported when ``ddp=True`` for
    ``smp.DistributedModel``. This will use the join with the wrapped
-   ``DistributedDataParallel`` instance. Please see: `join <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel.join>`__.
+   ``DistributedDataParallel`` instance. For more information, see:
+   `join <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel.join>`__
+   in the PyTorch documentation.


 .. class:: smp.DistributedOptimizer

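The new "Using DDP" note above says that ``smp.DistributedModel`` already covers data parallelism when DDP is enabled, so the PyTorch ``DistributedDataParallel`` wrapper must not be added on top. A short sketch under that assumption (the model and hyperparameters are placeholders):

    import torch
    import torch.nn as nn
    import smdistributed.modelparallel.torch as smp

    smp.init()
    torch.cuda.set_device(smp.local_rank())  # one process per GPU

    model = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 10))

    # Wrap with smp.DistributedModel only; do not also wrap it in
    # torch.nn.parallel.DistributedDataParallel. Parameter/buffer broadcast is
    # deferred to the first call of an smp.step-decorated function.
    model = smp.DistributedModel(model)
    optimizer = smp.DistributedOptimizer(torch.optim.SGD(model.parameters(), lr=0.01))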
doc/api/training/smp_versions/v1.2.0/smd_model_parallel_tensorflow.rst

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 TensorFlow API
 ==============

-**Supported version: 2.4, 2.3**
+**Supported version: 2.3**

 **Important**: This API document assumes you use the following import statement in your training scripts.

doc/frameworks/mxnet/using_mxnet.rst

Lines changed: 5 additions & 4 deletions
@@ -377,7 +377,7 @@ It loads the model parameters from a ``model.params`` file in the SageMaker mode
         return net

 MXNet on Amazon SageMaker has support for `Elastic Inference <https://docs.aws.amazon.com/sagemaker/latest/dg/ei.html>`__, which allows for inference acceleration to a hosted endpoint for a fraction of the cost of using a full GPU instance.
-In order to load and serve your MXNet model through Amazon Elastic Inference, the MXNet context passed to your MXNet Symbol or Module object within your ``model_fn`` needs to be set to ``eia``, as shown `here <https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-mxnet-elastic-inference.html#ei-mxnet>`__.
+In order to load and serve your MXNet model through Amazon Elastic Inference, import the ``eimx`` Python package and make one change in the code to partition your model and optimize it for the ``EIA`` back end, as shown `here <https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-mxnet-elastic-inference.html#ei-mxnet>`__.

 Based on the example above, the following code-snippet shows an example custom ``model_fn`` implementation, which enables loading and serving our MXNet model through Amazon Elastic Inference.

@@ -392,11 +392,12 @@ Based on the example above, the following code-snippet shows an example custom `
         Returns:
             mxnet.gluon.nn.Block: a Gluon network (for this example)
         """
-        net = models.get_model('resnet34_v2', ctx=mx.eia(), pretrained=False, classes=10)
-        net.load_params('%s/model.params' % model_dir, ctx=mx.eia())
+        net = models.get_model('resnet34_v2', ctx=mx.cpu(), pretrained=False, classes=10)
+        net.load_params('%s/model.params' % model_dir, ctx=mx.cpu())
+        net.hybridize(backend='EIA', static_alloc=True, static_shape=True)
         return net

-The `default_model_fn <https://github.com/aws/sagemaker-mxnet-container/pull/55/files#diff-aabf018d906ed282a3c738377d19a8deR71>`__ loads and serve your model through Elastic Inference, if applicable, within the Amazon SageMaker MXNet containers.
+If you are using MXNet 1.5.1 and earlier, the `default_model_fn <https://github.com/aws/sagemaker-mxnet-container/pull/55/files#diff-aabf018d906ed282a3c738377d19a8deR71>`__ loads and serve your model through Elastic Inference, if applicable, within the Amazon SageMaker MXNet containers.

 For more information on how to enable MXNet to interact with Amazon Elastic Inference, see `Use Elastic Inference with MXNet <https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-mxnet-elastic-inference.html>`__.

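For context on the hunk above, a self-contained version of the updated ``model_fn`` might look as follows; the ``eimx`` import and surrounding imports are assumptions based on the linked Elastic Inference tutorial rather than code shown in this commit:

    import mxnet as mx
    from mxnet.gluon.model_zoo import vision as models

    import eimx  # noqa: F401  # assumed: registers the 'EIA' hybridize backend


    def model_fn(model_dir):
        """Load a Gluon model and optimize it for the Elastic Inference (EIA) back end."""
        net = models.get_model('resnet34_v2', ctx=mx.cpu(), pretrained=False, classes=10)
        net.load_params('%s/model.params' % model_dir, ctx=mx.cpu())
        # Partition the graph and optimize it for the EIA back end.
        net.hybridize(backend='EIA', static_alloc=True, static_shape=True)
        return net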
doc/workflows/pipelines/sagemaker.workflow.pipelines.rst

Lines changed: 0 additions & 2 deletions
@@ -110,8 +110,6 @@ Steps

 .. autoclass:: sagemaker.workflow.steps.ProcessingStep

-.. autoclass:: sagemaker.workflow.steps.FailStep
-
 Utilities
 ---------

setup.py

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ def read_version():
     "pytest<6.1.0",
     "pytest-cov",
     "pytest-rerunfailures",
+    "pytest-timeout",
     "pytest-xdist",
     "mock",
     "contextlib2",

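``pytest-timeout`` is added here as a test dependency; per the changelog entry about a 30s cap for tag tests, it presumably bounds individual test run time. A hedged illustration with a hypothetical test name:

    import pytest


    @pytest.mark.timeout(30)  # fail the test if it runs longer than 30 seconds
    def test_hypothetical_tag_listing():
        assert True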
src/sagemaker/analytics.py

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ class AnalyticsMetricsBase(with_metaclass(ABCMeta, object)):
     """

     def __init__(self):
+        """Initializes ``AnalyticsMetricsBase`` instance."""
         self._dataframe = None

     def export_csv(self, filename):

src/sagemaker/image_uri_config/inferentia-mxnet.json

Lines changed: 22 additions & 1 deletion
@@ -5,8 +5,29 @@
     "1.5.1": {
       "py_versions": ["py3"],
       "registries": {
+        "af-south-1": "774647643957",
+        "ap-east-1": "110948597952",
+        "ap-northeast-1": "941853720454",
+        "ap-northeast-2": "151534178276",
+        "ap-south-1": "763008648453",
+        "ap-southeast-1": "324986816169",
+        "ap-southeast-2": "355873309152",
+        "ca-central-1": "464438896020",
+        "cn-north-1": "472730292857",
+        "cn-northwest-1": "474822919863",
+        "eu-central-1": "746233611703",
+        "eu-north-1": "601324751636",
+        "eu-south-1": "966458181534",
+        "eu-west-1": "802834080501",
+        "eu-west-2": "205493899709",
+        "eu-west-3": "254080097072",
+        "me-south-1": "836785723513",
+        "sa-east-1": "756306329178",
         "us-east-1": "785573368785",
-        "us-west-2": "301217895009"
+        "us-east-2": "007439368137",
+        "us-gov-west-1": "263933020539",
+        "us-west-1": "710691900526",
+        "us-west-2": "301217895009"
       },
       "repository": "sagemaker-neo-mxnet"
     }

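These registry additions line up with the changelog entry about more Neo supported regions for Inferentia inference images. A hedged sketch of how such a config file is typically consumed through the SDK (the framework string and region here are assumptions, not verified against this commit):

    from sagemaker import image_uris

    # Look up the Inferentia MXNet inference image in one of the newly added regions.
    uri = image_uris.retrieve(
        framework="inferentia-mxnet",  # assumed to map to inferentia-mxnet.json
        region="eu-west-1",
        version="1.5.1",
        py_version="py3",
        instance_type="ml.inf1.xlarge",
    )
    print(uri)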
0 commit comments
