Merge remote-tracking branch 'aws/master' into trcomp-hf-pt-111

Lokiiiiii · Lokiiiiii · commit 0b2a16c75eb8 · 2022-08-29T17:47:52.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,19 @@
 # Changelog
 
+## v2.107.0 (2022-08-29)
+
+### Features
+
+ * support python 3.10, update airflow dependency
+
+### Bug Fixes and Other Changes
+
+ * Add retry in session.py to check if training is finished
+
+### Documentation Changes
+
+ * remove Other tab in Built-in algorithms section and mi…
+
 ## v2.106.0 (2022-08-24)
 
 ### Features
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -78,9 +78,9 @@ Before sending us a pull request, please ensure that:
 1. cd into the sagemaker-python-sdk folder: `cd sagemaker-python-sdk` or `cd /environment/sagemaker-python-sdk`
 1. Run the following tox command and verify that all code checks and unit tests pass: `tox tests/unit`
 
-You can also run a single test with the following command: `tox -e py36 -- -s -vv <path_to_file><file_name>::<test_function_name>`
+You can also run a single test with the following command: `tox -e py310 -- -s -vv <path_to_file><file_name>::<test_function_name>`
   * Note that the coverage test will fail if you only run a single test, so make sure to surround the command with `export IGNORE_COVERAGE=-` and `unset IGNORE_COVERAGE`
-  * Example: `export IGNORE_COVERAGE=- ; tox -e py36 -- -s -vv tests/unit/test_estimator.py::test_sagemaker_model_s3_uri_invalid ; unset IGNORE_COVERAGE`
+  * Example: `export IGNORE_COVERAGE=- ; tox -e py310 -- -s -vv tests/unit/test_estimator.py::test_sagemaker_model_s3_uri_invalid ; unset IGNORE_COVERAGE`
 
 
 ### Run the Integration Tests
@@ -89,9 +89,9 @@ Our CI system runs integration tests (the ones in the `tests/integ` directory),
 You should only worry about manually running any new integration tests that you write, or integration tests that test an area of code that you've modified.
 
 1. Follow the instructions at [Set Up the AWS Command Line Interface (AWS CLI)](https://docs.aws.amazon.com/polly/latest/dg/setup-aws-cli.html).
-1. To run a test, specify the test file and method you want to run per the following command: `tox -e py36 -- -s -vv <path_to_file><file_name>::<test_function_name>`
+1. To run a test, specify the test file and method you want to run per the following command: `tox -e py310 -- -s -vv <path_to_file><file_name>::<test_function_name>`
    * Note that the coverage test will fail if you only run a single test, so make sure to surround the command with `export IGNORE_COVERAGE=-` and `unset IGNORE_COVERAGE`
-   * Example: `export IGNORE_COVERAGE=- ; tox -e py36 -- -s -vv tests/integ/test_tf_script_mode.py::test_mnist ; unset IGNORE_COVERAGE`
+   * Example: `export IGNORE_COVERAGE=- ; tox -e py310 -- -s -vv tests/integ/test_tf_script_mode.py::test_mnist ; unset IGNORE_COVERAGE`
 
 If you are writing or modifying a test that creates a SageMaker job (training, tuner, or transform) or endpoint, it's important to assign a concurrency-friendly `job_name` (or `endpoint_name`), or your tests may fail randomly due to name collisions. We have a helper method `sagemaker.utils.unique_name_from_base(base, max_length)` that makes test-friendly names. You can find examples of how to use it [here](https://github.com/aws/sagemaker-python-sdk/blob/3816a5658d3737c9767e01bc8d37fc3ed5551593/tests/integ/test_tfs.py#L37) and
 [here](https://github.com/aws/sagemaker-python-sdk/blob/3816a5658d3737c9767e01bc8d37fc3ed5551593/tests/integ/test_tuner.py#L616), or by searching for "unique\_name\_from\_base" in our test code.
diff --git a/README.rst b/README.rst
@@ -90,6 +90,7 @@ SageMaker Python SDK is tested on:
 - Python 3.7
 - Python 3.8
 - Python 3.9
+- Python 3.10
 
 AWS Permissions
 ~~~~~~~~~~~~~~~
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.106.1.dev0
+2.107.1.dev0
diff --git a/requirements/extras/test_requirements.txt b/requirements/extras/test_requirements.txt
@@ -11,7 +11,7 @@ contextlib2==21.6.0
 awslogs==0.14.0
 black==22.3.0
 stopit==1.1.2
-apache-airflow==2.2.4
+apache-airflow==2.3.4
 apache-airflow-providers-amazon==4.0.0
 attrs==20.3.0
 fabric==2.6.0
diff --git a/setup.py b/setup.py
@@ -94,6 +94,7 @@ def read_requirements(filename):
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
     ],
     install_requires=required_packages,
     extras_require=extras,
diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py
@@ -167,10 +167,44 @@ def __init__(
             instance_type (str): Type of EC2 instance to use for training,
                 for example, ``'ml.c4.xlarge'``. Required if instance_groups is
                 not set.
-            volume_size (int): Size in GB of the EBS volume to use for
-                storing input data during training (default: 30). Must be large
-                enough to store training data if File Mode is used (which is the
-                default).
+            volume_size (int): Size in GB of the storage volume to use for
+                storing input and output data during training (default: 30).
+
+                Must be large enough to store training data if File mode is
+                used, which is the default mode.
+
+                When you use an ML instance with the EBS-only storage option
+                such as ``ml.c5`` and ``ml.p2``,
+                you must define the size of the EBS
+                volume through the ``volume_size`` parameter in the estimator class.
+
+                .. note::
+
+                    When you use an ML instance with `NVMe SSD volumes
+                    <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ssd-instance-store.html#nvme-ssd-volumes>`_
+                    such as ``ml.p4d``, ``ml.g4dn``, and ``ml.g5``,
+                    do not include this parameter in the estimator configuration.
+                    If you use one of those ML instance types,
+                    SageMaker doesn't provision Amazon EBS General Purpose SSD (gp2)
+                    storage and doesn't use this parameter to adjust the NVMe instance storage.
+                    Available storage is fixed to the NVMe instance storage
+                    capacity. SageMaker configures storage paths for training
+                    datasets, checkpoints, model artifacts, and outputs to use the
+                    entire capacity of the instance storage.
+
+                    Note that if you include this parameter and specify a number that
+                    exceeds the size of the NVMe volume attached to the instance type,
+                    SageMaker returns an ``Invalid VolumeSizeInGB`` error.
+
+                To look up instance types and their instance storage types
+                and volumes, see `Amazon EC2 Instance Types
+                <https://aws.amazon.com/ec2/instance-types/>`_.
+
+                To find the default local paths defined by the SageMaker
+                training platform, see `Amazon SageMaker Training Storage
+                Folders for Training Datasets, Checkpoints, Model Artifacts,
+                and Outputs
+                <https://docs.aws.amazon.com/sagemaker/latest/dg/model-train-storage.html>`_.
             volume_kms_key (str): Optional. KMS key ID for encrypting EBS
                 volume attached to the training instance (default: None).
             max_run (int): Timeout in seconds for training (default: 24 *
@@ -2233,12 +2267,46 @@ def __init__(
             instance_count (int): Number of Amazon EC2 instances to use
                 for training. Required if instance_groups is not set.
             instance_type (str): Type of EC2 instance to use for training,
-                for example, 'ml.c4.xlarge'. Required if instance_groups is
+                for example, ``'ml.c4.xlarge'``. Required if instance_groups is
                 not set.
-            volume_size (int): Size in GB of the EBS volume to use for
-                storing input data during training (default: 30). Must be large
-                enough to store training data if File Mode is used (which is the
-                default).
+            volume_size (int): Size in GB of the storage volume to use for
+                storing input and output data during training (default: 30).
+
+                Must be large enough to store training data if File mode is
+                used, which is the default mode.
+
+                When you use an ML instance with the EBS-only storage option
+                such as ``ml.c5`` and ``ml.p2``,
+                you must define the size of the EBS
+                volume through the ``volume_size`` parameter in the estimator class.
+
+                .. note::
+
+                    When you use an ML instance with `NVMe SSD volumes
+                    <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ssd-instance-store.html#nvme-ssd-volumes>`_
+                    such as ``ml.p4d``, ``ml.g4dn``, and ``ml.g5``,
+                    do not include this parameter in the estimator configuration.
+                    If you use one of those ML instance types,
+                    SageMaker doesn't provision Amazon EBS General Purpose SSD (gp2)
+                    storage and doesn't use this parameter to adjust the NVMe instance storage.
+                    Available storage is fixed to the NVMe instance storage
+                    capacity. SageMaker configures storage paths for training
+                    datasets, checkpoints, model artifacts, and outputs to use the
+                    entire capacity of the instance storage.
+
+                    Note that if you include this parameter and specify a number that
+                    exceeds the size of the NVMe volume attached to the instance type,
+                    SageMaker returns an ``Invalid VolumeSizeInGB`` error.
+
+                To look up instance types and their instance storage types
+                and volumes, see `Amazon EC2 Instance Types
+                <https://aws.amazon.com/ec2/instance-types/>`_.
+
+                To find the default local paths defined by the SageMaker
+                training platform, see `Amazon SageMaker Training Storage
+                Folders for Training Datasets, Checkpoints, Model Artifacts,
+                and Outputs
+                <https://docs.aws.amazon.com/sagemaker/latest/dg/model-train-storage.html>`_.
             volume_kms_key (str): Optional. KMS key ID for encrypting EBS
                 volume attached to the training instance (default: None).
             max_run (int): Timeout in seconds for training (default: 24 *
diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py
@@ -41,6 +41,7 @@
     secondary_training_status_changed,
     secondary_training_status_message,
     sts_regional_endpoint,
+    retries,
 )
 from sagemaker import exceptions
 from sagemaker.session_settings import SessionSettings
@@ -4699,21 +4700,30 @@ def _train_done(sagemaker_client, job_name, last_desc):
     """Placeholder docstring"""
     in_progress_statuses = ["InProgress", "Created"]
 
-    desc = sagemaker_client.describe_training_job(TrainingJobName=job_name)
-    status = desc["TrainingJobStatus"]
+    for _ in retries(
+        max_retry_count=10,  # 10*30 = 5min
+        exception_message_prefix="Waiting for schedule to leave 'Pending' status",
+        seconds_to_sleep=30,
+    ):
+        try:
+            desc = sagemaker_client.describe_training_job(TrainingJobName=job_name)
+            status = desc["TrainingJobStatus"]
 
-    if secondary_training_status_changed(desc, last_desc):
-        print()
-        print(secondary_training_status_message(desc, last_desc), end="")
-    else:
-        print(".", end="")
-    sys.stdout.flush()
+            if secondary_training_status_changed(desc, last_desc):
+                print()
+                print(secondary_training_status_message(desc, last_desc), end="")
+            else:
+                print(".", end="")
+            sys.stdout.flush()
 
-    if status in in_progress_statuses:
-        return desc, False
+            if status in in_progress_statuses:
+                return desc, False
 
-    print()
-    return desc, True
+            print()
+            return desc, True
+        except botocore.exceptions.ClientError as err:
+            if err.response["Error"]["Code"] == "AccessDeniedException":
+                pass
 
 
 def _processing_job_status(sagemaker_client, job_name):
diff --git a/tests/integ/test_airflow_config.py b/tests/integ/test_airflow_config.py
@@ -14,9 +14,9 @@
 
 import os
 
-import airflow
 import pytest
 import numpy as np
+from airflow import utils
 from airflow import DAG
 from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator
 from airflow.contrib.operators.sagemaker_transform_operator import SageMakerTransformOperator
@@ -624,7 +624,7 @@ def _build_airflow_workflow(estimator, instance_type, inputs=None, mini_batch_si
 
     default_args = {
         "owner": "airflow",
-        "start_date": airflow.utils.dates.days_ago(2),
+        "start_date": utils.dates.days_ago(2),
         "provide_context": True,
     }