Merge branch 'master' into pipeline-experiment-config

ajaykarpur · web-flow · commit 331d24fd8bdc · 2021-05-13T14:51:01.000Z
diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py
@@ -124,6 +124,7 @@ def __init__(
         profiler_config=None,
         disable_profiler=False,
         environment=None,
+        max_retry_attempts=None,
         **kwargs,
     ):
         """Initialize an ``EstimatorBase`` instance.
@@ -269,6 +270,13 @@ def __init__(
                 will be disabled (default: ``False``).
             environment (dict[str, str]) : Environment variables to be set for
                 use during training job (default: ``None``)
+            max_retry_attempts (int): The number of times to move a job to the STARTING status.
+                You can specify between 1 and 30 attempts.
+                If the value of attempts is greater than zero,
+                the job is retried on InternalServerFailure
+                the same number of attempts as the value.
+                You can cap the total duration for your job by setting ``max_wait`` and ``max_run``
+                (default: ``None``)
 
         """
         instance_count = renamed_kwargs(
@@ -357,6 +365,8 @@ def __init__(
 
         self.environment = environment
 
+        self.max_retry_attempts = max_retry_attempts
+
         if not _region_supports_profiler(self.sagemaker_session.boto_region_name):
             self.disable_profiler = True
 
@@ -1114,6 +1124,13 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na
             if max_wait:
                 init_params["max_wait"] = max_wait
 
+        if job_details.get("RetryStrategy", False):
+            init_params["max_retry_attempts"] = job_details.get("RetryStrategy", {}).get(
+                "MaximumRetryAttempts"
+            )
+            max_wait = job_details.get("StoppingCondition", {}).get("MaxWaitTimeInSeconds")
+            if max_wait:
+                init_params["max_wait"] = max_wait
         return init_params
 
     def transformer(
@@ -1489,6 +1506,11 @@ def _get_train_args(cls, estimator, inputs, experiment_config):
         if estimator.enable_network_isolation():
             train_args["enable_network_isolation"] = True
 
+        if estimator.max_retry_attempts is not None:
+            train_args["retry_strategy"] = {"MaximumRetryAttempts": estimator.max_retry_attempts}
+        else:
+            train_args["retry_strategy"] = None
+
         if estimator.encrypt_inter_container_traffic:
             train_args["encrypt_inter_container_traffic"] = True
 
@@ -1666,6 +1688,7 @@ def __init__(
         profiler_config=None,
         disable_profiler=False,
         environment=None,
+        max_retry_attempts=None,
         **kwargs,
     ):
         """Initialize an ``Estimator`` instance.
@@ -1816,6 +1839,13 @@ def __init__(
                 will be disabled (default: ``False``).
             environment (dict[str, str]) : Environment variables to be set for
                 use during training job (default: ``None``)
+            max_retry_attempts (int): The number of times to move a job to the STARTING status.
+                You can specify between 1 and 30 attempts.
+                If the value of attempts is greater than zero,
+                the job is retried on InternalServerFailure
+                the same number of attempts as the value.
+                You can cap the total duration for your job by setting ``max_wait`` and ``max_run``
+                (default: ``None``)
         """
         self.image_uri = image_uri
         self.hyperparam_dict = hyperparameters.copy() if hyperparameters else {}
@@ -1850,6 +1880,7 @@ def __init__(
             profiler_config=profiler_config,
             disable_profiler=disable_profiler,
             environment=environment,
+            max_retry_attempts=max_retry_attempts,
             **kwargs,
         )
 
diff --git a/src/sagemaker/image_uri_config/data-wrangler.json b/src/sagemaker/image_uri_config/data-wrangler.json
@@ -0,0 +1,33 @@
+{
+  "processing": {
+    "versions": {
+      "1.x": {
+        "registries": {
+          "af-south-1": "143210264188",
+          "ap-east-1": "707077482487",
+          "ap-northeast-1": "649008135260",
+          "ap-northeast-2": "131546521161",
+          "ap-south-1": "089933028263",
+          "ap-southeast-1": "119527597002",
+          "ap-southeast-2": "422173101802",
+          "ca-central-1": "557239378090",
+          "eu-central-1": "024640144536",
+          "eu-north-1": "054986407534",
+          "eu-south-1": "488287956546",
+          "eu-west-1": "245179582081",
+          "eu-west-2": "894491911112",
+          "eu-west-3": "807237891255",
+          "me-south-1": "376037874950",
+          "sa-east-1": "424196993095",
+          "us-east-1": "663277389841",
+          "us-east-2": "415577184552",
+          "us-west-1": "926135532090",
+          "us-west-2": "174368400705",
+          "cn-north-1": "245909111842",
+          "cn-northwest-1": "249157047649"
+        },
+        "repository": "sagemaker-data-wrangler-container"
+      }
+    }
+  }
+}
diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py
@@ -457,6 +457,7 @@ def train(  # noqa: C901
         profiler_rule_configs=None,
         profiler_config=None,
         environment=None,
+        retry_strategy=None,
     ):
         """Create an Amazon SageMaker training job.
 
@@ -529,6 +530,9 @@ def train(  # noqa: C901
                 with SageMaker Profiler. (default: ``None``).
             environment (dict[str, str]) : Environment variables to be set for
                 use during training job (default: ``None``)
+            retry_strategy (dict): Defines RetryStrategy for InternalServerFailures.
+                * max_retry_attempts (int): Number of times a job should be retried.
+                The key in RetryStrategy is 'MaximumRetryAttempts'.
 
         Returns:
             str: ARN of the training job, if it is created.
@@ -561,6 +565,7 @@ def train(  # noqa: C901
             profiler_rule_configs=profiler_rule_configs,
             profiler_config=profiler_config,
             environment=environment,
+            retry_strategy=retry_strategy,
         )
         LOGGER.info("Creating training-job with name: %s", job_name)
         LOGGER.debug("train request: %s", json.dumps(train_request, indent=4))
@@ -594,6 +599,7 @@ def _get_train_request(  # noqa: C901
         profiler_rule_configs=None,
         profiler_config=None,
         environment=None,
+        retry_strategy=None,
     ):
         """Constructs a request compatible for creating an Amazon SageMaker training job.
 
@@ -665,6 +671,9 @@ def _get_train_request(  # noqa: C901
                 SageMaker Profiler. (default: ``None``).
             environment (dict[str, str]) : Environment variables to be set for
                 use during training job (default: ``None``)
+            retry_strategy (dict): Defines RetryStrategy for InternalServerFailures.
+                * max_retry_attempts (int): Number of times a job should be retried.
+                The key in RetryStrategy is 'MaximumRetryAttempts'.
 
         Returns:
             Dict: a training request dict
@@ -749,6 +758,9 @@ def _get_train_request(  # noqa: C901
         if profiler_config is not None:
             train_request["ProfilerConfig"] = profiler_config
 
+        if retry_strategy is not None:
+            train_request["RetryStrategy"] = retry_strategy
+
         return train_request
 
     def update_training_job(
diff --git a/tests/integ/test_tf.py b/tests/integ/test_tf.py
@@ -61,6 +61,8 @@ def test_mnist_with_checkpoint_config(
         checkpoint_s3_uri=checkpoint_s3_uri,
         checkpoint_local_path=checkpoint_local_path,
         environment=ENV_INPUT,
+        max_wait=24 * 60 * 60,
+        max_retry_attempts=2,
     )
     inputs = estimator.sagemaker_session.upload_data(
         path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist"
@@ -89,8 +91,16 @@ def test_mnist_with_checkpoint_config(
             "Environment"
         ]
     )
+
+    expected_retry_strategy = {
+        "MaximumRetryAttempts": 2,
+    }
+    actual_retry_strategy = sagemaker_session.sagemaker_client.describe_training_job(
+        TrainingJobName=training_job_name
+    )["RetryStrategy"]
     assert actual_training_checkpoint_config == expected_training_checkpoint_config
     assert actual_training_environment_variable_config == ENV_INPUT
+    assert actual_retry_strategy == expected_retry_strategy
 
 
 def test_server_side_encryption(sagemaker_session, tf_full_version, tf_full_py_version):
diff --git a/tests/unit/sagemaker/huggingface/test_estimator.py b/tests/unit/sagemaker/huggingface/test_estimator.py
@@ -150,6 +150,7 @@ def _create_train_job(version, base_framework_version):
         "vpc_config": None,
         "metric_definitions": None,
         "environment": None,
+        "retry_strategy": None,
         "experiment_config": None,
         "debugger_hook_config": {
             "CollectionConfigurations": [],
diff --git a/tests/unit/sagemaker/image_uris/test_data_wrangler.py b/tests/unit/sagemaker/image_uris/test_data_wrangler.py
@@ -0,0 +1,55 @@
+# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+from sagemaker import image_uris
+from tests.unit.sagemaker.image_uris import expected_uris, regions
+
+DATA_WRANGLER_ACCOUNTS = {
+    "af-south-1": "143210264188",
+    "ap-east-1": "707077482487",
+    "ap-northeast-1": "649008135260",
+    "ap-northeast-2": "131546521161",
+    "ap-south-1": "089933028263",
+    "ap-southeast-1": "119527597002",
+    "ap-southeast-2": "422173101802",
+    "ca-central-1": "557239378090",
+    "eu-central-1": "024640144536",
+    "eu-north-1": "054986407534",
+    "eu-south-1": "488287956546",
+    "eu-west-1": "245179582081",
+    "eu-west-2": "894491911112",
+    "eu-west-3": "807237891255",
+    "me-south-1": "376037874950",
+    "sa-east-1": "424196993095",
+    "us-east-1": "663277389841",
+    "us-east-2": "415577184552",
+    "us-west-1": "926135532090",
+    "us-west-2": "174368400705",
+    "cn-north-1": "245909111842",
+    "cn-northwest-1": "249157047649",
+}
+
+
+def test_data_wrangler_ecr_uri():
+    for region in regions.regions():
+        if region in DATA_WRANGLER_ACCOUNTS.keys():
+            actual_uri = image_uris.retrieve("data-wrangler", region=region)
+
+            expected_uri = expected_uris.algo_uri(
+                "sagemaker-data-wrangler-container",
+                DATA_WRANGLER_ACCOUNTS[region],
+                region,
+                version="1.x",
+            )
+            assert expected_uri == actual_uri
diff --git a/tests/unit/sagemaker/tensorflow/test_estimator.py b/tests/unit/sagemaker/tensorflow/test_estimator.py
@@ -127,6 +127,7 @@ def _create_train_job(tf_version, horovod=False, ps=False, py_version="py2", smd
         },
         "hyperparameters": _hyperparameters(horovod, smdataparallel),
         "stop_condition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
+        "retry_strategy": None,
         "tags": None,
         "vpc_config": None,
         "metric_definitions": None,
diff --git a/tests/unit/test_chainer.py b/tests/unit/test_chainer.py
@@ -140,6 +140,7 @@ def _create_train_job(version, py_version):
             "sagemaker_region": '"us-west-2"',
         },
         "stop_condition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
+        "retry_strategy": None,
         "tags": None,
         "vpc_config": None,
         "metric_definitions": None,
diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py
@@ -245,6 +245,7 @@ def test_framework_all_init_args(sagemaker_session):
         enable_sagemaker_metrics=True,
         enable_network_isolation=True,
         environment=ENV_INPUT,
+        max_retry_attempts=2,
     )
     _TrainingJob.start_new(f, "s3://mydata", None)
     sagemaker_session.train.assert_called_once()
@@ -269,6 +270,7 @@ def test_framework_all_init_args(sagemaker_session):
         "output_config": {"KmsKeyId": "outputkms", "S3OutputPath": "outputpath"},
         "vpc_config": {"Subnets": ["123", "456"], "SecurityGroupIds": ["789", "012"]},
         "stop_condition": {"MaxRuntimeInSeconds": 456},
+        "retry_strategy": {"MaximumRetryAttempts": 2},
         "role": sagemaker_session.expand_role(),
         "job_name": None,
         "resource_config": {
@@ -1092,6 +1094,7 @@ def test_framework_with_spot_and_checkpoints(sagemaker_session):
         "checkpoint_local_path": "/tmp/checkpoints",
         "environment": None,
         "experiment_config": None,
+        "retry_strategy": None,
     }
 
 
@@ -2392,6 +2395,7 @@ def test_unsupported_type_in_dict():
         "VolumeSizeInGB": 30,
     },
     "stop_condition": {"MaxRuntimeInSeconds": 86400},
+    "retry_strategy": None,
     "tags": None,
     "vpc_config": None,
     "metric_definitions": None,
@@ -2703,6 +2707,24 @@ def test_add_environment_variables_to_train_args(sagemaker_session):
     assert args["environment"] == ENV_INPUT
 
 
+def test_add_retry_strategy_to_train_args(sagemaker_session):
+    e = Estimator(
+        IMAGE_URI,
+        ROLE,
+        INSTANCE_COUNT,
+        INSTANCE_TYPE,
+        output_path=OUTPUT_PATH,
+        sagemaker_session=sagemaker_session,
+        max_retry_attempts=2,
+    )
+
+    e.fit()
+
+    sagemaker_session.train.assert_called_once()
+    args = sagemaker_session.train.call_args[1]
+    assert args["retry_strategy"] == {"MaximumRetryAttempts": 2}
+
+
 def test_generic_to_fit_with_sagemaker_metrics_enabled(sagemaker_session):
     e = Estimator(
         IMAGE_URI,
@@ -3159,6 +3181,25 @@ def test_prepare_init_params_from_job_description_with_spot_training():
     assert init_params["max_wait"] == 87000
 
 
+def test_prepare_init_params_from_job_description_with_retry_strategy():
+    job_description = RETURNED_JOB_DESCRIPTION.copy()
+    job_description["RetryStrategy"] = {"MaximumRetryAttempts": 2}
+    job_description["StoppingCondition"] = {
+        "MaxRuntimeInSeconds": 86400,
+        "MaxWaitTimeInSeconds": 87000,
+    }
+
+    init_params = EstimatorBase._prepare_init_params_from_job_description(
+        job_details=job_description
+    )
+
+    assert init_params["role"] == "arn:aws:iam::366:role/SageMakerRole"
+    assert init_params["instance_count"] == 1
+    assert init_params["max_run"] == 86400
+    assert init_params["max_wait"] == 87000
+    assert init_params["max_retry_attempts"] == 2
+
+
 def test_prepare_init_params_from_job_description_with_invalid_training_job():
 
     invalid_job_description = RETURNED_JOB_DESCRIPTION.copy()
diff --git a/tests/unit/test_mxnet.py b/tests/unit/test_mxnet.py
@@ -147,6 +147,7 @@ def _get_train_args(job_name):
         "vpc_config": None,
         "metric_definitions": None,
         "environment": None,
+        "retry_strategy": None,
         "experiment_config": None,
         "debugger_hook_config": {
             "CollectionConfigurations": [],
diff --git a/tests/unit/test_pytorch.py b/tests/unit/test_pytorch.py
@@ -149,6 +149,7 @@ def _create_train_job(version, py_version):
         "vpc_config": None,
         "metric_definitions": None,
         "environment": None,
+        "retry_strategy": None,
         "experiment_config": None,
         "debugger_hook_config": {
             "CollectionConfigurations": [],
diff --git a/tests/unit/test_rl.py b/tests/unit/test_rl.py
@@ -162,6 +162,7 @@ def _create_train_job(toolkit, toolkit_version, framework):
         "profiler_config": {
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
+        "retry_strategy": None,
     }
 
 
diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py
diff --git a/tests/unit/test_sklearn.py b/tests/unit/test_sklearn.py
diff --git a/tests/unit/test_xgboost.py b/tests/unit/test_xgboost.py

Original file line number	Diff line number	Diff line change
`@@ -162,6 +162,7 @@ def _create_train_job(toolkit, toolkit_version, framework):`
`162`	`162`	`"profiler_config": {`
`163`	`163`	`"S3OutputPath": "s3://{}/".format(BUCKET_NAME),`
`164`	`164`	`},`
	`165`	`+ "retry_strategy": None,`
`165`	`166`	`}`
`166`	`167`
`167`	`168`