Commit fecc3c8
Merge branch 'master' into eia_151
2 parents: 3ac5bf8 + e521b87

11 files changed: +175 -16 lines

CHANGELOG.md

+13
@@ -1,5 +1,18 @@
 # Changelog
 
+## v2.53.0 (2021-08-12)
+
+### Features
+
+ * support tuning step parameter range parameterization + support retry strategy in tuner
+
+## v2.52.2.post0 (2021-08-11)
+
+### Documentation Changes
+
+ * clarify that default_bucket creates a bucket
+ * Minor updates to Clarify API documentation
+
 ## v2.52.2 (2021-08-10)
 
 ### Bug Fixes and Other Changes

VERSION

+1 -1
@@ -1 +1 @@
-2.52.3.dev0
+2.53.1.dev0

doc/workflows/pipelines/sagemaker.workflow.pipelines.rst

-1
@@ -5,7 +5,6 @@ ConditionStep
 -------------
 
 .. autoclass:: sagemaker.workflow.condition_step.ConditionStep
-
 .. deprecated:: sagemaker.workflow.condition_step.JsonGet
 
 Conditions

src/sagemaker/local/local_session.py

+15
@@ -571,6 +571,21 @@ def logs_for_job(self, job_name, wait=False, poll=5, log_type="All"):
         # on local mode.
         pass  # pylint: disable=unnecessary-pass
 
+    def logs_for_processing_job(self, job_name, wait=False, poll=10):
+        """A no-op method meant to override the sagemaker client.
+
+        Args:
+            job_name:
+            wait: (Default value = False)
+            poll: (Default value = 10)
+
+        Returns:
+
+        """
+        # override logs_for_job() as it doesn't need to perform any action
+        # on local mode.
+        pass  # pylint: disable=unnecessary-pass
+
 
 class file_input(object):
     """Amazon SageMaker channel configuration for FILE data sources, used in local mode."""

src/sagemaker/parameter.py

+10 -4
@@ -12,7 +12,9 @@
 # language governing permissions and limitations under the License.
 """Placeholder docstring"""
 from __future__ import absolute_import
+
 import json
+from sagemaker.workflow.parameters import Parameter as PipelineParameter
 
 
 class ParameterRange(object):
@@ -68,8 +70,12 @@ def as_tuning_range(self, name):
         """
         return {
             "Name": name,
-            "MinValue": str(self.min_value),
-            "MaxValue": str(self.max_value),
+            "MinValue": str(self.min_value)
+            if not isinstance(self.min_value, PipelineParameter)
+            else self.min_value,
+            "MaxValue": str(self.max_value)
+            if not isinstance(self.max_value, PipelineParameter)
+            else self.max_value,
             "ScalingType": self.scaling_type,
         }
 
@@ -103,9 +109,9 @@ def __init__(self, values):  # pylint: disable=super-init-not-called
             This input will be converted into a list of strings.
         """
         if isinstance(values, list):
-            self.values = [str(v) for v in values]
+            self.values = [str(v) if not isinstance(v, PipelineParameter) else v for v in values]
         else:
-            self.values = [str(values) if not isinstance(values, PipelineParameter) else values]
 
     def as_tuning_range(self, name):
         """Represent the parameter range as a dictionary.

src/sagemaker/session.py

+6
@@ -357,6 +357,8 @@ def list_s3_files(self, bucket, key_prefix):
     def default_bucket(self):
         """Return the name of the default bucket to use in relevant Amazon SageMaker interactions.
 
+        This function will create the s3 bucket if it does not exist.
+
         Returns:
             str: The name of the default bucket, which is of the form:
             ``sagemaker-{region}-{AWS account ID}``.
@@ -2211,6 +2213,7 @@ def _map_training_config(
         use_spot_instances=False,
         checkpoint_s3_uri=None,
         checkpoint_local_path=None,
+        max_retry_attempts=None,
     ):
         """Construct a dictionary of training job configuration from the arguments.
 
@@ -2264,6 +2267,7 @@
             objective_metric_name (str): Name of the metric for evaluating training jobs.
             parameter_ranges (dict): Dictionary of parameter ranges. These parameter ranges can
                 be one of three types: Continuous, Integer, or Categorical.
+            max_retry_attempts (int): The number of times to retry the job.
 
         Returns:
             A dictionary of training job configuration. For format details, please refer to
@@ -2320,6 +2324,8 @@
         if parameter_ranges is not None:
            training_job_definition["HyperParameterRanges"] = parameter_ranges
 
+        if max_retry_attempts is not None:
+            training_job_definition["RetryStrategy"] = {"MaximumRetryAttempts": max_retry_attempts}
         return training_job_definition
 
     def stop_tuning_job(self, name):
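For reference, a sketch of the block this adds to the training job definition (key names as in the diff; the surrounding configuration is elided):

# Sketch: with max_retry_attempts set, _map_training_config() appends a
# RetryStrategy block to the training job definition it builds.
max_retry_attempts = 3
training_job_definition = {}  # other keys elided
if max_retry_attempts is not None:
    training_job_definition["RetryStrategy"] = {"MaximumRetryAttempts": max_retry_attempts}
assert training_job_definition["RetryStrategy"] == {"MaximumRetryAttempts": 3}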

src/sagemaker/tuner.py

+7 -1
@@ -1507,7 +1507,10 @@ def _get_tuner_args(cls, tuner, inputs):
 
         if tuner.estimator is not None:
             tuner_args["training_config"] = cls._prepare_training_config(
-                inputs, tuner.estimator, tuner.static_hyperparameters, tuner.metric_definitions
+                inputs=inputs,
+                estimator=tuner.estimator,
+                static_hyperparameters=tuner.static_hyperparameters,
+                metric_definitions=tuner.metric_definitions,
             )
 
         if tuner.estimator_dict is not None:
@@ -1580,6 +1583,9 @@ def _prepare_training_config(
         if parameter_ranges is not None:
             training_config["parameter_ranges"] = parameter_ranges
 
+        if estimator.max_retry_attempts is not None:
+            training_config["max_retry_attempts"] = estimator.max_retry_attempts
+
         return training_config
 
     def stop(self):
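Putting the tuner and session changes together, a hedged end-to-end sketch (the image URI and role are placeholders; the metric name and regex are taken from the tests in this commit):

# Sketch: the estimator's max_retry_attempts flows through the tuner into
# the tuning job's per-definition RetryStrategy.
from sagemaker.estimator import Estimator
from sagemaker.tuner import ContinuousParameter, HyperparameterTuner

estimator = Estimator(
    image_uri="<training-image-uri>",  # placeholder
    role="<execution-role-arn>",  # placeholder
    instance_count=1,
    instance_type="ml.m5.xlarge",
    max_retry_attempts=3,  # the job is retried up to 3 times on failure
)
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name="test:acc",
    hyperparameter_ranges={"learning_rate": ContinuousParameter(0.0001, 0.05)},
    metric_definitions=[{"Name": "test:acc", "Regex": "Overall test accuracy: (.*?);"}],
)
# tuner.fit(...) would then launch a tuning job whose training definition
# includes {"RetryStrategy": {"MaximumRetryAttempts": 3}}.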

src/sagemaker/workflow/pipeline.py

+1
@@ -320,6 +320,7 @@ def _interpolate(
     """
     if isinstance(obj, (Expression, Parameter, Properties)):
         return obj.expr
+
     if isinstance(obj, CallbackOutput):
         step_name = callback_output_to_step_map[obj.output_name]
         return obj.expr(step_name)

tests/integ/test_workflow.py

+95 -5
@@ -1075,7 +1075,7 @@ def test_conditional_pytorch_training_model_registration(
     pass
 
 
-def test_tuning(
+def test_tuning_single_algo(
     sagemaker_session,
     role,
     cpu_instance_type,
@@ -1098,14 +1098,17 @@
         role=role,
         framework_version="1.5.0",
         py_version="py3",
-        instance_count=1,
-        instance_type="ml.m5.xlarge",
+        instance_count=instance_count,
+        instance_type=instance_type,
         sagemaker_session=sagemaker_session,
         enable_sagemaker_metrics=True,
+        max_retry_attempts=3,
     )
 
+    min_batch_size = ParameterString(name="MinBatchSize", default_value="64")
+    max_batch_size = ParameterString(name="MaxBatchSize", default_value="128")
     hyperparameter_ranges = {
-        "batch-size": IntegerParameter(64, 128),
+        "batch-size": IntegerParameter(min_batch_size, max_batch_size),
    }
 
     tuner = HyperparameterTuner(
@@ -1161,7 +1164,7 @@
 
     pipeline = Pipeline(
         name=pipeline_name,
-        parameters=[instance_count, instance_type],
+        parameters=[instance_count, instance_type, min_batch_size, max_batch_size],
         steps=[step_tune, step_best_model, step_second_best_model],
         sagemaker_session=sagemaker_session,
     )
@@ -1185,6 +1188,93 @@
         pass
 
 
+def test_tuning_multi_algos(
+    sagemaker_session,
+    role,
+    cpu_instance_type,
+    pipeline_name,
+    region_name,
+):
+    base_dir = os.path.join(DATA_DIR, "pytorch_mnist")
+    entry_point = os.path.join(base_dir, "mnist.py")
+    input_path = sagemaker_session.upload_data(
+        path=os.path.join(base_dir, "training"),
+        key_prefix="integ-test-data/pytorch_mnist/training",
+    )
+
+    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
+    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
+
+    pytorch_estimator = PyTorch(
+        entry_point=entry_point,
+        role=role,
+        framework_version="1.5.0",
+        py_version="py3",
+        instance_count=instance_count,
+        instance_type=instance_type,
+        sagemaker_session=sagemaker_session,
+        enable_sagemaker_metrics=True,
+        max_retry_attempts=3,
+    )
+
+    min_batch_size = ParameterString(name="MinBatchSize", default_value="64")
+    max_batch_size = ParameterString(name="MaxBatchSize", default_value="128")
+
+    tuner = HyperparameterTuner.create(
+        estimator_dict={
+            "estimator-1": pytorch_estimator,
+            "estimator-2": pytorch_estimator,
+        },
+        objective_metric_name_dict={
+            "estimator-1": "test:acc",
+            "estimator-2": "test:acc",
+        },
+        hyperparameter_ranges_dict={
+            "estimator-1": {"batch-size": IntegerParameter(min_batch_size, max_batch_size)},
+            "estimator-2": {"batch-size": IntegerParameter(min_batch_size, max_batch_size)},
+        },
+        metric_definitions_dict={
+            "estimator-1": [{"Name": "test:acc", "Regex": "Overall test accuracy: (.*?);"}],
+            "estimator-2": [{"Name": "test:acc", "Regex": "Overall test accuracy: (.*?);"}],
+        },
+    )
+    inputs = {
+        "estimator-1": TrainingInput(s3_data=input_path),
+        "estimator-2": TrainingInput(s3_data=input_path),
+    }
+
+    step_tune = TuningStep(
+        name="my-tuning-step",
+        tuner=tuner,
+        inputs=inputs,
+    )
+
+    pipeline = Pipeline(
+        name=pipeline_name,
+        parameters=[instance_count, instance_type, min_batch_size, max_batch_size],
+        steps=[step_tune],
+        sagemaker_session=sagemaker_session,
+    )
+
+    try:
+        response = pipeline.create(role)
+        create_arn = response["PipelineArn"]
+        assert re.match(
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", create_arn
+        )
+
+        execution = pipeline.start(parameters={})
+        assert re.match(
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
+            execution.arn,
+        )
+    finally:
+        try:
+            pipeline.delete()
+        except Exception:
+            pass
+
+
 def test_mxnet_model_registration(
     sagemaker_session,
     role,

tests/unit/sagemaker/workflow/test_steps.py

+13 -4
@@ -716,14 +716,16 @@ def test_multi_algo_tuning_step(sagemaker_session):
     data_source_uri_parameter = ParameterString(
         name="DataSourceS3Uri", default_value=f"s3://{BUCKET}/train_manifest"
     )
+    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
     estimator = Estimator(
         image_uri=IMAGE_URI,
         role=ROLE,
-        instance_count=1,
+        instance_count=instance_count,
         instance_type="ml.c5.4xlarge",
         profiler_config=ProfilerConfig(system_monitor_interval_millis=500),
         rules=[],
         sagemaker_session=sagemaker_session,
+        max_retry_attempts=10,
     )
 
     estimator.set_hyperparameters(
@@ -739,8 +741,9 @@
         augmentation_type="crop",
     )
 
+    initial_lr_param = ParameterString(name="InitialLR", default_value="0.0001")
     hyperparameter_ranges = {
-        "learning_rate": ContinuousParameter(0.0001, 0.05),
+        "learning_rate": ContinuousParameter(initial_lr_param, 0.05),
         "momentum": ContinuousParameter(0.0, 0.99),
         "weight_decay": ContinuousParameter(0.0, 0.99),
     }
@@ -825,7 +828,7 @@
                 "ContinuousParameterRanges": [
                     {
                         "Name": "learning_rate",
-                        "MinValue": "0.0001",
+                        "MinValue": initial_lr_param,
                         "MaxValue": "0.05",
                         "ScalingType": "Auto",
                     },
@@ -845,6 +848,9 @@
                     "CategoricalParameterRanges": [],
                     "IntegerParameterRanges": [],
                 },
+                "RetryStrategy": {
+                    "MaximumRetryAttempts": 10,
+                },
             },
             {
                 "StaticHyperParameters": {
@@ -889,7 +895,7 @@
                 "ContinuousParameterRanges": [
                     {
                         "Name": "learning_rate",
-                        "MinValue": "0.0001",
+                        "MinValue": initial_lr_param,
                         "MaxValue": "0.05",
                         "ScalingType": "Auto",
                     },
@@ -909,6 +915,9 @@
                     "CategoricalParameterRanges": [],
                     "IntegerParameterRanges": [],
                 },
+                "RetryStrategy": {
+                    "MaximumRetryAttempts": 10,
+                },
             },
         ],
     },

tests/unit/test_local_session.py

+14
@@ -551,6 +551,20 @@ def test_describe_transform_job_does_not_exist(LocalSession, _LocalTransformJob):
     local_sagemaker_client.describe_transform_job("transform-job-does-not-exist")
 
 
+@patch("sagemaker.local.image._SageMakerContainer.process")
+@patch("sagemaker.local.local_session.LocalSession")
+def test_logs_for_job(process, LocalSession):
+    local_job_logs = LocalSession.logs_for_job("my-processing-job")
+    assert local_job_logs is not None
+
+
+@patch("sagemaker.local.image._SageMakerContainer.process")
+@patch("sagemaker.local.local_session.LocalSession")
+def test_logs_for_processing_job(process, LocalSession):
+    local_processing_job_logs = LocalSession.logs_for_processing_job("my-processing-job")
+    assert local_processing_job_logs is not None
+
+
 @patch("sagemaker.local.local_session.LocalSession")
 def test_describe_endpoint_config(LocalSession):
     local_sagemaker_client = sagemaker.local.local_session.LocalSagemakerClient()
