
Commit cca24a5

Anton Repushko committed
feature: add support of the intelligent stopping in the tuner
1 parent d2d377f commit cca24a5

4 files changed: +147 -70 lines

src/sagemaker/session.py

Lines changed: 20 additions & 0 deletions
@@ -2189,7 +2189,9 @@ def tune(  # noqa: C901
         stop_condition,
         tags,
         warm_start_config,
+        max_runtime_in_seconds=None,
         strategy_config=None,
+        completion_criteria_config=None,
         enable_network_isolation=False,
         image_uri=None,
         algorithm_arn=None,
@@ -2256,6 +2258,10 @@ def tune(  # noqa: C901
                 https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html.
             warm_start_config (dict): Configuration defining the type of warm start and
                 other required configurations.
+            max_runtime_in_seconds (int or PipelineVariable): The maximum time in seconds
+                that a training job launched by a hyperparameter tuning job can run.
+            completion_criteria_config (sagemaker.tuner.TuningJobCompletionCriteriaConfig): A
+                configuration for the completion criteria.
             early_stopping_type (str): Specifies whether early stopping is enabled for the job.
                 Can be either 'Auto' or 'Off'. If set to 'Off', early stopping will not be
                 attempted. If set to 'Auto', early stopping of some training jobs may happen, but
@@ -2311,12 +2317,14 @@ def tune(  # noqa: C901
                 strategy=strategy,
                 max_jobs=max_jobs,
                 max_parallel_jobs=max_parallel_jobs,
+                max_runtime_in_seconds=max_runtime_in_seconds,
                 objective_type=objective_type,
                 objective_metric_name=objective_metric_name,
                 parameter_ranges=parameter_ranges,
                 early_stopping_type=early_stopping_type,
                 random_seed=random_seed,
                 strategy_config=strategy_config,
+                completion_criteria_config=completion_criteria_config,
             ),
             "TrainingJobDefinition": self._map_training_config(
                 static_hyperparameters=static_hyperparameters,
@@ -2470,12 +2478,14 @@ def _map_tuning_config(
         strategy,
         max_jobs,
         max_parallel_jobs,
+        max_runtime_in_seconds=None,
         early_stopping_type="Off",
         objective_type=None,
         objective_metric_name=None,
         parameter_ranges=None,
         random_seed=None,
         strategy_config=None,
+        completion_criteria_config=None,
     ):
         """Construct tuning job configuration dictionary.

@@ -2484,6 +2494,8 @@ def _map_tuning_config(
             max_jobs (int): Maximum total number of training jobs to start for the hyperparameter
                 tuning job.
             max_parallel_jobs (int): Maximum number of parallel training jobs to start.
+            max_runtime_in_seconds (int or PipelineVariable): The maximum time in seconds
+                that a training job launched by a hyperparameter tuning job can run.
             early_stopping_type (str): Specifies whether early stopping is enabled for the job.
                 Can be either 'Auto' or 'Off'. If set to 'Off', early stopping will not be
                 attempted. If set to 'Auto', early stopping of some training jobs may happen,
@@ -2498,6 +2510,8 @@ def _map_tuning_config(
                 produce more consistent configurations for the same tuning job.
             strategy_config (dict): A configuration for the hyperparameter tuning job optimisation
                 strategy.
+            completion_criteria_config (dict): A configuration
+                for the completion criteria.

         Returns:
             A dictionary of tuning job configuration. For format details, please refer to
@@ -2514,6 +2528,9 @@ def _map_tuning_config(
             "TrainingJobEarlyStoppingType": early_stopping_type,
         }

+        if max_runtime_in_seconds is not None:
+            tuning_config["ResourceLimits"]["MaxRuntimeInSeconds"] = max_runtime_in_seconds
+
         if random_seed is not None:
             tuning_config["RandomSeed"] = random_seed

@@ -2526,6 +2543,9 @@ def _map_tuning_config(

         if strategy_config is not None:
             tuning_config["StrategyConfig"] = strategy_config
+
+        if completion_criteria_config is not None:
+            tuning_config["TuningJobCompletionCriteria"] = completion_criteria_config
         return tuning_config

     @classmethod
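
For orientation (not part of the diff): a minimal sketch of the HyperParameterTuningJobConfig fragment that _map_tuning_config emits when both new arguments are supplied. The numeric values are hypothetical; the key names follow the assignments in the hunks above.

# Hypothetical values; the shape follows _map_tuning_config above.
tuning_config = {
    "Strategy": "Bayesian",
    "ResourceLimits": {
        "MaxNumberOfTrainingJobs": 100,
        "MaxParallelTrainingJobs": 10,
        "MaxRuntimeInSeconds": 3600,  # set only when max_runtime_in_seconds is passed
    },
    "TrainingJobEarlyStoppingType": "Off",
    # set only when completion_criteria_config (already in request format) is passed
    "TuningJobCompletionCriteria": {
        "BestObjectiveNotImproving": {"MaxNumberOfTrainingJobsNotImproving": 5},
        "ConvergenceDetected": {"CompleteOnConvergence": "Enabled"},
        "TargetObjectiveMetricValue": 0.42,
    },
}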

src/sagemaker/tuner.py

Lines changed: 112 additions & 69 deletions
@@ -460,6 +460,116 @@ def to_input_req(self):
         }


+class TuningJobCompletionCriteriaConfig(object):
+    """The configuration for a job completion criteria."""
+
+    def __init__(
+        self,
+        max_number_of_training_jobs_not_improving: int = None,
+        complete_on_convergence: bool = None,
+        target_objective_metric_value: float = None,
+    ):
+        """Creates a ``TuningJobCompletionCriteriaConfig`` with provided criteria.
+
+        Args:
+            max_number_of_training_jobs_not_improving (int): The number of training jobs that do
+                not improve the best objective after which the tuning job will stop.
+            complete_on_convergence (bool): A flag to stop your hyperparameter tuning job if
+                automatic model tuning (AMT) has detected that your model has converged as
+                evaluated against your objective function.
+            target_objective_metric_value (float): The value of the objective metric.
+        """
+
+        self.max_number_of_training_jobs_not_improving = max_number_of_training_jobs_not_improving
+        self.complete_on_convergence = complete_on_convergence
+        self.target_objective_metric_value = target_objective_metric_value
+
+    @classmethod
+    def from_job_desc(cls, completion_criteria_config):
+        """Creates a ``TuningJobCompletionCriteriaConfig`` from a configuration response.
+
+        This is the completion criteria configuration from the DescribeTuningJob response.
+
+        Args:
+            completion_criteria_config (dict): The expected format of the
+                ``completion_criteria_config`` contains three first-class fields.
+
+        Returns:
+            sagemaker.tuner.TuningJobCompletionCriteriaConfig: De-serialized instance of
+            TuningJobCompletionCriteriaConfig containing the completion criteria.
+        """
+        complete_on_convergence = None
+        if CONVERGENCE_DETECTED in completion_criteria_config:
+            if completion_criteria_config[CONVERGENCE_DETECTED][COMPLETE_ON_CONVERGENCE_DETECTED]:
+                complete_on_convergence = bool(
+                    completion_criteria_config[CONVERGENCE_DETECTED][
+                        COMPLETE_ON_CONVERGENCE_DETECTED
+                    ]
+                    == "Enabled"
+                )
+
+        max_number_of_training_jobs_not_improving = None
+        if BEST_OBJECTIVE_NOT_IMPROVING in completion_criteria_config:
+            if completion_criteria_config[BEST_OBJECTIVE_NOT_IMPROVING][
+                MAX_NUMBER_OF_TRAINING_JOBS_NOT_IMPROVING
+            ]:
+                max_number_of_training_jobs_not_improving = completion_criteria_config[
+                    BEST_OBJECTIVE_NOT_IMPROVING
+                ][MAX_NUMBER_OF_TRAINING_JOBS_NOT_IMPROVING]
+
+        target_objective_metric_value = None
+        if TARGET_OBJECTIVE_METRIC_VALUE in completion_criteria_config:
+            target_objective_metric_value = completion_criteria_config[
+                TARGET_OBJECTIVE_METRIC_VALUE
+            ]
+
+        return cls(
+            max_number_of_training_jobs_not_improving=max_number_of_training_jobs_not_improving,
+            complete_on_convergence=complete_on_convergence,
+            target_objective_metric_value=target_objective_metric_value,
+        )
+
+    def to_input_req(self):
+        """Converts the ``self`` instance to the desired input request format.
+
+        Examples:
+            >>> completion_criteria_config = TuningJobCompletionCriteriaConfig(
+            ...     max_number_of_training_jobs_not_improving=5,
+            ...     complete_on_convergence=True,
+            ...     target_objective_metric_value=0.42,
+            ... )
+            >>> completion_criteria_config.to_input_req()
+            {
+                "BestObjectiveNotImproving": {
+                    "MaxNumberOfTrainingJobsNotImproving": 5
+                },
+                "ConvergenceDetected": {
+                    "CompleteOnConvergence": "Enabled",
+                },
+                "TargetObjectiveMetricValue": 0.42
+            }
+
+        Returns:
+            dict: Containing the completion criteria configurations.
+        """
+        completion_criteria_config = {}
+        if self.max_number_of_training_jobs_not_improving is not None:
+            completion_criteria_config.setdefault(BEST_OBJECTIVE_NOT_IMPROVING, {})[
+                MAX_NUMBER_OF_TRAINING_JOBS_NOT_IMPROVING
+            ] = self.max_number_of_training_jobs_not_improving
+
+        if self.target_objective_metric_value is not None:
+            completion_criteria_config[
+                TARGET_OBJECTIVE_METRIC_VALUE
+            ] = self.target_objective_metric_value
+
+        if self.complete_on_convergence is not None:
+            completion_criteria_config.setdefault(CONVERGENCE_DETECTED, {})[
+                COMPLETE_ON_CONVERGENCE_DETECTED
+            ] = "Enabled" if self.complete_on_convergence else "Disabled"
+
+        return completion_criteria_config
+
+
 class HyperparameterTuner(object):
     """Defines interaction with Amazon SageMaker hyperparameter tuning jobs.

@@ -559,14 +669,14 @@ def __init__(
             self.estimator = None
             self.objective_metric_name = None
             self._hyperparameter_ranges = None
-            self.static_hyperparameters = None
             self.metric_definitions = None
             self.estimator_dict = {estimator_name: estimator}
             self.objective_metric_name_dict = {estimator_name: objective_metric_name}
             self._hyperparameter_ranges_dict = {estimator_name: hyperparameter_ranges}
             self.metric_definitions_dict = (
                 {estimator_name: metric_definitions} if metric_definitions is not None else {}
             )
+            self.static_hyperparameters = None
         else:
             self.estimator = estimator
             self.objective_metric_name = objective_metric_name
@@ -598,31 +708,6 @@ def __init__(
         self.warm_start_config = warm_start_config
         self.early_stopping_type = early_stopping_type
         self.random_seed = random_seed
-        self.instance_configs_dict = None
-        self.instance_configs = None
-
-    def override_resource_config(
-        self, instance_configs: Union[List[InstanceConfig], Dict[str, List[InstanceConfig]]]
-    ):
-        """Override the instance configuration of the estimators used by the tuner.
-
-        Args:
-            instance_configs (List[InstanceConfig] or Dict[str, List[InstanceConfig]):
-                The InstanceConfigs to use as an override for the instance configuration
-                of the estimator. ``None`` will remove the override.
-        """
-        if isinstance(instance_configs, dict):
-            self._validate_dict_argument(
-                name="instance_configs",
-                value=instance_configs,
-                allowed_keys=list(self.estimator_dict.keys()),
-            )
-            self.instance_configs_dict = instance_configs
-        else:
-            self.instance_configs = instance_configs
-            if self.estimator_dict is not None and self.estimator_dict.keys():
-                estimator_names = list(self.estimator_dict.keys())
-                self.instance_configs_dict = {estimator_names[0]: instance_configs}

     def _prepare_for_tuning(self, job_name=None, include_cls_metadata=False):
         """Prepare the tuner instance for tuning (fit)."""
@@ -691,6 +776,7 @@ def _prepare_job_name_for_tuning(self, job_name=None):

     def _prepare_static_hyperparameters_for_tuning(self, include_cls_metadata=False):
         """Prepare static hyperparameters for all estimators before tuning."""
+        self.static_hyperparameters = None
         if self.estimator is not None:
             self.static_hyperparameters = self._prepare_static_hyperparameters(
                 self.estimator, self._hyperparameter_ranges, include_cls_metadata
@@ -1918,7 +2004,6 @@ def _get_tuner_args(cls, tuner, inputs):
                 estimator=tuner.estimator,
                 static_hyperparameters=tuner.static_hyperparameters,
                 metric_definitions=tuner.metric_definitions,
-                instance_configs=tuner.instance_configs,
             )

         if tuner.estimator_dict is not None:
@@ -1932,44 +2017,12 @@ def _get_tuner_args(cls, tuner, inputs):
                     tuner.objective_type,
                     tuner.objective_metric_name_dict[estimator_name],
                     tuner.hyperparameter_ranges_dict()[estimator_name],
-                    tuner.instance_configs_dict.get(estimator_name, None)
-                    if tuner.instance_configs_dict is not None
-                    else None,
                 )
                 for estimator_name in sorted(tuner.estimator_dict.keys())
             ]

         return tuner_args

-    @staticmethod
-    def _prepare_hp_resource_config(
-        instance_configs: List[InstanceConfig],
-        instance_count: int,
-        instance_type: str,
-        volume_size: int,
-        volume_kms_key: str,
-    ):
-        """Placeholder hpo resource config for one estimator of the tuner."""
-        resource_config = {}
-        if volume_kms_key is not None:
-            resource_config["VolumeKmsKeyId"] = volume_kms_key
-
-        if instance_configs is None:
-            resource_config["InstanceCount"] = instance_count
-            resource_config["InstanceType"] = instance_type
-            resource_config["VolumeSizeInGB"] = volume_size
-        else:
-            resource_config["InstanceConfigs"] = _TuningJob._prepare_instance_configs(
-                instance_configs
-            )
-
-        return resource_config
-
-    @staticmethod
-    def _prepare_instance_configs(instance_configs: List[InstanceConfig]):
-        """Prepare instance config for create tuning request."""
-        return [config.to_input_req() for config in instance_configs]
-
     @staticmethod
     def _prepare_training_config(
         inputs,
@@ -1980,20 +2033,10 @@ def _prepare_training_config(
         objective_type=None,
         objective_metric_name=None,
         parameter_ranges=None,
-        instance_configs=None,
     ):
         """Prepare training config for one estimator."""
         training_config = _Job._load_config(inputs, estimator)

-        del training_config["resource_config"]
-        training_config["hpo_resource_config"] = _TuningJob._prepare_hp_resource_config(
-            instance_configs,
-            estimator.instance_count,
-            estimator.instance_type,
-            estimator.volume_size,
-            estimator.volume_kms_key,
-        )
-
         training_config["input_mode"] = estimator.input_mode
         training_config["metric_definitions"] = metric_definitions
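
As a usage sketch for the class added above (illustrative, not part of the commit): building a completion-criteria config and converting it to the request format. The uppercase key constants referenced by the class (BEST_OBJECTIVE_NOT_IMPROVING and friends) are assumed to be module-level names defined alongside it.

from sagemaker.tuner import TuningJobCompletionCriteriaConfig

# Stop tuning after 5 jobs that fail to improve the best objective, when AMT
# detects convergence, or once the objective metric reaches 0.42.
criteria = TuningJobCompletionCriteriaConfig(
    max_number_of_training_jobs_not_improving=5,
    complete_on_convergence=True,
    target_objective_metric_value=0.42,
)

# to_input_req() yields the nested fragment shown in the docstring example:
# {"BestObjectiveNotImproving": {"MaxNumberOfTrainingJobsNotImproving": 5},
#  "ConvergenceDetected": {"CompleteOnConvergence": "Enabled"},
#  "TargetObjectiveMetricValue": 0.42}
print(criteria.to_input_req())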

tests/unit/test_tuner.py

Lines changed: 5 additions & 0 deletions
@@ -663,12 +663,17 @@ def test_attach_tuning_job_with_estimator_from_hyperparameters(sagemaker_session
     assert tuner.objective_metric_name == OBJECTIVE_METRIC_NAME
     assert tuner.max_jobs == 1
     assert tuner.max_parallel_jobs == 1
+    assert tuner.max_runtime_in_seconds == 1
     assert tuner.metric_definitions == METRIC_DEFINITIONS
     assert tuner.strategy == "Bayesian"
     assert tuner.objective_type == "Minimize"
     assert tuner.early_stopping_type == "Off"
     assert tuner.random_seed == 0

+    assert tuner.completion_criteria_config.complete_on_convergence is True
+    assert tuner.completion_criteria_config.target_objective_metric_value == 0.42
+    assert tuner.completion_criteria_config.max_number_of_training_jobs_not_improving == 5
+
     assert isinstance(tuner.estimator, PCA)
     assert tuner.estimator.role == ROLE
     assert tuner.estimator.instance_count == 1
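
The assertions above run against a tuner rebuilt from a described tuning job. A hedged sketch of that flow; the job name is a placeholder, and exactly how attach() maps the describe response onto the new attributes is assumed here rather than shown in this diff.

from sagemaker.tuner import HyperparameterTuner

# "my-tuning-job" is a placeholder for an existing tuning job name.
tuner = HyperparameterTuner.attach("my-tuning-job")

# With this change, ResourceLimits.MaxRuntimeInSeconds and
# TuningJobCompletionCriteria from the describe response are expected to
# surface as tuner attributes, as the test asserts.
print(tuner.max_runtime_in_seconds)
print(tuner.completion_criteria_config.max_number_of_training_jobs_not_improving)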

tests/unit/tuner_test_utils.py

Lines changed: 10 additions & 1 deletion
@@ -97,7 +97,11 @@

 TUNING_JOB_DETAILS = {
     "HyperParameterTuningJobConfig": {
-        "ResourceLimits": {"MaxParallelTrainingJobs": 1, "MaxNumberOfTrainingJobs": 1},
+        "ResourceLimits": {
+            "MaxParallelTrainingJobs": 1,
+            "MaxNumberOfTrainingJobs": 1,
+            "MaxRuntimeInSeconds": 1,
+        },
         "HyperParameterTuningJobObjective": {
             "MetricName": OBJECTIVE_METRIC_NAME,
             "Type": "Minimize",
@@ -117,6 +121,11 @@
         },
         "TrainingJobEarlyStoppingType": "Off",
         "RandomSeed": 0,
+        "TuningJobCompletionCriteria": {
+            "BestObjectiveNotImproving": {"MaxNumberOfTrainingJobsNotImproving": 5},
+            "ConvergenceDetected": {"CompleteOnConvergence": "Enabled"},
+            "TargetObjectiveMetricValue": 0.42,
+        },
     },
     "HyperParameterTuningJobName": JOB_NAME,
     "TrainingJobDefinition": {

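To connect the fixture to the new class: a small sketch (assuming the class's key constants resolve to the request keys used in the fixture) that deserializes the TuningJobCompletionCriteria block above with from_job_desc.

from sagemaker.tuner import TuningJobCompletionCriteriaConfig

# Mirrors the TuningJobCompletionCriteria block in TUNING_JOB_DETAILS above.
describe_fragment = {
    "BestObjectiveNotImproving": {"MaxNumberOfTrainingJobsNotImproving": 5},
    "ConvergenceDetected": {"CompleteOnConvergence": "Enabled"},
    "TargetObjectiveMetricValue": 0.42,
}

config = TuningJobCompletionCriteriaConfig.from_job_desc(describe_fragment)
assert config.max_number_of_training_jobs_not_improving == 5
assert config.complete_on_convergence is True
assert config.target_objective_metric_value == 0.42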