
Commit 01fa581

Anton Repushko authored and trajanikant committed

feature: support for flexible instance types in the HPO
1 parent 4d95b05 commit 01fa581

File tree

5 files changed (+280 / -6 lines)

src/sagemaker/session.py

Lines changed: 28 additions & 1 deletion
@@ -2150,6 +2150,7 @@ def tune(  # noqa: C901
        checkpoint_s3_uri=None,
        checkpoint_local_path=None,
        random_seed=None,
+       hpo_resource_config=None,
    ):
        """Create an Amazon SageMaker hyperparameter tuning job.
@@ -2233,6 +2234,25 @@ def tune(  # noqa: C901
            random_seed (int): An initial value used to initialize a pseudo-random number generator.
                Setting a random seed will make the hyperparameter tuning search strategies
                produce more consistent configurations for the same tuning job. (default: ``None``).
+           hpo_resource_config (dict): The configuration for the hyperparameter tuning resources,
+               including the compute instances and storage volumes, used for training jobs launched
+               by the tuning job. You must specify either
+               instance_configs or instance_count + instance_type + volume_size.
+               * instance_count (int): Number of EC2 instances to use for training.
+                 The key in resource_config is 'InstanceCount'.
+               * instance_type (str): Type of EC2 instance to use for training, for example,
+                 'ml.c4.xlarge'. The key in resource_config is 'InstanceType'.
+               * volume_size (int or PipelineVariable): The volume size in GB of the data to be
+                 processed for hyperparameter optimization.
+               * instance_configs (List[InstanceConfig]): A list containing the configuration(s)
+                 for one or more resources for processing hyperparameter jobs. These resources
+                 include compute instances and storage volumes to use in model training jobs.
+               * volume_kms_key_id: A key used by AWS Key Management Service to encrypt data on
+                 the storage volume attached to the compute instances used to run the training job.
+                 You can use either of the following formats to specify a key:
+                 * KMS Key ID: ``1234abcd-12ab-34cd-56ef-1234567890ab``
+                 * Amazon Resource Name (ARN) of a KMS key:
+                   ``arn:aws:kms:us-west-2:111122223333:key/1234abcd-12ab-34cd-56ef-1234567890ab``
        """

        tune_request = {
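
A minimal sketch of what the new argument can look like when calling ``tune`` (the values and the session variable below are illustrative assumptions, not part of the commit):

    # Flat form: one instance spec shared by every training job.
    hpo_resource_config = {
        "InstanceCount": 1,
        "InstanceType": "ml.c4.xlarge",  # example type from the docstring
        "VolumeSizeInGB": 30,
        "VolumeKmsKeyId": "1234abcd-12ab-34cd-56ef-1234567890ab",  # optional
    }
    # Passed through to tune() alongside the usual required arguments:
    # sagemaker_session.tune(..., hpo_resource_config=hpo_resource_config)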
@@ -2258,6 +2278,7 @@ def tune(  # noqa: C901
            input_config=input_config,
            output_config=output_config,
            resource_config=resource_config,
+           hpo_resource_config=hpo_resource_config,
            vpc_config=vpc_config,
            stop_condition=stop_condition,
            enable_network_isolation=enable_network_isolation,
@@ -2491,9 +2512,10 @@ def _map_training_config(
    input_mode,
    role,
    output_config,
-   resource_config,
    stop_condition,
    input_config=None,
+   resource_config=None,
+   hpo_resource_config=None,
    metric_definitions=None,
    image_uri=None,
    algorithm_arn=None,
@@ -2568,13 +2590,18 @@ def _map_training_config(
        TrainingJobDefinition as described in
        https://botocore.readthedocs.io/en/latest/reference/services/sagemaker.html#SageMaker.Client.create_hyper_parameter_tuning_job
    """
+   if hpo_resource_config:
+       resource_config_map = {"HyperParameterTuningResourceConfig": hpo_resource_config}
+   else:
+       resource_config_map = {"ResourceConfig": resource_config}

    training_job_definition = {
        "StaticHyperParameters": static_hyperparameters,
        "RoleArn": role,
        "OutputDataConfig": output_config,
        "ResourceConfig": resource_config,
        "StoppingCondition": stop_condition,
+       **resource_config_map,
    }

    algorithm_spec = {"TrainingInputMode": input_mode}
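
The branch above decides which key lands in the ``TrainingJobDefinition``. A standalone sketch of the same selection (``select_resource_config`` is a hypothetical helper, written here only to show the behavior):

    # Hypothetical helper mirroring the if/else in _map_training_config.
    def select_resource_config(resource_config=None, hpo_resource_config=None):
        # The new HyperParameterTuningResourceConfig takes precedence when set;
        # otherwise the classic ResourceConfig is kept.
        if hpo_resource_config:
            return {"HyperParameterTuningResourceConfig": hpo_resource_config}
        return {"ResourceConfig": resource_config}

    assert select_resource_config(hpo_resource_config={"InstanceCount": 1}) == {
        "HyperParameterTuningResourceConfig": {"InstanceCount": 1}
    }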

src/sagemaker/tuner.py

Lines changed: 105 additions & 1 deletion
@@ -383,6 +383,37 @@ def to_input_req(self):
        }


+class InstanceConfig:
+    """Instance configuration for training jobs started by hyperparameter tuning.
+
+    Contains the configuration(s) for one or more resources for processing hyperparameter jobs.
+    These resources include compute instances and storage volumes to use in model training jobs
+    launched by hyperparameter tuning jobs.
+    """
+
+    def __init__(
+        self,
+        instance_count: Union[int, PipelineVariable] = None,
+        instance_type: Union[str, PipelineVariable] = None,
+        volume_size: Union[int, PipelineVariable] = 30,
+    ):
+        """Creates an ``InstanceConfig`` instance.
+
+        It takes instance configuration information for training
+        jobs that are created as the result of a hyperparameter tuning job.
+        Args:
+            instance_count (int or PipelineVariable): The number of compute instances of type
+                InstanceType to use. For distributed training, select a value greater than 1.
+            instance_type (str or PipelineVariable):
+                The instance type used to run hyperparameter optimization tuning jobs.
+            volume_size (int or PipelineVariable): The volume size in GB of the data to be
+                processed for hyperparameter optimization.
+        """
+        self.instance_count = instance_count
+        self.instance_type = instance_type
+        self.volume_size = volume_size
+
+
 class HyperparameterTuner(object):
    """Defines interaction with Amazon SageMaker hyperparameter tuning jobs.
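
A short usage sketch for the new class (instance types are illustrative; the import path follows the integration test further below):

    from sagemaker.tuner import InstanceConfig

    # Two candidate configurations for training jobs launched by the tuner.
    small = InstanceConfig(instance_count=1, instance_type="ml.m4.xlarge", volume_size=30)
    large = InstanceConfig(instance_count=1, instance_type="ml.m4.2xlarge", volume_size=30)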
@@ -419,7 +450,6 @@ def __init__(

        It takes an estimator to obtain configuration information for training
        jobs that are created as the result of a hyperparameter tuning job.
-
        Args:
            estimator (sagemaker.estimator.EstimatorBase): An estimator object
                that has been initialized with the desired configuration. There
@@ -489,6 +519,7 @@ def __init__(
            self.metric_definitions_dict = (
                {estimator_name: metric_definitions} if metric_definitions is not None else {}
            )
+           self.instance_configs_dict = {}
            self.static_hyperparameters = None
        else:
            self.estimator = estimator
@@ -500,6 +531,7 @@ def __init__(
            self._hyperparameter_ranges_dict = None
            self.metric_definitions_dict = None
            self.static_hyperparameters_dict = None
+           self.instance_configs_dict = None

        self._validate_parameter_ranges(estimator, hyperparameter_ranges)

@@ -521,6 +553,30 @@ def __init__(
        self.warm_start_config = warm_start_config
        self.early_stopping_type = early_stopping_type
        self.random_seed = random_seed
+       self.instance_configs = None
+
+   def override_resource_config(
+       self, instance_configs: Union[List[InstanceConfig], Dict[str, List[InstanceConfig]]]
+   ):
+       """Override the instance configuration of the estimators used by the tuner.
+
+       Args:
+           instance_configs (List[InstanceConfig] or Dict[str, List[InstanceConfig]]):
+               The InstanceConfigs to use as an override for the instance configuration
+               of the estimator. ``None`` will remove the override.
+       """
+       if isinstance(instance_configs, dict):
+           self._validate_dict_argument(
+               name="instance_configs",
+               value=instance_configs,
+               allowed_keys=list(self.estimator_dict.keys()),
+           )
+           self.instance_configs_dict = instance_configs
+       else:
+           self.instance_configs = instance_configs
+           if self.estimator_dict and self.estimator_dict.keys():
+               estimator_names = list(self.estimator_dict.keys())
+               self.instance_configs_dict = {estimator_names[0]: instance_configs}

    def _prepare_for_tuning(self, job_name=None, include_cls_metadata=False):
        """Prepare the tuner instance for tuning (fit)."""
@@ -1817,6 +1873,7 @@ def _get_tuner_args(cls, tuner, inputs):
                estimator=tuner.estimator,
                static_hyperparameters=tuner.static_hyperparameters,
                metric_definitions=tuner.metric_definitions,
+               instance_configs=tuner.instance_configs,
            )

        if tuner.estimator_dict is not None:
@@ -1830,12 +1887,49 @@ def _get_tuner_args(cls, tuner, inputs):
                    tuner.objective_type,
                    tuner.objective_metric_name_dict[estimator_name],
                    tuner.hyperparameter_ranges_dict()[estimator_name],
+                   tuner.instance_configs_dict.get(estimator_name, None),
                )
                for estimator_name in sorted(tuner.estimator_dict.keys())
            ]

        return tuner_args

+   @staticmethod
+   def _prepare_hp_resource_config(
+       instance_configs: List[InstanceConfig],
+       instance_count: int,
+       instance_type: str,
+       volume_size: int,
+       volume_kms_key: str,
+   ):
+       """Placeholder hpo resource config for one estimator of the tuner."""
+       resource_config = {}
+       if volume_kms_key is not None:
+           resource_config["VolumeKmsKeyId"] = volume_kms_key
+
+       if instance_configs is None:
+           resource_config["InstanceCount"] = instance_count
+           resource_config["InstanceType"] = instance_type
+           resource_config["VolumeSizeInGB"] = volume_size
+       else:
+           resource_config["InstanceConfigs"] = _TuningJob._prepare_instance_configs(
+               instance_configs
+           )
+
+       return resource_config
+
+   @staticmethod
+   def _prepare_instance_configs(instance_configs):
+       """Prepare instance config for create tuning request."""
+       return [
+           {
+               "InstanceCount": config.instance_count,
+               "InstanceType": config.instance_type,
+               "VolumeSizeInGB": config.volume_size,
+           }
+           for config in instance_configs
+       ]
+
    @staticmethod
    def _prepare_training_config(
        inputs,
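
For illustration only (these are private helpers, not public API), a sketch of the two shapes the builder above emits; the instance types are assumed example values:

    from sagemaker.tuner import InstanceConfig, _TuningJob

    # No instance_configs -> flat spec built from count/type/size.
    flat = _TuningJob._prepare_hp_resource_config(
        instance_configs=None,
        instance_count=1,
        instance_type="ml.c4.xlarge",
        volume_size=30,
        volume_kms_key=None,
    )
    # flat == {"InstanceCount": 1, "InstanceType": "ml.c4.xlarge", "VolumeSizeInGB": 30}

    # With instance_configs -> the list is serialized under "InstanceConfigs".
    flexible = _TuningJob._prepare_hp_resource_config(
        instance_configs=[InstanceConfig(1, "ml.m4.xlarge", 30)],
        instance_count=1,
        instance_type="ml.c4.xlarge",
        volume_size=30,
        volume_kms_key=None,
    )
    # flexible == {"InstanceConfigs": [
    #     {"InstanceCount": 1, "InstanceType": "ml.m4.xlarge", "VolumeSizeInGB": 30}]}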
@@ -1846,10 +1940,20 @@ def _prepare_training_config(
        objective_type=None,
        objective_metric_name=None,
        parameter_ranges=None,
+       instance_configs=None,
    ):
        """Prepare training config for one estimator."""
        training_config = _Job._load_config(inputs, estimator)

+       del training_config["resource_config"]
+       training_config["hpo_resource_config"] = _TuningJob._prepare_hp_resource_config(
+           instance_configs,
+           estimator.instance_count,
+           estimator.instance_type,
+           estimator.volume_size,
+           estimator.volume_kms_key,
+       )
+
        training_config["input_mode"] = estimator.input_mode
        training_config["metric_definitions"] = metric_definitions

tests/integ/test_tuner.py

Lines changed: 24 additions & 1 deletion
@@ -35,6 +35,7 @@
    ContinuousParameter,
    CategoricalParameter,
    HyperparameterTuner,
+   InstanceConfig,
    WarmStartConfig,
    WarmStartTypes,
    create_transfer_learning_tuner,
@@ -97,6 +98,7 @@ def _tune_and_deploy(
    job_name=None,
    warm_start_config=None,
    early_stopping_type="Off",
+   instance_configs=None,
):
    tuner = _tune(
        kmeans_estimator,
@@ -105,6 +107,7 @@ def _tune_and_deploy(
        warm_start_config=warm_start_config,
        job_name=job_name,
        early_stopping_type=early_stopping_type,
+       instance_configs=instance_configs,
    )
    _deploy(kmeans_train_set, sagemaker_session, tuner, early_stopping_type, cpu_instance_type)

@@ -134,6 +137,7 @@ def _tune(
    max_jobs=2,
    max_parallel_jobs=2,
    early_stopping_type="Off",
+   instance_configs=None,
):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):

@@ -148,7 +152,7 @@ def _tune(
            warm_start_config=warm_start_config,
            early_stopping_type=early_stopping_type,
        )
-
+       tuner.override_resource_config(instance_configs=instance_configs)
        records = kmeans_estimator.record_set(kmeans_train_set[0][:100])
        test_record_set = kmeans_estimator.record_set(kmeans_train_set[0][:100], channel="test")

@@ -173,6 +177,25 @@ def test_tuning_kmeans(
    )


+@pytest.mark.release
+def test_tuning_kmeans_with_instance_configs(
+    sagemaker_session, kmeans_train_set, kmeans_estimator, hyperparameter_ranges, cpu_instance_type
+):
+    job_name = unique_name_from_base("tst-fit")
+    _tune_and_deploy(
+        kmeans_estimator,
+        kmeans_train_set,
+        sagemaker_session,
+        cpu_instance_type,
+        hyperparameter_ranges=hyperparameter_ranges,
+        job_name=job_name,
+        instance_configs=[
+            InstanceConfig(instance_count=1, instance_type="ml.m4.2xlarge", volume_size=30),
+            InstanceConfig(instance_count=1, instance_type="ml.m4.xlarge", volume_size=30),
+        ],
+    )
+
+
def test_tuning_kmeans_identical_dataset_algorithm_tuner_raw(
    sagemaker_session, kmeans_train_set, kmeans_estimator, hyperparameter_ranges
):

tests/unit/sagemaker/workflow/test_steps.py

Lines changed: 3 additions & 3 deletions
@@ -1133,7 +1133,7 @@ def test_single_algo_tuning_step(sagemaker_session):
        },
        "RoleArn": "DummyRole",
        "OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
-       "ResourceConfig": {
+       "HyperParameterTuningResourceConfig": {
            "InstanceCount": 1,
            "InstanceType": "ml.c5.4xlarge",
            "VolumeSizeInGB": 30,
@@ -1285,7 +1285,7 @@ def test_multi_algo_tuning_step(sagemaker_session):
        },
        "RoleArn": "DummyRole",
        "OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
-       "ResourceConfig": {
+       "HyperParameterTuningResourceConfig": {
            "InstanceCount": {"Get": "Parameters.InstanceCount"},
            "InstanceType": "ml.c5.4xlarge",
            "VolumeSizeInGB": 30,
@@ -1352,7 +1352,7 @@ def test_multi_algo_tuning_step(sagemaker_session):
        },
        "RoleArn": "DummyRole",
        "OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
-       "ResourceConfig": {
+       "HyperParameterTuningResourceConfig": {
            "InstanceCount": {"Get": "Parameters.InstanceCount"},
            "InstanceType": "ml.c5.4xlarge",
            "VolumeSizeInGB": 30,
