
feature: Support for flexible instance types in the HPO #3611


Merged · 1 commit · Feb 10, 2023
27 changes: 25 additions & 2 deletions src/sagemaker/session.py
@@ -2201,6 +2201,7 @@ def tune( # noqa: C901
checkpoint_local_path=None,
random_seed=None,
environment=None,
hpo_resource_config=None,
):
"""Create an Amazon SageMaker hyperparameter tuning job.

@@ -2286,6 +2287,22 @@ def tune( # noqa: C901
produce more consistent configurations for the same tuning job. (default: ``None``).
environment (dict[str, str]): Environment variables to be set for
use during training jobs (default: ``None``).
hpo_resource_config (dict): The configuration for the hyperparameter tuning resources,
including the compute instances and storage volumes, used for training jobs launched
by the tuning job. You must specify either
instance_configs or instance_count + instance_type + volume_size
(see the sketch below):
* instance_count (int): Number of EC2 instances to use for training.
The key in resource_config is 'InstanceCount'.
* instance_type (str): Type of EC2 instance to use for training, for example,
'ml.c4.xlarge'. The key in resource_config is 'InstanceType'.
* volume_size (int or PipelineVariable): The volume size in GB of the data to be
processed for hyperparameter optimization. The key in resource_config is
'VolumeSizeInGB'.
* instance_configs (List[InstanceConfig]): A list containing the configuration(s)
for one or more resources for processing hyperparameter jobs. These resources
include compute instances and storage volumes to use in model training jobs.
The key in resource_config is 'InstanceConfigs'.
* volume_kms_key_id (str): The AWS Key Management Service (AWS KMS) key
that Amazon SageMaker uses to encrypt data on the storage
volume attached to the ML compute instance(s) that run the training job.
The key in resource_config is 'VolumeKmsKeyId'.
"""

tune_request = {
@@ -2311,6 +2328,7 @@
input_config=input_config,
output_config=output_config,
resource_config=resource_config,
hpo_resource_config=hpo_resource_config,
vpc_config=vpc_config,
stop_condition=stop_condition,
enable_network_isolation=enable_network_isolation,
@@ -2545,9 +2563,10 @@ def _map_training_config(
input_mode,
role,
output_config,
resource_config,
stop_condition,
input_config=None,
resource_config=None,
hpo_resource_config=None,
metric_definitions=None,
image_uri=None,
algorithm_arn=None,
@@ -2625,13 +2644,17 @@
TrainingJobDefinition as described in
https://botocore.readthedocs.io/en/latest/reference/services/sagemaker.html#SageMaker.Client.create_hyper_parameter_tuning_job
"""
if hpo_resource_config is not None:
resource_config_map = {"HyperParameterTuningResourceConfig": hpo_resource_config}
else:
resource_config_map = {"ResourceConfig": resource_config}

training_job_definition = {
"StaticHyperParameters": static_hyperparameters,
"RoleArn": role,
"OutputDataConfig": output_config,
"ResourceConfig": resource_config,
"StoppingCondition": stop_condition,
**resource_config_map,
}

algorithm_spec = {"TrainingInputMode": input_mode}
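The branch above gives the HPO-level config precedence, so the training job definition carries exactly one of the two keys. A minimal sketch of that mapping, assuming plain dicts for both configs:

resource_config = {"InstanceCount": 1, "InstanceType": "ml.c5.xlarge", "VolumeSizeInGB": 30}
hpo_resource_config = {
    "InstanceConfigs": [
        {"InstanceCount": 1, "InstanceType": "ml.m4.xlarge", "VolumeSizeInGB": 30},
    ]
}

# Mirrors _map_training_config: HyperParameterTuningResourceConfig wins when present
resource_config_map = (
    {"HyperParameterTuningResourceConfig": hpo_resource_config}
    if hpo_resource_config is not None
    else {"ResourceConfig": resource_config}
)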
148 changes: 146 additions & 2 deletions src/sagemaker/tuner.py
@@ -383,6 +383,83 @@ def to_input_req(self):
}


class InstanceConfig:
"""Instance configuration for training jobs started by hyperparameter tuning.

Contains the configuration(s) for one or more resources for processing hyperparameter jobs.
These resources include compute instances and storage volumes to use in model training jobs
launched by hyperparameter tuning jobs.
"""

def __init__(
self,
instance_count: Union[int, PipelineVariable] = None,
instance_type: Union[str, PipelineVariable] = None,
volume_size: Union[int, PipelineVariable] = 30,
):
"""Creates a ``InstanceConfig`` instance.

It takes instance configuration information for training
jobs that are created as the result of a hyperparameter tuning job.

Args:
* instance_count (int or PipelineVariable): The number of compute instances of type
InstanceType to use. For distributed training, select a value greater than 1.
* instance_type (str or PipelineVariable):
The instance type used to run hyperparameter optimization tuning jobs.
* volume_size (int or PipelineVariable): The volume size in GB of the data to be
processed for hyperparameter optimization.
"""
self.instance_count = instance_count
self.instance_type = instance_type
self.volume_size = volume_size

@classmethod
def from_job_desc(cls, instance_config):
"""Creates a ``InstanceConfig`` from an instance configuration response.

This is the instance configuration from the DescribeTuningJob response.

Args:
instance_config (dict): The instance configuration from a
``DescribeTuningJob`` response, with keys ``InstanceCount``,
``InstanceType``, and ``VolumeSizeInGB``.

Returns:
sagemaker.tuner.InstanceConfig: De-serialized instance of
InstanceConfig containing the instance configuration.
"""
return cls(
instance_count=instance_config["InstanceCount"],
instance_type=instance_config[" InstanceType "],
volume_size=instance_config["VolumeSizeInGB"],
)

def to_input_req(self):
"""Converts the ``self`` instance to the desired input request format.

Examples:
>>> instance_config = InstanceConfig(
instance_count=1,
instance_type='ml.m4.xlarge',
volume_size=30
)
>>> instance_config.to_input_req()
{
"InstanceCount":1,
"InstanceType":"ml.m4.xlarge",
"VolumeSizeInGB":30
}

Returns:
dict: The instance configuration in the request format.
"""
return {
"InstanceCount": self.instance_count,
"InstanceType": self.instance_type,
"VolumeSizeInGB": self.volume_size,
}
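A quick round trip through the class above (values are placeholders):

from sagemaker.tuner import InstanceConfig

config = InstanceConfig(instance_count=2, instance_type="ml.m5.xlarge", volume_size=50)
assert config.to_input_req() == {
    "InstanceCount": 2,
    "InstanceType": "ml.m5.xlarge",
    "VolumeSizeInGB": 50,
}
# from_job_desc() reverses the mapping for a config read back from a tuning job description
restored = InstanceConfig.from_job_desc(config.to_input_req())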


class HyperparameterTuner(object):
"""Defines interaction with Amazon SageMaker hyperparameter tuning jobs.

@@ -482,14 +559,14 @@ def __init__(
self.estimator = None
self.objective_metric_name = None
self._hyperparameter_ranges = None
self.static_hyperparameters = None
self.metric_definitions = None
self.estimator_dict = {estimator_name: estimator}
self.objective_metric_name_dict = {estimator_name: objective_metric_name}
self._hyperparameter_ranges_dict = {estimator_name: hyperparameter_ranges}
self.metric_definitions_dict = (
{estimator_name: metric_definitions} if metric_definitions is not None else {}
)
self.static_hyperparameters = None
else:
self.estimator = estimator
self.objective_metric_name = objective_metric_name
@@ -521,6 +598,31 @@ def __init__(
self.warm_start_config = warm_start_config
self.early_stopping_type = early_stopping_type
self.random_seed = random_seed
self.instance_configs_dict = None
self.instance_configs = None

def override_resource_config(
self, instance_configs: Union[List[InstanceConfig], Dict[str, List[InstanceConfig]]]
):
"""Override the instance configuration of the estimators used by the tuner.

Args:
instance_configs (List[InstanceConfig] or Dict[str, List[InstanceConfig]]):
The InstanceConfigs to use as an override for the instance configuration
of the estimator. ``None`` will remove the override.
"""
if isinstance(instance_configs, dict):
self._validate_dict_argument(
name="instance_configs",
value=instance_configs,
allowed_keys=list(self.estimator_dict.keys()),
)
self.instance_configs_dict = instance_configs
else:
self.instance_configs = instance_configs
if self.estimator_dict is not None and self.estimator_dict.keys():
estimator_names = list(self.estimator_dict.keys())
self.instance_configs_dict = {estimator_names[0]: instance_configs}
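For example, a single-estimator override might look like the following (instance types are placeholders; ``tuner`` is an existing ``HyperparameterTuner``). A multi-algorithm tuner takes a dict keyed by estimator name instead (``"estimator-a"`` is hypothetical):

tuner.override_resource_config(
    instance_configs=[
        InstanceConfig(instance_count=1, instance_type="ml.m4.xlarge", volume_size=30),
        InstanceConfig(instance_count=1, instance_type="ml.m4.2xlarge", volume_size=30),
    ]
)
# tuner.override_resource_config(instance_configs={"estimator-a": [InstanceConfig(...)]})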

def _prepare_for_tuning(self, job_name=None, include_cls_metadata=False):
"""Prepare the tuner instance for tuning (fit)."""
@@ -589,7 +691,6 @@ def _prepare_job_name_for_tuning(self, job_name=None):

def _prepare_static_hyperparameters_for_tuning(self, include_cls_metadata=False):
"""Prepare static hyperparameters for all estimators before tuning."""
self.static_hyperparameters = None
if self.estimator is not None:
self.static_hyperparameters = self._prepare_static_hyperparameters(
self.estimator, self._hyperparameter_ranges, include_cls_metadata
@@ -1817,6 +1918,7 @@ def _get_tuner_args(cls, tuner, inputs):
estimator=tuner.estimator,
static_hyperparameters=tuner.static_hyperparameters,
metric_definitions=tuner.metric_definitions,
instance_configs=tuner.instance_configs,
)

if tuner.estimator_dict is not None:
@@ -1830,12 +1932,44 @@
tuner.objective_type,
tuner.objective_metric_name_dict[estimator_name],
tuner.hyperparameter_ranges_dict()[estimator_name],
tuner.instance_configs_dict.get(estimator_name, None)
if tuner.instance_configs_dict is not None
else None,
)
for estimator_name in sorted(tuner.estimator_dict.keys())
]

return tuner_args

@staticmethod
def _prepare_hp_resource_config(
instance_configs: List[InstanceConfig],
instance_count: int,
instance_type: str,
volume_size: int,
volume_kms_key: str,
):
"""Placeholder hpo resource config for one estimator of the tuner."""
resource_config = {}
if volume_kms_key is not None:
resource_config["VolumeKmsKeyId"] = volume_kms_key

if instance_configs is None:
resource_config["InstanceCount"] = instance_count
resource_config["InstanceType"] = instance_type
resource_config["VolumeSizeInGB"] = volume_size
else:
resource_config["InstanceConfigs"] = _TuningJob._prepare_instance_configs(
instance_configs
)

return resource_config

@staticmethod
def _prepare_instance_configs(instance_configs: List[InstanceConfig]):
"""Prepare instance config for create tuning request."""
return [config.to_input_req() for config in instance_configs]
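Given the two helpers above, the fixed-fleet branch would produce, for example (a sketch against the private ``_TuningJob`` API, not public SDK usage):

_TuningJob._prepare_hp_resource_config(
    instance_configs=None,
    instance_count=1,
    instance_type="ml.c5.xlarge",
    volume_size=30,
    volume_kms_key=None,
)
# -> {"InstanceCount": 1, "InstanceType": "ml.c5.xlarge", "VolumeSizeInGB": 30}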

@staticmethod
def _prepare_training_config(
inputs,
@@ -1846,10 +1980,20 @@ def _prepare_training_config(
objective_type=None,
objective_metric_name=None,
parameter_ranges=None,
instance_configs=None,
):
"""Prepare training config for one estimator."""
training_config = _Job._load_config(inputs, estimator)

del training_config["resource_config"]
training_config["hpo_resource_config"] = _TuningJob._prepare_hp_resource_config(
instance_configs,
estimator.instance_count,
estimator.instance_type,
estimator.volume_size,
estimator.volume_kms_key,
)

training_config["input_mode"] = estimator.input_mode
training_config["metric_definitions"] = metric_definitions

25 changes: 24 additions & 1 deletion tests/integ/test_tuner.py
@@ -35,6 +35,7 @@
ContinuousParameter,
CategoricalParameter,
HyperparameterTuner,
InstanceConfig,
WarmStartConfig,
WarmStartTypes,
create_transfer_learning_tuner,
@@ -97,6 +98,7 @@ def _tune_and_deploy(
job_name=None,
warm_start_config=None,
early_stopping_type="Off",
instance_configs=None,
):
tuner = _tune(
kmeans_estimator,
@@ -105,6 +107,7 @@
warm_start_config=warm_start_config,
job_name=job_name,
early_stopping_type=early_stopping_type,
instance_configs=instance_configs,
)
_deploy(kmeans_train_set, sagemaker_session, tuner, early_stopping_type, cpu_instance_type)

@@ -134,6 +137,7 @@ def _tune(
max_jobs=2,
max_parallel_jobs=2,
early_stopping_type="Off",
instance_configs=None,
):
with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):

@@ -148,7 +152,7 @@
warm_start_config=warm_start_config,
early_stopping_type=early_stopping_type,
)

tuner.override_resource_config(instance_configs=instance_configs)
records = kmeans_estimator.record_set(kmeans_train_set[0][:100])
test_record_set = kmeans_estimator.record_set(kmeans_train_set[0][:100], channel="test")

@@ -173,6 +177,25 @@ def test_tuning_kmeans(
)


@pytest.mark.release
def test_tuning_kmeans_with_instance_configs(
sagemaker_session, kmeans_train_set, kmeans_estimator, hyperparameter_ranges, cpu_instance_type
):
job_name = unique_name_from_base("tst-fit")
_tune_and_deploy(
kmeans_estimator,
kmeans_train_set,
sagemaker_session,
cpu_instance_type,
hyperparameter_ranges=hyperparameter_ranges,
job_name=job_name,
instance_configs=[
InstanceConfig(instance_count=1, instance_type="ml.m4.2xlarge", volume_size=30),
InstanceConfig(instance_count=1, instance_type="ml.m4.xlarge", volume_size=30),
],
)


def test_tuning_kmeans_identical_dataset_algorithm_tuner_raw(
sagemaker_session, kmeans_train_set, kmeans_estimator, hyperparameter_ranges
):
6 changes: 3 additions & 3 deletions tests/unit/sagemaker/workflow/test_steps.py
@@ -1133,7 +1133,7 @@ def test_single_algo_tuning_step(sagemaker_session):
},
"RoleArn": "DummyRole",
"OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
"ResourceConfig": {
"HyperParameterTuningResourceConfig": {
"InstanceCount": 1,
"InstanceType": "ml.c5.4xlarge",
"VolumeSizeInGB": 30,
@@ -1285,7 +1285,7 @@ def test_multi_algo_tuning_step(sagemaker_session):
},
"RoleArn": "DummyRole",
"OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
"ResourceConfig": {
"HyperParameterTuningResourceConfig": {
"InstanceCount": {"Get": "Parameters.InstanceCount"},
"InstanceType": "ml.c5.4xlarge",
"VolumeSizeInGB": 30,
@@ -1352,7 +1352,7 @@ def test_multi_algo_tuning_step(sagemaker_session):
},
"RoleArn": "DummyRole",
"OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
"ResourceConfig": {
"HyperParameterTuningResourceConfig": {
"InstanceCount": {"Get": "Parameters.InstanceCount"},
"InstanceType": "ml.c5.4xlarge",
"VolumeSizeInGB": 30,