
Commit b299825

Merge branch 'master' into pt1.0
2 parents db1a52b + 2ad6c1d commit b299825

14 files changed: 244 additions, 126 deletions

CHANGELOG.rst

Lines changed: 9 additions & 2 deletions
@@ -2,9 +2,16 @@
 CHANGELOG
 =========
 
-1.16.2.dev
+1.16.3.dev
 ==========
 
+* bug-fix: Append retry id to default Airflow job name to avoid name collisions in retry
+* bug-fix: Local Mode: No longer requires s3 permissions to run local entry point file
+* feature: Estimators: add support for PyTorch 1.0.0
+
+1.16.2
+======
+
 * enhancement: Check for S3 paths being passed as entry point
 * feature: Add support for AugmentedManifestFile and ShuffleConfig
 * bug-fix: Add version bound for requests module to avoid conflicts with docker-compose and docker-py
@@ -15,7 +22,7 @@ CHANGELOG
 * bug-fix: Update PyYAML version to avoid conflicts with docker-compose
 * doc-fix: Correct the numbered list in the table of contents
 * doc-fix: Add Airflow API documentation
-* feature: Estimators: add support for PyTorch 1.0.0
+* feature: HyperparameterTuner: add Early Stopping support
 
 1.16.1.post1
 ============

README.rst

Lines changed: 16 additions & 0 deletions
@@ -611,6 +611,22 @@ A hyperparameter range can be one of three types: continuous, integer, or catego
 The SageMaker Python SDK provides corresponding classes for defining these different types.
 You can define up to 20 hyperparameters to search over, but each value of a categorical hyperparameter range counts against that limit.
 
+By default, training job early stopping is turned off. To enable early stopping for the tuning job, you need to set the ``early_stopping_type`` parameter to ``Auto``:
+
+.. code:: python
+
+    # Enable early stopping
+    my_tuner = HyperparameterTuner(estimator=my_estimator,  # previously-configured Estimator object
+                                   objective_metric_name='validation-accuracy',
+                                   hyperparameter_ranges={'learning-rate': ContinuousParameter(0.05, 0.06)},
+                                   metric_definitions=[{'Name': 'validation-accuracy', 'Regex': 'validation-accuracy=(\d\.\d+)'}],
+                                   max_jobs=100,
+                                   max_parallel_jobs=10,
+                                   early_stopping_type='Auto')
+
+When early stopping is turned on, Amazon SageMaker will automatically stop a training job if it appears unlikely to produce a model of better quality than other jobs.
+If not using built-in Amazon SageMaker algorithms, note that, for early stopping to be effective, the objective metric should be emitted at epoch level.
+
 If you are using an Amazon SageMaker built-in algorithm, you don't need to pass in anything for ``metric_definitions``.
 In addition, the ``fit()`` call uses a list of ``RecordSet`` objects instead of a dictionary:

doc/conf.py

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ def __getattr__(cls, name):
                 'numpy', 'scipy', 'scipy.sparse']
 sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
 
-version = '1.16.1.post1'
+version = '1.16.2'
 project = u'sagemaker'
 
 # Add any Sphinx extension module names here, as strings. They can be extensions

setup.py

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ def read(fname):
 
 
 # Declare minimal set for installation
-required_packages = ['boto3>=1.9.55', 'numpy>=1.9.0', 'protobuf>=3.1', 'scipy>=0.19.0',
+required_packages = ['boto3>=1.9.64', 'numpy>=1.9.0', 'protobuf>=3.1', 'scipy>=0.19.0',
                      'urllib3>=1.21', 'PyYAML>=3.2, <4', 'protobuf3-to-dict>=0.1.5',
                      'docker-compose>=1.23.0', 'requests>=2.20.0, <2.21']

src/sagemaker/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -39,4 +39,4 @@
 from sagemaker.session import s3_input  # noqa: F401
 from sagemaker.session import get_execution_role  # noqa: F401
 
-__version__ = '1.16.1.post1'
+__version__ = '1.16.2'

src/sagemaker/local/image.py

Lines changed: 16 additions & 1 deletion
@@ -106,6 +106,8 @@ def train(self, input_data_config, output_data_config, hyperparameters, job_name
         data_dir = self._create_tmp_folder()
         volumes = self._prepare_training_volumes(data_dir, input_data_config, output_data_config,
                                                  hyperparameters)
+        # If local, source directory needs to be updated to mounted /opt/ml/code path
+        hyperparameters = self._update_local_src_path(hyperparameters, key=sagemaker.estimator.DIR_PARAM_NAME)
 
         # Create the configuration files for each container that we will create
         # Each container will map the additional local volumes (if any).
@@ -169,6 +171,9 @@ def serve(self, model_dir, environment):
         parsed_uri = urlparse(script_dir)
         if parsed_uri.scheme == 'file':
             volumes.append(_Volume(parsed_uri.path, '/opt/ml/code'))
+            # Update path to mount location
+            environment = environment.copy()
+            environment[sagemaker.estimator.DIR_PARAM_NAME.upper()] = '/opt/ml/code'
 
         if _ecr_login_if_needed(self.sagemaker_session.boto_session, self.image):
             _pull_image(self.image)
@@ -302,7 +307,7 @@ def _prepare_training_volumes(self, data_dir, input_data_config, output_data_con
             volumes.append(_Volume(data_source.get_root_dir(), channel=channel_name))
 
         # If there is a training script directory and it is a local directory,
-        # mount it to the container.
+        # mount it to the container.
         if sagemaker.estimator.DIR_PARAM_NAME in hyperparameters:
             training_dir = json.loads(hyperparameters[sagemaker.estimator.DIR_PARAM_NAME])
             parsed_uri = urlparse(training_dir)
@@ -321,6 +326,16 @@ def _prepare_training_volumes(self, data_dir, input_data_config, output_data_con
 
         return volumes
 
+    def _update_local_src_path(self, params, key):
+        if key in params:
+            src_dir = json.loads(params[key])
+            parsed_uri = urlparse(src_dir)
+            if parsed_uri.scheme == 'file':
+                new_params = params.copy()
+                new_params[key] = json.dumps('/opt/ml/code')
+                return new_params
+        return params
+
     def _prepare_serving_volumes(self, model_location):
         volumes = []
         host = self.hosts[0]
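
The rewrite done by the new ``_update_local_src_path`` method can be illustrated with a small standalone sketch. This is not part of the diff: the helper name, the sample path, and the assumption that ``sagemaker.estimator.DIR_PARAM_NAME`` is ``'sagemaker_submit_directory'`` are illustrative.

.. code:: python

    import json
    from urllib.parse import urlparse

    def update_local_src_path(params, key):
        # Mirrors the new method: rewrite a file:// source dir to the mounted /opt/ml/code
        if key in params:
            src_dir = json.loads(params[key])  # hyperparameter values are JSON-encoded strings
            if urlparse(src_dir).scheme == 'file':
                new_params = params.copy()  # leave the caller's dict untouched
                new_params[key] = json.dumps('/opt/ml/code')
                return new_params
        return params

    hyperparameters = {'sagemaker_submit_directory': json.dumps('file:///home/me/src')}  # hypothetical path
    print(update_local_src_path(hyperparameters, 'sagemaker_submit_directory'))
    # {'sagemaker_submit_directory': '"/opt/ml/code"'}

Because the container only sees the directory mounted at /opt/ml/code, pointing the hyperparameter there avoids any need for S3 access when running a local entry point file.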

src/sagemaker/session.py

Lines changed: 6 additions & 1 deletion
@@ -350,7 +350,8 @@ def tune(self, job_name, strategy, objective_type, objective_metric_name,
              max_jobs, max_parallel_jobs, parameter_ranges,
              static_hyperparameters, input_mode, metric_definitions,
              role, input_config, output_config, resource_config, stop_condition, tags,
-             warm_start_config, enable_network_isolation=False, image=None, algorithm_arn=None):
+             warm_start_config, enable_network_isolation=False, image=None, algorithm_arn=None,
+             early_stopping_type='Off'):
         """Create an Amazon SageMaker hyperparameter tuning job
 
         Args:
@@ -396,6 +397,9 @@ def tune(self, job_name, strategy, objective_type, objective_metric_name,
                 https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html.
             warm_start_config (dict): Configuration defining the type of warm start and
                 other required configurations.
+            early_stopping_type (str): Specifies whether early stopping is enabled for the job.
+                Can be either 'Auto' or 'Off'. If set to 'Off', early stopping will not be attempted.
+                If set to 'Auto', early stopping of some training jobs may happen, but is not guaranteed to.
         """
         tune_request = {
             'HyperParameterTuningJobName': job_name,
@@ -410,6 +414,7 @@ def tune(self, job_name, strategy, objective_type, objective_metric_name,
                     'MaxParallelTrainingJobs': max_parallel_jobs,
                 },
                 'ParameterRanges': parameter_ranges,
+                'TrainingJobEarlyStoppingType': early_stopping_type,
             },
             'TrainingJobDefinition': {
                 'StaticHyperParameters': static_hyperparameters,
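
As a rough illustration of where the new argument ends up, here is the relevant fragment of the request that ``Session.tune`` builds. The field names come from the diff; every other value below is a placeholder, and the remaining keys of the real request are omitted.

.. code:: python

    tune_request_fragment = {
        'HyperParameterTuningJobName': 'my-tuning-job',        # placeholder
        'HyperParameterTuningJobConfig': {
            'ResourceLimits': {
                'MaxNumberOfTrainingJobs': 100,                # placeholder
                'MaxParallelTrainingJobs': 10,                 # placeholder
            },
            'ParameterRanges': {},                             # placeholder
            'TrainingJobEarlyStoppingType': 'Auto',            # new field: 'Auto' or 'Off' (default 'Off')
        },
        # 'Strategy', 'HyperParameterTuningJobObjective', 'TrainingJobDefinition', ... omitted
    }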

src/sagemaker/tuner.py

Lines changed: 8 additions & 2 deletions
@@ -165,7 +165,7 @@ class HyperparameterTuner(object):
 
     def __init__(self, estimator, objective_metric_name, hyperparameter_ranges, metric_definitions=None,
                  strategy='Bayesian', objective_type='Maximize', max_jobs=1, max_parallel_jobs=1,
-                 tags=None, base_tuning_job_name=None, warm_start_config=None):
+                 tags=None, base_tuning_job_name=None, warm_start_config=None, early_stopping_type='Off'):
         """Initialize a ``HyperparameterTuner``. It takes an estimator to obtain configuration information
         for training jobs that are created as the result of a hyperparameter tuning job.
 
@@ -194,6 +194,9 @@ def __init__(self, estimator, objective_metric_name, hyperparameter_ranges, metr
                 a default job name is generated, based on the training image name and current timestamp.
             warm_start_config (sagemaker.tuner.WarmStartConfig): A ``WarmStartConfig`` object that has been initialized
                 with the configuration defining the nature of warm start tuning job.
+            early_stopping_type (str): Specifies whether early stopping is enabled for the job.
+                Can be either 'Auto' or 'Off' (default: 'Off'). If set to 'Off', early stopping will not be attempted.
+                If set to 'Auto', early stopping of some training jobs may happen, but is not guaranteed to.
         """
         self._hyperparameter_ranges = hyperparameter_ranges
         if self._hyperparameter_ranges is None or len(self._hyperparameter_ranges) == 0:
@@ -214,6 +217,7 @@ def __init__(self, estimator, objective_metric_name, hyperparameter_ranges, metr
         self._current_job_name = None
         self.latest_tuning_job = None
         self.warm_start_config = warm_start_config
+        self.early_stopping_type = early_stopping_type
 
     def _prepare_for_training(self, job_name=None, include_cls_metadata=True):
         if job_name is not None:
@@ -445,7 +449,8 @@ def _prepare_init_params_from_job_description(cls, job_details):
             'strategy': tuning_config['Strategy'],
             'max_jobs': tuning_config['ResourceLimits']['MaxNumberOfTrainingJobs'],
             'max_parallel_jobs': tuning_config['ResourceLimits']['MaxParallelTrainingJobs'],
-            'warm_start_config': WarmStartConfig.from_job_desc(job_details.get('WarmStartConfig', None))
+            'warm_start_config': WarmStartConfig.from_job_desc(job_details.get('WarmStartConfig', None)),
+            'early_stopping_type': tuning_config['TrainingJobEarlyStoppingType']
         }
 
     @classmethod
@@ -625,6 +630,7 @@ def start_new(cls, tuner, inputs):
         tuner_args['metric_definitions'] = tuner.metric_definitions
         tuner_args['tags'] = tuner.tags
         tuner_args['warm_start_config'] = warm_start_config_req
+        tuner_args['early_stopping_type'] = tuner.early_stopping_type
 
         del tuner_args['vpc_config']
         if isinstance(tuner.estimator, sagemaker.algorithm.AlgorithmEstimator):
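
The round trip that ``HyperparameterTuner.attach`` relies on can be sketched without any AWS calls. The job description below is a minimal, hand-made stand-in for ``DescribeHyperParameterTuningJob`` output (illustrative only); it shows how the new ``'early_stopping_type'`` init parameter is read straight from ``TrainingJobEarlyStoppingType``, mirroring the change to ``_prepare_init_params_from_job_description``.

.. code:: python

    # Hand-made fragment of a tuning job description (placeholder values)
    tuning_config = {
        'Strategy': 'Bayesian',
        'ResourceLimits': {'MaxNumberOfTrainingJobs': 2, 'MaxParallelTrainingJobs': 2},
        'TrainingJobEarlyStoppingType': 'Auto',
    }

    # Mirrors the new mapping from job description to constructor arguments
    init_params = {
        'strategy': tuning_config['Strategy'],
        'max_jobs': tuning_config['ResourceLimits']['MaxNumberOfTrainingJobs'],
        'max_parallel_jobs': tuning_config['ResourceLimits']['MaxParallelTrainingJobs'],
        'early_stopping_type': tuning_config['TrainingJobEarlyStoppingType'],
    }
    assert init_params['early_stopping_type'] == 'Auto'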

src/sagemaker/utils.py

Lines changed: 5 additions & 4 deletions
@@ -27,10 +27,11 @@
 import six
 
 
-AIRFLOW_TIME_MACRO = "{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}"
-AIRFLOW_TIME_MACRO_LEN = 19
-AIRFLOW_TIME_MACRO_SHORT = "{{ execution_date.strftime('%y%m%d-%H%M') }}"
-AIRFLOW_TIME_MACRO_SHORT_LEN = 11
+AIRFLOW_RETRY_MACRO = "{{ task_instance.try_number }}"
+AIRFLOW_TIME_MACRO = "{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}" + "-{}".format(AIRFLOW_RETRY_MACRO)
+AIRFLOW_TIME_MACRO_LEN = 22
+AIRFLOW_TIME_MACRO_SHORT = "{{ execution_date.strftime('%y%m%d-%H%M') }}" + "-{}".format(AIRFLOW_RETRY_MACRO)
+AIRFLOW_TIME_MACRO_SHORT_LEN = 14
 
 
 # Use the base name of the image as the job name if the user doesn't give us one
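
To see what the macro change does to a default Airflow job name, here is a hedged sketch that renders the template by hand. Inside Airflow the task instance renders the template itself; the ``jinja2`` usage, the base name, and ``FakeTaskInstance`` are illustrative only.

.. code:: python

    from datetime import datetime
    from jinja2 import Template  # assumes jinja2 is available, as it is wherever Airflow runs

    AIRFLOW_RETRY_MACRO = "{{ task_instance.try_number }}"
    AIRFLOW_TIME_MACRO = "{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}" + "-{}".format(AIRFLOW_RETRY_MACRO)

    class FakeTaskInstance:  # stand-in for Airflow's task instance on a retried task
        try_number = 2

    name = Template("base-" + AIRFLOW_TIME_MACRO).render(
        execution_date=datetime(2018, 12, 17, 22, 10, 5),
        task_instance=FakeTaskInstance())
    print(name)  # base-2018-12-17-22-10-05-2

Appending the try number means a retried task gets a distinct job name instead of colliding with the first attempt's 'base-2018-12-17-22-10-05'.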

tests/integ/test_tuner.py

Lines changed: 21 additions & 9 deletions
@@ -83,15 +83,16 @@ def hyperparameter_ranges():
 
 def _tune_and_deploy(kmeans_estimator, kmeans_train_set, sagemaker_session,
                      hyperparameter_ranges=None, job_name=None,
-                     warm_start_config=None):
+                     warm_start_config=None, early_stopping_type='Off'):
     tuner = _tune(kmeans_estimator, kmeans_train_set,
                   hyperparameter_ranges=hyperparameter_ranges, warm_start_config=warm_start_config,
-                  job_name=job_name)
-    _deploy(kmeans_train_set, sagemaker_session, tuner)
+                  job_name=job_name, early_stopping_type=early_stopping_type)
+    _deploy(kmeans_train_set, sagemaker_session, tuner, early_stopping_type)
 
 
-def _deploy(kmeans_train_set, sagemaker_session, tuner):
+def _deploy(kmeans_train_set, sagemaker_session, tuner, early_stopping_type):
     best_training_job = tuner.best_training_job()
+    assert tuner.early_stopping_type == early_stopping_type
     with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
         predictor = tuner.deploy(1, 'ml.c4.xlarge')
 
@@ -105,7 +106,7 @@ def _deploy(kmeans_train_set, sagemaker_session, tuner):
 
 def _tune(kmeans_estimator, kmeans_train_set, tuner=None,
           hyperparameter_ranges=None, job_name=None, warm_start_config=None,
-          wait_till_terminal=True, max_jobs=2, max_parallel_jobs=2):
+          wait_till_terminal=True, max_jobs=2, max_parallel_jobs=2, early_stopping_type='Off'):
     with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
 
         if not tuner:
@@ -115,7 +116,8 @@ def _tune(kmeans_estimator, kmeans_train_set, tuner=None,
                                         objective_type='Minimize',
                                         max_jobs=max_jobs,
                                         max_parallel_jobs=max_parallel_jobs,
-                                        warm_start_config=warm_start_config)
+                                        warm_start_config=warm_start_config,
+                                        early_stopping_type=early_stopping_type)
 
         records = kmeans_estimator.record_set(kmeans_train_set[0][:100])
         test_record_set = kmeans_estimator.record_set(kmeans_train_set[0][:100], channel='test')
@@ -332,16 +334,23 @@ def test_tuning_lda(sagemaker_session):
         tuner = HyperparameterTuner(estimator=lda, objective_metric_name=objective_metric_name,
                                     hyperparameter_ranges=hyperparameter_ranges,
                                     objective_type='Maximize', max_jobs=2,
-                                    max_parallel_jobs=2)
+                                    max_parallel_jobs=2,
+                                    early_stopping_type='Auto')
 
         tuning_job_name = unique_name_from_base('test-lda', max_length=32)
         tuner.fit([record_set, test_record_set], mini_batch_size=1, job_name=tuning_job_name)
 
-        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)
+        latest_tuning_job_name = tuner.latest_tuning_job.name
+
+        print('Started hyperparameter tuning job with name:' + latest_tuning_job_name)
 
         time.sleep(15)
         tuner.wait()
 
+    desc = tuner.latest_tuning_job.sagemaker_session.sagemaker_client \
+        .describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=latest_tuning_job_name)
+    assert desc['HyperParameterTuningJobConfig']['TrainingJobEarlyStoppingType'] == 'Auto'
+
     best_training_job = tuner.best_training_job()
     with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
         predictor = tuner.deploy(1, 'ml.c4.xlarge')
@@ -555,7 +564,8 @@ def test_attach_tuning_pytorch(sagemaker_session):
 
     tuner = HyperparameterTuner(estimator, objective_metric_name, hyperparameter_ranges,
                                 metric_definitions,
-                                max_jobs=2, max_parallel_jobs=2)
+                                max_jobs=2, max_parallel_jobs=2,
+                                early_stopping_type='Auto')
 
     training_data = estimator.sagemaker_session.upload_data(
         path=os.path.join(mnist_dir, 'training'),
@@ -571,6 +581,8 @@ def test_attach_tuning_pytorch(sagemaker_session):
 
     attached_tuner = HyperparameterTuner.attach(tuning_job_name,
                                                 sagemaker_session=sagemaker_session)
+    assert attached_tuner.early_stopping_type == 'Auto'
+
    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = attached_tuner.deploy(1, 'ml.c4.xlarge')
