Support MetricDefinitions for general training jobs (aws#484)

SifeiLi · laurenyu · commit 641299102c35 · 2018-11-15T22:53:42.000-08:00
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -13,6 +13,7 @@ CHANGELOG
 * feature: HyperparameterTuner: add support for Automatic Model Tuning's Warm Start Jobs
 * feature: HyperparameterTuner: Make input channels optional
 * feature: Add support for Chainer 5.0
+* feature: Estimator: add support for MetricDefinitions
 
 1.14.2
 ======
diff --git a/README.rst b/README.rst
@@ -170,6 +170,25 @@ Here is an end to end example of how to use a SageMaker Estimator:
     # Tears down the SageMaker endpoint
     mxnet_estimator.delete_endpoint()
 
+Training Metrics
+~~~~~~~~~~~~~~~~
+The SageMaker Python SDK allows you to specify a name and a regular expression for metrics you want to track for training.
+The regular expression (regex) is matched against the training algorithm's logs, much like a search function, to extract the metric's value.
+Here is an example of how to define metrics:
+
+.. code:: python
+
+    # Configure a BYO Estimator with metric definitions (no training happens yet)
+    byo_estimator = Estimator(image_name=image_name,
+                              role='SageMakerRole', train_instance_count=1,
+                              train_instance_type='ml.c4.xlarge',
+                              sagemaker_session=sagemaker_session,
+                              metric_definitions=[{'Name': 'test:msd', 'Regex': '#quality_metric: host=\S+, test msd <loss>=(\S+)'},
+                                                  {'Name': 'test:ssd', 'Regex': '#quality_metric: host=\S+, test ssd <loss>=(\S+)'}])
+
+All Amazon SageMaker algorithms come with built-in support for metrics.
+You can go to `the AWS documentation <https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html>`__ for more details about built-in metrics of each Amazon SageMaker algorithm.
+
 Local Mode
 ~~~~~~~~~~
 
diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py
@@ -50,7 +50,8 @@ class EstimatorBase(with_metaclass(ABCMeta, object)):
     def __init__(self, role, train_instance_count, train_instance_type,
                  train_volume_size=30, train_volume_kms_key=None, train_max_run=24 * 60 * 60, input_mode='File',
                  output_path=None, output_kms_key=None, base_job_name=None, sagemaker_session=None, tags=None,
-                 subnets=None, security_group_ids=None, model_uri=None, model_channel_name='model'):
+                 subnets=None, security_group_ids=None, model_uri=None, model_channel_name='model',
+                 metric_definitions=None):
         """Initialize an ``EstimatorBase`` instance.
 
         Args:
@@ -97,6 +98,10 @@ def __init__(self, role, train_instance_count, train_instance_type,
 
                 More information: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html#td-deserialization
             model_channel_name (str): Name of the channel where 'model_uri' will be downloaded (default: 'model').
+            metric_definitions (list[dict]): A list of dictionaries that defines the metric(s) used to evaluate the
+                training jobs. Each dictionary contains two keys: 'Name' for the name of the metric, and 'Regex' for
+                the regular expression used to extract the metric from the logs. This should be defined only
+                for jobs that don't use an Amazon algorithm.
         """
         self.role = role
         self.train_instance_count = train_instance_count
@@ -106,6 +111,7 @@ def __init__(self, role, train_instance_count, train_instance_type,
         self.train_max_run = train_max_run
         self.input_mode = input_mode
         self.tags = tags
+        self.metric_definitions = metric_definitions
         self.model_uri = model_uri
         self.model_channel_name = model_channel_name
 
@@ -324,6 +330,9 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na
         init_params['hyperparameters'] = job_details['HyperParameters']
         init_params['image'] = job_details['AlgorithmSpecification']['TrainingImage']
 
+        if 'MetricDefinitions' in job_details['AlgorithmSpecification']:  # same key Session.train() writes
+            init_params['metric_definitions'] = job_details['AlgorithmSpecification']['MetricDefinitions']
+
         subnets, security_group_ids = vpc_utils.from_dict(job_details.get(vpc_utils.VPC_CONFIG_KEY))
         if subnets:
             init_params['subnets'] = subnets
@@ -441,7 +450,7 @@ def start_new(cls, estimator, inputs):
                                           job_name=estimator._current_job_name, output_config=config['output_config'],
                                           resource_config=config['resource_config'], vpc_config=config['vpc_config'],
                                           hyperparameters=hyperparameters, stop_condition=config['stop_condition'],
-                                          tags=estimator.tags)
+                                          tags=estimator.tags, metric_definitions=estimator.metric_definitions)
 
         return cls(estimator.sagemaker_session, estimator._current_job_name)
 
@@ -466,7 +475,7 @@ def __init__(self, image_name, role, train_instance_count, train_instance_type,
                  train_volume_size=30, train_volume_kms_key=None, train_max_run=24 * 60 * 60,
                  input_mode='File', output_path=None, output_kms_key=None, base_job_name=None,
                  sagemaker_session=None, hyperparameters=None, tags=None, subnets=None, security_group_ids=None,
-                 model_uri=None, model_channel_name='model'):
+                 model_uri=None, model_channel_name='model', metric_definitions=None):
         """Initialize an ``Estimator`` instance.
 
         Args:
@@ -517,14 +526,18 @@ def __init__(self, image_name, role, train_instance_count, train_instance_type,
 
                 More information: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html#td-deserialization
             model_channel_name (str): Name of the channel where 'model_uri' will be downloaded (default: 'model').
+            metric_definitions (list[dict]): A list of dictionaries that defines the metric(s) used to evaluate the
+                training jobs. Each dictionary contains two keys: 'Name' for the name of the metric, and 'Regex' for
+                the regular expression used to extract the metric from the logs. This should be defined only
+                for jobs that don't use an Amazon algorithm.
         """
         self.image_name = image_name
         self.hyperparam_dict = hyperparameters.copy() if hyperparameters else {}
         super(Estimator, self).__init__(role, train_instance_count, train_instance_type,
                                         train_volume_size, train_volume_kms_key, train_max_run, input_mode,
                                         output_path, output_kms_key, base_job_name, sagemaker_session,
                                         tags, subnets, security_group_ids, model_uri=model_uri,
-                                        model_channel_name=model_channel_name)
+                                        model_channel_name=model_channel_name, metric_definitions=metric_definitions)
 
     def train_image(self):
         """
diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py
@@ -203,7 +203,7 @@ def default_bucket(self):
         return self._default_bucket
 
     def train(self, image, input_mode, input_config, role, job_name, output_config,
-              resource_config, vpc_config, hyperparameters, stop_condition, tags):
+              resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions):
         """Create an Amazon SageMaker training job.
 
         Args:
@@ -243,6 +243,9 @@ def train(self, image, input_mode, input_config, role, job_name, output_config,
                 service like ``MaxRuntimeInSeconds``.
             tags (list[dict]): List of tags for labeling a training job. For more, see
                 https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html.
+            metric_definitions (list[dict]): A list of dictionaries that defines the metric(s) used to evaluate the
+                training jobs. Each dictionary contains two keys: 'Name' for the name of the metric, and 'Regex' for
+                the regular expression used to extract the metric from the logs.
 
         Returns:
             str: ARN of the training job, if it is created.
@@ -263,6 +266,9 @@ def train(self, image, input_mode, input_config, role, job_name, output_config,
         if input_config is not None:
             train_request['InputDataConfig'] = input_config
 
+        if metric_definitions is not None:
+            train_request['AlgorithmSpecification']['MetricDefinitions'] = metric_definitions
+
         if hyperparameters and len(hyperparameters) > 0:
             train_request['HyperParameters'] = hyperparameters
 
@@ -306,7 +312,7 @@ def tune(self, job_name, strategy, objective_type, objective_metric_name,
             metric_definitions (list[dict]): A list of dictionaries that defines the metric(s) used to evaluate the
                 training jobs. Each dictionary contains two keys: 'Name' for the name of the metric, and 'Regex' for
                 the regular expression used to extract the metric from the logs. This should be defined only for
-                hyperparameter tuning jobs that don't use an Amazon algorithm.
+                jobs that don't use an Amazon algorithm.
             role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and APIs
                 that create Amazon SageMaker endpoints use this role to access training data and model artifacts.
                 You must grant sufficient permissions to this role.
diff --git a/tests/unit/test_chainer.py b/tests/unit/test_chainer.py
@@ -121,7 +121,8 @@ def _create_train_job(version):
             'MaxRuntimeInSeconds': 24 * 60 * 60
         },
         'tags': None,
-        'vpc_config': None
+        'vpc_config': None,
+        'metric_definitions': None
     }
 
 
diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py
@@ -143,7 +143,8 @@ def test_framework_all_init_args(sagemaker_session):
                        sagemaker_session=sagemaker_session, train_volume_size=123, train_volume_kms_key='volumekms',
                        train_max_run=456, input_mode='inputmode', output_path='outputpath', output_kms_key='outputkms',
                        base_job_name='basejobname', tags=[{'foo': 'bar'}], subnets=['123', '456'],
-                       security_group_ids=['789', '012'])
+                       security_group_ids=['789', '012'],
+                       metric_definitions=[{'Name': 'validation-rmse', 'Regex': 'validation-rmse=(\\d+)'}])
     _TrainingJob.start_new(f, 's3://mydata')
     sagemaker_session.train.assert_called_once()
     _, args = sagemaker_session.train.call_args
@@ -158,7 +159,8 @@ def test_framework_all_init_args(sagemaker_session):
                     'stop_condition': {'MaxRuntimeInSeconds': 456},
                     'role': sagemaker_session.expand_role(), 'job_name': None,
                     'resource_config': {'VolumeSizeInGB': 123, 'InstanceCount': 3, 'VolumeKmsKeyId': 'volumekms',
-                                        'InstanceType': 'ml.m4.xlarge'}}
+                                        'InstanceType': 'ml.m4.xlarge'},
+                    'metric_definitions': [{'Name': 'validation-rmse', 'Regex': 'validation-rmse=(\\d+)'}]}
 
 
 def test_sagemaker_s3_uri_invalid(sagemaker_session):
@@ -711,7 +713,8 @@ def test_unsupported_type_in_dict():
     },
     'stop_condition': {'MaxRuntimeInSeconds': 86400},
     'tags': None,
-    'vpc_config': None
+    'vpc_config': None,
+    'metric_definitions': None
 }
 
 INPUT_CONFIG = [{
diff --git a/tests/unit/test_mxnet.py b/tests/unit/test_mxnet.py
@@ -96,7 +96,8 @@ def _create_train_job(version):
             'MaxRuntimeInSeconds': 24 * 60 * 60
         },
         'tags': None,
-        'vpc_config': None
+        'vpc_config': None,
+        'metric_definitions': None
     }
 
 
diff --git a/tests/unit/test_pytorch.py b/tests/unit/test_pytorch.py
@@ -112,7 +112,8 @@ def _create_train_job(version):
             'MaxRuntimeInSeconds': 24 * 60 * 60
         },
         'tags': None,
-        'vpc_config': None
+        'vpc_config': None,
+        'metric_definitions': None
     }
 
 
diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py
@@ -184,6 +184,7 @@ def test_s3_input_all_arguments():
 JOB_NAME = 'jobname'
 TAGS = [{'Name': 'some-tag', 'Value': 'value-for-tag'}]
 VPC_CONFIG = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}
+METRIC_DEFINITONS = [{'Name': 'validation-rmse', 'Regex': 'validation-rmse=(\\d+)'}]
 
 DEFAULT_EXPECTED_TRAIN_JOB_ARGS = {
     'OutputDataConfig': {
@@ -268,7 +269,8 @@ def test_train_pack_to_request(sagemaker_session):
 
     sagemaker_session.train(image=IMAGE, input_mode='File', input_config=in_config, role=EXPANDED_ROLE,
                             job_name=JOB_NAME, output_config=out_config, resource_config=resource_config,
-                            hyperparameters=None, stop_condition=stop_cond, tags=None, vpc_config=VPC_CONFIG)
+                            hyperparameters=None, stop_condition=stop_cond, tags=None, vpc_config=VPC_CONFIG,
+                            metric_definitions=None)
 
     assert sagemaker_session.sagemaker_client.method_calls[0] == (
         'create_training_job', (), DEFAULT_EXPECTED_TRAIN_JOB_ARGS)
@@ -439,13 +441,15 @@ def test_train_pack_to_request_with_optional_params(sagemaker_session):
 
     sagemaker_session.train(image=IMAGE, input_mode='File', input_config=in_config, role=EXPANDED_ROLE,
                             job_name=JOB_NAME, output_config=out_config, resource_config=resource_config,
-                            vpc_config=VPC_CONFIG, hyperparameters=hyperparameters, stop_condition=stop_cond, tags=TAGS)
+                            vpc_config=VPC_CONFIG, hyperparameters=hyperparameters, stop_condition=stop_cond, tags=TAGS,
+                            metric_definitions=METRIC_DEFINITONS)
 
     _, _, actual_train_args = sagemaker_session.sagemaker_client.method_calls[0]
 
     assert actual_train_args['VpcConfig'] == VPC_CONFIG
     assert actual_train_args['HyperParameters'] == hyperparameters
     assert actual_train_args['Tags'] == TAGS
+    assert actual_train_args['AlgorithmSpecification']['MetricDefinitions'] == METRIC_DEFINITONS
 
 
 def test_transform_pack_to_request(sagemaker_session):
diff --git a/tests/unit/test_tf_estimator.py b/tests/unit/test_tf_estimator.py
@@ -117,7 +117,8 @@ def _create_train_job(tf_version, script_mode=False, repo_name=IMAGE_REPO_NAME,
             'MaxRuntimeInSeconds': 24 * 60 * 60
         },
         'tags': None,
-        'vpc_config': None
+        'vpc_config': None,
+        'metric_definitions': None
     }
 
 
diff --git a/tests/unit/test_tuner.py b/tests/unit/test_tuner.py
@@ -413,7 +413,8 @@ def test_deploy_default(tuner):
     returned_training_job_description = {
         'AlgorithmSpecification': {
             'TrainingInputMode': 'File',
-            'TrainingImage': IMAGE_NAME
+            'TrainingImage': IMAGE_NAME,
+            'MetricDefinitions': METRIC_DEFINTIONS,
         },
         'HyperParameters': {
             'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',

Original file line number	Diff line number	Diff line change
`@@ -121,7 +121,8 @@ def _create_train_job(version):`
`121`	`121`	`'MaxRuntimeInSeconds': 24 * 60 * 60`
`122`	`122`	`},`
`123`	`123`	`'tags': None,`
`124`		`- 'vpc_config': None`
	`124`	`+ 'vpc_config': None,`
	`125`	`+ 'metric_definitions': None`
`125`	`126`	`}`
`126`	`127`
`127`	`128`
Original file line number	Diff line number	Diff line change
`@@ -96,7 +96,8 @@ def _create_train_job(version):`
`96`	`96`	`'MaxRuntimeInSeconds': 24 * 60 * 60`
`97`	`97`	`},`
`98`	`98`	`'tags': None,`
`99`		`- 'vpc_config': None`
	`99`	`+ 'vpc_config': None,`
	`100`	`+ 'metric_definitions': None`
`100`	`101`	`}`
`101`	`102`
`102`	`103`
Original file line number	Diff line number	Diff line change
`@@ -112,7 +112,8 @@ def _create_train_job(version):`
`112`	`112`	`'MaxRuntimeInSeconds': 24 * 60 * 60`
`113`	`113`	`},`
`114`	`114`	`'tags': None,`
`115`		`- 'vpc_config': None`
	`115`	`+ 'vpc_config': None,`
	`116`	`+ 'metric_definitions': None`
`116`	`117`	`}`
`117`	`118`
`118`	`119`
Original file line number	Diff line number	Diff line change
`@@ -117,7 +117,8 @@ def _create_train_job(tf_version, script_mode=False, repo_name=IMAGE_REPO_NAME,`
`117`	`117`	`'MaxRuntimeInSeconds': 24 * 60 * 60`
`118`	`118`	`},`
`119`	`119`	`'tags': None,`
`120`		`- 'vpc_config': None`
	`120`	`+ 'vpc_config': None,`
	`121`	`+ 'metric_definitions': None`
`121`	`122`	`}`
`122`	`123`
`123`	`124`