aws · winstonaws · Sep 15, 2018 · Sep 14, 2018 · Sep 14, 2018
@@ -43,28 +43,22 @@ def __init__(self, role, train_instance_count, train_instance_type, k, sample_si
                  dimension_reduction_type=None, dimension_reduction_target=None, index_type=None,
                  index_metric=None, faiss_index_ivf_nlists=None, faiss_index_pq_m=None, **kwargs):
         """k-nearest neighbors (KNN) is :class:`Estimator` used for classification and regression.
-
         This Estimator may be fit via calls to
         :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`. It requires Amazon
         :class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to be stored in S3.
         There is an utility :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set` that
         can be used to upload data to S3 and creates :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be passed
         to the `fit` call.
-
         To learn more about the Amazon protobuf Record class and how to prepare bulk data in this format, please
         consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html
-
         After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker
         Endpoint by invoking :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as deploying an Endpoint,
         deploy returns a :class:`~sagemaker.amazon.knn.KNNPredictor` object that can be used
         for inference calls using the trained model hosted in the SageMaker Endpoint.
-
         KNN Estimators can be configured by setting hyperparameters. The available hyperparameters for
         KNN are documented below.
-
         For further information on the AWS KNN algorithm,
         please consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/knn.html
-
         Args:
             role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and
                 APIs that create Amazon SageMaker endpoints use this role to access
@@ -76,17 +70,17 @@ def __init__(self, role, train_instance_count, train_instance_type, k, sample_si
             predictor_type (str): Required. Type of inference to use on the data's labels,
                 allowed values are 'classifier' and 'regressor'.
             dimension_reduction_type (str): Optional. Type of dimension reduction technique to use.
-                Valid values: “sign”, “fjlt”
+                Valid values: "sign", "fjlt"
             dimension_reduction_target (int): Optional. Target dimension to reduce to. Required when
                 dimension_reduction_type is specified.
             index_type (str): Optional. Type of index to use. Valid values are
-                “faiss.Flat”, “faiss.IVFFlat”, “faiss.IVFPQ”.
+                "faiss.Flat", "faiss.IVFFlat", "faiss.IVFPQ".
             index_metric(str): Optional. Distance metric to measure between points when finding nearest neighbors.
                 Valid values are "COSINE", "INNER_PRODUCT", "L2"
             faiss_index_ivf_nlists(str): Optional. Number of centroids to construct in the index if
-                index_type is “faiss.IVFFlat” or “faiss.IVFPQ”.
+                index_type is "faiss.IVFFlat" or "faiss.IVFPQ".
             faiss_index_pq_m(int): Optional. Number of vector sub-components to construct in the index,
-                if index_type is “faiss.IVFPQ”.
+                if index_type is "faiss.IVFPQ".
             **kwargs: base class keyword argument values.
         """
 

@@ -46,7 +46,7 @@ class EstimatorBase(with_metaclass(ABCMeta, object)):
     """
 
     def __init__(self, role, train_instance_count, train_instance_type,
-                 train_volume_size=30, train_max_run=24 * 60 * 60, input_mode='File',
+                 train_volume_size=30, train_volume_kms_key=None, train_max_run=24 * 60 * 60, input_mode='File',
                  output_path=None, output_kms_key=None, base_job_name=None, sagemaker_session=None, tags=None,
                  subnets=None, security_group_ids=None):
         """Initialize an ``EstimatorBase`` instance.
@@ -61,6 +61,8 @@ def __init__(self, role, train_instance_count, train_instance_type,
             train_volume_size (int): Size in GB of the EBS volume to use for storing input data
                 during training (default: 30). Must be large enough to store training data if File Mode is used
                 (which is the default).
+            train_volume_kms_key (str): Optional. KMS key ID for encrypting the EBS volume attached to the
+                training instance (default: None).
             train_max_run (int): Timeout in seconds for training (default: 24 * 60 * 60).
                 After this amount of time Amazon SageMaker terminates the job regardless of its current status.
             input_mode (str): The input mode that the algorithm supports (default: 'File'). Valid modes:
@@ -87,6 +89,7 @@ def __init__(self, role, train_instance_count, train_instance_type,
         self.train_instance_count = train_instance_count
         self.train_instance_type = train_instance_type
         self.train_volume_size = train_volume_size
+        self.train_volume_kms_key = train_volume_kms_key
         self.train_max_run = train_max_run
         self.input_mode = input_mode
         self.tags = tags
@@ -427,9 +430,9 @@ class Estimator(EstimatorBase):
     """
 
     def __init__(self, image_name, role, train_instance_count, train_instance_type,
-                 train_volume_size=30, train_max_run=24 * 60 * 60, input_mode='File',
-                 output_path=None, output_kms_key=None, base_job_name=None, sagemaker_session=None,
-                 hyperparameters=None, tags=None, subnets=None, security_group_ids=None):
+                 train_volume_size=30, train_volume_kms_key=None, train_max_run=24 * 60 * 60,
+                 input_mode='File', output_path=None, output_kms_key=None, base_job_name=None,
+                 sagemaker_session=None, hyperparameters=None, tags=None, subnets=None, security_group_ids=None):
         """Initialize an ``Estimator`` instance.
 
         Args:
@@ -443,6 +446,8 @@ def __init__(self, image_name, role, train_instance_count, train_instance_type,
             train_volume_size (int): Size in GB of the EBS volume to use for storing input data
                 during training (default: 30). Must be large enough to store training data if File Mode is used
                 (which is the default).
+            train_volume_kms_key (str): Optional. KMS key ID for encrypting the EBS volume attached to the
+                training instance (default: None).
             train_max_run (int): Timeout in seconds for training (default: 24 * 60 * 60).
                 After this amount of time Amazon SageMaker terminates the job regardless of its current status.
             input_mode (str): The input mode that the algorithm supports (default: 'File'). Valid modes:
@@ -462,11 +467,16 @@ def __init__(self, image_name, role, train_instance_count, train_instance_type,
                 Amazon SageMaker APIs and any other AWS services needed. If not specified, the estimator creates one
                 using the default AWS configuration chain.
             hyperparameters (dict): Dictionary containing the hyperparameters to initialize this estimator with.
+            tags (list[dict]): List of tags for labeling a training job. For more information, see
+                https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html.
+            subnets (list[str]): List of subnet IDs. If not specified, the training job is created without VPC config.
+            security_group_ids (list[str]): List of security group IDs. If not specified, the training job is
+                created without VPC config.
         """
         self.image_name = image_name
         self.hyperparam_dict = hyperparameters.copy() if hyperparameters else {}
         super(Estimator, self).__init__(role, train_instance_count, train_instance_type,
-                                        train_volume_size, train_max_run, input_mode,
+                                        train_volume_size, train_volume_kms_key, train_max_run, input_mode,
                                         output_path, output_kms_key, base_job_name, sagemaker_session,
                                         tags, subnets, security_group_ids)
 

@@ -57,7 +57,8 @@ def _load_config(inputs, estimator):
         output_config = _Job._prepare_output_config(estimator.output_path, estimator.output_kms_key)
         resource_config = _Job._prepare_resource_config(estimator.train_instance_count,
                                                         estimator.train_instance_type,
-                                                        estimator.train_volume_size)
+                                                        estimator.train_volume_size,
+                                                        estimator.train_volume_kms_key)
         stop_condition = _Job._prepare_stop_condition(estimator.train_max_run)
         vpc_config = _Job._prepare_vpc_config(estimator.subnets, estimator.security_group_ids)
 
@@ -140,10 +141,14 @@ def _prepare_output_config(s3_path, kms_key_id):
         return config
 
     @staticmethod
-    def _prepare_resource_config(instance_count, instance_type, volume_size):
-        return {'InstanceCount': instance_count,
-                'InstanceType': instance_type,
-                'VolumeSizeInGB': volume_size}
+    def _prepare_resource_config(instance_count, instance_type, volume_size, train_volume_kms_key=None):
+        """Build the ResourceConfig dict; ``VolumeKmsKeyId`` is included only when a KMS key is given."""
+        resource_config = {'InstanceCount': instance_count, 'InstanceType': instance_type,
+                           'VolumeSizeInGB': volume_size}
+        if train_volume_kms_key is not None:
+            resource_config['VolumeKmsKeyId'] = train_volume_kms_key
+
+        return resource_config
 
     @staticmethod
     def _prepare_vpc_config(subnets, security_group_ids):

@@ -259,9 +259,9 @@ def train(self, image, input_mode, input_config, role, job_name, output_config,
             'InputDataConfig': input_config,
             'OutputDataConfig': output_config,
             'TrainingJobName': job_name,
-            "StoppingCondition": stop_condition,
-            "ResourceConfig": resource_config,
-            "RoleArn": role,
+            'StoppingCondition': stop_condition,
+            'ResourceConfig': resource_config,
+            'RoleArn': role,
         }
 
         if hyperparameters and len(hyperparameters) > 0:

@@ -44,23 +44,6 @@
 TAGS = [{'Name': 'some-tag', 'Value': 'value-for-tag'}]
 OUTPUT_PATH = 's3://bucket/prefix'
 
-COMMON_TRAIN_ARGS = {
-    'volume_size': 30,
-    'hyperparameters': {
-        'sagemaker_program': 'dummy_script.py',
-        'sagemaker_enable_cloudwatch_metrics': False,
-        'sagemaker_container_log_level': logging.INFO,
-    },
-    'input_mode': 'File',
-    'instance_type': 'c4.4xlarge',
-    'inputs': 's3://mybucket/train',
-    'instance_count': 1,
-    'role': 'DummyRole',
-    'kms_key_id': None,
-    'max_run': 24,
-    'wait': True,
-}
-
 DESCRIBE_TRAINING_JOB_RESULT = {
     'ModelArtifacts': {
         'S3ModelArtifacts': MODEL_DATA
@@ -119,6 +102,29 @@ def sagemaker_session():
     return sms
 
 
+def test_framework_all_init_args(sagemaker_session):
+    f = DummyFramework('my_script.py', role='DummyRole', train_instance_count=3, train_instance_type='ml.m4.xlarge',
+                       sagemaker_session=sagemaker_session, train_volume_size=123, train_volume_kms_key='volumekms',
+                       train_max_run=456, input_mode='inputmode', output_path='outputpath', output_kms_key='outputkms',
+                       base_job_name='basejobname', tags=[{'foo': 'bar'}], subnets=['123', '456'],
+                       security_group_ids=['789', '012'])
+    _TrainingJob.start_new(f, 's3://mydata')
+    sagemaker_session.train.assert_called_once()
+    _, args = sagemaker_session.train.call_args
+    assert args == {'input_mode': 'inputmode', 'tags': [{'foo': 'bar'}], 'hyperparameters': {}, 'image': 'fakeimage',
+                    'input_config': [{'ChannelName': 'training',
+                                      'DataSource': {
+                                          'S3DataSource': {'S3DataType': 'S3Prefix',
+                                                           'S3DataDistributionType': 'FullyReplicated',
+                                                           'S3Uri': 's3://mydata'}}}],
+                    'output_config': {'KmsKeyId': 'outputkms', 'S3OutputPath': 'outputpath'},
+                    'vpc_config': {'Subnets': ['123', '456'], 'SecurityGroupIds': ['789', '012']},
+                    'stop_condition': {'MaxRuntimeInSeconds': 456},
+                    'role': sagemaker_session.expand_role(), 'job_name': None,
+                    'resource_config': {'VolumeSizeInGB': 123, 'InstanceCount': 3, 'VolumeKmsKeyId': 'volumekms',
+                                        'InstanceType': 'ml.m4.xlarge'}}
+
+
 def test_sagemaker_s3_uri_invalid(sagemaker_session):
     with pytest.raises(ValueError) as error:
         t = DummyFramework(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,

@@ -30,12 +30,13 @@
 ROLE = 'DummyRole'
 IMAGE_NAME = 'fakeimage'
 JOB_NAME = 'fakejob'
+VOLUME_KMS_KEY = 'volkmskey'
 
 
 @pytest.fixture()
 def estimator(sagemaker_session):
-    return Estimator(IMAGE_NAME, ROLE, INSTANCE_COUNT, INSTANCE_TYPE, VOLUME_SIZE, MAX_RUNTIME,
-                     output_path=S3_OUTPUT_PATH, sagemaker_session=sagemaker_session)
+    return Estimator(IMAGE_NAME, ROLE, INSTANCE_COUNT, INSTANCE_TYPE, train_volume_size=VOLUME_SIZE,
+                     train_max_run=MAX_RUNTIME, output_path=S3_OUTPUT_PATH, sagemaker_session=sagemaker_session)
 
 
 @pytest.fixture()
@@ -282,11 +283,24 @@ def test_prepare_output_config_kms_key_none():
 
 
 def test_prepare_resource_config():
-    resource_config = _Job._prepare_resource_config(INSTANCE_COUNT, INSTANCE_TYPE, VOLUME_SIZE)
+    resource_config = _Job._prepare_resource_config(INSTANCE_COUNT, INSTANCE_TYPE, VOLUME_SIZE, None)
 
-    assert resource_config['InstanceCount'] == INSTANCE_COUNT
-    assert resource_config['InstanceType'] == INSTANCE_TYPE
-    assert resource_config['VolumeSizeInGB'] == VOLUME_SIZE
+    assert resource_config == {
+        'InstanceCount': INSTANCE_COUNT,
+        'InstanceType': INSTANCE_TYPE,
+        'VolumeSizeInGB': VOLUME_SIZE
+    }
+
+
+def test_prepare_resource_config_with_volume_kms():
+    resource_config = _Job._prepare_resource_config(INSTANCE_COUNT, INSTANCE_TYPE, VOLUME_SIZE, VOLUME_KMS_KEY)
+
+    assert resource_config == {
+        'InstanceCount': INSTANCE_COUNT,
+        'InstanceType': INSTANCE_TYPE,
+        'VolumeSizeInGB': VOLUME_SIZE,
+        'VolumeKmsKeyId': VOLUME_KMS_KEY
+    }
 
 
 def test_prepare_stop_condition():