diff --git a/src/sagemaker/amazon/knn.py b/src/sagemaker/amazon/knn.py index 3c47423c47..257c9c6722 100644 --- a/src/sagemaker/amazon/knn.py +++ b/src/sagemaker/amazon/knn.py @@ -43,28 +43,22 @@ def __init__(self, role, train_instance_count, train_instance_type, k, sample_si dimension_reduction_type=None, dimension_reduction_target=None, index_type=None, index_metric=None, faiss_index_ivf_nlists=None, faiss_index_pq_m=None, **kwargs): """k-nearest neighbors (KNN) is :class:`Estimator` used for classification and regression. - This Estimator may be fit via calls to :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`. It requires Amazon :class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to be stored in S3. There is an utility :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set` that can be used to upload data to S3 and creates :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be passed to the `fit` call. - To learn more about the Amazon protobuf Record class and how to prepare bulk data in this format, please consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html - After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker Endpoint by invoking :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as deploying an Endpoint, deploy returns a :class:`~sagemaker.amazon.knn.KNNPredictor` object that can be used for inference calls using the trained model hosted in the SageMaker Endpoint. - KNN Estimators can be configured by setting hyperparameters. The available hyperparameters for KNN are documented below. - For further information on the AWS KNN algorithm, please consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/knn.html - Args: role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and APIs that create Amazon SageMaker endpoints use this role to access @@ -76,17 +70,17 @@ def __init__(self, role, train_instance_count, train_instance_type, k, sample_si predictor_type (str): Required. Type of inference to use on the data's labels, allowed values are 'classifier' and 'regressor'. dimension_reduction_type (str): Optional. Type of dimension reduction technique to use. - Valid values: “sign”, “fjlt” + Valid values: "sign", "fjlt" dimension_reduction_target (int): Optional. Target dimension to reduce to. Required when dimension_reduction_type is specified. index_type (str): Optional. Type of index to use. Valid values are - “faiss.Flat”, “faiss.IVFFlat”, “faiss.IVFPQ”. + "faiss.Flat", "faiss.IVFFlat", "faiss.IVFPQ". index_metric(str): Optional. Distance metric to measure between points when finding nearest neighbors. Valid values are "COSINE", "INNER_PRODUCT", "L2" faiss_index_ivf_nlists(str): Optional. Number of centroids to construct in the index if - index_type is “faiss.IVFFlat” or “faiss.IVFPQ”. + index_type is "faiss.IVFFlat" or "faiss.IVFPQ". faiss_index_pq_m(int): Optional. Number of vector sub-components to construct in the index, - if index_type is “faiss.IVFPQ”. + if index_type is "faiss.IVFPQ". **kwargs: base class keyword argument values. """ diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index 0c9a10afed..25a3f044fb 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -46,7 +46,7 @@ class EstimatorBase(with_metaclass(ABCMeta, object)): """ def __init__(self, role, train_instance_count, train_instance_type, - train_volume_size=30, train_max_run=24 * 60 * 60, input_mode='File', + train_volume_size=30, train_volume_kms_key=None, train_max_run=24 * 60 * 60, input_mode='File', output_path=None, output_kms_key=None, base_job_name=None, sagemaker_session=None, tags=None, subnets=None, security_group_ids=None): """Initialize an ``EstimatorBase`` instance. @@ -61,6 +61,8 @@ def __init__(self, role, train_instance_count, train_instance_type, train_volume_size (int): Size in GB of the EBS volume to use for storing input data during training (default: 30). Must be large enough to store training data if File Mode is used (which is the default). + train_volume_kms_key (str): Optional. KMS key ID for encrypting EBS volume attached to the + training instance (default: None). train_max_run (int): Timeout in seconds for training (default: 24 * 60 * 60). After this amount of time Amazon SageMaker terminates the job regardless of its current status. input_mode (str): The input mode that the algorithm supports (default: 'File'). Valid modes: @@ -87,6 +89,7 @@ def __init__(self, role, train_instance_count, train_instance_type, self.train_instance_count = train_instance_count self.train_instance_type = train_instance_type self.train_volume_size = train_volume_size + self.train_volume_kms_key = train_volume_kms_key self.train_max_run = train_max_run self.input_mode = input_mode self.tags = tags @@ -427,9 +430,9 @@ class Estimator(EstimatorBase): """ def __init__(self, image_name, role, train_instance_count, train_instance_type, - train_volume_size=30, train_max_run=24 * 60 * 60, input_mode='File', - output_path=None, output_kms_key=None, base_job_name=None, sagemaker_session=None, - hyperparameters=None, tags=None, subnets=None, security_group_ids=None): + train_volume_size=30, train_volume_kms_key=None, train_max_run=24 * 60 * 60, + input_mode='File', output_path=None, output_kms_key=None, base_job_name=None, + sagemaker_session=None, hyperparameters=None, tags=None, subnets=None, security_group_ids=None): """Initialize an ``Estimator`` instance. Args: @@ -443,6 +446,8 @@ def __init__(self, image_name, role, train_instance_count, train_instance_type, train_volume_size (int): Size in GB of the EBS volume to use for storing input data during training (default: 30). Must be large enough to store training data if File Mode is used (which is the default). + train_volume_kms_key (str): Optional. KMS key ID for encrypting EBS volume attached to the + training instance (default: None). train_max_run (int): Timeout in seconds for training (default: 24 * 60 * 60). After this amount of time Amazon SageMaker terminates the job regardless of its current status. input_mode (str): The input mode that the algorithm supports (default: 'File'). Valid modes: @@ -462,11 +467,16 @@ def __init__(self, image_name, role, train_instance_count, train_instance_type, Amazon SageMaker APIs and any other AWS services needed. If not specified, the estimator creates one using the default AWS configuration chain. hyperparameters (dict): Dictionary containing the hyperparameters to initialize this estimator with. + tags (list[dict]): List of tags for labeling a training job. For more, see + https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html. + subnets (list[str]): List of subnet ids. If not specified training job will be created without VPC config. + security_group_ids (list[str]): List of security group ids. If not specified training job will be created + without VPC config. """ self.image_name = image_name self.hyperparam_dict = hyperparameters.copy() if hyperparameters else {} super(Estimator, self).__init__(role, train_instance_count, train_instance_type, - train_volume_size, train_max_run, input_mode, + train_volume_size, train_volume_kms_key, train_max_run, input_mode, output_path, output_kms_key, base_job_name, sagemaker_session, tags, subnets, security_group_ids) diff --git a/src/sagemaker/job.py b/src/sagemaker/job.py index 773350aa69..0e008e99f4 100644 --- a/src/sagemaker/job.py +++ b/src/sagemaker/job.py @@ -57,7 +57,8 @@ def _load_config(inputs, estimator): output_config = _Job._prepare_output_config(estimator.output_path, estimator.output_kms_key) resource_config = _Job._prepare_resource_config(estimator.train_instance_count, estimator.train_instance_type, - estimator.train_volume_size) + estimator.train_volume_size, + estimator.train_volume_kms_key) stop_condition = _Job._prepare_stop_condition(estimator.train_max_run) vpc_config = _Job._prepare_vpc_config(estimator.subnets, estimator.security_group_ids) @@ -140,10 +141,14 @@ def _prepare_output_config(s3_path, kms_key_id): return config @staticmethod - def _prepare_resource_config(instance_count, instance_type, volume_size): - return {'InstanceCount': instance_count, - 'InstanceType': instance_type, - 'VolumeSizeInGB': volume_size} + def _prepare_resource_config(instance_count, instance_type, volume_size, train_volume_kms_key): + resource_config = {'InstanceCount': instance_count, + 'InstanceType': instance_type, + 'VolumeSizeInGB': volume_size} + if train_volume_kms_key is not None: + resource_config['VolumeKmsKeyId'] = train_volume_kms_key + + return resource_config @staticmethod def _prepare_vpc_config(subnets, security_group_ids): diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index 68a89a6ac4..45f92f30cf 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -259,9 +259,9 @@ def train(self, image, input_mode, input_config, role, job_name, output_config, 'InputDataConfig': input_config, 'OutputDataConfig': output_config, 'TrainingJobName': job_name, - "StoppingCondition": stop_condition, - "ResourceConfig": resource_config, - "RoleArn": role, + 'StoppingCondition': stop_condition, + 'ResourceConfig': resource_config, + 'RoleArn': role, } if hyperparameters and len(hyperparameters) > 0: diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index d173f013fb..4ecb8ee85b 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -44,23 +44,6 @@ TAGS = [{'Name': 'some-tag', 'Value': 'value-for-tag'}] OUTPUT_PATH = 's3://bucket/prefix' -COMMON_TRAIN_ARGS = { - 'volume_size': 30, - 'hyperparameters': { - 'sagemaker_program': 'dummy_script.py', - 'sagemaker_enable_cloudwatch_metrics': False, - 'sagemaker_container_log_level': logging.INFO, - }, - 'input_mode': 'File', - 'instance_type': 'c4.4xlarge', - 'inputs': 's3://mybucket/train', - 'instance_count': 1, - 'role': 'DummyRole', - 'kms_key_id': None, - 'max_run': 24, - 'wait': True, -} - DESCRIBE_TRAINING_JOB_RESULT = { 'ModelArtifacts': { 'S3ModelArtifacts': MODEL_DATA @@ -119,6 +102,29 @@ def sagemaker_session(): return sms +def test_framework_all_init_args(sagemaker_session): + f = DummyFramework('my_script.py', role='DummyRole', train_instance_count=3, train_instance_type='ml.m4.xlarge', + sagemaker_session=sagemaker_session, train_volume_size=123, train_volume_kms_key='volumekms', + train_max_run=456, input_mode='inputmode', output_path='outputpath', output_kms_key='outputkms', + base_job_name='basejobname', tags=[{'foo': 'bar'}], subnets=['123', '456'], + security_group_ids=['789', '012']) + _TrainingJob.start_new(f, 's3://mydata') + sagemaker_session.train.assert_called_once() + _, args = sagemaker_session.train.call_args + assert args == {'input_mode': 'inputmode', 'tags': [{'foo': 'bar'}], 'hyperparameters': {}, 'image': 'fakeimage', + 'input_config': [{'ChannelName': 'training', + 'DataSource': { + 'S3DataSource': {'S3DataType': 'S3Prefix', + 'S3DataDistributionType': 'FullyReplicated', + 'S3Uri': 's3://mydata'}}}], + 'output_config': {'KmsKeyId': 'outputkms', 'S3OutputPath': 'outputpath'}, + 'vpc_config': {'Subnets': ['123', '456'], 'SecurityGroupIds': ['789', '012']}, + 'stop_condition': {'MaxRuntimeInSeconds': 456}, + 'role': sagemaker_session.expand_role(), 'job_name': None, + 'resource_config': {'VolumeSizeInGB': 123, 'InstanceCount': 3, 'VolumeKmsKeyId': 'volumekms', + 'InstanceType': 'ml.m4.xlarge'}} + + def test_sagemaker_s3_uri_invalid(sagemaker_session): with pytest.raises(ValueError) as error: t = DummyFramework(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session, diff --git a/tests/unit/test_job.py b/tests/unit/test_job.py index 4692641fb4..158e7229c4 100644 --- a/tests/unit/test_job.py +++ b/tests/unit/test_job.py @@ -30,12 +30,13 @@ ROLE = 'DummyRole' IMAGE_NAME = 'fakeimage' JOB_NAME = 'fakejob' +VOLUME_KMS_KEY = 'volkmskey' @pytest.fixture() def estimator(sagemaker_session): - return Estimator(IMAGE_NAME, ROLE, INSTANCE_COUNT, INSTANCE_TYPE, VOLUME_SIZE, MAX_RUNTIME, - output_path=S3_OUTPUT_PATH, sagemaker_session=sagemaker_session) + return Estimator(IMAGE_NAME, ROLE, INSTANCE_COUNT, INSTANCE_TYPE, train_volume_size=VOLUME_SIZE, + train_max_run=MAX_RUNTIME, output_path=S3_OUTPUT_PATH, sagemaker_session=sagemaker_session) @pytest.fixture() @@ -282,11 +283,24 @@ def test_prepare_output_config_kms_key_none(): def test_prepare_resource_config(): - resource_config = _Job._prepare_resource_config(INSTANCE_COUNT, INSTANCE_TYPE, VOLUME_SIZE) + resource_config = _Job._prepare_resource_config(INSTANCE_COUNT, INSTANCE_TYPE, VOLUME_SIZE, None) - assert resource_config['InstanceCount'] == INSTANCE_COUNT - assert resource_config['InstanceType'] == INSTANCE_TYPE - assert resource_config['VolumeSizeInGB'] == VOLUME_SIZE + assert resource_config == { + 'InstanceCount': INSTANCE_COUNT, + 'InstanceType': INSTANCE_TYPE, + 'VolumeSizeInGB': VOLUME_SIZE + } + + +def test_prepare_resource_config_with_volume_kms(): + resource_config = _Job._prepare_resource_config(INSTANCE_COUNT, INSTANCE_TYPE, VOLUME_SIZE, VOLUME_KMS_KEY) + + assert resource_config == { + 'InstanceCount': INSTANCE_COUNT, + 'InstanceType': INSTANCE_TYPE, + 'VolumeSizeInGB': VOLUME_SIZE, + 'VolumeKmsKeyId': VOLUME_KMS_KEY + } def test_prepare_stop_condition():