diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 3c81ca055c..fe2ddd6b5d 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -5,6 +5,7 @@ CHANGELOG
 1.4.3dev
 ========
 
 * feature: Allow Local Serving of Models in S3
+* enhancement: Allow option for ``HyperparameterTuner`` to not include estimator metadata in job
 
 1.4.2
diff --git a/README.rst b/README.rst
index 6cced902f2..eff0b4ecc3 100644
--- a/README.rst
+++ b/README.rst
@@ -321,6 +321,14 @@ In addition, the ``fit()`` call uses a list of ``RecordSet`` objects instead of
 
     # Start hyperparameter tuning job
     my_tuner.fit([train_records, test_records])
 
+To help ``attach()`` re-create the correct estimator for a previously started hyperparameter tuning job, ``fit()`` injects estimator metadata into the hyperparameters by default.
+If the algorithm you are using cannot handle unknown hyperparameters (e.g. an Amazon ML algorithm that does not have a custom estimator in the Python SDK), you can set ``include_cls_metadata`` to ``False`` when calling ``fit()``:
+
+.. code:: python
+
+    my_tuner.fit({'train': 's3://my_bucket/my_training_data', 'test': 's3://my_bucket/my_testing_data'},
+                 include_cls_metadata=False)
+
 There is also an analytics object associated with each ``HyperparameterTuner`` instance that presents useful information about the hyperparameter tuning job.
 For example, the ``dataframe`` method gets a pandas dataframe summarizing the associated training jobs:
diff --git a/src/sagemaker/tuner.py b/src/sagemaker/tuner.py
index 24b6ec7bb0..e721704844 100644
--- a/src/sagemaker/tuner.py
+++ b/src/sagemaker/tuner.py
@@ -204,7 +204,7 @@ def __init__(self, estimator, objective_metric_name, hyperparameter_ranges, metr
         self._current_job_name = None
         self.latest_tuning_job = None
 
-    def _prepare_for_training(self, job_name=None):
+    def _prepare_for_training(self, job_name=None, include_cls_metadata=True):
         if job_name is not None:
             self._current_job_name = job_name
         else:
@@ -217,12 +217,12 @@ def _prepare_for_training(self, job_name=None):
 
         # For attach() to know what estimator to use for non-1P algorithms
         # (1P algorithms don't accept extra hyperparameters)
-        if not isinstance(self.estimator, AmazonAlgorithmEstimatorBase):
+        if include_cls_metadata and not isinstance(self.estimator, AmazonAlgorithmEstimatorBase):
             self.static_hyperparameters[self.SAGEMAKER_ESTIMATOR_CLASS_NAME] = json.dumps(
                 self.estimator.__class__.__name__)
             self.static_hyperparameters[self.SAGEMAKER_ESTIMATOR_MODULE] = json.dumps(self.estimator.__module__)
 
-    def fit(self, inputs, job_name=None, **kwargs):
+    def fit(self, inputs, job_name=None, include_cls_metadata=True, **kwargs):
         """Start a hyperparameter tuning job.
 
         Args:
@@ -253,7 +253,7 @@ def fit(self, inputs, job_name=None, **kwargs):
         else:
             self.estimator._prepare_for_training(job_name)
 
-        self._prepare_for_training(job_name=job_name)
+        self._prepare_for_training(job_name=job_name, include_cls_metadata=include_cls_metadata)
         self.latest_tuning_job = _TuningJob.start_new(self, inputs)
 
     @classmethod
diff --git a/tests/integ/test_tuner.py b/tests/integ/test_tuner.py
index bae9899f0a..5de9f1ffd6 100644
--- a/tests/integ/test_tuner.py
+++ b/tests/integ/test_tuner.py
@@ -13,19 +13,24 @@
 from __future__ import absolute_import
 
 import gzip
+import io
+import json
 import os
 import pickle
 import sys
 import time
 
+import boto3
 import numpy as np
 import pytest
 
-from sagemaker import LDA, RandomCutForest
-from sagemaker.amazon.common import read_records
-from sagemaker.amazon.kmeans import KMeans
+from sagemaker import KMeans, LDA, RandomCutForest
+from sagemaker.amazon.amazon_estimator import registry
+from sagemaker.amazon.common import read_records, write_numpy_to_dense_tensor
 from sagemaker.chainer import Chainer
+from sagemaker.estimator import Estimator
 from sagemaker.mxnet.estimator import MXNet
+from sagemaker.predictor import json_deserializer
 from sagemaker.tensorflow import TensorFlow
 from sagemaker.tuner import IntegerParameter, ContinuousParameter, CategoricalParameter, HyperparameterTuner
 from tests.integ import DATA_DIR
@@ -307,3 +312,83 @@ def test_tuning_chainer(sagemaker_session):
         data = np.zeros((batch_size, 28, 28), dtype='float32')
         output = predictor.predict(data)
         assert len(output) == batch_size
+
+
+@pytest.mark.continuous_testing
+def test_tuning_byo_estimator(sagemaker_session):
+    """Use the Factorization Machines algorithm as an example here.
+
+    First we prepare the data for training: we take a standard dataset, convert it to the
+    format that the algorithm can process, and upload it to S3.
+    Then we create the Estimator and set the hyperparameters required by the algorithm.
+    Next, we call fit() with the S3 path to the training data.
+    The trained model is then deployed, and predictions are run against the endpoint.
+    The default predictor is updated with a JSON serializer and deserializer.
+    """
+    image_name = registry(sagemaker_session.boto_session.region_name) + '/factorization-machines:1'
+
+    with timeout(minutes=15):
+        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
+        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
+
+        with gzip.open(data_path, 'rb') as f:
+            train_set, _, _ = pickle.load(f, **pickle_args)
+
+        # take only 100 examples for faster execution
+        vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
+        labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')
+
+        buf = io.BytesIO()
+        write_numpy_to_dense_tensor(buf, vectors, labels)
+        buf.seek(0)
+
+        bucket = sagemaker_session.default_bucket()
+        prefix = 'test_byo_estimator'
+        key = 'recordio-pb-data'
+        boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
+        s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
+
+        estimator = Estimator(image_name=image_name,
+                              role='SageMakerRole', train_instance_count=1,
+                              train_instance_type='ml.c4.xlarge',
+                              sagemaker_session=sagemaker_session, base_job_name='test-byo')
+
+        estimator.set_hyperparameters(num_factors=10,
+                                      feature_dim=784,
+                                      mini_batch_size=100,
+                                      predictor_type='binary_classifier')
+
+        hyperparameter_ranges = {'mini_batch_size': IntegerParameter(100, 200)}
+
+        tuner = HyperparameterTuner(estimator=estimator, base_tuning_job_name='byo',
+                                    objective_metric_name='test:binary_classification_accuracy',
+                                    hyperparameter_ranges=hyperparameter_ranges,
+                                    max_jobs=2, max_parallel_jobs=2)
+
+        tuner.fit({'train': s3_train_data, 'test': s3_train_data}, include_cls_metadata=False)
+
+        print('Started hyperparameter tuning job with name: ' + tuner.latest_tuning_job.name)
+
+        time.sleep(15)
+        tuner.wait()
+
+    best_training_job = tuner.best_training_job()
+    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
+        predictor = tuner.deploy(1, 'ml.m4.xlarge', endpoint_name=best_training_job)
+        predictor.serializer = _fm_serializer
+        predictor.content_type = 'application/json'
+        predictor.deserializer = json_deserializer
+
+        result = predictor.predict(train_set[0][:10])
+
+        assert len(result['predictions']) == 10
+        for prediction in result['predictions']:
+            assert prediction['score'] is not None
+
+
+# Serializer for the Factorization Machines predictor (for the BYO example)
+def _fm_serializer(data):
+    js = {'instances': []}
+    for row in data:
+        js['instances'].append({'features': row.tolist()})
+    return json.dumps(js)
diff --git a/tests/unit/test_tuner.py b/tests/unit/test_tuner.py
index a1f2d21515..81de313f1d 100644
--- a/tests/unit/test_tuner.py
+++ b/tests/unit/test_tuner.py
@@ -159,6 +159,21 @@ def test_prepare_for_training(tuner):
     assert tuner.static_hyperparameters['sagemaker_estimator_module'] == module
 
 
+def test_prepare_for_training_with_amazon_estimator(tuner, sagemaker_session):
+    tuner.estimator = PCA(ROLE, TRAIN_INSTANCE_COUNT, TRAIN_INSTANCE_TYPE, NUM_COMPONENTS,
+                          sagemaker_session=sagemaker_session)
+
+    tuner._prepare_for_training()
+    assert 'sagemaker_estimator_class_name' not in tuner.static_hyperparameters
+    assert 'sagemaker_estimator_module' not in tuner.static_hyperparameters
+
+
+def test_prepare_for_training_dont_include_estimator_cls(tuner):
+    tuner._prepare_for_training(include_cls_metadata=False)
+    assert 'sagemaker_estimator_class_name' not in tuner.static_hyperparameters
+    assert 'sagemaker_estimator_module' not in tuner.static_hyperparameters
+
+
 def test_prepare_for_training_with_job_name(tuner):
     static_hyperparameters = {'validated': 1, 'another_one': 0}
     tuner.estimator.set_hyperparameters(**static_hyperparameters)
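
For context, a minimal sketch of the new flag with a generic bring-your-own container, mirroring the integration test above; the container image, role, metric name, regex, and S3 path are illustrative placeholders rather than values from this change:

.. code:: python

    from sagemaker.estimator import Estimator
    from sagemaker.tuner import ContinuousParameter, HyperparameterTuner

    # Placeholder image and role; any algorithm that rejects unknown
    # hyperparameters is the intended use case for include_cls_metadata=False.
    estimator = Estimator(image_name='123456789012.dkr.ecr.us-west-2.amazonaws.com/my-algo:latest',
                          role='SageMakerRole',
                          train_instance_count=1,
                          train_instance_type='ml.c4.xlarge')
    estimator.set_hyperparameters(epochs=10)

    tuner = HyperparameterTuner(estimator=estimator,
                                objective_metric_name='validation:accuracy',
                                hyperparameter_ranges={'learning_rate': ContinuousParameter(0.01, 0.2)},
                                metric_definitions=[{'Name': 'validation:accuracy',
                                                     'Regex': 'accuracy=([0-9\\.]+)'}],
                                max_jobs=4, max_parallel_jobs=2)

    # With include_cls_metadata=False, _prepare_for_training() skips injecting
    # sagemaker_estimator_class_name and sagemaker_estimator_module, so the
    # container sees only the hyperparameters it declares.
    tuner.fit('s3://my_bucket/my_training_data', include_cls_metadata=False)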