Add option for not including estimator metadata in hyperparameter tuning job #237

Merged
merged 3 commits on Jun 18, 2018
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -5,6 +5,7 @@ CHANGELOG
1.4.3dev
========
* feature: Allow Local Serving of Models in S3
* enhancement: Add option for ``HyperparameterTuner`` to not include estimator metadata in tuning jobs


1.4.2
8 changes: 8 additions & 0 deletions README.rst
@@ -321,6 +321,14 @@ In addition, the ``fit()`` call uses a list of ``RecordSet`` objects instead of
# Start hyperparameter tuning job
my_tuner.fit([train_records, test_records])

To aid in attaching a previously-started hyperparameter tuning job to a ``HyperparameterTuner`` instance, ``fit()`` injects metadata about the estimator class into the static hyperparameters by default.
If the algorithm you are using cannot handle unknown hyperparameters (e.g. an Amazon SageMaker built-in algorithm that does not have a custom estimator in the Python SDK), you can set ``include_cls_metadata`` to ``False`` when calling ``fit()``:

.. code:: python

    my_tuner.fit({'train': 's3://my_bucket/my_training_data', 'test': 's3://my_bucket/my_testing_data'},
                 include_cls_metadata=False)

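This metadata is what allows ``HyperparameterTuner.attach()`` to recreate the correct estimator class later. A minimal sketch of reattaching, assuming a job that was started with metadata included (the job name below is hypothetical):

.. code:: python

    from sagemaker.tuner import HyperparameterTuner

    # Reattach to a previously-started tuning job (hypothetical job name).
    # This relies on the injected metadata; a job started with
    # include_cls_metadata=False may not be re-attachable this way.
    attached_tuner = HyperparameterTuner.attach('my-tuning-job-name')
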
There is also an analytics object associated with each ``HyperparameterTuner`` instance that presents useful information about the hyperparameter tuning job.
For example, the ``dataframe`` method returns a pandas dataframe summarizing the associated training jobs:

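.. code:: python

    # A minimal sketch: summarize the tuning job's training jobs as a pandas DataFrame
    df = my_tuner.analytics().dataframe()
    df.head()
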
8 changes: 4 additions & 4 deletions src/sagemaker/tuner.py
@@ -204,7 +204,7 @@ def __init__(self, estimator, objective_metric_name, hyperparameter_ranges, metr
self._current_job_name = None
self.latest_tuning_job = None

def _prepare_for_training(self, job_name=None):
def _prepare_for_training(self, job_name=None, include_cls_metadata=True):
if job_name is not None:
self._current_job_name = job_name
else:
@@ -217,12 +217,12 @@ def _prepare_for_training(self, job_name=None):

# For attach() to know what estimator to use for non-1P algorithms
# (1P algorithms don't accept extra hyperparameters)
if not isinstance(self.estimator, AmazonAlgorithmEstimatorBase):
if include_cls_metadata and not isinstance(self.estimator, AmazonAlgorithmEstimatorBase):
self.static_hyperparameters[self.SAGEMAKER_ESTIMATOR_CLASS_NAME] = json.dumps(
self.estimator.__class__.__name__)
self.static_hyperparameters[self.SAGEMAKER_ESTIMATOR_MODULE] = json.dumps(self.estimator.__module__)

def fit(self, inputs, job_name=None, **kwargs):
def fit(self, inputs, job_name=None, include_cls_metadata=True, **kwargs):
"""Start a hyperparameter tuning job.

Args:
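    include_cls_metadata (bool): Whether or not ``fit()`` should include metadata about
        the estimator class in the static hyperparameters it submits with the tuning
        job (default: True). Set this to False for algorithms that cannot handle
        unknown hyperparameters.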
@@ -253,7 +253,7 @@ def fit(self, inputs, job_name=None, **kwargs):
else:
self.estimator._prepare_for_training(job_name)

self._prepare_for_training(job_name=job_name)
self._prepare_for_training(job_name=job_name, include_cls_metadata=include_cls_metadata)
self.latest_tuning_job = _TuningJob.start_new(self, inputs)

@classmethod
91 changes: 88 additions & 3 deletions tests/integ/test_tuner.py
@@ -13,19 +13,24 @@
from __future__ import absolute_import

import gzip
import io
import json
import os
import pickle
import sys
import time

import boto3
import numpy as np
import pytest

from sagemaker import LDA, RandomCutForest
from sagemaker.amazon.common import read_records
from sagemaker.amazon.kmeans import KMeans
from sagemaker import KMeans, LDA, RandomCutForest
from sagemaker.amazon.amazon_estimator import registry
from sagemaker.amazon.common import read_records, write_numpy_to_dense_tensor
from sagemaker.chainer import Chainer
from sagemaker.estimator import Estimator
from sagemaker.mxnet.estimator import MXNet
from sagemaker.predictor import json_deserializer
from sagemaker.tensorflow import TensorFlow
from sagemaker.tuner import IntegerParameter, ContinuousParameter, CategoricalParameter, HyperparameterTuner
from tests.integ import DATA_DIR
@@ -307,3 +312,83 @@ def test_tuning_chainer(sagemaker_session):
data = np.zeros((batch_size, 28, 28), dtype='float32')
output = predictor.predict(data)
assert len(output) == batch_size


@pytest.mark.continuous_testing
def test_tuning_byo_estimator(sagemaker_session):
"""Use Factorization Machines algorithm as an example here.

First we need to prepare data for training. We take standard data set, convert it to the
format that the algorithm can process and upload it to S3.
Then we create the Estimator and set hyperparamets as required by the algorithm.
Next, we can call fit() with path to the S3.
Later the trained model is deployed and prediction is called against the endpoint.
Default predictor is updated with json serializer and deserializer.
"""
image_name = registry(sagemaker_session.boto_session.region_name) + '/factorization-machines:1'

with timeout(minutes=15):
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

with gzip.open(data_path, 'rb') as f:
train_set, _, _ = pickle.load(f, **pickle_args)

# take 100 examples for faster execution
vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')

buf = io.BytesIO()
write_numpy_to_dense_tensor(buf, vectors, labels)
buf.seek(0)

bucket = sagemaker_session.default_bucket()
prefix = 'test_byo_estimator'
key = 'recordio-pb-data'
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)

estimator = Estimator(image_name=image_name,
role='SageMakerRole', train_instance_count=1,
train_instance_type='ml.c4.xlarge',
sagemaker_session=sagemaker_session, base_job_name='test-byo')

estimator.set_hyperparameters(num_factors=10,
feature_dim=784,
mini_batch_size=100,
predictor_type='binary_classifier')

hyperparameter_ranges = {'mini_batch_size': IntegerParameter(100, 200)}

tuner = HyperparameterTuner(estimator=estimator, base_tuning_job_name='byo',
objective_metric_name='test:binary_classification_accuracy',
hyperparameter_ranges=hyperparameter_ranges,
max_jobs=2, max_parallel_jobs=2)

tuner.fit({'train': s3_train_data, 'test': s3_train_data}, include_cls_metadata=False)

print('Started hyperparameter tuning job with name: ' + tuner.latest_tuning_job.name)

time.sleep(15)
tuner.wait()

best_training_job = tuner.best_training_job()
with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
predictor = tuner.deploy(1, 'ml.m4.xlarge', endpoint_name=best_training_job)
predictor.serializer = _fm_serializer
predictor.content_type = 'application/json'
predictor.deserializer = json_deserializer

result = predictor.predict(train_set[0][:10])

assert len(result['predictions']) == 10
for prediction in result['predictions']:
assert prediction['score'] is not None


# Serializer for the Factorization Machines predictor (for BYO example)
def _fm_serializer(data):
js = {'instances': []}
for row in data:
js['instances'].append({'features': row.tolist()})
return json.dumps(js)
15 changes: 15 additions & 0 deletions tests/unit/test_tuner.py
@@ -159,6 +159,21 @@ def test_prepare_for_training(tuner):
assert tuner.static_hyperparameters['sagemaker_estimator_module'] == module


def test_prepare_for_training_with_amazon_estimator(tuner, sagemaker_session):
tuner.estimator = PCA(ROLE, TRAIN_INSTANCE_COUNT, TRAIN_INSTANCE_TYPE, NUM_COMPONENTS,
sagemaker_session=sagemaker_session)

tuner._prepare_for_training()
assert 'sagemaker_estimator_class_name' not in tuner.static_hyperparameters
assert 'sagemaker_estimator_module' not in tuner.static_hyperparameters


def test_prepare_for_training_dont_include_estimator_cls(tuner):
tuner._prepare_for_training(include_cls_metadata=False)
assert 'sagemaker_estimator_class_name' not in tuner.static_hyperparameters
assert 'sagemaker_estimator_module' not in tuner.static_hyperparameters


def test_prepare_for_training_with_job_name(tuner):
static_hyperparameters = {'validated': 1, 'another_one': 0}
tuner.estimator.set_hyperparameters(**static_hyperparameters)