Add option for not including estimator metadata in hyperparameter tuning job #237

Merged
merged 3 commits on Jun 18, 2018
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -5,6 +5,7 @@ CHANGELOG
1.4.3dev
========
* feature: Allow Local Serving of Models in S3
* enhancement: Add option for ``HyperparameterTuner`` to not include estimator metadata in tuning jobs


1.4.2
8 changes: 8 additions & 0 deletions README.rst
@@ -321,6 +321,14 @@ In addition, the ``fit()`` call uses a list of ``RecordSet`` objects instead of
# Start hyperparameter tuning job
my_tuner.fit([train_records, test_records])

To aid in attaching a previously-started hyperparameter tuning job to a ``HyperparameterTuner`` instance, ``fit()`` injects metadata about the estimator class into the static hyperparameters by default.
If the algorithm you are using cannot handle unknown hyperparameters (e.g. an Amazon SageMaker built-in algorithm that does not have a custom estimator in the Python SDK), you can set ``include_cls_metadata`` to ``False`` when calling ``fit()``:

.. code:: python

    my_tuner.fit({'train': 's3://my_bucket/my_training_data', 'test': 's3://my_bucket/my_testing_data'},
                 include_cls_metadata=False)

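This metadata is what allows ``HyperparameterTuner.attach()`` to recreate the correct estimator class later. A minimal sketch of reattaching, assuming a job that was started with metadata included (the job name below is hypothetical):

.. code:: python

    from sagemaker.tuner import HyperparameterTuner

    # Reattach to a previously-started tuning job (hypothetical job name).
    # This relies on the injected metadata; a job started with
    # include_cls_metadata=False may not be re-attachable this way.
    attached_tuner = HyperparameterTuner.attach('my-tuning-job-name')
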
There is also an analytics object associated with each ``HyperparameterTuner`` instance that presents useful information about the hyperparameter tuning job.
For example, the ``dataframe`` method returns a pandas dataframe summarizing the associated training jobs:

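.. code:: python

    # A minimal sketch: summarize the tuning job's training jobs as a pandas DataFrame
    df = my_tuner.analytics().dataframe()
    df.head()
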
8 changes: 4 additions & 4 deletions src/sagemaker/tuner.py
@@ -204,7 +204,7 @@ def __init__(self, estimator, objective_metric_name, hyperparameter_ranges, metr
self._current_job_name = None
self.latest_tuning_job = None

def _prepare_for_training(self, job_name=None):
def _prepare_for_training(self, job_name=None, include_cls_metadata=True):
if job_name is not None:
self._current_job_name = job_name
else:
@@ -217,12 +217,12 @@ def _prepare_for_training(self, job_name=None):

# For attach() to know what estimator to use for non-1P algorithms
# (1P algorithms don't accept extra hyperparameters)
if not isinstance(self.estimator, AmazonAlgorithmEstimatorBase):
if include_cls_metadata and not isinstance(self.estimator, AmazonAlgorithmEstimatorBase):
self.static_hyperparameters[self.SAGEMAKER_ESTIMATOR_CLASS_NAME] = json.dumps(
self.estimator.__class__.__name__)
self.static_hyperparameters[self.SAGEMAKER_ESTIMATOR_MODULE] = json.dumps(self.estimator.__module__)

def fit(self, inputs, job_name=None, **kwargs):
def fit(self, inputs, job_name=None, include_cls_metadata=True, **kwargs):
"""Start a hyperparameter tuning job.

Args:
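    include_cls_metadata (bool): Whether or not ``fit()`` should include metadata about
        the estimator class in the static hyperparameters it submits with the tuning
        job (default: True). Set this to False for algorithms that cannot handle
        unknown hyperparameters.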
@@ -253,7 +253,7 @@ def fit(self, inputs, job_name=None, **kwargs):
else:
self.estimator._prepare_for_training(job_name)

self._prepare_for_training(job_name=job_name)
self._prepare_for_training(job_name=job_name, include_cls_metadata=include_cls_metadata)
self.latest_tuning_job = _TuningJob.start_new(self, inputs)

@classmethod
91 changes: 88 additions & 3 deletions tests/integ/test_tuner.py
@@ -13,19 +13,24 @@
from __future__ import absolute_import

import gzip
import io
import json
import os
import pickle
import sys
import time

import boto3
import numpy as np
import pytest

from sagemaker import LDA, RandomCutForest
from sagemaker.amazon.common import read_records
from sagemaker.amazon.kmeans import KMeans
from sagemaker import KMeans, LDA, RandomCutForest
from sagemaker.amazon.amazon_estimator import registry
from sagemaker.amazon.common import read_records, write_numpy_to_dense_tensor
from sagemaker.chainer import Chainer
from sagemaker.estimator import Estimator
from sagemaker.mxnet.estimator import MXNet
from sagemaker.predictor import json_deserializer
from sagemaker.tensorflow import TensorFlow
from sagemaker.tuner import IntegerParameter, ContinuousParameter, CategoricalParameter, HyperparameterTuner
from tests.integ import DATA_DIR
@@ -307,3 +312,83 @@ def test_tuning_chainer(sagemaker_session):
data = np.zeros((batch_size, 28, 28), dtype='float32')
output = predictor.predict(data)
assert len(output) == batch_size


@pytest.mark.continuous_testing
def test_tuning_byo_estimator(sagemaker_session):
"""Use Factorization Machines algorithm as an example here.

First we need to prepare data for training. We take standard data set, convert it to the
format that the algorithm can process and upload it to S3.
Then we create the Estimator and set hyperparamets as required by the algorithm.
Next, we can call fit() with path to the S3.
Later the trained model is deployed and prediction is called against the endpoint.
Default predictor is updated with json serializer and deserializer.
"""
image_name = registry(sagemaker_session.boto_session.region_name) + '/factorization-machines:1'

with timeout(minutes=15):
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

with gzip.open(data_path, 'rb') as f:
train_set, _, _ = pickle.load(f, **pickle_args)

# take 100 examples for faster execution
vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')

buf = io.BytesIO()
write_numpy_to_dense_tensor(buf, vectors, labels)
buf.seek(0)

bucket = sagemaker_session.default_bucket()
prefix = 'test_byo_estimator'
key = 'recordio-pb-data'
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)

estimator = Estimator(image_name=image_name,
role='SageMakerRole', train_instance_count=1,
train_instance_type='ml.c4.xlarge',
sagemaker_session=sagemaker_session, base_job_name='test-byo')

estimator.set_hyperparameters(num_factors=10,
feature_dim=784,
mini_batch_size=100,
predictor_type='binary_classifier')

hyperparameter_ranges = {'mini_batch_size': IntegerParameter(100, 200)}

tuner = HyperparameterTuner(estimator=estimator, base_tuning_job_name='byo',
objective_metric_name='test:binary_classification_accuracy',
hyperparameter_ranges=hyperparameter_ranges,
max_jobs=2, max_parallel_jobs=2)

tuner.fit({'train': s3_train_data, 'test': s3_train_data}, include_cls_metadata=False)

print('Started hyperparameter tuning job with name: ' + tuner.latest_tuning_job.name)

time.sleep(15)
tuner.wait()

best_training_job = tuner.best_training_job()
with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
predictor = tuner.deploy(1, 'ml.m4.xlarge', endpoint_name=best_training_job)
predictor.serializer = _fm_serializer
predictor.content_type = 'application/json'
predictor.deserializer = json_deserializer

result = predictor.predict(train_set[0][:10])

assert len(result['predictions']) == 10
for prediction in result['predictions']:
assert prediction['score'] is not None


# Serializer for the Factorization Machines predictor (for BYO example)
def _fm_serializer(data):
js = {'instances': []}
for row in data:
js['instances'].append({'features': row.tolist()})
return json.dumps(js)
15 changes: 15 additions & 0 deletions tests/unit/test_tuner.py
@@ -159,6 +159,21 @@ def test_prepare_for_training(tuner):
assert tuner.static_hyperparameters['sagemaker_estimator_module'] == module


def test_prepare_for_training_with_amazon_estimator(tuner, sagemaker_session):
tuner.estimator = PCA(ROLE, TRAIN_INSTANCE_COUNT, TRAIN_INSTANCE_TYPE, NUM_COMPONENTS,
sagemaker_session=sagemaker_session)

tuner._prepare_for_training()
assert 'sagemaker_estimator_class_name' not in tuner.static_hyperparameters
assert 'sagemaker_estimator_module' not in tuner.static_hyperparameters


def test_prepare_for_training_dont_include_estimator_cls(tuner):
tuner._prepare_for_training(include_cls_metadata=False)
assert 'sagemaker_estimator_class_name' not in tuner.static_hyperparameters
assert 'sagemaker_estimator_module' not in tuner.static_hyperparameters


def test_prepare_for_training_with_job_name(tuner):
static_hyperparameters = {'validated': 1, 'another_one': 0}
tuner.estimator.set_hyperparameters(**static_hyperparameters)