Skip to content

Add test for BYO estimator #50

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 19, 2018
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions tests/integ/test_byo_estimator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import gzip
import io
import json
import numpy as np
import os
import pickle
import sys

import boto3

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.amazon.amazon_estimator import registry
from sagemaker.amazon.common import write_numpy_to_dense_tensor
from sagemaker.utils import name_from_base
from tests.integ import DATA_DIR, REGION
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name


def test_byo_estimator():
"""Use Factorization Machines algorithm as an example here.

First we need to prepare data for training. We take standard data set, convert it to the
format that the algorithm can process and upload it to S3.
Then we create the Estimator and set hyperparamets as required by the algorithm.
Next, we can call fit() with path to the S3.
Later the trained model is deployed and prediction is called against the endpoint.
Default predictor is updated with json serializer and deserializer.

"""
image_name = registry(REGION) + "/factorization-machines:1"

with timeout(minutes=15):
sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

with gzip.open(data_path, 'rb') as f:
train_set, _, _ = pickle.load(f, **pickle_args)

# take 100 examples for faster execution
vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')

buf = io.BytesIO()
write_numpy_to_dense_tensor(buf, vectors, labels)
buf.seek(0)

bucket = sagemaker_session.default_bucket()
prefix = 'test_byo_estimator'
key = 'recordio-pb-data'
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)

estimator = Estimator(image_name=image_name,
role='SageMakerRole', train_instance_count=1,
train_instance_type='ml.c4.xlarge',
sagemaker_session=sagemaker_session, base_job_name='test-byo')

estimator.set_hyperparameters(num_factors=10,
feature_dim=784,
mini_batch_size=100,
predictor_type='binary_classifier')

# training labels must be 'float32'
estimator.fit({'train': s3_train_data})

endpoint_name = name_from_base('byo')

def fm_serializer(data):
js = {'instances': []}
for row in data:
js['instances'].append({'features': row.tolist()})
return json.dumps(js)

with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
model = estimator.create_model()
predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
predictor.serializer = fm_serializer
predictor.content_type = 'application/json'
predictor.deserializer = sagemaker.predictor.json_deserializer

result = predictor.predict(train_set[0][:10])

assert len(result['predictions']) == 10
for prediction in result['predictions']:
assert prediction['score'] is not None