Skip to content

Add wrapper for LDA. #56

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jan 31, 2018
9 changes: 9 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@
CHANGELOG
=========

1.0.3
=====

* feature: Estimators: add support for Amazon LDA algorithm
* feature: Hyperparameters: Add data_type to hyperparameters
* feature: Documentation: Update TensorFlow examples following API change
* feature: Session: Support multi-part uploads


1.0.2
=====

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def read(fname):


setup(name="sagemaker",
version="1.0.2",
version="1.0.3",
description="Open source library for training and deploying models on Amazon SageMaker.",
packages=find_packages('src'),
package_dir={'': 'src'},
Expand Down
2 changes: 2 additions & 0 deletions src/sagemaker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from sagemaker import estimator
from sagemaker.amazon.kmeans import KMeans, KMeansModel, KMeansPredictor
from sagemaker.amazon.pca import PCA, PCAModel, PCAPredictor
from sagemaker.amazon.lda import LDA, LDAModel, LDAPredictor
from sagemaker.amazon.linear_learner import LinearLearner, LinearLearnerModel, LinearLearnerPredictor
from sagemaker.amazon.factorization_machines import FactorizationMachines, FactorizationMachinesModel
from sagemaker.amazon.factorization_machines import FactorizationMachinesPredictor
Expand All @@ -30,6 +31,7 @@

# __all__ must list the public names as strings; with objects in the list,
# `from sagemaker import *` fails with a TypeError.
__all__ = ['estimator', 'KMeans', 'KMeansModel', 'KMeansPredictor', 'PCA', 'PCAModel', 'PCAPredictor',
           'LinearLearner', 'LinearLearnerModel', 'LinearLearnerPredictor',
           'LDA', 'LDAModel', 'LDAPredictor',
           'FactorizationMachines', 'FactorizationMachinesModel', 'FactorizationMachinesPredictor',
           'Model', 'RealTimePredictor', 'Session',
           'container_def', 's3_input', 'production_variant', 'get_execution_role']
27 changes: 19 additions & 8 deletions src/sagemaker/amazon/amazon_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def __init__(self, role, train_instance_count, train_instance_type, data_locatio
self.data_location = data_location

def train_image(self):
    """Return the image URI of the training container for this algorithm.

    The URI is ``<registry>/<repo_name>:<repo_version>``, where the registry
    host is looked up for the session's region and the algorithm's
    ``repo_name``, and both ``repo_name`` and ``repo_version`` are read from
    the concrete estimator class.
    """
    algorithm_cls = type(self)
    tagged_repo = '{}:{}'.format(algorithm_cls.repo_name, algorithm_cls.repo_version)
    # The algorithm name is passed along because the registry account differs per algorithm.
    host = registry(self.sagemaker_session.boto_region_name, algorithm_cls.repo_name)
    return '{}/{}'.format(host, tagged_repo)

def hyperparameters(self):
    """Return whatever ``hp.serialize_all`` produces for this estimator."""
    serialized = hp.serialize_all(self)
    return serialized
Expand Down Expand Up @@ -200,12 +201,22 @@ def upload_numpy_to_s3_shards(num_shards, s3, bucket, key_prefix, array, labels=
raise ex


def registry(region_name, algorithm=None):
    """Return docker registry for the given AWS region and algorithm.

    Args:
        region_name (str): AWS region name, for example ``'us-west-2'``.
        algorithm (str): Optional. Repository name of the algorithm. ``None``,
            ``'pca'``, ``'kmeans'``, ``'linear-learner'`` and
            ``'factorization-machines'`` share one set of accounts; ``'lda'``
            uses its own.

    Returns:
        str: Registry host in the form ``<account_id>.dkr.ecr.<region_name>.amazonaws.com``.

    Raises:
        ValueError: If ``algorithm`` has no account mapping.
        KeyError: If ``region_name`` is not one of the mapped regions.
    """
    if algorithm in [None, "pca", "kmeans", "linear-learner", "factorization-machines"]:
        accounts_by_region = {
            "us-east-1": "382416733822",
            "us-east-2": "404615174143",
            "us-west-2": "174872318107",
            "eu-west-1": "438346466558"
        }
    elif algorithm in ["lda"]:
        accounts_by_region = {
            "us-east-1": "766337827248",
            "us-east-2": "999911452149",
            "us-west-2": "266724342769",
            "eu-west-1": "999678624901"
        }
    else:
        raise ValueError("Algorithm class:{} doesn't have mapping to account_id with images".format(algorithm))
    # The region lookup happens only after the algorithm is validated, so an
    # unknown algorithm is always reported as ValueError.
    account_id = accounts_by_region[region_name]
    return "{}.dkr.ecr.{}.amazonaws.com".format(account_id, region_name)
6 changes: 4 additions & 2 deletions src/sagemaker/amazon/factorization_machines.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@

class FactorizationMachines(AmazonAlgorithmEstimatorBase):

repo = 'factorization-machines:1'
repo_name = 'factorization-machines'
repo_version = 1

num_factors = hp('num_factors', gt(0), 'An integer greater than zero', int)
predictor_type = hp('predictor_type', isin('binary_classifier', 'regressor'),
Expand Down Expand Up @@ -194,7 +195,8 @@ class FactorizationMachinesModel(Model):

def __init__(self, model_data, role, sagemaker_session=None):
sagemaker_session = sagemaker_session or Session()
image = registry(sagemaker_session.boto_session.region_name) + "/" + FactorizationMachines.repo
repo = '{}:{}'.format(FactorizationMachines.repo_name, FactorizationMachines.repo_version)
image = '{}/{}'.format(registry(sagemaker_session.boto_session.region_name), repo)
super(FactorizationMachinesModel, self).__init__(model_data,
image,
role,
Expand Down
6 changes: 4 additions & 2 deletions src/sagemaker/amazon/kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@

class KMeans(AmazonAlgorithmEstimatorBase):

repo = 'kmeans:1'
repo_name = 'kmeans'
repo_version = 1

k = hp('k', gt(1), 'An integer greater-than 1', int)
init_method = hp('init_method', isin('random', 'kmeans++'), 'One of "random", "kmeans++"', str)
Expand Down Expand Up @@ -132,6 +133,7 @@ class KMeansModel(Model):

def __init__(self, model_data, role, sagemaker_session=None):
    """Set up a model that uses the KMeans container image.

    Args:
        model_data: Forwarded unchanged to ``Model.__init__``.
        role: Forwarded unchanged to ``Model.__init__``.
        sagemaker_session: Optional. Session used to determine the region;
            a new ``Session()`` is created when omitted.
    """
    sagemaker_session = sagemaker_session or Session()
    repo = '{}:{}'.format(KMeans.repo_name, KMeans.repo_version)
    # Pass the algorithm name so the registry account is resolved per algorithm,
    # matching how train_image builds the training image URI.
    host = registry(sagemaker_session.boto_session.region_name, KMeans.repo_name)
    image = '{}/{}'.format(host, repo)
    super(KMeansModel, self).__init__(model_data, image, role, predictor_cls=KMeansPredictor,
                                      sagemaker_session=sagemaker_session)
127 changes: 127 additions & 0 deletions src/sagemaker/amazon/lda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry
from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer
from sagemaker.amazon.hyperparameter import Hyperparameter as hp # noqa
from sagemaker.amazon.validation import gt
from sagemaker.predictor import RealTimePredictor
from sagemaker.model import Model
from sagemaker.session import Session


class LDA(AmazonAlgorithmEstimatorBase):

repo_name = 'lda'
repo_version = 1

num_topics = hp('num_topics', gt(0), 'An integer greater than zero', int)
alpha0 = hp('alpha0', gt(0), 'A positive float', float)
max_restarts = hp('max_restarts', gt(0), 'An integer greater than zero', int)
max_iterations = hp('max_iterations', gt(0), 'An integer greater than zero', int)
tol = hp('tol', gt(0), 'A positive float', float)

def __init__(self, role, train_instance_type, num_topics,
alpha0=None, max_restarts=None, max_iterations=None, tol=None, **kwargs):
"""Latent Dirichlet Allocation (LDA) is :class:`Estimator` used for unsupervised learning.

Amazon SageMaker Latent Dirichlet Allocation is an unsupervised learning algorithm that attempts to describe
a set of observations as a mixture of distinct categories. LDA is most commonly used to discover
a user-specified number of topics shared by documents within a text corpus.
Here each observation is a document, the features are the presence (or occurrence count) of each word, and
the categories are the topics.

This Estimator may be fit via calls to
:meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`. It requires Amazon
:class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to be stored in S3.
There is a utility :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set` that
can be used to upload data to S3 and create a :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be passed
to the `fit` call.

To learn more about the Amazon protobuf Record class and how to prepare bulk data in this format, please
consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker
Endpoint by invoking :meth:`~sagemaker.amazon.estimator.EstimatorBase.deploy`. As well as deploying an Endpoint,
deploy returns a :class:`~sagemaker.amazon.lda.LDAPredictor` object that can be used
for inference calls using the trained model hosted in the SageMaker Endpoint.

LDA Estimators can be configured by setting hyperparameters. The available hyperparameters for
LDA are documented below.

For further information on the AWS LDA algorithm,
please consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/lda.html

Args:
role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and
APIs that create Amazon SageMaker endpoints use this role to access
training data and model artifacts. After the endpoint is created,
the inference code might use the IAM role, if it needs to access an AWS resource.
train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
num_topics (int): The number of topics for LDA to find within the data.
alpha0 (float): Optional. Initial guess for the concentration parameter
max_restarts (int): Optional. The number of restarts to perform during the Alternating Least Squares (ALS)
spectral decomposition phase of the algorithm.
max_iterations (int): Optional. The maximum number of iterations to perform during the
ALS phase of the algorithm.
tol (float): Optional. Target error tolerance for the ALS phase of the algorithm.
**kwargs: base class keyword argument values.
"""

# this algorithm only supports single instance training
super(LDA, self).__init__(role, 1, train_instance_type, **kwargs)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a link to the docs for this.

It also indicates that it only supports CPU instances for training. That seems like it would be good to validate.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The training job will not fail if not started on CPU instance so I think we shouldn't fail too fast here. Will add comment/link.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea, that makes sense. I misunderstood.

self.num_topics = num_topics
self.alpha0 = alpha0
self.max_restarts = max_restarts
self.max_iterations = max_iterations
self.tol = tol

def create_model(self):
    """Return a :class:`~sagemaker.amazon.lda.LDAModel` referencing the latest
    s3 model data produced by this Estimator."""
    model = LDAModel(self.model_data, self.role, sagemaker_session=self.sagemaker_session)
    return model

def fit(self, records, mini_batch_size, **kwargs):
    """Fit the estimator, insisting on an explicit mini-batch size.

    Args:
        records: Forwarded to the base class ``fit``.
        mini_batch_size: Required; must not be ``None``.
        **kwargs: Forwarded to the base class ``fit``.

    Raises:
        ValueError: If ``mini_batch_size`` is ``None``.
    """
    # mini_batch_size has no default for LDA, so an explicit None is rejected up front
    batch_size_missing = mini_batch_size is None
    if batch_size_missing:
        raise ValueError("mini_batch_size must be set")
    parent = super(LDA, self)
    parent.fit(records, mini_batch_size, **kwargs)


class LDAPredictor(RealTimePredictor):
    """Transforms input vectors to lower-dimensional representations.

    The implementation of :meth:`~sagemaker.predictor.RealTimePredictor.predict` in this
    `RealTimePredictor` requires a numpy ``ndarray`` as input. The array should contain the
    same number of columns as the feature-dimension of the data used to fit the model this
    Predictor performs inference on.

    :meth:`predict()` returns a list of :class:`~sagemaker.amazon.record_pb2.Record` objects, one
    for each row in the input ``ndarray``. The lower dimension vector result is stored in the
    ``topic_mixture`` key of the ``Record.label`` field."""

    def __init__(self, endpoint, sagemaker_session=None):
        """Create a predictor that serializes numpy input to records and deserializes record responses.

        Args:
            endpoint: Forwarded unchanged to ``RealTimePredictor.__init__``.
            sagemaker_session: Optional. Forwarded unchanged to ``RealTimePredictor.__init__``.
        """
        super(LDAPredictor, self).__init__(endpoint, sagemaker_session, serializer=numpy_to_record_serializer(),
                                           deserializer=record_deserializer())


class LDAModel(Model):
    """Reference LDA s3 model data. Calling :meth:`~sagemaker.model.Model.deploy` creates an Endpoint and returns
    a Predictor that transforms vectors to a lower-dimensional representation."""

    def __init__(self, model_data, role, sagemaker_session=None):
        """Resolve the LDA image for the session's region and initialise the base ``Model``.

        A new ``Session()`` is created when ``sagemaker_session`` is omitted.
        """
        session = sagemaker_session or Session()
        tagged_repo = '{}:{}'.format(LDA.repo_name, LDA.repo_version)
        region = session.boto_session.region_name
        host = registry(region, LDA.repo_name)
        image_uri = '{}/{}'.format(host, tagged_repo)
        super(LDAModel, self).__init__(model_data, image_uri, role, predictor_cls=LDAPredictor,
                                       sagemaker_session=session)
6 changes: 4 additions & 2 deletions src/sagemaker/amazon/linear_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@

class LinearLearner(AmazonAlgorithmEstimatorBase):

repo = 'linear-learner:1'
repo_name = 'linear-learner'
repo_version = 1

DEFAULT_MINI_BATCH_SIZE = 1000

Expand Down Expand Up @@ -226,7 +227,8 @@ class LinearLearnerModel(Model):

def __init__(self, model_data, role, sagemaker_session=None):
    """Set up a model that uses the LinearLearner container image.

    Args:
        model_data: Forwarded unchanged to ``Model.__init__``.
        role: Forwarded unchanged to ``Model.__init__``.
        sagemaker_session: Optional. Session used to determine the region;
            a new ``Session()`` is created when omitted.
    """
    sagemaker_session = sagemaker_session or Session()
    repo = '{}:{}'.format(LinearLearner.repo_name, LinearLearner.repo_version)
    # Pass the algorithm name so the registry account is resolved per algorithm,
    # matching how train_image builds the training image URI.
    host = registry(sagemaker_session.boto_session.region_name, LinearLearner.repo_name)
    image = '{}/{}'.format(host, repo)
    super(LinearLearnerModel, self).__init__(model_data, image, role,
                                             predictor_cls=LinearLearnerPredictor,
                                             sagemaker_session=sagemaker_session)
6 changes: 4 additions & 2 deletions src/sagemaker/amazon/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@

class PCA(AmazonAlgorithmEstimatorBase):

repo = 'pca:1'
repo_name = 'pca'
repo_version = 1

DEFAULT_MINI_BATCH_SIZE = 500

Expand Down Expand Up @@ -118,6 +119,7 @@ class PCAModel(Model):

def __init__(self, model_data, role, sagemaker_session=None):
    """Set up a model that uses the PCA container image.

    Args:
        model_data: Forwarded unchanged to ``Model.__init__``.
        role: Forwarded unchanged to ``Model.__init__``.
        sagemaker_session: Optional. Session used to determine the region;
            a new ``Session()`` is created when omitted.
    """
    sagemaker_session = sagemaker_session or Session()
    repo = '{}:{}'.format(PCA.repo_name, PCA.repo_version)
    # Pass the algorithm name so the registry account is resolved per algorithm,
    # matching how train_image builds the training image URI.
    host = registry(sagemaker_session.boto_session.region_name, PCA.repo_name)
    image = '{}/{}'.format(host, repo)
    super(PCAModel, self).__init__(model_data, image, role, predictor_cls=PCAPredictor,
                                   sagemaker_session=sagemaker_session)
Binary file added tests/data/lda/nips-train_1.pbr
Binary file not shown.
77 changes: 77 additions & 0 deletions tests/integ/test_lda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import boto3
import numpy as np
import os
from six.moves.urllib.parse import urlparse

import sagemaker
from sagemaker import LDA, LDAModel
from sagemaker.amazon.amazon_estimator import RecordSet
from sagemaker.amazon.common import read_records
from sagemaker.utils import name_from_base, sagemaker_timestamp

from tests.integ import DATA_DIR, REGION
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name


def test_lda():
# Integration test: fits an LDA estimator on the bundled 'nips-train_1.pbr' records,
# deploys an LDAModel built from the resulting model data, and checks a single prediction.
# NOTE(review): indentation was lost in this view, so where each `with` block ends
# cannot be determined from here — confirm the nesting against the repository file.

with timeout(minutes=15):
sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
data_path = os.path.join(DATA_DIR, 'lda')
data_filename = 'nips-train_1.pbr'

# the records are read locally only to learn their count and feature dimension
with open(os.path.join(data_path, data_filename), 'rb') as f:
all_records = read_records(f)

# all records are assumed to share one feature dimension, so the first record is used
feature_num = int(all_records[0].features['values'].float32_tensor.shape[0])

lda = LDA(role='SageMakerRole', train_instance_type='ml.c4.xlarge', num_topics=10,
sagemaker_session=sagemaker_session, base_job_name='test-lda')

# uploads the whole data directory and wraps the uploaded location in a RecordSet
record_set = _prepare_record_set_from_local_files(data_path, lda.data_location,
len(all_records), feature_num, sagemaker_session)
# second positional argument is the mini_batch_size, which LDA.fit requires
lda.fit(record_set, 100)

endpoint_name = name_from_base('lda')
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
model = LDAModel(lda.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

# one random row with the same number of columns as the training features
predict_input = np.random.rand(1, feature_num)
result = predictor.predict(predict_input)

# one input row is expected to yield one record carrying a 'topic_mixture' label
assert len(result) == 1
for record in result:
assert record.label["topic_mixture"] is not None


def _prepare_record_set_from_local_files(dir_path, destination, num_records, feature_dim, sagemaker_session):
    """Upload a local directory to S3 and describe the upload as a :class:`~RecordSet`.

    Args:
        dir_path (string): Path to local directory from where the files shall be uploaded.
        destination (string): S3 path to upload the files to. Only its path component is used,
            as the start of the upload key prefix.
        num_records (int): Number of records in all the files
        feature_dim (int): Number of features in the data set
        sagemaker_session (sagemaker.session.Session): Session object to manage interactions with Amazon SageMaker APIs.
    Returns:
        RecordSet: A RecordSet specified by S3Prefix to be used in training.
    """
    destination_path = urlparse(destination).path
    # a timestamped suffix gives every run its own key prefix
    run_suffix = '{}-{}'.format("testfiles", sagemaker_timestamp())
    s3_key_prefix = (destination_path + run_suffix).lstrip('/')
    s3_location = sagemaker_session.upload_data(path=dir_path, key_prefix=s3_key_prefix)
    return RecordSet(s3_location, num_records, feature_dim, s3_data_type='S3Prefix')
1 change: 0 additions & 1 deletion tests/unit/test_amazon_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from sagemaker.amazon.pca import PCA
from sagemaker.amazon.amazon_estimator import upload_numpy_to_s3_shards, _build_shards, registry


COMMON_ARGS = {'role': 'myrole', 'train_instance_count': 1, 'train_instance_type': 'ml.c4.xlarge'}

REGION = "us-west-2"
Expand Down
Loading