Add wrapper for LDA.

lukmis · lukmis · commit e2b8f0de0e5b · 2018-01-24T09:11:47.000-08:00
Update CHANGELOG and bump the version number.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -2,6 +2,14 @@
 CHANGELOG
 =========
 
+1.0.3
+=====
+
+* feature: Estimators: add support for Amazon LDA algorithm
+* feature: Documentation: Update TensorFlow examples following API change
+* feature: Session: Support multi-part uploads
+
+
 1.0.2
 =====
 
diff --git a/setup.py b/setup.py
@@ -11,7 +11,7 @@ def read(fname):
 
 
 setup(name="sagemaker",
-      version="1.0.2",
+      version="1.0.3",
       description="Open source library for training and deploying models on Amazon SageMaker.",
       packages=find_packages('src'),
       package_dir={'': 'src'},
diff --git a/src/sagemaker/__init__.py b/src/sagemaker/__init__.py
@@ -15,6 +15,7 @@
 from sagemaker import estimator
 from sagemaker.amazon.kmeans import KMeans, KMeansModel, KMeansPredictor
 from sagemaker.amazon.pca import PCA, PCAModel, PCAPredictor
+from sagemaker.amazon.lda import LDA, LDAModel, LDAPredictor
 from sagemaker.amazon.linear_learner import LinearLearner, LinearLearnerModel, LinearLearnerPredictor
 from sagemaker.amazon.factorization_machines import FactorizationMachines, FactorizationMachinesModel
 from sagemaker.amazon.factorization_machines import FactorizationMachinesPredictor
@@ -30,6 +31,7 @@
 
 __all__ = [estimator, KMeans, KMeansModel, KMeansPredictor, PCA, PCAModel, PCAPredictor, LinearLearner,
            LinearLearnerModel, LinearLearnerPredictor,
+           LDA, LDAModel, LDAPredictor,
            FactorizationMachines, FactorizationMachinesModel, FactorizationMachinesPredictor,
            Model, RealTimePredictor, Session,
            container_def, s3_input, production_variant, get_execution_role]
diff --git a/src/sagemaker/amazon/amazon_estimator.py b/src/sagemaker/amazon/amazon_estimator.py
@@ -10,6 +10,7 @@
 # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
+import boto3
 import json
 import logging
 import tempfile
@@ -18,6 +19,7 @@
 from sagemaker.amazon.hyperparameter import Hyperparameter as hp  # noqa
 from sagemaker.amazon.common import write_numpy_to_dense_tensor
 from sagemaker.estimator import EstimatorBase
+from sagemaker.fw_utils import parse_s3_url
 from sagemaker.session import s3_input
 from sagemaker.utils import sagemaker_timestamp
 
@@ -47,7 +49,7 @@ def __init__(self, role, train_instance_count, train_instance_type, data_locatio
         self.data_location = data_location
 
     def train_image(self):
-        return registry(self.sagemaker_session.boto_region_name) + "/" + type(self).repo
+        return registry(self.sagemaker_session.boto_region_name, type(self).__name__) + "/" + type(self).repo
 
     def hyperparameters(self):
         return hp.serialize_all(self)
@@ -152,6 +154,61 @@ def __repr__(self):
         """Return an unambiguous representation of this RecordSet"""
         return str((RecordSet, self.__dict__))
 
+    @staticmethod
+    def from_s3(data_path, num_records, feature_dim, channel='train'):
+        """Build a RecordSet from the objects stored under an S3 prefix.
+
+        Lists every object under ``data_path``, writes a ``.amazon.manifest`` object naming them next to
+        the data, and returns a RecordSet that points at that manifest.
+
+        Args:
+            data_path: S3 path to files
+            num_records: Number of records at S3 location
+            feature_dim: Number of features in each of the files
+            channel: Name of the data channel
+
+        Returns:
+            Instance of RecordSet that can be used when calling
+            :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`
+
+        Raises:
+            ValueError: If no data files are found under ``data_path``.
+        """
+        s3 = boto3.client('s3')
+
+        if not data_path.endswith('/'):
+            data_path = data_path + '/'
+
+        bucket, prefix = parse_s3_url(data_path)
+        manifest_key = prefix + ".amazon.manifest"
+
+        all_files = []
+        list_req = {'Bucket': bucket, 'Prefix': prefix}
+        while True:
+            objects = s3.list_objects_v2(**list_req)
+            for content in objects.get('Contents', []):
+                key = content['Key']
+                # Skip the "directory" placeholder and a manifest left behind by a previous call.
+                if key in (prefix, manifest_key):
+                    continue
+                # Strip only the leading prefix; str.split breaks on keys that contain it again.
+                all_files.append(key[len(prefix):])
+            # Listings are paginated: keep following the continuation token until S3 reports the end.
+            if not objects['IsTruncated']:
+                break
+            list_req['ContinuationToken'] = objects['NextContinuationToken']
+
+        if not all_files:
+            raise ValueError("S3 location:{} doesn't have any files".format(data_path))
+        manifest_str = json.dumps([{'prefix': data_path}] + all_files)
+
+        s3.put_object(Bucket=bucket, Body=manifest_str.encode('utf-8'), Key=manifest_key)
+
+        return RecordSet("s3://{}/{}".format(bucket, manifest_key),
+                         num_records=num_records,
+                         feature_dim=feature_dim,
+                         channel=channel)
+
 
 def _build_shards(num_shards, array):
     if num_shards < 1:
@@ -200,12 +257,22 @@ def upload_numpy_to_s3_shards(num_shards, s3, bucket, key_prefix, array, labels=
             raise ex
 
 
-def registry(region_name):
+def registry(region_name, algorithm=None):
     """Return docker registry for the given AWS region"""
-    account_id = {
-        "us-east-1": "382416733822",
-        "us-east-2": "404615174143",
-        "us-west-2": "174872318107",
-        "eu-west-1": "438346466558"
-    }[region_name]
+    # Region -> account id tables; LDA images are served from different accounts than the other algorithms.
+    default_accounts = {
+        "us-east-1": "382416733822",
+        "us-east-2": "404615174143",
+        "us-west-2": "174872318107",
+        "eu-west-1": "438346466558"
+    }
+    lda_accounts = {
+        "us-east-1": "766337827248",
+        "us-east-2": "999911452149",
+        "us-west-2": "266724342769",
+        "eu-west-1": "999678624901"
+    }
+    if algorithm not in [None, "PCA", "KMeans", "LinearLearner", "FactorizationMachines", "LDA"]:
+        raise ValueError("Algorithm class:{} doesn't have mapping to account_id with images".format(algorithm))
+    account_id = (lda_accounts if algorithm in ["LDA"] else default_accounts)[region_name]
     return "{}.dkr.ecr.{}.amazonaws.com".format(account_id, region_name)
diff --git a/src/sagemaker/amazon/lda.py b/src/sagemaker/amazon/lda.py
@@ -0,0 +1,127 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry
+from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer
+from sagemaker.amazon.hyperparameter import Hyperparameter as hp  # noqa
+from sagemaker.amazon.validation import gt, isint, isnumber
+from sagemaker.predictor import RealTimePredictor
+from sagemaker.model import Model
+from sagemaker.session import Session
+
+
+class LDA(AmazonAlgorithmEstimatorBase):
+
+    repo = 'lda:1'
+
+    num_topics = hp('num_topics', (gt(0), isint), 'An integer greater than zero')
+    alpha0 = hp('alpha0', isnumber, "A float value")
+    max_restarts = hp('max_restarts', (gt(0), isint), 'An integer greater than zero')
+    max_iterations = hp('max_iterations', (gt(0), isint), 'An integer greater than zero')
+    tol = hp('tol', (gt(0), isnumber), "A positive float")
+
+    def __init__(self, role, train_instance_type, num_topics,
+                 alpha0=None, max_restarts=None, max_iterations=None, tol=None, **kwargs):
+        """Latent Dirichlet Allocation (LDA) is an :class:`Estimator` used for unsupervised learning.
+
+        Amazon SageMaker Latent Dirichlet Allocation is an unsupervised learning algorithm that attempts to describe
+        a set of observations as a mixture of distinct categories. LDA is most commonly used to discover
+        a user-specified number of topics shared by documents within a text corpus.
+        Here each observation is a document, the features are the presence (or occurrence count) of each word, and
+        the categories are the topics.
+
+        This Estimator may be fit via calls to
+        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`. It requires Amazon
+        :class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to be stored in S3.
+        There is a utility :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set` that
+        can be used to upload data to S3 and create a :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to pass
+        to the `fit` call.
+
+        To learn more about the Amazon protobuf Record class and how to prepare bulk data in this format, please
+        consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html
+
+        After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker
+        Endpoint by invoking :meth:`~sagemaker.estimator.EstimatorBase.deploy`. As well as deploying an Endpoint,
+        deploy returns a :class:`~sagemaker.amazon.lda.LDAPredictor` object that can be used
+        for inference calls using the trained model hosted in the SageMaker Endpoint.
+
+        LDA Estimators can be configured by setting hyperparameters. The available hyperparameters for
+        LDA are documented below.
+
+        For further information on the AWS LDA algorithm,
+        please consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/lda.html
+
+        Args:
+            role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and
+                APIs that create Amazon SageMaker endpoints use this role to access
+                training data and model artifacts. After the endpoint is created,
+                the inference code might use the IAM role, if it needs to access an AWS resource.
+            train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
+            num_topics (int): The number of topics for LDA to find within the data.
+            alpha0 (float): Initial guess for the concentration parameter.
+            max_restarts (int): The number of restarts to perform during the Alternating Least Squares (ALS)
+                spectral decomposition phase of the algorithm.
+            max_iterations (int): The maximum number of iterations to perform during the ALS phase of the algorithm.
+            tol (float): Target error tolerance for the ALS phase of the algorithm.
+            **kwargs: base class keyword argument values.
+        """
+
+        # this algorithm only supports single instance training, so the instance count is fixed to 1
+        super(LDA, self).__init__(role, 1, train_instance_type, **kwargs)
+        self.num_topics = num_topics
+        self.alpha0 = alpha0
+        self.max_restarts = max_restarts
+        self.max_iterations = max_iterations
+        self.tol = tol
+
+    def create_model(self):
+        """Return a :class:`~sagemaker.amazon.lda.LDAModel` referencing the latest
+        s3 model data produced by this Estimator."""
+
+        return LDAModel(self.model_data, self.role, sagemaker_session=self.sagemaker_session)
+
+    def fit(self, records, mini_batch_size, **kwargs):
+        # mini_batch_size is required here: reject None and anything that is not a positive integer
+        if mini_batch_size is None:
+            raise ValueError("mini_batch_size must be set")
+        if not isinstance(mini_batch_size, int) or mini_batch_size < 1:
+            raise ValueError("mini_batch_size must be positive integer")
+
+        super(LDA, self).fit(records, mini_batch_size, **kwargs)
+
+
+class LDAPredictor(RealTimePredictor):
+    """Transforms input vectors to lower-dimensional representations.
+
+    The implementation of :meth:`~sagemaker.predictor.RealTimePredictor.predict` in this
+    `RealTimePredictor` requires a numpy ``ndarray`` as input. The array should contain the
+    same number of columns as the feature-dimension of the data used to fit the model this
+    Predictor performs inference on.
+
+    :meth:`predict()` returns a list of :class:`~sagemaker.amazon.record_pb2.Record` objects, one
+    for each row in the input ``ndarray``. The lower dimension vector result is stored in the ``topic_mixture``
+    key of the ``Record.label`` field."""
+
+    def __init__(self, endpoint, sagemaker_session=None):
+        super(LDAPredictor, self).__init__(endpoint, sagemaker_session, serializer=numpy_to_record_serializer(),
+                                           deserializer=record_deserializer())
+
+
+class LDAModel(Model):
+    """Reference LDA s3 model data. Calling :meth:`~sagemaker.model.Model.deploy` creates an Endpoint and returns
+    a Predictor that transforms vectors to a lower-dimensional representation."""
+
+    def __init__(self, model_data, role, sagemaker_session=None):
+        sagemaker_session = sagemaker_session or Session()
+        image = registry(sagemaker_session.boto_session.region_name, LDA.__name__) + "/" + LDA.repo
+        super(LDAModel, self).__init__(model_data, image, role, predictor_cls=LDAPredictor,
+                                       sagemaker_session=sagemaker_session)
diff --git a/tests/data/lda/nips-train_1.pbr b/tests/data/lda/nips-train_1.pbr
diff --git a/tests/integ/test_lda.py b/tests/integ/test_lda.py
@@ -0,0 +1,61 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+import boto3
+import numpy as np
+import os
+
+import sagemaker
+from sagemaker import LDA, LDAModel
+from sagemaker.amazon.amazon_estimator import RecordSet
+from sagemaker.amazon.common import read_records
+from sagemaker.utils import name_from_base, sagemaker_timestamp
+from tests.integ import DATA_DIR, REGION
+from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
+
+
+def test_lda():
+    """Train LDA on the bundled ``nips-train_1.pbr`` sample, deploy it and check predictions have a topic mixture."""
+    with timeout(minutes=15):
+        sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
+        data_filename = 'nips-train_1.pbr'
+        data_path = os.path.join(DATA_DIR, 'lda', data_filename)
+
+        # the record file is binary protobuf data, so it must not be opened in text mode
+        with open(data_path, 'rb') as f:
+            all_records = read_records(f)
+
+        # all records are expected to share one feature dimension, so it is read from the first record
+        feature_num = int(all_records[0].features['values'].float32_tensor.shape[0])
+
+        lda = LDA(role='SageMakerRole', train_instance_type='ml.c4.xlarge', num_topics=10,
+                  sagemaker_session=sagemaker_session, base_job_name='test-lda')
+
+        # upload data and prepare the set
+        data_location_key = "integ-test-data/lda-" + sagemaker_timestamp()
+        sagemaker_session.upload_data(path=data_path, key_prefix=data_location_key)
+        record_set = RecordSet.from_s3("s3://{}/{}".format(sagemaker_session.default_bucket(), data_location_key),
+                                       num_records=len(all_records),
+                                       feature_dim=feature_num,
+                                       channel='train')
+        lda.fit(record_set, 100)
+
+    endpoint_name = name_from_base('lda')
+    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
+        model = LDAModel(lda.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
+        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
+
+        predict_input = np.random.rand(1, feature_num)
+        result = predictor.predict(predict_input)
+
+        assert len(result) == 1
+        for record in result:
+            assert record.label["topic_mixture"] is not None
diff --git a/tests/unit/test_lda.py b/tests/unit/test_lda.py