Add hyperparameter tuning support #207

Merged
merged 13 commits on Jun 5, 2018
1 change: 1 addition & 0 deletions .gitignore
@@ -24,3 +24,4 @@ doc/_templates
 venv/
 *~
 .pytest_cache/
+*.swp
7 changes: 5 additions & 2 deletions CHANGELOG.rst
@@ -2,10 +2,13 @@
 CHANGELOG
 =========
 
-1.3.dev1
-========
+1.4.0
+=====
+
 * bug-fix: Estimators: Change max_iterations hyperparameter key for KMeans
 * bug-fix: Local Mode: Show logs in Jupyter notebooks
+* feature: HyperparameterTuner: Add support for hyperparameter tuning jobs
+* feature: Analytics: Add functions for metrics in Training and Hyperparameter Tuning jobs
 
 1.3.0
 =====
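
For context on the two feature entries above, here is a minimal sketch of driving a tuning job with the new ``HyperparameterTuner``; the IAM role, metric name, and hyperparameter range are illustrative placeholders, not taken from this diff::

    from sagemaker.amazon.kmeans import KMeans
    from sagemaker.tuner import HyperparameterTuner, IntegerParameter

    # Illustrative estimator; 'SageMakerRole' is a placeholder IAM role.
    kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                    train_instance_type='ml.c4.xlarge', k=10)

    # Search one integer hyperparameter, minimizing mean squared distance.
    tuner = HyperparameterTuner(estimator=kmeans,
                                objective_metric_name='test:msd',
                                objective_type='Minimize',
                                hyperparameter_ranges={'extra_center_factor': IntegerParameter(1, 10)},
                                max_jobs=4, max_parallel_jobs=2)

    # train_set and test_set are numpy matrices (not shown); the test channel
    # supplies the objective metric.
    tuner.fit([kmeans.record_set(train_set),
               kmeans.record_set(test_set, channel='test')])
    print(tuner.best_training_job())
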
2 changes: 1 addition & 1 deletion README.rst
@@ -48,7 +48,7 @@ You can install from source by cloning this repository and issuing a pip install

     git clone https://github.com/aws/sagemaker-python-sdk.git
     python setup.py sdist
-    pip install dist/sagemaker-1.3.0.tar.gz
+    pip install dist/sagemaker-1.4.0.tar.gz
 
 Supported Python versions
 ~~~~~~~~~~~~~~~~~~~~~~~~~
2 changes: 1 addition & 1 deletion setup.py
@@ -23,7 +23,7 @@ def read(fname):


 setup(name="sagemaker",
-      version="1.3.0",
+      version="1.4.0",
       description="Open source library for training and deploying models on Amazon SageMaker.",
       packages=find_packages('src'),
       package_dir={'': 'src'},
2 changes: 2 additions & 0 deletions src/sagemaker/__init__.py
@@ -22,6 +22,7 @@
 from sagemaker.amazon.ntm import NTM, NTMModel, NTMPredictor
 from sagemaker.amazon.randomcutforest import RandomCutForest, RandomCutForestModel, RandomCutForestPredictor
 
+from sagemaker.analytics import TrainingJobAnalytics, HyperparameterTuningJobAnalytics
 from sagemaker.local.local_session import LocalSession
 
 from sagemaker.model import Model
@@ -39,4 +40,5 @@
            'FactorizationMachines', 'FactorizationMachinesModel', 'FactorizationMachinesPredictor',
            'RandomCutForest', 'RandomCutForestModel', 'RandomCutForestPredictor',
            'Model', 'NTM', 'NTMModel', 'NTMPredictor', 'RealTimePredictor', 'Session', 'LocalSession',
+           'TrainingJobAnalytics', 'HyperparameterTuningJobAnalytics',
            'container_def', 's3_input', 'production_variant', 'get_execution_role']
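
The two analytics classes exported above return job metrics as pandas DataFrames; a brief sketch, assuming placeholder job and metric names::

    import sagemaker

    # Per-timestamp CloudWatch metrics for one training job.
    train_df = sagemaker.TrainingJobAnalytics('my-training-job',
                                              metric_names=['train:progress']).dataframe()

    # One row per training job launched by a tuning job.
    tuning_df = sagemaker.HyperparameterTuningJobAnalytics('my-tuning-job').dataframe()
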
57 changes: 48 additions & 9 deletions src/sagemaker/amazon/amazon_estimator.py
@@ -19,7 +19,7 @@
 from sagemaker.amazon import validation
 from sagemaker.amazon.hyperparameter import Hyperparameter as hp  # noqa
 from sagemaker.amazon.common import write_numpy_to_dense_tensor
-from sagemaker.estimator import EstimatorBase
+from sagemaker.estimator import EstimatorBase, _TrainingJob
 from sagemaker.session import s3_input
 from sagemaker.utils import sagemaker_timestamp

@@ -92,11 +92,38 @@ def _prepare_init_params_from_job_description(cls, job_details):
         del init_params['image']
         return init_params
 
-    def fit(self, records, mini_batch_size=None, **kwargs):
+    def _prepare_for_training(self, records, mini_batch_size=None, job_name=None):
+        """Set hyperparameters needed for training.
+
+        Args:
+            * records (:class:`~RecordSet`): The records to train this ``Estimator`` on.
+            * mini_batch_size (int or None): The size of each mini-batch to use when training. If ``None``, a
+                default value will be used.
+            * job_name (str): Name of the training job to be created. If not specified, one is generated,
+                using the base name given to the constructor if applicable.
+        """
+        super(AmazonAlgorithmEstimatorBase, self)._prepare_for_training(job_name=job_name)
+
+        feature_dim = None
+
+        if isinstance(records, list):
+            for record in records:
+                if record.channel == 'train':
+                    feature_dim = record.feature_dim
+                    break
+            if feature_dim is None:
+                raise ValueError('Must provide train channel.')
+        else:
+            feature_dim = records.feature_dim
+
+        self.feature_dim = feature_dim
+        self.mini_batch_size = mini_batch_size
+
+    def fit(self, records, mini_batch_size=None, wait=True, logs=True, job_name=None):
         """Fit this Estimator on serialized Record objects, stored in S3.
 
         ``records`` should be an instance of :class:`~RecordSet`. This defines a collection of
-        s3 data files to train this ``Estimator`` on.
+        S3 data files to train this ``Estimator`` on.
 
         Training data is expected to be encoded as dense or sparse vectors in the "values" feature
         on each Record. If the data is labeled, the label is expected to be encoded as a list of
@@ -110,15 +137,19 @@ def fit(self, records, mini_batch_size=None, **kwargs):

         Args:
             records (:class:`~RecordSet`): The records to train this ``Estimator`` on
-            mini_batch_size (int or None): The size of each mini-batch to use when training. If None, a
+            mini_batch_size (int or None): The size of each mini-batch to use when training. If ``None``, a
                 default value will be used.
+            wait (bool): Whether the call should wait until the job completes (default: True).
+            logs (bool): Whether to show the logs produced by the job.
+                Only meaningful when wait is True (default: True).
+            job_name (str): Training job name. If not specified, the estimator generates a default job name,
+                based on the training image name and current timestamp.
         """
-        self.feature_dim = records.feature_dim
-        self.mini_batch_size = mini_batch_size
+        self._prepare_for_training(records, job_name=job_name, mini_batch_size=mini_batch_size)
 
-        data = {records.channel: s3_input(records.s3_data, distribution='ShardedByS3Key',
-                                          s3_data_type=records.s3_data_type)}
-        super(AmazonAlgorithmEstimatorBase, self).fit(data, **kwargs)
+        self.latest_training_job = _TrainingJob.start_new(self, records)
+        if wait:
+            self.latest_training_job.wait(logs=logs)
 
     def record_set(self, train, labels=None, channel="train"):
         """Build a :class:`~RecordSet` from a numpy :class:`~ndarray` matrix and label vector.
@@ -180,6 +211,14 @@ def __repr__(self):
"""Return an unambiguous representation of this RecordSet"""
return str((RecordSet, self.__dict__))

def data_channel(self):
"""Return a dictionary to represent the training data in a channel for use with ``fit()``"""
return {self.channel: self.records_s3_input()}

def records_s3_input(self):
"""Return a s3_input to represent the training data"""
return s3_input(self.s3_data, distribution='ShardedByS3Key', s3_data_type=self.s3_data_type)


def _build_shards(num_shards, array):
if num_shards < 1:
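
The net effect of the changes in this file: hyperparameter preparation (``_prepare_for_training``) is now separate from job launch, so a tuner can prepare an estimator without starting a training job; ``fit()`` gains ``wait``, ``logs`` and ``job_name``; and a ``RecordSet`` can describe its own data channel. A sketch of the resulting flow, with illustrative names and data::

    from sagemaker.amazon.pca import PCA

    pca = PCA(role='SageMakerRole', train_instance_count=1,    # placeholder role
              train_instance_type='ml.c4.xlarge', num_components=3)
    records = pca.record_set(train_set)      # train_set: a numpy matrix (not shown)

    pca.fit(records, wait=False)             # prepare hyperparameters, start the job, return
    pca.latest_training_job.wait(logs=True)  # re-attach to the running job and stream its logs

    # RecordSet.data_channel() reproduces the channel dict that fit() used to build inline:
    channels = records.data_channel()        # {'train': s3_input(..., distribution='ShardedByS3Key')}
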
1 change: 0 additions & 1 deletion src/sagemaker/amazon/hyperparameter.py
@@ -46,7 +46,6 @@ def validate(self, value):
             raise ValueError(error_message)
 
     def __get__(self, obj, objtype):
-        """Return the value of this hyperparameter"""
         if '_hyperparameters' not in dir(obj) or self.name not in obj._hyperparameters:
             raise AttributeError()
         return obj._hyperparameters[self.name]
4 changes: 2 additions & 2 deletions src/sagemaker/amazon/kmeans.py
@@ -108,8 +108,8 @@ def create_model(self):
         s3 model data produced by this Estimator."""
         return KMeansModel(self.model_data, self.role, self.sagemaker_session)
 
-    def fit(self, records, mini_batch_size=5000, **kwargs):
-        super(KMeans, self).fit(records, mini_batch_size, **kwargs)
+    def _prepare_for_training(self, records, mini_batch_size=5000, job_name=None):
+        super(KMeans, self)._prepare_for_training(records, mini_batch_size=mini_batch_size, job_name=job_name)
 
     def hyperparameters(self):
         """Return the SageMaker hyperparameters for training this KMeans Estimator"""
5 changes: 3 additions & 2 deletions src/sagemaker/amazon/lda.py
@@ -93,11 +93,12 @@ def create_model(self):

         return LDAModel(self.model_data, self.role, sagemaker_session=self.sagemaker_session)
 
-    def fit(self, records, mini_batch_size, **kwargs):
+    def _prepare_for_training(self, records, mini_batch_size, job_name=None):
         # mini_batch_size is required, prevent explicit calls with None
         if mini_batch_size is None:
             raise ValueError("mini_batch_size must be set")
-        super(LDA, self).fit(records, mini_batch_size, **kwargs)
+
+        super(LDA, self)._prepare_for_training(records, mini_batch_size=mini_batch_size, job_name=job_name)


class LDAPredictor(RealTimePredictor):
17 changes: 14 additions & 3 deletions src/sagemaker/amazon/linear_learner.py
@@ -228,12 +228,23 @@ def create_model(self):

         return LinearLearnerModel(self.model_data, self.role, self.sagemaker_session)
 
-    def fit(self, records, mini_batch_size=None, **kwargs):
+    def _prepare_for_training(self, records, mini_batch_size=None, job_name=None):
+        num_records = None
+        if isinstance(records, list):
+            for record in records:
+                if record.channel == 'train':
+                    num_records = record.num_records
+                    break
+            if num_records is None:
+                raise ValueError('Must provide train channel.')
+        else:
+            num_records = records.num_records
+
         # mini_batch_size can't be greater than number of records or training job fails
         default_mini_batch_size = min(self.DEFAULT_MINI_BATCH_SIZE,
-                                      max(1, int(records.num_records / self.train_instance_count)))
+                                      max(1, int(num_records / self.train_instance_count)))
         use_mini_batch_size = mini_batch_size or default_mini_batch_size
-        super(LinearLearner, self).fit(records, use_mini_batch_size, **kwargs)
+        super(LinearLearner, self)._prepare_for_training(records, mini_batch_size=use_mini_batch_size, job_name=job_name)


class LinearLearnerPredictor(RealTimePredictor):
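
The ``num_records`` lookup above feeds the same default-batch-size guard as before; a worked example, assuming LinearLearner's class default ``DEFAULT_MINI_BATCH_SIZE`` of 1000::

    # 2500 records sharded across 4 instances leaves ~625 records per instance,
    # so the computed default drops below 1000 to keep the training job from failing.
    default_mini_batch_size = min(1000, max(1, int(2500 / 4)))   # -> 625
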
4 changes: 2 additions & 2 deletions src/sagemaker/amazon/ntm.py
@@ -113,10 +113,10 @@ def create_model(self):

         return NTMModel(self.model_data, self.role, sagemaker_session=self.sagemaker_session)
 
-    def fit(self, records, mini_batch_size=None, **kwargs):
+    def _prepare_for_training(self, records, mini_batch_size, job_name=None):
         if mini_batch_size is not None and (mini_batch_size < 1 or mini_batch_size > 10000):
             raise ValueError("mini_batch_size must be in [1, 10000]")
-        super(NTM, self).fit(records, mini_batch_size, **kwargs)
+        super(NTM, self)._prepare_for_training(records, mini_batch_size=mini_batch_size, job_name=job_name)


class NTMPredictor(RealTimePredictor):
27 changes: 24 additions & 3 deletions src/sagemaker/amazon/pca.py
@@ -92,12 +92,33 @@ def create_model(self):

         return PCAModel(self.model_data, self.role, sagemaker_session=self.sagemaker_session)
 
-    def fit(self, records, mini_batch_size=None, **kwargs):
+    def _prepare_for_training(self, records, mini_batch_size=None, job_name=None):
+        """Set hyperparameters needed for training.
+
+        Args:
+            * records (:class:`~RecordSet`): The records to train this ``Estimator`` on.
+            * mini_batch_size (int or None): The size of each mini-batch to use when training. If ``None``, a
+                default value will be used.
+            * job_name (str): Name of the training job to be created. If not specified, one is generated,
+                using the base name given to the constructor if applicable.
+        """
+        num_records = None
+        if isinstance(records, list):
+            for record in records:
+                if record.channel == 'train':
+                    num_records = record.num_records
+                    break
+            if num_records is None:
+                raise ValueError('Must provide train channel.')
+        else:
+            num_records = records.num_records
+
         # mini_batch_size is a required parameter
         default_mini_batch_size = min(self.DEFAULT_MINI_BATCH_SIZE,
-                                      max(1, int(records.num_records / self.train_instance_count)))
+                                      max(1, int(num_records / self.train_instance_count)))
         use_mini_batch_size = mini_batch_size or default_mini_batch_size
-        super(PCA, self).fit(records, use_mini_batch_size, **kwargs)
+
+        super(PCA, self)._prepare_for_training(records=records, mini_batch_size=use_mini_batch_size, job_name=job_name)


class PCAPredictor(RealTimePredictor):
10 changes: 4 additions & 6 deletions src/sagemaker/amazon/randomcutforest.py
@@ -87,13 +87,11 @@ def create_model(self):

         return RandomCutForestModel(self.model_data, self.role, sagemaker_session=self.sagemaker_session)
 
-    def fit(self, records, mini_batch_size=None, **kwargs):
-        if mini_batch_size is None:
-            mini_batch_size = RandomCutForest.MINI_BATCH_SIZE
-        elif mini_batch_size != RandomCutForest.MINI_BATCH_SIZE:
+    def _prepare_for_training(self, records, mini_batch_size=MINI_BATCH_SIZE, job_name=None):
+        if mini_batch_size != self.MINI_BATCH_SIZE:
             raise ValueError("Random Cut Forest uses a fixed mini_batch_size of {}"
-                             .format(RandomCutForest.MINI_BATCH_SIZE))
-        super(RandomCutForest, self).fit(records, mini_batch_size, **kwargs)
+                             .format(self.MINI_BATCH_SIZE))
+        super(RandomCutForest, self)._prepare_for_training(records, mini_batch_size=mini_batch_size, job_name=job_name)


class RandomCutForestPredictor(RealTimePredictor):