
Commit b299825

Merge branch 'master' into pt1.0
2 parents db1a52b + 2ad6c1d commit b299825

14 files changed: 244 additions, 126 deletions

CHANGELOG.rst

Lines changed: 9 additions & 2 deletions
@@ -2,9 +2,16 @@
 CHANGELOG
 =========
 
-1.16.2.dev
+1.16.3.dev
 ==========
 
+* bug-fix: Append retry id to default Airflow job name to avoid name collisions in retry
+* bug-fix: Local Mode: No longer requires s3 permissions to run local entry point file
+* feature: Estimators: add support for PyTorch 1.0.0
+
+1.16.2
+======
+
 * enhancement: Check for S3 paths being passed as entry point
 * feature: Add support for AugmentedManifestFile and ShuffleConfig
 * bug-fix: Add version bound for requests module to avoid conflicts with docker-compose and docker-py
@@ -15,7 +22,7 @@ CHANGELOG
 * bug-fix: Update PyYAML version to avoid conflicts with docker-compose
 * doc-fix: Correct the numbered list in the table of contents
 * doc-fix: Add Airflow API documentation
-* feature: Estimators: add support for PyTorch 1.0.0
+* feature: HyperparameterTuner: add Early Stopping support
 
 1.16.1.post1
 ============

README.rst

Lines changed: 16 additions & 0 deletions
@@ -611,6 +611,22 @@ A hyperparameter range can be one of three types: continuous, integer, or catego
 The SageMaker Python SDK provides corresponding classes for defining these different types.
 You can define up to 20 hyperparameters to search over, but each value of a categorical hyperparameter range counts against that limit.
 
+By default, training job early stopping is turned off. To enable early stopping for the tuning job, you need to set the ``early_stopping_type`` parameter to ``Auto``:
+
+.. code:: python
+
+    # Enable early stopping
+    my_tuner = HyperparameterTuner(estimator=my_estimator,  # previously-configured Estimator object
+                                   objective_metric_name='validation-accuracy',
+                                   hyperparameter_ranges={'learning-rate': ContinuousParameter(0.05, 0.06)},
+                                   metric_definitions=[{'Name': 'validation-accuracy', 'Regex': 'validation-accuracy=(\d\.\d+)'}],
+                                   max_jobs=100,
+                                   max_parallel_jobs=10,
+                                   early_stopping_type='Auto')
+
+When early stopping is turned on, Amazon SageMaker will automatically stop a training job if it appears unlikely to produce a model of better quality than other jobs.
+If not using built-in Amazon SageMaker algorithms, note that, for early stopping to be effective, the objective metric should be emitted at epoch level.
+
 If you are using an Amazon SageMaker built-in algorithm, you don't need to pass in anything for ``metric_definitions``.
 In addition, the ``fit()`` call uses a list of ``RecordSet`` objects instead of a dictionary:

doc/conf.py

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ def __getattr__(cls, name):
                 'numpy', 'scipy', 'scipy.sparse']
 sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
 
-version = '1.16.1.post1'
+version = '1.16.2'
 project = u'sagemaker'
 
 # Add any Sphinx extension module names here, as strings. They can be extensions

setup.py

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ def read(fname):
 
 
 # Declare minimal set for installation
-required_packages = ['boto3>=1.9.55', 'numpy>=1.9.0', 'protobuf>=3.1', 'scipy>=0.19.0',
+required_packages = ['boto3>=1.9.64', 'numpy>=1.9.0', 'protobuf>=3.1', 'scipy>=0.19.0',
                      'urllib3>=1.21', 'PyYAML>=3.2, <4', 'protobuf3-to-dict>=0.1.5',
                      'docker-compose>=1.23.0', 'requests>=2.20.0, <2.21']

src/sagemaker/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -39,4 +39,4 @@
 from sagemaker.session import s3_input  # noqa: F401
 from sagemaker.session import get_execution_role  # noqa: F401
 
-__version__ = '1.16.1.post1'
+__version__ = '1.16.2'

src/sagemaker/local/image.py

Lines changed: 16 additions & 1 deletion
@@ -106,6 +106,8 @@ def train(self, input_data_config, output_data_config, hyperparameters, job_name
         data_dir = self._create_tmp_folder()
         volumes = self._prepare_training_volumes(data_dir, input_data_config, output_data_config,
                                                  hyperparameters)
+        # If local, source directory needs to be updated to mounted /opt/ml/code path
+        hyperparameters = self._update_local_src_path(hyperparameters, key=sagemaker.estimator.DIR_PARAM_NAME)
 
         # Create the configuration files for each container that we will create
         # Each container will map the additional local volumes (if any).
@@ -169,6 +171,9 @@ def serve(self, model_dir, environment):
         parsed_uri = urlparse(script_dir)
         if parsed_uri.scheme == 'file':
             volumes.append(_Volume(parsed_uri.path, '/opt/ml/code'))
+            # Update path to mount location
+            environment = environment.copy()
+            environment[sagemaker.estimator.DIR_PARAM_NAME.upper()] = '/opt/ml/code'
 
         if _ecr_login_if_needed(self.sagemaker_session.boto_session, self.image):
             _pull_image(self.image)
@@ -302,7 +307,7 @@ def _prepare_training_volumes(self, data_dir, input_data_config, output_data_con
             volumes.append(_Volume(data_source.get_root_dir(), channel=channel_name))
 
         # If there is a training script directory and it is a local directory,
-        # mount it to the container.
+        # mount it to the container.
         if sagemaker.estimator.DIR_PARAM_NAME in hyperparameters:
             training_dir = json.loads(hyperparameters[sagemaker.estimator.DIR_PARAM_NAME])
             parsed_uri = urlparse(training_dir)
@@ -321,6 +326,16 @@ def _prepare_training_volumes(self, data_dir, input_data_config, output_data_con
 
         return volumes
 
+    def _update_local_src_path(self, params, key):
+        if key in params:
+            src_dir = json.loads(params[key])
+            parsed_uri = urlparse(src_dir)
+            if parsed_uri.scheme == 'file':
+                new_params = params.copy()
+                new_params[key] = json.dumps('/opt/ml/code')
+                return new_params
+        return params
+
     def _prepare_serving_volumes(self, model_location):
         volumes = []
         host = self.hosts[0]
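
The rewrite done by the new ``_update_local_src_path`` method can be illustrated with a small standalone sketch. This is not part of the diff: the helper name, the sample path, and the assumption that ``sagemaker.estimator.DIR_PARAM_NAME`` is ``'sagemaker_submit_directory'`` are illustrative.

.. code:: python

    import json
    from urllib.parse import urlparse

    def update_local_src_path(params, key):
        # Mirrors the new method: rewrite a file:// source dir to the mounted /opt/ml/code
        if key in params:
            src_dir = json.loads(params[key])  # hyperparameter values are JSON-encoded strings
            if urlparse(src_dir).scheme == 'file':
                new_params = params.copy()  # leave the caller's dict untouched
                new_params[key] = json.dumps('/opt/ml/code')
                return new_params
        return params

    hyperparameters = {'sagemaker_submit_directory': json.dumps('file:///home/me/src')}  # hypothetical path
    print(update_local_src_path(hyperparameters, 'sagemaker_submit_directory'))
    # {'sagemaker_submit_directory': '"/opt/ml/code"'}

Because the container only sees the directory mounted at /opt/ml/code, pointing the hyperparameter there avoids any need for S3 access when running a local entry point file.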

src/sagemaker/session.py

Lines changed: 6 additions & 1 deletion
@@ -350,7 +350,8 @@ def tune(self, job_name, strategy, objective_type, objective_metric_name,
              max_jobs, max_parallel_jobs, parameter_ranges,
              static_hyperparameters, input_mode, metric_definitions,
              role, input_config, output_config, resource_config, stop_condition, tags,
-             warm_start_config, enable_network_isolation=False, image=None, algorithm_arn=None):
+             warm_start_config, enable_network_isolation=False, image=None, algorithm_arn=None,
+             early_stopping_type='Off'):
         """Create an Amazon SageMaker hyperparameter tuning job
 
         Args:
@@ -396,6 +397,9 @@ def tune(self, job_name, strategy, objective_type, objective_metric_name,
                 https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html.
             warm_start_config (dict): Configuration defining the type of warm start and
                 other required configurations.
+            early_stopping_type (str): Specifies whether early stopping is enabled for the job.
+                Can be either 'Auto' or 'Off'. If set to 'Off', early stopping will not be attempted.
+                If set to 'Auto', early stopping of some training jobs may happen, but is not guaranteed to.
         """
         tune_request = {
             'HyperParameterTuningJobName': job_name,
@@ -410,6 +414,7 @@ def tune(self, job_name, strategy, objective_type, objective_metric_name,
                     'MaxParallelTrainingJobs': max_parallel_jobs,
                 },
                 'ParameterRanges': parameter_ranges,
+                'TrainingJobEarlyStoppingType': early_stopping_type,
             },
             'TrainingJobDefinition': {
                 'StaticHyperParameters': static_hyperparameters,
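
As a rough illustration of where the new argument ends up, here is the relevant fragment of the request that ``Session.tune`` builds. The field names come from the diff; every other value below is a placeholder, and the remaining keys of the real request are omitted.

.. code:: python

    tune_request_fragment = {
        'HyperParameterTuningJobName': 'my-tuning-job',        # placeholder
        'HyperParameterTuningJobConfig': {
            'ResourceLimits': {
                'MaxNumberOfTrainingJobs': 100,                # placeholder
                'MaxParallelTrainingJobs': 10,                 # placeholder
            },
            'ParameterRanges': {},                             # placeholder
            'TrainingJobEarlyStoppingType': 'Auto',            # new field: 'Auto' or 'Off' (default 'Off')
        },
        # 'Strategy', 'HyperParameterTuningJobObjective', 'TrainingJobDefinition', ... omitted
    }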

src/sagemaker/tuner.py

Lines changed: 8 additions & 2 deletions
@@ -165,7 +165,7 @@ class HyperparameterTuner(object):
 
     def __init__(self, estimator, objective_metric_name, hyperparameter_ranges, metric_definitions=None,
                  strategy='Bayesian', objective_type='Maximize', max_jobs=1, max_parallel_jobs=1,
-                 tags=None, base_tuning_job_name=None, warm_start_config=None):
+                 tags=None, base_tuning_job_name=None, warm_start_config=None, early_stopping_type='Off'):
         """Initialize a ``HyperparameterTuner``. It takes an estimator to obtain configuration information
         for training jobs that are created as the result of a hyperparameter tuning job.
 
@@ -194,6 +194,9 @@ def __init__(self, estimator, objective_metric_name, hyperparameter_ranges, metr
                 a default job name is generated, based on the training image name and current timestamp.
             warm_start_config (sagemaker.tuner.WarmStartConfig): A ``WarmStartConfig`` object that has been initialized
                 with the configuration defining the nature of warm start tuning job.
+            early_stopping_type (str): Specifies whether early stopping is enabled for the job.
+                Can be either 'Auto' or 'Off' (default: 'Off'). If set to 'Off', early stopping will not be attempted.
+                If set to 'Auto', early stopping of some training jobs may happen, but is not guaranteed to.
         """
         self._hyperparameter_ranges = hyperparameter_ranges
         if self._hyperparameter_ranges is None or len(self._hyperparameter_ranges) == 0:
@@ -214,6 +217,7 @@ def __init__(self, estimator, objective_metric_name, hyperparameter_ranges, metr
         self._current_job_name = None
         self.latest_tuning_job = None
         self.warm_start_config = warm_start_config
+        self.early_stopping_type = early_stopping_type
 
     def _prepare_for_training(self, job_name=None, include_cls_metadata=True):
         if job_name is not None:
@@ -445,7 +449,8 @@ def _prepare_init_params_from_job_description(cls, job_details):
             'strategy': tuning_config['Strategy'],
             'max_jobs': tuning_config['ResourceLimits']['MaxNumberOfTrainingJobs'],
             'max_parallel_jobs': tuning_config['ResourceLimits']['MaxParallelTrainingJobs'],
-            'warm_start_config': WarmStartConfig.from_job_desc(job_details.get('WarmStartConfig', None))
+            'warm_start_config': WarmStartConfig.from_job_desc(job_details.get('WarmStartConfig', None)),
+            'early_stopping_type': tuning_config['TrainingJobEarlyStoppingType']
         }
 
     @classmethod
@@ -625,6 +630,7 @@ def start_new(cls, tuner, inputs):
         tuner_args['metric_definitions'] = tuner.metric_definitions
         tuner_args['tags'] = tuner.tags
         tuner_args['warm_start_config'] = warm_start_config_req
+        tuner_args['early_stopping_type'] = tuner.early_stopping_type
 
         del tuner_args['vpc_config']
         if isinstance(tuner.estimator, sagemaker.algorithm.AlgorithmEstimator):
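
The round trip that ``HyperparameterTuner.attach`` relies on can be sketched without any AWS calls. The job description below is a minimal, hand-made stand-in for ``DescribeHyperParameterTuningJob`` output (illustrative only); it shows how the new ``'early_stopping_type'`` init parameter is read straight from ``TrainingJobEarlyStoppingType``, mirroring the change to ``_prepare_init_params_from_job_description``.

.. code:: python

    # Hand-made fragment of a tuning job description (placeholder values)
    tuning_config = {
        'Strategy': 'Bayesian',
        'ResourceLimits': {'MaxNumberOfTrainingJobs': 2, 'MaxParallelTrainingJobs': 2},
        'TrainingJobEarlyStoppingType': 'Auto',
    }

    # Mirrors the new mapping from job description to constructor arguments
    init_params = {
        'strategy': tuning_config['Strategy'],
        'max_jobs': tuning_config['ResourceLimits']['MaxNumberOfTrainingJobs'],
        'max_parallel_jobs': tuning_config['ResourceLimits']['MaxParallelTrainingJobs'],
        'early_stopping_type': tuning_config['TrainingJobEarlyStoppingType'],
    }
    assert init_params['early_stopping_type'] == 'Auto'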

src/sagemaker/utils.py

Lines changed: 5 additions & 4 deletions
@@ -27,10 +27,11 @@
 import six
 
 
-AIRFLOW_TIME_MACRO = "{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}"
-AIRFLOW_TIME_MACRO_LEN = 19
-AIRFLOW_TIME_MACRO_SHORT = "{{ execution_date.strftime('%y%m%d-%H%M') }}"
-AIRFLOW_TIME_MACRO_SHORT_LEN = 11
+AIRFLOW_RETRY_MACRO = "{{ task_instance.try_number }}"
+AIRFLOW_TIME_MACRO = "{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}" + "-{}".format(AIRFLOW_RETRY_MACRO)
+AIRFLOW_TIME_MACRO_LEN = 22
+AIRFLOW_TIME_MACRO_SHORT = "{{ execution_date.strftime('%y%m%d-%H%M') }}" + "-{}".format(AIRFLOW_RETRY_MACRO)
+AIRFLOW_TIME_MACRO_SHORT_LEN = 14
 
 
 # Use the base name of the image as the job name if the user doesn't give us one
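
To see what the macro change does to a default Airflow job name, here is a hedged sketch that renders the template by hand. Inside Airflow the task instance renders the template itself; the ``jinja2`` usage, the base name, and ``FakeTaskInstance`` are illustrative only.

.. code:: python

    from datetime import datetime
    from jinja2 import Template  # assumes jinja2 is available, as it is wherever Airflow runs

    AIRFLOW_RETRY_MACRO = "{{ task_instance.try_number }}"
    AIRFLOW_TIME_MACRO = "{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}" + "-{}".format(AIRFLOW_RETRY_MACRO)

    class FakeTaskInstance:  # stand-in for Airflow's task instance on a retried task
        try_number = 2

    name = Template("base-" + AIRFLOW_TIME_MACRO).render(
        execution_date=datetime(2018, 12, 17, 22, 10, 5),
        task_instance=FakeTaskInstance())
    print(name)  # base-2018-12-17-22-10-05-2

Appending the try number means a retried task gets a distinct job name instead of colliding with the first attempt's 'base-2018-12-17-22-10-05'.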

tests/integ/test_tuner.py

Lines changed: 21 additions & 9 deletions
@@ -83,15 +83,16 @@ def hyperparameter_ranges():
 
 def _tune_and_deploy(kmeans_estimator, kmeans_train_set, sagemaker_session,
                      hyperparameter_ranges=None, job_name=None,
-                     warm_start_config=None):
+                     warm_start_config=None, early_stopping_type='Off'):
     tuner = _tune(kmeans_estimator, kmeans_train_set,
                   hyperparameter_ranges=hyperparameter_ranges, warm_start_config=warm_start_config,
-                  job_name=job_name)
-    _deploy(kmeans_train_set, sagemaker_session, tuner)
+                  job_name=job_name, early_stopping_type=early_stopping_type)
+    _deploy(kmeans_train_set, sagemaker_session, tuner, early_stopping_type)
 
 
-def _deploy(kmeans_train_set, sagemaker_session, tuner):
+def _deploy(kmeans_train_set, sagemaker_session, tuner, early_stopping_type):
     best_training_job = tuner.best_training_job()
+    assert tuner.early_stopping_type == early_stopping_type
     with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
         predictor = tuner.deploy(1, 'ml.c4.xlarge')
 
@@ -105,7 +106,7 @@ def _deploy(kmeans_train_set, sagemaker_session, tuner):
 
 def _tune(kmeans_estimator, kmeans_train_set, tuner=None,
           hyperparameter_ranges=None, job_name=None, warm_start_config=None,
-          wait_till_terminal=True, max_jobs=2, max_parallel_jobs=2):
+          wait_till_terminal=True, max_jobs=2, max_parallel_jobs=2, early_stopping_type='Off'):
     with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
 
         if not tuner:
@@ -115,7 +116,8 @@ def _tune(kmeans_estimator, kmeans_train_set, tuner=None,
                                         objective_type='Minimize',
                                         max_jobs=max_jobs,
                                         max_parallel_jobs=max_parallel_jobs,
-                                        warm_start_config=warm_start_config)
+                                        warm_start_config=warm_start_config,
+                                        early_stopping_type=early_stopping_type)
 
         records = kmeans_estimator.record_set(kmeans_train_set[0][:100])
         test_record_set = kmeans_estimator.record_set(kmeans_train_set[0][:100], channel='test')
@@ -332,16 +334,23 @@ def test_tuning_lda(sagemaker_session):
         tuner = HyperparameterTuner(estimator=lda, objective_metric_name=objective_metric_name,
                                     hyperparameter_ranges=hyperparameter_ranges,
                                     objective_type='Maximize', max_jobs=2,
-                                    max_parallel_jobs=2)
+                                    max_parallel_jobs=2,
+                                    early_stopping_type='Auto')
 
         tuning_job_name = unique_name_from_base('test-lda', max_length=32)
         tuner.fit([record_set, test_record_set], mini_batch_size=1, job_name=tuning_job_name)
 
-        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)
+        latest_tuning_job_name = tuner.latest_tuning_job.name
+
+        print('Started hyperparameter tuning job with name:' + latest_tuning_job_name)
 
         time.sleep(15)
         tuner.wait()
 
+    desc = tuner.latest_tuning_job.sagemaker_session.sagemaker_client \
+        .describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=latest_tuning_job_name)
+    assert desc['HyperParameterTuningJobConfig']['TrainingJobEarlyStoppingType'] == 'Auto'
+
     best_training_job = tuner.best_training_job()
     with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
         predictor = tuner.deploy(1, 'ml.c4.xlarge')
@@ -555,7 +564,8 @@ def test_attach_tuning_pytorch(sagemaker_session):
 
     tuner = HyperparameterTuner(estimator, objective_metric_name, hyperparameter_ranges,
                                 metric_definitions,
-                                max_jobs=2, max_parallel_jobs=2)
+                                max_jobs=2, max_parallel_jobs=2,
+                                early_stopping_type='Auto')
 
     training_data = estimator.sagemaker_session.upload_data(
         path=os.path.join(mnist_dir, 'training'),
@@ -571,6 +581,8 @@ def test_attach_tuning_pytorch(sagemaker_session):
 
     attached_tuner = HyperparameterTuner.attach(tuning_job_name,
                                                 sagemaker_session=sagemaker_session)
+    assert attached_tuner.early_stopping_type == 'Auto'
+
    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = attached_tuner.deploy(1, 'ml.c4.xlarge')
