Commit 9784c43

Merge branch 'master' into warning_msg
2 parents: 77a6523 + 28c23bc

8 files changed: +117 -35 lines

CHANGELOG.md

Lines changed: 18 additions & 0 deletions
@@ -1,5 +1,23 @@
 # Changelog
 
+## v1.19.0 (2019-04-30)
+
+### Features
+
+ * add document embedding support to Object2Vec algorithm
+
+## v1.18.19 (2019-04-30)
+
+### Bug fixes and other changes
+
+ * skip p2/p3 tests in eu-central-1
+
+## v1.18.18 (2019-04-29)
+
+### Bug fixes and other changes
+
+ * add automatic model tuning integ test for TF script mode
+
 ## v1.18.17 (2019-04-25)
 
 ### Bug fixes and other changes

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-1.18.18.dev0
+1.19.1.dev0

src/sagemaker/amazon/object2vec.py

Lines changed: 35 additions & 0 deletions
@@ -21,6 +21,19 @@
 from sagemaker.vpc_utils import VPC_CONFIG_DEFAULT
 
 
+def _list_check_subset(valid_super_list):
+    valid_superset = set(valid_super_list)
+
+    def validate(value):
+        if not isinstance(value, str):
+            return False
+
+        val_list = [s.strip() for s in value.split(',')]
+        return set(val_list).issubset(valid_superset)
+
+    return validate
+
+
 class Object2Vec(AmazonAlgorithmEstimatorBase):
 
     repo_name = 'object2vec'
@@ -57,6 +70,14 @@ class Object2Vec(AmazonAlgorithmEstimatorBase):
                    'One of "adagrad", "adam", "rmsprop", "sgd", "adadelta"', str)
     learning_rate = hp('learning_rate', (ge(1e-06), le(1.0)),
                        'A float in [1e-06, 1.0]', float)
+
+    negative_sampling_rate = hp('negative_sampling_rate', (ge(0), le(100)), 'An integer in [0, 100]', int)
+    comparator_list = hp('comparator_list', _list_check_subset(["hadamard", "concat", "abs_diff"]),
+                         'Comma-separated of hadamard, concat, abs_diff. E.g. "hadamard,abs_diff"', str)
+    tied_token_embedding_weight = hp('tied_token_embedding_weight', (), 'Either True or False', bool)
+    token_embedding_storage_type = hp('token_embedding_storage_type', isin("dense", "row_sparse"),
+                                      'One of "dense", "row_sparse"', str)
+
     enc0_network = hp('enc0_network', isin("hcnn", "bilstm", "pooled_embedding"),
                       'One of "hcnn", "bilstm", "pooled_embedding"', str)
     enc1_network = hp('enc1_network', isin("hcnn", "bilstm", "pooled_embedding", "enc0"),
@@ -104,6 +125,10 @@ def __init__(self, role, train_instance_count, train_instance_type,
                  output_layer=None,
                  optimizer=None,
                  learning_rate=None,
+                 negative_sampling_rate=None,
+                 comparator_list=None,
+                 tied_token_embedding_weight=None,
+                 token_embedding_storage_type=None,
                  enc0_network=None,
                  enc1_network=None,
                  enc0_cnn_filter_width=None,
@@ -164,6 +189,10 @@ def __init__(self, role, train_instance_count, train_instance_type,
             output_layer(str): Optional. Type of output layer
             optimizer(str): Optional. Type of optimizer for training
             learning_rate(float): Optional. Learning rate for SGD training
+            negative_sampling_rate(int): Optional. Negative sampling rate
+            comparator_list(str): Optional. Customization of comparator operator
+            tied_token_embedding_weight(bool): Optional. Tying of token embedding layer weight
+            token_embedding_storage_type(str): Optional. Type of token embedding storage
             enc0_network(str): Optional. Network model of encoder "enc0"
             enc1_network(str): Optional. Network model of encoder "enc1"
             enc0_cnn_filter_width(int): Optional. CNN filter width
@@ -197,6 +226,12 @@ def __init__(self, role, train_instance_count, train_instance_type,
         self.output_layer = output_layer
         self.optimizer = optimizer
         self.learning_rate = learning_rate
+
+        self.negative_sampling_rate = negative_sampling_rate
+        self.comparator_list = comparator_list
+        self.tied_token_embedding_weight = tied_token_embedding_weight
+        self.token_embedding_storage_type = token_embedding_storage_type
+
         self.enc0_network = enc0_network
         self.enc1_network = enc1_network
         self.enc0_cnn_filter_width = enc0_cnn_filter_width
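
For reference, the new `_list_check_subset` helper returns a closure that the `comparator_list` hyperparameter uses as its validator: the value must be a comma-separated string whose items are all drawn from the allowed operator set. A minimal standalone sketch of that behaviour (plain Python, outside the SDK's `hp` descriptor machinery; the asserts are illustrative, not part of the commit):

# Standalone sketch of the comparator_list validation logic added above.
def _list_check_subset(valid_super_list):
    valid_superset = set(valid_super_list)

    def validate(value):
        # Only comma-separated strings are accepted; anything else fails validation.
        if not isinstance(value, str):
            return False

        val_list = [s.strip() for s in value.split(',')]
        return set(val_list).issubset(valid_superset)

    return validate

validate = _list_check_subset(["hadamard", "concat", "abs_diff"])
assert validate("hadamard,abs_diff")        # subset of the allowed operators
assert validate("hadamard, concat")         # surrounding whitespace is stripped
assert not validate("hadamard,foobar")      # unknown operator is rejected
assert not validate(["hadamard"])           # non-string values are rejected

This is also why the new unit-test cases below expect ValueError for values such as 'hadamard,foobar' and ['foobar'].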

tests/integ/__init__.py

Lines changed: 4 additions & 3 deletions
@@ -24,10 +24,11 @@
 TRANSFORM_DEFAULT_TIMEOUT_MINUTES = 20
 PYTHON_VERSION = 'py' + str(sys.version_info.major)
 
-# 'eu-central-1' has some p2, but no enough for continuous testing
-HOSTING_NO_P2_REGIONS = ['ca-central-1', 'eu-west-2', 'us-west-1', 'eu-central-1']
+# these regions have some p2 and p3 instances, but not enough for continuous testing
+HOSTING_NO_P2_REGIONS = ['ca-central-1', 'eu-central-1', 'eu-west-2', 'us-west-1']
 HOSTING_NO_P3_REGIONS = ['ap-southeast-1', 'ap-southeast-2', 'ap-south-1', 'ca-central-1',
-                         'eu-west-2', 'us-west-1']
+                         'eu-central-1', 'eu-west-2', 'us-west-1']
+
 # EI is currently only supported in the following regions
 # regions were derived from https://aws.amazon.com/machine-learning/elastic-inference/pricing/
 EI_SUPPORTED_REGIONS = ['us-east-1', 'us-east-2', 'us-west-2', 'eu-west-1', 'ap-northeast-1', 'ap-northeast-2']

tests/integ/test_object2vec.py

Lines changed: 4 additions & 0 deletions
@@ -43,6 +43,10 @@ def test_object2vec(sagemaker_session):
                             enc0_vocab_size=45000,
                             enc_dim=16,
                             num_classes=3,
+                            negative_sampling_rate=0,
+                            comparator_list='hadamard,concat,abs_diff',
+                            tied_token_embedding_weight=False,
+                            token_embedding_storage_type='dense',
                             sagemaker_session=sagemaker_session)
 
     record_set = prepare_record_set_from_local_files(data_path, object2vec.data_location,

tests/integ/test_tf_script_mode.py

Lines changed: 36 additions & 27 deletions
@@ -22,9 +22,8 @@
 from sagemaker.tensorflow import TensorFlow
 from six.moves.urllib.parse import urlparse
 from sagemaker.utils import unique_name_from_base
-import tests.integ as integ
-from tests.integ import kms_utils
-import tests.integ.timeout as timeout
+
+import tests.integ
 
 ROLE = 'SageMakerRole'
 
@@ -35,14 +34,18 @@
 TAGS = [{'Key': 'some-key', 'Value': 'some-value'}]
 
 
-@pytest.fixture(scope='session', params=['ml.c5.xlarge', 'ml.p2.xlarge'])
+@pytest.fixture(scope='session', params=[
+    'ml.c5.xlarge',
+    pytest.param('ml.p2.xlarge',
+                 marks=pytest.mark.skipif(
+                     tests.integ.test_region() in tests.integ.HOSTING_NO_P2_REGIONS,
+                     reason='no ml.p2 instances in this region'))])
 def instance_type(request):
     return request.param
 
 
-@pytest.mark.skipif(integ.test_region() in integ.HOSTING_NO_P2_REGIONS,
-                    reason='no ml.p2 instances in these regions')
-@pytest.mark.skipif(integ.PYTHON_VERSION != 'py3', reason="Script Mode tests are only configured to run with Python 3")
+@pytest.mark.skipif(tests.integ.PYTHON_VERSION != 'py3',
+                    reason="Script Mode tests are only configured to run with Python 3")
 def test_mnist(sagemaker_session, instance_type):
     estimator = TensorFlow(entry_point=SCRIPT,
                            role='SageMakerRole',
@@ -51,26 +54,26 @@ def test_mnist(sagemaker_session, instance_type):
                            sagemaker_session=sagemaker_session,
                            py_version='py3',
                            framework_version=TensorFlow.LATEST_VERSION,
-                           metric_definitions=[{'Name': 'train:global_steps', 'Regex': r'global_step\/sec:\s(.*)'}])
+                           metric_definitions=[
+                               {'Name': 'train:global_steps', 'Regex': r'global_step\/sec:\s(.*)'}])
     inputs = estimator.sagemaker_session.upload_data(
         path=os.path.join(RESOURCE_PATH, 'data'),
         key_prefix='scriptmode/mnist')
 
-    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
+    with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
         estimator.fit(inputs=inputs, job_name=unique_name_from_base('test-tf-sm-mnist'))
     _assert_s3_files_exist(estimator.model_dir,
                            ['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta'])
     df = estimator.training_job_analytics.dataframe()
-    print(df)
     assert df.size > 0
 
 
 def test_server_side_encryption(sagemaker_session):
-
     boto_session = sagemaker_session.boto_session
-    with kms_utils.bucket_with_encryption(boto_session, ROLE) as (bucket_with_kms, kms_key):
-
-        output_path = os.path.join(bucket_with_kms, 'test-server-side-encryption', time.strftime('%y%m%d-%H%M'))
+    with tests.integ.kms_utils.bucket_with_encryption(boto_session, ROLE) as (
+            bucket_with_kms, kms_key):
+        output_path = os.path.join(bucket_with_kms, 'test-server-side-encryption',
+                                   time.strftime('%y%m%d-%H%M'))
 
         estimator = TensorFlow(entry_point=SCRIPT,
                                role=ROLE,
@@ -88,28 +91,29 @@ def test_server_side_encryption(sagemaker_session):
             path=os.path.join(RESOURCE_PATH, 'data'),
             key_prefix='scriptmode/mnist')
 
-        with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
-            estimator.fit(inputs=inputs, job_name=unique_name_from_base('test-server-side-encryption'))
+        with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
+            estimator.fit(inputs=inputs,
+                          job_name=unique_name_from_base('test-server-side-encryption'))
 
 
 @pytest.mark.canary_quick
-@pytest.mark.skipif(integ.PYTHON_VERSION != 'py3', reason="Script Mode tests are only configured to run with Python 3")
+@pytest.mark.skipif(tests.integ.PYTHON_VERSION != 'py3',
+                    reason="Script Mode tests are only configured to run with Python 3")
 def test_mnist_distributed(sagemaker_session, instance_type):
     estimator = TensorFlow(entry_point=SCRIPT,
                            role=ROLE,
                            train_instance_count=2,
-                           # TODO: change train_instance_type to instance_type once the test is passing consistently
-                           train_instance_type='ml.c5.xlarge',
+                           train_instance_type=instance_type,
                            sagemaker_session=sagemaker_session,
-                           py_version=integ.PYTHON_VERSION,
+                           py_version=tests.integ.PYTHON_VERSION,
                            script_mode=True,
                            framework_version=TensorFlow.LATEST_VERSION,
                            distributions=PARAMETER_SERVER_DISTRIBUTION)
     inputs = estimator.sagemaker_session.upload_data(
         path=os.path.join(RESOURCE_PATH, 'data'),
         key_prefix='scriptmode/distributed_mnist')
 
-    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
+    with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
         estimator.fit(inputs=inputs, job_name=unique_name_from_base('test-tf-sm-distributed'))
     _assert_s3_files_exist(estimator.model_dir,
                            ['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta'])
@@ -131,22 +135,26 @@ def test_mnist_async(sagemaker_session):
     training_job_name = estimator.latest_training_job.name
     time.sleep(20)
     endpoint_name = training_job_name
-    _assert_training_job_tags_match(sagemaker_session.sagemaker_client, estimator.latest_training_job.name, TAGS)
-    with timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
-        estimator = TensorFlow.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
+    _assert_training_job_tags_match(sagemaker_session.sagemaker_client,
+                                    estimator.latest_training_job.name, TAGS)
+    with tests.integ.timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
+        estimator = TensorFlow.attach(training_job_name=training_job_name,
+                                      sagemaker_session=sagemaker_session)
         predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge',
                                      endpoint_name=endpoint_name)
 
         result = predictor.predict(np.zeros(784))
        print('predict result: {}'.format(result))
         _assert_endpoint_tags_match(sagemaker_session.sagemaker_client, predictor.endpoint, TAGS)
-        _assert_model_tags_match(sagemaker_session.sagemaker_client, estimator.latest_training_job.name, TAGS)
+        _assert_model_tags_match(sagemaker_session.sagemaker_client,
+                                 estimator.latest_training_job.name, TAGS)
 
 
 def _assert_s3_files_exist(s3_url, files):
     parsed_url = urlparse(s3_url)
     s3 = boto3.client('s3')
-    contents = s3.list_objects_v2(Bucket=parsed_url.netloc, Prefix=parsed_url.path.lstrip('/'))["Contents"]
+    contents = s3.list_objects_v2(Bucket=parsed_url.netloc, Prefix=parsed_url.path.lstrip('/'))[
+        "Contents"]
     for f in files:
         found = [x['Key'] for x in contents if x['Key'].endswith(f)]
         if not found:
@@ -169,5 +177,6 @@ def _assert_endpoint_tags_match(sagemaker_client, endpoint_name, tags):
 
 
 def _assert_training_job_tags_match(sagemaker_client, training_job_name, tags):
-    training_job_description = sagemaker_client.describe_training_job(TrainingJobName=training_job_name)
+    training_job_description = sagemaker_client.describe_training_job(
+        TrainingJobName=training_job_name)
     _assert_tags_match(sagemaker_client, training_job_description['TrainingJobArn'], tags)
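
The fixture change above moves the p2 skip from the test level into the parameter itself, so the `ml.c5.xlarge` case still runs in regions without p2 capacity. A self-contained sketch of the same `pytest.param(..., marks=...)` pattern; `NO_P2_REGIONS` and `current_region()` here are illustrative stand-ins for `tests.integ.HOSTING_NO_P2_REGIONS` and `tests.integ.test_region()`:

import pytest

# Illustrative stand-ins for the repo's region constants and region lookup.
NO_P2_REGIONS = ['ca-central-1', 'eu-central-1', 'eu-west-2', 'us-west-1']

def current_region():
    return 'eu-central-1'  # hypothetical fixed value for this sketch

@pytest.fixture(params=[
    'ml.c5.xlarge',
    pytest.param('ml.p2.xlarge',
                 marks=pytest.mark.skipif(current_region() in NO_P2_REGIONS,
                                          reason='no ml.p2 instances in this region'))])
def instance_type(request):
    return request.param

def test_uses_instance_type(instance_type):
    # Runs for ml.c5.xlarge everywhere; the ml.p2.xlarge case is skipped
    # in the regions listed above.
    assert instance_type.startswith('ml.')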

tests/unit/test_object2vec.py

Lines changed: 13 additions & 2 deletions
@@ -111,6 +111,10 @@ def test_all_hyperparameters(sagemaker_session):
         output_layer='softmax',
         optimizer='adam',
         learning_rate=0.0001,
+        negative_sampling_rate=1,
+        comparator_list='hadamard, abs_diff',
+        tied_token_embedding_weight=True,
+        token_embedding_storage_type='row_sparse',
         enc0_network='bilstm',
         enc1_network='hcnn',
         enc0_cnn_filter_width=3,
@@ -161,7 +165,11 @@ def test_required_hyper_parameters_value(sagemaker_session, required_hyper_param
     ('optimizer', 0),
     ('enc0_cnn_filter_width', 'string'),
     ('weight_decay', 'string'),
-    ('learning_rate', 'string')
+    ('learning_rate', 'string'),
+    ('negative_sampling_rate', 'some_string'),
+    ('comparator_list', 0),
+    ('comparator_list', ['foobar']),
+    ('token_embedding_storage_type', 123),
 ])
 def test_optional_hyper_parameters_type(sagemaker_session, optional_hyper_parameters, value):
     with pytest.raises(ValueError):
@@ -182,7 +190,10 @@ def test_optional_hyper_parameters_type(sagemaker_session, optional_hyper_parame
     ('weight_decay', 200000),
     ('enc0_cnn_filter_width', 2000),
     ('learning_rate', 0),
-    ('learning_rate', 2)
+    ('learning_rate', 2),
+    ('negative_sampling_rate', -1),
+    ('comparator_list', 'hadamard,foobar'),
+    ('token_embedding_storage_type', 'foobar'),
 ])
 def test_optional_hyper_parameters_value(sagemaker_session, optional_hyper_parameters, value):
     with pytest.raises(ValueError):
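
For context, each new "value" case above is expected to raise ValueError because it violates a constraint declared on the new hyperparameters in object2vec.py. A plain-Python sketch of those checks (illustrative only, not the SDK's `hp` implementation):

# negative_sampling_rate=-1 falls outside the declared [0, 100] range.
assert not (0 <= -1 <= 100)

# comparator_list='hadamard,foobar' contains an operator outside the allowed set.
allowed_comparators = {"hadamard", "concat", "abs_diff"}
items = {s.strip() for s in "hadamard,foobar".split(',')}
assert not items.issubset(allowed_comparators)

# token_embedding_storage_type='foobar' is not one of the permitted values.
assert "foobar" not in ("dense", "row_sparse")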

tox.ini

Lines changed: 6 additions & 2 deletions
@@ -60,21 +60,25 @@ commands =
 deps = .[test]
 
 [testenv:flake8]
-basepython = python
+basepython = python3
+skipdist = true
+skip_install = true
 deps =
     flake8
     flake8-future-import
 commands = flake8
 
 [testenv:pylint]
 basepython = python3
+skipdist = true
+skip_install = true
 deps =
     pylint==2.3.1
 commands =
     python -m pylint --rcfile=.pylintrc -j 0 src/sagemaker
 
 [testenv:twine]
-basepython = python
+basepython = python3
 # twine check was added starting in 1.12.0
 # https://github.com/pypa/twine/blob/master/docs/changelog.rst
 deps =
